├── .gitignore ├── Capstone Project Template.ipynb ├── README.md ├── airflow ├── dags │ └── capstone.py └── plugins │ └── operators │ ├── data_quality.py │ └── stage_redshift.py ├── helper ├── __init__.py └── etl.py ├── images ├── architecture.png ├── dag.PNG ├── etl_country.png ├── etl_immigration.png ├── etl_state.png ├── i94cit.PNG ├── pipeline.png └── star-schema.PNG ├── lookup ├── I94ADDR.csv ├── I94CIT_I94RES.csv ├── I94MODE.csv ├── I94PORT.csv └── I94VISA.csv └── sql └── create_tables.sql /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # datasets 107 | airport-codes_csv.csv 108 | I94_SAS_Labels_Descriptions.SAS 109 | immigration_data_sample.csv 110 | us-cities-demographics.csv 111 | 112 | # VS code 113 | /.vscode 114 | 115 | # data 116 | /data 117 | 118 | # configuration 119 | *.cfg 120 | 121 | # power point 122 | *.pptx -------------------------------------------------------------------------------- /Capstone Project Template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# US Visitors DW\n", 8 | "__Supporting officials' decision-making to provide better visitors experience in the US__\n", 9 | "***\n", 10 | "## Overview\n", 11 | "The purpose of this data engineering capstone project is to give students a chance to combine what they've learned throughout the program. This project will be an important part of learners portfolio that will help to achieve data engineering-related career goals. We could choose to complete the project provided by the Udacity team or define the scope and data ourselves. 
I took the first approach, building the DW on the immigration data for the United States provided by Udacity.\n", 12 | "\n", 13 | "## Business Scenario\n", 14 | "We are D2I (Data to Insights), a business consulting firm specializing in data warehouse services: we assist enterprises in navigating their data needs and create strategic operational solutions that deliver tangible business results. Specifically, we can help modernize corporations' data warehousing infrastructure by improving performance and ease of use for end users, enhancing functionality, and decreasing total cost of ownership while enabling real-time decision making. In total, our full suite of services includes helping enterprises with data profiling, data standardization, data acquisition, data transformation and integration.\n", 15 | "\n", 16 | "We have been contracted by the U.S. Customs and Border Protection to help them see what is hidden behind the data flood. We aim to model and create a brand new analytics solution on top of the state-of-the-art technologies available, enabling them to unleash insights from their data and provide a better experience to visitors coming to the US.\n", 17 | "\n", 18 | "## Structure of the Project\n", 19 | "Following the Udacity guide for this project, we structured this documentation with the steps below:\n", 20 | "* Step 1: Scope the Project and Gather Data\n", 21 | "* Step 2: Explore and Assess the Data\n", 22 | "* Step 3: Define the Data Model\n", 23 | "* Step 4: Run ETL to Model the Data\n", 24 | "* Step 5: Complete Project Write Up" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Step 1: Scope the Project and Gather Data\n", 32 | "\n", 33 | "_Explain what you plan to do in the project in more detail. What data do you use? What does your end solution look like? What tools did you use? etc._\n", 34 | "\n", 35 | "### The Scope \n", 36 | "The main deliverable of our work here will be a data warehouse in the cloud that will support answering questions through analytics tables and dashboards. Additionally, since we developed a general source-of-truth database, the US government could expose the solution through a web API so that backend web services could query the warehouse for information relating to international visitors.\n", 37 | "\n", 38 | "### The Data\n", 39 | "For this work we used the immigration, global temperature and demographics datasets, as well as the descriptions contained in the `I94_SAS_Labels_Descriptions.SAS` file.\n", 40 | "\n", 41 | "### The Architecture\n", 42 | "The whole solution is cloud based, built on top of __Amazon Web Services (AWS)__. First, all the datasets were preprocessed with __Apache Spark__ and stored in a staging area in an __AWS S3__ bucket; a minimal sketch of this staging write is shown below. 
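As an illustration of the staging step just mentioned, here is a minimal, hedged sketch of how a preprocessed Spark DataFrame can be written to the S3 staging area in parquet format. This is not the project's actual `helper/etl.py` code; the input path, the `spark-sas7bdat` reader and the bucket name are simply reused from elsewhere in this notebook and the Airflow DAG.

```python
from pyspark.sql import SparkSession

# Requires the saurfang spark-sas7bdat package on the Spark classpath,
# as used by etl_immigration_data later in this notebook.
spark = SparkSession.builder.appName("capstone-staging-sketch").getOrCreate()

# Read the raw April 2016 I-94 file in SAS format.
immigration = (
    spark.read.format("com.github.saurfang.sas.spark")
    .load("../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat")
)

# ...cleaning and type conversions happen here in the real helper/etl.py...

# Stage the result as parquet in the S3 bucket that the Airflow DAG later copies from.
immigration.write.mode("overwrite").parquet("s3a://data-engineer-capstone/immigration.parquet")
```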
Then, we loaded those to a __Amazon Redshift__ cluster using an __Apache Airflow__ pipeline that transfer and check the quality of the data to finally provide our customers a data mart for their convenient analysis.\n", 43 | "\n", 44 | "![Architecture](images/architecture.png)\n", 45 | "\n", 46 | "The main information and questions a user may want to extract from the data mart would be:\n", 47 | "\n", 48 | "* Visitors by nationality.\n", 49 | "* Visitors by origin.\n", 50 | "* Visitors by airline.\n", 51 | "* Correlations between destination in the U.S and the source country.\n", 52 | "* Correlations between destination in the U.S and source climates.\n", 53 | "* Correlations between immigration by source region, and the source region temperature.\n", 54 | "* Correlations between visitor demographics, and states visited.\n", 55 | "\n", 56 | "***" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Step 2: Explore and Assess the Data\n", 64 | "\n", 65 | "_To familiarize ourselves with the data provided by Udacity we have done an exhaustive exploratory data analysis ([EDA](https://en.wikipedia.org/wiki/Exploratory_data_analysis)) checking what data would be useful and what preprocessing steps we should take in order to clean, organize and join the various datasets in a meaningful data model._\n", 66 | "\n", 67 | "In the following sections we briefly describe the datasets provided and give a summarized idea on the reasons we took into consideration when deciding what data to use.\n", 68 | "\n", 69 | "__Immigration Data__\n", 70 | "\n", 71 | "For decades, U.S. immigration officers issued the I-94 Form (Arrival/Departure Record) to foreign visitors (e.g., business visitors, tourists and foreign students) who lawfully entered the United States. The I-94 was a small white paper form that a foreign visitor received from cabin crews on arrival flights and from U.S. Customs and Border Protection at the time of entry into the United States. It listed the traveler's immigration category, port of entry, data of entry into the United States, status expiration date and had a unique 11-digit identifying number assigned to it. Its purpose was to record the traveler's lawful admission to the United States.\n", 72 | "\n", 73 | "This is the main dataset and there is a file for each month of the year of 2016 available in the directory `../../data/18-83510-I94-Data-2016/` in the [SAS](https://www.sas.com/en_us/home.html) binary database storage format `sas7bdat`. Combined, the 12 datasets have got more than 40 million rows (40.790.529) and 28 columns. For most of the work we used only the month of April of 2016 which has more than three million records (3.096.313)." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 1, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Importing the libraries needed in this project\n", 83 | "import os\n", 84 | "import pandas as pd\n", 85 | "from datetime import datetime" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 2, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "immigration_fname = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'\n", 95 | "immigration = pd.read_sas(immigration_fname, 'sas7bdat', encoding=\"ISO-8859-1\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 3, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/html": [ 106 | "
\n", 107 | "\n", 120 | "\n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | "
cicidi94yri94moni94citi94resi94portarrdatei94modei94addrdepdate...entdepumatflagbiryeardtaddtogenderinsnumairlineadmnumfltnovisatype
06.02016.04.0692.0692.0XXX20573.0NaNNaNNaN...UNaN1979.010282016NaNNaNNaN1.897628e+09NaNB2
17.02016.04.0254.0276.0ATL20551.01.0ALNaN...YNaN1991.0D/SMNaNNaN3.736796e+0900296F1
215.02016.04.0101.0101.0WAS20545.01.0MI20691.0...NaNM1961.009302016MNaNOS6.666432e+0893B2
316.02016.04.0101.0101.0NYC20545.01.0MA20567.0...NaNM1988.009302016NaNNaNAA9.246846e+1000199B2
417.02016.04.0101.0101.0NYC20545.01.0MA20567.0...NaNM2012.009302016NaNNaNAA9.246846e+1000199B2
\n", 270 | "

5 rows × 28 columns

\n", 271 | "
" 272 | ], 273 | "text/plain": [ 274 | " cicid i94yr i94mon i94cit i94res i94port arrdate i94mode i94addr \\\n", 275 | "0 6.0 2016.0 4.0 692.0 692.0 XXX 20573.0 NaN NaN \n", 276 | "1 7.0 2016.0 4.0 254.0 276.0 ATL 20551.0 1.0 AL \n", 277 | "2 15.0 2016.0 4.0 101.0 101.0 WAS 20545.0 1.0 MI \n", 278 | "3 16.0 2016.0 4.0 101.0 101.0 NYC 20545.0 1.0 MA \n", 279 | "4 17.0 2016.0 4.0 101.0 101.0 NYC 20545.0 1.0 MA \n", 280 | "\n", 281 | " depdate ... entdepu matflag biryear dtaddto gender insnum \\\n", 282 | "0 NaN ... U NaN 1979.0 10282016 NaN NaN \n", 283 | "1 NaN ... Y NaN 1991.0 D/S M NaN \n", 284 | "2 20691.0 ... NaN M 1961.0 09302016 M NaN \n", 285 | "3 20567.0 ... NaN M 1988.0 09302016 NaN NaN \n", 286 | "4 20567.0 ... NaN M 2012.0 09302016 NaN NaN \n", 287 | "\n", 288 | " airline admnum fltno visatype \n", 289 | "0 NaN 1.897628e+09 NaN B2 \n", 290 | "1 NaN 3.736796e+09 00296 F1 \n", 291 | "2 OS 6.666432e+08 93 B2 \n", 292 | "3 AA 9.246846e+10 00199 B2 \n", 293 | "4 AA 9.246846e+10 00199 B2 \n", 294 | "\n", 295 | "[5 rows x 28 columns]" 296 | ] 297 | }, 298 | "execution_count": 3, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "immigration.head()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "__Data Dictionary__: Here, we describe the various fields of the dataset. Some descriptions were not clear enough so we had to make assumptions about the meaning." 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "| Column Name | Description |\n", 319 | "| :--- | :--- |\n", 320 | "| CICID* | ID that uniquely identify one record in the dataset |\n", 321 | "| I94YR | 4 digit year |\n", 322 | "| I94MON | Numeric month |\n", 323 | "| I94CIT | 3 digit code of source city for immigration (Born country) |\n", 324 | "| I94RES | 3 digit code of source country for immigration (Residence country) |\n", 325 | "| I94PORT | Port addmitted through |\n", 326 | "| ARRDATE | Arrival date in the USA |\n", 327 | "| I94MODE | Mode of transportation (1 = Air; 2 = Sea; 3 = Land; 9 = Not reported) |\n", 328 | "| I94ADDR | State of arrival |\n", 329 | "| DEPDATE | Departure date |\n", 330 | "| I94BIR | Age of Respondent in Years |\n", 331 | "| I94VISA | Visa codes collapsed into three categories: (1 = Business; 2 = Pleasure; 3 = Student) |\n", 332 | "| COUNT | Used for summary statistics |\n", 333 | "| DTADFILE | Character Date Field |\n", 334 | "| VISAPOST | Department of State where where Visa was issued |\n", 335 | "| OCCUP | Occupation that will be performed in U.S. |\n", 336 | "| ENTDEPA | Arrival Flag. Whether admitted or paroled into the US |\n", 337 | "| ENTDEPD | Departure Flag. Whether departed, lost visa, or deceased |\n", 338 | "| ENTDEPU | Update Flag. Update of visa, either apprehended, overstayed, or updated to PR |\n", 339 | "| MATFLAG | Match flag |\n", 340 | "| BIRYEAR | 4 digit year of birth |\n", 341 | "| DTADDTO | Character date field to when admitted in the US |\n", 342 | "| GENDER | Gender |\n", 343 | "| INSNUM | INS number |\n", 344 | "| AIRLINE | Airline used to arrive in U.S. |\n", 345 | "| ADMNUM | Admission number, should be unique and not nullable |\n", 346 | "| FLTNO | Flight number of Airline used to arrive in U.S. |\n", 347 | "| VISATYPE | Class of admission legally admitting the non-immigrant to temporarily stay in U.S. 
|" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "The immigration dataset is our fact so that will be at the center of the star schema model of our data warehouse." 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "__Global Temperature Data__\n", 362 | "\n", 363 | "There are a range of organizations that collate climate trends data. The three most cited land and ocean temperature data sets are NOAA’s MLOST, NASA’s GISTEMP and the UK’s HadCrut.\n", 364 | "\n", 365 | "The Berkeley Earth, which is affiliated with Lawrence Berkeley National Laboratory, has repackaged the data from a newer compilation put it all together. The Berkeley Earth Surface Temperature Study combines 1.6 billion temperature reports from 16 pre-existing archives. It is nicely packaged and allows for slicing into interesting subsets (for example by country). They publish the source data and the code for the transformations they applied. They also use methods that allow weather observations from shorter time series to be included, meaning fewer observations need to be thrown away.\n", 366 | "\n", 367 | "In the original dataset from [Kaggle](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data), several files are available but in this capstone project we will be using only the `GlobalLandTemperaturesByCity`." 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 6, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "temperature_fname = '../../data2/GlobalLandTemperaturesByCity.csv'\n", 377 | "world_temperature = pd.read_csv(temperature_fname)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 7, 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "data": { 387 | "text/html": [ 388 | "
\n", 389 | "\n", 402 | "\n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | "
dtAverageTemperatureAverageTemperatureUncertaintyCityCountryLatitudeLongitude
01743-11-016.0681.737ÅrhusDenmark57.05N10.33E
11743-12-01NaNNaNÅrhusDenmark57.05N10.33E
21744-01-01NaNNaNÅrhusDenmark57.05N10.33E
31744-02-01NaNNaNÅrhusDenmark57.05N10.33E
41744-03-01NaNNaNÅrhusDenmark57.05N10.33E
\n", 468 | "
" 469 | ], 470 | "text/plain": [ 471 | " dt AverageTemperature AverageTemperatureUncertainty City \\\n", 472 | "0 1743-11-01 6.068 1.737 Århus \n", 473 | "1 1743-12-01 NaN NaN Århus \n", 474 | "2 1744-01-01 NaN NaN Århus \n", 475 | "3 1744-02-01 NaN NaN Århus \n", 476 | "4 1744-03-01 NaN NaN Århus \n", 477 | "\n", 478 | " Country Latitude Longitude \n", 479 | "0 Denmark 57.05N 10.33E \n", 480 | "1 Denmark 57.05N 10.33E \n", 481 | "2 Denmark 57.05N 10.33E \n", 482 | "3 Denmark 57.05N 10.33E \n", 483 | "4 Denmark 57.05N 10.33E " 484 | ] 485 | }, 486 | "execution_count": 7, 487 | "metadata": {}, 488 | "output_type": "execute_result" 489 | } 490 | ], 491 | "source": [ 492 | "world_temperature.head()" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "__Data Dictionary__\n", 500 | "\n", 501 | "| Column Name | Description |\n", 502 | "| :--- | :--- |\n", 503 | "| dt | Date in format YYYY-MM-DD |\n", 504 | "| AverageTemperature | Average temperature of the city in a given date |\n", 505 | "| City | City Name |\n", 506 | "| Country | Country Name |\n", 507 | "| Latitude | Latitude |\n", 508 | "| Longitude | Longitude |" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "The dataset provides a long period of the world's temperature (from year 1743 to 2013). However, since the immigration dataset only has data of the US National Tourism Office in the year of 2016, the vast majority of the data here seems not to be suitable. We then decided to aggregate this dataset by country, averaging the temperatures and use this reduced table to join with `lookup\\I94CIT_I94RES.csv` lookup table (extracted from `I94_SAS_Labels_Descriptions.SAS`) resulting in the COUNTRY dimension of our model.\n", 516 | "\n", 517 | "> If we had temperatures of the year 2016 we could have provided an interesting analysis crossing the two tables (immigration and temperatures) in order to see how the waves of immigration to the US relate to the changes in the temperature. But this is just unfeasible due to the different dates." 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 8, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "world_temperature = world_temperature.groupby([\"Country\"]).agg({\"AverageTemperature\": \"mean\", \n", 527 | " \"Latitude\": \"first\", \"Longitude\": \"first\"}).reset_index()" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 9, 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "text/html": [ 538 | "
\n", 539 | "\n", 552 | "\n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | "
CountryAverageTemperatureLatitudeLongitude
0Afghanistan13.81649736.17N69.61E
1Albania15.52582840.99N19.17E
2Algeria17.76320636.17N3.98E
3Angola21.75971612.05S13.15E
4Argentina16.99921639.38S62.43W
\n", 600 | "
" 601 | ], 602 | "text/plain": [ 603 | " Country AverageTemperature Latitude Longitude\n", 604 | "0 Afghanistan 13.816497 36.17N 69.61E\n", 605 | "1 Albania 15.525828 40.99N 19.17E\n", 606 | "2 Algeria 17.763206 36.17N 3.98E\n", 607 | "3 Angola 21.759716 12.05S 13.15E\n", 608 | "4 Argentina 16.999216 39.38S 62.43W" 609 | ] 610 | }, 611 | "execution_count": 9, 612 | "metadata": {}, 613 | "output_type": "execute_result" 614 | } 615 | ], 616 | "source": [ 617 | "world_temperature.head()" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "__Airports Data__\n", 625 | "\n", 626 | "The airport codes may refer to either [IATA](https://en.wikipedia.org/wiki/IATA_airport_code) airport code, a three-letter code which is used in passenger reservation, ticketing and baggage-handling systems, or the [ICAO](https://en.wikipedia.org/wiki/ICAO_airport_code) airport code which is a four letter code used by ATC systems and for airports that do not have an IATA airport code (from wikipedia).\n", 627 | "\n", 628 | "Airport codes from around the world. Downloaded from public domain source http://ourairports.com/data/ who compiled this data from multiple different sources.\n", 629 | "\n", 630 | "`airport-codes.csv` contains the list of all airport codes, the attributes are identified in datapackage description. Some of the columns contain attributes identifying airport locations, other codes (IATA, local if exist) that are relevant to identification of an airport.\n", 631 | "Original source url is http://ourairports.com/data/airports.csv (stored in archive/data.csv)." 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 10, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "airport = pd.read_csv(\"airport-codes_csv.csv\")" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 11, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "data": { 650 | "text/html": [ 651 | "
\n", 652 | "\n", 665 | "\n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | "
identtypenameelevation_ftcontinentiso_countryiso_regionmunicipalitygps_codeiata_codelocal_codecoordinates
000AheliportTotal Rf Heliport11.0NaNUSUS-PABensalem00ANaN00A-74.93360137939453, 40.07080078125
100AAsmall_airportAero B Ranch Airport3435.0NaNUSUS-KSLeoti00AANaN00AA-101.473911, 38.704022
200AKsmall_airportLowell Field450.0NaNUSUS-AKAnchor Point00AKNaN00AK-151.695999146, 59.94919968
300ALsmall_airportEpps Airpark820.0NaNUSUS-ALHarvest00ALNaN00AL-86.77030181884766, 34.86479949951172
400ARclosedNewport Hospital & Clinic Heliport237.0NaNUSUS-ARNewportNaNNaNNaN-91.254898, 35.6087
\n", 761 | "
" 762 | ], 763 | "text/plain": [ 764 | " ident type name elevation_ft \\\n", 765 | "0 00A heliport Total Rf Heliport 11.0 \n", 766 | "1 00AA small_airport Aero B Ranch Airport 3435.0 \n", 767 | "2 00AK small_airport Lowell Field 450.0 \n", 768 | "3 00AL small_airport Epps Airpark 820.0 \n", 769 | "4 00AR closed Newport Hospital & Clinic Heliport 237.0 \n", 770 | "\n", 771 | " continent iso_country iso_region municipality gps_code iata_code \\\n", 772 | "0 NaN US US-PA Bensalem 00A NaN \n", 773 | "1 NaN US US-KS Leoti 00AA NaN \n", 774 | "2 NaN US US-AK Anchor Point 00AK NaN \n", 775 | "3 NaN US US-AL Harvest 00AL NaN \n", 776 | "4 NaN US US-AR Newport NaN NaN \n", 777 | "\n", 778 | " local_code coordinates \n", 779 | "0 00A -74.93360137939453, 40.07080078125 \n", 780 | "1 00AA -101.473911, 38.704022 \n", 781 | "2 00AK -151.695999146, 59.94919968 \n", 782 | "3 00AL -86.77030181884766, 34.86479949951172 \n", 783 | "4 NaN -91.254898, 35.6087 " 784 | ] 785 | }, 786 | "execution_count": 11, 787 | "metadata": {}, 788 | "output_type": "execute_result" 789 | } 790 | ], 791 | "source": [ 792 | "airport.head()" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | "__Data Dictionary__\n", 800 | "\n", 801 | "| Column Name | Description |\n", 802 | "| :--- | :--- |\n", 803 | "| ident | Unique identifier |\n", 804 | "| type | Type of the airport |\n", 805 | "| name | Airport Name |\n", 806 | "| elevation_ft | Altitude of the airport |\n", 807 | "| continent | Continent |\n", 808 | "| iso_country | ISO code of the country of the airport |\n", 809 | "| iso_region | ISO code for the region of the airport |\n", 810 | "| municipality | City where the airport is located |\n", 811 | "| gps_code | GPS code of the airport |\n", 812 | "| iata_code | IATA code of the airport |\n", 813 | "| local_code | Local code of the airport |\n", 814 | "| coordinates | GPS coordinates of the airport |" 815 | ] 816 | }, 817 | { 818 | "cell_type": "markdown", 819 | "metadata": {}, 820 | "source": [ 821 | "We are not using the airport dataset in our model. We came to a conclusion that it did not prove to be a good source of analysis once we were not able to join this to the main table immigration. We did not find a valid and consistent key in both tables in order to cross them. None of the codes (ident, gps_code, iata_code or local_code) seemed to match the columns in the immigration fact table." 822 | ] 823 | }, 824 | { 825 | "cell_type": "markdown", 826 | "metadata": {}, 827 | "source": [ 828 | "__U.S. City Demographic Data__\n", 829 | "\n", 830 | "This dataset contains information about the demographics of all US cities and census-designated places with a population greater or equal to 65,000. This data comes from the US Census Bureau's 2015 American Community Survey.\n", 831 | "\n", 832 | "This product uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau." 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": 12, 838 | "metadata": {}, 839 | "outputs": [], 840 | "source": [ 841 | "us_cities_demographics = pd.read_csv(\"us-cities-demographics.csv\", sep=\";\")" 842 | ] 843 | }, 844 | { 845 | "cell_type": "code", 846 | "execution_count": 13, 847 | "metadata": {}, 848 | "outputs": [ 849 | { 850 | "data": { 851 | "text/html": [ 852 | "
\n", 853 | "\n", 866 | "\n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | "
CityStateMedian AgeMale PopulationFemale PopulationTotal PopulationNumber of VeteransForeign-bornAverage Household SizeState CodeRaceCount
0Silver SpringMaryland33.840601.041862.0824631562.030908.02.60MDHispanic or Latino25924
1QuincyMassachusetts41.044129.049500.0936294147.032935.02.39MAWhite58723
2HooverAlabama38.538040.046799.0848394819.08229.02.58ALAsian4759
3Rancho CucamongaCalifornia34.588127.087105.01752325821.033878.03.18CABlack or African-American24437
4NewarkNew Jersey34.6138040.0143873.02819135829.086253.02.73NJWhite76402
\n", 962 | "
" 963 | ], 964 | "text/plain": [ 965 | " City State Median Age Male Population \\\n", 966 | "0 Silver Spring Maryland 33.8 40601.0 \n", 967 | "1 Quincy Massachusetts 41.0 44129.0 \n", 968 | "2 Hoover Alabama 38.5 38040.0 \n", 969 | "3 Rancho Cucamonga California 34.5 88127.0 \n", 970 | "4 Newark New Jersey 34.6 138040.0 \n", 971 | "\n", 972 | " Female Population Total Population Number of Veterans Foreign-born \\\n", 973 | "0 41862.0 82463 1562.0 30908.0 \n", 974 | "1 49500.0 93629 4147.0 32935.0 \n", 975 | "2 46799.0 84839 4819.0 8229.0 \n", 976 | "3 87105.0 175232 5821.0 33878.0 \n", 977 | "4 143873.0 281913 5829.0 86253.0 \n", 978 | "\n", 979 | " Average Household Size State Code Race Count \n", 980 | "0 2.60 MD Hispanic or Latino 25924 \n", 981 | "1 2.39 MA White 58723 \n", 982 | "2 2.58 AL Asian 4759 \n", 983 | "3 3.18 CA Black or African-American 24437 \n", 984 | "4 2.73 NJ White 76402 " 985 | ] 986 | }, 987 | "execution_count": 13, 988 | "metadata": {}, 989 | "output_type": "execute_result" 990 | } 991 | ], 992 | "source": [ 993 | "us_cities_demographics.head()" 994 | ] 995 | }, 996 | { 997 | "cell_type": "markdown", 998 | "metadata": {}, 999 | "source": [ 1000 | "__Data Dictionary__\n", 1001 | "\n", 1002 | "| Column Name | Description |\n", 1003 | "| :--- | :--- |\n", 1004 | "| City | Name of the city |\n", 1005 | "| State | US state of the city |\n", 1006 | "| Median Age | The median of the age of the population |\n", 1007 | "| Male Population | Number of the male population |\n", 1008 | "| Female Population | Number of the female population |\n", 1009 | "| Total Population | Number of the total population |\n", 1010 | "| Number of Veterans | Number of veterans living in the city |\n", 1011 | "| Foreign-born | Number of residents of the city that were not born in the city |\n", 1012 | "| Average Household Size | Average size of the houses in the city |\n", 1013 | "| State Code | Code of the state of the city |\n", 1014 | "| Race | Race class |\n", 1015 | "| Count | Number of individual of each race |" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "markdown", 1020 | "metadata": {}, 1021 | "source": [ 1022 | "The `US Cities Demographics` is the source of the STATE dimension in our data model. We aggregated the dataset by State and pivoted the `Race` and `Count` columns in order to make each different value of Race to be a column. That way we create a complete table of statistics that summarizes the information for every US state." 1023 | ] 1024 | }, 1025 | { 1026 | "cell_type": "markdown", 1027 | "metadata": {}, 1028 | "source": [ 1029 | "## Step 3: Define the Data Model\n", 1030 | "\n", 1031 | "_In this section of the documentation we detail the process of extract, transform and load the data from the various datasets. As me mentioned before, we are using 3 of the 4 data sources provided by the Udacity team: immigration, temperatures and demographics. Also, we extract descriptions from labels descriptions file `I94_SAS_Labels_Descriptions.SAS`_\n", 1032 | "\n", 1033 | "#### 3.1 Conceptual Data Model\n", 1034 | "_Map out the conceptual data model and explain why you chose that model_\n", 1035 | "\n", 1036 | "The immigration dataset is the origin of the center of our model. As this represent the facts of what we want to analyse - U.S visitors from the world -, this was transformed to the fact table IMMIGRATION as represented in the schema below. We gave this data most of the focus during our modeling phase. 
The immigration dataset is also the data source for the DATE dimension table. We extracted all the distinct values of the columns arrdate and depdate and applied various functions to store in the table a number of attributes of a particular date: day, month, year, week of year and day of week.\n", 1037 | "\n", 1038 | "![Star-Schema](images/star-schema.PNG)\n", 1039 | "\n", 1040 | "The STATE dimension table is the result of aggregating the demographics dataset by the State column. Median Age, Male Population, Female Population, Total Population, Number of Veterans and Foreign-born were first aggregated by `City` using the `first` function, since they are repeated across the different rows of the same city. Then, we grouped the resulting rows by `State`, applying the `sum` function to the numeric columns to obtain a consolidated total for each U.S. state. We also needed to transform the column `Race` so that each of its distinct values becomes a separate column. We achieved this by using the pivot function of the `pyspark` package. As a result, the final structure has the columns (BlackOrAfricanAmerican, White, ForeignBorn, AmericanIndianAndAlaskaNative, HispanicOrLatino, Asian, NumberVeterans, FemalePopulation, MalePopulation, TotalPopulation) for each of the states of the U.S.\n", 1041 | "\n", 1042 | "The COUNTRY dimension completes our star schema model. To get to the structure we see in the figure above, we combined the `GlobalLandTemperaturesByCity` dataset with the code descriptions found in the file `I94_SAS_Labels_Descriptions.SAS` for the columns `i94cit` and `i94res`, shown in the image below.\n", 1043 | "First, we extracted the key-value pairs from the `I94_SAS_Labels_Descriptions.SAS` and saved them in csv files in the `lookup` directory. Next, we aggregated the temperature dataset by `City` and then by `Country`. Finally, we joined the two intermediate results to form the table COUNTRY. \n", 1044 | "\n", 1045 | "![i94cit](images/i94cit.PNG)" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "markdown", 1050 | "metadata": {}, 1051 | "source": [ 1052 | "#### 3.2 Mapping Out Data Pipelines\n", 1053 | "_List the steps necessary to pipeline the data into the chosen data model_\n", 1054 | "\n", 1055 | "\n", 1056 | "To accomplish all the tasks related to the preprocessing of the datasets, we developed a number of functions in a package we called `helper.etl`. There you will find helper functions to load, select, clean, transform and store the resulting datasets in a very convenient way. The open-source framework Apache Spark was the main tool in this journey: Spark provides an interface for programming entire clusters with implicit data parallelism and fault tolerance.\n", 1057 | "\n", 1058 | "We concentrated all the preprocessing logic there so that only the general steps of the ETL are presented here. This notebook is only for documentation purposes, whereas the actual ETL run takes place in Spark on the cloud-native big data platform [Amazon EMR](https://aws.amazon.com/emr/?nc1=h_ls), through the execution of the main function of the `etl` package. The documentation of the functions can be found in the docstrings alongside the code in the `helper/etl.py` file."
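To make the transformations described above more concrete, the snippet below is a minimal, self-contained sketch of the kind of Spark code `helper/etl.py` performs for the DATE dimension and the STATE pivot. It is an illustration only: the sample rows are taken from the dataset previews shown earlier, and the real package aggregates more columns, so this is not the actual implementation of `etl_immigration_data` or `etl_states_data`.

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("capstone-model-sketch").getOrCreate()

# --- DATE dimension (sketch) -------------------------------------------------
# Assume arrdate/depdate were already converted from SAS day counts to YYYY-MM-DD strings.
immigration = spark.createDataFrame(
    [("2016-04-01", "2016-04-08"), ("2016-04-02", None)],
    ["arrdate", "depdate"],
)

date_dim = (
    immigration.select(F.col("arrdate").alias("date"))
    .union(immigration.select(F.col("depdate").alias("date")))
    .where(F.col("date").isNotNull())
    .distinct()
    .withColumn("date", F.to_date("date"))
    .withColumn("day", F.dayofmonth("date"))
    .withColumn("month", F.month("date"))
    .withColumn("year", F.year("date"))
    .withColumn("weekofyear", F.weekofyear("date"))
    .withColumn("dayofweek", F.dayofweek("date"))
)

# --- STATE dimension pivot (sketch) ------------------------------------------
# One row per (City, State, Race) as in us-cities-demographics.csv;
# sample rows copied from the preview shown in Step 2.
demographics = spark.createDataFrame(
    [("Silver Spring", "Maryland", "MD", "Hispanic or Latino", 25924),
     ("Quincy", "Massachusetts", "MA", "White", 58723),
     ("Hoover", "Alabama", "AL", "Asian", 4759)],
    ["City", "State", "StateCode", "Race", "Count"],
)

state_dim = (
    demographics.groupBy("StateCode", "State")
    .pivot("Race")
    .sum("Count")
)

date_dim.show()
state_dim.show()
```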
1059 | ] 1060 | }, 1061 | { 1062 | "cell_type": "code", 1063 | "execution_count": 3, 1064 | "metadata": {}, 1065 | "outputs": [], 1066 | "source": [ 1067 | "# import the ETL package\n", 1068 | "from helper.etl import create_spark_session, etl_immigration_data, etl_countries_data, etl_states_data" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": 2, 1074 | "metadata": {}, 1075 | "outputs": [], 1076 | "source": [ 1077 | "# create Spark session\n", 1078 | "spark = create_spark_session()" 1079 | ] 1080 | }, 1081 | { 1082 | "cell_type": "markdown", 1083 | "metadata": {}, 1084 | "source": [ 1085 | "#### Immigration and Date datasets\n", 1086 | "The preprocessing of the main dataset immigration starts by loading the data from the SAS file and is completed by generating and the storing of the processed dataframes to a bucket in Amazon S3. In summary, the following tasks are performed throughout the process:\n", 1087 | "* Loading of the immigration file into Spark dataframe. We only load useful columns as we identified them in the EDA phase. In particular we discarded the follouwing fields: 'admnum', 'biryear', 'count', 'dtaddto', 'dtadfile', 'entdepa', 'entdepd', 'entdepu', 'insnum', 'matflag', 'occup', 'visapost';\n", 1088 | "* Though some columns were actually of Integer type, the Spark framework loaded them as double or strings. To correct this we convert those fields to the proper class;\n", 1089 | "* The dates in the immigration dataframe are stored in SAS date format, which is a value that represents the number of days between January 1, 1960, and a specified date. We convert the dates in the dataframe to a string date format in the pattern YYYY-MM-DD;\n", 1090 | "* We drop high missing value columns \"visapost\", \"occup\", \"entdepu\" and \"insnum\";\n", 1091 | "* Creation of `stay` column from calculating the difference in days between the departure (depdate) and arrival (arrdate) date of the visitors. That will be useful to analyse how long is the average stay of visitors and where they tend to stay longer;\n", 1092 | "* From the date columns arrdate and depdate we create a second dataframe DATE;\n", 1093 | "* Save the processed immigration and date dataframes to the Amazon S3 in the parquet format;" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "metadata": {}, 1099 | "source": [ 1100 | "\"etl_immigration\"" 1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "code", 1105 | "execution_count": 3, 1106 | "metadata": {}, 1107 | "outputs": [], 1108 | "source": [ 1109 | "# Perform ETL process for the Immigration dataset generating immigration and date tables and save them in the S3 bucket indicated in the output_path parameters.\n", 1110 | "immigration = etl_immigration_data(spark, input_path='../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat',\n", 1111 | " output_path=\"s3a://data-engineer-capstone/immigration.parquet\",\n", 1112 | " date_output_path=\"s3a://data-engineer-capstone/date.parquet\",\n", 1113 | " input_format = \"com.github.saurfang.sas.spark\", \n", 1114 | " load_size=1000, partitionBy=None, \n", 1115 | " columns_to_save = '*')" 1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "markdown", 1120 | "metadata": {}, 1121 | "source": [ 1122 | "#### Countries dataset\n", 1123 | "The generation of the country dataset starts by loading the data global temperature dataset as well as I94CIT_I94RES lookup table and is completed by generating and the storing of the processed dataframe to a bucket in Amazon S3. 
In summary, the following tasks are performed throughout the process:\n", 1124 | "* Loading of the csv file of the global temperature and I94CIT_I94RES lookup table;\n", 1125 | "* Aggregation of the temperatures dataset by country and rename new columns;\n", 1126 | "* Join the two datasets;\n", 1127 | "* Save the resulting dataset to the staging area in Amazon S3;" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "markdown", 1132 | "metadata": {}, 1133 | "source": [ 1134 | "\"etl_country\"" 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": null, 1140 | "metadata": {}, 1141 | "outputs": [], 1142 | "source": [ 1143 | "# Perform ETL process for the Country table. Generating the Country table and saving it in the S3 bucket indicated in the output_path parameter.\n", 1144 | "countries = etl_countries_data(spark, output_path=e.OUTPUT + \"country.parquet\")" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "markdown", 1149 | "metadata": {}, 1150 | "source": [ 1151 | "#### States dataset\n", 1152 | "The generation of the states dataset starts by loading the data in demographics dataset as well as I94ADDR lookup table and is completed by generating and the storing of the processed dataframe to a bucket in Amazon S3. In summary, the following tasks are performed throughout the process:\n", 1153 | "* Loading of the csv file of the demographics and I94ADDR lookup table;\n", 1154 | "* Aggregation of the demographics dataset by state and rename new columns;\n", 1155 | "* Join the two datasets;\n", 1156 | "* Save the resulting dataset to the staging area in Amazon S3;" 1157 | ] 1158 | }, 1159 | { 1160 | "cell_type": "markdown", 1161 | "metadata": {}, 1162 | "source": [ 1163 | "\"etl_state\"" 1164 | ] 1165 | }, 1166 | { 1167 | "cell_type": "code", 1168 | "execution_count": null, 1169 | "metadata": {}, 1170 | "outputs": [], 1171 | "source": [ 1172 | "# Perform ETL process for the State table. Generating the State table and saving it in the S3 bucket indicated in the output_path parameter.\n", 1173 | "states = etl_states_data(spark, output_path=e.OUTPUT + \"state.parquet\")" 1174 | ] 1175 | }, 1176 | { 1177 | "cell_type": "markdown", 1178 | "metadata": {}, 1179 | "source": [ 1180 | "Once the parquet files are saved in the S3 bucket in the AWS, those are used to load the tables of the same name in the Amazon Redshift. We create the schema by running the SQL script found in `sql/create_tables.sql`. From there, our model is ready to be explored by the customers whether through open query editor in Redshift itself or using a dashboard tool such as Tableau or Power BI." 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "markdown", 1185 | "metadata": {}, 1186 | "source": [ 1187 | "### Step 4: Run Pipelines to Model the Data \n", 1188 | "#### 4.1 Create the data model\n", 1189 | "_Build the data pipelines to create the data model._\n", 1190 | "\n", 1191 | "The whole pipeline can be divided into two stages. The first, where we used spark to load, extracted, transform and store the provided datasets into the AWS S3 staging area. The second stage we take advantage of [Apache Airflow](https://airflow.apache.org/) to build a DAG to extract data from S3 and load them into tables of the same name in Amazon Redshift. 
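For illustration, here is a hedged sketch of the kind of `COPY` statement a parquet staging operator like `StageToRedshiftOperator` typically renders; the real operator lives in `airflow/plugins/operators/stage_redshift.py` and may build the statement differently. The schema, table, bucket and prefix values come from the DAG in `airflow/dags/capstone.py`, while the IAM role ARN is a placeholder.

```python
# Hedged sketch only: not the actual StageToRedshiftOperator implementation.
COPY_SQL_TEMPLATE = """
COPY {schema}.{table}
FROM 's3://{bucket}/{prefix}'
IAM_ROLE '{iam_role}'
FORMAT AS PARQUET;
"""


def render_copy_sql(schema: str, table: str, bucket: str, prefix: str, iam_role: str) -> str:
    """Fill the COPY template for one staged table."""
    return COPY_SQL_TEMPLATE.format(
        schema=schema, table=table, bucket=bucket, prefix=prefix, iam_role=iam_role
    )


if __name__ == "__main__":
    # Values as used by the Immigration_Fact_Table task; the IAM role ARN is hypothetical.
    print(render_copy_sql(
        schema="public",
        table="immigration",
        bucket="data-engineer-capstone",
        prefix="immigration.parquet",
        iam_role="arn:aws:iam::123456789012:role/redshift-s3-read",
    ))
```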
As a final step, we run count checks on the data to ensure completeness.\n", 1192 | "\n", 1193 | "<img src=\"images/pipeline.png\" alt=\"architecture\" width=\"800\"/>" 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "markdown", 1198 | "metadata": {}, 1199 | "source": [ 1200 | "Below we show the pipeline of the second stage, which we developed using Apache Airflow.\n", 1201 | "\n", 1202 | "<img src=\"images/dag.PNG\" alt=\"dag\" width=\"1200\"/>" 1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "markdown", 1207 | "metadata": {}, 1208 | "source": [ 1209 | "The code to build the Airflow pipeline is located in the folder `airflow`. There you will find the code of the DAG itself (file `capstone.py` inside folder `dags`) as well as the two custom operators built for this capstone project in the folder `plugins/operators`: `stage_redshift.py` and `data_quality.py`.\n", 1210 | "\n", 1211 | "The custom operator `StageToRedshiftOperator` was designed to load data in [parquet](https://parquet.apache.org/) format from S3 buckets in AWS and insert the content into a table in AWS Redshift. The operator is configurable through input parameters to work with different buckets and tables, so it is used in our DAG to load both the fact and the dimension tables into Redshift.\n", 1212 | "\n", 1213 | "#### 4.2 Data Quality Checks\n", 1214 | "\n", 1215 | "First, we load the `IMMIGRATION` fact table through the step `Immigration_Fact_Table`, which is followed by the steps that load the dimension tables `STATE`, `DATE` and `COUNTRY`, respectively `State_Dimension_Table`, `Date_Dimension_Table` and `Country_Dimension_Table`. All the tables have a PK constraint that uniquely identifies the records, and the fact table has FK constraints that guarantee that the values in the fact are present in the dimension tables.\n", 1216 | "\n", 1217 | "After completing the loading process, we perform a data quality check through the step `Data_Quality_Checks` to make sure everything went well. In this check we verify that every table of the model was actually loaded by running a count check on each of them." 1218 | ] 1219 | }, 1220 | { 1221 | "cell_type": "markdown", 1222 | "metadata": {}, 1223 | "source": [ 1224 | "#### 4.3 Data Dictionary \n", 1225 | "_Create a data dictionary for your data model. For each field, provide a brief description of what the data is and where it came from. You can include the data dictionary in the notebook or in a separate file._\n", 1226 | "\n", 1227 | "\n", 1228 | "__Table IMMIGRATION__\n", 1229 | "\n", 1230 | "| Column Name | Description |\n", 1231 | "| :--- | :--- |\n", 1232 | "| CICID | Primary Key |\n", 1233 | "| I94YR | Year |\n", 1234 | "| I94MON | Month |\n", 1235 | "| I94CIT | 3-digit code of the country where the visitor was born. This is an FK to the COUNTRY dimension table |\n", 1236 | "| I94RES | 3-digit code of the country where the visitor resides. This is an FK to the COUNTRY dimension table |\n", 1237 | "| ARRDATE | Arrival date in the USA. This is an FK to the DATE dimension table |\n", 1238 | "| I94MODE | Mode of transportation (1 = Air; 2 = Sea; 3 = Land; 9 = Not reported) |\n", 1239 | "| I94ADDR | State of arrival. This is an FK to the STATE dimension table |\n", 1240 | "| DEPDATE | Departure date from the USA. This is an FK to the DATE dimension table |\n", 1241 | "| I94BIR | Age of respondent in years |\n", 1242 | "| I94VISA | Visa codes collapsed into three categories: (1 = Business; 2 = Pleasure; 3 = Student) |\n", 1243 | "| BIRYEAR | 4 digit year of birth |\n", 1244 | "| GENDER | Gender |\n", 1245 | "| AIRLINE | Airline used to arrive in the U.S. 
|\n", 1246 | "| FLTNO | Flight number of Airline used to arrive in U.S. |\n", 1247 | "| VISATYPE | Class of admission legally admitting the non-immigrant to temporarily stay in U.S. |\n", 1248 | "| STAY | Number of days in the US |\n", 1249 | "\n", 1250 | "\n", 1251 | "__Table STATE__\n", 1252 | "\n", 1253 | "| Column Name | Description |\n", 1254 | "| :--- | :--- |\n", 1255 | "| Code | Primary Key. This is the code of the State as in I94ADDR lookup table |\n", 1256 | "| State | Name of the state |\n", 1257 | "| BlackOrAfricanAmerican | Number of residents of the race Black Or African American |\n", 1258 | "| White | Number of residents of the race White |\n", 1259 | "| ForeignBorn | Number of residents that born outside th United States |\n", 1260 | "| AmericanIndianAndAlaskaNative | Number of residents of the race American Indian And Alaska Native |\n", 1261 | "| HispanicOrLatino | Number of residents of the race Hispanic Or Latino |\n", 1262 | "| Asian | Number of residents of the race Asian |\n", 1263 | "| NumberVeterans | Number of residents that are war veterans |\n", 1264 | "| FemalePopulation | Number of female population |\n", 1265 | "| MalePopulation | Number of male population |\n", 1266 | "| TotalPopulation | Number total of the population |\n", 1267 | "\n", 1268 | "\n", 1269 | "__Table COUNTRY__\n", 1270 | "\n", 1271 | "| Column Name | Description |\n", 1272 | "| :--- | :--- |\n", 1273 | "| Code | Country Code. This is the PK. |\n", 1274 | "| Country | Country Name |\n", 1275 | "| Temperature | Average temperature of the country between 1743 and 2013 |\n", 1276 | "| Latitude | GPS Latitude |\n", 1277 | "| Longitude | GPS Longitude |\n", 1278 | "\n", 1279 | "\n", 1280 | "__Table DATE__\n", 1281 | "\n", 1282 | "| Column Name | Description |\n", 1283 | "| :--- | :--- |\n", 1284 | "| date | Date in the format YYYY-MM-DD. This is the PK. |\n", 1285 | "| day | Two digit day |\n", 1286 | "| month | Two digit month |\n", 1287 | "| year | Four digit for the year |\n", 1288 | "| weekofyear | The week of the year |\n", 1289 | "| dayofweek | The day of the week |" 1290 | ] 1291 | }, 1292 | { 1293 | "cell_type": "markdown", 1294 | "metadata": {}, 1295 | "source": [ 1296 | "#### Step 5: Complete Project Write Up\n", 1297 | "__Clearly state the rationale for the choice of tools and technologies for the project.__\n", 1298 | "\n", 1299 | "The whole solution implemented here is mounted on top of cloud computing technology, AWS in particular. Because the cloud computing provides a low-cost, scalable, and highly reliable infrastructure platform in the cloud this is a natural choice for every new solution like we did here. Every service we use (S3, EMR, Redshift) has reasonable cost and is ‘pay as you go’ pricing. So we can start small and scale as our solution grows. No up-front costs involved.\n", 1300 | "\n", 1301 | "In particular, why we use the following services:\n", 1302 | "\n", 1303 | "__S3:__ Provides a relatively cheap, easy-to-use with scalability, high availability, security, and performance. This seems to be perfect to a staging area like our solution here;\n", 1304 | "\n", 1305 | "__Spark:__ This is simply the best framework for big data processing, with built-in modules for streaming, SQL, machine learning and graph processing. Spark provides an interface for programming entire clusters with implicit data parallelism and fault tolerance. 
Most of our team are pythonians and Spark has a very convenient API for python programmers to use;\n", 1306 | "\n", 1307 | "__EMR:__ This is a cloud-native big data platform, allowing teams to process vast amounts of data quickly, and cost-effectively at scale using Spark. EMR is easy to use, secure, elastic and low-cost. Perfect to our project;\n", 1308 | "\n", 1309 | "__Redshift:__ A natural and logical choice since we based all the solution in the cloud in AWS. Redshift provides a massively parallel, column-oriented data warehouse that provides easy-scale functionality. The main analytical tools have native interface to load from Redshift.\n", 1310 | "\n", 1311 | "\n", 1312 | "__Propose how often the data should be updated and why__\n", 1313 | "\n", 1314 | "Since we receive one file per month it seems reasonable to update the model monthly.\n", 1315 | "\n", 1316 | "__Write a description of how you would approach the problem differently under the following scenarios:__\n", 1317 | "\n", 1318 | " * The data was increased by 100x:\n", 1319 | "\n", 1320 | "Scaling the whole pipeline should not be a problem at all. Since the whole solution is on top of Amazon cloud, that are easily scalable, the only thing we would need to do is increase the number of nodes of the clusters in EMR to hadle more data. Also, Amazon Redshift is a data warehouse that can expand to exabyte-scale;\n", 1321 | " \n", 1322 | "* The data populates a dashboard that must be updated on a daily basis by 7am every day.\n", 1323 | "\n", 1324 | "The runnig interval of the Airflow DAG could be changed to daily and scheduled to run overnight to make the data available y 7am.\n", 1325 | " \n", 1326 | "* The database needed to be accessed by 100+ people.\n", 1327 | " \n", 1328 | "Again, not a big problem. With Redshift we can make use of the feature \"elastic resize\" that enables us to add or remove nodes in an Amazon Redshift cluster in minutes. This further increases the agility to get better performance and more storage for demanding workloads, and to reduce cost during periods of low demand." 1329 | ] 1330 | }, 1331 | { 1332 | "cell_type": "code", 1333 | "execution_count": null, 1334 | "metadata": {}, 1335 | "outputs": [], 1336 | "source": [] 1337 | } 1338 | ], 1339 | "metadata": { 1340 | "kernelspec": { 1341 | "display_name": "Python 3", 1342 | "language": "python", 1343 | "name": "python3" 1344 | }, 1345 | "language_info": { 1346 | "codemirror_mode": { 1347 | "name": "ipython", 1348 | "version": 3 1349 | }, 1350 | "file_extension": ".py", 1351 | "mimetype": "text/x-python", 1352 | "name": "python", 1353 | "nbconvert_exporter": "python", 1354 | "pygments_lexer": "ipython3", 1355 | "version": "3.6.3" 1356 | } 1357 | }, 1358 | "nbformat": 4, 1359 | "nbformat_minor": 4 1360 | } 1361 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Capstone Project 2 | 3 | ## Overview 4 | 5 | The purpose of the data engineering capstone project is to give you a chance to combine what you've learned throughout the program. This project will be an important part of your portfolio that will help you achieve your data engineering-related career goals. 6 | 7 | In this project, you can choose to complete the project provided for you, or define the scope and data for a project of your own design. Either way, you'll be expected to go through the same steps outlined below. 
8 | 9 | ## Udacity Provided Project 10 | 11 | In the Udacity provided project, you'll work with four datasets to complete the project. The main dataset will include data on immigration to the United States, and supplementary datasets will include data on airport codes, U.S. city demographics, and temperature data. You're also welcome to enrich the project with additional data if you'd like to set your project apart. 12 | 13 | ## Open-Ended Project 14 | 15 | If you decide to design your own project, you can find useful information in the Project Resources section. Rather than go through steps below with the data Udacity provides, you'll gather your own data, and go through the same process. 16 | 17 | ## Instructions 18 | 19 | To help guide your project, we've broken it down into a series of steps. 20 | 21 | ### Step 1: Scope the Project and Gather Data 22 | 23 | Since the scope of the project will be highly dependent on the data, these two things happen simultaneously. In this step, you’ll: 24 | 25 | * Identify and gather the data you'll be using for your project (at least two sources and more than 1 million rows). See Project Resources for ideas of what data you can use. 26 | * Explain what end use cases you'd like to prepare the data for (e.g., analytics table, app back-end, source-of-truth database, etc.) 27 | 28 | ### Step 2: Explore and Assess the Data 29 | 30 | * Explore the data to identify data quality issues, like missing values, duplicate data, etc. 31 | * Document steps necessary to clean the data 32 | 33 | 34 | ### Step 3: Define the Data Model 35 | 36 | * Map out the conceptual data model and explain why you chose that model 37 | * List the steps necessary to pipeline the data into the chosen data model 38 | 39 | 40 | ### Step 4: Run ETL to Model the Data 41 | 42 | * Create the data pipelines and the data model 43 | * Include a data dictionary 44 | * Run data quality checks to ensure the pipeline ran as expected 45 | * Integrity constraints on the relational database (e.g., unique key, data type, etc.) 46 | * Unit tests for the scripts to ensure they are doing the right thing 47 | * Source/count checks to ensure completeness 48 | 49 | ### Step 5: Complete Project Write Up 50 | 51 | * What's the goal? What queries will you want to run? How would Spark or Airflow be incorporated? Why did you choose the model you chose? 52 | * Clearly state the rationale for the choice of tools and technologies for the project. 53 | * Document the steps of the process. 54 | * Propose how often the data should be updated and why. 55 | * Post your write-up and final data model in a GitHub repo. 56 | * Include a description of how you would approach the problem differently under the following scenarios: 57 | * If the data was increased by 100x. 58 | * If the pipelines were run on a daily basis by 7am. 59 | * If the database needed to be accessed by 100+ people. 60 | 61 | ### Rubric 62 | In the [Project Rubric](https://review.udacity.com/#!/rubrics/2497/view), you'll see more detail about the requirements. Use the rubric to assess your own project before you submit to Udacity for review. As with other projects, Udacity reviewers will use this rubric to assess your project and provide feedback. If your project does not meet specifications, you can make changes and resubmit. 
63 | -------------------------------------------------------------------------------- /airflow/dags/capstone.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import os 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators import (StageToRedshiftOperator, DataQualityOperator) 6 | 7 | 8 | default_args = { 9 | 'depends_on_past': False, # The DAG does not have dependencies on past runs 10 | 'owner': 'Fernando Carneiro', 11 | 'retries': 3, # On failure, the task is retried 3 times 12 | 'retry_delay': timedelta(minutes=60), # Retries happen every 60 minutes 13 | 'start_date': datetime(2016, 1, 1), 14 | 'email_on_retry': False, # Do not email on retry 15 | } 16 | 17 | dag = DAG('data_engineering_project', 18 | default_args=default_args, 19 | description='Load and transform data in Redshift with Airflow', 20 | schedule_interval='@monthly' 21 | ) 22 | 23 | start_operator = DummyOperator(task_id='Begin', dag=dag) 24 | 25 | immigration_to_redshift = StageToRedshiftOperator( 26 | task_id='Immigration_Fact_Table', 27 | aws_conn_id = 'aws_credentials', 28 | redshift_conn_id = "redshift", 29 | s3_from = 'data-engineer-capstone', 30 | s3_prefix = 'immigration.parquet', 31 | schema_to = 'public', 32 | table_to = 'immigration', 33 | options = ["FORMAT AS PARQUET"], 34 | dag=dag 35 | ) 36 | 37 | country_to_redshift = StageToRedshiftOperator( 38 | task_id='Country_Dimension_Table', 39 | aws_conn_id = 'aws_credentials', 40 | redshift_conn_id = "redshift", 41 | s3_from = 'data-engineer-capstone', 42 | s3_prefix = 'country.parquet', 43 | schema_to = 'public', 44 | table_to = 'country', 45 | options = ["FORMAT AS PARQUET"], 46 | dag=dag 47 | ) 48 | 49 | state_to_redshift = StageToRedshiftOperator( 50 | task_id='State_Dimension_Table', 51 | aws_conn_id = 'aws_credentials', 52 | redshift_conn_id = "redshift", 53 | s3_from = 'data-engineer-capstone', 54 | s3_prefix = 'state.parquet', 55 | schema_to = 'public', 56 | table_to = 'state', 57 | options = ["FORMAT AS PARQUET"], 58 | dag=dag 59 | ) 60 | 61 | date_to_redshift = StageToRedshiftOperator( 62 | task_id='Date_Dimension_Table', 63 | aws_conn_id = 'aws_credentials', 64 | redshift_conn_id = "redshift", 65 | s3_from = 'data-engineer-capstone', 66 | s3_prefix = 'date.parquet', 67 | schema_to = 'public', 68 | table_to = 'date', 69 | options = ["FORMAT AS PARQUET"], 70 | dag=dag 71 | ) 72 | 73 | run_quality_checks = DataQualityOperator( 74 | task_id='Data_Quality_Checks', 75 | redshift_conn_id = "redshift", 76 | tables=['immigration', 'country', 'state', 'date'], 77 | dag=dag 78 | ) 79 | 80 | end_operator = DummyOperator(task_id='End', dag=dag) 81 | 82 | start_operator >> immigration_to_redshift 83 | immigration_to_redshift >> country_to_redshift 84 | immigration_to_redshift >> state_to_redshift 85 | immigration_to_redshift >> date_to_redshift 86 | country_to_redshift >> run_quality_checks 87 | state_to_redshift >> run_quality_checks 88 | date_to_redshift >> run_quality_checks 89 | run_quality_checks >> end_operator 90 | -------------------------------------------------------------------------------- /airflow/plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class
DataQualityOperator(BaseOperator): 6 | 7 | ui_color = '#89DA59' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | # Define your operators params (with defaults) here 12 | redshift_conn_id, 13 | tables, 14 | *args, **kwargs): 15 | 16 | super(DataQualityOperator, self).__init__(*args, **kwargs) 17 | self.redshift_conn_id = redshift_conn_id 18 | self.tables = tables 19 | 20 | def execute(self, context): 21 | self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) 22 | for table in self.tables: 23 | self.log.info(f"Checking table {table}") 24 | records = self.hook.get_records(f"SELECT COUNT(*) FROM {table}") 25 | if len(records) < 1 or len(records[0]) < 1: 26 | raise ValueError(f"Data quality check failed. {table} returned no results") 27 | num_records = records[0][0] 28 | if num_records < 1: 29 | raise ValueError(f"Data quality check failed. {table} contained 0 rows") 30 | self.log.info(f"Data quality on table {table} check passed with {records[0][0]} records") -------------------------------------------------------------------------------- /airflow/plugins/operators/stage_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator, Variable 3 | from airflow.utils.decorators import apply_defaults 4 | from airflow.hooks.S3_hook import S3Hook 5 | from airflow.contrib.hooks.aws_hook import AwsHook 6 | 7 | import datetime 8 | import logging 9 | 10 | class StageToRedshiftOperator(BaseOperator): 11 | ui_color = '#358140' 12 | 13 | @apply_defaults 14 | def __init__(self, 15 | # Define your operators params (with defaults) here 16 | aws_conn_id, 17 | redshift_conn_id, 18 | s3_from, 19 | s3_prefix, 20 | schema_to, 21 | table_to, 22 | options, 23 | *args, **kwargs): 24 | 25 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs) 26 | self.aws_conn_id = aws_conn_id 27 | self.redshift_conn_id = redshift_conn_id 28 | self.s3_from = s3_from 29 | self.s3_prefix = s3_prefix 30 | self.schema = schema_to 31 | self.table = table_to 32 | self.options = options 33 | self.autocommit = True 34 | self.region = 'us-west-2' 35 | 36 | def execute(self, context): 37 | self.log.info('Initializing COPY procedure...') 38 | aws_hook = AwsHook(self.aws_conn_id) 39 | credentials = aws_hook.get_credentials() 40 | self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) 41 | copy_options = '\n\t\t\t'.join(self.options) 42 | 43 | copy_query = """ 44 | COPY {schema}.{table} 45 | FROM 's3://{s3_bucket}/{s3_key}' 46 | IAM_ROLE 'arn:aws:iam::900646315604:role/myRedshiftRole' 47 | {copy_options}; 48 | """.format(schema=self.schema, 49 | table=self.table, 50 | s3_bucket=self.s3_from, 51 | s3_key=self.s3_prefix, 52 | copy_options=copy_options) 53 | 54 | self.log.info(f'Executing COPY command from bucket s3://{self.s3_from}/{self.s3_prefix} to {self.schema}.{self.table} in Redshift') 55 | self.hook.run(copy_query, self.autocommit) 56 | self.log.info("COPY command complete!") -------------------------------------------------------------------------------- /helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/helper/__init__.py -------------------------------------------------------------------------------- /helper/etl.py: -------------------------------------------------------------------------------- 1 | import os, re 2 | 
import configparser 3 | from datetime import timedelta, datetime 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.functions import udf, col, when, lower, isnull, year, month, dayofmonth, hour, weekofyear, dayofweek, date_format, to_date 6 | from pyspark.sql.types import StructField, StructType, IntegerType, DoubleType 7 | 8 | # The date format string preferred to our work here: YYYY-MM-DD 9 | date_format = "%Y-%m-%d" 10 | 11 | # The AWS key id and password are configured in a configuration file "dl.cfg" 12 | config = configparser.ConfigParser() 13 | config.read('dl.cfg') 14 | 15 | # Reads and saves the AWS access key information and saves them in a environment variable 16 | os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID'] 17 | os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY'] 18 | OUTPUT = config['ETL']['OUTPUT_DATA'] 19 | 20 | def create_spark_session(): 21 | """ 22 | This function creates a session with Spark, the entry point to programming Spark with the Dataset and DataFrame API. 23 | """ 24 | spark = SparkSession.builder.config("spark.jars.packages", 25 | "saurfang:spark-sas7bdat:2.0.0-s_2.11,org.apache.hadoop:hadoop-aws:2.7.0")\ 26 | .enableHiveSupport().getOrCreate() 27 | return spark 28 | 29 | def read_data(spark, input_path, input_format = "csv", columns = '*', debug_size = None, **options): 30 | """ 31 | Loads data from a data source using the pyspark module and returns it as a spark 'DataFrame'. 32 | 33 | Args: 34 | spark (:obj:`SparkSession`): Spark session. 35 | Represents the entry point to programming Spark with the Dataset and DataFrame API. 36 | input_path (:obj:`str`): Directory where to find the input files. 37 | input_format (:obj:`str`): Optional string for format of the data source. Default to 'csv'. 38 | columns (:obj:`list`): List of columns of the dataframe to return. Default to "*", which means 'all columns'. 39 | debug_size (int): Define the number of rows to read for debug purposes. The default value None means 'all rows'. 40 | options: All other string options. 41 | """ 42 | if debug_size is None: 43 | df = spark.read.load(input_path, format=input_format, **options).select(columns) 44 | else: 45 | df = spark.read.load(input_path, format=input_format, **options).select(columns).limit(debug_size) 46 | return df 47 | 48 | def save(df, output_path, mode = "overwrite", output_format = "parquet", columns = '*', partitionBy=None, **options): 49 | """ 50 | Saves the contents of the DataFrame to a data source. 51 | 52 | The data source is specified by the format and a set of options. If format is not specified, 'parquet' will be used. 53 | 54 | Args: 55 | df (:obj:`DataFrame`): Spark DataFrame. 56 | output_path (:obj:`str`): The path in a Hadoop supported file system where the DataFrame contentes will be saved. 57 | mode (:obj:`str`): Specifies the behavior of the save operation when data already exists. Default to 'overwrite'. 58 | output_format (:obj:`str`): Optional string for format of the data source to be saved. Default to 'parquet'. 59 | columns (:obj:`list`): List of columns of the dataframe to save. Default to "*", which means 'all columns'. 60 | partitionBy (:obj:`list`): Names of partitioning columns. The default value None means 'no partitions'. 61 | options: All other string options. 
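        Example (illustrative sketch; `immigration` stands for any DataFrame produced earlier in this module):
            save(df=immigration, output_path="out/immigration.parquet", partitionBy=["i94yr", "i94mon"])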
62 | """ 63 | 64 | df.select(columns).write.save(output_path, mode= mode, format=output_format, partitionBy = partitionBy, **options) 65 | 66 | def etl_immigration_data(spark, input_path="immigration_data_sample.csv", output_path="out/immigration.parquet", 67 | date_output_path="out/date.parquet", 68 | input_format = "csv", columns = ['i94addr', 'i94mon','cicid','i94visa','i94res','arrdate','i94yr','depdate', 69 | 'airline', 'fltno', 'i94mode', 'i94port', 'visatype', 'gender', 70 | 'i94cit', 'i94bir'], 71 | load_size = None, partitionBy = ["i94yr", "i94mon"], columns_to_save='*', header=True, **options): 72 | """ 73 | Reads the immigration dataset indicated in the input_path, performs the ETL process and saves it in the output path indicated by the parameter 74 | out_put path. 75 | 76 | Args: 77 | spark (:obj:`SparkSession`): Spark session. 78 | Represents the entry point to programming Spark with the Dataset and DataFrame API. 79 | input_path (:obj:`str`): Directory where to find the input files. 80 | output_path (:obj:`str`): Directory where to save immigration output files. 81 | date_output_path (:obj:`str`): Directory where to save date output files. 82 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value). 83 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful. 84 | load_size (int): Number of rows to read for debug purposes. 85 | partitionBy (:obj:`list`): Files will be saved in partitions using the columns of this list. 86 | columns_to_save (:obj:`list`): Define what columns will be saved. 87 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false. 88 | options: All other string options. 89 | """ 90 | 91 | # Loads the immigration dataframe using Spark 92 | # We discard the columns ['admnum', 'biryear', 'count', 'dtaddto', 'dtadfile', 'entdepa', 'entdepd', 'entdepu', 'insnum', 'matflag', 'occup', 'visapost'] as they seemed not to be very useful for our goals. 93 | # Some of them were very unclear of what they really represent. 
94 | immigration = read_data(spark, input_path=input_path, input_format=input_format, 95 | columns=columns, debug_size = load_size, header=header, **options) 96 | 97 | int_cols = ['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 98 | 'arrdate', 'i94mode', 'i94bir', 'i94visa', 'count', 'biryear', 'dtadfile', 'depdate'] 99 | 100 | date_cols = ['arrdate', 'depdate'] 101 | 102 | high_null = ["visapost", "occup", "entdepu", "insnum"] 103 | not_useful_cols = ["count", "entdepa", "entdepd", "matflag", "dtaddto", "biryear", "admnum"] 104 | 105 | # Convert columns read as string/double to integer 106 | immigration = cast_type(immigration, dict(zip(int_cols, len(int_cols)*[IntegerType()]))) 107 | 108 | # Convert SAS date to a meaningful string date in the format of YYYY-MM-DD 109 | immigration = convert_sas_date(immigration, date_cols) 110 | 111 | # Drop high null columns and not useful columns 112 | immigration = immigration.drop(*high_null) 113 | immigration = immigration.drop(*not_useful_cols) 114 | 115 | # Create a new columns to store the length of the visitor stay in the US 116 | immigration = immigration.withColumn('stay', date_diff_udf(immigration.arrdate, immigration.depdate)) 117 | immigration = cast_type(immigration, {'stay': IntegerType()}) 118 | 119 | # Generate DATE dataframe and save it to the output_path indicated as parameter of the function 120 | if date_output_path is not None: 121 | arrdate = immigration.select('arrdate').distinct() 122 | depdate = immigration.select('depdate').distinct() 123 | dates = arrdate.union(depdate) 124 | dates = dates.withColumn("date", to_date(dates.arrdate, date_format)) 125 | dates = dates.withColumn("year", year(dates.date)) 126 | dates = dates.withColumn("month", month(dates.date)) 127 | dates = dates.withColumn("day", dayofmonth(dates.date)) 128 | dates = dates.withColumn("weekofyear", weekofyear(dates.date)) 129 | dates = dates.withColumn("dayofweek", dayofweek(dates.date)) 130 | dates = dates.drop("date").withColumnRenamed('arrdate', 'date') 131 | save(df=dates.select("date", "year", "month", "day", "weekofyear", "dayofweek"), output_path=date_output_path) 132 | 133 | # Save the processed immigration dataset to the output_path 134 | if output_path is not None: 135 | save(df=immigration.select(columns_to_save), output_path=output_path, partitionBy = partitionBy) 136 | return immigration 137 | 138 | def etl_temperature_data(spark, input_path="../../data2/GlobalLandTemperaturesByCity.csv", output_path="out/temperature.parquet", 139 | input_format = "csv", columns = '*', load_size = None, partitionBy = ["Country", "City"], header=True, **options): 140 | """ 141 | Reads the global temperature dataset indicated in the input_path, performs the ETL process and saves it in the output path indicated by the parameter out_put path. 142 | 143 | Args: 144 | spark (:obj:`SparkSession`): Spark session. 145 | Represents the entry point to programming Spark with the Dataset and DataFrame API. 146 | input_path (:obj:`str`): Directory where to find the input files. 147 | output_path (:obj:`str`): Directory where to save immigration output files. 148 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value). 149 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful. 150 | load_size (int): Number of rows to read for debug purposes. 151 | partitionBy (:obj:`list`): Files will be saved in partitions using the columns of this list. 
152 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false. 153 | options: All other string options. 154 | """ 155 | # Loads the global temperature dataframe using Spark 156 | temperature = read_data(spark, input_path=input_path, input_format=input_format, 157 | columns=columns, debug_size = load_size, header=header, **options) 158 | # Save the temperature dataset to the output_path 159 | save(df=temperature, output_path=output_path, partitionBy = partitionBy) 160 | return temperature 161 | 162 | def etl_airport_data(spark, input_path="airport-codes_csv.csv", output_path="out/airport.parquet", 163 | input_format = "csv", columns = '*', load_size = None, partitionBy = ["iso_country"], header=True, **options): 164 | """ 165 | Reads the airport dataset indicated in the input_path, performs the ETL process and saves it in the output path indicated by the parameter out_put path. 166 | 167 | Args: 168 | spark (:obj:`SparkSession`): Spark session. 169 | Represents the entry point to programming Spark with the Dataset and DataFrame API. 170 | input_path (:obj:`str`): Directory where to find the input files. 171 | output_path (:obj:`str`): Directory where to save immigration output files. 172 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value). 173 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful. 174 | load_size (int): Number of rows to read for debug purposes. 175 | partitionBy (:obj:`list`): Files will be saved in partitions using the columns of this list. 176 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false. 177 | options: All other string options. 178 | """ 179 | # Loads the airport dataframe using Spark 180 | airport = read_data(spark, input_path=input_path, input_format=input_format, 181 | columns=columns, debug_size = load_size, header=header, **options) 182 | # Save the airport dataset to the output_path 183 | save(df=airport, output_path=output_path, partitionBy = partitionBy) 184 | return airport 185 | 186 | def etl_demographics_data(spark, input_path="us-cities-demographics.csv", output_path="out/demographics.parquet", 187 | input_format = "csv", columns='*', 188 | load_size = None, partitionBy = ["State Code"], header=True, sep=";", **options): 189 | """ 190 | Reads the demographics dataset indicated in the input_path, performs the ETL process and saves it in the output path indicated by the parameter out_put path. 191 | 192 | Args: 193 | spark (:obj:`SparkSession`): Spark session. 194 | Represents the entry point to programming Spark with the Dataset and DataFrame API. 195 | input_path (:obj:`str`): Directory where to find the input files. 196 | output_path (:obj:`str`): Directory where to save immigration output files. 197 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value). 198 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful. 199 | load_size (int): Number of rows to read for debug purposes. 200 | partitionBy (:obj:`list`): Files will be saved in partitions using the columns of this list. 201 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false. 202 | options: All other string options. 
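        Example (illustrative sketch; assumes a `spark` session from create_spark_session() and the local us-cities-demographics.csv file; output_path=None returns the DataFrame without saving):
            demographics = etl_demographics_data(spark, input_path="us-cities-demographics.csv", output_path=None)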
203 | """ 204 | # Loads the demographics dataframe using Spark 205 | demographics = read_data(spark, input_path=input_path, input_format=input_format, 206 | columns=columns, debug_size = load_size, header=header, sep=sep, **options) 207 | 208 | # Convert numeric columns to the proper types: Integer and Double 209 | int_cols = ['Count', 'Male Population', 'Female Population', 'Total Population', 'Number of Veterans', 'Foreign-born'] 210 | float_cols = ['Median Age', 'Average Household Size'] 211 | demographics = cast_type(demographics, dict(zip(int_cols, len(int_cols)*[IntegerType()]))) 212 | demographics = cast_type(demographics, dict(zip(float_cols, len(float_cols)*[DoubleType()]))) 213 | 214 | first_agg = {"Median Age": "first", "Male Population": "first", "Female Population": "first", 215 | "Total Population": "first", "Number of Veterans": "first", "Foreign-born": "first", "Average Household Size": "first"} 216 | # First aggregation - City 217 | agg_df = demographics.groupby(["City", "State", "State Code"]).agg(first_agg) 218 | # Pivot Table to transform values of the column Race to different columns 219 | piv_df = demographics.groupBy(["City", "State", "State Code"]).pivot("Race").sum("Count") 220 | 221 | # Rename column names removing the spaces to avoid problems when saving to disk (we got errors when trying to save column names with spaces) 222 | demographics = agg_df.join(other=piv_df, on=["City", "State", "State Code"], how="inner")\ 223 | .withColumnRenamed('first(Total Population)', 'TotalPopulation')\ 224 | .withColumnRenamed('first(Female Population)', 'FemalePopulation')\ 225 | .withColumnRenamed('first(Male Population)', 'MalePopulation')\ 226 | .withColumnRenamed('first(Median Age)', 'MedianAge')\ 227 | .withColumnRenamed('first(Number of Veterans)', 'NumberVeterans')\ 228 | .withColumnRenamed('first(Foreign-born)', 'ForeignBorn')\ 229 | .withColumnRenamed('first(Average Household Size)', 'AverageHouseholdSize')\ 230 | .withColumnRenamed('Hispanic or Latino', 'HispanicOrLatino')\ 231 | .withColumnRenamed('Black or African-American', 'BlackOrAfricanAmerican')\ 232 | .withColumnRenamed('American Indian and Alaska Native', 'AmericanIndianAndAlaskaNative') 233 | 234 | numeric_cols = ['TotalPopulation', 'FemalePopulation', 'MedianAge', 'NumberVeterans', 'ForeignBorn', 'MalePopulation', 'AverageHouseholdSize', 235 | 'AmericanIndianAndAlaskaNative', 'Asian', 'BlackOrAfricanAmerican', 'HispanicOrLatino', 'White'] 236 | # Fill the null values with 0 237 | demographics = demographics.fillna(0, numeric_cols) 238 | 239 | # Save the demographics dataset to the output_path 240 | if output_path is not None: 241 | save(df=demographics, output_path=output_path, partitionBy = partitionBy) 242 | 243 | return demographics 244 | 245 | def etl_states_data(spark, output_path="out/state.parquet"): 246 | cols = ['TotalPopulation', 'FemalePopulation', 'MalePopulation', 'NumberVeterans', 'ForeignBorn', 247 | 'AmericanIndianAndAlaskaNative', 'Asian', 'BlackOrAfricanAmerican', 'HispanicOrLatino', 'White'] 248 | """ 249 | Reads the states dataset indicated in the input_path, performs the ETL process and saves it in the output path indicated by the parameter out_put path. 250 | 251 | Args: 252 | spark (:obj:`SparkSession`): Spark session. 253 | Represents the entry point to programming Spark with the Dataset and DataFrame API. 254 | output_path (:obj:`str`): Directory where to save immigration output files. 
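        Example (illustrative sketch; assumes a `spark` session and the lookup/I94ADDR.csv file relative to the working directory):
            states = etl_states_data(spark, output_path="out/state.parquet")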
255 | """ 256 | # Loads the demographics dataframe using Spark 257 | demographics = etl_demographics_data(spark, output_path=None) 258 | # Aggregates the dataset by State 259 | states = demographics.groupby(["State Code", "State"]).agg(dict(zip(cols, len(cols)*["sum"]))) 260 | # Loads the lookup table I94ADDR 261 | addr = read_data(spark, input_path="lookup/I94ADDR.csv", input_format="csv", columns="*", header=True)\ 262 | .withColumnRenamed('State', 'State Original') 263 | 264 | # Join the two datasets 265 | addr = addr.join(states, states["State Code"] == addr.Code, "left") 266 | addr = addr.withColumn("State", when(isnull(addr["State"]), capitalize_udf(addr['State Original'])).otherwise(addr["State"])) 267 | addr = addr.drop('State Original', 'State Code') 268 | 269 | cols = ['sum(BlackOrAfricanAmerican)', 'sum(White)', 'sum(AmericanIndianAndAlaskaNative)', 270 | 'sum(HispanicOrLatino)', 'sum(Asian)', 'sum(NumberVeterans)', 'sum(ForeignBorn)', 'sum(FemalePopulation)', 271 | 'sum(MalePopulation)', 'sum(TotalPopulation)'] 272 | 273 | # Rename the columns to modify default names returned when Spark aggregates the values of the columns. 274 | # For example: column 'sum(MalePopulation)' becomes 'MalePopulation' 275 | mapping = dict(zip(cols, [re.search(r'\((.*?)\)', c).group(1) for c in cols])) 276 | addr = rename_columns(addr, mapping) 277 | 278 | # Save the resulting dataset to the output_path 279 | if output_path is not None: 280 | save(df=addr, output_path=output_path) 281 | return addr 282 | 283 | def etl_countries_data(spark, input_path="../../data2/GlobalLandTemperaturesByCity.csv", output_path="out/country.parquet", 284 | input_format = "csv", columns = '*', load_size = None, header=True, **options): 285 | """ 286 | Reads the global temperatures dataset indicated in the input_path and transform it to generate the country dataframe. Performs the ETL process and saves it in the output path indicated by the parameter out_put path. 287 | 288 | Args: 289 | spark (:obj:`SparkSession`): Spark session. 290 | Represents the entry point to programming Spark with the Dataset and DataFrame API. 291 | input_path (:obj:`str`): Directory where to find the input files. 292 | output_path (:obj:`str`): Directory where to save immigration output files. 293 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value). 294 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful. 295 | load_size (int): Number of rows to read for debug purposes. 296 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false. 297 | options: All other string options. 
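        Example (illustrative sketch; assumes a `spark` session, the global temperatures CSV and the lookup/I94CIT_I94RES.csv file):
            countries = etl_countries_data(spark, input_path="../../data2/GlobalLandTemperaturesByCity.csv", output_path="out/country.parquet")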
298 | """ 299 | # Loads the demographics dataframe using Spark 300 | countries = read_data(spark, input_path=input_path, input_format=input_format, 301 | columns=columns, debug_size = load_size, header=header, **options) 302 | # Aggregates the dataset by Country and rename the name of new columns 303 | countries = countries.groupby(["Country"]).agg({"AverageTemperature": "avg", "Latitude": "first", "Longitude": "first"})\ 304 | .withColumnRenamed('avg(AverageTemperature)', 'Temperature')\ 305 | .withColumnRenamed('first(Latitude)', 'Latitude')\ 306 | .withColumnRenamed('first(Longitude)', 'Longitude') 307 | 308 | # Rename specific country names to match the I94CIT_I94RES lookup table when joining them 309 | change_countries = [("Country", "Congo (Democratic Republic Of The)", "Congo"), ("Country", "Côte D'Ivoire", "Ivory Coast")] 310 | countries = change_field_value_condition(countries, change_countries) 311 | countries = countries.withColumn('Country_Lower', lower(countries.Country)) 312 | 313 | # Rename specific country names to match the demographics dataset when joining them 314 | change_res = [("I94CTRY", "BOSNIA-HERZEGOVINA", "BOSNIA AND HERZEGOVINA"), 315 | ("I94CTRY", "INVALID: CANADA", "CANADA"), 316 | ("I94CTRY", "CHINA, PRC", "CHINA"), 317 | ("I94CTRY", "GUINEA-BISSAU", "GUINEA BISSAU"), 318 | ("I94CTRY", "INVALID: PUERTO RICO", "PUERTO RICO"), 319 | ("I94CTRY", "INVALID: UNITED STATES", "UNITED STATES")] 320 | 321 | # Loads the lookup table I94CIT_I94RES 322 | res = read_data(spark, input_path="lookup/I94CIT_I94RES.csv", input_format=input_format, columns="*", 323 | debug_size = load_size, header=header, **options) 324 | res = cast_type(res, {"Code": IntegerType()}) 325 | res = change_field_value_condition(res, change_res) 326 | res = res.withColumn('Country_Lower', lower(res.I94CTRY)) 327 | # Join the two datasets to create the country dimmension table 328 | res = res.join(countries, res.Country_Lower == countries.Country_Lower, how="left") 329 | res = res.withColumn("Country", when(isnull(res["Country"]), capitalize_udf(res.I94CTRY)).otherwise(res["Country"])) 330 | res = res.drop("I94CTRY", "Country_Lower") 331 | 332 | # Save the resulting dataset to the output_path 333 | if output_path is not None: 334 | save(df=res, output_path=output_path) 335 | return res 336 | 337 | def cast_type(df, cols): 338 | """ 339 | Convert the types of the columns according to the configuration supplied in the cols dictionary in the format {"column_name": type} 340 | 341 | Args: 342 | df (:obj:`SparkDataFrame`): Spark dataframe to be processed. 343 | Represents the entry point to programming Spark with the Dataset and DataFrame API. 344 | cols (:obj:`dict`): Dictionary in the format of {"column_name": type} indicating what columns and types they should be converted to 345 | """ 346 | for k,v in cols.items(): 347 | if k in df.columns: 348 | df = df.withColumn(k, df[k].cast(v)) 349 | return df 350 | 351 | def convert_sas_date(df, cols): 352 | """ 353 | Convert dates in the SAS datatype to a date in a string format YYYY-MM-DD 354 | 355 | Args: 356 | df (:obj:`SparkDataFrame`): Spark dataframe to be processed. 357 | Represents the entry point to programming Spark with the Dataset and DataFrame API. 
358 | cols (:obj:`list`): List of columns in the SAS date format to be converted 359 | """ 360 | for c in [c for c in cols if c in df.columns]: 361 | df = df.withColumn(c, convert_sas_udf(df[c])) 362 | return df 363 | 364 | def change_field_value_condition(df, change_list): 365 | ''' 366 | Helper function used to rename column values based on condition. 367 | 368 | Args: 369 | df (:obj:`SparkDataFrame`): Spark dataframe to be processed. 370 | change_list (:obj: `list`): List of tuples in the format (field, old value, new value) 371 | ''' 372 | for field, old, new in change_list: 373 | df = df.withColumn(field, when(df[field] == old, new).otherwise(df[field])) 374 | return df 375 | 376 | def rename_columns(df, mapping): 377 | ''' 378 | Rename the columns of the dataset based on the mapping dictionary 379 | 380 | Args: 381 | df (:obj:`SparkDataFrame`): Spark dataframe to be processed. 382 | mapping (:obj: `dict`): Mapping dictionary in the format {old_name: new_name} 383 | ''' 384 | df = df.select([col(c).alias(mapping.get(c, c)) for c in df.columns]) 385 | return df 386 | 387 | def date_diff(date1, date2): 388 | ''' 389 | Calculates the difference in days between two dates 390 | ''' 391 | if date2 is None: 392 | return None 393 | else: 394 | a = datetime.strptime(date1, date_format) 395 | b = datetime.strptime(date2, date_format) 396 | delta = b - a 397 | return delta.days 398 | 399 | # User defined functions using Spark udf wrapper function to convert SAS dates into string dates in the format YYYY-MM-DD, to capitalize the first letters of the string and to calculate the difference between two dates in days. 400 | convert_sas_udf = udf(lambda x: x if x is None else (timedelta(days=x) + datetime(1960, 1, 1)).strftime(date_format)) 401 | capitalize_udf = udf(lambda x: x if x is None else x.title()) 402 | date_diff_udf = udf(date_diff) 403 | 404 | if __name__ == "__main__": 405 | spark = create_spark_session() 406 | # Perform the ETL process for the Immigration dataset, generating the immigration and date tables and saving them in the S3 bucket indicated in the output_path parameters. 407 | immigration = etl_immigration_data(spark, input_path='../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat', 408 | output_path="s3a://data-engineer-capstone/immigration.parquet", 409 | date_output_path="s3a://data-engineer-capstone/date.parquet", 410 | input_format = "com.github.saurfang.sas.spark", 411 | load_size=1000, partitionBy=None, 412 | columns_to_save = '*') 413 | # Perform the ETL process for the Country table, generating it and saving it in the S3 bucket indicated in the output_path parameter. 414 | countries = etl_countries_data(spark, output_path=OUTPUT + "country.parquet") 415 | # Perform the ETL process for the State table, generating it and saving it in the S3 bucket indicated in the output_path parameter.
416 | states = etl_states_data(spark, output_path=OUTPUT + "state.parquet") -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/architecture.png -------------------------------------------------------------------------------- /images/dag.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/dag.PNG -------------------------------------------------------------------------------- /images/etl_country.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/etl_country.png -------------------------------------------------------------------------------- /images/etl_immigration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/etl_immigration.png -------------------------------------------------------------------------------- /images/etl_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/etl_state.png -------------------------------------------------------------------------------- /images/i94cit.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/i94cit.PNG -------------------------------------------------------------------------------- /images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/pipeline.png -------------------------------------------------------------------------------- /images/star-schema.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/star-schema.PNG -------------------------------------------------------------------------------- /lookup/I94ADDR.csv: -------------------------------------------------------------------------------- 1 | Code,State 2 | AL,ALABAMA 3 | AK,ALASKA 4 | AZ,ARIZONA 5 | AR,ARKANSAS 6 | CA,CALIFORNIA 7 | CO,COLORADO 8 | CT,CONNECTICUT 9 | DE,DELAWARE 10 | DC,DIST. OF COLUMBIA 11 | FL,FLORIDA 12 | GA,GEORGIA 13 | GU,GUAM 14 | HI,HAWAII 15 | ID,IDAHO 16 | IL,ILLINOIS 17 | IN,INDIANA 18 | IA,IOWA 19 | KS,KANSAS 20 | KY,KENTUCKY 21 | LA,LOUISIANA 22 | ME,MAINE 23 | MD,MARYLAND 24 | MA,MASSACHUSETTS 25 | MI,MICHIGAN 26 | MN,MINNESOTA 27 | MS,MISSISSIPPI 28 | MO,MISSOURI 29 | MT,MONTANA 30 | NC,N. CAROLINA 31 | ND,N. DAKOTA 32 | NE,NEBRASKA 33 | NV,NEVADA 34 | NH,NEW HAMPSHIRE 35 | NJ,NEW JERSEY 36 | NM,NEW MEXICO 37 | NY,NEW YORK 38 | OH,OHIO 39 | OK,OKLAHOMA 40 | OR,OREGON 41 | PA,PENNSYLVANIA 42 | PR,PUERTO RICO 43 | RI,RHODE ISLAND 44 | SC,S. CAROLINA 45 | SD,S.
DAKOTA 46 | TN,TENNESSEE 47 | TX,TEXAS 48 | UT,UTAH 49 | VT,VERMONT 50 | VI,VIRGIN ISLANDS 51 | VA,VIRGINIA 52 | WV,W. VIRGINIA 53 | WA,WASHINGTON 54 | WI,WISCONSON 55 | WY,WYOMING 56 | 99,All Other Codes -------------------------------------------------------------------------------- /lookup/I94CIT_I94RES.csv: -------------------------------------------------------------------------------- 1 | Code,I94CTRY 2 | 582,MEXICO 3 | 236,AFGHANISTAN 4 | 101,ALBANIA 5 | 316,ALGERIA 6 | 102,ANDORRA 7 | 324,ANGOLA 8 | 529,ANGUILLA 9 | 518,ANTIGUA-BARBUDA 10 | 687,ARGENTINA 11 | 151,ARMENIA 12 | 532,ARUBA 13 | 438,AUSTRALIA 14 | 103,AUSTRIA 15 | 152,AZERBAIJAN 16 | 512,BAHAMAS 17 | 298,BAHRAIN 18 | 274,BANGLADESH 19 | 513,BARBADOS 20 | 104,BELGIUM 21 | 581,BELIZE 22 | 386,BENIN 23 | 509,BERMUDA 24 | 153,BELARUS 25 | 242,BHUTAN 26 | 688,BOLIVIA 27 | 717,"BONAIRE, ST EUSTATIUS, SABA" 28 | 164,BOSNIA-HERZEGOVINA 29 | 336,BOTSWANA 30 | 689,BRAZIL 31 | 525,BRITISH VIRGIN ISLANDS 32 | 217,BRUNEI 33 | 105,BULGARIA 34 | 393,BURKINA FASO 35 | 243,BURMA 36 | 375,BURUNDI 37 | 310,CAMEROON 38 | 326,CAPE VERDE 39 | 526,CAYMAN ISLANDS 40 | 383,CENTRAL AFRICAN REPUBLIC 41 | 384,CHAD 42 | 690,CHILE 43 | 245,"CHINA, PRC" 44 | 721,CURACAO 45 | 270,CHRISTMAS ISLAND 46 | 271,COCOS ISLANDS 47 | 691,COLOMBIA 48 | 317,COMOROS 49 | 385,CONGO 50 | 467,COOK ISLANDS 51 | 575,COSTA RICA 52 | 165,CROATIA 53 | 584,CUBA 54 | 218,CYPRUS 55 | 140,CZECH REPUBLIC 56 | 723,FAROE ISLANDS (PART OF DENMARK) 57 | 108,DENMARK 58 | 322,DJIBOUTI 59 | 519,DOMINICA 60 | 585,DOMINICAN REPUBLIC 61 | 240,EAST TIMOR 62 | 692,ECUADOR 63 | 368,EGYPT 64 | 576,EL SALVADOR 65 | 399,EQUATORIAL GUINEA 66 | 372,ERITREA 67 | 109,ESTONIA 68 | 369,ETHIOPIA 69 | 604,FALKLAND ISLANDS 70 | 413,FIJI 71 | 110,FINLAND 72 | 111,FRANCE 73 | 601,FRENCH GUIANA 74 | 411,FRENCH POLYNESIA 75 | 387,GABON 76 | 338,GAMBIA 77 | 758,GAZA STRIP 78 | 154,GEORGIA 79 | 112,GERMANY 80 | 339,GHANA 81 | 143,GIBRALTAR 82 | 113,GREECE 83 | 520,GRENADA 84 | 507,GUADELOUPE 85 | 577,GUATEMALA 86 | 382,GUINEA 87 | 327,GUINEA-BISSAU 88 | 603,GUYANA 89 | 586,HAITI 90 | 726,HEARD AND MCDONALD IS. 91 | 149,HOLY SEE/VATICAN 92 | 528,HONDURAS 93 | 206,HONG KONG 94 | 114,HUNGARY 95 | 115,ICELAND 96 | 213,INDIA 97 | 759,INDIAN OCEAN AREAS (FRENCH) 98 | 729,INDIAN OCEAN TERRITORY 99 | 204,INDONESIA 100 | 249,IRAN 101 | 250,IRAQ 102 | 116,IRELAND 103 | 251,ISRAEL 104 | 117,ITALY 105 | 388,IVORY COAST 106 | 514,JAMAICA 107 | 209,JAPAN 108 | 253,JORDAN 109 | 201,KAMPUCHEA 110 | 155,KAZAKHSTAN 111 | 340,KENYA 112 | 414,KIRIBATI 113 | 732,KOSOVO 114 | 272,KUWAIT 115 | 156,KYRGYZSTAN 116 | 203,LAOS 117 | 118,LATVIA 118 | 255,LEBANON 119 | 335,LESOTHO 120 | 370,LIBERIA 121 | 381,LIBYA 122 | 119,LIECHTENSTEIN 123 | 120,LITHUANIA 124 | 121,LUXEMBOURG 125 | 214,MACAU 126 | 167,MACEDONIA 127 | 320,MADAGASCAR 128 | 345,MALAWI 129 | 273,MALAYSIA 130 | 220,MALDIVES 131 | 392,MALI 132 | 145,MALTA 133 | 472,MARSHALL ISLANDS 134 | 511,MARTINIQUE 135 | 389,MAURITANIA 136 | 342,MAURITIUS 137 | 760,MAYOTTE (AFRICA - FRENCH) 138 | 473,"MICRONESIA, FED. 
STATES OF" 139 | 157,MOLDOVA 140 | 122,MONACO 141 | 299,MONGOLIA 142 | 735,MONTENEGRO 143 | 521,MONTSERRAT 144 | 332,MOROCCO 145 | 329,MOZAMBIQUE 146 | 371,NAMIBIA 147 | 440,NAURU 148 | 257,NEPAL 149 | 123,NETHERLANDS 150 | 508,NETHERLANDS ANTILLES 151 | 409,NEW CALEDONIA 152 | 464,NEW ZEALAND 153 | 579,NICARAGUA 154 | 390,NIGER 155 | 343,NIGERIA 156 | 470,NIUE 157 | 275,NORTH KOREA 158 | 124,NORWAY 159 | 256,OMAN 160 | 258,PAKISTAN 161 | 474,PALAU 162 | 743,PALESTINE 163 | 504,PANAMA 164 | 441,PAPUA NEW GUINEA 165 | 693,PARAGUAY 166 | 694,PERU 167 | 260,PHILIPPINES 168 | 416,PITCAIRN ISLANDS 169 | 107,POLAND 170 | 126,PORTUGAL 171 | 297,QATAR 172 | 748,REPUBLIC OF SOUTH SUDAN 173 | 321,REUNION 174 | 127,ROMANIA 175 | 158,RUSSIA 176 | 376,RWANDA 177 | 128,SAN MARINO 178 | 330,SAO TOME AND PRINCIPE 179 | 261,SAUDI ARABIA 180 | 391,SENEGAL 181 | 142,SERBIA AND MONTENEGRO 182 | 745,SERBIA 183 | 347,SEYCHELLES 184 | 348,SIERRA LEONE 185 | 207,SINGAPORE 186 | 141,SLOVAKIA 187 | 166,SLOVENIA 188 | 412,SOLOMON ISLANDS 189 | 397,SOMALIA 190 | 373,SOUTH AFRICA 191 | 276,SOUTH KOREA 192 | 129,SPAIN 193 | 244,SRI LANKA 194 | 346,ST. HELENA 195 | 522,ST. KITTS-NEVIS 196 | 523,ST. LUCIA 197 | 502,ST. PIERRE AND MIQUELON 198 | 524,ST. VINCENT-GRENADINES 199 | 716,SAINT BARTHELEMY 200 | 736,SAINT MARTIN 201 | 749,SAINT MAARTEN 202 | 350,SUDAN 203 | 602,SURINAME 204 | 351,SWAZILAND 205 | 130,SWEDEN 206 | 131,SWITZERLAND 207 | 262,SYRIA 208 | 268,TAIWAN 209 | 159,TAJIKISTAN 210 | 353,TANZANIA 211 | 263,THAILAND 212 | 304,TOGO 213 | 417,TONGA 214 | 516,TRINIDAD AND TOBAGO 215 | 323,TUNISIA 216 | 264,TURKEY 217 | 161,TURKMENISTAN 218 | 527,TURKS AND CAICOS ISLANDS 219 | 420,TUVALU 220 | 352,UGANDA 221 | 162,UKRAINE 222 | 296,UNITED ARAB EMIRATES 223 | 135,UNITED KINGDOM 224 | 695,URUGUAY 225 | 163,UZBEKISTAN 226 | 410,VANUATU 227 | 696,VENEZUELA 228 | 266,VIETNAM 229 | 469,WALLIS AND FUTUNA ISLANDS 230 | 757,WEST INDIES (FRENCH) 231 | 333,WESTERN SAHARA 232 | 465,WESTERN SAMOA 233 | 216,YEMEN 234 | 139,YUGOSLAVIA 235 | 301,ZAIRE 236 | 344,ZAMBIA 237 | 315,ZIMBABWE 238 | 403,INVALID: AMERICAN SAMOA 239 | 712,INVALID: ANTARCTICA 240 | 700,INVALID: BORN ON BOARD SHIP 241 | 719,INVALID: BOUVET ISLAND (ANTARCTICA/NORWAY TERR.) 242 | 574,INVALID: CANADA 243 | 720,INVALID: CANTON AND ENDERBURY ISLS 244 | 106,INVALID: CZECHOSLOVAKIA 245 | 739,INVALID: DRONNING MAUD LAND (ANTARCTICA-NORWAY) 246 | 394,INVALID: FRENCH SOUTHERN AND ANTARCTIC 247 | 501,INVALID: GREENLAND 248 | 404,INVALID: GUAM 249 | 730,INVALID: INTERNATIONAL WATERS 250 | 731,INVALID: JOHNSON ISLAND 251 | 471,"INVALID: MARIANA ISLANDS, NORTHERN" 252 | 737,INVALID: MIDWAY ISLANDS 253 | 753,INVALID: MINOR OUTLYING ISLANDS - USA 254 | 740,INVALID: NEUTRAL ZONE (S. ARABIA/IRAQ) 255 | 710,INVALID: NON-QUOTA IMMIGRANT 256 | 505,INVALID: PUERTO RICO 257 | 0,INVALID: STATELESS 258 | 705,INVALID: STATELESS 259 | 583,INVALID: UNITED STATES 260 | 407,INVALID: UNITED STATES 261 | 999,INVALID: UNKNOWN 262 | 239,INVALID: UNKNOWN COUNTRY 263 | 134,INVALID: USSR 264 | 506,INVALID: U.S. 
VIRGIN ISLANDS 265 | 755,INVALID: WAKE ISLAND 266 | 311,Collapsed Tanzania (should not show) 267 | 741,Collapsed Curacao (should not show) 268 | 54,No Country Code (54) 269 | 100,No Country Code (100) 270 | 187,No Country Code (187) 271 | 190,No Country Code (190) 272 | 200,No Country Code (200) 273 | 219,No Country Code (219) 274 | 238,No Country Code (238) 275 | 277,No Country Code (277) 276 | 293,No Country Code (293) 277 | 300,No Country Code (300) 278 | 319,No Country Code (319) 279 | 365,No Country Code (365) 280 | 395,No Country Code (395) 281 | 400,No Country Code (400) 282 | 485,No Country Code (485) 283 | 503,No Country Code (503) 284 | 589,No Country Code (589) 285 | 592,No Country Code (592) 286 | 791,No Country Code (791) 287 | 849,No Country Code (849) 288 | 914,No Country Code (914) 289 | 944,No Country Code (944) 290 | 996,No Country Code (996) -------------------------------------------------------------------------------- /lookup/I94MODE.csv: -------------------------------------------------------------------------------- 1 | ID,Mode 2 | 1,Air 3 | 2,Sea 4 | 3,Land 5 | 9,Not reported -------------------------------------------------------------------------------- /lookup/I94PORT.csv: -------------------------------------------------------------------------------- 1 | ID,Port 2 | ALC,"ALCAN, AK" 3 | ANC,"ANCHORAGE, AK" 4 | BAR,"BAKER AAF - BAKER ISLAND, AK" 5 | DAC,"DALTONS CACHE, AK" 6 | PIZ,"DEW STATION PT LAY DEW, AK" 7 | DTH,"DUTCH HARBOR, AK" 8 | EGL,"EAGLE, AK" 9 | FRB,"FAIRBANKS, AK" 10 | HOM,"HOMER, AK" 11 | HYD,"HYDER, AK" 12 | JUN,"JUNEAU, AK" 13 | 5KE,"KETCHIKAN, AK" 14 | KET,"KETCHIKAN, AK" 15 | MOS,"MOSES POINT INTERMEDIATE, AK" 16 | NIK,"NIKISKI, AK" 17 | NOM,"NOM, AK" 18 | PKC,"POKER CREEK, AK" 19 | ORI,"PORT LIONS SPB, AK" 20 | SKA,"SKAGWAY, AK" 21 | SNP,"ST. PAUL ISLAND, AK" 22 | TKI,"TOKEEN, AK" 23 | WRA,"WRANGELL, AK" 24 | HSV,"MADISON COUNTY - HUNTSVILLE, AL" 25 | MOB,"MOBILE, AL" 26 | LIA,"LITTLE ROCK, AR (BPS)" 27 | ROG,"ROGERS ARPT, AR" 28 | DOU,"DOUGLAS, AZ" 29 | LUK,"LUKEVILLE, AZ" 30 | MAP,MARIPOSA AZ 31 | NAC,"NACO, AZ" 32 | NOG,"NOGALES, AZ" 33 | PHO,"PHOENIX, AZ" 34 | POR,"PORTAL, AZ" 35 | SLU,"SAN LUIS, AZ" 36 | SAS,"SASABE, AZ" 37 | TUC,"TUCSON, AZ" 38 | YUI,"YUMA, AZ" 39 | AND,"ANDRADE, CA" 40 | BUR,"BURBANK, CA" 41 | CAL,"CALEXICO, CA" 42 | CAO,"CAMPO, CA" 43 | FRE,"FRESNO, CA" 44 | ICP,"IMPERIAL COUNTY, CA" 45 | LNB,"LONG BEACH, CA" 46 | LOS,"LOS ANGELES, CA" 47 | BFL,"MEADOWS FIELD - BAKERSFIELD, CA" 48 | OAK,"OAKLAND, CA" 49 | ONT,"ONTARIO, CA" 50 | OTM,"OTAY MESA, CA" 51 | BLT,"PACIFIC, HWY. 
STATION, CA" 52 | PSP,"PALM SPRINGS, CA" 53 | SAC,"SACRAMENTO, CA" 54 | SLS,"SALINAS, CA (BPS)" 55 | SDP,"SAN DIEGO, CA" 56 | SFR,"SAN FRANCISCO, CA" 57 | SNJ,"SAN JOSE, CA" 58 | SLO,"SAN LUIS OBISPO, CA" 59 | SLI,"SAN LUIS OBISPO, CA (BPS)" 60 | SPC,"SAN PEDRO, CA" 61 | SYS,"SAN YSIDRO, CA" 62 | SAA,"SANTA ANA, CA" 63 | STO,"STOCKTON, CA (BPS)" 64 | TEC,"TECATE, CA" 65 | TRV,"TRAVIS-AFB, CA" 66 | APA,"ARAPAHOE COUNTY, CO" 67 | ASE,"ASPEN, CO #ARPT" 68 | COS,"COLORADO SPRINGS, CO" 69 | DEN,"DENVER, CO" 70 | DRO,"LA PLATA - DURANGO, CO" 71 | BDL,"BRADLEY INTERNATIONAL, CT" 72 | BGC,"BRIDGEPORT, CT" 73 | GRT,"GROTON, CT" 74 | HAR,"HARTFORD, CT" 75 | NWH,"NEW HAVEN, CT" 76 | NWL,"NEW LONDON, CT" 77 | TST,"NEWINGTON DATA CENTER TEST, CT" 78 | WAS,WASHINGTON DC 79 | DOV,"DOVER AFB, DE" 80 | DVD,"DOVER-AFB, DE" 81 | WLL,"WILMINGTON, DE" 82 | BOC,"BOCAGRANDE, FL" 83 | SRQ,"BRADENTON - SARASOTA, FL" 84 | CAN,"CAPE CANAVERAL, FL" 85 | DAB,"DAYTONA BEACH INTERNATIONAL, FL" 86 | FRN,"FERNANDINA, FL" 87 | FTL,"FORT LAUDERDALE, FL" 88 | FMY,"FORT MYERS, FL" 89 | FPF,"FORT PIERCE, FL" 90 | HUR,"HURLBURT FIELD, FL" 91 | GNV,"J R ALISON MUNI - GAINESVILLE, FL" 92 | JAC,"JACKSONVILLE, FL" 93 | KEY,"KEY WEST, FL" 94 | LEE,"LEESBURG MUNICIPAL AIRPORT, FL" 95 | MLB,"MELBOURNE, FL" 96 | MIA,"MIAMI, FL" 97 | APF,"NAPLES, FL #ARPT" 98 | OPF,"OPA LOCKA, FL" 99 | ORL,"ORLANDO, FL" 100 | PAN,"PANAMA CITY, FL" 101 | PEN,"PENSACOLA, FL" 102 | PCF,"PORT CANAVERAL, FL" 103 | PEV,"PORT EVERGLADES, FL" 104 | PSJ,"PORT ST JOE, FL" 105 | SFB,"SANFORD, FL" 106 | SGJ,"ST AUGUSTINE ARPT, FL" 107 | SAU,"ST AUGUSTINE, FL" 108 | FPR,"ST LUCIE COUNTY, FL" 109 | SPE,"ST PETERSBURG, FL" 110 | TAM,"TAMPA, FL" 111 | WPB,"WEST PALM BEACH, FL" 112 | ATL,"ATLANTA, GA" 113 | BRU,"BRUNSWICK, GA" 114 | AGS,"BUSH FIELD - AUGUSTA, GA" 115 | SAV,"SAVANNAH, GA" 116 | AGA,"AGANA, GU" 117 | HHW,"HONOLULU, HI" 118 | OGG,"KAHULUI - MAUI, HI" 119 | KOA,"KEAHOLE-KONA, HI" 120 | LIH,"LIHUE, HI" 121 | CID,"CEDAR RAPIDS/IOWA CITY, IA" 122 | DSM,"DES MOINES, IA" 123 | BOI,"AIR TERM. 
(GOWEN FLD) BOISE, ID" 124 | EPI,"EASTPORT, ID" 125 | IDA,"FANNING FIELD - IDAHO FALLS, ID" 126 | PTL,"PORTHILL, ID" 127 | SPI,"CAPITAL - SPRINGFIELD, IL" 128 | CHI,"CHICAGO, IL" 129 | DPA,"DUPAGE COUNTY, IL" 130 | PIA,"GREATER PEORIA, IL" 131 | RFD,"GREATER ROCKFORD, IL" 132 | UGN,"MEMORIAL - WAUKEGAN, IL" 133 | GAR,"GARY, IN" 134 | HMM,"HAMMOND, IN" 135 | INP,"INDIANAPOLIS, IN" 136 | MRL,"MERRILLVILLE, IN" 137 | SBN,"SOUTH BEND, IN" 138 | ICT,"MID-CONTINENT - WITCHITA, KS" 139 | LEX,"BLUE GRASS - LEXINGTON, KY" 140 | LOU,"LOUISVILLE, KY" 141 | BTN,"BATON ROUGE, LA" 142 | LKC,"LAKE CHARLES, LA" 143 | LAK,"LAKE CHARLES, LA (BPS)" 144 | MLU,"MONROE, LA" 145 | MGC,"MORGAN CITY, LA" 146 | NOL,"NEW ORLEANS, LA" 147 | BOS,"BOSTON, MA" 148 | GLO,"GLOUCESTER, MA" 149 | BED,"HANSCOM FIELD - BEDFORD, MA" 150 | LYN,"LYNDEN, WA" 151 | ADW,"ANDREWS AFB, MD" 152 | BAL,"BALTIMORE, MD" 153 | MKG,"MUSKEGON, MD" 154 | PAX,"PATUXENT RIVER, MD" 155 | BGM,"BANGOR, ME" 156 | BOO,"BOOTHBAY HARBOR, ME" 157 | BWM,"BRIDGEWATER, ME" 158 | BCK,"BUCKPORT, ME" 159 | CLS,"CALAIS, ME" 160 | CRB,"CARIBOU, ME" 161 | COB,"COBURN GORE, ME" 162 | EST,"EASTCOURT, ME" 163 | EPT,"EASTPORT MUNICIPAL, ME" 164 | EPM,"EASTPORT, ME" 165 | FOR,"FOREST CITY, ME" 166 | FTF,"FORT FAIRFIELD, ME" 167 | FTK,"FORT KENT, ME" 168 | HML,"HAMIIN, ME" 169 | HTM,"HOULTON, ME" 170 | JKM,"JACKMAN, ME" 171 | KAL,"KALISPEL, MT" 172 | LIM,"LIMESTONE, ME" 173 | LUB,"LUBEC, ME" 174 | MAD,"MADAWASKA, ME" 175 | POM,"PORTLAND, ME" 176 | RGM,"RANGELEY, ME (BPS)" 177 | SBR,"SOUTH BREWER, ME" 178 | SRL,"ST AURELIE, ME" 179 | SPA,"ST PAMPILE, ME" 180 | VNB,"VAN BUREN, ME" 181 | VCB,"VANCEBORO, ME" 182 | AGN,"ALGONAC, MI" 183 | ALP,"ALPENA, MI" 184 | BCY,"BAY CITY, MI" 185 | DET,"DETROIT, MI" 186 | GRP,"GRAND RAPIDS, MI" 187 | GRO,"GROSSE ISLE, MI" 188 | ISL,"ISLE ROYALE, MI" 189 | MRC,"MARINE CITY, MI" 190 | MRY,"MARYSVILLE, MI" 191 | PTK,"OAKLAND COUNTY - PONTIAC, MI" 192 | PHU,"PORT HURON, MI" 193 | RBT,"ROBERTS LANDING, MI" 194 | SAG,"SAGINAW, MI" 195 | SSM,"SAULT STE. 
MARIE, MI" 196 | SCL,"ST CLAIR, MI" 197 | YIP,"WILLOW RUN - YPSILANTI, MI" 198 | BAU,"BAUDETTE, MN" 199 | CAR,"CARIBOU MUNICIPAL AIRPORT, MN" 200 | GTF,"Collapsed into INT, MN" 201 | INL,"Collapsed into INT, MN" 202 | CRA,"CRANE LAKE, MN" 203 | MIC,"CRYSTAL MUNICIPAL AIRPORT, MN" 204 | DUL,"DULUTH, MN" 205 | ELY,"ELY, MN" 206 | GPM,"GRAND PORTAGE, MN" 207 | SVC,"GRANT COUNTY - SILVER CITY, MN" 208 | INT,"INTL FALLS, MN" 209 | LAN,"LANCASTER, MN" 210 | MSP,"MINN./ST PAUL, MN" 211 | LIN,"NORTHERN SVC CENTER, MN" 212 | NOY,"NOYES, MN" 213 | PIN,"PINE CREEK, MN" 214 | 48Y,"PINECREEK BORDER ARPT, MN" 215 | RAN,"RAINER, MN" 216 | RST,"ROCHESTER, MN" 217 | ROS,"ROSEAU, MN" 218 | SPM,"ST PAUL, MN" 219 | WSB,"WARROAD INTL, SPB, MN" 220 | WAR,"WARROAD, MN" 221 | KAN,"KANSAS CITY, MO" 222 | SGF,"SPRINGFIELD-BRANSON, MO" 223 | STL,"ST LOUIS, MO" 224 | WHI,"WHITETAIL, MT" 225 | WHM,"WILD HORSE, MT" 226 | GPT,"BILOXI REGIONAL, MS" 227 | GTR,"GOLDEN TRIANGLE LOWNDES CNTY, MS" 228 | GUL,"GULFPORT, MS" 229 | PAS,"PASCAGOULA, MS" 230 | JAN,"THOMPSON FIELD - JACKSON, MS" 231 | BIL,"BILLINGS, MT" 232 | BTM,"BUTTE, MT" 233 | CHF,"CHIEF MT, MT" 234 | CTB,"CUT BANK MUNICIPAL, MT" 235 | CUT,"CUT BANK, MT" 236 | DLB,"DEL BONITA, MT" 237 | EUR,"EUREKA, MT (BPS)" 238 | BZN,"GALLATIN FIELD - BOZEMAN, MT" 239 | FCA,"GLACIER NATIONAL PARK, MT" 240 | GGW,"GLASGOW, MT" 241 | GRE,"GREAT FALLS, MT" 242 | HVR,"HAVRE, MT" 243 | HEL,"HELENA, MT" 244 | LWT,"LEWISTON, MT" 245 | MGM,"MORGAN, MT" 246 | OPH,"OPHEIM, MT" 247 | PIE,"PIEGAN, MT" 248 | RAY,"RAYMOND, MT" 249 | ROO,"ROOSVILLE, MT" 250 | SCO,"SCOBEY, MT" 251 | SWE,"SWEETGTASS, MT" 252 | TRL,"TRIAL CREEK, MT" 253 | TUR,"TURNER, MT" 254 | WCM,"WILLOW CREEK, MT" 255 | CLT,"CHARLOTTE, NC" 256 | FAY,"FAYETTEVILLE, NC" 257 | MRH,"MOREHEAD CITY, NC" 258 | FOP,"MORRIS FIELDS AAF, NC" 259 | GSO,"PIEDMONT TRIAD INTL AIRPORT, NC" 260 | RDU,"RALEIGH/DURHAM, NC" 261 | SSC,"SHAW AFB - SUMTER, NC" 262 | WIL,"WILMINGTON, NC" 263 | AMB,"AMBROSE, ND" 264 | ANT,"ANTLER, ND" 265 | CRY,"CARBURY, ND" 266 | DNS,"DUNSEITH, ND" 267 | FAR,"FARGO, ND" 268 | FRT,"FORTUNA, ND" 269 | GRF,"GRAND FORKS, ND" 270 | HNN,"HANNAH, ND" 271 | HNS,"HANSBORO, ND" 272 | MAI,"MAIDA, ND" 273 | MND,"MINOT, ND" 274 | NEC,"NECHE, ND" 275 | NOO,"NOONAN, ND" 276 | NRG,"NORTHGATE, ND" 277 | PEM,"PEMBINA, ND" 278 | SAR,"SARLES, ND" 279 | SHR,"SHERWOOD, ND" 280 | SJO,"ST JOHN, ND" 281 | WAL,"WALHALLA, ND" 282 | WHO,"WESTHOPE, ND" 283 | WND,"WILLISTON, ND" 284 | OMA,"OMAHA, NE" 285 | LEB,"LEBANON, NH" 286 | MHT,"MANCHESTER, NH" 287 | PNH,"PITTSBURG, NH" 288 | PSM,"PORTSMOUTH, NH" 289 | BYO,"BAYONNE, NJ" 290 | CNJ,"CAMDEN, NJ" 291 | HOB,"HOBOKEN, NJ" 292 | JER,"JERSEY CITY, NJ" 293 | WRI,"MC GUIRE AFB - WRIGHTSOWN, NJ" 294 | MMU,"MORRISTOWN, NJ" 295 | NEW,"NEWARK/TETERBORO, NJ" 296 | PER,"PERTH AMBOY, NJ" 297 | ACY,"POMONA FIELD - ATLANTIC CITY, NJ" 298 | ALA,"ALAMAGORDO, NM (BPS)" 299 | ABQ,"ALBUQUERQUE, NM" 300 | ANP,"ANTELOPE WELLS, NM" 301 | CRL,"CARLSBAD, NM" 302 | COL,"COLUMBUS, NM" 303 | CDD,"CRANE LAKE - ST. 
LOUIS CNTY, NM" 304 | DNM,"DEMING, NM (BPS)" 305 | LAS,"LAS CRUCES, NM" 306 | LOB,"LORDSBURG, NM (BPS)" 307 | RUI,"RUIDOSO, NM" 308 | STR,"SANTA TERESA, NM" 309 | RNO,"CANNON INTL - RENO/TAHOE, NV" 310 | FLX,"FALLON MUNICIPAL AIRPORT, NV" 311 | LVG,"LAS VEGAS, NV" 312 | REN,"RENO, NV" 313 | ALB,"ALBANY, NY" 314 | AXB,"ALEXANDRIA BAY, NY" 315 | BUF,"BUFFALO, NY" 316 | CNH,"CANNON CORNERS, NY" 317 | CAP,"CAPE VINCENT, NY" 318 | CHM,"CHAMPLAIN, NY" 319 | CHT,"CHATEAUGAY, NY" 320 | CLA,"CLAYTON, NY" 321 | FTC,"FORT COVINGTON, NY" 322 | LAG,"LA GUARDIA, NY" 323 | LEW,"LEWISTON, NY" 324 | MAS,"MASSENA, NY" 325 | MAG,"MCGUIRE AFB, NY" 326 | MOO,"MOORES, NY" 327 | MRR,"MORRISTOWN, NY" 328 | NYC,"NEW YORK, NY" 329 | NIA,"NIAGARA FALLS, NY" 330 | OGD,"OGDENSBURG, NY" 331 | OSW,"OSWEGO, NY" 332 | ELM,"REGIONAL ARPT - HORSEHEAD, NY" 333 | ROC,"ROCHESTER, NY" 334 | ROU,"ROUSES POINT, NY" 335 | SWF,"STEWART - ORANGE CNTY, NY" 336 | SYR,"SYRACUSE, NY" 337 | THO,"THOUSAND ISLAND BRIDGE, NY" 338 | TRO,"TROUT RIVER, NY" 339 | WAT,"WATERTOWN, NY" 340 | HPN,"WESTCHESTER - WHITE PLAINS, NY" 341 | WRB,"WHIRLPOOL BRIDGE, NY" 342 | YOU,"YOUNGSTOWN, NY" 343 | AKR,"AKRON, OH" 344 | ATB,"ASHTABULA, OH" 345 | CIN,"CINCINNATI, OH" 346 | CLE,"CLEVELAND, OH" 347 | CLM,"COLUMBUS, OH" 348 | LOR,"LORAIN, OH" 349 | MBO,"MARBLE HEADS, OH" 350 | SDY,"SANDUSKY, OH" 351 | TOL,"TOLEDO, OH" 352 | OKC,"OKLAHOMA CITY, OK" 353 | TUL,"TULSA, OK" 354 | AST,"ASTORIA, OR" 355 | COO,"COOS BAY, OR" 356 | HIO,"HILLSBORO, OR" 357 | MED,"MEDFORD, OR" 358 | NPT,"NEWPORT, OR" 359 | POO,"PORTLAND, OR" 360 | PUT,"PUT-IN-BAY, OH" 361 | RDM,"ROBERTS FIELDS - REDMOND, OR" 362 | ERI,"ERIE, PA" 363 | MDT,"HARRISBURG, PA" 364 | HSB,"HARRISONBURG, PA" 365 | PHI,"PHILADELPHIA, PA" 366 | PIT,"PITTSBURG, PA" 367 | AGU,"AGUADILLA, PR" 368 | BQN,"BORINQUEN - AGUADILLO, PR" 369 | JCP,"CULEBRA - BENJAMIN RIVERA, PR" 370 | ENS,"ENSENADA, PR" 371 | FAJ,"FAJARDO, PR" 372 | HUM,"HUMACAO, PR" 373 | JOB,"JOBOS, PR" 374 | MAY,"MAYAGUEZ, PR" 375 | PON,"PONCE, PR" 376 | PSE,"PONCE-MERCEDITA, PR" 377 | SAJ,"SAN JUAN, PR" 378 | VQS,"VIEQUES-ARPT, PR" 379 | PRO,"PROVIDENCE, RI" 380 | PVD,"THEODORE FRANCIS - WARWICK, RI" 381 | CHL,"CHARLESTON, SC" 382 | CAE,"COLUMBIA, SC #ARPT" 383 | GEO,"GEORGETOWN, SC" 384 | GSP,"GREENVILLE, SC" 385 | GRR,"GREER, SC" 386 | MYR,"MYRTLE BEACH, SC" 387 | SPF,"BLACK HILLS, SPEARFISH, SD" 388 | HON,"HOWES REGIONAL ARPT - HURON, SD" 389 | SAI,"SAIPAN, SPN" 390 | TYS,"MC GHEE TYSON - ALCOA, TN" 391 | MEM,"MEMPHIS, TN" 392 | NSV,"NASHVILLE, TN" 393 | TRI,"TRI CITY ARPT, TN" 394 | ADS,"ADDISON AIRPORT- ADDISON, TX" 395 | ADT,"AMISTAD DAM, TX" 396 | ANZ,"ANZALDUAS, TX" 397 | AUS,"AUSTIN, TX" 398 | BEA,"BEAUMONT, TX" 399 | BBP,"BIG BEND PARK, TX (BPS)" 400 | SCC,"BP SPEC COORD. 
CTR, TX" 401 | BTC,"BP TACTICAL UNIT, TX" 402 | BOA,"BRIDGE OF AMERICAS, TX" 403 | BRO,"BROWNSVILLE, TX" 404 | CRP,"CORPUS CHRISTI, TX" 405 | DAL,"DALLAS, TX" 406 | DLR,"DEL RIO, TX" 407 | DNA,"DONNA, TX" 408 | EGP,"EAGLE PASS, TX" 409 | ELP,"EL PASO, TX" 410 | FAB,"FABENS, TX" 411 | FAL,"FALCON HEIGHTS, TX" 412 | FTH,"FORT HANCOCK, TX" 413 | AFW,"FORT WORTH ALLIANCE, TX" 414 | FPT,"FREEPORT, TX" 415 | GAL,"GALVESTON, TX" 416 | HLG,"HARLINGEN, TX" 417 | HID,"HIDALGO, TX" 418 | HOU,"HOUSTON, TX" 419 | SGR,"HULL FIELD, SUGAR LAND ARPT, TX" 420 | LLB,"JUAREZ-LINCOLN BRIDGE, TX" 421 | LCB,"LAREDO COLUMBIA BRIDGE, TX" 422 | LRN,"LAREDO NORTH, TX" 423 | LAR,"LAREDO, TX" 424 | LSE,"LOS EBANOS, TX" 425 | IND,"LOS INDIOS, TX" 426 | LOI,"LOS INDIOS, TX" 427 | MRS,"MARFA, TX (BPS)" 428 | MCA,"MCALLEN, TX" 429 | MAF,"ODESSA REGIONAL, TX" 430 | PDN,"PASO DEL NORTE,TX" 431 | PBB,"PEACE BRIDGE, NY" 432 | PHR,"PHARR, TX" 433 | PAR,"PORT ARTHUR, TX" 434 | ISB,"PORT ISABEL, TX" 435 | POE,"PORT OF EL PASO, TX" 436 | PRE,"PRESIDIO, TX" 437 | PGR,"PROGRESO, TX" 438 | RIO,"RIO GRANDE CITY, TX" 439 | ROM,"ROMA, TX" 440 | SNA,"SAN ANTONIO, TX" 441 | SNN,"SANDERSON, TX" 442 | VIB,"VETERAN INTL BRIDGE, TX" 443 | YSL,"YSLETA, TX" 444 | CHA,"CHARLOTTE AMALIE, VI" 445 | CHR,"CHRISTIANSTED, VI" 446 | CRU,"CRUZ BAY, ST JOHN, VI" 447 | FRK,"FREDERIKSTED, VI" 448 | STT,"ST THOMAS, VI" 449 | LGU,"CACHE AIRPORT - LOGAN, UT" 450 | SLC,"SALT LAKE CITY, UT" 451 | CHO,"ALBEMARLE CHARLOTTESVILLE, VA" 452 | DAA,"DAVISON AAF - FAIRFAX CNTY, VA" 453 | HOP,"HOPEWELL, VA" 454 | HEF,"MANASSAS, VA #ARPT" 455 | NWN,"NEWPORT, VA" 456 | NOR,"NORFOLK, VA" 457 | RCM,"RICHMOND, VA" 458 | ABS,"ALBURG SPRINGS, VT" 459 | ABG,"ALBURG, VT" 460 | BEB,"BEEBE PLAIN, VT" 461 | BEE,"BEECHER FALLS, VT" 462 | BRG,"BURLINGTON, VT" 463 | CNA,"CANAAN, VT" 464 | DER,"DERBY LINE, VT (I-91)" 465 | DLV,"DERBY LINE, VT (RT. 
5)" 466 | ERC,"EAST RICHFORD, VT" 467 | HIG,"HIGHGATE SPRINGS, VT" 468 | MOR,"MORSES LINE, VT" 469 | NPV,"NEWPORT, VT" 470 | NRT,"NORTH TROY, VT" 471 | NRN,"NORTON, VT" 472 | PIV,"PINNACLE ROAD, VT" 473 | RIF,"RICHFORT, VT" 474 | STA,"ST ALBANS, VT" 475 | SWB,"SWANTON, VT (BP - SECTOR HQ)" 476 | WBE,"WEST BERKSHIRE, VT" 477 | ABE,"ABERDEEN, WA" 478 | ANA,"ANACORTES, WA" 479 | BEL,"BELLINGHAM, WA" 480 | BLI,"BELLINGHAM, WASHINGTON #INTL" 481 | BLA,"BLAINE, WA" 482 | BWA,"BOUNDARY, WA" 483 | CUR,"CURLEW, WA (BPS)" 484 | DVL,"DANVILLE, WA" 485 | EVE,"EVERETT, WA" 486 | FER,"FERRY, WA" 487 | FRI,"FRIDAY HARBOR, WA" 488 | FWA,"FRONTIER, WA" 489 | KLM,"KALAMA, WA" 490 | LAU,"LAURIER, WA" 491 | LON,"LONGVIEW, WA" 492 | MET,"METALINE FALLS, WA" 493 | MWH,"MOSES LAKE GRANT COUNTY ARPT, WA" 494 | NEA,"NEAH BAY, WA" 495 | NIG,"NIGHTHAWK, WA" 496 | OLY,"OLYMPIA, WA" 497 | ORO,"OROVILLE, WA" 498 | PWB,"PASCO, WA" 499 | PIR,"POINT ROBERTS, WA" 500 | PNG,"PORT ANGELES, WA" 501 | PTO,"PORT TOWNSEND, WA" 502 | SEA,"SEATTLE, WA" 503 | SPO,"SPOKANE, WA" 504 | SUM,"SUMAS, WA" 505 | TAC,"TACOMA, WA" 506 | PSC,"TRI-CITIES - PASCO, WA" 507 | VAN,"VANCOUVER, WA" 508 | AGM,"ALGOMA, WI" 509 | BAY,"BAYFIELD, WI" 510 | GRB,"GREEN BAY, WI" 511 | MNW,"MANITOWOC, WI" 512 | MIL,"MILWAUKEE, WI" 513 | MSN,"TRUAX FIELD - DANE COUNTY, WI" 514 | CHS,"CHARLESTON, WV" 515 | CLK,"CLARKSBURG, WV" 516 | BLF,"MERCER COUNTY, WV" 517 | CSP,"CASPER, WY" 518 | XXX,NOT REPORTED/UNKNOWN 519 | 888,UNIDENTIFED AIR / SEAPORT 520 | UNK,UNKNOWN POE 521 | CLG,"CALGARY, CANADA" 522 | EDA,"EDMONTON, CANADA" 523 | YHC,"HAKAI PASS, CANADA" 524 | HAL,"Halifax, NS, Canada" 525 | MON,"MONTREAL, CANADA" 526 | OTT,"OTTAWA, CANADA" 527 | YXE,"SASKATOON, CANADA" 528 | TOR,"TORONTO, CANADA" 529 | VCV,"VANCOUVER, CANADA" 530 | VIC,"VICTORIA, CANADA" 531 | WIN,"WINNIPEG, CANADA" 532 | AMS,"AMSTERDAM-SCHIPHOL, NETHERLANDS" 533 | ARB,"ARUBA, NETH ANTILLES" 534 | BAN,"BANKOK, THAILAND" 535 | BEI,"BEICA #ARPT, ETHIOPIA" 536 | PEK,"BEIJING CAPITAL INTL, PRC" 537 | BDA,"KINDLEY FIELD, BERMUDA" 538 | BOG,"BOGOTA, EL DORADO #ARPT, COLOMBIA" 539 | EZE,"BUENOS AIRES, MINISTRO PIST, ARGENTINA" 540 | CUN,"CANCUN, MEXICO" 541 | CRQ,"CARAVELAS, BA #ARPT, BRAZIL" 542 | MVD,"CARRASCO, URUGUAY" 543 | DUB,"DUBLIN, IRELAND" 544 | FOU,"FOUGAMOU #ARPT, GABON" 545 | FBA,"FREEPORT, BAHAMAS" 546 | MTY,"GEN M. 
ESCOBEDO, Monterrey, MX" 547 | HMO,"GEN PESQUEIRA GARCIA, MX" 548 | GCM,"GRAND CAYMAN, CAYMAN ISLAND" 549 | GDL,"GUADALAJARA, MIGUEL HIDAL, MX" 550 | HAM,"HAMILTON, BERMUDA" 551 | ICN,"INCHON, SEOUL KOREA" 552 | IWA,"INVALID - IWAKUNI, JAPAN" 553 | CND,"KOGALNICEANU, ROMANIA" 554 | LAH,"LABUHA ARPT, INDONESIA" 555 | DUR,"LOUIS BOTHA, SOUTH AFRICA" 556 | MAL,"MANGOLE ARPT, INDONESIA" 557 | MDE,"MEDELLIN, COLOMBIA" 558 | MEX,"JUAREZ INTL, MEXICO CITY, MX" 559 | LHR,"MIDDLESEX, ENGLAND" 560 | NBO,"NAIROBI, KENYA" 561 | NAS,"NASSAU, BAHAMAS" 562 | NCA,"NORTH CAICOS, TURK & CAIMAN" 563 | PTY,"OMAR TORRIJOS, PANAMA" 564 | SPV,"PAPUA, NEW GUINEA" 565 | UIO,"QUITO (MARISCAL SUCR), ECUADOR" 566 | RIT,"ROME, ITALY" 567 | SNO,"SAKON NAKHON #ARPT, THAILAND" 568 | SLP,"SAN LUIS POTOSI #ARPT, MEXICO" 569 | SAN,"SAN SALVADOR, EL SALVADOR" 570 | SRO,"SANTANA RAMOS #ARPT, COLOMBIA" 571 | GRU,"GUARULHOS INTL, SAO PAULO, BRAZIL" 572 | SHA,"SHANNON, IRELAND" 573 | HIL,"SHILLAVO, ETHIOPIA" 574 | TOK,"TOROKINA #ARPT, PAPUA, NEW GUINEA" 575 | VER,"VERACRUZ, MEXICO" 576 | LGW,"WEST SUSSEX, ENGLAND" 577 | ZZZ,MEXICO Land (Banco de Mexico) 578 | CHN,No PORT Code (CHN) 579 | CNC,"CANNON CORNERS, NY" 580 | MAA,Abu Dhabi 581 | AG0,"MAGNOLIA, AR" 582 | BHM,"BAR HARBOR, ME" 583 | BHX,"BIRMINGHAM, AL" 584 | CAK,"AKRON, OH" 585 | FOK,"SUFFOLK COUNTY, NY" 586 | LND,"LANDER, WY" 587 | MAR,"MARFA, TX" 588 | MLI,"MOLINE, IL" 589 | RIV,"RIVERSIDE, CA" 590 | RME,"ROME, NY" 591 | VNY,"VAN NUYS, CA" 592 | YUM,"YUMA, AZ" 593 | FRG,Collapsed (FOK) 06/15 594 | HRL,Collapsed (HLG) 06/15 595 | ISP,Collapsed (FOK) 06/15 596 | JSJ,Collapsed (SAJ) 06/15 597 | BUS,Collapsed (BUF) 06/15 598 | IAG,Collapsed (NIA) 06/15 599 | PHN,Collapsed (PHU) 06/15 600 | STN,Collapsed (STR) 06/15 601 | VMB,Collapsed (VNB) 06/15 602 | T01,Collapsed (SEA) 06/15 603 | PHF,No PORT Code (PHF) 604 | DRV,No PORT Code (DRV) 605 | FTB,No PORT Code (FTB) 606 | GAC,No PORT Code (GAC) 607 | GMT,No PORT Code (GMT) 608 | JFA,No PORT Code (JFA) 609 | JMZ,No PORT Code (JMZ) 610 | NC8,No PORT Code (NC8) 611 | NYL,No PORT Code (NYL) 612 | OAI,No PORT Code (OAI) 613 | PCW,No PORT Code (PCW) 614 | WA5,No PORT Code (WAS) 615 | WTR,No PORT Code (WTR) 616 | X96,No PORT Code (X96) 617 | XNA,No PORT Code (XNA) 618 | YGF,No PORT Code (YGF) 619 | 5T6,No PORT Code (5T6) 620 | 060,No PORT Code (60) 621 | SP0,No PORT Code (SP0) 622 | W55,No PORT Code (W55) 623 | X44,No PORT Code (X44) 624 | AUH,No PORT Code (AUH) 625 | RYY,No PORT Code (RYY) 626 | SUS,No PORT Code (SUS) 627 | 74S,No PORT Code (74S) 628 | ATW,No PORT Code (ATW) 629 | CPX,No PORT Code (CPX) 630 | MTH,No PORT Code (MTH) 631 | PFN,No PORT Code (PFN) 632 | SCH,No PORT Code (SCH) 633 | ASI,No PORT Code (ASI) 634 | BKF,No PORT Code (BKF) 635 | DAY,No PORT Code (DAY) 636 | Y62,No PORT Code (Y62) 637 | AG,No PORT Code (AG) 638 | BCM,No PORT Code (BCM) 639 | DEC,No PORT Code (DEC) 640 | PLB,No PORT Code (PLB) 641 | CXO,No PORT Code (CXO) 642 | JBQ,No PORT Code (JBQ) 643 | JIG,No PORT Code (JIG) 644 | OGS,No PORT Code (OGS) 645 | TIW,No PORT Code (TIW) 646 | OTS,No PORT Code (OTS) 647 | AMT,No PORT Code (AMT) 648 | EGE,No PORT Code (EGE) 649 | GPI,No PORT Code (GPI) 650 | NGL,No PORT Code (NGL) 651 | OLM,No PORT Code (OLM) 652 | .GA,No PORT Code (.GA) 653 | CLX,No PORT Code (CLX) 654 | CP ,No PORT Code (CP) 655 | FSC,No PORT Code (FSC) 656 | NK,No PORT Code (NK) 657 | ADU,No PORT Code (ADU) 658 | AKT,No PORT Code (AKT) 659 | LIT,No PORT Code (LIT) 660 | A2A,No PORT Code (A2A) 661 | OSN,No PORT Code (OSN) 662 | 
-------------------------------------------------------------------------------- /lookup/I94VISA.csv: -------------------------------------------------------------------------------- 1 | ID,Type 2 | 1,Business 3 | 2,Pleasure 4 | 3,Student -------------------------------------------------------------------------------- /sql/create_tables.sql: -------------------------------------------------------------------------------- 1 | -- STAGING TABLES 2 | 3 | CREATE TABLE public.immigration ( 4 | i94mon int4, 5 | cicid int4, 6 | i94visa int4, 7 | i94res int4, 8 | i94yr int4, 9 | i94mode int4, 10 | i94cit int4, 11 | i94bir int4, 12 | stay int4, 13 | arrdate varchar, 14 | depdate varchar, 15 | airline varchar, 16 | fltno varchar, 17 | i94port varchar, 18 | visatype varchar, 19 | gender varchar, 20 | i94addr varchar, 21 | CONSTRAINT immigration_pkey PRIMARY KEY ("cicid") 22 | ); 23 | 24 | CREATE TABLE public.country ( 25 | Code int4, 26 | Country varchar, 27 | Temperature float, 28 | Latitude varchar, 29 | Longitude varchar, 30 | CONSTRAINT country_pkey PRIMARY KEY ("Code") 31 | ); 32 | 33 | CREATE TABLE public.state ( 34 | Code varchar, 35 | State varchar, 36 | BlackOrAfricanAmerican int8, 37 | White int8, 38 | ForeignBorn int8, 39 | AmericanIndianAndAlaskaNative int8, 40 | HispanicOrLatino int8, 41 | Asian int8, 42 | NumberVeterans int8, 43 | FemalePopulation int8, 44 | MalePopulation int8, 45 | TotalPopulation int8, 46 | CONSTRAINT state_pkey PRIMARY KEY ("Code") 47 | ); 48 | 49 | CREATE TABLE public."date" ( 50 | "date" varchar NOT NULL, 51 | "day" int4, 52 | "month" int4, 53 | "year" int4, 54 | weekofyear int4, 55 | dayofweek int4, 56 | CONSTRAINT date_pkey PRIMARY KEY ("date") 57 | ) ; 58 | --------------------------------------------------------------------------------
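
To illustrate how the tables defined in `sql/create_tables.sql` fit together, here is a minimal, hedged sketch of an analytical query against that schema. The join keys are assumptions inferred from the column names in the DDL (`arrdate` matching `date."date"`, `i94addr` matching `state.Code`, and `i94res` matching `country.Code`); the project itself does not confirm these mappings, so treat this as a sketch rather than the project's actual query.

```sql
-- Hedged sketch: monthly arrivals per destination state and country of residence,
-- using only the tables created in sql/create_tables.sql.
-- Join keys are assumptions based on column names, not confirmed by the project.
SELECT d."year",
       d."month",
       s.State                  AS destination_state,
       c.Country                AS country_of_residence,
       COUNT(*)                 AS arrivals,
       AVG(i.stay::float)       AS avg_stay_days
FROM public.immigration i
JOIN public."date"  d ON d."date" = i.arrdate   -- assumes arrdate uses the same format as date."date"
JOIN public.state   s ON s.Code   = i.i94addr   -- assumes i94addr holds the I94ADDR state code
JOIN public.country c ON c.Code   = i.i94res    -- assumes i94res maps to country.Code
GROUP BY d."year", d."month", s.State, c.Country
ORDER BY arrivals DESC;
```

In Redshift, unquoted and quoted identifiers are both folded to lowercase by default, so the mixed-case column names in the DDL (`Code`, `State`, `Country`) can be referenced as written here without quoting.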