├── .gitignore
├── Capstone Project Template.ipynb
├── README.md
├── airflow
│   ├── dags
│   │   └── capstone.py
│   └── plugins
│       └── operators
│           ├── data_quality.py
│           └── stage_redshift.py
├── helper
│   ├── __init__.py
│   └── etl.py
├── images
│   ├── architecture.png
│   ├── dag.PNG
│   ├── etl_country.png
│   ├── etl_immigration.png
│   ├── etl_state.png
│   ├── i94cit.PNG
│   ├── pipeline.png
│   └── star-schema.PNG
├── lookup
│   ├── I94ADDR.csv
│   ├── I94CIT_I94RES.csv
│   ├── I94MODE.csv
│   ├── I94PORT.csv
│   └── I94VISA.csv
└── sql
    └── create_tables.sql
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # datasets
107 | airport-codes_csv.csv
108 | I94_SAS_Labels_Descriptions.SAS
109 | immigration_data_sample.csv
110 | us-cities-demographics.csv
111 |
112 | # VS code
113 | /.vscode
114 |
115 | # data
116 | /data
117 |
118 | # configuration
119 | *.cfg
120 |
121 | # power point
122 | *.pptx
--------------------------------------------------------------------------------
/Capstone Project Template.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# US Visitors DW\n",
8 | "__Supporting officials' decision-making to provide better visitors experience in the US__\n",
9 | "***\n",
10 | "## Overview\n",
11 | "The purpose of this data engineering capstone project is to give students a chance to combine what they've learned throughout the program. This project will be an important part of learners portfolio that will help to achieve data engineering-related career goals. We could choose to complete the project provided by the Udacity team or define the scope and data ourselves. I took the first approach in building the DW on the data on immigration to the United States provided by Udacity.\n",
12 | "\n",
13 | "## Business Scenario\n",
14 | "We are D2I (Data to Insights), a business consulting firm specialized in data warehouse services through assisting the enterprises with navigating their data needs and creating strategic operational solutions that deliver tangible business results. Specifically, we can help with the modernization of corporations' data warehousing infrastructure by improving performance and ease of use for end users, enhancing functionality, decreasing total cost of ownership while making it possible for real-time decision making. In total, our full suite of services includes helping enterprises with data profiling, data standardization, data acquisition, data transformation and integration.\n",
15 | "\n",
16 | "We have been contracted by the U.S. Customs and Border Protection to help them see what is hidden behind the data flood. We aim to model and create a brand new analytics solution on top of the state-of-the-art technolgies available to enable them to unleash insights from data then providing better customer experiences when coming to the US.\n",
17 | "\n",
18 | "## Structure of the Project\n",
19 | "Following the Udacity guide for this project, we structured this documentation with steps below:\n",
20 | "* Step 1: Scope the Project and Gather Data\n",
21 | "* Step 2: Explore and Assess the Data\n",
22 | "* Step 3: Define the Data Model\n",
23 | "* Step 4: Run ETL to Model the Data\n",
24 | "* Step 5: Complete Project Write Up"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Step 1: Scope the Project and Gather Data\n",
32 | "\n",
33 | "_Explain what you plan to do in the project in more detail. What data do you use? What is your end solution look like? What tools did you use? etc_\n",
34 | "\n",
35 | "### The Scope \n",
36 | "The main deliverable of our work here will be a data warehouse in the cloud that will support answering questions through analytics tables and dashboards. Additionally, as we developed a general source-of-truth database, the Government of the US could open the solution through a web API so backend web services could query the warehouse for information relating to international visitors.\n",
37 | "\n",
38 | "### The Data\n",
39 | "For this work we have used the immigration, the global temperature and demographics datasets as well as the descriptions contained in the `I94_SAS_Labels_Descriptions.SAS` file.\n",
40 | "\n",
41 | "### The Architecture\n",
42 | "The whole solution is cloud based on top of __Amazon Web Services (AWS)__. First, all the datasets were preprocessed with __Apache Spark__ and stored in a staging area in __AWS S3__ bucket. Then, we loaded those to a __Amazon Redshift__ cluster using an __Apache Airflow__ pipeline that transfer and check the quality of the data to finally provide our customers a data mart for their convenient analysis.\n",
43 | "\n",
44 | "\n",
45 | "\n",
46 | "The main information and questions a user may want to extract from the data mart would be:\n",
47 | "\n",
48 | "* Visitors by nationality.\n",
49 | "* Visitors by origin.\n",
50 | "* Visitors by airline.\n",
51 | "* Correlations between destination in the U.S and the source country.\n",
52 | "* Correlations between destination in the U.S and source climates.\n",
53 | "* Correlations between immigration by source region, and the source region temperature.\n",
54 | "* Correlations between visitor demographics, and states visited.\n",
55 | "\n",
56 | "***"
57 | ]
58 | },
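{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an illustration of the questions above, the snippet below sketches how an analyst could query the final star schema for visitors by nationality. This is only an illustrative sketch: the connection parameters are placeholders, and the table and column names follow the data dictionary in Step 4.3 rather than the exact DDL in `sql/create_tables.sql`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged example only: the Redshift connection values are placeholders, not real credentials\n",
"import psycopg2\n",
"\n",
"conn = psycopg2.connect(host='<redshift-endpoint>', dbname='<db>',\n",
"                        user='<user>', password='<password>', port=5439)\n",
"\n",
"# Top 10 nationalities of visitors, joining the fact table to the COUNTRY dimension\n",
"query = '''\n",
"    SELECT c.country, COUNT(*) AS visitors\n",
"    FROM immigration i\n",
"    JOIN country c ON i.i94res = c.code\n",
"    GROUP BY c.country\n",
"    ORDER BY visitors DESC\n",
"    LIMIT 10;\n",
"'''\n",
"\n",
"with conn.cursor() as cur:\n",
"    cur.execute(query)\n",
"    for country, visitors in cur.fetchall():\n",
"        print(country, visitors)"
]
},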
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "## Step 2: Explore and Assess the Data\n",
64 | "\n",
65 | "_To familiarize ourselves with the data provided by Udacity we have done an exhaustive exploratory data analysis ([EDA](https://en.wikipedia.org/wiki/Exploratory_data_analysis)) checking what data would be useful and what preprocessing steps we should take in order to clean, organize and join the various datasets in a meaningful data model._\n",
66 | "\n",
67 | "In the following sections we briefly describe the datasets provided and give a summarized idea on the reasons we took into consideration when deciding what data to use.\n",
68 | "\n",
69 | "__Immigration Data__\n",
70 | "\n",
71 | "For decades, U.S. immigration officers issued the I-94 Form (Arrival/Departure Record) to foreign visitors (e.g., business visitors, tourists and foreign students) who lawfully entered the United States. The I-94 was a small white paper form that a foreign visitor received from cabin crews on arrival flights and from U.S. Customs and Border Protection at the time of entry into the United States. It listed the traveler's immigration category, port of entry, data of entry into the United States, status expiration date and had a unique 11-digit identifying number assigned to it. Its purpose was to record the traveler's lawful admission to the United States.\n",
72 | "\n",
73 | "This is the main dataset and there is a file for each month of the year of 2016 available in the directory `../../data/18-83510-I94-Data-2016/` in the [SAS](https://www.sas.com/en_us/home.html) binary database storage format `sas7bdat`. Combined, the 12 datasets have got more than 40 million rows (40.790.529) and 28 columns. For most of the work we used only the month of April of 2016 which has more than three million records (3.096.313)."
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 1,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "# Importing the libraries needed in this project\n",
83 | "import os\n",
84 | "import pandas as pd\n",
85 | "from datetime import datetime"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 2,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "immigration_fname = '../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat'\n",
95 | "immigration = pd.read_sas(immigration_fname, 'sas7bdat', encoding=\"ISO-8859-1\")"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 3,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/html": [
106 | "
\n",
107 | "\n",
120 | "
\n",
121 | " \n",
122 | " \n",
123 | " | \n",
124 | " cicid | \n",
125 | " i94yr | \n",
126 | " i94mon | \n",
127 | " i94cit | \n",
128 | " i94res | \n",
129 | " i94port | \n",
130 | " arrdate | \n",
131 | " i94mode | \n",
132 | " i94addr | \n",
133 | " depdate | \n",
134 | " ... | \n",
135 | " entdepu | \n",
136 | " matflag | \n",
137 | " biryear | \n",
138 | " dtaddto | \n",
139 | " gender | \n",
140 | " insnum | \n",
141 | " airline | \n",
142 | " admnum | \n",
143 | " fltno | \n",
144 | " visatype | \n",
145 | "
\n",
146 | " \n",
147 | " \n",
148 | " \n",
149 | " 0 | \n",
150 | " 6.0 | \n",
151 | " 2016.0 | \n",
152 | " 4.0 | \n",
153 | " 692.0 | \n",
154 | " 692.0 | \n",
155 | " XXX | \n",
156 | " 20573.0 | \n",
157 | " NaN | \n",
158 | " NaN | \n",
159 | " NaN | \n",
160 | " ... | \n",
161 | " U | \n",
162 | " NaN | \n",
163 | " 1979.0 | \n",
164 | " 10282016 | \n",
165 | " NaN | \n",
166 | " NaN | \n",
167 | " NaN | \n",
168 | " 1.897628e+09 | \n",
169 | " NaN | \n",
170 | " B2 | \n",
171 | "
\n",
172 | " \n",
173 | " 1 | \n",
174 | " 7.0 | \n",
175 | " 2016.0 | \n",
176 | " 4.0 | \n",
177 | " 254.0 | \n",
178 | " 276.0 | \n",
179 | " ATL | \n",
180 | " 20551.0 | \n",
181 | " 1.0 | \n",
182 | " AL | \n",
183 | " NaN | \n",
184 | " ... | \n",
185 | " Y | \n",
186 | " NaN | \n",
187 | " 1991.0 | \n",
188 | " D/S | \n",
189 | " M | \n",
190 | " NaN | \n",
191 | " NaN | \n",
192 | " 3.736796e+09 | \n",
193 | " 00296 | \n",
194 | " F1 | \n",
195 | "
\n",
196 | " \n",
197 | " 2 | \n",
198 | " 15.0 | \n",
199 | " 2016.0 | \n",
200 | " 4.0 | \n",
201 | " 101.0 | \n",
202 | " 101.0 | \n",
203 | " WAS | \n",
204 | " 20545.0 | \n",
205 | " 1.0 | \n",
206 | " MI | \n",
207 | " 20691.0 | \n",
208 | " ... | \n",
209 | " NaN | \n",
210 | " M | \n",
211 | " 1961.0 | \n",
212 | " 09302016 | \n",
213 | " M | \n",
214 | " NaN | \n",
215 | " OS | \n",
216 | " 6.666432e+08 | \n",
217 | " 93 | \n",
218 | " B2 | \n",
219 | "
\n",
220 | " \n",
221 | " 3 | \n",
222 | " 16.0 | \n",
223 | " 2016.0 | \n",
224 | " 4.0 | \n",
225 | " 101.0 | \n",
226 | " 101.0 | \n",
227 | " NYC | \n",
228 | " 20545.0 | \n",
229 | " 1.0 | \n",
230 | " MA | \n",
231 | " 20567.0 | \n",
232 | " ... | \n",
233 | " NaN | \n",
234 | " M | \n",
235 | " 1988.0 | \n",
236 | " 09302016 | \n",
237 | " NaN | \n",
238 | " NaN | \n",
239 | " AA | \n",
240 | " 9.246846e+10 | \n",
241 | " 00199 | \n",
242 | " B2 | \n",
243 | "
\n",
244 | " \n",
245 | " 4 | \n",
246 | " 17.0 | \n",
247 | " 2016.0 | \n",
248 | " 4.0 | \n",
249 | " 101.0 | \n",
250 | " 101.0 | \n",
251 | " NYC | \n",
252 | " 20545.0 | \n",
253 | " 1.0 | \n",
254 | " MA | \n",
255 | " 20567.0 | \n",
256 | " ... | \n",
257 | " NaN | \n",
258 | " M | \n",
259 | " 2012.0 | \n",
260 | " 09302016 | \n",
261 | " NaN | \n",
262 | " NaN | \n",
263 | " AA | \n",
264 | " 9.246846e+10 | \n",
265 | " 00199 | \n",
266 | " B2 | \n",
267 | "
\n",
268 | " \n",
269 | "
\n",
270 | "
5 rows × 28 columns
\n",
271 | "
"
272 | ],
273 | "text/plain": [
274 | " cicid i94yr i94mon i94cit i94res i94port arrdate i94mode i94addr \\\n",
275 | "0 6.0 2016.0 4.0 692.0 692.0 XXX 20573.0 NaN NaN \n",
276 | "1 7.0 2016.0 4.0 254.0 276.0 ATL 20551.0 1.0 AL \n",
277 | "2 15.0 2016.0 4.0 101.0 101.0 WAS 20545.0 1.0 MI \n",
278 | "3 16.0 2016.0 4.0 101.0 101.0 NYC 20545.0 1.0 MA \n",
279 | "4 17.0 2016.0 4.0 101.0 101.0 NYC 20545.0 1.0 MA \n",
280 | "\n",
281 | " depdate ... entdepu matflag biryear dtaddto gender insnum \\\n",
282 | "0 NaN ... U NaN 1979.0 10282016 NaN NaN \n",
283 | "1 NaN ... Y NaN 1991.0 D/S M NaN \n",
284 | "2 20691.0 ... NaN M 1961.0 09302016 M NaN \n",
285 | "3 20567.0 ... NaN M 1988.0 09302016 NaN NaN \n",
286 | "4 20567.0 ... NaN M 2012.0 09302016 NaN NaN \n",
287 | "\n",
288 | " airline admnum fltno visatype \n",
289 | "0 NaN 1.897628e+09 NaN B2 \n",
290 | "1 NaN 3.736796e+09 00296 F1 \n",
291 | "2 OS 6.666432e+08 93 B2 \n",
292 | "3 AA 9.246846e+10 00199 B2 \n",
293 | "4 AA 9.246846e+10 00199 B2 \n",
294 | "\n",
295 | "[5 rows x 28 columns]"
296 | ]
297 | },
298 | "execution_count": 3,
299 | "metadata": {},
300 | "output_type": "execute_result"
301 | }
302 | ],
303 | "source": [
304 | "immigration.head()"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "__Data Dictionary__: Here, we describe the various fields of the dataset. Some descriptions were not clear enough so we had to make assumptions about the meaning."
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "| Column Name | Description |\n",
319 | "| :--- | :--- |\n",
320 | "| CICID* | ID that uniquely identify one record in the dataset |\n",
321 | "| I94YR | 4 digit year |\n",
322 | "| I94MON | Numeric month |\n",
323 | "| I94CIT | 3 digit code of source city for immigration (Born country) |\n",
324 | "| I94RES | 3 digit code of source country for immigration (Residence country) |\n",
325 | "| I94PORT | Port addmitted through |\n",
326 | "| ARRDATE | Arrival date in the USA |\n",
327 | "| I94MODE | Mode of transportation (1 = Air; 2 = Sea; 3 = Land; 9 = Not reported) |\n",
328 | "| I94ADDR | State of arrival |\n",
329 | "| DEPDATE | Departure date |\n",
330 | "| I94BIR | Age of Respondent in Years |\n",
331 | "| I94VISA | Visa codes collapsed into three categories: (1 = Business; 2 = Pleasure; 3 = Student) |\n",
332 | "| COUNT | Used for summary statistics |\n",
333 | "| DTADFILE | Character Date Field |\n",
334 | "| VISAPOST | Department of State where where Visa was issued |\n",
335 | "| OCCUP | Occupation that will be performed in U.S. |\n",
336 | "| ENTDEPA | Arrival Flag. Whether admitted or paroled into the US |\n",
337 | "| ENTDEPD | Departure Flag. Whether departed, lost visa, or deceased |\n",
338 | "| ENTDEPU | Update Flag. Update of visa, either apprehended, overstayed, or updated to PR |\n",
339 | "| MATFLAG | Match flag |\n",
340 | "| BIRYEAR | 4 digit year of birth |\n",
341 | "| DTADDTO | Character date field to when admitted in the US |\n",
342 | "| GENDER | Gender |\n",
343 | "| INSNUM | INS number |\n",
344 | "| AIRLINE | Airline used to arrive in U.S. |\n",
345 | "| ADMNUM | Admission number, should be unique and not nullable |\n",
346 | "| FLTNO | Flight number of Airline used to arrive in U.S. |\n",
347 | "| VISATYPE | Class of admission legally admitting the non-immigrant to temporarily stay in U.S. |"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "The immigration dataset is our fact so that will be at the center of the star schema model of our data warehouse."
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "metadata": {},
360 | "source": [
361 | "__Global Temperature Data__\n",
362 | "\n",
363 | "There are a range of organizations that collate climate trends data. The three most cited land and ocean temperature data sets are NOAA’s MLOST, NASA’s GISTEMP and the UK’s HadCrut.\n",
364 | "\n",
365 | "The Berkeley Earth, which is affiliated with Lawrence Berkeley National Laboratory, has repackaged the data from a newer compilation put it all together. The Berkeley Earth Surface Temperature Study combines 1.6 billion temperature reports from 16 pre-existing archives. It is nicely packaged and allows for slicing into interesting subsets (for example by country). They publish the source data and the code for the transformations they applied. They also use methods that allow weather observations from shorter time series to be included, meaning fewer observations need to be thrown away.\n",
366 | "\n",
367 | "In the original dataset from [Kaggle](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data), several files are available but in this capstone project we will be using only the `GlobalLandTemperaturesByCity`."
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 6,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "temperature_fname = '../../data2/GlobalLandTemperaturesByCity.csv'\n",
377 | "world_temperature = pd.read_csv(temperature_fname)"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 7,
383 | "metadata": {},
384 | "outputs": [
385 | {
386 | "data": {
387 | "text/html": [
388 | "\n",
389 | "\n",
402 | "
\n",
403 | " \n",
404 | " \n",
405 | " | \n",
406 | " dt | \n",
407 | " AverageTemperature | \n",
408 | " AverageTemperatureUncertainty | \n",
409 | " City | \n",
410 | " Country | \n",
411 | " Latitude | \n",
412 | " Longitude | \n",
413 | "
\n",
414 | " \n",
415 | " \n",
416 | " \n",
417 | " 0 | \n",
418 | " 1743-11-01 | \n",
419 | " 6.068 | \n",
420 | " 1.737 | \n",
421 | " Århus | \n",
422 | " Denmark | \n",
423 | " 57.05N | \n",
424 | " 10.33E | \n",
425 | "
\n",
426 | " \n",
427 | " 1 | \n",
428 | " 1743-12-01 | \n",
429 | " NaN | \n",
430 | " NaN | \n",
431 | " Århus | \n",
432 | " Denmark | \n",
433 | " 57.05N | \n",
434 | " 10.33E | \n",
435 | "
\n",
436 | " \n",
437 | " 2 | \n",
438 | " 1744-01-01 | \n",
439 | " NaN | \n",
440 | " NaN | \n",
441 | " Århus | \n",
442 | " Denmark | \n",
443 | " 57.05N | \n",
444 | " 10.33E | \n",
445 | "
\n",
446 | " \n",
447 | " 3 | \n",
448 | " 1744-02-01 | \n",
449 | " NaN | \n",
450 | " NaN | \n",
451 | " Århus | \n",
452 | " Denmark | \n",
453 | " 57.05N | \n",
454 | " 10.33E | \n",
455 | "
\n",
456 | " \n",
457 | " 4 | \n",
458 | " 1744-03-01 | \n",
459 | " NaN | \n",
460 | " NaN | \n",
461 | " Århus | \n",
462 | " Denmark | \n",
463 | " 57.05N | \n",
464 | " 10.33E | \n",
465 | "
\n",
466 | " \n",
467 | "
\n",
468 | "
"
469 | ],
470 | "text/plain": [
471 | " dt AverageTemperature AverageTemperatureUncertainty City \\\n",
472 | "0 1743-11-01 6.068 1.737 Århus \n",
473 | "1 1743-12-01 NaN NaN Århus \n",
474 | "2 1744-01-01 NaN NaN Århus \n",
475 | "3 1744-02-01 NaN NaN Århus \n",
476 | "4 1744-03-01 NaN NaN Århus \n",
477 | "\n",
478 | " Country Latitude Longitude \n",
479 | "0 Denmark 57.05N 10.33E \n",
480 | "1 Denmark 57.05N 10.33E \n",
481 | "2 Denmark 57.05N 10.33E \n",
482 | "3 Denmark 57.05N 10.33E \n",
483 | "4 Denmark 57.05N 10.33E "
484 | ]
485 | },
486 | "execution_count": 7,
487 | "metadata": {},
488 | "output_type": "execute_result"
489 | }
490 | ],
491 | "source": [
492 | "world_temperature.head()"
493 | ]
494 | },
495 | {
496 | "cell_type": "markdown",
497 | "metadata": {},
498 | "source": [
499 | "__Data Dictionary__\n",
500 | "\n",
501 | "| Column Name | Description |\n",
502 | "| :--- | :--- |\n",
503 | "| dt | Date in format YYYY-MM-DD |\n",
504 | "| AverageTemperature | Average temperature of the city in a given date |\n",
505 | "| City | City Name |\n",
506 | "| Country | Country Name |\n",
507 | "| Latitude | Latitude |\n",
508 | "| Longitude | Longitude |"
509 | ]
510 | },
511 | {
512 | "cell_type": "markdown",
513 | "metadata": {},
514 | "source": [
515 | "The dataset provides a long period of the world's temperature (from year 1743 to 2013). However, since the immigration dataset only has data of the US National Tourism Office in the year of 2016, the vast majority of the data here seems not to be suitable. We then decided to aggregate this dataset by country, averaging the temperatures and use this reduced table to join with `lookup\\I94CIT_I94RES.csv` lookup table (extracted from `I94_SAS_Labels_Descriptions.SAS`) resulting in the COUNTRY dimension of our model.\n",
516 | "\n",
517 | "> If we had temperatures of the year 2016 we could have provided an interesting analysis crossing the two tables (immigration and temperatures) in order to see how the waves of immigration to the US relate to the changes in the temperature. But this is just unfeasible due to the different dates."
518 | ]
519 | },
520 | {
521 | "cell_type": "code",
522 | "execution_count": 8,
523 | "metadata": {},
524 | "outputs": [],
525 | "source": [
526 | "world_temperature = world_temperature.groupby([\"Country\"]).agg({\"AverageTemperature\": \"mean\", \n",
527 | " \"Latitude\": \"first\", \"Longitude\": \"first\"}).reset_index()"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 9,
533 | "metadata": {},
534 | "outputs": [
535 | {
536 | "data": {
537 | "text/html": [
538 | "\n",
539 | "\n",
552 | "
\n",
553 | " \n",
554 | " \n",
555 | " | \n",
556 | " Country | \n",
557 | " AverageTemperature | \n",
558 | " Latitude | \n",
559 | " Longitude | \n",
560 | "
\n",
561 | " \n",
562 | " \n",
563 | " \n",
564 | " 0 | \n",
565 | " Afghanistan | \n",
566 | " 13.816497 | \n",
567 | " 36.17N | \n",
568 | " 69.61E | \n",
569 | "
\n",
570 | " \n",
571 | " 1 | \n",
572 | " Albania | \n",
573 | " 15.525828 | \n",
574 | " 40.99N | \n",
575 | " 19.17E | \n",
576 | "
\n",
577 | " \n",
578 | " 2 | \n",
579 | " Algeria | \n",
580 | " 17.763206 | \n",
581 | " 36.17N | \n",
582 | " 3.98E | \n",
583 | "
\n",
584 | " \n",
585 | " 3 | \n",
586 | " Angola | \n",
587 | " 21.759716 | \n",
588 | " 12.05S | \n",
589 | " 13.15E | \n",
590 | "
\n",
591 | " \n",
592 | " 4 | \n",
593 | " Argentina | \n",
594 | " 16.999216 | \n",
595 | " 39.38S | \n",
596 | " 62.43W | \n",
597 | "
\n",
598 | " \n",
599 | "
\n",
600 | "
"
601 | ],
602 | "text/plain": [
603 | " Country AverageTemperature Latitude Longitude\n",
604 | "0 Afghanistan 13.816497 36.17N 69.61E\n",
605 | "1 Albania 15.525828 40.99N 19.17E\n",
606 | "2 Algeria 17.763206 36.17N 3.98E\n",
607 | "3 Angola 21.759716 12.05S 13.15E\n",
608 | "4 Argentina 16.999216 39.38S 62.43W"
609 | ]
610 | },
611 | "execution_count": 9,
612 | "metadata": {},
613 | "output_type": "execute_result"
614 | }
615 | ],
616 | "source": [
617 | "world_temperature.head()"
618 | ]
619 | },
620 | {
621 | "cell_type": "markdown",
622 | "metadata": {},
623 | "source": [
624 | "__Airports Data__\n",
625 | "\n",
626 | "The airport codes may refer to either [IATA](https://en.wikipedia.org/wiki/IATA_airport_code) airport code, a three-letter code which is used in passenger reservation, ticketing and baggage-handling systems, or the [ICAO](https://en.wikipedia.org/wiki/ICAO_airport_code) airport code which is a four letter code used by ATC systems and for airports that do not have an IATA airport code (from wikipedia).\n",
627 | "\n",
628 | "Airport codes from around the world. Downloaded from public domain source http://ourairports.com/data/ who compiled this data from multiple different sources.\n",
629 | "\n",
630 | "`airport-codes.csv` contains the list of all airport codes, the attributes are identified in datapackage description. Some of the columns contain attributes identifying airport locations, other codes (IATA, local if exist) that are relevant to identification of an airport.\n",
631 | "Original source url is http://ourairports.com/data/airports.csv (stored in archive/data.csv)."
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": 10,
637 | "metadata": {},
638 | "outputs": [],
639 | "source": [
640 | "airport = pd.read_csv(\"airport-codes_csv.csv\")"
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": 11,
646 | "metadata": {},
647 | "outputs": [
648 | {
649 | "data": {
650 | "text/html": [
651 | "\n",
652 | "\n",
665 | "
\n",
666 | " \n",
667 | " \n",
668 | " | \n",
669 | " ident | \n",
670 | " type | \n",
671 | " name | \n",
672 | " elevation_ft | \n",
673 | " continent | \n",
674 | " iso_country | \n",
675 | " iso_region | \n",
676 | " municipality | \n",
677 | " gps_code | \n",
678 | " iata_code | \n",
679 | " local_code | \n",
680 | " coordinates | \n",
681 | "
\n",
682 | " \n",
683 | " \n",
684 | " \n",
685 | " 0 | \n",
686 | " 00A | \n",
687 | " heliport | \n",
688 | " Total Rf Heliport | \n",
689 | " 11.0 | \n",
690 | " NaN | \n",
691 | " US | \n",
692 | " US-PA | \n",
693 | " Bensalem | \n",
694 | " 00A | \n",
695 | " NaN | \n",
696 | " 00A | \n",
697 | " -74.93360137939453, 40.07080078125 | \n",
698 | "
\n",
699 | " \n",
700 | " 1 | \n",
701 | " 00AA | \n",
702 | " small_airport | \n",
703 | " Aero B Ranch Airport | \n",
704 | " 3435.0 | \n",
705 | " NaN | \n",
706 | " US | \n",
707 | " US-KS | \n",
708 | " Leoti | \n",
709 | " 00AA | \n",
710 | " NaN | \n",
711 | " 00AA | \n",
712 | " -101.473911, 38.704022 | \n",
713 | "
\n",
714 | " \n",
715 | " 2 | \n",
716 | " 00AK | \n",
717 | " small_airport | \n",
718 | " Lowell Field | \n",
719 | " 450.0 | \n",
720 | " NaN | \n",
721 | " US | \n",
722 | " US-AK | \n",
723 | " Anchor Point | \n",
724 | " 00AK | \n",
725 | " NaN | \n",
726 | " 00AK | \n",
727 | " -151.695999146, 59.94919968 | \n",
728 | "
\n",
729 | " \n",
730 | " 3 | \n",
731 | " 00AL | \n",
732 | " small_airport | \n",
733 | " Epps Airpark | \n",
734 | " 820.0 | \n",
735 | " NaN | \n",
736 | " US | \n",
737 | " US-AL | \n",
738 | " Harvest | \n",
739 | " 00AL | \n",
740 | " NaN | \n",
741 | " 00AL | \n",
742 | " -86.77030181884766, 34.86479949951172 | \n",
743 | "
\n",
744 | " \n",
745 | " 4 | \n",
746 | " 00AR | \n",
747 | " closed | \n",
748 | " Newport Hospital & Clinic Heliport | \n",
749 | " 237.0 | \n",
750 | " NaN | \n",
751 | " US | \n",
752 | " US-AR | \n",
753 | " Newport | \n",
754 | " NaN | \n",
755 | " NaN | \n",
756 | " NaN | \n",
757 | " -91.254898, 35.6087 | \n",
758 | "
\n",
759 | " \n",
760 | "
\n",
761 | "
"
762 | ],
763 | "text/plain": [
764 | " ident type name elevation_ft \\\n",
765 | "0 00A heliport Total Rf Heliport 11.0 \n",
766 | "1 00AA small_airport Aero B Ranch Airport 3435.0 \n",
767 | "2 00AK small_airport Lowell Field 450.0 \n",
768 | "3 00AL small_airport Epps Airpark 820.0 \n",
769 | "4 00AR closed Newport Hospital & Clinic Heliport 237.0 \n",
770 | "\n",
771 | " continent iso_country iso_region municipality gps_code iata_code \\\n",
772 | "0 NaN US US-PA Bensalem 00A NaN \n",
773 | "1 NaN US US-KS Leoti 00AA NaN \n",
774 | "2 NaN US US-AK Anchor Point 00AK NaN \n",
775 | "3 NaN US US-AL Harvest 00AL NaN \n",
776 | "4 NaN US US-AR Newport NaN NaN \n",
777 | "\n",
778 | " local_code coordinates \n",
779 | "0 00A -74.93360137939453, 40.07080078125 \n",
780 | "1 00AA -101.473911, 38.704022 \n",
781 | "2 00AK -151.695999146, 59.94919968 \n",
782 | "3 00AL -86.77030181884766, 34.86479949951172 \n",
783 | "4 NaN -91.254898, 35.6087 "
784 | ]
785 | },
786 | "execution_count": 11,
787 | "metadata": {},
788 | "output_type": "execute_result"
789 | }
790 | ],
791 | "source": [
792 | "airport.head()"
793 | ]
794 | },
795 | {
796 | "cell_type": "markdown",
797 | "metadata": {},
798 | "source": [
799 | "__Data Dictionary__\n",
800 | "\n",
801 | "| Column Name | Description |\n",
802 | "| :--- | :--- |\n",
803 | "| ident | Unique identifier |\n",
804 | "| type | Type of the airport |\n",
805 | "| name | Airport Name |\n",
806 | "| elevation_ft | Altitude of the airport |\n",
807 | "| continent | Continent |\n",
808 | "| iso_country | ISO code of the country of the airport |\n",
809 | "| iso_region | ISO code for the region of the airport |\n",
810 | "| municipality | City where the airport is located |\n",
811 | "| gps_code | GPS code of the airport |\n",
812 | "| iata_code | IATA code of the airport |\n",
813 | "| local_code | Local code of the airport |\n",
814 | "| coordinates | GPS coordinates of the airport |"
815 | ]
816 | },
817 | {
818 | "cell_type": "markdown",
819 | "metadata": {},
820 | "source": [
821 | "We are not using the airport dataset in our model. We came to a conclusion that it did not prove to be a good source of analysis once we were not able to join this to the main table immigration. We did not find a valid and consistent key in both tables in order to cross them. None of the codes (ident, gps_code, iata_code or local_code) seemed to match the columns in the immigration fact table."
822 | ]
823 | },
824 | {
825 | "cell_type": "markdown",
826 | "metadata": {},
827 | "source": [
828 | "__U.S. City Demographic Data__\n",
829 | "\n",
830 | "This dataset contains information about the demographics of all US cities and census-designated places with a population greater or equal to 65,000. This data comes from the US Census Bureau's 2015 American Community Survey.\n",
831 | "\n",
832 | "This product uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau."
833 | ]
834 | },
835 | {
836 | "cell_type": "code",
837 | "execution_count": 12,
838 | "metadata": {},
839 | "outputs": [],
840 | "source": [
841 | "us_cities_demographics = pd.read_csv(\"us-cities-demographics.csv\", sep=\";\")"
842 | ]
843 | },
844 | {
845 | "cell_type": "code",
846 | "execution_count": 13,
847 | "metadata": {},
848 | "outputs": [
849 | {
850 | "data": {
851 | "text/html": [
852 | "\n",
853 | "\n",
866 | "
\n",
867 | " \n",
868 | " \n",
869 | " | \n",
870 | " City | \n",
871 | " State | \n",
872 | " Median Age | \n",
873 | " Male Population | \n",
874 | " Female Population | \n",
875 | " Total Population | \n",
876 | " Number of Veterans | \n",
877 | " Foreign-born | \n",
878 | " Average Household Size | \n",
879 | " State Code | \n",
880 | " Race | \n",
881 | " Count | \n",
882 | "
\n",
883 | " \n",
884 | " \n",
885 | " \n",
886 | " 0 | \n",
887 | " Silver Spring | \n",
888 | " Maryland | \n",
889 | " 33.8 | \n",
890 | " 40601.0 | \n",
891 | " 41862.0 | \n",
892 | " 82463 | \n",
893 | " 1562.0 | \n",
894 | " 30908.0 | \n",
895 | " 2.60 | \n",
896 | " MD | \n",
897 | " Hispanic or Latino | \n",
898 | " 25924 | \n",
899 | "
\n",
900 | " \n",
901 | " 1 | \n",
902 | " Quincy | \n",
903 | " Massachusetts | \n",
904 | " 41.0 | \n",
905 | " 44129.0 | \n",
906 | " 49500.0 | \n",
907 | " 93629 | \n",
908 | " 4147.0 | \n",
909 | " 32935.0 | \n",
910 | " 2.39 | \n",
911 | " MA | \n",
912 | " White | \n",
913 | " 58723 | \n",
914 | "
\n",
915 | " \n",
916 | " 2 | \n",
917 | " Hoover | \n",
918 | " Alabama | \n",
919 | " 38.5 | \n",
920 | " 38040.0 | \n",
921 | " 46799.0 | \n",
922 | " 84839 | \n",
923 | " 4819.0 | \n",
924 | " 8229.0 | \n",
925 | " 2.58 | \n",
926 | " AL | \n",
927 | " Asian | \n",
928 | " 4759 | \n",
929 | "
\n",
930 | " \n",
931 | " 3 | \n",
932 | " Rancho Cucamonga | \n",
933 | " California | \n",
934 | " 34.5 | \n",
935 | " 88127.0 | \n",
936 | " 87105.0 | \n",
937 | " 175232 | \n",
938 | " 5821.0 | \n",
939 | " 33878.0 | \n",
940 | " 3.18 | \n",
941 | " CA | \n",
942 | " Black or African-American | \n",
943 | " 24437 | \n",
944 | "
\n",
945 | " \n",
946 | " 4 | \n",
947 | " Newark | \n",
948 | " New Jersey | \n",
949 | " 34.6 | \n",
950 | " 138040.0 | \n",
951 | " 143873.0 | \n",
952 | " 281913 | \n",
953 | " 5829.0 | \n",
954 | " 86253.0 | \n",
955 | " 2.73 | \n",
956 | " NJ | \n",
957 | " White | \n",
958 | " 76402 | \n",
959 | "
\n",
960 | " \n",
961 | "
\n",
962 | "
"
963 | ],
964 | "text/plain": [
965 | " City State Median Age Male Population \\\n",
966 | "0 Silver Spring Maryland 33.8 40601.0 \n",
967 | "1 Quincy Massachusetts 41.0 44129.0 \n",
968 | "2 Hoover Alabama 38.5 38040.0 \n",
969 | "3 Rancho Cucamonga California 34.5 88127.0 \n",
970 | "4 Newark New Jersey 34.6 138040.0 \n",
971 | "\n",
972 | " Female Population Total Population Number of Veterans Foreign-born \\\n",
973 | "0 41862.0 82463 1562.0 30908.0 \n",
974 | "1 49500.0 93629 4147.0 32935.0 \n",
975 | "2 46799.0 84839 4819.0 8229.0 \n",
976 | "3 87105.0 175232 5821.0 33878.0 \n",
977 | "4 143873.0 281913 5829.0 86253.0 \n",
978 | "\n",
979 | " Average Household Size State Code Race Count \n",
980 | "0 2.60 MD Hispanic or Latino 25924 \n",
981 | "1 2.39 MA White 58723 \n",
982 | "2 2.58 AL Asian 4759 \n",
983 | "3 3.18 CA Black or African-American 24437 \n",
984 | "4 2.73 NJ White 76402 "
985 | ]
986 | },
987 | "execution_count": 13,
988 | "metadata": {},
989 | "output_type": "execute_result"
990 | }
991 | ],
992 | "source": [
993 | "us_cities_demographics.head()"
994 | ]
995 | },
996 | {
997 | "cell_type": "markdown",
998 | "metadata": {},
999 | "source": [
1000 | "__Data Dictionary__\n",
1001 | "\n",
1002 | "| Column Name | Description |\n",
1003 | "| :--- | :--- |\n",
1004 | "| City | Name of the city |\n",
1005 | "| State | US state of the city |\n",
1006 | "| Median Age | The median of the age of the population |\n",
1007 | "| Male Population | Number of the male population |\n",
1008 | "| Female Population | Number of the female population |\n",
1009 | "| Total Population | Number of the total population |\n",
1010 | "| Number of Veterans | Number of veterans living in the city |\n",
1011 | "| Foreign-born | Number of residents of the city that were not born in the city |\n",
1012 | "| Average Household Size | Average size of the houses in the city |\n",
1013 | "| State Code | Code of the state of the city |\n",
1014 | "| Race | Race class |\n",
1015 | "| Count | Number of individual of each race |"
1016 | ]
1017 | },
1018 | {
1019 | "cell_type": "markdown",
1020 | "metadata": {},
1021 | "source": [
1022 | "The `US Cities Demographics` is the source of the STATE dimension in our data model. We aggregated the dataset by State and pivoted the `Race` and `Count` columns in order to make each different value of Race to be a column. That way we create a complete table of statistics that summarizes the information for every US state."
1023 | ]
1024 | },
1025 | {
1026 | "cell_type": "markdown",
1027 | "metadata": {},
1028 | "source": [
1029 | "## Step 3: Define the Data Model\n",
1030 | "\n",
1031 | "_In this section of the documentation we detail the process of extract, transform and load the data from the various datasets. As me mentioned before, we are using 3 of the 4 data sources provided by the Udacity team: immigration, temperatures and demographics. Also, we extract descriptions from labels descriptions file `I94_SAS_Labels_Descriptions.SAS`_\n",
1032 | "\n",
1033 | "#### 3.1 Conceptual Data Model\n",
1034 | "_Map out the conceptual data model and explain why you chose that model_\n",
1035 | "\n",
1036 | "The immigration dataset is the origin of the center of our model. As this represent the facts of what we want to analyse - U.S visitors from the world -, this was transformed to the fact table IMMIGRATION as represented in the schema below. We gave this data most of the focus during our modeling phase. The immigration dataset is also the data source for the DATE dimension table. We extracted all the distinct values of the columns arrdate and depdate and applied various functions to store in the table a number of attributes of a particular date: day, month, year, week of year and day of week.\n",
1037 | "\n",
1038 | "\n",
1039 | "\n",
1040 | "The STATE dimension table is the result of the aggregation of the demographics dataset by the State column. Median Age, Male Population, Female Population, Total Population, Number of Veterans, Foreign-born were first aggregated by `City` using `first` function, since they are repeated accross the different rows of the same city. Then, we grouped the resulting rows by `State` applying the `sum` function in the numeric columns to make a cosolidated total in each U.S State. We needed to transform the column `Race` in order to make its different values to become different columns. We achieve this by usig the pivot function of the `pyspark` package. As a result we reached to a final structure where we have got the columns (BlackOrAfricanAmerican, White, ForeignBorn, AmericanIndianAndAlaskaNative, HispanicOrLatino, Asian, NumberVeterans, FemalePopulation, MalePopulation, TotalPopulation) for each of the states of the U.S.\n",
1041 | "\n",
1042 | "The COUNTRY dimention completes our star schema model. To get to the structure we see in the figure above we combined the `GlobalLandTemperaturesByCity` with the code-descriptions found in the file `I94_SAS_Labels_Descriptions.SAS` for the columns `i94cit` and `i94res` showed in the image below.\n",
1043 | "Firstly, we extracted the key-value pairs from the `I94_SAS_Labels_Descriptions.SAS` and saved those in csv files in the `lookup` directory. Following we aggregated the temperature dataset by `City` and then by `Country`. Finally, we join the two intermediary results to form the table COUNTRY. \n",
1044 | "\n",
1045 | ""
1046 | ]
1047 | },
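{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is a minimal sketch of the aggregation and pivot described above, assuming a Spark dataframe `demographics` with the columns shown in the demographics data dictionary. The real implementation lives in `helper/etl.py` and may differ in details such as renaming and null handling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: `demographics` is assumed to be a Spark dataframe of us-cities-demographics.csv\n",
"from pyspark.sql import functions as F\n",
"\n",
"# 1) one row per city: the numeric columns repeat across race rows, so take the first value\n",
"per_city = demographics.groupBy('State Code', 'State', 'City').agg(\n",
"    F.first('Male Population').alias('MalePopulation'),\n",
"    F.first('Female Population').alias('FemalePopulation'),\n",
"    F.first('Total Population').alias('TotalPopulation'),\n",
"    F.first('Number of Veterans').alias('NumberVeterans'),\n",
"    F.first('Foreign-born').alias('ForeignBorn'))\n",
"\n",
"# 2) consolidate cities into states\n",
"per_state = per_city.groupBy('State Code', 'State').agg(\n",
"    F.sum('MalePopulation').alias('MalePopulation'),\n",
"    F.sum('FemalePopulation').alias('FemalePopulation'),\n",
"    F.sum('TotalPopulation').alias('TotalPopulation'),\n",
"    F.sum('NumberVeterans').alias('NumberVeterans'),\n",
"    F.sum('ForeignBorn').alias('ForeignBorn'))\n",
"\n",
"# 3) turn each Race value into its own column and join it back to the state totals\n",
"races = demographics.groupBy('State Code').pivot('Race').sum('Count')\n",
"state = per_state.join(races, on='State Code', how='left')"
]
},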
1048 | {
1049 | "cell_type": "markdown",
1050 | "metadata": {},
1051 | "source": [
1052 | "#### 3.2 Mapping Out Data Pipelines\n",
1053 | "_List the steps necessary to pipeline the data into the chosen data model_\n",
1054 | "\n",
1055 | "\n",
1056 | "To accomplish all the tasks related to the preprocessing of the datasets it was developed a number of functions in a package we called `helper.etl`. There you will find different helper functions to load, select, clean, transform and store the resultind datasets in a very convenient way. The open-source framework Apache Spark was the main tool in this journey. Spark provides an interface for programming entire clusters with implicit data parallelism and fault tolerance.\n",
1057 | "\n",
1058 | "We concentrated all the logic of preprocessing there in order to only represent here the general steps of the ETL. This notebook here is only for document purposes whereas the actual run of the ETL takes place in the Spark in cloud-native big data platform [Amazon EMR](https://aws.amazon.com/emr/?nc1=h_ls) through the execution of the main function of the `etl` package. The documentation of the functions can be found in the docstring alongside the code of the package in `helper/etl.py` file."
1059 | ]
1060 | },
1061 | {
1062 | "cell_type": "code",
1063 | "execution_count": 3,
1064 | "metadata": {},
1065 | "outputs": [],
1066 | "source": [
1067 | "# import the ETL package\n",
1068 | "from helper.etl import create_spark_session, etl_immigration_data, etl_countries_data, etl_states_data"
1069 | ]
1070 | },
1071 | {
1072 | "cell_type": "code",
1073 | "execution_count": 2,
1074 | "metadata": {},
1075 | "outputs": [],
1076 | "source": [
1077 | "# create Spark session\n",
1078 | "spark = create_spark_session()"
1079 | ]
1080 | },
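{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the cell below is a minimal sketch of what `create_spark_session` could look like, assuming the `saurfang:spark-sas7bdat` package is used to read the `sas7bdat` files (the package version shown is an assumption). The actual implementation lives in `helper/etl.py`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: the real helper may add more configuration (e.g. AWS credentials, Hive support)\n",
"from pyspark.sql import SparkSession\n",
"\n",
"def create_spark_session():\n",
"    \"\"\"Create (or reuse) a Spark session able to read SAS sas7bdat files.\"\"\"\n",
"    return (SparkSession.builder\n",
"            .appName('us-visitors-dw')\n",
"            .config('spark.jars.packages', 'saurfang:spark-sas7bdat:2.0.0-s_2.11')\n",
"            .getOrCreate())"
]
},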
1081 | {
1082 | "cell_type": "markdown",
1083 | "metadata": {},
1084 | "source": [
1085 | "#### Immigration and Date datasets\n",
1086 | "The preprocessing of the main dataset immigration starts by loading the data from the SAS file and is completed by generating and the storing of the processed dataframes to a bucket in Amazon S3. In summary, the following tasks are performed throughout the process:\n",
1087 | "* Loading of the immigration file into Spark dataframe. We only load useful columns as we identified them in the EDA phase. In particular we discarded the follouwing fields: 'admnum', 'biryear', 'count', 'dtaddto', 'dtadfile', 'entdepa', 'entdepd', 'entdepu', 'insnum', 'matflag', 'occup', 'visapost';\n",
1088 | "* Though some columns were actually of Integer type, the Spark framework loaded them as double or strings. To correct this we convert those fields to the proper class;\n",
1089 | "* The dates in the immigration dataframe are stored in SAS date format, which is a value that represents the number of days between January 1, 1960, and a specified date. We convert the dates in the dataframe to a string date format in the pattern YYYY-MM-DD;\n",
1090 | "* We drop high missing value columns \"visapost\", \"occup\", \"entdepu\" and \"insnum\";\n",
1091 | "* Creation of `stay` column from calculating the difference in days between the departure (depdate) and arrival (arrdate) date of the visitors. That will be useful to analyse how long is the average stay of visitors and where they tend to stay longer;\n",
1092 | "* From the date columns arrdate and depdate we create a second dataframe DATE;\n",
1093 | "* Save the processed immigration and date dataframes to the Amazon S3 in the parquet format;"
1094 | ]
1095 | },
1096 | {
1097 | "cell_type": "markdown",
1098 | "metadata": {},
1099 | "source": [
1100 | "
"
1101 | ]
1102 | },
1103 | {
1104 | "cell_type": "code",
1105 | "execution_count": 3,
1106 | "metadata": {},
1107 | "outputs": [],
1108 | "source": [
1109 | "# Perform ETL process for the Immigration dataset generating immigration and date tables and save them in the S3 bucket indicated in the output_path parameters.\n",
1110 | "immigration = etl_immigration_data(spark, input_path='../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat',\n",
1111 | " output_path=\"s3a://data-engineer-capstone/immigration.parquet\",\n",
1112 | " date_output_path=\"s3a://data-engineer-capstone/date.parquet\",\n",
1113 | " input_format = \"com.github.saurfang.sas.spark\", \n",
1114 | " load_size=1000, partitionBy=None, \n",
1115 | " columns_to_save = '*')"
1116 | ]
1117 | },
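{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below sketches the SAS date conversion, the `stay` calculation and the derivation of the DATE dataframe described in the list above. It assumes `immigration` is a raw Spark dataframe of the SAS file (i.e., before these transformations); the real logic lives in `helper/etl.py` and may differ in details."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: SAS dates are stored as the number of days since 1960-01-01\n",
"from pyspark.sql import functions as F\n",
"\n",
"def sas_to_date(col_name):\n",
"    # add the stored day offset to the SAS epoch\n",
"    return F.expr(f\"date_add('1960-01-01', cast({col_name} as int))\")\n",
"\n",
"immigration = (immigration\n",
"    .withColumn('arrdate', sas_to_date('arrdate'))\n",
"    .withColumn('depdate', sas_to_date('depdate'))\n",
"    .withColumn('stay', F.datediff('depdate', 'arrdate')))\n",
"\n",
"# DATE dimension: distinct arrival/departure dates with calendar attributes\n",
"dates = (immigration.select(F.col('arrdate').alias('date'))\n",
"    .union(immigration.select(F.col('depdate').alias('date')))\n",
"    .where(F.col('date').isNotNull())\n",
"    .distinct()\n",
"    .withColumn('day', F.dayofmonth('date'))\n",
"    .withColumn('month', F.month('date'))\n",
"    .withColumn('year', F.year('date'))\n",
"    .withColumn('weekofyear', F.weekofyear('date'))\n",
"    .withColumn('dayofweek', F.dayofweek('date')))"
]
},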
1118 | {
1119 | "cell_type": "markdown",
1120 | "metadata": {},
1121 | "source": [
1122 | "#### Countries dataset\n",
1123 | "The generation of the country dataset starts by loading the data global temperature dataset as well as I94CIT_I94RES lookup table and is completed by generating and the storing of the processed dataframe to a bucket in Amazon S3. In summary, the following tasks are performed throughout the process:\n",
1124 | "* Loading of the csv file of the global temperature and I94CIT_I94RES lookup table;\n",
1125 | "* Aggregation of the temperatures dataset by country and rename new columns;\n",
1126 | "* Join the two datasets;\n",
1127 | "* Save the resulting dataset to the staging area in Amazon S3;"
1128 | ]
1129 | },
1130 | {
1131 | "cell_type": "markdown",
1132 | "metadata": {},
1133 | "source": [
1134 | "
"
1135 | ]
1136 | },
1137 | {
1138 | "cell_type": "code",
1139 | "execution_count": null,
1140 | "metadata": {},
1141 | "outputs": [],
1142 | "source": [
1143 | "# Perform ETL process for the Country table. Generating the Country table and saving it in the S3 bucket indicated in the output_path parameter.\n",
1144 | "countries = etl_countries_data(spark, output_path=e.OUTPUT + \"country.parquet\")"
1145 | ]
1146 | },
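{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of what the country ETL does, assuming `temperatures` holds `GlobalLandTemperaturesByCity` and `country_codes` holds `lookup/I94CIT_I94RES.csv` with hypothetical `code` and `country` columns; the real column names and join condition live in `helper/etl.py`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: aggregate temperatures per country, then attach them to the I94 country codes\n",
"from pyspark.sql import functions as F\n",
"\n",
"country_temps = (temperatures.groupBy('Country').agg(\n",
"    F.avg('AverageTemperature').alias('Temperature'),\n",
"    F.first('Latitude').alias('Latitude'),\n",
"    F.first('Longitude').alias('Longitude')))\n",
"\n",
"country = (country_codes.join(\n",
"    country_temps,\n",
"    F.upper(country_codes['country']) == F.upper(country_temps['Country']),\n",
"    how='left')\n",
"    .select(country_codes['code'], country_codes['country'],\n",
"            'Temperature', 'Latitude', 'Longitude'))"
]
},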
1147 | {
1148 | "cell_type": "markdown",
1149 | "metadata": {},
1150 | "source": [
1151 | "#### States dataset\n",
1152 | "The generation of the states dataset starts by loading the data in demographics dataset as well as I94ADDR lookup table and is completed by generating and the storing of the processed dataframe to a bucket in Amazon S3. In summary, the following tasks are performed throughout the process:\n",
1153 | "* Loading of the csv file of the demographics and I94ADDR lookup table;\n",
1154 | "* Aggregation of the demographics dataset by state and rename new columns;\n",
1155 | "* Join the two datasets;\n",
1156 | "* Save the resulting dataset to the staging area in Amazon S3;"
1157 | ]
1158 | },
1159 | {
1160 | "cell_type": "markdown",
1161 | "metadata": {},
1162 | "source": [
1163 | "
"
1164 | ]
1165 | },
1166 | {
1167 | "cell_type": "code",
1168 | "execution_count": null,
1169 | "metadata": {},
1170 | "outputs": [],
1171 | "source": [
1172 | "# Perform ETL process for the State table. Generating the State table and saving it in the S3 bucket indicated in the output_path parameter.\n",
1173 | "states = etl_states_data(spark, output_path=e.OUTPUT + \"state.parquet\")"
1174 | ]
1175 | },
1176 | {
1177 | "cell_type": "markdown",
1178 | "metadata": {},
1179 | "source": [
1180 | "Once the parquet files are saved in the S3 bucket in the AWS, those are used to load the tables of the same name in the Amazon Redshift. We create the schema by running the SQL script found in `sql/create_tables.sql`. From there, our model is ready to be explored by the customers whether through open query editor in Redshift itself or using a dashboard tool such as Tableau or Power BI."
1181 | ]
1182 | },
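{
"cell_type": "markdown",
"metadata": {},
"source": [
"For illustration, the cell below shows the kind of `COPY` statement the load into Redshift relies on. The table name, bucket and IAM role are placeholders, not necessarily the project's real values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged example of a Redshift COPY from parquet files staged in S3\n",
"copy_template = '''\n",
"    COPY {table}\n",
"    FROM '{s3_path}'\n",
"    IAM_ROLE '{iam_role}'\n",
"    FORMAT AS PARQUET;\n",
"'''\n",
"\n",
"print(copy_template.format(table='immigration',\n",
"                           s3_path='s3://data-engineer-capstone/immigration.parquet',\n",
"                           iam_role='arn:aws:iam::<account-id>:role/<redshift-s3-role>'))"
]
},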
1183 | {
1184 | "cell_type": "markdown",
1185 | "metadata": {},
1186 | "source": [
1187 | "### Step 4: Run Pipelines to Model the Data \n",
1188 | "#### 4.1 Create the data model\n",
1189 | "_Build the data pipelines to create the data model._\n",
1190 | "\n",
1191 | "The whole pipeline can be divided into two stages. The first, where we used spark to load, extracted, transform and store the provided datasets into the AWS S3 staging area. The second stage we take advantage of [Apache Airflow](https://airflow.apache.org/) to build a DAG to extract data from S3 and load them into tables of the same name in Amazon Redshift. As a final step we check the data counting checking to ensure completeness.\n",
1192 | "\n",
1193 | "
"
1194 | ]
1195 | },
1196 | {
1197 | "cell_type": "markdown",
1198 | "metadata": {},
1199 | "source": [
1200 | "Below we show the pipeline of the second stage we developed using Apache Airflow.\n",
1201 | "\n",
1202 | "
"
1203 | ]
1204 | },
1205 | {
1206 | "cell_type": "markdown",
1207 | "metadata": {},
1208 | "source": [
1209 | "The code to build the Airflow pipeline is located in the folder `airflow`. There you will find the code of the DAG itself (file `capstone.py` inside folder `dags`) as well as the two custom operators built for this capstone project in folder `plugins/operators`: `stage_redshift.py` and `data_quality.py`.\n",
1210 | "\n",
1211 | "The custom operator `StageToRedshiftOperator` was designed to load data in [parquet](https://parquet.apache.org/) format from S3 buckets in AWS and insert the content into a table in AWS Redshift. That operator is customizable to work with different buckets and with different tables by input parameters. Then it is used in our DAG to load to Redshift both fact and dimension tables.\n",
1212 | "\n",
1213 | "#### 4.2 Data Quality Checks\n",
1214 | "\n",
1215 | "First, we load the `IMMIGRATION` fact table through the step `Immigration_Fact_Table`, which is followed by the steps to load the dimension tables `STATE`, `DATE` and `COUNTRY`, respectively `State_Dimension_Table`, `Date_Dimension_Table`, `Country_Dimension_Table` steps. All the tables have a PK constraint that uniquely identify the records and in the fact table there are FK that guarantee that values in the fact are present in the dimension tables.\n",
1216 | "\n",
1217 | "After completing the loading process, we perform a data quality check through the step `Data_Quality_Checks` to make sure everything was OK. In this check we verify if every table was actually loaded with count check in all the tables of the model."
1218 | ]
1219 | },
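{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is a minimal sketch of the kind of count check `Data_Quality_Checks` performs. It assumes an Airflow `PostgresHook` pointed at the Redshift cluster and an illustrative connection id; the actual operator code is in `airflow/plugins/operators/data_quality.py`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: fail loudly if any table of the model came out empty\n",
"from airflow.hooks.postgres_hook import PostgresHook\n",
"\n",
"redshift = PostgresHook(postgres_conn_id='redshift')\n",
"for table in ['immigration', 'state', 'country', 'date']:\n",
"    records = redshift.get_records(f'SELECT COUNT(*) FROM \"{table}\"')\n",
"    if not records or not records[0] or records[0][0] < 1:\n",
"        raise ValueError(f'Data quality check failed: table {table} returned no rows')\n",
"    print(f'{table}: {records[0][0]} rows')"
]
},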
1220 | {
1221 | "cell_type": "markdown",
1222 | "metadata": {},
1223 | "source": [
1224 | "#### 4.3 Data dictionary \n",
1225 | "_Create a data dictionary for your data model. For each field, provide a brief description of what the data is and where it came from. You can include the data dictionary in the notebook or in a separate file._\n",
1226 | "\n",
1227 | "\n",
1228 | "__Table Immigration__\n",
1229 | "\n",
1230 | "| Column Name | Description |\n",
1231 | "| :--- | :--- |\n",
1232 | "| CICID | Primary Key |\n",
1233 | "| I94YR | Year |\n",
1234 | "| I94MON | Month |\n",
1235 | "| I94CIT | 3 digit for the country code where the visitor was born. This is a FK to the COUNTRY dimension table |\n",
1236 | "| I94RES | 3 digit for the country code where the visitor resides in. This is a FK to the COUNTRY dimension table |\n",
1237 | "| ARRDATE | Arrival date in the USA. This is a FK to the DATE dimension table |\n",
1238 | "| I94MODE | Mode of transportation (1 = Air; 2 = Sea; 3 = Land; 9 = Not reported) |\n",
1239 | "| I94ADDR | State of arrival. This is a FK to the STATE dimension table |\n",
1240 | "| DEPDATE | Departure date from the USA. This is a FK to the DATE dimension table |\n",
1241 | "| I94BIR | Age of Respondent in Years |\n",
1242 | "| I94VISA | Visa codes collapsed into three categories: (1 = Business; 2 = Pleasure; 3 = Student) |\n",
1243 | "| BIRYEAR | 4 digit year of birth |\n",
1244 | "| GENDER | Gender |\n",
1245 | "| AIRLINE | Airline used to arrive in U.S. |\n",
1246 | "| FLTNO | Flight number of Airline used to arrive in U.S. |\n",
1247 | "| VISATYPE | Class of admission legally admitting the non-immigrant to temporarily stay in U.S. |\n",
1248 | "| STAY | Number of days in the US |\n",
1249 | "\n",
1250 | "\n",
1251 | "__Table STATE__\n",
1252 | "\n",
1253 | "| Column Name | Description |\n",
1254 | "| :--- | :--- |\n",
1255 | "| Code | Primary Key. This is the code of the State as in I94ADDR lookup table |\n",
1256 | "| State | Name of the state |\n",
1257 | "| BlackOrAfricanAmerican | Number of residents of the race Black Or African American |\n",
1258 | "| White | Number of residents of the race White |\n",
1259 | "| ForeignBorn | Number of residents that born outside th United States |\n",
1260 | "| AmericanIndianAndAlaskaNative | Number of residents of the race American Indian And Alaska Native |\n",
1261 | "| HispanicOrLatino | Number of residents of the race Hispanic Or Latino |\n",
1262 | "| Asian | Number of residents of the race Asian |\n",
1263 | "| NumberVeterans | Number of residents that are war veterans |\n",
1264 | "| FemalePopulation | Number of female population |\n",
1265 | "| MalePopulation | Number of male population |\n",
1266 | "| TotalPopulation | Number total of the population |\n",
1267 | "\n",
1268 | "\n",
1269 | "__Table COUNTRY__\n",
1270 | "\n",
1271 | "| Column Name | Description |\n",
1272 | "| :--- | :--- |\n",
1273 | "| Code | Country Code. This is the PK. |\n",
1274 | "| Country | Country Name |\n",
1275 | "| Temperature | Average temperature of the country between 1743 and 2013 |\n",
1276 | "| Latitude | GPS Latitude |\n",
1277 | "| Longitude | GPS Longitude |\n",
1278 | "\n",
1279 | "\n",
1280 | "__Table DATE__\n",
1281 | "\n",
1282 | "| Column Name | Description |\n",
1283 | "| :--- | :--- |\n",
1284 | "| date | Date in the format YYYY-MM-DD. This is the PK. |\n",
1285 | "| day | Two digit day |\n",
1286 | "| month | Two digit month |\n",
1287 | "| year | Four digit for the year |\n",
1288 | "| weekofyear | The week of the year |\n",
1289 | "| dayofweek | The day of the week |"
1290 | ]
1291 | },
1292 | {
1293 | "cell_type": "markdown",
1294 | "metadata": {},
1295 | "source": [
1296 | "#### Step 5: Complete Project Write Up\n",
1297 | "__Clearly state the rationale for the choice of tools and technologies for the project.__\n",
1298 | "\n",
1299 | "The whole solution implemented here is mounted on top of cloud computing technology, AWS in particular. Because the cloud computing provides a low-cost, scalable, and highly reliable infrastructure platform in the cloud this is a natural choice for every new solution like we did here. Every service we use (S3, EMR, Redshift) has reasonable cost and is ‘pay as you go’ pricing. So we can start small and scale as our solution grows. No up-front costs involved.\n",
1300 | "\n",
1301 | "In particular, why we use the following services:\n",
1302 | "\n",
1303 | "__S3:__ Provides a relatively cheap, easy-to-use with scalability, high availability, security, and performance. This seems to be perfect to a staging area like our solution here;\n",
1304 | "\n",
1305 | "__Spark:__ This is simply the best framework for big data processing, with built-in modules for streaming, SQL, machine learning and graph processing. Spark provides an interface for programming entire clusters with implicit data parallelism and fault tolerance. Most of our team are pythonians and Spark has a very convenient API for python programmers to use;\n",
1306 | "\n",
1307 | "__EMR:__ This is a cloud-native big data platform, allowing teams to process vast amounts of data quickly, and cost-effectively at scale using Spark. EMR is easy to use, secure, elastic and low-cost. Perfect to our project;\n",
1308 | "\n",
1309 | "__Redshift:__ A natural and logical choice since we based all the solution in the cloud in AWS. Redshift provides a massively parallel, column-oriented data warehouse that provides easy-scale functionality. The main analytical tools have native interface to load from Redshift.\n",
1310 | "\n",
1311 | "\n",
1312 | "__Propose how often the data should be updated and why__\n",
1313 | "\n",
1314 | "Since we receive one file per month it seems reasonable to update the model monthly.\n",
1315 | "\n",
1316 | "__Write a description of how you would approach the problem differently under the following scenarios:__\n",
1317 | "\n",
1318 | " * The data was increased by 100x:\n",
1319 | "\n",
1320 | "Scaling the whole pipeline should not be a problem at all. Since the whole solution is on top of Amazon cloud, that are easily scalable, the only thing we would need to do is increase the number of nodes of the clusters in EMR to hadle more data. Also, Amazon Redshift is a data warehouse that can expand to exabyte-scale;\n",
1321 | " \n",
1322 | "* The data populates a dashboard that must be updated on a daily basis by 7am every day.\n",
1323 | "\n",
1324 | "The runnig interval of the Airflow DAG could be changed to daily and scheduled to run overnight to make the data available y 7am.\n",
1325 | " \n",
1326 | "* The database needed to be accessed by 100+ people.\n",
1327 | " \n",
1328 | "Again, not a big problem. With Redshift we can make use of the feature \"elastic resize\" that enables us to add or remove nodes in an Amazon Redshift cluster in minutes. This further increases the agility to get better performance and more storage for demanding workloads, and to reduce cost during periods of low demand."
1329 | ]
1330 | },
1331 | {
1332 | "cell_type": "code",
1333 | "execution_count": null,
1334 | "metadata": {},
1335 | "outputs": [],
1336 | "source": []
1337 | }
1338 | ],
1339 | "metadata": {
1340 | "kernelspec": {
1341 | "display_name": "Python 3",
1342 | "language": "python",
1343 | "name": "python3"
1344 | },
1345 | "language_info": {
1346 | "codemirror_mode": {
1347 | "name": "ipython",
1348 | "version": 3
1349 | },
1350 | "file_extension": ".py",
1351 | "mimetype": "text/x-python",
1352 | "name": "python",
1353 | "nbconvert_exporter": "python",
1354 | "pygments_lexer": "ipython3",
1355 | "version": "3.6.3"
1356 | }
1357 | },
1358 | "nbformat": 4,
1359 | "nbformat_minor": 4
1360 | }
1361 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Engineering Capstone Project
2 |
3 | ## Overview
4 |
5 | The purpose of the data engineering capstone project is to give you a chance to combine what you've learned throughout the program. This project will be an important part of your portfolio that will help you achieve your data engineering-related career goals.
6 |
7 | In this project, you can choose to complete the project provided for you, or define the scope and data for a project of your own design. Either way, you'll be expected to go through the same steps outlined below.
8 |
9 | ## Udacity Provided Project
10 |
11 | In the Udacity provided project, you'll work with four datasets to complete the project. The main dataset will include data on immigration to the United States, and supplementary datasets will include data on airport codes, U.S. city demographics, and temperature data. You're also welcome to enrich the project with additional data if you'd like to set your project apart.
12 |
13 | ## Open-Ended Project
14 |
15 | If you decide to design your own project, you can find useful information in the Project Resources section. Rather than go through steps below with the data Udacity provides, you'll gather your own data, and go through the same process.
16 |
17 | ## Instructions
18 |
19 | To help guide your project, we've broken it down into a series of steps.
20 |
21 | ### Step 1: Scope the Project and Gather Data
22 |
23 | Since the scope of the project will be highly dependent on the data, these two things happen simultaneously. In this step, you’ll:
24 |
25 | * Identify and gather the data you'll be using for your project (at least two sources and more than 1 million rows). See Project Resources for ideas of what data you can use.
26 | * Explain what end use cases you'd like to prepare the data for (e.g., analytics table, app back-end, source-of-truth database, etc.)
27 |
28 | ### Step 2: Explore and Assess the Data
29 |
30 | * Explore the data to identify data quality issues, like missing values, duplicate data, etc.
31 | * Document steps necessary to clean the data
32 |
33 |
34 | ### Step 3: Define the Data Model
35 |
36 | * Map out the conceptual data model and explain why you chose that model
37 | * List the steps necessary to pipeline the data into the chosen data model
38 |
39 |
40 | ### Step 4: Run ETL to Model the Data
41 |
42 | * Create the data pipelines and the data model
43 | * Include a data dictionary
44 | * Run data quality checks to ensure the pipeline ran as expected
45 | * Integrity constraints on the relational database (e.g., unique key, data type, etc.)
46 | * Unit tests for the scripts to ensure they are doing the right thing
47 | * Source/count checks to ensure completeness
48 |
49 | ### Step 5: Complete Project Write Up
50 |
51 | * What's the goal? What queries will you want to run? How would Spark or Airflow be incorporated? Why did you choose the model you chose?
52 | * Clearly state the rationale for the choice of tools and technologies for the project.
53 | * Document the steps of the process.
54 | * Propose how often the data should be updated and why.
55 | * Post your write-up and final data model in a GitHub repo.
56 | * Include a description of how you would approach the problem differently under the following scenarios:
57 | * If the data was increased by 100x.
58 | * If the pipelines were run on a daily basis by 7am.
59 | * If the database needed to be accessed by 100+ people.
60 |
61 | ### Rubric
62 | In the [Project Rubric](https://review.udacity.com/#!/rubrics/2497/view), you'll see more detail about the requirements. Use the rubric to assess your own project before you submit to Udacity for review. As with other projects, Udacity reviewers will use this rubric to assess your project and provide feedback. If your project does not meet specifications, you can make changes and resubmit.
63 |
--------------------------------------------------------------------------------
/airflow/dags/capstone.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | import os
3 | from airflow import DAG
4 | from airflow.operators.dummy_operator import DummyOperator
5 | from airflow.operators import (StageToRedshiftOperator, DataQualityOperator)
6 |
7 |
8 | default_args = {
9 | 'depends_on_past': False, # The DAG does not have dependencies on past runs
10 | 'owner': 'Fernando Carneiro',
11 |     'retries': 3, # On failure, tasks are retried 3 times
12 | 'retry_delay': timedelta(minutes=60), # Retries happen every 60 minutes
13 | 'start_date': datetime(2016, 1, 1),
14 | 'email_on_retry': False, # Do not email on retry
15 | }
16 |
17 | dag = DAG('data_engineering_project',
18 | default_args=default_args,
19 | description='Load and transform data in Redshift with Airflow',
20 | schedule_interval='@monthly'
21 | )
22 |
23 | start_operator = DummyOperator(task_id='Begin', dag=dag)
24 |
25 | immigration_to_redshift = StageToRedshiftOperator(
26 | task_id='Immigration_Fact_Table',
27 | aws_conn_id = 'aws_credentials',
28 | redshift_conn_id = "redshift",
29 | s3_from = 'data-engineer-capstone',
30 | s3_prefix = 'immigration.parquet',
31 | schema_to = 'public',
32 | table_to = 'immigration',
33 | options = ["FORMAT AS PARQUET"],
34 | dag=dag
35 | )
36 |
37 | country_to_redshift = StageToRedshiftOperator(
38 | task_id='Country_Dimension_Table',
39 | aws_conn_id = 'aws_credentials',
40 | redshift_conn_id = "redshift",
41 | s3_from = 'data-engineer-capstone',
42 | s3_prefix = 'country.parquet',
43 | schema_to = 'public',
44 | table_to = 'country',
45 | options = ["FORMAT AS PARQUET"],
46 | dag=dag
47 | )
48 |
49 | state_to_redshift = StageToRedshiftOperator(
50 | task_id='State_Dimension_Table',
51 | aws_conn_id = 'aws_credentials',
52 | redshift_conn_id = "redshift",
53 | s3_from = 'data-engineer-capstone',
54 | s3_prefix = 'state.parquet',
55 | schema_to = 'public',
56 | table_to = 'state',
57 | options = ["FORMAT AS PARQUET"],
58 | dag=dag
59 | )
60 |
61 | date_to_redshift = StageToRedshiftOperator(
62 | task_id='Date_Dimension_Table',
63 | aws_conn_id = 'aws_credentials',
64 | redshift_conn_id = "redshift",
65 | s3_from = 'data-engineer-capstone',
66 | s3_prefix = 'date.parquet',
67 | schema_to = 'public',
68 | table_to = 'date',
69 | options = ["FORMAT AS PARQUET"],
70 | dag=dag
71 | )
72 |
73 | run_quality_checks = DataQualityOperator(
74 | task_id='Data_Quality_Checks',
75 | redshift_conn_id = "redshift",
76 | tables=['immigration', 'country', 'state', 'date'],
77 | dag=dag
78 | )
79 |
80 | end_operator = DummyOperator(task_id='End', dag=dag)
81 |
82 | start_operator >> immigration_to_redshift
83 | immigration_to_redshift >> country_to_redshift
84 | immigration_to_redshift >> state_to_redshift
85 | immigration_to_redshift >> date_to_redshift
86 | country_to_redshift >> run_quality_checks
87 | state_to_redshift >> run_quality_checks
88 | date_to_redshift >> run_quality_checks
89 | run_quality_checks >> end_operator
90 |
--------------------------------------------------------------------------------
/airflow/plugins/operators/data_quality.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 |
5 | class DataQualityOperator(BaseOperator):
6 |
7 | ui_color = '#89DA59'
8 |
9 | @apply_defaults
10 | def __init__(self,
11 | # Define your operators params (with defaults) here
12 | redshift_conn_id,
13 | tables,
14 | *args, **kwargs):
15 |
16 | super(DataQualityOperator, self).__init__(*args, **kwargs)
17 | self.redshift_conn_id = redshift_conn_id
18 | self.tables = tables
19 |
20 | def execute(self, context):
21 | self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
22 | for table in self.tables:
23 | self.log.info(f"Checking table {table}")
24 | records = self.hook.get_records(f"SELECT COUNT(*) FROM {table}")
25 | if len(records) < 1 or len(records[0]) < 1:
26 | raise ValueError(f"Data quality check failed. {table} returned no results")
27 | num_records = records[0][0]
28 | if num_records < 1:
29 | raise ValueError(f"Data quality check failed. {table} contained 0 rows")
30 |             self.log.info(f"Data quality check on table {table} passed with {num_records} records")
--------------------------------------------------------------------------------
/airflow/plugins/operators/stage_redshift.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator, Variable
3 | from airflow.utils.decorators import apply_defaults
4 | from airflow.hooks.S3_hook import S3Hook
5 | from airflow.contrib.hooks.aws_hook import AwsHook
6 |
7 | import datetime
8 | import logging
9 |
10 | class StageToRedshiftOperator(BaseOperator):
11 | ui_color = '#358140'
12 |
13 | @apply_defaults
14 | def __init__(self,
15 | # Define your operators params (with defaults) here
16 | aws_conn_id,
17 | redshift_conn_id,
18 | s3_from,
19 | s3_prefix,
20 | schema_to,
21 | table_to,
22 | options,
23 | *args, **kwargs):
24 |
25 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs)
26 | self.aws_conn_id = aws_conn_id
27 | self.redshift_conn_id = redshift_conn_id
28 | self.s3_from = s3_from
29 | self.s3_prefix = s3_prefix
30 | self.schema = schema_to
31 | self.table = table_to
32 | self.options = options
33 | self.autocommit = True
34 | self.region = 'us-west-2'
35 |
36 | def execute(self, context):
37 | self.log.info('Initializing COPY procedure...')
38 | aws_hook = AwsHook(self.aws_conn_id)
39 | credentials = aws_hook.get_credentials()
40 | self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
41 | copy_options = '\n\t\t\t'.join(self.options)
42 |
43 | copy_query = """
44 | COPY {schema}.{table}
45 | FROM 's3://{s3_bucket}/{s3_key}'
46 | IAM_ROLE 'arn:aws:iam::900646315604:role/myRedshiftRole'
47 | {copy_options};
48 | """.format(schema=self.schema,
49 | table=self.table,
50 | s3_bucket=self.s3_from,
51 | s3_key=self.s3_prefix,
52 | copy_options=copy_options)
53 |
54 | self.log.info(f'Executing COPY command from bucket s3://{self.s3_from}/{self.s3_prefix} to {self.schema}.{self.table} in Redshift')
55 | self.hook.run(copy_query, self.autocommit)
56 | self.log.info("COPY command complete!")
--------------------------------------------------------------------------------
/helper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/helper/__init__.py
--------------------------------------------------------------------------------
/helper/etl.py:
--------------------------------------------------------------------------------
1 | import os, re
2 | import configparser
3 | from datetime import timedelta, datetime
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql.functions import udf, col, when, lower, isnull, year, month, dayofmonth, hour, weekofyear, dayofweek, to_date
6 | from pyspark.sql.types import StructField, StructType, IntegerType, DoubleType
7 |
8 | # The date format string (Python strftime/strptime pattern) used for dates in this module: YYYY-MM-DD
9 | date_format = "%Y-%m-%d"
10 |
11 | # The AWS key id and password are configured in a configuration file "dl.cfg"
12 | config = configparser.ConfigParser()
13 | config.read('dl.cfg')
14 |
15 | # Read the AWS access key information and save it in environment variables
16 | os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
17 | os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']
18 | OUTPUT = config['ETL']['OUTPUT_DATA']
19 |
20 | def create_spark_session():
21 | """
22 | This function creates a session with Spark, the entry point to programming Spark with the Dataset and DataFrame API.
23 | """
24 | spark = SparkSession.builder.config("spark.jars.packages",
25 | "saurfang:spark-sas7bdat:2.0.0-s_2.11,org.apache.hadoop:hadoop-aws:2.7.0")\
26 | .enableHiveSupport().getOrCreate()
27 | return spark
28 |
29 | def read_data(spark, input_path, input_format = "csv", columns = '*', debug_size = None, **options):
30 | """
31 | Loads data from a data source using the pyspark module and returns it as a spark 'DataFrame'.
32 |
33 | Args:
34 | spark (:obj:`SparkSession`): Spark session.
35 | Represents the entry point to programming Spark with the Dataset and DataFrame API.
36 | input_path (:obj:`str`): Directory where to find the input files.
37 | input_format (:obj:`str`): Optional string for format of the data source. Default to 'csv'.
38 | columns (:obj:`list`): List of columns of the dataframe to return. Default to "*", which means 'all columns'.
39 | debug_size (int): Define the number of rows to read for debug purposes. The default value None means 'all rows'.
40 | options: All other string options.
41 | """
42 | if debug_size is None:
43 | df = spark.read.load(input_path, format=input_format, **options).select(columns)
44 | else:
45 | df = spark.read.load(input_path, format=input_format, **options).select(columns).limit(debug_size)
46 | return df
47 |
48 | def save(df, output_path, mode = "overwrite", output_format = "parquet", columns = '*', partitionBy=None, **options):
49 | """
50 | Saves the contents of the DataFrame to a data source.
51 |
52 | The data source is specified by the format and a set of options. If format is not specified, 'parquet' will be used.
53 |
54 | Args:
55 | df (:obj:`DataFrame`): Spark DataFrame.
56 |         output_path (:obj:`str`): The path in a Hadoop supported file system where the DataFrame contents will be saved.
57 | mode (:obj:`str`): Specifies the behavior of the save operation when data already exists. Default to 'overwrite'.
58 | output_format (:obj:`str`): Optional string for format of the data source to be saved. Default to 'parquet'.
59 | columns (:obj:`list`): List of columns of the dataframe to save. Default to "*", which means 'all columns'.
60 | partitionBy (:obj:`list`): Names of partitioning columns. The default value None means 'no partitions'.
61 | options: All other string options.
62 | """
63 |
64 | df.select(columns).write.save(output_path, mode= mode, format=output_format, partitionBy = partitionBy, **options)
65 |
66 | def etl_immigration_data(spark, input_path="immigration_data_sample.csv", output_path="out/immigration.parquet",
67 | date_output_path="out/date.parquet",
68 | input_format = "csv", columns = ['i94addr', 'i94mon','cicid','i94visa','i94res','arrdate','i94yr','depdate',
69 | 'airline', 'fltno', 'i94mode', 'i94port', 'visatype', 'gender',
70 | 'i94cit', 'i94bir'],
71 | load_size = None, partitionBy = ["i94yr", "i94mon"], columns_to_save='*', header=True, **options):
72 | """
73 |     Reads the immigration dataset indicated in input_path, performs the ETL process and saves the result in the path
74 |     indicated by the output_path parameter.
75 |
76 | Args:
77 | spark (:obj:`SparkSession`): Spark session.
78 | Represents the entry point to programming Spark with the Dataset and DataFrame API.
79 | input_path (:obj:`str`): Directory where to find the input files.
80 | output_path (:obj:`str`): Directory where to save immigration output files.
81 | date_output_path (:obj:`str`): Directory where to save date output files.
82 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value).
83 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful.
84 | load_size (int): Number of rows to read for debug purposes.
85 | partitionBy (:obj:`list`): Files will be saved in partitions using the columns of this list.
86 | columns_to_save (:obj:`list`): Define what columns will be saved.
87 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false.
88 | options: All other string options.
89 | """
90 |
91 | # Loads the immigration dataframe using Spark
92 | # We discard the columns ['admnum', 'biryear', 'count', 'dtaddto', 'dtadfile', 'entdepa', 'entdepd', 'entdepu', 'insnum', 'matflag', 'occup', 'visapost'] as they seemed not to be very useful for our goals.
93 | # Some of them were very unclear of what they really represent.
94 | immigration = read_data(spark, input_path=input_path, input_format=input_format,
95 | columns=columns, debug_size = load_size, header=header, **options)
96 |
97 | int_cols = ['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res',
98 | 'arrdate', 'i94mode', 'i94bir', 'i94visa', 'count', 'biryear', 'dtadfile', 'depdate']
99 |
100 | date_cols = ['arrdate', 'depdate']
101 |
102 | high_null = ["visapost", "occup", "entdepu", "insnum"]
103 | not_useful_cols = ["count", "entdepa", "entdepd", "matflag", "dtaddto", "biryear", "admnum"]
104 |
105 | # Convert columns read as string/double to integer
106 | immigration = cast_type(immigration, dict(zip(int_cols, len(int_cols)*[IntegerType()])))
107 |
108 | # Convert SAS date to a meaningful string date in the format of YYYY-MM-DD
109 | immigration = convert_sas_date(immigration, date_cols)
110 |
111 | # Drop high null columns and not useful columns
112 | immigration = immigration.drop(*high_null)
113 | immigration = immigration.drop(*not_useful_cols)
114 |
115 |     # Create a new column to store the length of the visitor's stay in the US
116 | immigration = immigration.withColumn('stay', date_diff_udf(immigration.arrdate, immigration.depdate))
117 | immigration = cast_type(immigration, {'stay': IntegerType()})
118 |
119 | # Generate DATE dataframe and save it to the output_path indicated as parameter of the function
120 | if date_output_path is not None:
121 | arrdate = immigration.select('arrdate').distinct()
122 | depdate = immigration.select('depdate').distinct()
123 | dates = arrdate.union(depdate)
124 |         dates = dates.withColumn("date", to_date(dates.arrdate, "yyyy-MM-dd"))  # Spark expects a Java-style date pattern here, not the Python strftime format
125 | dates = dates.withColumn("year", year(dates.date))
126 | dates = dates.withColumn("month", month(dates.date))
127 | dates = dates.withColumn("day", dayofmonth(dates.date))
128 | dates = dates.withColumn("weekofyear", weekofyear(dates.date))
129 | dates = dates.withColumn("dayofweek", dayofweek(dates.date))
130 | dates = dates.drop("date").withColumnRenamed('arrdate', 'date')
131 | save(df=dates.select("date", "year", "month", "day", "weekofyear", "dayofweek"), output_path=date_output_path)
132 |
133 | # Save the processed immigration dataset to the output_path
134 | if output_path is not None:
135 | save(df=immigration.select(columns_to_save), output_path=output_path, partitionBy = partitionBy)
136 | return immigration
137 |
138 | def etl_temperature_data(spark, input_path="../../data2/GlobalLandTemperaturesByCity.csv", output_path="out/temperature.parquet",
139 | input_format = "csv", columns = '*', load_size = None, partitionBy = ["Country", "City"], header=True, **options):
140 | """
141 |     Reads the global temperature dataset indicated in input_path, performs the ETL process and saves it in the path indicated by the output_path parameter.
142 |
143 | Args:
144 | spark (:obj:`SparkSession`): Spark session.
145 | Represents the entry point to programming Spark with the Dataset and DataFrame API.
146 | input_path (:obj:`str`): Directory where to find the input files.
147 | output_path (:obj:`str`): Directory where to save immigration output files.
148 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value).
149 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful.
150 | load_size (int): Number of rows to read for debug purposes.
151 | partitionBy (:obj:`list`): Files will be saved in partitions using the columns of this list.
152 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false.
153 | options: All other string options.
154 | """
155 | # Loads the global temperature dataframe using Spark
156 | temperature = read_data(spark, input_path=input_path, input_format=input_format,
157 | columns=columns, debug_size = load_size, header=header, **options)
158 | # Save the temperature dataset to the output_path
159 | save(df=temperature, output_path=output_path, partitionBy = partitionBy)
160 | return temperature
161 |
162 | def etl_airport_data(spark, input_path="airport-codes_csv.csv", output_path="out/airport.parquet",
163 | input_format = "csv", columns = '*', load_size = None, partitionBy = ["iso_country"], header=True, **options):
164 | """
165 |     Reads the airport dataset indicated in input_path, performs the ETL process and saves it in the path indicated by the output_path parameter.
166 |
167 | Args:
168 | spark (:obj:`SparkSession`): Spark session.
169 | Represents the entry point to programming Spark with the Dataset and DataFrame API.
170 | input_path (:obj:`str`): Directory where to find the input files.
171 | output_path (:obj:`str`): Directory where to save immigration output files.
172 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value).
173 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful.
174 | load_size (int): Number of rows to read for debug purposes.
175 | partitionBy (:obj:`list`): Files will be saved in partitions using the columns of this list.
176 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false.
177 | options: All other string options.
178 | """
179 | # Loads the airport dataframe using Spark
180 | airport = read_data(spark, input_path=input_path, input_format=input_format,
181 | columns=columns, debug_size = load_size, header=header, **options)
182 | # Save the airport dataset to the output_path
183 | save(df=airport, output_path=output_path, partitionBy = partitionBy)
184 | return airport
185 |
186 | def etl_demographics_data(spark, input_path="us-cities-demographics.csv", output_path="out/demographics.parquet",
187 | input_format = "csv", columns='*',
188 | load_size = None, partitionBy = ["State Code"], header=True, sep=";", **options):
189 | """
190 |     Reads the demographics dataset indicated in input_path, performs the ETL process and saves it in the path indicated by the output_path parameter.
191 |
192 | Args:
193 | spark (:obj:`SparkSession`): Spark session.
194 | Represents the entry point to programming Spark with the Dataset and DataFrame API.
195 | input_path (:obj:`str`): Directory where to find the input files.
196 | output_path (:obj:`str`): Directory where to save immigration output files.
197 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value).
198 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful.
199 | load_size (int): Number of rows to read for debug purposes.
200 | partitionBy (:obj:`list`): Files will be saved in partitions using the columns of this list.
201 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false.
202 | options: All other string options.
203 | """
204 | # Loads the demographics dataframe using Spark
205 | demographics = read_data(spark, input_path=input_path, input_format=input_format,
206 | columns=columns, debug_size = load_size, header=header, sep=sep, **options)
207 |
208 | # Convert numeric columns to the proper types: Integer and Double
209 | int_cols = ['Count', 'Male Population', 'Female Population', 'Total Population', 'Number of Veterans', 'Foreign-born']
210 | float_cols = ['Median Age', 'Average Household Size']
211 | demographics = cast_type(demographics, dict(zip(int_cols, len(int_cols)*[IntegerType()])))
212 | demographics = cast_type(demographics, dict(zip(float_cols, len(float_cols)*[DoubleType()])))
213 |
214 | first_agg = {"Median Age": "first", "Male Population": "first", "Female Population": "first",
215 | "Total Population": "first", "Number of Veterans": "first", "Foreign-born": "first", "Average Household Size": "first"}
216 | # First aggregation - City
217 | agg_df = demographics.groupby(["City", "State", "State Code"]).agg(first_agg)
218 | # Pivot Table to transform values of the column Race to different columns
219 | piv_df = demographics.groupBy(["City", "State", "State Code"]).pivot("Race").sum("Count")
220 |
221 | # Rename column names removing the spaces to avoid problems when saving to disk (we got errors when trying to save column names with spaces)
222 | demographics = agg_df.join(other=piv_df, on=["City", "State", "State Code"], how="inner")\
223 | .withColumnRenamed('first(Total Population)', 'TotalPopulation')\
224 | .withColumnRenamed('first(Female Population)', 'FemalePopulation')\
225 | .withColumnRenamed('first(Male Population)', 'MalePopulation')\
226 | .withColumnRenamed('first(Median Age)', 'MedianAge')\
227 | .withColumnRenamed('first(Number of Veterans)', 'NumberVeterans')\
228 | .withColumnRenamed('first(Foreign-born)', 'ForeignBorn')\
229 | .withColumnRenamed('first(Average Household Size)', 'AverageHouseholdSize')\
230 | .withColumnRenamed('Hispanic or Latino', 'HispanicOrLatino')\
231 | .withColumnRenamed('Black or African-American', 'BlackOrAfricanAmerican')\
232 | .withColumnRenamed('American Indian and Alaska Native', 'AmericanIndianAndAlaskaNative')
233 |
234 | numeric_cols = ['TotalPopulation', 'FemalePopulation', 'MedianAge', 'NumberVeterans', 'ForeignBorn', 'MalePopulation', 'AverageHouseholdSize',
235 | 'AmericanIndianAndAlaskaNative', 'Asian', 'BlackOrAfricanAmerican', 'HispanicOrLatino', 'White']
236 | # Fill the null values with 0
237 | demographics = demographics.fillna(0, numeric_cols)
238 |
239 | # Save the demographics dataset to the output_path
240 | if output_path is not None:
241 | save(df=demographics, output_path=output_path, partitionBy = partitionBy)
242 |
243 | return demographics
244 |
245 | def etl_states_data(spark, output_path="out/state.parquet"):
246 |     """
247 |     Builds the state dimension table from the demographics dataset and the I94ADDR lookup table, performs the ETL process and saves it in the path indicated by the output_path parameter.
248 | 
249 |     Args:
250 |         spark (:obj:`SparkSession`): Spark session.
251 |             Represents the entry point to programming Spark with the Dataset and DataFrame API.
252 |         output_path (:obj:`str`): Directory where to save the state output files.
253 |     """
254 |     cols = ['TotalPopulation', 'FemalePopulation', 'MalePopulation', 'NumberVeterans', 'ForeignBorn',
255 |             'AmericanIndianAndAlaskaNative', 'Asian', 'BlackOrAfricanAmerican', 'HispanicOrLatino', 'White']
256 | # Loads the demographics dataframe using Spark
257 | demographics = etl_demographics_data(spark, output_path=None)
258 | # Aggregates the dataset by State
259 | states = demographics.groupby(["State Code", "State"]).agg(dict(zip(cols, len(cols)*["sum"])))
260 | # Loads the lookup table I94ADDR
261 | addr = read_data(spark, input_path="lookup/I94ADDR.csv", input_format="csv", columns="*", header=True)\
262 | .withColumnRenamed('State', 'State Original')
263 |
264 | # Join the two datasets
265 | addr = addr.join(states, states["State Code"] == addr.Code, "left")
266 | addr = addr.withColumn("State", when(isnull(addr["State"]), capitalize_udf(addr['State Original'])).otherwise(addr["State"]))
267 | addr = addr.drop('State Original', 'State Code')
268 |
269 | cols = ['sum(BlackOrAfricanAmerican)', 'sum(White)', 'sum(AmericanIndianAndAlaskaNative)',
270 | 'sum(HispanicOrLatino)', 'sum(Asian)', 'sum(NumberVeterans)', 'sum(ForeignBorn)', 'sum(FemalePopulation)',
271 | 'sum(MalePopulation)', 'sum(TotalPopulation)']
272 |
273 | # Rename the columns to modify default names returned when Spark aggregates the values of the columns.
274 | # For example: column 'sum(MalePopulation)' becomes 'MalePopulation'
275 | mapping = dict(zip(cols, [re.search(r'\((.*?)\)', c).group(1) for c in cols]))
276 | addr = rename_columns(addr, mapping)
277 |
278 | # Save the resulting dataset to the output_path
279 | if output_path is not None:
280 | save(df=addr, output_path=output_path)
281 | return addr
282 |
283 | def etl_countries_data(spark, input_path="../../data2/GlobalLandTemperaturesByCity.csv", output_path="out/country.parquet",
284 | input_format = "csv", columns = '*', load_size = None, header=True, **options):
285 | """
286 |     Reads the global temperatures dataset indicated in input_path and transforms it to generate the country dataframe. Performs the ETL process and saves it in the path indicated by the output_path parameter.
287 |
288 | Args:
289 | spark (:obj:`SparkSession`): Spark session.
290 | Represents the entry point to programming Spark with the Dataset and DataFrame API.
291 | input_path (:obj:`str`): Directory where to find the input files.
292 | output_path (:obj:`str`): Directory where to save immigration output files.
293 | input_format (:obj:`str`): Type of the input files. Default to "csv" (comma-separated value).
294 | columns (:obj:`list`): List of the columns names to read in. Useful when only some columns are useful.
295 | load_size (int): Number of rows to read for debug purposes.
296 | header: (bool): Uses the first line as names of columns. If None is set, it uses the default value, false.
297 | options: All other string options.
298 | """
299 | # Loads the demographics dataframe using Spark
300 | countries = read_data(spark, input_path=input_path, input_format=input_format,
301 | columns=columns, debug_size = load_size, header=header, **options)
302 |     # Aggregate the dataset by Country and rename the resulting columns
303 | countries = countries.groupby(["Country"]).agg({"AverageTemperature": "avg", "Latitude": "first", "Longitude": "first"})\
304 | .withColumnRenamed('avg(AverageTemperature)', 'Temperature')\
305 | .withColumnRenamed('first(Latitude)', 'Latitude')\
306 | .withColumnRenamed('first(Longitude)', 'Longitude')
307 |
308 | # Rename specific country names to match the I94CIT_I94RES lookup table when joining them
309 | change_countries = [("Country", "Congo (Democratic Republic Of The)", "Congo"), ("Country", "Côte D'Ivoire", "Ivory Coast")]
310 | countries = change_field_value_condition(countries, change_countries)
311 | countries = countries.withColumn('Country_Lower', lower(countries.Country))
312 |
313 |     # Rename specific country names in the I94CIT_I94RES lookup table so they match the countries derived from the temperature dataset when joining them
314 | change_res = [("I94CTRY", "BOSNIA-HERZEGOVINA", "BOSNIA AND HERZEGOVINA"),
315 | ("I94CTRY", "INVALID: CANADA", "CANADA"),
316 | ("I94CTRY", "CHINA, PRC", "CHINA"),
317 | ("I94CTRY", "GUINEA-BISSAU", "GUINEA BISSAU"),
318 | ("I94CTRY", "INVALID: PUERTO RICO", "PUERTO RICO"),
319 | ("I94CTRY", "INVALID: UNITED STATES", "UNITED STATES")]
320 |
321 | # Loads the lookup table I94CIT_I94RES
322 | res = read_data(spark, input_path="lookup/I94CIT_I94RES.csv", input_format=input_format, columns="*",
323 | debug_size = load_size, header=header, **options)
324 | res = cast_type(res, {"Code": IntegerType()})
325 | res = change_field_value_condition(res, change_res)
326 | res = res.withColumn('Country_Lower', lower(res.I94CTRY))
327 |     # Join the two datasets to create the country dimension table
328 | res = res.join(countries, res.Country_Lower == countries.Country_Lower, how="left")
329 | res = res.withColumn("Country", when(isnull(res["Country"]), capitalize_udf(res.I94CTRY)).otherwise(res["Country"]))
330 | res = res.drop("I94CTRY", "Country_Lower")
331 |
332 | # Save the resulting dataset to the output_path
333 | if output_path is not None:
334 | save(df=res, output_path=output_path)
335 | return res
336 |
337 | def cast_type(df, cols):
338 | """
339 | Convert the types of the columns according to the configuration supplied in the cols dictionary in the format {"column_name": type}
340 |
341 | Args:
342 | df (:obj:`SparkDataFrame`): Spark dataframe to be processed.
343 | Represents the entry point to programming Spark with the Dataset and DataFrame API.
344 | cols (:obj:`dict`): Dictionary in the format of {"column_name": type} indicating what columns and types they should be converted to
345 | """
346 | for k,v in cols.items():
347 | if k in df.columns:
348 | df = df.withColumn(k, df[k].cast(v))
349 | return df
350 |
351 | def convert_sas_date(df, cols):
352 | """
353 | Convert dates in the SAS datatype to a date in a string format YYYY-MM-DD
354 |
355 | Args:
356 | df (:obj:`SparkDataFrame`): Spark dataframe to be processed.
357 | Represents the entry point to programming Spark with the Dataset and DataFrame API.
358 |         cols (:obj:`list`): List of columns in the SAS date format to be converted
359 | """
360 | for c in [c for c in cols if c in df.columns]:
361 | df = df.withColumn(c, convert_sas_udf(df[c]))
362 | return df
363 |
364 | def change_field_value_condition(df, change_list):
365 | '''
366 | Helper function used to rename column values based on condition.
367 |
368 | Args:
369 | df (:obj:`SparkDataFrame`): Spark dataframe to be processed.
370 | change_list (:obj: `list`): List of tuples in the format (field, old value, new value)
371 | '''
372 | for field, old, new in change_list:
373 | df = df.withColumn(field, when(df[field] == old, new).otherwise(df[field]))
374 | return df
375 |
376 | def rename_columns(df, mapping):
377 | '''
378 |     Rename the columns of the dataset based on the mapping dictionary
379 |
380 | Args:
381 | df (:obj:`SparkDataFrame`): Spark dataframe to be processed.
382 | mapping (:obj: `dict`): Mapping dictionary in the format {old_name: new_name}
383 | '''
384 | df = df.select([col(c).alias(mapping.get(c, c)) for c in df.columns])
385 | return df
386 |
387 | def date_diff(date1, date2):
388 | '''
389 | Calculates the difference in days between two dates
390 | '''
391 | if date2 is None:
392 | return None
393 | else:
394 | a = datetime.strptime(date1, date_format)
395 | b = datetime.strptime(date2, date_format)
396 | delta = b - a
397 | return delta.days
398 |
399 | # User defined functions using Spark udf wrapper function to convert SAS dates into string dates in the format YYYY-MM-DD, to capitalize the first letters of the string and to calculate the difference between two dates in days.
400 | convert_sas_udf = udf(lambda x: x if x is None else (timedelta(days=x) + datetime(1960, 1, 1)).strftime(date_format))
401 | capitalize_udf = udf(lambda x: x if x is None else x.title())
402 | date_diff_udf = udf(date_diff)
403 |
404 | if __name__ == "__main__" :
405 | spark = create_spark_session()
406 | # Perform ETL process for the Immigration dataset generating immigration and date tables and save them in the S3 bucket indicated in the output_path parameters.
407 | immigration = etl_immigration_data(spark, input_path='../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat',
408 | output_path="s3a://data-engineer-capstone/immigration.parquet",
409 | date_output_path="s3a://data-engineer-capstone/date.parquet",
410 | input_format = "com.github.saurfang.sas.spark",
411 | load_size=1000, partitionBy=None,
412 | columns_to_save = '*')
413 | # Perform ETL process for the Country table. Generating the Country table and saving it in the S3 bucket indicated in the output_path parameter.
414 |     countries = etl_countries_data(spark, output_path=OUTPUT + "country.parquet")
415 |     # Perform ETL process for the State table. Generating the State table and saving it in the S3 bucket indicated in the output_path parameter.
416 |     states = etl_states_data(spark, output_path=OUTPUT + "state.parquet")
--------------------------------------------------------------------------------
/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/architecture.png
--------------------------------------------------------------------------------
/images/dag.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/dag.PNG
--------------------------------------------------------------------------------
/images/etl_country.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/etl_country.png
--------------------------------------------------------------------------------
/images/etl_immigration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/etl_immigration.png
--------------------------------------------------------------------------------
/images/etl_state.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/etl_state.png
--------------------------------------------------------------------------------
/images/i94cit.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/i94cit.PNG
--------------------------------------------------------------------------------
/images/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/pipeline.png
--------------------------------------------------------------------------------
/images/star-schema.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpcarneiro/data-engineer-project/c79baba2ccb53495c7c9b0d9e68dbf452dba1dfe/images/star-schema.PNG
--------------------------------------------------------------------------------
/lookup/I94ADDR.csv:
--------------------------------------------------------------------------------
1 | Code,State
2 | AL,ALABAMA
3 | AK,ALASKA
4 | AZ,ARIZONA
5 | AR,ARKANSAS
6 | CA,CALIFORNIA
7 | CO,COLORADO
8 | CT,CONNECTICUT
9 | DE,DELAWARE
10 | DC,DIST. OF COLUMBIA
11 | FL,FLORIDA
12 | GA,GEORGIA
13 | GU,GUAM
14 | HI,HAWAII
15 | ID,IDAHO
16 | IL,ILLINOIS
17 | IN,INDIANA
18 | IA,IOWA
19 | KS,KANSAS
20 | KY,KENTUCKY
21 | LA,LOUISIANA
22 | ME,MAINE
23 | MD,MARYLAND
24 | MA,MASSACHUSETTS
25 | MI,MICHIGAN
26 | MN,MINNESOTA
27 | MS,MISSISSIPPI
28 | MO,MISSOURI
29 | MT,MONTANA
30 | NC,N. CAROLINA
31 | ND,N. DAKOTA
32 | NE,NEBRASKA
33 | NV,NEVADA
34 | NH,NEW HAMPSHIRE
35 | NJ,NEW JERSEY
36 | NM,NEW MEXICO
37 | NY,NEW YORK
38 | OH,OHIO
39 | OK,OKLAHOMA
40 | OR,OREGON
41 | PA,PENNSYLVANIA
42 | PR,PUERTO RICO
43 | RI,RHODE ISLAND
44 | SC,S. CAROLINA
45 | SD,S. DAKOTA
46 | TN,TENNESSEE
47 | TX,TEXAS
48 | UT,UTAH
49 | VT,VERMONT
50 | VI,VIRGIN ISLANDS
51 | VA,VIRGINIA
52 | WV,W. VIRGINIA
53 | WA,WASHINGTON
54 | WI,WISCONSON
55 | WY,WYOMING
56 | 99,All Other Codes
--------------------------------------------------------------------------------
/lookup/I94CIT_I94RES.csv:
--------------------------------------------------------------------------------
1 | Code,I94CTRY
2 | 582,MEXICO
3 | 236,AFGHANISTAN
4 | 101,ALBANIA
5 | 316,ALGERIA
6 | 102,ANDORRA
7 | 324,ANGOLA
8 | 529,ANGUILLA
9 | 518,ANTIGUA-BARBUDA
10 | 687,ARGENTINA
11 | 151,ARMENIA
12 | 532,ARUBA
13 | 438,AUSTRALIA
14 | 103,AUSTRIA
15 | 152,AZERBAIJAN
16 | 512,BAHAMAS
17 | 298,BAHRAIN
18 | 274,BANGLADESH
19 | 513,BARBADOS
20 | 104,BELGIUM
21 | 581,BELIZE
22 | 386,BENIN
23 | 509,BERMUDA
24 | 153,BELARUS
25 | 242,BHUTAN
26 | 688,BOLIVIA
27 | 717,"BONAIRE, ST EUSTATIUS, SABA"
28 | 164,BOSNIA-HERZEGOVINA
29 | 336,BOTSWANA
30 | 689,BRAZIL
31 | 525,BRITISH VIRGIN ISLANDS
32 | 217,BRUNEI
33 | 105,BULGARIA
34 | 393,BURKINA FASO
35 | 243,BURMA
36 | 375,BURUNDI
37 | 310,CAMEROON
38 | 326,CAPE VERDE
39 | 526,CAYMAN ISLANDS
40 | 383,CENTRAL AFRICAN REPUBLIC
41 | 384,CHAD
42 | 690,CHILE
43 | 245,"CHINA, PRC"
44 | 721,CURACAO
45 | 270,CHRISTMAS ISLAND
46 | 271,COCOS ISLANDS
47 | 691,COLOMBIA
48 | 317,COMOROS
49 | 385,CONGO
50 | 467,COOK ISLANDS
51 | 575,COSTA RICA
52 | 165,CROATIA
53 | 584,CUBA
54 | 218,CYPRUS
55 | 140,CZECH REPUBLIC
56 | 723,FAROE ISLANDS (PART OF DENMARK)
57 | 108,DENMARK
58 | 322,DJIBOUTI
59 | 519,DOMINICA
60 | 585,DOMINICAN REPUBLIC
61 | 240,EAST TIMOR
62 | 692,ECUADOR
63 | 368,EGYPT
64 | 576,EL SALVADOR
65 | 399,EQUATORIAL GUINEA
66 | 372,ERITREA
67 | 109,ESTONIA
68 | 369,ETHIOPIA
69 | 604,FALKLAND ISLANDS
70 | 413,FIJI
71 | 110,FINLAND
72 | 111,FRANCE
73 | 601,FRENCH GUIANA
74 | 411,FRENCH POLYNESIA
75 | 387,GABON
76 | 338,GAMBIA
77 | 758,GAZA STRIP
78 | 154,GEORGIA
79 | 112,GERMANY
80 | 339,GHANA
81 | 143,GIBRALTAR
82 | 113,GREECE
83 | 520,GRENADA
84 | 507,GUADELOUPE
85 | 577,GUATEMALA
86 | 382,GUINEA
87 | 327,GUINEA-BISSAU
88 | 603,GUYANA
89 | 586,HAITI
90 | 726,HEARD AND MCDONALD IS.
91 | 149,HOLY SEE/VATICAN
92 | 528,HONDURAS
93 | 206,HONG KONG
94 | 114,HUNGARY
95 | 115,ICELAND
96 | 213,INDIA
97 | 759,INDIAN OCEAN AREAS (FRENCH)
98 | 729,INDIAN OCEAN TERRITORY
99 | 204,INDONESIA
100 | 249,IRAN
101 | 250,IRAQ
102 | 116,IRELAND
103 | 251,ISRAEL
104 | 117,ITALY
105 | 388,IVORY COAST
106 | 514,JAMAICA
107 | 209,JAPAN
108 | 253,JORDAN
109 | 201,KAMPUCHEA
110 | 155,KAZAKHSTAN
111 | 340,KENYA
112 | 414,KIRIBATI
113 | 732,KOSOVO
114 | 272,KUWAIT
115 | 156,KYRGYZSTAN
116 | 203,LAOS
117 | 118,LATVIA
118 | 255,LEBANON
119 | 335,LESOTHO
120 | 370,LIBERIA
121 | 381,LIBYA
122 | 119,LIECHTENSTEIN
123 | 120,LITHUANIA
124 | 121,LUXEMBOURG
125 | 214,MACAU
126 | 167,MACEDONIA
127 | 320,MADAGASCAR
128 | 345,MALAWI
129 | 273,MALAYSIA
130 | 220,MALDIVES
131 | 392,MALI
132 | 145,MALTA
133 | 472,MARSHALL ISLANDS
134 | 511,MARTINIQUE
135 | 389,MAURITANIA
136 | 342,MAURITIUS
137 | 760,MAYOTTE (AFRICA - FRENCH)
138 | 473,"MICRONESIA, FED. STATES OF"
139 | 157,MOLDOVA
140 | 122,MONACO
141 | 299,MONGOLIA
142 | 735,MONTENEGRO
143 | 521,MONTSERRAT
144 | 332,MOROCCO
145 | 329,MOZAMBIQUE
146 | 371,NAMIBIA
147 | 440,NAURU
148 | 257,NEPAL
149 | 123,NETHERLANDS
150 | 508,NETHERLANDS ANTILLES
151 | 409,NEW CALEDONIA
152 | 464,NEW ZEALAND
153 | 579,NICARAGUA
154 | 390,NIGER
155 | 343,NIGERIA
156 | 470,NIUE
157 | 275,NORTH KOREA
158 | 124,NORWAY
159 | 256,OMAN
160 | 258,PAKISTAN
161 | 474,PALAU
162 | 743,PALESTINE
163 | 504,PANAMA
164 | 441,PAPUA NEW GUINEA
165 | 693,PARAGUAY
166 | 694,PERU
167 | 260,PHILIPPINES
168 | 416,PITCAIRN ISLANDS
169 | 107,POLAND
170 | 126,PORTUGAL
171 | 297,QATAR
172 | 748,REPUBLIC OF SOUTH SUDAN
173 | 321,REUNION
174 | 127,ROMANIA
175 | 158,RUSSIA
176 | 376,RWANDA
177 | 128,SAN MARINO
178 | 330,SAO TOME AND PRINCIPE
179 | 261,SAUDI ARABIA
180 | 391,SENEGAL
181 | 142,SERBIA AND MONTENEGRO
182 | 745,SERBIA
183 | 347,SEYCHELLES
184 | 348,SIERRA LEONE
185 | 207,SINGAPORE
186 | 141,SLOVAKIA
187 | 166,SLOVENIA
188 | 412,SOLOMON ISLANDS
189 | 397,SOMALIA
190 | 373,SOUTH AFRICA
191 | 276,SOUTH KOREA
192 | 129,SPAIN
193 | 244,SRI LANKA
194 | 346,ST. HELENA
195 | 522,ST. KITTS-NEVIS
196 | 523,ST. LUCIA
197 | 502,ST. PIERRE AND MIQUELON
198 | 524,ST. VINCENT-GRENADINES
199 | 716,SAINT BARTHELEMY
200 | 736,SAINT MARTIN
201 | 749,SAINT MAARTEN
202 | 350,SUDAN
203 | 602,SURINAME
204 | 351,SWAZILAND
205 | 130,SWEDEN
206 | 131,SWITZERLAND
207 | 262,SYRIA
208 | 268,TAIWAN
209 | 159,TAJIKISTAN
210 | 353,TANZANIA
211 | 263,THAILAND
212 | 304,TOGO
213 | 417,TONGA
214 | 516,TRINIDAD AND TOBAGO
215 | 323,TUNISIA
216 | 264,TURKEY
217 | 161,TURKMENISTAN
218 | 527,TURKS AND CAICOS ISLANDS
219 | 420,TUVALU
220 | 352,UGANDA
221 | 162,UKRAINE
222 | 296,UNITED ARAB EMIRATES
223 | 135,UNITED KINGDOM
224 | 695,URUGUAY
225 | 163,UZBEKISTAN
226 | 410,VANUATU
227 | 696,VENEZUELA
228 | 266,VIETNAM
229 | 469,WALLIS AND FUTUNA ISLANDS
230 | 757,WEST INDIES (FRENCH)
231 | 333,WESTERN SAHARA
232 | 465,WESTERN SAMOA
233 | 216,YEMEN
234 | 139,YUGOSLAVIA
235 | 301,ZAIRE
236 | 344,ZAMBIA
237 | 315,ZIMBABWE
238 | 403,INVALID: AMERICAN SAMOA
239 | 712,INVALID: ANTARCTICA
240 | 700,INVALID: BORN ON BOARD SHIP
241 | 719,INVALID: BOUVET ISLAND (ANTARCTICA/NORWAY TERR.)
242 | 574,INVALID: CANADA
243 | 720,INVALID: CANTON AND ENDERBURY ISLS
244 | 106,INVALID: CZECHOSLOVAKIA
245 | 739,INVALID: DRONNING MAUD LAND (ANTARCTICA-NORWAY)
246 | 394,INVALID: FRENCH SOUTHERN AND ANTARCTIC
247 | 501,INVALID: GREENLAND
248 | 404,INVALID: GUAM
249 | 730,INVALID: INTERNATIONAL WATERS
250 | 731,INVALID: JOHNSON ISLAND
251 | 471,"INVALID: MARIANA ISLANDS, NORTHERN"
252 | 737,INVALID: MIDWAY ISLANDS
253 | 753,INVALID: MINOR OUTLYING ISLANDS - USA
254 | 740,INVALID: NEUTRAL ZONE (S. ARABIA/IRAQ)
255 | 710,INVALID: NON-QUOTA IMMIGRANT
256 | 505,INVALID: PUERTO RICO
257 | 0,INVALID: STATELESS
258 | 705,INVALID: STATELESS
259 | 583,INVALID: UNITED STATES
260 | 407,INVALID: UNITED STATES
261 | 999,INVALID: UNKNOWN
262 | 239,INVALID: UNKNOWN COUNTRY
263 | 134,INVALID: USSR
264 | 506,INVALID: U.S. VIRGIN ISLANDS
265 | 755,INVALID: WAKE ISLAND
266 | 311,Collapsed Tanzania (should not show)
267 | 741,Collapsed Curacao (should not show)
268 | 54,No Country Code (54)
269 | 100,No Country Code (100)
270 | 187,No Country Code (187)
271 | 190,No Country Code (190)
272 | 200,No Country Code (200)
273 | 219,No Country Code (219)
274 | 238,No Country Code (238)
275 | 277,No Country Code (277)
276 | 293,No Country Code (293)
277 | 300,No Country Code (300)
278 | 319,No Country Code (319)
279 | 365,No Country Code (365)
280 | 395,No Country Code (395)
281 | 400,No Country Code (400)
282 | 485,No Country Code (485)
283 | 503,No Country Code (503)
284 | 589,No Country Code (589)
285 | 592,No Country Code (592)
286 | 791,No Country Code (791)
287 | 849,No Country Code (849)
288 | 914,No Country Code (914)
289 | 944,No Country Code (944)
290 | 996,No Country Code (996)
--------------------------------------------------------------------------------
/lookup/I94MODE.csv:
--------------------------------------------------------------------------------
1 | ID,Mode
2 | 1,Air
3 | 2,Sea
4 | 3,Land
5 | 9,Not reported
--------------------------------------------------------------------------------
/lookup/I94PORT.csv:
--------------------------------------------------------------------------------
1 | ID,Port
2 | ALC,"ALCAN, AK"
3 | ANC,"ANCHORAGE, AK"
4 | BAR,"BAKER AAF - BAKER ISLAND, AK"
5 | DAC,"DALTONS CACHE, AK"
6 | PIZ,"DEW STATION PT LAY DEW, AK"
7 | DTH,"DUTCH HARBOR, AK"
8 | EGL,"EAGLE, AK"
9 | FRB,"FAIRBANKS, AK"
10 | HOM,"HOMER, AK"
11 | HYD,"HYDER, AK"
12 | JUN,"JUNEAU, AK"
13 | 5KE,"KETCHIKAN, AK"
14 | KET,"KETCHIKAN, AK"
15 | MOS,"MOSES POINT INTERMEDIATE, AK"
16 | NIK,"NIKISKI, AK"
17 | NOM,"NOM, AK"
18 | PKC,"POKER CREEK, AK"
19 | ORI,"PORT LIONS SPB, AK"
20 | SKA,"SKAGWAY, AK"
21 | SNP,"ST. PAUL ISLAND, AK"
22 | TKI,"TOKEEN, AK"
23 | WRA,"WRANGELL, AK"
24 | HSV,"MADISON COUNTY - HUNTSVILLE, AL"
25 | MOB,"MOBILE, AL"
26 | LIA,"LITTLE ROCK, AR (BPS)"
27 | ROG,"ROGERS ARPT, AR"
28 | DOU,"DOUGLAS, AZ"
29 | LUK,"LUKEVILLE, AZ"
30 | MAP,MARIPOSA AZ
31 | NAC,"NACO, AZ"
32 | NOG,"NOGALES, AZ"
33 | PHO,"PHOENIX, AZ"
34 | POR,"PORTAL, AZ"
35 | SLU,"SAN LUIS, AZ"
36 | SAS,"SASABE, AZ"
37 | TUC,"TUCSON, AZ"
38 | YUI,"YUMA, AZ"
39 | AND,"ANDRADE, CA"
40 | BUR,"BURBANK, CA"
41 | CAL,"CALEXICO, CA"
42 | CAO,"CAMPO, CA"
43 | FRE,"FRESNO, CA"
44 | ICP,"IMPERIAL COUNTY, CA"
45 | LNB,"LONG BEACH, CA"
46 | LOS,"LOS ANGELES, CA"
47 | BFL,"MEADOWS FIELD - BAKERSFIELD, CA"
48 | OAK,"OAKLAND, CA"
49 | ONT,"ONTARIO, CA"
50 | OTM,"OTAY MESA, CA"
51 | BLT,"PACIFIC, HWY. STATION, CA"
52 | PSP,"PALM SPRINGS, CA"
53 | SAC,"SACRAMENTO, CA"
54 | SLS,"SALINAS, CA (BPS)"
55 | SDP,"SAN DIEGO, CA"
56 | SFR,"SAN FRANCISCO, CA"
57 | SNJ,"SAN JOSE, CA"
58 | SLO,"SAN LUIS OBISPO, CA"
59 | SLI,"SAN LUIS OBISPO, CA (BPS)"
60 | SPC,"SAN PEDRO, CA"
61 | SYS,"SAN YSIDRO, CA"
62 | SAA,"SANTA ANA, CA"
63 | STO,"STOCKTON, CA (BPS)"
64 | TEC,"TECATE, CA"
65 | TRV,"TRAVIS-AFB, CA"
66 | APA,"ARAPAHOE COUNTY, CO"
67 | ASE,"ASPEN, CO #ARPT"
68 | COS,"COLORADO SPRINGS, CO"
69 | DEN,"DENVER, CO"
70 | DRO,"LA PLATA - DURANGO, CO"
71 | BDL,"BRADLEY INTERNATIONAL, CT"
72 | BGC,"BRIDGEPORT, CT"
73 | GRT,"GROTON, CT"
74 | HAR,"HARTFORD, CT"
75 | NWH,"NEW HAVEN, CT"
76 | NWL,"NEW LONDON, CT"
77 | TST,"NEWINGTON DATA CENTER TEST, CT"
78 | WAS,WASHINGTON DC
79 | DOV,"DOVER AFB, DE"
80 | DVD,"DOVER-AFB, DE"
81 | WLL,"WILMINGTON, DE"
82 | BOC,"BOCAGRANDE, FL"
83 | SRQ,"BRADENTON - SARASOTA, FL"
84 | CAN,"CAPE CANAVERAL, FL"
85 | DAB,"DAYTONA BEACH INTERNATIONAL, FL"
86 | FRN,"FERNANDINA, FL"
87 | FTL,"FORT LAUDERDALE, FL"
88 | FMY,"FORT MYERS, FL"
89 | FPF,"FORT PIERCE, FL"
90 | HUR,"HURLBURT FIELD, FL"
91 | GNV,"J R ALISON MUNI - GAINESVILLE, FL"
92 | JAC,"JACKSONVILLE, FL"
93 | KEY,"KEY WEST, FL"
94 | LEE,"LEESBURG MUNICIPAL AIRPORT, FL"
95 | MLB,"MELBOURNE, FL"
96 | MIA,"MIAMI, FL"
97 | APF,"NAPLES, FL #ARPT"
98 | OPF,"OPA LOCKA, FL"
99 | ORL,"ORLANDO, FL"
100 | PAN,"PANAMA CITY, FL"
101 | PEN,"PENSACOLA, FL"
102 | PCF,"PORT CANAVERAL, FL"
103 | PEV,"PORT EVERGLADES, FL"
104 | PSJ,"PORT ST JOE, FL"
105 | SFB,"SANFORD, FL"
106 | SGJ,"ST AUGUSTINE ARPT, FL"
107 | SAU,"ST AUGUSTINE, FL"
108 | FPR,"ST LUCIE COUNTY, FL"
109 | SPE,"ST PETERSBURG, FL"
110 | TAM,"TAMPA, FL"
111 | WPB,"WEST PALM BEACH, FL"
112 | ATL,"ATLANTA, GA"
113 | BRU,"BRUNSWICK, GA"
114 | AGS,"BUSH FIELD - AUGUSTA, GA"
115 | SAV,"SAVANNAH, GA"
116 | AGA,"AGANA, GU"
117 | HHW,"HONOLULU, HI"
118 | OGG,"KAHULUI - MAUI, HI"
119 | KOA,"KEAHOLE-KONA, HI"
120 | LIH,"LIHUE, HI"
121 | CID,"CEDAR RAPIDS/IOWA CITY, IA"
122 | DSM,"DES MOINES, IA"
123 | BOI,"AIR TERM. (GOWEN FLD) BOISE, ID"
124 | EPI,"EASTPORT, ID"
125 | IDA,"FANNING FIELD - IDAHO FALLS, ID"
126 | PTL,"PORTHILL, ID"
127 | SPI,"CAPITAL - SPRINGFIELD, IL"
128 | CHI,"CHICAGO, IL"
129 | DPA,"DUPAGE COUNTY, IL"
130 | PIA,"GREATER PEORIA, IL"
131 | RFD,"GREATER ROCKFORD, IL"
132 | UGN,"MEMORIAL - WAUKEGAN, IL"
133 | GAR,"GARY, IN"
134 | HMM,"HAMMOND, IN"
135 | INP,"INDIANAPOLIS, IN"
136 | MRL,"MERRILLVILLE, IN"
137 | SBN,"SOUTH BEND, IN"
138 | ICT,"MID-CONTINENT - WITCHITA, KS"
139 | LEX,"BLUE GRASS - LEXINGTON, KY"
140 | LOU,"LOUISVILLE, KY"
141 | BTN,"BATON ROUGE, LA"
142 | LKC,"LAKE CHARLES, LA"
143 | LAK,"LAKE CHARLES, LA (BPS)"
144 | MLU,"MONROE, LA"
145 | MGC,"MORGAN CITY, LA"
146 | NOL,"NEW ORLEANS, LA"
147 | BOS,"BOSTON, MA"
148 | GLO,"GLOUCESTER, MA"
149 | BED,"HANSCOM FIELD - BEDFORD, MA"
150 | LYN,"LYNDEN, WA"
151 | ADW,"ANDREWS AFB, MD"
152 | BAL,"BALTIMORE, MD"
153 | MKG,"MUSKEGON, MD"
154 | PAX,"PATUXENT RIVER, MD"
155 | BGM,"BANGOR, ME"
156 | BOO,"BOOTHBAY HARBOR, ME"
157 | BWM,"BRIDGEWATER, ME"
158 | BCK,"BUCKPORT, ME"
159 | CLS,"CALAIS, ME"
160 | CRB,"CARIBOU, ME"
161 | COB,"COBURN GORE, ME"
162 | EST,"EASTCOURT, ME"
163 | EPT,"EASTPORT MUNICIPAL, ME"
164 | EPM,"EASTPORT, ME"
165 | FOR,"FOREST CITY, ME"
166 | FTF,"FORT FAIRFIELD, ME"
167 | FTK,"FORT KENT, ME"
168 | HML,"HAMIIN, ME"
169 | HTM,"HOULTON, ME"
170 | JKM,"JACKMAN, ME"
171 | KAL,"KALISPEL, MT"
172 | LIM,"LIMESTONE, ME"
173 | LUB,"LUBEC, ME"
174 | MAD,"MADAWASKA, ME"
175 | POM,"PORTLAND, ME"
176 | RGM,"RANGELEY, ME (BPS)"
177 | SBR,"SOUTH BREWER, ME"
178 | SRL,"ST AURELIE, ME"
179 | SPA,"ST PAMPILE, ME"
180 | VNB,"VAN BUREN, ME"
181 | VCB,"VANCEBORO, ME"
182 | AGN,"ALGONAC, MI"
183 | ALP,"ALPENA, MI"
184 | BCY,"BAY CITY, MI"
185 | DET,"DETROIT, MI"
186 | GRP,"GRAND RAPIDS, MI"
187 | GRO,"GROSSE ISLE, MI"
188 | ISL,"ISLE ROYALE, MI"
189 | MRC,"MARINE CITY, MI"
190 | MRY,"MARYSVILLE, MI"
191 | PTK,"OAKLAND COUNTY - PONTIAC, MI"
192 | PHU,"PORT HURON, MI"
193 | RBT,"ROBERTS LANDING, MI"
194 | SAG,"SAGINAW, MI"
195 | SSM,"SAULT STE. MARIE, MI"
196 | SCL,"ST CLAIR, MI"
197 | YIP,"WILLOW RUN - YPSILANTI, MI"
198 | BAU,"BAUDETTE, MN"
199 | CAR,"CARIBOU MUNICIPAL AIRPORT, MN"
200 | GTF,"Collapsed into INT, MN"
201 | INL,"Collapsed into INT, MN"
202 | CRA,"CRANE LAKE, MN"
203 | MIC,"CRYSTAL MUNICIPAL AIRPORT, MN"
204 | DUL,"DULUTH, MN"
205 | ELY,"ELY, MN"
206 | GPM,"GRAND PORTAGE, MN"
207 | SVC,"GRANT COUNTY - SILVER CITY, MN"
208 | INT,"INTL FALLS, MN"
209 | LAN,"LANCASTER, MN"
210 | MSP,"MINN./ST PAUL, MN"
211 | LIN,"NORTHERN SVC CENTER, MN"
212 | NOY,"NOYES, MN"
213 | PIN,"PINE CREEK, MN"
214 | 48Y,"PINECREEK BORDER ARPT, MN"
215 | RAN,"RAINER, MN"
216 | RST,"ROCHESTER, MN"
217 | ROS,"ROSEAU, MN"
218 | SPM,"ST PAUL, MN"
219 | WSB,"WARROAD INTL, SPB, MN"
220 | WAR,"WARROAD, MN"
221 | KAN,"KANSAS CITY, MO"
222 | SGF,"SPRINGFIELD-BRANSON, MO"
223 | STL,"ST LOUIS, MO"
224 | WHI,"WHITETAIL, MT"
225 | WHM,"WILD HORSE, MT"
226 | GPT,"BILOXI REGIONAL, MS"
227 | GTR,"GOLDEN TRIANGLE LOWNDES CNTY, MS"
228 | GUL,"GULFPORT, MS"
229 | PAS,"PASCAGOULA, MS"
230 | JAN,"THOMPSON FIELD - JACKSON, MS"
231 | BIL,"BILLINGS, MT"
232 | BTM,"BUTTE, MT"
233 | CHF,"CHIEF MT, MT"
234 | CTB,"CUT BANK MUNICIPAL, MT"
235 | CUT,"CUT BANK, MT"
236 | DLB,"DEL BONITA, MT"
237 | EUR,"EUREKA, MT (BPS)"
238 | BZN,"GALLATIN FIELD - BOZEMAN, MT"
239 | FCA,"GLACIER NATIONAL PARK, MT"
240 | GGW,"GLASGOW, MT"
241 | GRE,"GREAT FALLS, MT"
242 | HVR,"HAVRE, MT"
243 | HEL,"HELENA, MT"
244 | LWT,"LEWISTON, MT"
245 | MGM,"MORGAN, MT"
246 | OPH,"OPHEIM, MT"
247 | PIE,"PIEGAN, MT"
248 | RAY,"RAYMOND, MT"
249 | ROO,"ROOSVILLE, MT"
250 | SCO,"SCOBEY, MT"
251 | SWE,"SWEETGTASS, MT"
252 | TRL,"TRIAL CREEK, MT"
253 | TUR,"TURNER, MT"
254 | WCM,"WILLOW CREEK, MT"
255 | CLT,"CHARLOTTE, NC"
256 | FAY,"FAYETTEVILLE, NC"
257 | MRH,"MOREHEAD CITY, NC"
258 | FOP,"MORRIS FIELDS AAF, NC"
259 | GSO,"PIEDMONT TRIAD INTL AIRPORT, NC"
260 | RDU,"RALEIGH/DURHAM, NC"
261 | SSC,"SHAW AFB - SUMTER, NC"
262 | WIL,"WILMINGTON, NC"
263 | AMB,"AMBROSE, ND"
264 | ANT,"ANTLER, ND"
265 | CRY,"CARBURY, ND"
266 | DNS,"DUNSEITH, ND"
267 | FAR,"FARGO, ND"
268 | FRT,"FORTUNA, ND"
269 | GRF,"GRAND FORKS, ND"
270 | HNN,"HANNAH, ND"
271 | HNS,"HANSBORO, ND"
272 | MAI,"MAIDA, ND"
273 | MND,"MINOT, ND"
274 | NEC,"NECHE, ND"
275 | NOO,"NOONAN, ND"
276 | NRG,"NORTHGATE, ND"
277 | PEM,"PEMBINA, ND"
278 | SAR,"SARLES, ND"
279 | SHR,"SHERWOOD, ND"
280 | SJO,"ST JOHN, ND"
281 | WAL,"WALHALLA, ND"
282 | WHO,"WESTHOPE, ND"
283 | WND,"WILLISTON, ND"
284 | OMA,"OMAHA, NE"
285 | LEB,"LEBANON, NH"
286 | MHT,"MANCHESTER, NH"
287 | PNH,"PITTSBURG, NH"
288 | PSM,"PORTSMOUTH, NH"
289 | BYO,"BAYONNE, NJ"
290 | CNJ,"CAMDEN, NJ"
291 | HOB,"HOBOKEN, NJ"
292 | JER,"JERSEY CITY, NJ"
293 | WRI,"MC GUIRE AFB - WRIGHTSOWN, NJ"
294 | MMU,"MORRISTOWN, NJ"
295 | NEW,"NEWARK/TETERBORO, NJ"
296 | PER,"PERTH AMBOY, NJ"
297 | ACY,"POMONA FIELD - ATLANTIC CITY, NJ"
298 | ALA,"ALAMAGORDO, NM (BPS)"
299 | ABQ,"ALBUQUERQUE, NM"
300 | ANP,"ANTELOPE WELLS, NM"
301 | CRL,"CARLSBAD, NM"
302 | COL,"COLUMBUS, NM"
303 | CDD,"CRANE LAKE - ST. LOUIS CNTY, NM"
304 | DNM,"DEMING, NM (BPS)"
305 | LAS,"LAS CRUCES, NM"
306 | LOB,"LORDSBURG, NM (BPS)"
307 | RUI,"RUIDOSO, NM"
308 | STR,"SANTA TERESA, NM"
309 | RNO,"CANNON INTL - RENO/TAHOE, NV"
310 | FLX,"FALLON MUNICIPAL AIRPORT, NV"
311 | LVG,"LAS VEGAS, NV"
312 | REN,"RENO, NV"
313 | ALB,"ALBANY, NY"
314 | AXB,"ALEXANDRIA BAY, NY"
315 | BUF,"BUFFALO, NY"
316 | CNH,"CANNON CORNERS, NY"
317 | CAP,"CAPE VINCENT, NY"
318 | CHM,"CHAMPLAIN, NY"
319 | CHT,"CHATEAUGAY, NY"
320 | CLA,"CLAYTON, NY"
321 | FTC,"FORT COVINGTON, NY"
322 | LAG,"LA GUARDIA, NY"
323 | LEW,"LEWISTON, NY"
324 | MAS,"MASSENA, NY"
325 | MAG,"MCGUIRE AFB, NY"
326 | MOO,"MOORES, NY"
327 | MRR,"MORRISTOWN, NY"
328 | NYC,"NEW YORK, NY"
329 | NIA,"NIAGARA FALLS, NY"
330 | OGD,"OGDENSBURG, NY"
331 | OSW,"OSWEGO, NY"
332 | ELM,"REGIONAL ARPT - HORSEHEAD, NY"
333 | ROC,"ROCHESTER, NY"
334 | ROU,"ROUSES POINT, NY"
335 | SWF,"STEWART - ORANGE CNTY, NY"
336 | SYR,"SYRACUSE, NY"
337 | THO,"THOUSAND ISLAND BRIDGE, NY"
338 | TRO,"TROUT RIVER, NY"
339 | WAT,"WATERTOWN, NY"
340 | HPN,"WESTCHESTER - WHITE PLAINS, NY"
341 | WRB,"WHIRLPOOL BRIDGE, NY"
342 | YOU,"YOUNGSTOWN, NY"
343 | AKR,"AKRON, OH"
344 | ATB,"ASHTABULA, OH"
345 | CIN,"CINCINNATI, OH"
346 | CLE,"CLEVELAND, OH"
347 | CLM,"COLUMBUS, OH"
348 | LOR,"LORAIN, OH"
349 | MBO,"MARBLE HEADS, OH"
350 | SDY,"SANDUSKY, OH"
351 | TOL,"TOLEDO, OH"
352 | OKC,"OKLAHOMA CITY, OK"
353 | TUL,"TULSA, OK"
354 | AST,"ASTORIA, OR"
355 | COO,"COOS BAY, OR"
356 | HIO,"HILLSBORO, OR"
357 | MED,"MEDFORD, OR"
358 | NPT,"NEWPORT, OR"
359 | POO,"PORTLAND, OR"
360 | PUT,"PUT-IN-BAY, OH"
361 | RDM,"ROBERTS FIELDS - REDMOND, OR"
362 | ERI,"ERIE, PA"
363 | MDT,"HARRISBURG, PA"
364 | HSB,"HARRISONBURG, PA"
365 | PHI,"PHILADELPHIA, PA"
366 | PIT,"PITTSBURG, PA"
367 | AGU,"AGUADILLA, PR"
368 | BQN,"BORINQUEN - AGUADILLO, PR"
369 | JCP,"CULEBRA - BENJAMIN RIVERA, PR"
370 | ENS,"ENSENADA, PR"
371 | FAJ,"FAJARDO, PR"
372 | HUM,"HUMACAO, PR"
373 | JOB,"JOBOS, PR"
374 | MAY,"MAYAGUEZ, PR"
375 | PON,"PONCE, PR"
376 | PSE,"PONCE-MERCEDITA, PR"
377 | SAJ,"SAN JUAN, PR"
378 | VQS,"VIEQUES-ARPT, PR"
379 | PRO,"PROVIDENCE, RI"
380 | PVD,"THEODORE FRANCIS - WARWICK, RI"
381 | CHL,"CHARLESTON, SC"
382 | CAE,"COLUMBIA, SC #ARPT"
383 | GEO,"GEORGETOWN, SC"
384 | GSP,"GREENVILLE, SC"
385 | GRR,"GREER, SC"
386 | MYR,"MYRTLE BEACH, SC"
387 | SPF,"BLACK HILLS, SPEARFISH, SD"
388 | HON,"HOWES REGIONAL ARPT - HURON, SD"
389 | SAI,"SAIPAN, SPN"
390 | TYS,"MC GHEE TYSON - ALCOA, TN"
391 | MEM,"MEMPHIS, TN"
392 | NSV,"NASHVILLE, TN"
393 | TRI,"TRI CITY ARPT, TN"
394 | ADS,"ADDISON AIRPORT- ADDISON, TX"
395 | ADT,"AMISTAD DAM, TX"
396 | ANZ,"ANZALDUAS, TX"
397 | AUS,"AUSTIN, TX"
398 | BEA,"BEAUMONT, TX"
399 | BBP,"BIG BEND PARK, TX (BPS)"
400 | SCC,"BP SPEC COORD. CTR, TX"
401 | BTC,"BP TACTICAL UNIT, TX"
402 | BOA,"BRIDGE OF AMERICAS, TX"
403 | BRO,"BROWNSVILLE, TX"
404 | CRP,"CORPUS CHRISTI, TX"
405 | DAL,"DALLAS, TX"
406 | DLR,"DEL RIO, TX"
407 | DNA,"DONNA, TX"
408 | EGP,"EAGLE PASS, TX"
409 | ELP,"EL PASO, TX"
410 | FAB,"FABENS, TX"
411 | FAL,"FALCON HEIGHTS, TX"
412 | FTH,"FORT HANCOCK, TX"
413 | AFW,"FORT WORTH ALLIANCE, TX"
414 | FPT,"FREEPORT, TX"
415 | GAL,"GALVESTON, TX"
416 | HLG,"HARLINGEN, TX"
417 | HID,"HIDALGO, TX"
418 | HOU,"HOUSTON, TX"
419 | SGR,"HULL FIELD, SUGAR LAND ARPT, TX"
420 | LLB,"JUAREZ-LINCOLN BRIDGE, TX"
421 | LCB,"LAREDO COLUMBIA BRIDGE, TX"
422 | LRN,"LAREDO NORTH, TX"
423 | LAR,"LAREDO, TX"
424 | LSE,"LOS EBANOS, TX"
425 | IND,"LOS INDIOS, TX"
426 | LOI,"LOS INDIOS, TX"
427 | MRS,"MARFA, TX (BPS)"
428 | MCA,"MCALLEN, TX"
429 | MAF,"ODESSA REGIONAL, TX"
430 | PDN,"PASO DEL NORTE,TX"
431 | PBB,"PEACE BRIDGE, NY"
432 | PHR,"PHARR, TX"
433 | PAR,"PORT ARTHUR, TX"
434 | ISB,"PORT ISABEL, TX"
435 | POE,"PORT OF EL PASO, TX"
436 | PRE,"PRESIDIO, TX"
437 | PGR,"PROGRESO, TX"
438 | RIO,"RIO GRANDE CITY, TX"
439 | ROM,"ROMA, TX"
440 | SNA,"SAN ANTONIO, TX"
441 | SNN,"SANDERSON, TX"
442 | VIB,"VETERAN INTL BRIDGE, TX"
443 | YSL,"YSLETA, TX"
444 | CHA,"CHARLOTTE AMALIE, VI"
445 | CHR,"CHRISTIANSTED, VI"
446 | CRU,"CRUZ BAY, ST JOHN, VI"
447 | FRK,"FREDERIKSTED, VI"
448 | STT,"ST THOMAS, VI"
449 | LGU,"CACHE AIRPORT - LOGAN, UT"
450 | SLC,"SALT LAKE CITY, UT"
451 | CHO,"ALBEMARLE CHARLOTTESVILLE, VA"
452 | DAA,"DAVISON AAF - FAIRFAX CNTY, VA"
453 | HOP,"HOPEWELL, VA"
454 | HEF,"MANASSAS, VA #ARPT"
455 | NWN,"NEWPORT, VA"
456 | NOR,"NORFOLK, VA"
457 | RCM,"RICHMOND, VA"
458 | ABS,"ALBURG SPRINGS, VT"
459 | ABG,"ALBURG, VT"
460 | BEB,"BEEBE PLAIN, VT"
461 | BEE,"BEECHER FALLS, VT"
462 | BRG,"BURLINGTON, VT"
463 | CNA,"CANAAN, VT"
464 | DER,"DERBY LINE, VT (I-91)"
465 | DLV,"DERBY LINE, VT (RT. 5)"
466 | ERC,"EAST RICHFORD, VT"
467 | HIG,"HIGHGATE SPRINGS, VT"
468 | MOR,"MORSES LINE, VT"
469 | NPV,"NEWPORT, VT"
470 | NRT,"NORTH TROY, VT"
471 | NRN,"NORTON, VT"
472 | PIV,"PINNACLE ROAD, VT"
473 | RIF,"RICHFORT, VT"
474 | STA,"ST ALBANS, VT"
475 | SWB,"SWANTON, VT (BP - SECTOR HQ)"
476 | WBE,"WEST BERKSHIRE, VT"
477 | ABE,"ABERDEEN, WA"
478 | ANA,"ANACORTES, WA"
479 | BEL,"BELLINGHAM, WA"
480 | BLI,"BELLINGHAM, WASHINGTON #INTL"
481 | BLA,"BLAINE, WA"
482 | BWA,"BOUNDARY, WA"
483 | CUR,"CURLEW, WA (BPS)"
484 | DVL,"DANVILLE, WA"
485 | EVE,"EVERETT, WA"
486 | FER,"FERRY, WA"
487 | FRI,"FRIDAY HARBOR, WA"
488 | FWA,"FRONTIER, WA"
489 | KLM,"KALAMA, WA"
490 | LAU,"LAURIER, WA"
491 | LON,"LONGVIEW, WA"
492 | MET,"METALINE FALLS, WA"
493 | MWH,"MOSES LAKE GRANT COUNTY ARPT, WA"
494 | NEA,"NEAH BAY, WA"
495 | NIG,"NIGHTHAWK, WA"
496 | OLY,"OLYMPIA, WA"
497 | ORO,"OROVILLE, WA"
498 | PWB,"PASCO, WA"
499 | PIR,"POINT ROBERTS, WA"
500 | PNG,"PORT ANGELES, WA"
501 | PTO,"PORT TOWNSEND, WA"
502 | SEA,"SEATTLE, WA"
503 | SPO,"SPOKANE, WA"
504 | SUM,"SUMAS, WA"
505 | TAC,"TACOMA, WA"
506 | PSC,"TRI-CITIES - PASCO, WA"
507 | VAN,"VANCOUVER, WA"
508 | AGM,"ALGOMA, WI"
509 | BAY,"BAYFIELD, WI"
510 | GRB,"GREEN BAY, WI"
511 | MNW,"MANITOWOC, WI"
512 | MIL,"MILWAUKEE, WI"
513 | MSN,"TRUAX FIELD - DANE COUNTY, WI"
514 | CHS,"CHARLESTON, WV"
515 | CLK,"CLARKSBURG, WV"
516 | BLF,"MERCER COUNTY, WV"
517 | CSP,"CASPER, WY"
518 | XXX,NOT REPORTED/UNKNOWN
519 | 888,UNIDENTIFED AIR / SEAPORT
520 | UNK,UNKNOWN POE
521 | CLG,"CALGARY, CANADA"
522 | EDA,"EDMONTON, CANADA"
523 | YHC,"HAKAI PASS, CANADA"
524 | HAL,"Halifax, NS, Canada"
525 | MON,"MONTREAL, CANADA"
526 | OTT,"OTTAWA, CANADA"
527 | YXE,"SASKATOON, CANADA"
528 | TOR,"TORONTO, CANADA"
529 | VCV,"VANCOUVER, CANADA"
530 | VIC,"VICTORIA, CANADA"
531 | WIN,"WINNIPEG, CANADA"
532 | AMS,"AMSTERDAM-SCHIPHOL, NETHERLANDS"
533 | ARB,"ARUBA, NETH ANTILLES"
534 | BAN,"BANKOK, THAILAND"
535 | BEI,"BEICA #ARPT, ETHIOPIA"
536 | PEK,"BEIJING CAPITAL INTL, PRC"
537 | BDA,"KINDLEY FIELD, BERMUDA"
538 | BOG,"BOGOTA, EL DORADO #ARPT, COLOMBIA"
539 | EZE,"BUENOS AIRES, MINISTRO PIST, ARGENTINA"
540 | CUN,"CANCUN, MEXICO"
541 | CRQ,"CARAVELAS, BA #ARPT, BRAZIL"
542 | MVD,"CARRASCO, URUGUAY"
543 | DUB,"DUBLIN, IRELAND"
544 | FOU,"FOUGAMOU #ARPT, GABON"
545 | FBA,"FREEPORT, BAHAMAS"
546 | MTY,"GEN M. ESCOBEDO, Monterrey, MX"
547 | HMO,"GEN PESQUEIRA GARCIA, MX"
548 | GCM,"GRAND CAYMAN, CAYMAN ISLAND"
549 | GDL,"GUADALAJARA, MIGUEL HIDAL, MX"
550 | HAM,"HAMILTON, BERMUDA"
551 | ICN,"INCHON, SEOUL KOREA"
552 | IWA,"INVALID - IWAKUNI, JAPAN"
553 | CND,"KOGALNICEANU, ROMANIA"
554 | LAH,"LABUHA ARPT, INDONESIA"
555 | DUR,"LOUIS BOTHA, SOUTH AFRICA"
556 | MAL,"MANGOLE ARPT, INDONESIA"
557 | MDE,"MEDELLIN, COLOMBIA"
558 | MEX,"JUAREZ INTL, MEXICO CITY, MX"
559 | LHR,"MIDDLESEX, ENGLAND"
560 | NBO,"NAIROBI, KENYA"
561 | NAS,"NASSAU, BAHAMAS"
562 | NCA,"NORTH CAICOS, TURK & CAIMAN"
563 | PTY,"OMAR TORRIJOS, PANAMA"
564 | SPV,"PAPUA, NEW GUINEA"
565 | UIO,"QUITO (MARISCAL SUCR), ECUADOR"
566 | RIT,"ROME, ITALY"
567 | SNO,"SAKON NAKHON #ARPT, THAILAND"
568 | SLP,"SAN LUIS POTOSI #ARPT, MEXICO"
569 | SAN,"SAN SALVADOR, EL SALVADOR"
570 | SRO,"SANTANA RAMOS #ARPT, COLOMBIA"
571 | GRU,"GUARULHOS INTL, SAO PAULO, BRAZIL"
572 | SHA,"SHANNON, IRELAND"
573 | HIL,"SHILLAVO, ETHIOPIA"
574 | TOK,"TOROKINA #ARPT, PAPUA, NEW GUINEA"
575 | VER,"VERACRUZ, MEXICO"
576 | LGW,"WEST SUSSEX, ENGLAND"
577 | ZZZ,MEXICO Land (Banco de Mexico)
578 | CHN,No PORT Code (CHN)
579 | CNC,"CANNON CORNERS, NY"
580 | MAA,Abu Dhabi
581 | AG0,"MAGNOLIA, AR"
582 | BHM,"BAR HARBOR, ME"
583 | BHX,"BIRMINGHAM, AL"
584 | CAK,"AKRON, OH"
585 | FOK,"SUFFOLK COUNTY, NY"
586 | LND,"LANDER, WY"
587 | MAR,"MARFA, TX"
588 | MLI,"MOLINE, IL"
589 | RIV,"RIVERSIDE, CA"
590 | RME,"ROME, NY"
591 | VNY,"VAN NUYS, CA"
592 | YUM,"YUMA, AZ"
593 | FRG,Collapsed (FOK) 06/15
594 | HRL,Collapsed (HLG) 06/15
595 | ISP,Collapsed (FOK) 06/15
596 | JSJ,Collapsed (SAJ) 06/15
597 | BUS,Collapsed (BUF) 06/15
598 | IAG,Collapsed (NIA) 06/15
599 | PHN,Collapsed (PHU) 06/15
600 | STN,Collapsed (STR) 06/15
601 | VMB,Collapsed (VNB) 06/15
602 | T01,Collapsed (SEA) 06/15
603 | PHF,No PORT Code (PHF)
604 | DRV,No PORT Code (DRV)
605 | FTB,No PORT Code (FTB)
606 | GAC,No PORT Code (GAC)
607 | GMT,No PORT Code (GMT)
608 | JFA,No PORT Code (JFA)
609 | JMZ,No PORT Code (JMZ)
610 | NC8,No PORT Code (NC8)
611 | NYL,No PORT Code (NYL)
612 | OAI,No PORT Code (OAI)
613 | PCW,No PORT Code (PCW)
614 | WA5,No PORT Code (WAS)
615 | WTR,No PORT Code (WTR)
616 | X96,No PORT Code (X96)
617 | XNA,No PORT Code (XNA)
618 | YGF,No PORT Code (YGF)
619 | 5T6,No PORT Code (5T6)
620 | 060,No PORT Code (60)
621 | SP0,No PORT Code (SP0)
622 | W55,No PORT Code (W55)
623 | X44,No PORT Code (X44)
624 | AUH,No PORT Code (AUH)
625 | RYY,No PORT Code (RYY)
626 | SUS,No PORT Code (SUS)
627 | 74S,No PORT Code (74S)
628 | ATW,No PORT Code (ATW)
629 | CPX,No PORT Code (CPX)
630 | MTH,No PORT Code (MTH)
631 | PFN,No PORT Code (PFN)
632 | SCH,No PORT Code (SCH)
633 | ASI,No PORT Code (ASI)
634 | BKF,No PORT Code (BKF)
635 | DAY,No PORT Code (DAY)
636 | Y62,No PORT Code (Y62)
637 | AG,No PORT Code (AG)
638 | BCM,No PORT Code (BCM)
639 | DEC,No PORT Code (DEC)
640 | PLB,No PORT Code (PLB)
641 | CXO,No PORT Code (CXO)
642 | JBQ,No PORT Code (JBQ)
643 | JIG,No PORT Code (JIG)
644 | OGS,No PORT Code (OGS)
645 | TIW,No PORT Code (TIW)
646 | OTS,No PORT Code (OTS)
647 | AMT,No PORT Code (AMT)
648 | EGE,No PORT Code (EGE)
649 | GPI,No PORT Code (GPI)
650 | NGL,No PORT Code (NGL)
651 | OLM,No PORT Code (OLM)
652 | .GA,No PORT Code (.GA)
653 | CLX,No PORT Code (CLX)
654 | CP ,No PORT Code (CP)
655 | FSC,No PORT Code (FSC)
656 | NK,No PORT Code (NK)
657 | ADU,No PORT Code (ADU)
658 | AKT,No PORT Code (AKT)
659 | LIT,No PORT Code (LIT)
660 | A2A,No PORT Code (A2A)
661 | OSN,No PORT Code (OSN)
662 |
--------------------------------------------------------------------------------
/lookup/I94VISA.csv:
--------------------------------------------------------------------------------
1 | ID,Type
2 | 1,Business
3 | 2,Pleasure
4 | 3,Student
--------------------------------------------------------------------------------
/sql/create_tables.sql:
--------------------------------------------------------------------------------
1 | -- STAGING TABLES
2 |
3 | CREATE TABLE public.immigration (
4 | i94mon int4,
5 | cicid int4,
6 | i94visa int4,
7 | i94res int4,
8 | i94yr int4,
9 | i94mode int4,
10 | i94cit int4,
11 | i94bir int4,
12 | stay int4,
13 | arrdate varchar,
14 | depdate varchar,
15 | airline varchar,
16 | fltno varchar,
17 | i94port varchar,
18 | visatype varchar,
19 | gender varchar,
20 | i94addr varchar,
21 | CONSTRAINT immigration_pkey PRIMARY KEY ("cicid")
22 | );
23 |
24 | CREATE TABLE public.country (
25 | Code int4,
26 | Country varchar,
27 | Temperature float,
28 | Latitude varchar,
29 | Longitude varchar,
30 | CONSTRAINT country_pkey PRIMARY KEY ("Code")
31 | );
32 |
33 | CREATE TABLE public.state (
34 | Code varchar,
35 | State varchar,
36 | BlackOrAfricanAmerican int8,
37 | White int8,
38 | ForeignBorn int8,
39 | AmericanIndianAndAlaskaNative int8,
40 | HispanicOrLatino int8,
41 | Asian int8,
42 | NumberVeterans int8,
43 | FemalePopulation int8,
44 | MalePopulation int8,
45 | TotalPopulation int8,
46 | CONSTRAINT state_pkey PRIMARY KEY ("Code")
47 | );
48 |
49 | CREATE TABLE public."date" (
50 | "date" varchar NOT NULL,
51 | "day" int4,
52 | "month" int4,
53 | "year" int4,
54 | weekofyear int4,
55 | dayofweek int4,
56 | CONSTRAINT date_pkey PRIMARY KEY ("date")
57 | );
58 |
--------------------------------------------------------------------------------