├── Capstone Project ├── Capstone Project.ipynb ├── I94_SAS_Labels_Descriptions.SAS ├── README.md ├── airport-codes_csv.csv ├── create_tables.py ├── immigration_data_sample.csv ├── sql_queries.py └── us-cities-demographics.csv ├── Data Lake ├── README.md ├── dl.cfg └── etl.py ├── Data Modeling with Cassandra ├── .DS_Store ├── Project_1B_ Project_Template.ipynb ├── README.md └── images │ └── image_event_datafile_new.jpg ├── Data Modeling with Postgres ├── README.md ├── create_tables.py ├── etl.ipynb ├── etl.py ├── requirements.txt ├── sql_queries.py └── test.ipynb ├── Data Pipeline ├── .DS_Store ├── README.md ├── create_tables.sql ├── dags │ └── udac_example_dag.py └── plugins │ ├── __init__.py │ ├── helpers │ ├── __init__.py │ └── sql_queries.py │ └── operators │ ├── __init__.py │ ├── data_quality.py │ ├── load_dimension.py │ ├── load_fact.py │ └── stage_redshift.py ├── Data Warehouse ├── README.md ├── create_tables.py ├── dwh.cfg ├── etl.py └── sql_queries.py └── README.md /Capstone Project/I94_SAS_Labels_Descriptions.SAS: -------------------------------------------------------------------------------- 1 | libname library 'Your file location' ; 2 | proc format library=library ; 3 | 4 | /* I94YR - 4 digit year */ 5 | 6 | /* I94MON - Numeric month */ 7 | 8 | /* I94CIT & I94RES - This format shows all the valid and invalid codes for processing */ 9 | value i94cntyl 10 | 582 = 'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)' 11 | 236 = 'AFGHANISTAN' 12 | 101 = 'ALBANIA' 13 | 316 = 'ALGERIA' 14 | 102 = 'ANDORRA' 15 | 324 = 'ANGOLA' 16 | 529 = 'ANGUILLA' 17 | 518 = 'ANTIGUA-BARBUDA' 18 | 687 = 'ARGENTINA ' 19 | 151 = 'ARMENIA' 20 | 532 = 'ARUBA' 21 | 438 = 'AUSTRALIA' 22 | 103 = 'AUSTRIA' 23 | 152 = 'AZERBAIJAN' 24 | 512 = 'BAHAMAS' 25 | 298 = 'BAHRAIN' 26 | 274 = 'BANGLADESH' 27 | 513 = 'BARBADOS' 28 | 104 = 'BELGIUM' 29 | 581 = 'BELIZE' 30 | 386 = 'BENIN' 31 | 509 = 'BERMUDA' 32 | 153 = 'BELARUS' 33 | 242 = 'BHUTAN' 34 | 688 = 'BOLIVIA' 35 | 717 = 'BONAIRE, ST EUSTATIUS, SABA' 36 | 164 = 'BOSNIA-HERZEGOVINA' 37 | 336 = 'BOTSWANA' 38 | 689 = 'BRAZIL' 39 | 525 = 'BRITISH VIRGIN ISLANDS' 40 | 217 = 'BRUNEI' 41 | 105 = 'BULGARIA' 42 | 393 = 'BURKINA FASO' 43 | 243 = 'BURMA' 44 | 375 = 'BURUNDI' 45 | 310 = 'CAMEROON' 46 | 326 = 'CAPE VERDE' 47 | 526 = 'CAYMAN ISLANDS' 48 | 383 = 'CENTRAL AFRICAN REPUBLIC' 49 | 384 = 'CHAD' 50 | 690 = 'CHILE' 51 | 245 = 'CHINA, PRC' 52 | 721 = 'CURACAO' 53 | 270 = 'CHRISTMAS ISLAND' 54 | 271 = 'COCOS ISLANDS' 55 | 691 = 'COLOMBIA' 56 | 317 = 'COMOROS' 57 | 385 = 'CONGO' 58 | 467 = 'COOK ISLANDS' 59 | 575 = 'COSTA RICA' 60 | 165 = 'CROATIA' 61 | 584 = 'CUBA' 62 | 218 = 'CYPRUS' 63 | 140 = 'CZECH REPUBLIC' 64 | 723 = 'FAROE ISLANDS (PART OF DENMARK)' 65 | 108 = 'DENMARK' 66 | 322 = 'DJIBOUTI' 67 | 519 = 'DOMINICA' 68 | 585 = 'DOMINICAN REPUBLIC' 69 | 240 = 'EAST TIMOR' 70 | 692 = 'ECUADOR' 71 | 368 = 'EGYPT' 72 | 576 = 'EL SALVADOR' 73 | 399 = 'EQUATORIAL GUINEA' 74 | 372 = 'ERITREA' 75 | 109 = 'ESTONIA' 76 | 369 = 'ETHIOPIA' 77 | 604 = 'FALKLAND ISLANDS' 78 | 413 = 'FIJI' 79 | 110 = 'FINLAND' 80 | 111 = 'FRANCE' 81 | 601 = 'FRENCH GUIANA' 82 | 411 = 'FRENCH POLYNESIA' 83 | 387 = 'GABON' 84 | 338 = 'GAMBIA' 85 | 758 = 'GAZA STRIP' 86 | 154 = 'GEORGIA' 87 | 112 = 'GERMANY' 88 | 339 = 'GHANA' 89 | 143 = 'GIBRALTAR' 90 | 113 = 'GREECE' 91 | 520 = 'GRENADA' 92 | 507 = 'GUADELOUPE' 93 | 577 = 'GUATEMALA' 94 | 382 = 'GUINEA' 95 | 327 = 'GUINEA-BISSAU' 96 | 603 = 'GUYANA' 97 | 586 = 'HAITI' 98 | 726 = 'HEARD AND MCDONALD IS.' 
99 | 149 = 'HOLY SEE/VATICAN' 100 | 528 = 'HONDURAS' 101 | 206 = 'HONG KONG' 102 | 114 = 'HUNGARY' 103 | 115 = 'ICELAND' 104 | 213 = 'INDIA' 105 | 759 = 'INDIAN OCEAN AREAS (FRENCH)' 106 | 729 = 'INDIAN OCEAN TERRITORY' 107 | 204 = 'INDONESIA' 108 | 249 = 'IRAN' 109 | 250 = 'IRAQ' 110 | 116 = 'IRELAND' 111 | 251 = 'ISRAEL' 112 | 117 = 'ITALY' 113 | 388 = 'IVORY COAST' 114 | 514 = 'JAMAICA' 115 | 209 = 'JAPAN' 116 | 253 = 'JORDAN' 117 | 201 = 'KAMPUCHEA' 118 | 155 = 'KAZAKHSTAN' 119 | 340 = 'KENYA' 120 | 414 = 'KIRIBATI' 121 | 732 = 'KOSOVO' 122 | 272 = 'KUWAIT' 123 | 156 = 'KYRGYZSTAN' 124 | 203 = 'LAOS' 125 | 118 = 'LATVIA' 126 | 255 = 'LEBANON' 127 | 335 = 'LESOTHO' 128 | 370 = 'LIBERIA' 129 | 381 = 'LIBYA' 130 | 119 = 'LIECHTENSTEIN' 131 | 120 = 'LITHUANIA' 132 | 121 = 'LUXEMBOURG' 133 | 214 = 'MACAU' 134 | 167 = 'MACEDONIA' 135 | 320 = 'MADAGASCAR' 136 | 345 = 'MALAWI' 137 | 273 = 'MALAYSIA' 138 | 220 = 'MALDIVES' 139 | 392 = 'MALI' 140 | 145 = 'MALTA' 141 | 472 = 'MARSHALL ISLANDS' 142 | 511 = 'MARTINIQUE' 143 | 389 = 'MAURITANIA' 144 | 342 = 'MAURITIUS' 145 | 760 = 'MAYOTTE (AFRICA - FRENCH)' 146 | 473 = 'MICRONESIA, FED. STATES OF' 147 | 157 = 'MOLDOVA' 148 | 122 = 'MONACO' 149 | 299 = 'MONGOLIA' 150 | 735 = 'MONTENEGRO' 151 | 521 = 'MONTSERRAT' 152 | 332 = 'MOROCCO' 153 | 329 = 'MOZAMBIQUE' 154 | 371 = 'NAMIBIA' 155 | 440 = 'NAURU' 156 | 257 = 'NEPAL' 157 | 123 = 'NETHERLANDS' 158 | 508 = 'NETHERLANDS ANTILLES' 159 | 409 = 'NEW CALEDONIA' 160 | 464 = 'NEW ZEALAND' 161 | 579 = 'NICARAGUA' 162 | 390 = 'NIGER' 163 | 343 = 'NIGERIA' 164 | 470 = 'NIUE' 165 | 275 = 'NORTH KOREA' 166 | 124 = 'NORWAY' 167 | 256 = 'OMAN' 168 | 258 = 'PAKISTAN' 169 | 474 = 'PALAU' 170 | 743 = 'PALESTINE' 171 | 504 = 'PANAMA' 172 | 441 = 'PAPUA NEW GUINEA' 173 | 693 = 'PARAGUAY' 174 | 694 = 'PERU' 175 | 260 = 'PHILIPPINES' 176 | 416 = 'PITCAIRN ISLANDS' 177 | 107 = 'POLAND' 178 | 126 = 'PORTUGAL' 179 | 297 = 'QATAR' 180 | 748 = 'REPUBLIC OF SOUTH SUDAN' 181 | 321 = 'REUNION' 182 | 127 = 'ROMANIA' 183 | 158 = 'RUSSIA' 184 | 376 = 'RWANDA' 185 | 128 = 'SAN MARINO' 186 | 330 = 'SAO TOME AND PRINCIPE' 187 | 261 = 'SAUDI ARABIA' 188 | 391 = 'SENEGAL' 189 | 142 = 'SERBIA AND MONTENEGRO' 190 | 745 = 'SERBIA' 191 | 347 = 'SEYCHELLES' 192 | 348 = 'SIERRA LEONE' 193 | 207 = 'SINGAPORE' 194 | 141 = 'SLOVAKIA' 195 | 166 = 'SLOVENIA' 196 | 412 = 'SOLOMON ISLANDS' 197 | 397 = 'SOMALIA' 198 | 373 = 'SOUTH AFRICA' 199 | 276 = 'SOUTH KOREA' 200 | 129 = 'SPAIN' 201 | 244 = 'SRI LANKA' 202 | 346 = 'ST. HELENA' 203 | 522 = 'ST. KITTS-NEVIS' 204 | 523 = 'ST. LUCIA' 205 | 502 = 'ST. PIERRE AND MIQUELON' 206 | 524 = 'ST. 
VINCENT-GRENADINES' 207 | 716 = 'SAINT BARTHELEMY' 208 | 736 = 'SAINT MARTIN' 209 | 749 = 'SAINT MAARTEN' 210 | 350 = 'SUDAN' 211 | 602 = 'SURINAME' 212 | 351 = 'SWAZILAND' 213 | 130 = 'SWEDEN' 214 | 131 = 'SWITZERLAND' 215 | 262 = 'SYRIA' 216 | 268 = 'TAIWAN' 217 | 159 = 'TAJIKISTAN' 218 | 353 = 'TANZANIA' 219 | 263 = 'THAILAND' 220 | 304 = 'TOGO' 221 | 417 = 'TONGA' 222 | 516 = 'TRINIDAD AND TOBAGO' 223 | 323 = 'TUNISIA' 224 | 264 = 'TURKEY' 225 | 161 = 'TURKMENISTAN' 226 | 527 = 'TURKS AND CAICOS ISLANDS' 227 | 420 = 'TUVALU' 228 | 352 = 'UGANDA' 229 | 162 = 'UKRAINE' 230 | 296 = 'UNITED ARAB EMIRATES' 231 | 135 = 'UNITED KINGDOM' 232 | 695 = 'URUGUAY' 233 | 163 = 'UZBEKISTAN' 234 | 410 = 'VANUATU' 235 | 696 = 'VENEZUELA' 236 | 266 = 'VIETNAM' 237 | 469 = 'WALLIS AND FUTUNA ISLANDS' 238 | 757 = 'WEST INDIES (FRENCH)' 239 | 333 = 'WESTERN SAHARA' 240 | 465 = 'WESTERN SAMOA' 241 | 216 = 'YEMEN' 242 | 139 = 'YUGOSLAVIA' 243 | 301 = 'ZAIRE' 244 | 344 = 'ZAMBIA' 245 | 315 = 'ZIMBABWE' 246 | 403 = 'INVALID: AMERICAN SAMOA' 247 | 712 = 'INVALID: ANTARCTICA' 248 | 700 = 'INVALID: BORN ON BOARD SHIP' 249 | 719 = 'INVALID: BOUVET ISLAND (ANTARCTICA/NORWAY TERR.)' 250 | 574 = 'INVALID: CANADA' 251 | 720 = 'INVALID: CANTON AND ENDERBURY ISLS' 252 | 106 = 'INVALID: CZECHOSLOVAKIA' 253 | 739 = 'INVALID: DRONNING MAUD LAND (ANTARCTICA-NORWAY)' 254 | 394 = 'INVALID: FRENCH SOUTHERN AND ANTARCTIC' 255 | 501 = 'INVALID: GREENLAND' 256 | 404 = 'INVALID: GUAM' 257 | 730 = 'INVALID: INTERNATIONAL WATERS' 258 | 731 = 'INVALID: JOHNSON ISLAND' 259 | 471 = 'INVALID: MARIANA ISLANDS, NORTHERN' 260 | 737 = 'INVALID: MIDWAY ISLANDS' 261 | 753 = 'INVALID: MINOR OUTLYING ISLANDS - USA' 262 | 740 = 'INVALID: NEUTRAL ZONE (S. ARABIA/IRAQ)' 263 | 710 = 'INVALID: NON-QUOTA IMMIGRANT' 264 | 505 = 'INVALID: PUERTO RICO' 265 | 0 = 'INVALID: STATELESS' 266 | 705 = 'INVALID: STATELESS' 267 | 583 = 'INVALID: UNITED STATES' 268 | 407 = 'INVALID: UNITED STATES' 269 | 999 = 'INVALID: UNKNOWN' 270 | 239 = 'INVALID: UNKNOWN COUNTRY' 271 | 134 = 'INVALID: USSR' 272 | 506 = 'INVALID: U.S. 
VIRGIN ISLANDS' 273 | 755 = 'INVALID: WAKE ISLAND' 274 | 311 = 'Collapsed Tanzania (should not show)' 275 | 741 = 'Collapsed Curacao (should not show)' 276 | 54 = 'No Country Code (54)' 277 | 100 = 'No Country Code (100)' 278 | 187 = 'No Country Code (187)' 279 | 190 = 'No Country Code (190)' 280 | 200 = 'No Country Code (200)' 281 | 219 = 'No Country Code (219)' 282 | 238 = 'No Country Code (238)' 283 | 277 = 'No Country Code (277)' 284 | 293 = 'No Country Code (293)' 285 | 300 = 'No Country Code (300)' 286 | 319 = 'No Country Code (319)' 287 | 365 = 'No Country Code (365)' 288 | 395 = 'No Country Code (395)' 289 | 400 = 'No Country Code (400)' 290 | 485 = 'No Country Code (485)' 291 | 503 = 'No Country Code (503)' 292 | 589 = 'No Country Code (589)' 293 | 592 = 'No Country Code (592)' 294 | 791 = 'No Country Code (791)' 295 | 849 = 'No Country Code (849)' 296 | 914 = 'No Country Code (914)' 297 | 944 = 'No Country Code (944)' 298 | 996 = 'No Country Code (996)' ; 299 | 300 | 301 | /* I94PORT - This format shows all the valid and invalid codes for processing */ 302 | value $i94prtl 303 | 'ALC' = 'ALCAN, AK ' 304 | 'ANC' = 'ANCHORAGE, AK ' 305 | 'BAR' = 'BAKER AAF - BAKER ISLAND, AK' 306 | 'DAC' = 'DALTONS CACHE, AK ' 307 | 'PIZ' = 'DEW STATION PT LAY DEW, AK' 308 | 'DTH' = 'DUTCH HARBOR, AK ' 309 | 'EGL' = 'EAGLE, AK ' 310 | 'FRB' = 'FAIRBANKS, AK ' 311 | 'HOM' = 'HOMER, AK ' 312 | 'HYD' = 'HYDER, AK ' 313 | 'JUN' = 'JUNEAU, AK ' 314 | '5KE' = 'KETCHIKAN, AK' 315 | 'KET' = 'KETCHIKAN, AK ' 316 | 'MOS' = 'MOSES POINT INTERMEDIATE, AK' 317 | 'NIK' = 'NIKISKI, AK ' 318 | 'NOM' = 'NOM, AK ' 319 | 'PKC' = 'POKER CREEK, AK ' 320 | 'ORI' = 'PORT LIONS SPB, AK' 321 | 'SKA' = 'SKAGWAY, AK ' 322 | 'SNP' = 'ST. PAUL ISLAND, AK' 323 | 'TKI' = 'TOKEEN, AK' 324 | 'WRA' = 'WRANGELL, AK ' 325 | 'HSV' = 'MADISON COUNTY - HUNTSVILLE, AL' 326 | 'MOB' = 'MOBILE, AL ' 327 | 'LIA' = 'LITTLE ROCK, AR (BPS)' 328 | 'ROG' = 'ROGERS ARPT, AR' 329 | 'DOU' = 'DOUGLAS, AZ ' 330 | 'LUK' = 'LUKEVILLE, AZ ' 331 | 'MAP' = 'MARIPOSA AZ ' 332 | 'NAC' = 'NACO, AZ ' 333 | 'NOG' = 'NOGALES, AZ ' 334 | 'PHO' = 'PHOENIX, AZ ' 335 | 'POR' = 'PORTAL, AZ' 336 | 'SLU' = 'SAN LUIS, AZ ' 337 | 'SAS' = 'SASABE, AZ ' 338 | 'TUC' = 'TUCSON, AZ ' 339 | 'YUI' = 'YUMA, AZ ' 340 | 'AND' = 'ANDRADE, CA ' 341 | 'BUR' = 'BURBANK, CA' 342 | 'CAL' = 'CALEXICO, CA ' 343 | 'CAO' = 'CAMPO, CA ' 344 | 'FRE' = 'FRESNO, CA ' 345 | 'ICP' = 'IMPERIAL COUNTY, CA ' 346 | 'LNB' = 'LONG BEACH, CA ' 347 | 'LOS' = 'LOS ANGELES, CA ' 348 | 'BFL' = 'MEADOWS FIELD - BAKERSFIELD, CA' 349 | 'OAK' = 'OAKLAND, CA ' 350 | 'ONT' = 'ONTARIO, CA' 351 | 'OTM' = 'OTAY MESA, CA ' 352 | 'BLT' = 'PACIFIC, HWY. 
STATION, CA ' 353 | 'PSP' = 'PALM SPRINGS, CA' 354 | 'SAC' = 'SACRAMENTO, CA ' 355 | 'SLS' = 'SALINAS, CA (BPS)' 356 | 'SDP' = 'SAN DIEGO, CA' 357 | 'SFR' = 'SAN FRANCISCO, CA ' 358 | 'SNJ' = 'SAN JOSE, CA ' 359 | 'SLO' = 'SAN LUIS OBISPO, CA ' 360 | 'SLI' = 'SAN LUIS OBISPO, CA (BPS)' 361 | 'SPC' = 'SAN PEDRO, CA ' 362 | 'SYS' = 'SAN YSIDRO, CA ' 363 | 'SAA' = 'SANTA ANA, CA ' 364 | 'STO' = 'STOCKTON, CA (BPS)' 365 | 'TEC' = 'TECATE, CA ' 366 | 'TRV' = 'TRAVIS-AFB, CA ' 367 | 'APA' = 'ARAPAHOE COUNTY, CO' 368 | 'ASE' = 'ASPEN, CO #ARPT' 369 | 'COS' = 'COLORADO SPRINGS, CO' 370 | 'DEN' = 'DENVER, CO ' 371 | 'DRO' = 'LA PLATA - DURANGO, CO' 372 | 'BDL' = 'BRADLEY INTERNATIONAL, CT' 373 | 'BGC' = 'BRIDGEPORT, CT ' 374 | 'GRT' = 'GROTON, CT ' 375 | 'HAR' = 'HARTFORD, CT ' 376 | 'NWH' = 'NEW HAVEN, CT ' 377 | 'NWL' = 'NEW LONDON, CT ' 378 | 'TST' = 'NEWINGTON DATA CENTER TEST, CT' 379 | 'WAS' = 'WASHINGTON DC ' 380 | 'DOV' = 'DOVER AFB, DE' 381 | 'DVD' = 'DOVER-AFB, DE ' 382 | 'WLL' = 'WILMINGTON, DE ' 383 | 'BOC' = 'BOCAGRANDE, FL ' 384 | 'SRQ' = 'BRADENTON - SARASOTA, FL' 385 | 'CAN' = 'CAPE CANAVERAL, FL ' 386 | 'DAB' = 'DAYTONA BEACH INTERNATIONAL, FL' 387 | 'FRN' = 'FERNANDINA, FL ' 388 | 'FTL' = 'FORT LAUDERDALE, FL ' 389 | 'FMY' = 'FORT MYERS, FL ' 390 | 'FPF' = 'FORT PIERCE, FL ' 391 | 'HUR' = 'HURLBURT FIELD, FL' 392 | 'GNV' = 'J R ALISON MUNI - GAINESVILLE, FL' 393 | 'JAC' = 'JACKSONVILLE, FL ' 394 | 'KEY' = 'KEY WEST, FL ' 395 | 'LEE' = 'LEESBURG MUNICIPAL AIRPORT, FL' 396 | 'MLB' = 'MELBOURNE, FL' 397 | 'MIA' = 'MIAMI, FL ' 398 | 'APF' = 'NAPLES, FL #ARPT' 399 | 'OPF' = 'OPA LOCKA, FL' 400 | 'ORL' = 'ORLANDO, FL ' 401 | 'PAN' = 'PANAMA CITY, FL ' 402 | 'PEN' = 'PENSACOLA, FL ' 403 | 'PCF' = 'PORT CANAVERAL, FL ' 404 | 'PEV' = 'PORT EVERGLADES, FL ' 405 | 'PSJ' = 'PORT ST JOE, FL ' 406 | 'SFB' = 'SANFORD, FL ' 407 | 'SGJ' = 'ST AUGUSTINE ARPT, FL' 408 | 'SAU' = 'ST AUGUSTINE, FL ' 409 | 'FPR' = 'ST LUCIE COUNTY, FL' 410 | 'SPE' = 'ST PETERSBURG, FL ' 411 | 'TAM' = 'TAMPA, FL ' 412 | 'WPB' = 'WEST PALM BEACH, FL ' 413 | 'ATL' = 'ATLANTA, GA ' 414 | 'BRU' = 'BRUNSWICK, GA ' 415 | 'AGS' = 'BUSH FIELD - AUGUSTA, GA' 416 | 'SAV' = 'SAVANNAH, GA ' 417 | 'AGA' = 'AGANA, GU ' 418 | 'HHW' = 'HONOLULU, HI ' 419 | 'OGG' = 'KAHULUI - MAUI, HI' 420 | 'KOA' = 'KEAHOLE-KONA, HI ' 421 | 'LIH' = 'LIHUE, HI ' 422 | 'CID' = 'CEDAR RAPIDS/IOWA CITY, IA' 423 | 'DSM' = 'DES MOINES, IA' 424 | 'BOI' = 'AIR TERM. 
(GOWEN FLD) BOISE, ID' 425 | 'EPI' = 'EASTPORT, ID ' 426 | 'IDA' = 'FANNING FIELD - IDAHO FALLS, ID' 427 | 'PTL' = 'PORTHILL, ID ' 428 | 'SPI' = 'CAPITAL - SPRINGFIELD, IL' 429 | 'CHI' = 'CHICAGO, IL ' 430 | 'DPA' = 'DUPAGE COUNTY, IL' 431 | 'PIA' = 'GREATER PEORIA, IL' 432 | 'RFD' = 'GREATER ROCKFORD, IL' 433 | 'UGN' = 'MEMORIAL - WAUKEGAN, IL' 434 | 'GAR' = 'GARY, IN ' 435 | 'HMM' = 'HAMMOND, IN ' 436 | 'INP' = 'INDIANAPOLIS, IN ' 437 | 'MRL' = 'MERRILLVILLE, IN ' 438 | 'SBN' = 'SOUTH BEND, IN' 439 | 'ICT' = 'MID-CONTINENT - WITCHITA, KS' 440 | 'LEX' = 'BLUE GRASS - LEXINGTON, KY' 441 | 'LOU' = 'LOUISVILLE, KY ' 442 | 'BTN' = 'BATON ROUGE, LA ' 443 | 'LKC' = 'LAKE CHARLES, LA ' 444 | 'LAK' = 'LAKE CHARLES, LA (BPS)' 445 | 'MLU' = 'MONROE, LA' 446 | 'MGC' = 'MORGAN CITY, LA ' 447 | 'NOL' = 'NEW ORLEANS, LA ' 448 | 'BOS' = 'BOSTON, MA ' 449 | 'GLO' = 'GLOUCESTER, MA ' 450 | 'BED' = 'HANSCOM FIELD - BEDFORD, MA' 451 | 'LYN' = 'LYNDEN, WA ' 452 | 'ADW' = 'ANDREWS AFB, MD' 453 | 'BAL' = 'BALTIMORE, MD ' 454 | 'MKG' = 'MUSKEGON, MD' 455 | 'PAX' = 'PATUXENT RIVER, MD ' 456 | 'BGM' = 'BANGOR, ME ' 457 | 'BOO' = 'BOOTHBAY HARBOR, ME ' 458 | 'BWM' = 'BRIDGEWATER, ME ' 459 | 'BCK' = 'BUCKPORT, ME ' 460 | 'CLS' = 'CALAIS, ME ' 461 | 'CRB' = 'CARIBOU, ME ' 462 | 'COB' = 'COBURN GORE, ME ' 463 | 'EST' = 'EASTCOURT, ME ' 464 | 'EPT' = 'EASTPORT MUNICIPAL, ME' 465 | 'EPM' = 'EASTPORT, ME ' 466 | 'FOR' = 'FOREST CITY, ME ' 467 | 'FTF' = 'FORT FAIRFIELD, ME ' 468 | 'FTK' = 'FORT KENT, ME ' 469 | 'HML' = 'HAMIIN, ME ' 470 | 'HTM' = 'HOULTON, ME ' 471 | 'JKM' = 'JACKMAN, ME ' 472 | 'KAL' = 'KALISPEL, MT ' 473 | 'LIM' = 'LIMESTONE, ME ' 474 | 'LUB' = 'LUBEC, ME ' 475 | 'MAD' = 'MADAWASKA, ME ' 476 | 'POM' = 'PORTLAND, ME ' 477 | 'RGM' = 'RANGELEY, ME (BPS)' 478 | 'SBR' = 'SOUTH BREWER, ME ' 479 | 'SRL' = 'ST AURELIE, ME ' 480 | 'SPA' = 'ST PAMPILE, ME ' 481 | 'VNB' = 'VAN BUREN, ME ' 482 | 'VCB' = 'VANCEBORO, ME ' 483 | 'AGN' = 'ALGONAC, MI ' 484 | 'ALP' = 'ALPENA, MI ' 485 | 'BCY' = 'BAY CITY, MI ' 486 | 'DET' = 'DETROIT, MI ' 487 | 'GRP' = 'GRAND RAPIDS, MI' 488 | 'GRO' = 'GROSSE ISLE, MI ' 489 | 'ISL' = 'ISLE ROYALE, MI ' 490 | 'MRC' = 'MARINE CITY, MI ' 491 | 'MRY' = 'MARYSVILLE, MI ' 492 | 'PTK' = 'OAKLAND COUNTY - PONTIAC, MI' 493 | 'PHU' = 'PORT HURON, MI ' 494 | 'RBT' = 'ROBERTS LANDING, MI ' 495 | 'SAG' = 'SAGINAW, MI ' 496 | 'SSM' = 'SAULT STE. 
MARIE, MI ' 497 | 'SCL' = 'ST CLAIR, MI ' 498 | 'YIP' = 'WILLOW RUN - YPSILANTI, MI' 499 | 'BAU' = 'BAUDETTE, MN ' 500 | 'CAR' = 'CARIBOU MUNICIPAL AIRPORT, MN' 501 | 'GTF' = 'Collapsed into INT, MN' 502 | 'INL' = 'Collapsed into INT, MN' 503 | 'CRA' = 'CRANE LAKE, MN ' 504 | 'MIC' = 'CRYSTAL MUNICIPAL AIRPORT, MN' 505 | 'DUL' = 'DULUTH, MN ' 506 | 'ELY' = 'ELY, MN ' 507 | 'GPM' = 'GRAND PORTAGE, MN ' 508 | 'SVC' = 'GRANT COUNTY - SILVER CITY, MN' 509 | 'INT' = 'INT''L FALLS, MN ' 510 | 'LAN' = 'LANCASTER, MN ' 511 | 'MSP' = 'MINN./ST PAUL, MN ' 512 | 'LIN' = 'NORTHERN SVC CENTER, MN ' 513 | 'NOY' = 'NOYES, MN ' 514 | 'PIN' = 'PINE CREEK, MN ' 515 | '48Y' = 'PINECREEK BORDER ARPT, MN' 516 | 'RAN' = 'RAINER, MN ' 517 | 'RST' = 'ROCHESTER, MN' 518 | 'ROS' = 'ROSEAU, MN ' 519 | 'SPM' = 'ST PAUL, MN ' 520 | 'WSB' = 'WARROAD INTL, SPB, MN' 521 | 'WAR' = 'WARROAD, MN ' 522 | 'KAN' = 'KANSAS CITY, MO ' 523 | 'SGF' = 'SPRINGFIELD-BRANSON, MO' 524 | 'STL' = 'ST LOUIS, MO ' 525 | 'WHI' = 'WHITETAIL, MT ' 526 | 'WHM' = 'WILD HORSE, MT ' 527 | 'GPT' = 'BILOXI REGIONAL, MS' 528 | 'GTR' = 'GOLDEN TRIANGLE LOWNDES CNTY, MS' 529 | 'GUL' = 'GULFPORT, MS ' 530 | 'PAS' = 'PASCAGOULA, MS ' 531 | 'JAN' = 'THOMPSON FIELD - JACKSON, MS' 532 | 'BIL' = 'BILLINGS, MT ' 533 | 'BTM' = 'BUTTE, MT ' 534 | 'CHF' = 'CHIEF MT, MT ' 535 | 'CTB' = 'CUT BANK MUNICIPAL, MT' 536 | 'CUT' = 'CUT BANK, MT ' 537 | 'DLB' = 'DEL BONITA, MT ' 538 | 'EUR' = 'EUREKA, MT (BPS)' 539 | 'BZN' = 'GALLATIN FIELD - BOZEMAN, MT' 540 | 'FCA' = 'GLACIER NATIONAL PARK, MT' 541 | 'GGW' = 'GLASGOW, MT ' 542 | 'GRE' = 'GREAT FALLS, MT ' 543 | 'HVR' = 'HAVRE, MT ' 544 | 'HEL' = 'HELENA, MT ' 545 | 'LWT' = 'LEWISTON, MT ' 546 | 'MGM' = 'MORGAN, MT ' 547 | 'OPH' = 'OPHEIM, MT ' 548 | 'PIE' = 'PIEGAN, MT ' 549 | 'RAY' = 'RAYMOND, MT ' 550 | 'ROO' = 'ROOSVILLE, MT ' 551 | 'SCO' = 'SCOBEY, MT ' 552 | 'SWE' = 'SWEETGTASS, MT ' 553 | 'TRL' = 'TRIAL CREEK, MT ' 554 | 'TUR' = 'TURNER, MT ' 555 | 'WCM' = 'WILLOW CREEK, MT ' 556 | 'CLT' = 'CHARLOTTE, NC ' 557 | 'FAY' = 'FAYETTEVILLE, NC' 558 | 'MRH' = 'MOREHEAD CITY, NC ' 559 | 'FOP' = 'MORRIS FIELDS AAF, NC' 560 | 'GSO' = 'PIEDMONT TRIAD INTL AIRPORT, NC' 561 | 'RDU' = 'RALEIGH/DURHAM, NC ' 562 | 'SSC' = 'SHAW AFB - SUMTER, NC' 563 | 'WIL' = 'WILMINGTON, NC ' 564 | 'AMB' = 'AMBROSE, ND ' 565 | 'ANT' = 'ANTLER, ND ' 566 | 'CRY' = 'CARBURY, ND ' 567 | 'DNS' = 'DUNSEITH, ND ' 568 | 'FAR' = 'FARGO, ND ' 569 | 'FRT' = 'FORTUNA, ND ' 570 | 'GRF' = 'GRAND FORKS, ND ' 571 | 'HNN' = 'HANNAH, ND ' 572 | 'HNS' = 'HANSBORO, ND ' 573 | 'MAI' = 'MAIDA, ND ' 574 | 'MND' = 'MINOT, ND ' 575 | 'NEC' = 'NECHE, ND ' 576 | 'NOO' = 'NOONAN, ND ' 577 | 'NRG' = 'NORTHGATE, ND ' 578 | 'PEM' = 'PEMBINA, ND ' 579 | 'SAR' = 'SARLES, ND ' 580 | 'SHR' = 'SHERWOOD, ND ' 581 | 'SJO' = 'ST JOHN, ND ' 582 | 'WAL' = 'WALHALLA, ND ' 583 | 'WHO' = 'WESTHOPE, ND ' 584 | 'WND' = 'WILLISTON, ND ' 585 | 'OMA' = 'OMAHA, NE ' 586 | 'LEB' = 'LEBANON, NH ' 587 | 'MHT' = 'MANCHESTER, NH' 588 | 'PNH' = 'PITTSBURG, NH ' 589 | 'PSM' = 'PORTSMOUTH, NH ' 590 | 'BYO' = 'BAYONNE, NJ ' 591 | 'CNJ' = 'CAMDEN, NJ ' 592 | 'HOB' = 'HOBOKEN, NJ ' 593 | 'JER' = 'JERSEY CITY, NJ ' 594 | 'WRI' = 'MC GUIRE AFB - WRIGHTSOWN, NJ' 595 | 'MMU' = 'MORRISTOWN, NJ' 596 | 'NEW' = 'NEWARK/TETERBORO, NJ ' 597 | 'PER' = 'PERTH AMBOY, NJ ' 598 | 'ACY' = 'POMONA FIELD - ATLANTIC CITY, NJ' 599 | 'ALA' = 'ALAMAGORDO, NM (BPS)' 600 | 'ABQ' = 'ALBUQUERQUE, NM ' 601 | 'ANP' = 'ANTELOPE WELLS, NM ' 602 | 'CRL' = 'CARLSBAD, NM ' 603 | 'COL' = 'COLUMBUS, NM ' 604 | 'CDD' = 'CRANE LAKE - ST. 
LOUIS CNTY, NM' 605 | 'DNM' = 'DEMING, NM (BPS)' 606 | 'LAS' = 'LAS CRUCES, NM ' 607 | 'LOB' = 'LORDSBURG, NM (BPS)' 608 | 'RUI' = 'RUIDOSO, NM' 609 | 'STR' = 'SANTA TERESA, NM ' 610 | 'RNO' = 'CANNON INTL - RENO/TAHOE, NV' 611 | 'FLX' = 'FALLON MUNICIPAL AIRPORT, NV' 612 | 'LVG' = 'LAS VEGAS, NV ' 613 | 'REN' = 'RENO, NV ' 614 | 'ALB' = 'ALBANY, NY ' 615 | 'AXB' = 'ALEXANDRIA BAY, NY ' 616 | 'BUF' = 'BUFFALO, NY ' 617 | 'CNH' = 'CANNON CORNERS, NY' 618 | 'CAP' = 'CAPE VINCENT, NY ' 619 | 'CHM' = 'CHAMPLAIN, NY ' 620 | 'CHT' = 'CHATEAUGAY, NY ' 621 | 'CLA' = 'CLAYTON, NY ' 622 | 'FTC' = 'FORT COVINGTON, NY ' 623 | 'LAG' = 'LA GUARDIA, NY ' 624 | 'LEW' = 'LEWISTON, NY ' 625 | 'MAS' = 'MASSENA, NY ' 626 | 'MAG' = 'MCGUIRE AFB, NY ' 627 | 'MOO' = 'MOORES, NY ' 628 | 'MRR' = 'MORRISTOWN, NY ' 629 | 'NYC' = 'NEW YORK, NY ' 630 | 'NIA' = 'NIAGARA FALLS, NY ' 631 | 'OGD' = 'OGDENSBURG, NY ' 632 | 'OSW' = 'OSWEGO, NY ' 633 | 'ELM' = 'REGIONAL ARPT - HORSEHEAD, NY' 634 | 'ROC' = 'ROCHESTER, NY ' 635 | 'ROU' = 'ROUSES POINT, NY ' 636 | 'SWF' = 'STEWART - ORANGE CNTY, NY' 637 | 'SYR' = 'SYRACUSE, NY ' 638 | 'THO' = 'THOUSAND ISLAND BRIDGE, NY' 639 | 'TRO' = 'TROUT RIVER, NY ' 640 | 'WAT' = 'WATERTOWN, NY ' 641 | 'HPN' = 'WESTCHESTER - WHITE PLAINS, NY' 642 | 'WRB' = 'WHIRLPOOL BRIDGE, NY' 643 | 'YOU' = 'YOUNGSTOWN, NY ' 644 | 'AKR' = 'AKRON, OH ' 645 | 'ATB' = 'ASHTABULA, OH ' 646 | 'CIN' = 'CINCINNATI, OH ' 647 | 'CLE' = 'CLEVELAND, OH ' 648 | 'CLM' = 'COLUMBUS, OH ' 649 | 'LOR' = 'LORAIN, OH ' 650 | 'MBO' = 'MARBLE HEADS, OH ' 651 | 'SDY' = 'SANDUSKY, OH ' 652 | 'TOL' = 'TOLEDO, OH ' 653 | 'OKC' = 'OKLAHOMA CITY, OK ' 654 | 'TUL' = 'TULSA, OK' 655 | 'AST' = 'ASTORIA, OR ' 656 | 'COO' = 'COOS BAY, OR ' 657 | 'HIO' = 'HILLSBORO, OR' 658 | 'MED' = 'MEDFORD, OR ' 659 | 'NPT' = 'NEWPORT, OR ' 660 | 'POO' = 'PORTLAND, OR ' 661 | 'PUT' = 'PUT-IN-BAY, OH ' 662 | 'RDM' = 'ROBERTS FIELDS - REDMOND, OR' 663 | 'ERI' = 'ERIE, PA ' 664 | 'MDT' = 'HARRISBURG, PA' 665 | 'HSB' = 'HARRISONBURG, PA ' 666 | 'PHI' = 'PHILADELPHIA, PA ' 667 | 'PIT' = 'PITTSBURG, PA ' 668 | 'AGU' = 'AGUADILLA, PR ' 669 | 'BQN' = 'BORINQUEN - AGUADILLO, PR' 670 | 'JCP' = 'CULEBRA - BENJAMIN RIVERA, PR' 671 | 'ENS' = 'ENSENADA, PR ' 672 | 'FAJ' = 'FAJARDO, PR ' 673 | 'HUM' = 'HUMACAO, PR ' 674 | 'JOB' = 'JOBOS, PR ' 675 | 'MAY' = 'MAYAGUEZ, PR ' 676 | 'PON' = 'PONCE, PR ' 677 | 'PSE' = 'PONCE-MERCEDITA, PR' 678 | 'SAJ' = 'SAN JUAN, PR ' 679 | 'VQS' = 'VIEQUES-ARPT, PR' 680 | 'PRO' = 'PROVIDENCE, RI ' 681 | 'PVD' = 'THEODORE FRANCIS - WARWICK, RI' 682 | 'CHL' = 'CHARLESTON, SC ' 683 | 'CAE' = 'COLUMBIA, SC #ARPT' 684 | 'GEO' = 'GEORGETOWN, SC ' 685 | 'GSP' = 'GREENVILLE, SC' 686 | 'GRR' = 'GREER, SC' 687 | 'MYR' = 'MYRTLE BEACH, SC' 688 | 'SPF' = 'BLACK HILLS, SPEARFISH, SD' 689 | 'HON' = 'HOWES REGIONAL ARPT - HURON, SD' 690 | 'SAI' = 'SAIPAN, SPN ' 691 | 'TYS' = 'MC GHEE TYSON - ALCOA, TN' 692 | 'MEM' = 'MEMPHIS, TN ' 693 | 'NSV' = 'NASHVILLE, TN ' 694 | 'TRI' = 'TRI CITY ARPT, TN' 695 | 'ADS' = 'ADDISON AIRPORT- ADDISON, TX' 696 | 'ADT' = 'AMISTAD DAM, TX ' 697 | 'ANZ' = 'ANZALDUAS, TX' 698 | 'AUS' = 'AUSTIN, TX ' 699 | 'BEA' = 'BEAUMONT, TX ' 700 | 'BBP' = 'BIG BEND PARK, TX (BPS)' 701 | 'SCC' = 'BP SPEC COORD. 
CTR, TX' 702 | 'BTC' = 'BP TACTICAL UNIT, TX ' 703 | 'BOA' = 'BRIDGE OF AMERICAS, TX' 704 | 'BRO' = 'BROWNSVILLE, TX ' 705 | 'CRP' = 'CORPUS CHRISTI, TX ' 706 | 'DAL' = 'DALLAS, TX ' 707 | 'DLR' = 'DEL RIO, TX ' 708 | 'DNA' = 'DONNA, TX' 709 | 'EGP' = 'EAGLE PASS, TX ' 710 | 'ELP' = 'EL PASO, TX ' 711 | 'FAB' = 'FABENS, TX ' 712 | 'FAL' = 'FALCON HEIGHTS, TX ' 713 | 'FTH' = 'FORT HANCOCK, TX ' 714 | 'AFW' = 'FORT WORTH ALLIANCE, TX' 715 | 'FPT' = 'FREEPORT, TX ' 716 | 'GAL' = 'GALVESTON, TX ' 717 | 'HLG' = 'HARLINGEN, TX ' 718 | 'HID' = 'HIDALGO, TX ' 719 | 'HOU' = 'HOUSTON, TX ' 720 | 'SGR' = 'HULL FIELD, SUGAR LAND ARPT, TX' 721 | 'LLB' = 'JUAREZ-LINCOLN BRIDGE, TX' 722 | 'LCB' = 'LAREDO COLUMBIA BRIDGE, TX' 723 | 'LRN' = 'LAREDO NORTH, TX ' 724 | 'LAR' = 'LAREDO, TX ' 725 | 'LSE' = 'LOS EBANOS, TX ' 726 | 'IND' = 'LOS INDIOS, TX' 727 | 'LOI' = 'LOS INDIOS, TX ' 728 | 'MRS' = 'MARFA, TX (BPS)' 729 | 'MCA' = 'MCALLEN, TX ' 730 | 'MAF' = 'ODESSA REGIONAL, TX' 731 | 'PDN' = 'PASO DEL NORTE,TX ' 732 | 'PBB' = 'PEACE BRIDGE, NY ' 733 | 'PHR' = 'PHARR, TX ' 734 | 'PAR' = 'PORT ARTHUR, TX ' 735 | 'ISB' = 'PORT ISABEL, TX ' 736 | 'POE' = 'PORT OF EL PASO, TX ' 737 | 'PRE' = 'PRESIDIO, TX ' 738 | 'PGR' = 'PROGRESO, TX ' 739 | 'RIO' = 'RIO GRANDE CITY, TX ' 740 | 'ROM' = 'ROMA, TX ' 741 | 'SNA' = 'SAN ANTONIO, TX ' 742 | 'SNN' = 'SANDERSON, TX ' 743 | 'VIB' = 'VETERAN INTL BRIDGE, TX' 744 | 'YSL' = 'YSLETA, TX ' 745 | 'CHA' = 'CHARLOTTE AMALIE, VI ' 746 | 'CHR' = 'CHRISTIANSTED, VI ' 747 | 'CRU' = 'CRUZ BAY, ST JOHN, VI ' 748 | 'FRK' = 'FREDERIKSTED, VI ' 749 | 'STT' = 'ST THOMAS, VI ' 750 | 'LGU' = 'CACHE AIRPORT - LOGAN, UT' 751 | 'SLC' = 'SALT LAKE CITY, UT ' 752 | 'CHO' = 'ALBEMARLE CHARLOTTESVILLE, VA' 753 | 'DAA' = 'DAVISON AAF - FAIRFAX CNTY, VA' 754 | 'HOP' = 'HOPEWELL, VA ' 755 | 'HEF' = 'MANASSAS, VA #ARPT' 756 | 'NWN' = 'NEWPORT, VA ' 757 | 'NOR' = 'NORFOLK, VA ' 758 | 'RCM' = 'RICHMOND, VA ' 759 | 'ABS' = 'ALBURG SPRINGS, VT ' 760 | 'ABG' = 'ALBURG, VT ' 761 | 'BEB' = 'BEEBE PLAIN, VT ' 762 | 'BEE' = 'BEECHER FALLS, VT ' 763 | 'BRG' = 'BURLINGTON, VT ' 764 | 'CNA' = 'CANAAN, VT ' 765 | 'DER' = 'DERBY LINE, VT (I-91) ' 766 | 'DLV' = 'DERBY LINE, VT (RT. 
5)' 767 | 'ERC' = 'EAST RICHFORD, VT ' 768 | 'HIG' = 'HIGHGATE SPRINGS, VT ' 769 | 'MOR' = 'MORSES LINE, VT ' 770 | 'NPV' = 'NEWPORT, VT ' 771 | 'NRT' = 'NORTH TROY, VT ' 772 | 'NRN' = 'NORTON, VT ' 773 | 'PIV' = 'PINNACLE ROAD, VT ' 774 | 'RIF' = 'RICHFORT, VT ' 775 | 'STA' = 'ST ALBANS, VT ' 776 | 'SWB' = 'SWANTON, VT (BP - SECTOR HQ)' 777 | 'WBE' = 'WEST BERKSHIRE, VT ' 778 | 'ABE' = 'ABERDEEN, WA ' 779 | 'ANA' = 'ANACORTES, WA ' 780 | 'BEL' = 'BELLINGHAM, WA ' 781 | 'BLI' = 'BELLINGHAM, WASHINGTON #INTL' 782 | 'BLA' = 'BLAINE, WA ' 783 | 'BWA' = 'BOUNDARY, WA ' 784 | 'CUR' = 'CURLEW, WA (BPS)' 785 | 'DVL' = 'DANVILLE, WA ' 786 | 'EVE' = 'EVERETT, WA ' 787 | 'FER' = 'FERRY, WA ' 788 | 'FRI' = 'FRIDAY HARBOR, WA ' 789 | 'FWA' = 'FRONTIER, WA ' 790 | 'KLM' = 'KALAMA, WA ' 791 | 'LAU' = 'LAURIER, WA ' 792 | 'LON' = 'LONGVIEW, WA ' 793 | 'MET' = 'METALINE FALLS, WA ' 794 | 'MWH' = 'MOSES LAKE GRANT COUNTY ARPT, WA' 795 | 'NEA' = 'NEAH BAY, WA ' 796 | 'NIG' = 'NIGHTHAWK, WA ' 797 | 'OLY' = 'OLYMPIA, WA ' 798 | 'ORO' = 'OROVILLE, WA ' 799 | 'PWB' = 'PASCO, WA ' 800 | 'PIR' = 'POINT ROBERTS, WA ' 801 | 'PNG' = 'PORT ANGELES, WA ' 802 | 'PTO' = 'PORT TOWNSEND, WA ' 803 | 'SEA' = 'SEATTLE, WA ' 804 | 'SPO' = 'SPOKANE, WA ' 805 | 'SUM' = 'SUMAS, WA ' 806 | 'TAC' = 'TACOMA, WA ' 807 | 'PSC' = 'TRI-CITIES - PASCO, WA' 808 | 'VAN' = 'VANCOUVER, WA ' 809 | 'AGM' = 'ALGOMA, WI ' 810 | 'BAY' = 'BAYFIELD, WI ' 811 | 'GRB' = 'GREEN BAY, WI ' 812 | 'MNW' = 'MANITOWOC, WI ' 813 | 'MIL' = 'MILWAUKEE, WI ' 814 | 'MSN' = 'TRUAX FIELD - DANE COUNTY, WI' 815 | 'CHS' = 'CHARLESTON, WV ' 816 | 'CLK' = 'CLARKSBURG, WV ' 817 | 'BLF' = 'MERCER COUNTY, WV' 818 | 'CSP' = 'CASPER, WY ' 819 | 'XXX' = 'NOT REPORTED/UNKNOWN ' 820 | '888' = 'UNIDENTIFED AIR / SEAPORT' 821 | 'UNK' = 'UNKNOWN POE ' 822 | 'CLG' = 'CALGARY, CANADA ' 823 | 'EDA' = 'EDMONTON, CANADA ' 824 | 'YHC' = 'HAKAI PASS, CANADA' 825 | 'HAL' = 'Halifax, NS, Canada ' 826 | 'MON' = 'MONTREAL, CANADA ' 827 | 'OTT' = 'OTTAWA, CANADA ' 828 | 'YXE' = 'SASKATOON, CANADA' 829 | 'TOR' = 'TORONTO, CANADA ' 830 | 'VCV' = 'VANCOUVER, CANADA ' 831 | 'VIC' = 'VICTORIA, CANADA ' 832 | 'WIN' = 'WINNIPEG, CANADA ' 833 | 'AMS' = 'AMSTERDAM-SCHIPHOL, NETHERLANDS' 834 | 'ARB' = 'ARUBA, NETH ANTILLES ' 835 | 'BAN' = 'BANKOK, THAILAND ' 836 | 'BEI' = 'BEICA #ARPT, ETHIOPIA' 837 | 'PEK' = 'BEIJING CAPITAL INTL, PRC' 838 | 'BDA' = 'KINDLEY FIELD, BERMUDA' 839 | 'BOG' = 'BOGOTA, EL DORADO #ARPT, COLOMBIA' 840 | 'EZE' = 'BUENOS AIRES, MINISTRO PIST, ARGENTINA' 841 | 'CUN' = 'CANCUN, MEXICO' 842 | 'CRQ' = 'CARAVELAS, BA #ARPT, BRAZIL' 843 | 'MVD' = 'CARRASCO, URUGUAY' 844 | 'DUB' = 'DUBLIN, IRELAND ' 845 | 'FOU' = 'FOUGAMOU #ARPT, GABON' 846 | 'FBA' = 'FREEPORT, BAHAMAS ' 847 | 'MTY' = 'GEN M. 
ESCOBEDO, Monterrey, MX' 848 | 'HMO' = 'GEN PESQUEIRA GARCIA, MX' 849 | 'GCM' = 'GRAND CAYMAN, CAYMAN ISLAND' 850 | 'GDL' = 'GUADALAJARA, MIGUEL HIDAL, MX' 851 | 'HAM' = 'HAMILTON, BERMUDA ' 852 | 'ICN' = 'INCHON, SEOUL KOREA' 853 | 'IWA' = 'INVALID - IWAKUNI, JAPAN' 854 | 'CND' = 'KOGALNICEANU, ROMANIA' 855 | 'LAH' = 'LABUHA ARPT, INDONESIA' 856 | 'DUR' = 'LOUIS BOTHA, SOUTH AFRICA' 857 | 'MAL' = 'MANGOLE ARPT, INDONESIA' 858 | 'MDE' = 'MEDELLIN, COLOMBIA' 859 | 'MEX' = 'JUAREZ INTL, MEXICO CITY, MX' 860 | 'LHR' = 'MIDDLESEX, ENGLAND' 861 | 'NBO' = 'NAIROBI, KENYA ' 862 | 'NAS' = 'NASSAU, BAHAMAS ' 863 | 'NCA' = 'NORTH CAICOS, TURK & CAIMAN' 864 | 'PTY' = 'OMAR TORRIJOS, PANAMA' 865 | 'SPV' = 'PAPUA, NEW GUINEA' 866 | 'UIO' = 'QUITO (MARISCAL SUCR), ECUADOR' 867 | 'RIT' = 'ROME, ITALY ' 868 | 'SNO' = 'SAKON NAKHON #ARPT, THAILAND' 869 | 'SLP' = 'SAN LUIS POTOSI #ARPT, MEXICO' 870 | 'SAN' = 'SAN SALVADOR, EL SALVADOR' 871 | 'SRO' = 'SANTANA RAMOS #ARPT, COLOMBIA' 872 | 'GRU' = 'GUARULHOS INTL, SAO PAULO, BRAZIL' 873 | 'SHA' = 'SHANNON, IRELAND ' 874 | 'HIL' = 'SHILLAVO, ETHIOPIA' 875 | 'TOK' = 'TOROKINA #ARPT, PAPUA, NEW GUINEA' 876 | 'VER' = 'VERACRUZ, MEXICO' 877 | 'LGW' = 'WEST SUSSEX, ENGLAND ' 878 | 'ZZZ' = 'MEXICO Land (Banco de Mexico) ' 879 | 'CHN' = 'No PORT Code (CHN)' 880 | 'CNC' = 'CANNON CORNERS, NY' 881 | 'MAA' = 'Abu Dhabi' 882 | 'AG0' = 'MAGNOLIA, AR' 883 | 'BHM' = 'BAR HARBOR, ME' 884 | 'BHX' = 'BIRMINGHAM, AL' 885 | 'CAK' = 'AKRON, OH' 886 | 'FOK' = 'SUFFOLK COUNTY, NY' 887 | 'LND' = 'LANDER, WY' 888 | 'MAR' = 'MARFA, TX' 889 | 'MLI' = 'MOLINE, IL' 890 | 'RIV' = 'RIVERSIDE, CA' 891 | 'RME' = 'ROME, NY' 892 | 'VNY' = 'VAN NUYS, CA' 893 | 'YUM' = 'YUMA, AZ' 894 | 'FRG' = 'Collapsed (FOK) 06/15' 895 | 'HRL' = 'Collapsed (HLG) 06/15' 896 | 'ISP' = 'Collapsed (FOK) 06/15' 897 | 'JSJ' = 'Collapsed (SAJ) 06/15' 898 | 'BUS' = 'Collapsed (BUF) 06/15' 899 | 'IAG' = 'Collapsed (NIA) 06/15' 900 | 'PHN' = 'Collapsed (PHU) 06/15' 901 | 'STN' = 'Collapsed (STR) 06/15' 902 | 'VMB' = 'Collapsed (VNB) 06/15' 903 | 'T01' = 'Collapsed (SEA) 06/15' 904 | 'PHF' = 'No PORT Code (PHF)' 905 | 'DRV' = 'No PORT Code (DRV)' 906 | 'FTB' = 'No PORT Code (FTB)' 907 | 'GAC' = 'No PORT Code (GAC)' 908 | 'GMT' = 'No PORT Code (GMT)' 909 | 'JFA' = 'No PORT Code (JFA)' 910 | 'JMZ' = 'No PORT Code (JMZ)' 911 | 'NC8' = 'No PORT Code (NC8)' 912 | 'NYL' = 'No PORT Code (NYL)' 913 | 'OAI' = 'No PORT Code (OAI)' 914 | 'PCW' = 'No PORT Code (PCW)' 915 | 'WA5' = 'No PORT Code (WAS)' 916 | 'WTR' = 'No PORT Code (WTR)' 917 | 'X96' = 'No PORT Code (X96)' 918 | 'XNA' = 'No PORT Code (XNA)' 919 | 'YGF' = 'No PORT Code (YGF)' 920 | '5T6' = 'No PORT Code (5T6)' 921 | '060' = 'No PORT Code (60)' 922 | 'SP0' = 'No PORT Code (SP0)' 923 | 'W55' = 'No PORT Code (W55)' 924 | 'X44' = 'No PORT Code (X44)' 925 | 'AUH' = 'No PORT Code (AUH)' 926 | 'RYY' = 'No PORT Code (RYY)' 927 | 'SUS' = 'No PORT Code (SUS)' 928 | '74S' = 'No PORT Code (74S)' 929 | 'ATW' = 'No PORT Code (ATW)' 930 | 'CPX' = 'No PORT Code (CPX)' 931 | 'MTH' = 'No PORT Code (MTH)' 932 | 'PFN' = 'No PORT Code (PFN)' 933 | 'SCH' = 'No PORT Code (SCH)' 934 | 'ASI' = 'No PORT Code (ASI)' 935 | 'BKF' = 'No PORT Code (BKF)' 936 | 'DAY' = 'No PORT Code (DAY)' 937 | 'Y62' = 'No PORT Code (Y62)' 938 | 'AG' = 'No PORT Code (AG)' 939 | 'BCM' = 'No PORT Code (BCM)' 940 | 'DEC' = 'No PORT Code (DEC)' 941 | 'PLB' = 'No PORT Code (PLB)' 942 | 'CXO' = 'No PORT Code (CXO)' 943 | 'JBQ' = 'No PORT Code (JBQ)' 944 | 'JIG' = 'No PORT Code (JIG)' 945 | 'OGS' = 'No PORT Code (OGS)' 946 | 
'TIW' = 'No PORT Code (TIW)' 947 | 'OTS' = 'No PORT Code (OTS)' 948 | 'AMT' = 'No PORT Code (AMT)' 949 | 'EGE' = 'No PORT Code (EGE)' 950 | 'GPI' = 'No PORT Code (GPI)' 951 | 'NGL' = 'No PORT Code (NGL)' 952 | 'OLM' = 'No PORT Code (OLM)' 953 | '.GA' = 'No PORT Code (.GA)' 954 | 'CLX' = 'No PORT Code (CLX)' 955 | 'CP ' = 'No PORT Code (CP)' 956 | 'FSC' = 'No PORT Code (FSC)' 957 | 'NK' = 'No PORT Code (NK)' 958 | 'ADU' = 'No PORT Code (ADU)' 959 | 'AKT' = 'No PORT Code (AKT)' 960 | 'LIT' = 'No PORT Code (LIT)' 961 | 'A2A' = 'No PORT Code (A2A)' 962 | 'OSN' = 'No PORT Code (OSN)' 963 | ; 964 | 965 | 966 | /* ARRDATE is the Arrival Date in the USA. It is a SAS date numeric field that a 967 | permament format has not been applied. Please apply whichever date format 968 | works for you. */ 969 | 970 | 971 | /* I94MODE - There are missing values as well as not reported (9) */ 972 | value i94model 973 | 1 = 'Air' 974 | 2 = 'Sea' 975 | 3 = 'Land' 976 | 9 = 'Not reported' ; 977 | 978 | 979 | /* I94ADDR - There is lots of invalid codes in this variable and the list below 980 | shows what we have found to be valid, everything else goes into 'other' */ 981 | value i94addrl 982 | 'AL'='ALABAMA' 983 | 'AK'='ALASKA' 984 | 'AZ'='ARIZONA' 985 | 'AR'='ARKANSAS' 986 | 'CA'='CALIFORNIA' 987 | 'CO'='COLORADO' 988 | 'CT'='CONNECTICUT' 989 | 'DE'='DELAWARE' 990 | 'DC'='DIST. OF COLUMBIA' 991 | 'FL'='FLORIDA' 992 | 'GA'='GEORGIA' 993 | 'GU'='GUAM' 994 | 'HI'='HAWAII' 995 | 'ID'='IDAHO' 996 | 'IL'='ILLINOIS' 997 | 'IN'='INDIANA' 998 | 'IA'='IOWA' 999 | 'KS'='KANSAS' 1000 | 'KY'='KENTUCKY' 1001 | 'LA'='LOUISIANA' 1002 | 'ME'='MAINE' 1003 | 'MD'='MARYLAND' 1004 | 'MA'='MASSACHUSETTS' 1005 | 'MI'='MICHIGAN' 1006 | 'MN'='MINNESOTA' 1007 | 'MS'='MISSISSIPPI' 1008 | 'MO'='MISSOURI' 1009 | 'MT'='MONTANA' 1010 | 'NC'='N. CAROLINA' 1011 | 'ND'='N. DAKOTA' 1012 | 'NE'='NEBRASKA' 1013 | 'NV'='NEVADA' 1014 | 'NH'='NEW HAMPSHIRE' 1015 | 'NJ'='NEW JERSEY' 1016 | 'NM'='NEW MEXICO' 1017 | 'NY'='NEW YORK' 1018 | 'OH'='OHIO' 1019 | 'OK'='OKLAHOMA' 1020 | 'OR'='OREGON' 1021 | 'PA'='PENNSYLVANIA' 1022 | 'PR'='PUERTO RICO' 1023 | 'RI'='RHODE ISLAND' 1024 | 'SC'='S. CAROLINA' 1025 | 'SD'='S. DAKOTA' 1026 | 'TN'='TENNESSEE' 1027 | 'TX'='TEXAS' 1028 | 'UT'='UTAH' 1029 | 'VT'='VERMONT' 1030 | 'VI'='VIRGIN ISLANDS' 1031 | 'VA'='VIRGINIA' 1032 | 'WV'='W. VIRGINIA' 1033 | 'WA'='WASHINGTON' 1034 | 'WI'='WISCONSON' 1035 | 'WY'='WYOMING' 1036 | '99'='All Other Codes' ; 1037 | 1038 | /* DEPDATE is the Departure Date from the USA. It is a SAS date numeric field that 1039 | a permament format has not been applied. Please apply whichever date format 1040 | works for you. */ 1041 | 1042 | 1043 | /* I94BIR - Age of Respondent in Years */ 1044 | 1045 | 1046 | /* I94VISA - Visa codes collapsed into three categories: 1047 | 1 = Business 1048 | 2 = Pleasure 1049 | 3 = Student 1050 | */ 1051 | 1052 | 1053 | /* COUNT - Used for summary statistics */ 1054 | 1055 | 1056 | /* DTADFILE - Character Date Field - Date added to I-94 Files - CIC does not use */ 1057 | 1058 | 1059 | /* VISAPOST - Department of State where where Visa was issued - CIC does not use */ 1060 | 1061 | 1062 | /* OCCUP - Occupation that will be performed in U.S. - CIC does not use */ 1063 | 1064 | 1065 | /* ENTDEPA - Arrival Flag - admitted or paroled into the U.S. 
- CIC does not use */ 1066 | 1067 | 1068 | /* ENTDEPD - Departure Flag - Departed, lost I-94 or is deceased - CIC does not use */ 1069 | 1070 | 1071 | /* ENTDEPU - Update Flag - Either apprehended, overstayed, adjusted to perm residence - CIC does not use */ 1072 | 1073 | 1074 | /* MATFLAG - Match flag - Match of arrival and departure records */ 1075 | 1076 | 1077 | /* BIRYEAR - 4 digit year of birth */ 1078 | 1079 | 1080 | /* DTADDTO - Character Date Field - Date to which admitted to U.S. (allowed to stay until) - CIC does not use */ 1081 | 1082 | 1083 | /* GENDER - Non-immigrant sex */ 1084 | 1085 | 1086 | /* INSNUM - INS number */ 1087 | 1088 | 1089 | /* AIRLINE - Airline used to arrive in U.S. */ 1090 | 1091 | 1092 | /* ADMNUM - Admission Number */ 1093 | 1094 | 1095 | /* FLTNO - Flight number of Airline used to arrive in U.S. */ 1096 | 1097 | 1098 | /* VISATYPE - Class of admission legally admitting the non-immigrant to temporarily stay in U.S. */ 1099 | run ; 1100 | 1101 | -------------------------------------------------------------------------------- /Capstone Project/README.md: -------------------------------------------------------------------------------- 1 | # Capstone Project 2 | 3 | This project aims to enrich the US I94 immigration data with additional datasets such as US airport data, US city demographics and world temperature data, providing a broader basis for analysis of the immigration data. 4 | 5 | ## Data sources 6 | 7 | ### I94 Immigration Data 8 | This data comes from the US National Tourism and Trade Office. A data dictionary is included in the workspace. [This](https://travel.trade.gov/research/reports/i94/historical/2016.html) is where the data comes from. There is a sample file so you can take a look at the data in CSV format before reading it all in. 9 | 10 | ### World Temperature Data 11 | This dataset came from Kaggle. You can read more about it [here](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data). 12 | 13 | ### U.S. City Demographic Data 14 | This data comes from OpenDataSoft. You can read more about it [here](https://public.opendatasoft.com/explore/dataset/us-cities-demographics/export/). 15 | 16 | ### Airport Code Table 17 | This is a simple table of airport codes and corresponding cities. It comes from [here](https://datahub.io/core/airport-codes#data). 18 | 19 | ## Data cleaning 20 | 21 | * Filter the temperature data to only use US data. 22 | * Remove irregular ports from the I94 data. 23 | * Drop rows with missing IATA codes from the I94 data. We need the IATA codes to join the data with other sources. A sketch of these cleaning steps follows below.
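The cleaning steps above can be prototyped with pandas. The snippet below is an illustrative sketch only: the temperature file name, the column names (`Country`, `i94port`) and the way the valid port codes are parsed out of `I94_SAS_Labels_Descriptions.SAS` are assumptions rather than the exact code used in the project notebook.

```python
import re
import pandas as pd

# Parse the valid I94 port codes out of the SAS labels file (assumed to sit next to this script).
with open("I94_SAS_Labels_Descriptions.SAS") as f:
    labels = f.read()

# Keep only the $i94prtl section; entries look like 'ALC' = 'ALCAN, AK'.
port_section = labels.split("$i94prtl")[1].split(";")[0]
valid_ports = {code.strip() for code in re.findall(r"'([^']+)'\s*=", port_section)}

# 1. Filter the world temperature data down to US rows (file and column names assumed).
temperature = pd.read_csv("GlobalLandTemperaturesByCity.csv")
temperature_us = temperature[temperature["Country"] == "United States"]

# 2. Drop rows with missing IATA/port codes and 3. remove irregular ports from the I94 sample.
immigration = pd.read_csv("immigration_data_sample.csv")
immigration = immigration.dropna(subset=["i94port"])
immigration = immigration[immigration["i94port"].isin(valid_ports)]
```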
24 | 25 | ## Conceptual Data Model 26 | 27 | ### Tables: 28 | | table name | columns | description | type | 29 | | ------- | ---------- | ----------- | ---- | 30 | | airports | iata_code - name - type - local_code - coordinates - city | stores information related to airports | dimension table | 31 | | demographics | city - state - media_age - male_population - female_population - total_population - num_veterans - foreign_born - average_household_size - state_code - race - count | stores demographics data for cities | dimension table | 32 | | immigrations | cicid - year - month - cit - res - iata - arrdate - mode - addr - depdate - bir - visa - count - dtadfile - visapost - occup - entdepa - entdepd - entdepu - matflag - biryear - dtaddto - gender - insnum - airline - admnum - fltno - visatype | stores all I94 immigration data | fact table | 33 | | temperature | timestamp - average_temperature - average_temperature_uncertainty - city - country - latitude - longitude | stores temperature information | dimension table | 34 | 35 | ### Table decision 36 | 37 | The immigrations fact table stores the key information. We can then enrich it with the airports, demographics and temperature data. To do so, we need common identifiers on all tables so they can be joined efficiently: the city and the IATA code (see the example query at the end of this README). 38 | 39 | ## Mapping Out Data Pipelines 40 | 41 | 1. Create tables by executing `create_tables.py`. 42 | 2. Join city to airports data. 43 | 3. Insert data. 44 | 45 | ## Choice of tools and technologies for the project 46 | 47 | Pandas is used to ease data preprocessing and visualisation. It is helpful to efficiently load and manipulate data. At a later stage, instead of pandas dataframes, I recommend using Spark dataframes to allow distributed processing, for example on Amazon Elastic MapReduce (EMR). Also, to perform automated updates, I recommend integrating the ETL pipeline into an Airflow DAG. 48 | 49 | I used a Jupyter Notebook to show the data structure and the need for data cleaning. Python is a widely used programming language and was chosen because it is the language I am most comfortable with. 50 | 51 | ## How often the data should be updated and why 52 | 53 | The I94 data describes immigration events aggregated on a monthly basis, so updating the data monthly is recommended. 54 | 55 | ## FAQ: What would I do if... 56 | * The data was increased by 100x. 57 | * Use Spark to process the data efficiently in a distributed way, e.g. with EMR. If the workload turns out to be write-heavy, I would suggest using a Cassandra database instead of PostgreSQL. 58 | * The data populates a dashboard that must be updated by 7am every day. 59 | * Use Airflow and create a DAG that performs the logic of the described pipeline. If a DAG run fails, I recommend automatically sending emails to the engineering team using Airflow's built-in alerting, so they can fix potential issues promptly. 60 | * The database needed to be accessed by 100+ people. 61 | * Use Redshift so the data is stored in a way that can be accessed efficiently by many people. Alternatively, a database such as PostgreSQL is more cost-efficient but offers somewhat lower performance for highly concurrent access.
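To illustrate how the fact and dimension tables come together, here is a hedged example of an analytical query against the schema defined in `sql_queries.py`. The connection string mirrors `create_tables.py`; the specific aggregation is only an illustration and not part of the project code.

```python
import pandas as pd
import psycopg2

# Same connection parameters as in create_tables.py; adjust to your environment.
conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student")

# Example: number of arrivals per destination city, enriched with demographic context.
query = """
    SELECT a.city,
           COUNT(i.cicid)          AS arrivals,
           MAX(d.total_population) AS total_population
    FROM immigrations i
    JOIN airports a     ON i.iata = a.iata_code
    JOIN demographics d ON a.city = d.city
    GROUP BY a.city
    ORDER BY arrivals DESC
    LIMIT 10;
"""

top_cities = pd.read_sql(query, conn)
print(top_cities)
conn.close()
```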
62 | -------------------------------------------------------------------------------- /Capstone Project/create_tables.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sql_queries import create_table_queries, drop_table_queries 3 | 4 | 5 | def create_database(): 6 | """ 7 | - Creates and connects to the sparkifydb 8 | @return: cursor and connection to sparkifydb 9 | """ 10 | 11 | # connect to default database 12 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") 13 | conn.set_session(autocommit=True) 14 | cur = conn.cursor() 15 | 16 | # create sparkify database with UTF8 encoding 17 | cur.execute("DROP DATABASE IF EXISTS sparkifydb") 18 | cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0") 19 | 20 | # close connection to default database 21 | conn.close() 22 | 23 | # connect to sparkify database 24 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 25 | cur = conn.cursor() 26 | 27 | return cur, conn 28 | 29 | 30 | def drop_tables(cur, conn): 31 | """ 32 | Drops each table using the queries in `drop_table_queries` list. 33 | @param cur: 34 | @param conn: 35 | """ 36 | for query in drop_table_queries: 37 | cur.execute(query) 38 | conn.commit() 39 | 40 | 41 | def create_tables(cur, conn): 42 | """ 43 | Creates each table using the queries in `create_table_queries` list. 44 | @param cur: 45 | @param conn: 46 | """ 47 | for query in create_table_queries: 48 | cur.execute(query) 49 | conn.commit() 50 | 51 | 52 | def main(): 53 | """ 54 | - Drops (if exists) and Creates the sparkify database. 55 | 56 | - Establishes connection with the sparkify database and gets 57 | cursor to it. 58 | 59 | - Drops all the tables. 60 | 61 | - Creates all tables needed. 62 | 63 | - Finally, closes the connection. 
64 | """ 65 | cur, conn = create_database() 66 | 67 | drop_tables(cur, conn) 68 | create_tables(cur, conn) 69 | 70 | conn.close() 71 | 72 | 73 | if __name__ == "__main__": 74 | main() -------------------------------------------------------------------------------- /Capstone Project/sql_queries.py: -------------------------------------------------------------------------------- 1 | create_airports = """ 2 | CREATE TABLE IF NOT EXISTS public.airports ( 3 | iata_code VARCHAR PRIMARY KEY, 4 | name VARCHAR, 5 | type VARCHAR, 6 | local_code VARCHAR, 7 | coordinates VARCHAR, 8 | city VARCHAR, 9 | elevation_ft FLOAT, 10 | continent VARCHAR, 11 | iso_country VARCHAR, 12 | iso_region VARCHAR, 13 | municipality VARCHAR, 14 | gps_code VARCHAR 15 | ); 16 | """ 17 | 18 | drop_airports = "DROP TABLE IF EXISTS airports;" 19 | 20 | airport_insert = """ 21 | INSERT INTO airports (iata_code, name, type, local_code, coordinates, city, elevation_ft, continent, \ 22 | iso_country, iso_region, municipality, gps_code) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""" 23 | 24 | create_demographics = """ 25 | CREATE TABLE IF NOT EXISTS public.demographics ( 26 | city VARCHAR, 27 | state VARCHAR, 28 | media_age FLOAT, 29 | male_population INT, 30 | female_population INT, 31 | total_population INT, 32 | num_veterans INT, 33 | foreign_born INT, 34 | average_household_size FLOAT, 35 | state_code VARCHAR(2), 36 | race VARCHAR, 37 | count INT 38 | ); 39 | """ 40 | 41 | drop_demographics = "DROP TABLE IF EXISTS demographics;" 42 | 43 | demographic_insert = """ 44 | INSERT INTO demographics (city, state, media_age, male_population, female_population, total_population, \ 45 | num_veterans, foreign_born, average_household_size, state_code, race, count) VALUES (%s, %s, %s, %s, \ 46 | %s, %s, %s, %s, %s, %s, %s, %s)""" 47 | 48 | create_immigrations = """ 49 | CREATE TABLE IF NOT EXISTS public.immigrations ( 50 | cicid FLOAT PRIMARY KEY, 51 | year FLOAT, 52 | month FLOAT, 53 | cit FLOAT, 54 | res FLOAT, 55 | iata VARCHAR(3), 56 | arrdate FLOAT, 57 | mode FLOAT, 58 | addr VARCHAR, 59 | depdate FLOAT, 60 | bir FLOAT, 61 | visa FLOAT, 62 | count FLOAT, 63 | dtadfile VARCHAR, 64 | entdepa VARCHAR(1), 65 | entdepd VARCHAR(1), 66 | matflag VARCHAR(1), 67 | biryear FLOAT, 68 | dtaddto VARCHAR, 69 | gender VARCHAR(1), 70 | airline VARCHAR, 71 | admnum FLOAT, 72 | fltno VARCHAR, 73 | visatype VARCHAR 74 | ); 75 | """ 76 | 77 | drop_immigrations = "DROP TABLE IF EXISTS immigrations;" 78 | 79 | immigration_insert = (""" 80 | INSERT INTO immigrations (cicid, year, month, cit, res, iata, arrdate, mode, addr, depdate, bir, visa, count, dtadfile, \ 81 | entdepa, entdepd, matflag, biryear, dtaddto, gender, airline, admnum, fltno, visatype) \ 82 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""") 83 | 84 | create_temperature = """ 85 | CREATE TABLE IF NOT EXISTS temperature ( 86 | timestamp DATE, 87 | average_temperature FLOAT, 88 | average_temperature_uncertainty FLOAT, 89 | city VARCHAR, 90 | country VARCHAR, 91 | latitude VARCHAR, 92 | longitude VARCHAR 93 | ); 94 | """ 95 | 96 | temperature_insert = (""" 97 | INSERT INTO temperature (timestamp, average_temperature, average_temperature_uncertainty, city, country, \ 98 | latitude, longitude) VALUES (%s, %s, %s, %s, %s, %s, %s)""") 99 | 100 | drop_temperature = "DROP TABLE IF EXISTS weather;" 101 | 102 | drop_table_queries = [drop_airports, drop_demographics, drop_immigrations, drop_temperature] 103 | create_table_queries = 
-------------------------------------------------------------------------------- /Data Lake/README.md: -------------------------------------------------------------------------------- 1 | # Data-Lake 2 | Project Data Lake as part of Udacity's Data Engineering Nanodegree 3 | 4 | ## Purpose of this project 5 | 6 | As the startup Sparkify is scaling up quickly, their existing data warehouse can no longer handle the massive data volumes efficiently. They heard about Spark and were curious how it could help them. With this project, they can now analyse their data in a distributed way in memory. This leads to huge speedups compared to the existing approach and allows them to keep track of their clients' behavior easily. 7 | 8 | Moreover, this data lake automates the entire data fusion process, combining multiple data sources from AWS S3 into structured data. The structured data is then stored on AWS S3 again, so Sparkify can use it for further analysis. 9 | 10 | ## Used input data 11 | 12 | The input data resides in two S3 locations: 13 | * Song data: `s3://udacity-dend/song_data` 14 | * Log data: `s3://udacity-dend/log_data` 15 | 16 | The song dataset is a subset of real data from the Million Song Dataset. Each file is in JSON format and contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. For example, here are filepaths to two files in this dataset. 17 | 18 | ``` 19 | song_data/A/B/C/TRABCEI128F424C983.json 20 | song_data/A/A/B/TRAABJL12903CDCF1A.json 21 | ``` 22 | And below is an example of what a single song file, TRAABJL12903CDCF1A.json, looks like. 23 | 24 | ```json 25 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} 26 | ``` 27 | 28 | The log dataset consists of log files in JSON format generated by an event simulator based on the songs in the dataset above. These simulate app activity logs from an imaginary music streaming app based on configuration settings. 29 | 30 | The log files in the dataset are partitioned by year and month. For example, here are filepaths to two files in this dataset. 31 | 32 | ``` 33 | log_data/2018/11/2018-11-12-events.json 34 | log_data/2018/11/2018-11-13-events.json 35 | ``` 36 | 37 | And below is an example of what the data in a log file, 2018-11-12-events.json, looks like. 38 | 39 | ![2018-11-12-events.json structure](https://video.udacity-data.com/topher/2019/February/5c6c3f0a_log-data/log-data.png) 40 | 41 | ## Generated tables 42 | 43 | | name | type | description | columns | 44 | | ---- | ---- | ----------- | ------- | 45 | | songplays | fact table | records in log data associated with song plays i.e. records with page NextSong | songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent | 46 | | users | dimension table | users in the app | user_id, first_name, last_name, gender, level | 47 | | songs | dimension table | songs in music database | song_id, title, artist_id, year, duration | 48 | | artists | dimension table | artists in music database | artist_id, name, location, latitude, longitude | 49 | | time | dimension table | timestamps of records in songplays broken down into specific units | start_time, hour, day, week, month, year, weekday | 50 |
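To make the star schema concrete, here is a hedged sketch of how an analyst could query the generated parquet output with Spark once the pipeline has run. The output bucket path and the aggregation are illustrative assumptions, not part of `etl.py`.

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("sparkify-analysis").getOrCreate()

# Paths assume the output bucket used in etl.py; adjust to your own bucket.
output_data = "s3a://sparkify-data-udend/"
songplays = spark.read.parquet(output_data + "songplays")
users = spark.read.parquet(output_data + "users")

# Example: number of song plays per subscription level and gender.
(songplays
    .join(users.select("user_id", "gender"), "user_id")
    .groupBy("level", "gender")
    .agg(F.count("*").alias("plays"))
    .orderBy(F.desc("plays"))
    .show())
```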
51 | ## ETL pipeline 52 | 53 | The ETL pipeline (see `etl.py`) loads the S3 data sources into Spark dataframes, aggregates and transforms the data into the described schema and writes the data back to S3 in parquet format. 54 | 55 | ## Instructions 56 | 57 | 1. Create an AWS IAM role with S3 read and write access. 58 | 2. Enter the corresponding AWS credentials in the `dl.cfg` configuration file. 59 | 3. Create an S3 bucket (note that the region eu-central-1 may cause issues) and enter the URL to the bucket in `etl.py` as the value of `output_data`. 60 | 4. Run `python3 etl.py` to process the data and store it on your created S3 bucket. A sketch for sanity-checking the output follows below. 61 |
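After step 4, the write can be sanity-checked by reading the parquet output back. This is a hedged helper sketch rather than part of the project; the bucket path is the same assumption as in `etl.py`.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sparkify-output-check").getOrCreate()

output_data = "s3a://sparkify-data-udend/"  # replace with your own bucket

# Print a row count per generated table to confirm every table was written.
for table in ["songplays", "users", "songs", "artists", "time"]:
    count = spark.read.parquet(output_data + table).count()
    print(f"{table}: {count} rows")
```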
-------------------------------------------------------------------------------- /Data Lake/dl.cfg: -------------------------------------------------------------------------------- 1 | [AWS] 2 | AWS_ACCESS_KEY_ID= 3 | AWS_SECRET_ACCESS_KEY= -------------------------------------------------------------------------------- /Data Lake/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from datetime import datetime 3 | import os 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.functions import udf, col, year, month, dayofweek, hour, weekofyear, dayofmonth, \ 6 | monotonically_increasing_id, from_unixtime 7 | from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, TimestampType 8 | 9 | config = configparser.ConfigParser() 10 | config.read('dl.cfg') 11 | 12 | os.environ['AWS_ACCESS_KEY_ID'] = config.get('AWS', 'AWS_ACCESS_KEY_ID') 13 | os.environ['AWS_SECRET_ACCESS_KEY'] = config.get('AWS', 'AWS_SECRET_ACCESS_KEY') 14 | 15 | 16 | def create_spark_session(): 17 | """ 18 | Creates a new or uses the existing spark session. 19 | """ 20 | spark = SparkSession \ 21 | .builder \ 22 | .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \ 23 | .getOrCreate() 24 | return spark 25 | 26 | 27 | def process_song_data(spark, input_data, output_data): 28 | """ 29 | Processes all song data JSON files in the given input folder and stores them in parquet format in the output folder. 30 | :param spark: spark session 31 | :param input_data: input data path 32 | :param output_data: output data path 33 | """ 34 | song_data = os.path.join(input_data, 'song_data/*/*/*/*.json') 35 | 36 | song_schema = StructType([ 37 | StructField("artist_id", StringType()), 38 | StructField("artist_latitude", DoubleType()), 39 | StructField("artist_location", StringType()), 40 | StructField("artist_longitude", DoubleType()),  # numeric, consistent with artist_latitude 41 | StructField("artist_name", StringType()), 42 | StructField("duration", DoubleType()), 43 | StructField("num_songs", IntegerType()), 44 | StructField("title", StringType()), 45 | StructField("year", IntegerType()), 46 | ]) 47 | 48 | # read song data file 49 | df = spark.read.json(song_data, schema=song_schema) 50 | 51 | # extract columns to create songs table 52 | song_fields = ["title", "artist_id", "year", "duration"] 53 | songs_table = df.select(song_fields).dropDuplicates().withColumn("song_id", monotonically_increasing_id()) 54 | 55 | # write songs table to parquet files partitioned by year and artist 56 | songs_table.write.mode("overwrite").partitionBy("year", "artist_id").parquet(output_data + "songs") 57 | 58 | # extract columns to create artists table 59 | artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", 60 | "artist_longitude as longitude"] 61 | artists_table = df.selectExpr(artists_fields).dropDuplicates() 62 | 63 | # write artists table to parquet files 64 | artists_table.write.mode("overwrite").parquet(output_data + 'artists') 65 | 66 | 67 | def process_log_data(spark, input_data, output_data): 68 | """ 69 | Processes all log data JSON files in the given input folder and stores them in parquet format in the output folder. 70 | :param spark: spark session 71 | :param input_data: input data path 72 | :param output_data: output data path 73 | """ 74 | # get filepath to log data file 75 | log_data = os.path.join(input_data, 'log_data/*/*/*.json') 76 | 77 | # read log data file 78 | log_df = spark.read.json(log_data) 79 | 80 | # filter by actions for song plays 81 | log_df = log_df.filter(log_df.page == 'NextSong') 82 | 83 | # extract columns for users table 84 | users_fields = ["userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level"] 85 | users_table = log_df.selectExpr(users_fields).dropDuplicates() 86 | 87 | # write users table to parquet files 88 | users_table.write.mode("overwrite").parquet(output_data + 'users') 89 | 90 | # create a numeric seconds column from the original millisecond timestamp column 91 | get_timestamp = udf(lambda x: x / 1000.0, DoubleType()) 92 | log_df = log_df.withColumn("timestamp", get_timestamp(log_df.ts)) 93 | 94 | # create datetime column from original timestamp column 95 | get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType()) 96 | log_df = log_df.withColumn("start_time", get_datetime(log_df.timestamp)) 97 | 98 | # extract columns to create time table 99 | log_df = log_df.withColumn("hour", hour("start_time")) \ 100 | .withColumn("day", dayofmonth("start_time")) \ 101 | .withColumn("week", weekofyear("start_time")) \ 102 | .withColumn("month", month("start_time")) \ 103 | .withColumn("year", year("start_time")) \ 104 | .withColumn("weekday", dayofweek("start_time")) 105 | 106 | time_table = log_df.select("start_time", "hour", "day", "week", "month", "year", "weekday") 107 | 108 | # write time table to parquet files partitioned by year and month 109 | time_table.write.mode("overwrite").partitionBy("year",
"month").parquet(output_data + "time") 110 | 111 | # read in song data to use for songplays table 112 | songs_df = spark.read.parquet(os.path.join(output_data, "songs/*/*/*")) 113 | songs_logs = log_df.join(songs_df, (log_df.song == songs_df.title)) 114 | 115 | # extract columns from joined song and log datasets to create songplays table 116 | artists_df = spark.read.parquet(os.path.join(output_data, "artists")) 117 | artists_songs_logs = songs_logs.join(artists_df, (songs_logs.artist == artists_df.name)) 118 | songplays = artists_songs_logs.join( 119 | time_table, 120 | artists_songs_logs.ts == time_table.ts, 'left' 121 | ).drop(artists_songs_logs.year) 122 | 123 | # write songplays table to parquet files partitioned by year and month 124 | songplays_table = songplays.select( 125 | col('start_time'), 126 | col('userId').alias('user_id'), 127 | col('level'), 128 | col('song_id'), 129 | col('artist_id'), 130 | col('sessionId').alias('session_id'), 131 | col('location'), 132 | col('userAgent').alias('user_agent'), 133 | col('year'), 134 | col('month'), 135 | ).repartition("year", "month") 136 | 137 | songplays_table.write.mode("overwrite").partitionBy("year", "month").parquet(output_data, 'songplays') 138 | 139 | 140 | def main(): 141 | spark = create_spark_session() 142 | input_data = "s3a://udacity-dend/" 143 | output_data = "s3a://sparkify-data-udend/" 144 | 145 | process_song_data(spark, input_data, output_data) 146 | process_log_data(spark, input_data, output_data) 147 | 148 | 149 | if __name__ == "__main__": 150 | main() 151 | -------------------------------------------------------------------------------- /Data Modeling with Cassandra/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manuel-lang/Data-Engineering-Nanodegree/330b6b3ce020fb479868c44163aeb70e473dd111/Data Modeling with Cassandra/.DS_Store -------------------------------------------------------------------------------- /Data Modeling with Cassandra/Project_1B_ Project_Template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "editable": true 7 | }, 8 | "source": [ 9 | "# Part I. 
ETL Pipeline for Pre-Processing the Files" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "editable": true 16 | }, 17 | "source": [ 18 | "## PLEASE RUN THE FOLLOWING CODE FOR PRE-PROCESSING THE FILES" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "editable": true 25 | }, 26 | "source": [ 27 | "#### Import Python packages " 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": { 34 | "editable": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# Import Python packages \n", 39 | "import pandas as pd\n", 40 | "import cassandra\n", 41 | "import re\n", 42 | "import os\n", 43 | "import glob\n", 44 | "import numpy as np\n", 45 | "import json\n", 46 | "import csv" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "editable": true 53 | }, 54 | "source": [ 55 | "#### Creating list of filepaths to process original event csv data files" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": { 62 | "editable": true 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "/home/workspace\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "# checking your current working directory\n", 75 | "print(os.getcwd())\n", 76 | "\n", 77 | "# Get your current folder and subfolder event data\n", 78 | "filepath = os.getcwd() + '/event_data'\n", 79 | "\n", 80 | "# Create a for loop to create a list of files and collect each filepath\n", 81 | "for root, dirs, files in os.walk(filepath):\n", 82 | " \n", 83 | "# join the file path and roots with the subdirectories using glob\n", 84 | " file_path_list = glob.glob(os.path.join(root,'*'))\n", 85 | " #print(file_path_list)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "editable": true 92 | }, 93 | "source": [ 94 | "#### Processing the files to create the data file csv that will be used for Apache Casssandra tables" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 3, 100 | "metadata": { 101 | "editable": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# initiating an empty list of rows that will be generated from each file\n", 106 | "full_data_rows_list = [] \n", 107 | " \n", 108 | "# for every filepath in the file path list \n", 109 | "for f in file_path_list:\n", 110 | "\n", 111 | "# reading csv file \n", 112 | " with open(f, 'r', encoding = 'utf8', newline='') as csvfile: \n", 113 | " # creating a csv reader object \n", 114 | " csvreader = csv.reader(csvfile) \n", 115 | " next(csvreader)\n", 116 | " \n", 117 | " # extracting each data row one by one and append it \n", 118 | " for line in csvreader:\n", 119 | " #print(line)\n", 120 | " full_data_rows_list.append(line) \n", 121 | " \n", 122 | "\n", 123 | "# creating a smaller event data csv file called event_datafile_full csv that will be used to insert data into the \\\n", 124 | "# Apache Cassandra tables\n", 125 | "csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)\n", 126 | "\n", 127 | "with open('event_datafile_new.csv', 'w', encoding = 'utf8', newline='') as f:\n", 128 | " writer = csv.writer(f, dialect='myDialect')\n", 129 | " writer.writerow(['artist','firstName','gender','itemInSession','lastName','length',\\\n", 130 | " 'level','location','sessionId','song','userId'])\n", 131 | " for row in full_data_rows_list:\n", 132 | " if (row[0] == ''):\n", 133 | " continue\n", 134 | " writer.writerow((row[0], row[2], row[3], 
row[4], row[5], row[6], row[7], row[8], row[12], row[13], row[16]))\n" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 4, 140 | "metadata": { 141 | "editable": true 142 | }, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "6821\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "# check the number of rows in your csv file\n", 154 | "with open('event_datafile_new.csv', 'r', encoding = 'utf8') as f:\n", 155 | " print(sum(1 for line in f))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "editable": true 162 | }, 163 | "source": [ 164 | "# Part II. Complete the Apache Cassandra coding portion of your project. \n", 165 | "\n", 166 | "## Now you are ready to work with the CSV file titled event_datafile_new.csv, located within the Workspace directory. The event_datafile_new.csv contains the following columns: \n", 167 | "- artist \n", 168 | "- firstName of user\n", 169 | "- gender of user\n", 170 | "- item number in session\n", 171 | "- last name of user\n", 172 | "- length of the song\n", 173 | "- level (paid or free song)\n", 174 | "- location of the user\n", 175 | "- sessionId\n", 176 | "- song title\n", 177 | "- userId\n", 178 | "\n", 179 | "The image below is a screenshot of what the denormalized data should appear like in the **event_datafile_new.csv** after the code above is run:
\n", 180 | "\n", 181 | "" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "editable": true 188 | }, 189 | "source": [ 190 | "## Begin writing your Apache Cassandra code in the cells below" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "editable": true 197 | }, 198 | "source": [ 199 | "#### Creating a Cluster" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 5, 205 | "metadata": { 206 | "editable": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "# This should make a connection to a Cassandra instance your local machine \n", 211 | "# (127.0.0.1)\n", 212 | "\n", 213 | "from cassandra.cluster import Cluster\n", 214 | "cluster = Cluster()\n", 215 | "\n", 216 | "# To establish connection and begin executing queries, need a session\n", 217 | "session = cluster.connect()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "editable": true 224 | }, 225 | "source": [ 226 | "#### Create Keyspace" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 6, 232 | "metadata": { 233 | "editable": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "try:\n", 238 | " session.execute(\"\"\"\n", 239 | " CREATE KEYSPACE IF NOT EXISTS udacity \n", 240 | " WITH REPLICATION = \n", 241 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n", 242 | ")\n", 243 | "\n", 244 | "except Exception as e:\n", 245 | " print(e)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "editable": true 252 | }, 253 | "source": [ 254 | "#### Set Keyspace" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 7, 260 | "metadata": { 261 | "editable": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "try:\n", 266 | " session.set_keyspace('udacity')\n", 267 | "except Exception as e:\n", 268 | " print(e)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "editable": true 275 | }, 276 | "source": [ 277 | "### Now we need to create tables to run the following queries. Remember, with Apache Cassandra you model the database tables on the queries you want to run." 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": { 283 | "editable": true 284 | }, 285 | "source": [ 286 | "## Create queries to ask the following three questions of the data\n", 287 | "\n", 288 | "### 1. Give me the artist, song title and song's length in the music app history that was heard during sessionId = 338, and itemInSession = 4\n", 289 | "\n", 290 | "\n", 291 | "### 2. Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182\n", 292 | " \n", 293 | "\n", 294 | "### 3. 
Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'\n", 295 | "\n", 296 | "\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": { 302 | "editable": true 303 | }, 304 | "source": [ 305 | "#### Delete and create the sessions table to match query 1" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 8, 311 | "metadata": { 312 | "editable": true 313 | }, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "" 319 | ] 320 | }, 321 | "execution_count": 8, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "delete_sessions_table_query = \"DROP TABLE IF EXISTS sessions\"\n", 328 | "session.execute(delete_sessions_table_query)\n", 329 | "\n", 330 | "create_sessions_table_query = \"CREATE TABLE IF NOT EXISTS sessions (artist text, item_in_session int, \\\n", 331 | "length float, session_id int, song_title text, PRIMARY KEY (session_id, item_in_session))\"\n", 332 | "session.execute(create_sessions_table_query)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "editable": true 339 | }, 340 | "source": [ 341 | "#### Insert all sessions from the csv data into the sessions table" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 9, 347 | "metadata": { 348 | "editable": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "file = 'event_datafile_new.csv'\n", 353 | "\n", 354 | "with open(file, encoding = 'utf8') as f:\n", 355 | " csvreader = csv.reader(f)\n", 356 | " next(csvreader) # skip header\n", 357 | " for line in csvreader:\n", 358 | " query = \"INSERT INTO sessions (artist, item_in_session, length, session_id, song_title)\"\n", 359 | " query = query + \" VALUES (%s, %s, %s, %s, %s)\"\n", 360 | " session.execute(query, (line[0], int(line[3]), float(line[5]), int(line[8]), line[9]))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": { 366 | "editable": true 367 | }, 368 | "source": [ 369 | "#### Do a SELECT to verify that the data have been inserted into the table\n", 370 | "#### Give me the artist, song title and song's length in the music app history that was heard during sessionId = 338, and itemInSession = 4" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 10, 376 | "metadata": { 377 | "editable": true, 378 | "scrolled": true 379 | }, 380 | "outputs": [ 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "Faithless Music Matters (Mark Knight Dub) 495.30731201171875\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "select_session_quert = \"select artist, song_title, length from sessions WHERE session_id = 338 and item_in_session = 4\"\n", 391 | "rows = session.execute(select_session_quert)\n", 392 | "for row in rows:\n", 393 | " print (row.artist, row.song_title, row.length)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": { 399 | "editable": true 400 | }, 401 | "source": [ 402 | "### COPY AND REPEAT THE ABOVE THREE CELLS FOR EACH OF THE THREE QUESTIONS" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "editable": true 409 | }, 410 | "source": [ 411 | "#### Delete and create the users table to match query 2" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 11, 417 | "metadata": { 418 | "editable": true 419 | }, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/plain": [ 
424 | "" 425 | ] 426 | }, 427 | "execution_count": 11, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "delete_users_table_query = \"DROP TABLE IF EXISTS users\"\n", 434 | "session.execute(delete_users_table_query)\n", 435 | "\n", 436 | "create_users_table_query = \"CREATE TABLE IF NOT EXISTS users (artist text, first_name text, \\\n", 437 | "item_in_session int, last_name text, session_id int, song_title text, user_id int, \\\n", 438 | "PRIMARY KEY ((user_id, session_id), item_in_session))\"\n", 439 | "session.execute(create_users_table_query)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "editable": true 446 | }, 447 | "source": [ 448 | "#### Insert all users into the users table" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 12, 454 | "metadata": { 455 | "editable": true 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "file = 'event_datafile_new.csv'\n", 460 | "\n", 461 | "with open(file, encoding = 'utf8') as f:\n", 462 | " csvreader = csv.reader(f)\n", 463 | " next(csvreader) # skip header\n", 464 | " for line in csvreader:\n", 465 | " query = \"INSERT INTO users (artist, first_name, item_in_session, last_name, session_id, song_title, user_id)\"\n", 466 | " query = query + \" VALUES (%s, %s, %s, %s, %s, %s, %s)\"\n", 467 | " session.execute(query, (line[0], line[1], int(line[3]), line[4], int(line[8]), line[9], int(line[10]))) " 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": { 473 | "editable": true 474 | }, 475 | "source": [ 476 | "#### Do a SELECT to verify that the data have been inserted into the table\n", 477 | "#### Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 13, 483 | "metadata": { 484 | "editable": true 485 | }, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "Down To The Bone Keep On Keepin' On Sylvie Cruz\n", 492 | "Three Drives Greece 2000 Sylvie Cruz\n", 493 | "Sebastien Tellier Kilometer Sylvie Cruz\n", 494 | "Lonnie Gordon Catch You Baby (Steve Pitron & Max Sanna Radio Edit) Sylvie Cruz\n" 495 | ] 496 | } 497 | ], 498 | "source": [ 499 | "select_user_query = \"select artist, song_title, first_name, last_name from users WHERE session_id = 182 and user_id = 10\"\n", 500 | "rows = session.execute(select_user_query)\n", 501 | "for row in rows:\n", 502 | " print (row.artist, row.song_title, row.first_name, row.last_name)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "editable": true 509 | }, 510 | "source": [ 511 | "#### Delete and create the song_listens table to match query 3." 
512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 14, 517 | "metadata": { 518 | "editable": true 519 | }, 520 | "outputs": [ 521 | { 522 | "data": { 523 | "text/plain": [ 524 | "" 525 | ] 526 | }, 527 | "execution_count": 14, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "delete_song_listens_table_query = \"DROP TABLE IF EXISTS song_listens\"\n", 534 | "session.execute(delete_song_listens_table_query)\n", 535 | "\n", 536 | "create_song_listens_table_query = \"CREATE TABLE IF NOT EXISTS song_listens (first_name text, last_name text, song_title text, user_id int, \\\n", 537 | "PRIMARY KEY (song_title, user_id))\"\n", 538 | "session.execute(create_song_listens_table_query)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": { 544 | "editable": true 545 | }, 546 | "source": [ 547 | "#### Insert all song_listens into the table" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 15, 553 | "metadata": { 554 | "editable": true 555 | }, 556 | "outputs": [], 557 | "source": [ 558 | "file = 'event_datafile_new.csv'\n", 559 | "\n", 560 | "with open(file, encoding = 'utf8') as f:\n", 561 | " csvreader = csv.reader(f)\n", 562 | " next(csvreader) # skip header\n", 563 | " for line in csvreader:\n", 564 | " query = \"INSERT INTO song_listens (first_name, last_name, song_title, user_id)\"\n", 565 | " query = query + \" VALUES (%s, %s, %s, %s)\"\n", 566 | " session.execute(query, (line[1], line[4], line[9], int(line[10]))) " 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": { 572 | "editable": true 573 | }, 574 | "source": [ 575 | "#### Do a SELECT to verify that the data have been inserted into the table\n", 576 | "#### Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 16, 582 | "metadata": { 583 | "editable": true 584 | }, 585 | "outputs": [ 586 | { 587 | "name": "stdout", 588 | "output_type": "stream", 589 | "text": [ 590 | "Jacqueline Lynch\n", 591 | "Tegan Levine\n", 592 | "Sara Johnson\n" 593 | ] 594 | } 595 | ], 596 | "source": [ 597 | "select_song_listens_query = \"select first_name, last_name from song_listens WHERE song_title = 'All Hands Against His Own'\"\n", 598 | "rows = session.execute(select_song_listens_query)\n", 599 | "for row in rows:\n", 600 | " print (row.first_name, row.last_name)" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": { 606 | "editable": true 607 | }, 608 | "source": [ 609 | "### Drop the tables before closing out the sessions" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 17, 615 | "metadata": { 616 | "editable": true 617 | }, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/plain": [ 622 | "" 623 | ] 624 | }, 625 | "execution_count": 17, 626 | "metadata": {}, 627 | "output_type": "execute_result" 628 | } 629 | ], 630 | "source": [ 631 | "delete_sessions_table_query = \"DROP TABLE IF EXISTS sessions\"\n", 632 | "session.execute(delete_sessions_table_query)\n", 633 | "\n", 634 | "delete_users_table_query = \"DROP TABLE IF EXISTS users\"\n", 635 | "session.execute(delete_users_table_query)\n", 636 | "\n", 637 | "delete_song_listens_table_query = \"DROP TABLE IF EXISTS song_listens\"\n", 638 | "session.execute(delete_song_listens_table_query)" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": { 
644 | "editable": true 645 | }, 646 | "source": [ 647 | "### Close the session and cluster connection¶" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 18, 653 | "metadata": { 654 | "editable": true 655 | }, 656 | "outputs": [], 657 | "source": [ 658 | "session.shutdown()\n", 659 | "cluster.shutdown()" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": { 666 | "editable": true 667 | }, 668 | "outputs": [], 669 | "source": [] 670 | } 671 | ], 672 | "metadata": { 673 | "kernelspec": { 674 | "display_name": "Python 3", 675 | "language": "python", 676 | "name": "python3" 677 | }, 678 | "language_info": { 679 | "codemirror_mode": { 680 | "name": "ipython", 681 | "version": 3 682 | }, 683 | "file_extension": ".py", 684 | "mimetype": "text/x-python", 685 | "name": "python", 686 | "nbconvert_exporter": "python", 687 | "pygments_lexer": "ipython3", 688 | "version": "3.6.3" 689 | } 690 | }, 691 | "nbformat": 4, 692 | "nbformat_minor": 4 693 | } 694 | -------------------------------------------------------------------------------- /Data Modeling with Cassandra/README.md: -------------------------------------------------------------------------------- 1 | # Udactity-Data-Modeling-with-Cassandra 2 | 3 | A startup called Sparkify wants to analyze the data they've been collecting on songs and user activity on their new music streaming app. The analysis team is particularly interested in understanding what songs users are listening to. Currently, there is no easy way to query the data to generate the results, since the data reside in a directory of CSV files on user activity on the app. 4 | 5 | They'd like a data engineer to create an Apache Cassandra database which can create queries on song play data to answer the questions, and wish to bring you on the project. Your role is to create a database for this analysis. You'll be able to test your database by running queries given to you by the analytics team from Sparkify to create the results. 6 | -------------------------------------------------------------------------------- /Data Modeling with Cassandra/images/image_event_datafile_new.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manuel-lang/Data-Engineering-Nanodegree/330b6b3ce020fb479868c44163aeb70e473dd111/Data Modeling with Cassandra/images/image_event_datafile_new.jpg -------------------------------------------------------------------------------- /Data Modeling with Postgres/README.md: -------------------------------------------------------------------------------- 1 | ## Udactity Data Engineer Nanodegree Project: Data Modeling with Postgres 2 | 3 | A startup called Sparkify wants to analyze the data they've been collecting on songs and user activity on their new music streaming app. The analytics team is particularly interested in understanding what songs users are listening to. Currently, they don't have an easy way to query their data, which resides in a directory of JSON logs on user activity on the app, as well as a directory with JSON metadata on the songs in their app. 4 | 5 | They'd like a data engineer to create a Postgres database with tables designed to optimize queries on song play analysis, and bring you on the project. Your role is to create a database schema and ETL pipeline for this analysis. 
You'll be able to test your database and ETL pipeline by running queries given to you by the analytics team from Sparkify and comparing your results with their expected results. 6 | 7 | ### Datasets available 8 | 9 | The song dataset is a subset of real data from the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong/). Each file is in JSON format and contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. 10 | 11 | The log dataset consists of log files in JSON format generated by [this event simulator](https://github.com/Interana/eventsim) based on the songs in the dataset above. These simulate activity logs from a music streaming app based on specified configurations. 12 | 13 | ### Setup Instructions and Steps followed 14 | 15 | * Install requirements with `pip3 install -r requirements.txt`. 16 | * Set up a local PostgreSQL instance on port 5432. Please see detailed instructions in the [PostgreSQL documentation](https://www.postgresql.org/docs/9.1/runtime.html). 17 | 18 | ### Program execution 19 | 20 | * Create the database and its tables by executing `python3 create_tables.py`. 21 | * Load the data and insert it into the database by executing `python3 etl.py`. 22 | 23 | ### Schema Design 24 | 25 | * The fact table `songplays` stores the records in the log data associated with song plays, i.e. records with page `NextSong`. 26 | * The dimension table `users` stores the users in the app. 27 | * The dimension table `songs` stores the songs in the music database. 28 | * The dimension table `artists` stores the artists in the music database. 29 | * The dimension table `time` stores the timestamps of records in songplays broken down into specific units. 30 | 31 | ### Purpose of this database 32 | 33 | This database makes it possible to aggregate all songs, artists, users and songplays in a single database. In this way, the company has all the needed data in one structure and can thus analyze different scenarios easily. For instance, one can analyze the popularity of different songs or artists. It is also possible to perform analyses with geographic information, for instance to determine which songs are popular in which country and region. 34 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/create_tables.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sql_queries import create_table_queries, drop_table_queries 3 | 4 | 5 | def create_database(): 6 | """ 7 | - Creates and connects to the sparkifydb 8 | @return: cursor and connection to sparkifydb 9 | """ 10 | 11 | # connect to default database 12 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") 13 | conn.set_session(autocommit=True) 14 | cur = conn.cursor() 15 | 16 | # create sparkify database with UTF8 encoding 17 | cur.execute("DROP DATABASE IF EXISTS sparkifydb") 18 | cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0") 19 | 20 | # close connection to default database 21 | conn.close() 22 | 23 | # connect to sparkify database 24 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 25 | cur = conn.cursor() 26 | 27 | return cur, conn 28 | 29 | 30 | def drop_tables(cur, conn): 31 | """ 32 | Drops each table using the queries in `drop_table_queries` list.
33 | @param cur: 34 | @param conn: 35 | """ 36 | for query in drop_table_queries: 37 | cur.execute(query) 38 | conn.commit() 39 | 40 | 41 | def create_tables(cur, conn): 42 | """ 43 | Creates each table using the queries in `create_table_queries` list. 44 | @param cur: 45 | @param conn: 46 | """ 47 | for query in create_table_queries: 48 | cur.execute(query) 49 | conn.commit() 50 | 51 | 52 | def main(): 53 | """ 54 | - Drops (if exists) and Creates the sparkify database. 55 | 56 | - Establishes connection with the sparkify database and gets 57 | cursor to it. 58 | 59 | - Drops all the tables. 60 | 61 | - Creates all tables needed. 62 | 63 | - Finally, closes the connection. 64 | """ 65 | cur, conn = create_database() 66 | 67 | drop_tables(cur, conn) 68 | create_tables(cur, conn) 69 | 70 | conn.close() 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/etl.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "editable": true 7 | }, 8 | "source": [ 9 | "# ETL Processes\n", 10 | "Use this notebook to develop the ETL process for each of your tables before completing the `etl.py` file to load the whole datasets." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "editable": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import os\n", 22 | "import glob\n", 23 | "import psycopg2\n", 24 | "import pandas as pd\n", 25 | "from sql_queries import *" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "editable": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "conn = psycopg2.connect(\"host=127.0.0.1 dbname=sparkifydb user=student password=student\")\n", 37 | "cur = conn.cursor()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "editable": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "def get_files(filepath):\n", 49 | " all_files = []\n", 50 | " for root, dirs, files in os.walk(filepath):\n", 51 | " files = glob.glob(os.path.join(root,'*.json'))\n", 52 | " for f in files :\n", 53 | " all_files.append(os.path.abspath(f))\n", 54 | " \n", 55 | " return all_files" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "editable": true 62 | }, 63 | "source": [ 64 | "# Process `song_data`\n", 65 | "In this first part, you'll perform ETL on the first dataset, `song_data`, to create the `songs` and `artists` dimensional tables.\n", 66 | "\n", 67 | "Let's perform ETL on a single song file and load a single record into each table to start.\n", 68 | "- Use the `get_files` function provided above to get a list of all song JSON files in `data/song_data`\n", 69 | "- Select the first song in this list\n", 70 | "- Read the song file and view the data" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": { 77 | "editable": true 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "['/home/workspace/data/song_data/A/B/C/TRABCRU128F423F449.json',\n", 84 | " '/home/workspace/data/song_data/A/B/C/TRABCTK128F934B224.json',\n", 85 | " '/home/workspace/data/song_data/A/B/C/TRABCUQ128E0783E2B.json']" 86 | ] 87 | }, 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "song_files = 
get_files(\"./data/song_data\")\n", 95 | "song_files[0:3]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "editable": true 103 | }, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "'/home/workspace/data/song_data/A/B/C/TRABCRU128F423F449.json'" 109 | ] 110 | }, 111 | "execution_count": 5, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "filepath = song_files[0]\n", 118 | "filepath" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "editable": true 126 | }, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "num_songs 1\n", 132 | "artist_id AR8IEZO1187B99055E\n", 133 | "artist_latitude None\n", 134 | "artist_longitude None\n", 135 | "artist_location \n", 136 | "artist_name Marc Shaiman\n", 137 | "song_id SOINLJW12A8C13314C\n", 138 | "title City Slickers\n", 139 | "duration 149.864\n", 140 | "year 2008\n", 141 | "dtype: object" 142 | ] 143 | }, 144 | "execution_count": 6, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "df = pd.read_json(filepath, typ='series')\n", 151 | "df.head(20)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "editable": true 158 | }, 159 | "source": [ 160 | "## #1: `songs` Table\n", 161 | "#### Extract Data for Songs Table\n", 162 | "- Select columns for song ID, title, artist ID, year, and duration\n", 163 | "- Use `df.values` to select just the values from the dataframe\n", 164 | "- Index to select the first (only) record in the dataframe\n", 165 | "- Convert the array to a list and set it to `song_data`" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 7, 171 | "metadata": { 172 | "editable": true 173 | }, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "['SOINLJW12A8C13314C', 'City Slickers', 'AR8IEZO1187B99055E', 2008, 149.86404]" 179 | ] 180 | }, 181 | "execution_count": 7, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "song_data = df[[\"song_id\", \"title\", \"artist_id\", \"year\", \"duration\"]]\n", 188 | "song_data = list(song_data.values)\n", 189 | "song_data" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "editable": true 196 | }, 197 | "source": [ 198 | "#### Insert Record into Song Table\n", 199 | "Implement the `song_table_insert` query in `sql_queries.py` and run the cell below to insert a record for this song into the `songs` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `songs` table in the sparkify database." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 8, 205 | "metadata": { 206 | "editable": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "cur.execute(song_table_insert, song_data)\n", 211 | "conn.commit()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "editable": true 218 | }, 219 | "source": [ 220 | "Run `test.ipynb` to see if you've successfully added a record to this table." 
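The notebook cell above asks for a `song_table_insert` query to be implemented in `sql_queries.py`, but that file's contents are not reproduced at this point in the listing. A minimal sketch of what such a parameterised insert could look like, assuming the `songs` table uses the five columns from the notebook's `song_data` list and that `song_id` is its primary key (the repository's actual `sql_queries.py` may differ), is shown here.

```python
# Hypothetical sketch only; not taken from the repository's sql_queries.py.
# Assumes songs(song_id, title, artist_id, year, duration) with song_id as primary key.
song_table_insert = """
    INSERT INTO songs (song_id, title, artist_id, year, duration)
    VALUES (%s, %s, %s, %s, %s)
    ON CONFLICT (song_id) DO NOTHING;
"""

# Used exactly like the notebook cell:
#     cur.execute(song_table_insert, song_data)
# ON CONFLICT DO NOTHING keeps repeated runs idempotent instead of failing on duplicate song IDs.
```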
221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "editable": true 227 | }, 228 | "source": [ 229 | "## #2: `artists` Table\n", 230 | "#### Extract Data for Artists Table\n", 231 | "- Select columns for artist ID, name, location, latitude, and longitude\n", 232 | "- Use `df.values` to select just the values from the dataframe\n", 233 | "- Index to select the first (only) record in the dataframe\n", 234 | "- Convert the array to a list and set it to `artist_data`" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "metadata": { 241 | "editable": true 242 | }, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "artist_id AR8IEZO1187B99055E\n", 248 | "artist_name Marc Shaiman\n", 249 | "artist_location \n", 250 | "artist_latitude None\n", 251 | "artist_longitude None\n", 252 | "dtype: object" 253 | ] 254 | }, 255 | "execution_count": 9, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "artist_data = df[[\"artist_id\", \"artist_name\", \"artist_location\", \"artist_latitude\", \"artist_longitude\"]]\n", 262 | "artist_data" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "editable": true 269 | }, 270 | "source": [ 271 | "#### Insert Record into Artist Table\n", 272 | "Implement the `artist_table_insert` query in `sql_queries.py` and run the cell below to insert a record for this song's artist into the `artists` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `artists` table in the sparkify database." 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 10, 278 | "metadata": { 279 | "editable": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "cur.execute(artist_table_insert, artist_data)\n", 284 | "conn.commit()" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "editable": true 291 | }, 292 | "source": [ 293 | "Run `test.ipynb` to see if you've successfully added a record to this table." 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "editable": true 300 | }, 301 | "source": [ 302 | "# Process `log_data`\n", 303 | "In this part, you'll perform ETL on the second dataset, `log_data`, to create the `time` and `users` dimensional tables, as well as the `songplays` fact table.\n", 304 | "\n", 305 | "Let's perform ETL on a single log file and load a single record into each table.\n", 306 | "- Use the `get_files` function provided above to get a list of all log JSON files in `data/log_data`\n", 307 | "- Select the first log file in this list\n", 308 | "- Read the log file and view the data" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 11, 314 | "metadata": { 315 | "editable": true 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "log_files = get_files(\"./data/log_data\")" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 12, 325 | "metadata": { 326 | "editable": true 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "filepath = log_files[0]" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 13, 336 | "metadata": { 337 | "editable": true 338 | }, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/html": [ 343 | "
\n", 344 | "\n", 357 | "\n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | "
artistauthfirstNamegenderitemInSessionlastNamelengthlevellocationmethodpageregistrationsessionIdsongstatustsuserAgentuserId
0Sydney YoungbloodLogged InJacobM53Klein238.07955paidTampa-St. Petersburg-Clearwater, FLPUTNextSong1.540558e+12954Ain't No Sunshine2001543449657796\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...73
1Gang StarrLogged InLaylaF88Griffin151.92771paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984My Advice 2 You (Explicit)2001543449690796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
23OH!3Logged InLaylaF89Griffin192.52200paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984My First Kiss (Feat. Ke$ha) [Album Version]2001543449841796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
\n", 447 | "
" 448 | ], 449 | "text/plain": [ 450 | " artist auth firstName gender itemInSession lastName \\\n", 451 | "0 Sydney Youngblood Logged In Jacob M 53 Klein \n", 452 | "1 Gang Starr Logged In Layla F 88 Griffin \n", 453 | "2 3OH!3 Logged In Layla F 89 Griffin \n", 454 | "\n", 455 | " length level location method page \\\n", 456 | "0 238.07955 paid Tampa-St. Petersburg-Clearwater, FL PUT NextSong \n", 457 | "1 151.92771 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 458 | "2 192.52200 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 459 | "\n", 460 | " registration sessionId song \\\n", 461 | "0 1.540558e+12 954 Ain't No Sunshine \n", 462 | "1 1.541057e+12 984 My Advice 2 You (Explicit) \n", 463 | "2 1.541057e+12 984 My First Kiss (Feat. Ke$ha) [Album Version] \n", 464 | "\n", 465 | " status ts userAgent \\\n", 466 | "0 200 1543449657796 \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4... \n", 467 | "1 200 1543449690796 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... \n", 468 | "2 200 1543449841796 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... \n", 469 | "\n", 470 | " userId \n", 471 | "0 73 \n", 472 | "1 24 \n", 473 | "2 24 " 474 | ] 475 | }, 476 | "execution_count": 13, 477 | "metadata": {}, 478 | "output_type": "execute_result" 479 | } 480 | ], 481 | "source": [ 482 | "df = pd.read_json(filepath, lines=True)\n", 483 | "df.head(3)" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": { 489 | "editable": true 490 | }, 491 | "source": [ 492 | "## #3: `time` Table\n", 493 | "#### Extract Data for Time Table\n", 494 | "- Filter records by `NextSong` action\n", 495 | "- Convert the `ts` timestamp column to datetime\n", 496 | " - Hint: the current timestamp is in milliseconds\n", 497 | "- Extract the timestamp, hour, day, week of year, month, year, and weekday from the `ts` column and set `time_data` to a list containing these values in order\n", 498 | " - Hint: use pandas' [`dt` attribute](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.html) to access easily datetimelike properties.\n", 499 | "- Specify labels for these columns and set to `column_labels`\n", 500 | "- Create a dataframe, `time_df,` containing the time data for this file by combining `column_labels` and `time_data` into a dictionary and converting this into a dataframe" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 14, 506 | "metadata": { 507 | "editable": true 508 | }, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/html": [ 513 | "
\n", 514 | "\n", 527 | "\n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | "
artistauthfirstNamegenderitemInSessionlastNamelengthlevellocationmethodpageregistrationsessionIdsongstatustsuserAgentuserId
0Sydney YoungbloodLogged InJacobM53Klein238.07955paidTampa-St. Petersburg-Clearwater, FLPUTNextSong1.540558e+12954Ain't No Sunshine2002018-11-29 00:00:57.796\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...73
1Gang StarrLogged InLaylaF88Griffin151.92771paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984My Advice 2 You (Explicit)2002018-11-29 00:01:30.796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
23OH!3Logged InLaylaF89Griffin192.52200paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984My First Kiss (Feat. Ke$ha) [Album Version]2002018-11-29 00:04:01.796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
3RöyksoppLogged InJacobM54Klein369.81506paidTampa-St. Petersburg-Clearwater, FLPUTNextSong1.540558e+12954The Girl and The Robot2002018-11-29 00:04:55.796\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...73
4KajagoogooLogged InLaylaF90Griffin223.55546paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984Too Shy2002018-11-29 00:07:13.796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
\n", 659 | "
" 660 | ], 661 | "text/plain": [ 662 | " artist auth firstName gender itemInSession lastName \\\n", 663 | "0 Sydney Youngblood Logged In Jacob M 53 Klein \n", 664 | "1 Gang Starr Logged In Layla F 88 Griffin \n", 665 | "2 3OH!3 Logged In Layla F 89 Griffin \n", 666 | "3 Röyksopp Logged In Jacob M 54 Klein \n", 667 | "4 Kajagoogoo Logged In Layla F 90 Griffin \n", 668 | "\n", 669 | " length level location method page \\\n", 670 | "0 238.07955 paid Tampa-St. Petersburg-Clearwater, FL PUT NextSong \n", 671 | "1 151.92771 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 672 | "2 192.52200 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 673 | "3 369.81506 paid Tampa-St. Petersburg-Clearwater, FL PUT NextSong \n", 674 | "4 223.55546 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 675 | "\n", 676 | " registration sessionId song \\\n", 677 | "0 1.540558e+12 954 Ain't No Sunshine \n", 678 | "1 1.541057e+12 984 My Advice 2 You (Explicit) \n", 679 | "2 1.541057e+12 984 My First Kiss (Feat. Ke$ha) [Album Version] \n", 680 | "3 1.540558e+12 954 The Girl and The Robot \n", 681 | "4 1.541057e+12 984 Too Shy \n", 682 | "\n", 683 | " status ts \\\n", 684 | "0 200 2018-11-29 00:00:57.796 \n", 685 | "1 200 2018-11-29 00:01:30.796 \n", 686 | "2 200 2018-11-29 00:04:01.796 \n", 687 | "3 200 2018-11-29 00:04:55.796 \n", 688 | "4 200 2018-11-29 00:07:13.796 \n", 689 | "\n", 690 | " userAgent userId \n", 691 | "0 \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4... 73 \n", 692 | "1 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... 24 \n", 693 | "2 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... 24 \n", 694 | "3 \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4... 73 \n", 695 | "4 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... 24 " 696 | ] 697 | }, 698 | "execution_count": 14, 699 | "metadata": {}, 700 | "output_type": "execute_result" 701 | } 702 | ], 703 | "source": [ 704 | "df[\"ts\"] = pd.to_datetime(df[\"ts\"], unit='ms')\n", 705 | "df.head()" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 15, 711 | "metadata": { 712 | "editable": true 713 | }, 714 | "outputs": [ 715 | { 716 | "data": { 717 | "text/plain": [ 718 | "0 2018-11-29 00:00:57.796\n", 719 | "1 2018-11-29 00:01:30.796\n", 720 | "2 2018-11-29 00:04:01.796\n", 721 | "3 2018-11-29 00:04:55.796\n", 722 | "4 2018-11-29 00:07:13.796\n", 723 | "Name: ts, dtype: datetime64[ns]" 724 | ] 725 | }, 726 | "execution_count": 15, 727 | "metadata": {}, 728 | "output_type": "execute_result" 729 | } 730 | ], 731 | "source": [ 732 | "t = df[\"ts\"]\n", 733 | "t.head()" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 16, 739 | "metadata": { 740 | "editable": true 741 | }, 742 | "outputs": [], 743 | "source": [ 744 | "# timestamp, hour, day, week of year, month, year, and weekday\n", 745 | "timestamps = df[\"ts\"].dt.time\n", 746 | "hours = df[\"ts\"].dt.hour\n", 747 | "days = df[\"ts\"].dt.day\n", 748 | "weeks = df[\"ts\"].dt.week\n", 749 | "months = df[\"ts\"].dt.month\n", 750 | "years = df[\"ts\"].dt.year\n", 751 | "weekdays = df[\"ts\"].dt.weekday\n", 752 | "column_labels = (\"timestamp\", \"hour\", \"day\", \"week of year\", \"month\", \"year\", \"weekday\")\n", 753 | "time_data = pd.DataFrame({\"timestamp\": timestamps, \"hour\": hours, \"day\": days, \"week\": weeks, \"month\": months, \"year\": years, \"weekday\": weekdays})" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 17, 759 | "metadata": { 760 | "editable": true 761 | }, 762 | "outputs": [ 763 | { 764 | 
"data": { 765 | "text/html": [ 766 | "
\n", 767 | "\n", 780 | "\n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | "
timestamphourdayweekmonthyearweekday
000:00:57.796000029481120183
100:01:30.796000029481120183
200:04:01.796000029481120183
300:04:55.796000029481120183
400:07:13.796000029481120183
\n", 846 | "
" 847 | ], 848 | "text/plain": [ 849 | " timestamp hour day week month year weekday\n", 850 | "0 00:00:57.796000 0 29 48 11 2018 3\n", 851 | "1 00:01:30.796000 0 29 48 11 2018 3\n", 852 | "2 00:04:01.796000 0 29 48 11 2018 3\n", 853 | "3 00:04:55.796000 0 29 48 11 2018 3\n", 854 | "4 00:07:13.796000 0 29 48 11 2018 3" 855 | ] 856 | }, 857 | "execution_count": 17, 858 | "metadata": {}, 859 | "output_type": "execute_result" 860 | } 861 | ], 862 | "source": [ 863 | "time_df = time_data\n", 864 | "time_df.head()" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": { 870 | "editable": true 871 | }, 872 | "source": [ 873 | "#### Insert Records into Time Table\n", 874 | "Implement the `time_table_insert` query in `sql_queries.py` and run the cell below to insert records for the timestamps in this log file into the `time` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `time` table in the sparkify database." 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": 18, 880 | "metadata": { 881 | "editable": true 882 | }, 883 | "outputs": [], 884 | "source": [ 885 | "for i, row in time_df.iterrows():\n", 886 | " cur.execute(time_table_insert, list(row))\n", 887 | " conn.commit()" 888 | ] 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "metadata": { 893 | "editable": true 894 | }, 895 | "source": [ 896 | "Run `test.ipynb` to see if you've successfully added records to this table." 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "metadata": { 902 | "editable": true 903 | }, 904 | "source": [ 905 | "## #4: `users` Table\n", 906 | "#### Extract Data for Users Table\n", 907 | "- Select columns for user ID, first name, last name, gender and level and set to `user_df`" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": 19, 913 | "metadata": { 914 | "editable": true 915 | }, 916 | "outputs": [ 917 | { 918 | "data": { 919 | "text/html": [ 920 | "
\n", 921 | "\n", 934 | "\n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | "
userIdfirstNamelastNamegenderlevel
073JacobKleinMpaid
124LaylaGriffinFpaid
2150AvaRobinsonFfree
3554KalebCookMfree
4032LilyBurnsFfree
424AliviaTerrellFfree
4352TheodoreSmithMfree
4614TheodoreHarrisMfree
4898JordynPowellFfree
51101JaydenFoxMfree
5378ChloeRothFfree
5610SylvieCruzFfree
5853CelesteWilliamsFfree
5961SamuelGonzalezMfree
6049ChloeCuevasFpaid
8016RylanGeorgeMpaid
8426RyanSmithMfree
9579JamesMartinMfree
15082AveryMartinezFpaid
15844AleenaKirbyFpaid
19975JosephGutierrezMfree
20439WalterFryeMfree
28680TeganLevineFpaid
31455MartinJohnsonMfree
32812AustinRosalesMfree
3359WyattScottMfree
34622SeanWilsonFfree
35974BradenParkerMfree
36589KynnediSanchezFfree
\n", 1180 | "
" 1181 | ], 1182 | "text/plain": [ 1183 | " userId firstName lastName gender level\n", 1184 | "0 73 Jacob Klein M paid\n", 1185 | "1 24 Layla Griffin F paid\n", 1186 | "21 50 Ava Robinson F free\n", 1187 | "35 54 Kaleb Cook M free\n", 1188 | "40 32 Lily Burns F free\n", 1189 | "42 4 Alivia Terrell F free\n", 1190 | "43 52 Theodore Smith M free\n", 1191 | "46 14 Theodore Harris M free\n", 1192 | "48 98 Jordyn Powell F free\n", 1193 | "51 101 Jayden Fox M free\n", 1194 | "53 78 Chloe Roth F free\n", 1195 | "56 10 Sylvie Cruz F free\n", 1196 | "58 53 Celeste Williams F free\n", 1197 | "59 61 Samuel Gonzalez M free\n", 1198 | "60 49 Chloe Cuevas F paid\n", 1199 | "80 16 Rylan George M paid\n", 1200 | "84 26 Ryan Smith M free\n", 1201 | "95 79 James Martin M free\n", 1202 | "150 82 Avery Martinez F paid\n", 1203 | "158 44 Aleena Kirby F paid\n", 1204 | "199 75 Joseph Gutierrez M free\n", 1205 | "204 39 Walter Frye M free\n", 1206 | "286 80 Tegan Levine F paid\n", 1207 | "314 55 Martin Johnson M free\n", 1208 | "328 12 Austin Rosales M free\n", 1209 | "335 9 Wyatt Scott M free\n", 1210 | "346 22 Sean Wilson F free\n", 1211 | "359 74 Braden Parker M free\n", 1212 | "365 89 Kynnedi Sanchez F free" 1213 | ] 1214 | }, 1215 | "execution_count": 19, 1216 | "metadata": {}, 1217 | "output_type": "execute_result" 1218 | } 1219 | ], 1220 | "source": [ 1221 | "user_df = df[[\"userId\", \"firstName\", \"lastName\", \"gender\", \"level\"]]\n", 1222 | "user_df = user_df.drop_duplicates()\n", 1223 | "user_df = user_df[user_df[\"userId\"]!=\"\"]\n", 1224 | "user_df" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "markdown", 1229 | "metadata": { 1230 | "editable": true 1231 | }, 1232 | "source": [ 1233 | "#### Insert Records into Users Table\n", 1234 | "Implement the `user_table_insert` query in `sql_queries.py` and run the cell below to insert records for the users in this log file into the `users` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `users` table in the sparkify database." 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": 20, 1240 | "metadata": { 1241 | "editable": true 1242 | }, 1243 | "outputs": [], 1244 | "source": [ 1245 | "for i, row in user_df.iterrows():\n", 1246 | " cur.execute(user_table_insert, row)\n", 1247 | " conn.commit()" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "markdown", 1252 | "metadata": { 1253 | "editable": true 1254 | }, 1255 | "source": [ 1256 | "Run `test.ipynb` to see if you've successfully added records to this table." 1257 | ] 1258 | }, 1259 | { 1260 | "cell_type": "markdown", 1261 | "metadata": { 1262 | "editable": true 1263 | }, 1264 | "source": [ 1265 | "## #5: `songplays` Table\n", 1266 | "#### Extract Data and Songplays Table\n", 1267 | "This one is a little more complicated since information from the songs table, artists table, and original log file are all needed for the `songplays` table. 
Since the log file does not specify an ID for either the song or the artist, you'll need to get the song ID and artist ID by querying the songs and artists tables to find matches based on song title, artist name, and song duration time.\n", 1268 | "- Implement the `song_select` query in `sql_queries.py` to find the song ID and artist ID based on the title, artist name, and duration of a song.\n", 1269 | "- Select the timestamp, user ID, level, song ID, artist ID, session ID, location, and user agent and set to `songplay_data`\n", 1270 | "\n", 1271 | "#### Insert Records into Songplays Table\n", 1272 | "- Implement the `songplay_table_insert` query and run the cell below to insert records for the songplay actions in this log file into the `songplays` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `songplays` table in the sparkify database." 1273 | ] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "execution_count": 21, 1278 | "metadata": { 1279 | "editable": true 1280 | }, 1281 | "outputs": [], 1282 | "source": [ 1283 | "for index, row in df.iterrows():\n", 1284 | "\n", 1285 | " # get songid and artistid from song and artist tables\n", 1286 | " cur.execute(song_select, (row.song, row.artist, row.length))\n", 1287 | " results = cur.fetchone()\n", 1288 | " if results:\n", 1289 | " songid, artistid = results\n", 1290 | " else:\n", 1291 | " songid, artistid = None, None\n", 1292 | "\n", 1293 | " # insert songplay record\n", 1294 | " songplay_data = (row.ts, row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent)\n", 1295 | " if row.userId != \"\":\n", 1296 | " cur.execute(songplay_table_insert, songplay_data)\n", 1297 | " conn.commit()" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "markdown", 1302 | "metadata": { 1303 | "editable": true 1304 | }, 1305 | "source": [ 1306 | "Run `test.ipynb` to see if you've successfully added records to this table." 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "markdown", 1311 | "metadata": { 1312 | "editable": true 1313 | }, 1314 | "source": [ 1315 | "# Close Connection to Sparkify Database" 1316 | ] 1317 | }, 1318 | { 1319 | "cell_type": "code", 1320 | "execution_count": 22, 1321 | "metadata": { 1322 | "editable": true 1323 | }, 1324 | "outputs": [], 1325 | "source": [ 1326 | "conn.close()" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "markdown", 1331 | "metadata": { 1332 | "editable": true 1333 | }, 1334 | "source": [ 1335 | "# Implement `etl.py`\n", 1336 | "Use what you've completed in this notebook to implement `etl.py`." 
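The songplays section above depends on a `song_select` lookup implemented in `sql_queries.py`, which is only described here (match on song title, artist name, and duration, and return the song and artist IDs). A rough sketch of that kind of lookup follows, under the assumption that the `songs` table carries `song_id`, `title`, `duration`, and `artist_id`, and the `artists` table carries `artist_id` and `name`; the repository's actual query may be written differently.

```python
# Hypothetical sketch only; not taken from the repository's sql_queries.py.
# Joins songs to artists and matches on title, artist name, and duration,
# returning (song_id, artist_id) for cur.fetchone().
song_select = """
    SELECT s.song_id, a.artist_id
    FROM songs s
    JOIN artists a ON s.artist_id = a.artist_id
    WHERE s.title = %s
      AND a.name = %s
      AND s.duration = %s;
"""

# Called as in the notebook cell and in etl.py:
#     cur.execute(song_select, (row.song, row.artist, row.length))
#     results = cur.fetchone()
```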
1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": null, 1342 | "metadata": { 1343 | "editable": true 1344 | }, 1345 | "outputs": [], 1346 | "source": [] 1347 | } 1348 | ], 1349 | "metadata": { 1350 | "kernelspec": { 1351 | "display_name": "Python 3", 1352 | "language": "python", 1353 | "name": "python3" 1354 | }, 1355 | "language_info": { 1356 | "codemirror_mode": { 1357 | "name": "ipython", 1358 | "version": 3 1359 | }, 1360 | "file_extension": ".py", 1361 | "mimetype": "text/x-python", 1362 | "name": "python", 1363 | "nbconvert_exporter": "python", 1364 | "pygments_lexer": "ipython3", 1365 | "version": "3.6.3" 1366 | } 1367 | }, 1368 | "nbformat": 4, 1369 | "nbformat_minor": 4 1370 | } 1371 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/etl.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import os 5 | import glob 6 | import psycopg2 7 | import pandas as pd 8 | from sql_queries import * 9 | 10 | 11 | def process_song_file(cur: Any, filepath: str): 12 | """ 13 | Processes a single song file. 14 | @param cur: the database cursor 15 | @param filepath: the path to the song file 16 | """ 17 | # open song file 18 | df = pd.read_json(filepath, typ='series') 19 | 20 | # insert song record 21 | song_data = df[["song_id", "title", "artist_id", "year", "duration"]] 22 | song_data = list(song_data.values) 23 | cur.execute(song_table_insert, song_data) 24 | 25 | # insert artist record 26 | artist_data = df[["artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude"]] 27 | cur.execute(artist_table_insert, artist_data) 28 | 29 | 30 | def process_log_file(cur: Any, filepath: str): 31 | """ 32 | Processes a single log file. 
33 | @param cur: the database cursor 34 | @param filepath: the path to the log file 35 | """ 36 | # open log file 37 | df = pd.read_json(filepath, lines=True) 38 | 39 | # convert timestamp column to datetime 40 | df["ts"] = pd.to_datetime(df["ts"], unit='ms') 41 | 42 | # get all the wanted information from the timestamps 43 | timestamps = df["ts"].dt.time 44 | hours = df["ts"].dt.hour 45 | days = df["ts"].dt.day 46 | weeks = df["ts"].dt.week 47 | months = df["ts"].dt.month 48 | years = df["ts"].dt.year 49 | weekdays = df["ts"].dt.weekday 50 | 51 | # create a dataframe with the wanted information 52 | time_df = pd.DataFrame( 53 | {"timestamp": timestamps, "hour": hours, "day": days, "week": weeks, "month": months, "year": years, 54 | "weekday": weekdays}) 55 | 56 | for i, row in time_df.iterrows(): 57 | cur.execute(time_table_insert, list(row)) 58 | 59 | # load user table 60 | user_df = df[["userId", "firstName", "lastName", "gender", "level"]] 61 | user_df = user_df.drop_duplicates() 62 | 63 | # insert user records 64 | for i, row in user_df.iterrows(): 65 | cur.execute(user_table_insert, row) 66 | 67 | # insert songplay records 68 | for index, row in df.iterrows(): 69 | 70 | # get songid and artistid from song and artist tables 71 | cur.execute(song_select, (row.song, row.artist, row.length)) 72 | results = cur.fetchone() 73 | 74 | if results: 75 | songid, artistid = results 76 | else: 77 | songid, artistid = None, None 78 | 79 | # insert songplay record 80 | songplay_data = (row.ts, row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent) 81 | cur.execute(songplay_table_insert, songplay_data) 82 | 83 | 84 | def process_data(cur: Any, conn: Any, filepath: str, func: Any): 85 | """ 86 | Processes either logs or songs depending on the given function. 87 | @param cur: the database cursor 88 | @param conn: the database connection 89 | @param filepath: the path to the data directory 90 | @param func: the function (process songs or logs) 91 | """ 92 | # get all files matching extension from directory 93 | all_files = [] 94 | for root, dirs, files in os.walk(filepath): 95 | files = glob.glob(os.path.join(root, '*.json')) 96 | for f in files: 97 | all_files.append(os.path.abspath(f)) 98 | 99 | # get total number of files found 100 | num_files = len(all_files) 101 | print('{} files found in {}'.format(num_files, filepath)) 102 | 103 | # iterate over files and process 104 | for i, datafile in enumerate(all_files, 1): 105 | func(cur, datafile) 106 | conn.commit() 107 | print('{}/{} files processed.'.format(i, num_files)) 108 | 109 | 110 | def insert_songs_and_logs(): 111 | """ 112 | Inserts songs and logs to our custom database. 
113 | """ 114 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 115 | cur = conn.cursor() 116 | 117 | process_data(cur, conn, filepath='data/song_data', func=process_song_file) 118 | process_data(cur, conn, filepath='data/log_data', func=process_log_file) 119 | 120 | conn.close() 121 | 122 | 123 | if __name__ == "__main__": 124 | insert_songs_and_logs() 125 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | pandas==1.0.3 3 | psycopg2==2.8.5 4 | python-dateutil==2.8.1 5 | pytz==2019.3 6 | six==1.14.0 7 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/sql_queries.py: -------------------------------------------------------------------------------- 1 | # DROP TABLES 2 | 3 | songplay_table_drop = "DROP TABLE IF EXISTS songplays;" 4 | user_table_drop = "DROP TABLE IF EXISTS users;" 5 | song_table_drop = "DROP TABLE IF EXISTS songs;" 6 | artist_table_drop = "DROP TABLE IF EXISTS artists;" 7 | time_table_drop = "DROP TABLE IF EXISTS time;" 8 | 9 | # CREATE TABLES 10 | 11 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplays (songplay_id SERIAL PRIMARY KEY, \ 12 | start_time timestamp NOT NULL, user_id varchar NOT NULL, level varchar, song_id varchar, artist_id varchar, \ 13 | session_id int, location varchar, user_agent varchar);""") 14 | 15 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users (user_id varchar PRIMARY KEY, first_name varchar, \ 16 | last_name varchar, gender varchar, level varchar NOT NULL);""") 17 | 18 | song_table_create = ("""CREATE TABLE IF NOT EXISTS songs (song_id varchar PRIMARY KEY, title varchar, \ 19 | artist_id varchar, year int, duration decimal)""") 20 | 21 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artists (artist_id varchar PRIMARY KEY, name varchar, \ 22 | location varchar, latitude decimal, longitude decimal);""") 23 | 24 | time_table_create = ("""CREATE TABLE IF NOT EXISTS time (start_time time PRIMARY KEY, hour int, day int, week int, \ 25 | month int, year int, weekday int);""") 26 | 27 | # INSERT RECORDS 28 | 29 | songplay_table_insert = ("""INSERT INTO songplays (start_time, user_id, level, song_id, artist_id, session_id, \ 30 | location, user_agent) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)""") 31 | 32 | user_table_insert = ("""INSERT INTO users (user_id, first_name, last_name, gender, level) VALUES (%s, %s, %s, %s, %s) \ 33 | ON CONFLICT (user_id) DO UPDATE SET level=EXCLUDED.level""") 34 | 35 | song_table_insert = ("""INSERT INTO songs (song_id, title, artist_id, year, duration) VALUES (%s, %s, %s, %s, %s) \ 36 | ON CONFLICT (song_id) DO NOTHING""") 37 | 38 | artist_table_insert = ("""INSERT INTO artists (artist_id, name, location, latitude, longitude) \ 39 | VALUES (%s, %s, %s, %s, %s) ON CONFLICT (artist_id) DO NOTHING""") 40 | 41 | time_table_insert = ("""INSERT INTO time (start_time, hour, day, week, month, year, weekday) \ 42 | VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT (start_time) DO NOTHING""") 43 | 44 | # FIND SONGS 45 | 46 | song_select = ("""SELECT song_id, songs.artist_id FROM songs JOIN artists ON songs.artist_id = artists.artist_id \ 47 | WHERE title = %s AND artists.name = %s AND songs.duration = %s""") 48 | 49 | # QUERY LISTS 50 | 51 | create_table_queries = [songplay_table_create, user_table_create, song_table_create, 
artist_table_create, 52 | time_table_create] 53 | drop_table_queries = [songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop] 54 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "editable": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%load_ext sql" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "editable": true 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "'Connected: student@sparkifydb'" 25 | ] 26 | }, 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%sql postgresql://student:student@127.0.0.1/sparkifydb" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "editable": true 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 48 | "5 rows affected.\n" 49 | ] 50 | }, 51 | { 52 | "data": { 53 | "text/html": [ 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | "
songplay_idstart_timeuser_idlevelsong_idartist_idsession_idlocationuser_agent
12018-11-29 00:00:57.79600073paidNoneNone954Tampa-St. Petersburg-Clearwater, FL"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"
22018-11-29 00:01:30.79600024paidNoneNone984Lake Havasu City-Kingman, AZ"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"
32018-11-29 00:04:01.79600024paidNoneNone984Lake Havasu City-Kingman, AZ"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"
42018-11-29 00:04:55.79600073paidNoneNone954Tampa-St. Petersburg-Clearwater, FL"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"
52018-11-29 00:07:13.79600024paidNoneNone984Lake Havasu City-Kingman, AZ"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"
" 122 | ], 123 | "text/plain": [ 124 | "[(1, datetime.datetime(2018, 11, 29, 0, 0, 57, 796000), 73, 'paid', None, None, 954, 'Tampa-St. Petersburg-Clearwater, FL', '\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2\"'),\n", 125 | " (2, datetime.datetime(2018, 11, 29, 0, 1, 30, 796000), 24, 'paid', None, None, 984, 'Lake Havasu City-Kingman, AZ', '\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36\"'),\n", 126 | " (3, datetime.datetime(2018, 11, 29, 0, 4, 1, 796000), 24, 'paid', None, None, 984, 'Lake Havasu City-Kingman, AZ', '\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36\"'),\n", 127 | " (4, datetime.datetime(2018, 11, 29, 0, 4, 55, 796000), 73, 'paid', None, None, 954, 'Tampa-St. Petersburg-Clearwater, FL', '\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2\"'),\n", 128 | " (5, datetime.datetime(2018, 11, 29, 0, 7, 13, 796000), 24, 'paid', None, None, 984, 'Lake Havasu City-Kingman, AZ', '\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36\"')]" 129 | ] 130 | }, 131 | "execution_count": 3, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "%sql SELECT * FROM songplays LIMIT 5;" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 4, 143 | "metadata": { 144 | "editable": true 145 | }, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 152 | "5 rows affected.\n" 153 | ] 154 | }, 155 | { 156 | "data": { 157 | "text/html": [ 158 | "\n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
user_idfirst_namelast_namegenderlevel
73JacobKleinMpaid
24LaylaGriffinFpaid
50AvaRobinsonFfree
54KalebCookMfree
32LilyBurnsFfree
" 202 | ], 203 | "text/plain": [ 204 | "[(73, 'Jacob', 'Klein', 'M', 'paid'),\n", 205 | " (24, 'Layla', 'Griffin', 'F', 'paid'),\n", 206 | " (50, 'Ava', 'Robinson', 'F', 'free'),\n", 207 | " (54, 'Kaleb', 'Cook', 'M', 'free'),\n", 208 | " (32, 'Lily', 'Burns', 'F', 'free')]" 209 | ] 210 | }, 211 | "execution_count": 4, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "%sql SELECT * FROM users LIMIT 5;" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 5, 223 | "metadata": { 224 | "editable": true 225 | }, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 232 | "1 rows affected.\n" 233 | ] 234 | }, 235 | { 236 | "data": { 237 | "text/html": [ 238 | "\n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | "
song_idtitleartist_idyearduration
SOINLJW12A8C13314CCity SlickersAR8IEZO1187B99055E2008149.86404
" 254 | ], 255 | "text/plain": [ 256 | "[('SOINLJW12A8C13314C', 'City Slickers', 'AR8IEZO1187B99055E', 2008, Decimal('149.86404'))]" 257 | ] 258 | }, 259 | "execution_count": 5, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "%sql SELECT * FROM songs LIMIT 5;" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 6, 271 | "metadata": { 272 | "editable": true 273 | }, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 280 | "1 rows affected.\n" 281 | ] 282 | }, 283 | { 284 | "data": { 285 | "text/html": [ 286 | "\n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | "
artist_idnamelocationlatitudelongitude
AR8IEZO1187B99055EMarc ShaimanNoneNone
" 302 | ], 303 | "text/plain": [ 304 | "[('AR8IEZO1187B99055E', 'Marc Shaiman', '', None, None)]" 305 | ] 306 | }, 307 | "execution_count": 6, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "%sql SELECT * FROM artists LIMIT 5;" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 7, 319 | "metadata": { 320 | "editable": true 321 | }, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 328 | "5 rows affected.\n" 329 | ] 330 | }, 331 | { 332 | "data": { 333 | "text/html": [ 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | "
start_timehourdayweekmonthyearweekday
00:00:57.796000029481120183
00:01:30.796000029481120183
00:04:01.796000029481120183
00:04:55.796000029481120183
00:07:13.796000029481120183
" 390 | ], 391 | "text/plain": [ 392 | "[(datetime.time(0, 0, 57, 796000), 0, 29, 48, 11, 2018, 3),\n", 393 | " (datetime.time(0, 1, 30, 796000), 0, 29, 48, 11, 2018, 3),\n", 394 | " (datetime.time(0, 4, 1, 796000), 0, 29, 48, 11, 2018, 3),\n", 395 | " (datetime.time(0, 4, 55, 796000), 0, 29, 48, 11, 2018, 3),\n", 396 | " (datetime.time(0, 7, 13, 796000), 0, 29, 48, 11, 2018, 3)]" 397 | ] 398 | }, 399 | "execution_count": 7, 400 | "metadata": {}, 401 | "output_type": "execute_result" 402 | } 403 | ], 404 | "source": [ 405 | "%sql SELECT * FROM time LIMIT 5;" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": { 411 | "editable": true 412 | }, 413 | "source": [ 414 | "## REMEMBER: Restart this notebook to close connection to `sparkifydb`\n", 415 | "Each time you run the cells above, remember to restart this notebook to close the connection to your database. Otherwise, you won't be able to run your code in `create_tables.py`, `etl.py`, or `etl.ipynb` files since you can't make multiple connections to the same database (in this case, sparkifydb)." 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": { 422 | "editable": true 423 | }, 424 | "outputs": [], 425 | "source": [] 426 | } 427 | ], 428 | "metadata": { 429 | "kernelspec": { 430 | "display_name": "Python 3", 431 | "language": "python", 432 | "name": "python3" 433 | }, 434 | "language_info": { 435 | "codemirror_mode": { 436 | "name": "ipython", 437 | "version": 3 438 | }, 439 | "file_extension": ".py", 440 | "mimetype": "text/x-python", 441 | "name": "python", 442 | "nbconvert_exporter": "python", 443 | "pygments_lexer": "ipython3", 444 | "version": "3.6.3" 445 | } 446 | }, 447 | "nbformat": 4, 448 | "nbformat_minor": 4 449 | } 450 | -------------------------------------------------------------------------------- /Data Pipeline/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manuel-lang/Data-Engineering-Nanodegree/330b6b3ce020fb479868c44163aeb70e473dd111/Data Pipeline/.DS_Store -------------------------------------------------------------------------------- /Data Pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Data-Pipelines-with-Airflow -------------------------------------------------------------------------------- /Data Pipeline/create_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE public.artists ( 2 | artistid varchar(256) NOT NULL, 3 | name varchar(256), 4 | location varchar(256), 5 | lattitude numeric(18,0), 6 | longitude numeric(18,0) 7 | ); 8 | 9 | CREATE TABLE public.songplays ( 10 | playid varchar(32) NOT NULL, 11 | start_time timestamp NOT NULL, 12 | userid int4 NOT NULL, 13 | "level" varchar(256), 14 | songid varchar(256), 15 | artistid varchar(256), 16 | sessionid int4, 17 | location varchar(256), 18 | user_agent varchar(256), 19 | CONSTRAINT songplays_pkey PRIMARY KEY (playid) 20 | ); 21 | 22 | CREATE TABLE public.songs ( 23 | songid varchar(256) NOT NULL, 24 | title varchar(256), 25 | artistid varchar(256), 26 | "year" int4, 27 | duration numeric(18,0), 28 | CONSTRAINT songs_pkey PRIMARY KEY (songid) 29 | ); 30 | 31 | CREATE TABLE public.staging_events ( 32 | artist varchar(256), 33 | auth varchar(256), 34 | firstname varchar(256), 35 | gender varchar(256), 36 | iteminsession int4, 37 | lastname varchar(256), 38 | length numeric(18,0), 39 | "level" 
varchar(256), 40 | location varchar(256), 41 | "method" varchar(256), 42 | page varchar(256), 43 | registration numeric(18,0), 44 | sessionid int4, 45 | song varchar(256), 46 | status int4, 47 | ts int8, 48 | useragent varchar(256), 49 | userid int4 50 | ); 51 | 52 | CREATE TABLE public.staging_songs ( 53 | num_songs int4, 54 | artist_id varchar(256), 55 | artist_name varchar(256), 56 | artist_latitude numeric(18,0), 57 | artist_longitude numeric(18,0), 58 | artist_location varchar(256), 59 | song_id varchar(256), 60 | title varchar(256), 61 | duration numeric(18,0), 62 | "year" int4 63 | ); 64 | 65 | -- time dimension table, loaded by the Load_time_dim_table task 66 | CREATE TABLE public."time" ( 67 | start_time timestamp NOT NULL, 68 | "hour" int4, 69 | "day" int4, 70 | week int4, 71 | "month" int4, 72 | "year" int4, 73 | weekday int4, 74 | CONSTRAINT time_pkey PRIMARY KEY (start_time) 75 | ); 76 | 77 | CREATE TABLE public.users ( 78 | userid int4 NOT NULL, 79 | first_name varchar(256), 80 | last_name varchar(256), 81 | gender varchar(256), 82 | "level" varchar(256), 83 | CONSTRAINT users_pkey PRIMARY KEY (userid) 84 | ); 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /Data Pipeline/dags/udac_example_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import os 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators import (StageToRedshiftOperator, LoadFactOperator, 6 | LoadDimensionOperator, DataQualityOperator) 7 | from helpers import SqlQueries 8 | 9 | # AWS_KEY = os.environ.get('AWS_KEY') 10 | # AWS_SECRET = os.environ.get('AWS_SECRET') 11 | 12 | default_args = { 13 | 'owner': 'udacity', 14 | 'start_date': datetime(2019, 1, 12), 15 | 'depends_on_past': False, 16 | 'retries': 3, 17 | 'retry_delay': timedelta(minutes=5) 18 | } 19 | 20 | dag = DAG('udac_example_dag', 21 | default_args=default_args, 22 | description='Load and transform data in Redshift with Airflow', 23 | schedule_interval='0 * * * *', 24 | catchup=False 25 | ) 26 | 27 | start_operator = DummyOperator(task_id='Begin_execution', dag=dag) 28 | 29 | stage_events_to_redshift = StageToRedshiftOperator( 30 | task_id='Stage_events', 31 | dag=dag, 32 | table="staging_events", 33 | conn_id="redshift", 34 | aws_credentials_id="aws_credentials", 35 | s3_bucket="udacity-dend", 36 | s3_key="log_data", 37 | json_path="s3://udacity-dend/log_json_path.json" 38 | ) 39 | 40 | stage_songs_to_redshift = StageToRedshiftOperator( 41 | task_id='Stage_songs', 42 | dag=dag, 43 | table="staging_songs", 44 | conn_id="redshift", 45 | aws_credentials_id="aws_credentials", 46 | s3_bucket="udacity-dend", 47 | s3_key="song_data" 48 | ) 49 | 50 | load_songplays_table = LoadFactOperator( 51 | task_id='Load_songplays_fact_table', 52 | dag=dag, 53 | conn_id="redshift", 54 | table="songplays", 55 | query=SqlQueries.songplay_table_insert 56 | ) 57 | 58 | load_user_dimension_table = LoadDimensionOperator( 59 | task_id='Load_user_dim_table', 60 | dag=dag, 61 | conn_id="redshift", 62 | table="users", 63 | query=SqlQueries.user_table_insert, 64 | truncate=True 65 | ) 66 | 67 | load_song_dimension_table = LoadDimensionOperator( 68 | task_id='Load_song_dim_table', 69 | dag=dag, 70 | conn_id="redshift", 71 | table="songs", 72 | query=SqlQueries.song_table_insert, 73 | truncate=True 74 | ) 75 | 76 | load_artist_dimension_table = LoadDimensionOperator( 77 | task_id='Load_artist_dim_table', 78 | dag=dag, 79 | conn_id="redshift", 80 | table="artists", 81 | query=SqlQueries.artist_table_insert, 82 | truncate=True 83 | ) 84 | 85 | load_time_dimension_table = LoadDimensionOperator( 86 | task_id='Load_time_dim_table', 87 | dag=dag, 88 | conn_id="redshift", 89 | table="time", 90 | query=SqlQueries.time_table_insert, 91 | truncate=True 92 | ) 93 | 94 | run_quality_checks = DataQualityOperator( 95 | task_id='Run_data_quality_checks', 96 | dag=dag, 97 | conn_id="redshift", 98 | tables=["songplays", "users", "songs", "artists", "time"] 99 | ) 100 | 101 | end_operator = DummyOperator(task_id='Stop_execution', dag=dag) 102 | 103 | start_operator >> stage_events_to_redshift 104 | start_operator >> stage_songs_to_redshift 105 | 106 | stage_events_to_redshift >> load_songplays_table 107 | stage_songs_to_redshift >> load_songplays_table 108 | 109 | load_songplays_table >> load_song_dimension_table 110 | load_songplays_table >> load_user_dimension_table 111 | load_songplays_table >> load_artist_dimension_table 112 | load_songplays_table >> load_time_dimension_table 113 | 114 | load_song_dimension_table >> run_quality_checks 115 | load_user_dimension_table >> run_quality_checks 116 | load_artist_dimension_table >> run_quality_checks 117 | load_time_dimension_table >> run_quality_checks 118 | 119 | run_quality_checks >> end_operator 120 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from airflow.plugins_manager import AirflowPlugin 4 | 5 | import operators 6 | import helpers 7 | 8 | # Defining the plugin class 9 | class UdacityPlugin(AirflowPlugin): 10 | name = "udacity_plugin" 11 | operators = [ 12 | operators.StageToRedshiftOperator, 13 | operators.LoadFactOperator, 14 | operators.LoadDimensionOperator, 15 | operators.DataQualityOperator 16 | ] 17 | helpers = [ 18 | helpers.SqlQueries 19 | ] 20 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.sql_queries import SqlQueries 2 | 3 | __all__ = [ 4 | 'SqlQueries', 5 | ] -------------------------------------------------------------------------------- /Data Pipeline/plugins/helpers/sql_queries.py: -------------------------------------------------------------------------------- 1 | class SqlQueries: 2 | songplay_table_insert = (""" 3 | SELECT 4 | md5(events.sessionid || events.start_time) songplay_id, 5 | events.start_time, 6 | events.userid, 7 | events.level, 8 | songs.song_id, 9 | songs.artist_id, 10 | events.sessionid, 11 | events.location, 12 | events.useragent 13 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 14 | FROM staging_events 15 | WHERE page='NextSong') events 16 | LEFT JOIN staging_songs songs 17 | ON events.song = songs.title 18 | AND events.artist = songs.artist_name 19 | AND events.length = songs.duration 20 | """) 21 | 22 | user_table_insert = (""" 23 | SELECT distinct userid, firstname, lastname, gender, level 24 | FROM staging_events 25 | WHERE page='NextSong' 26 | """) 27 | 28 | song_table_insert = (""" 29 | SELECT distinct song_id, title, artist_id, year, duration 30 | FROM staging_songs 31 | """) 32 | 33 | artist_table_insert = (""" 34 | SELECT distinct artist_id, artist_name, artist_location,
artist_latitude, artist_longitude 35 | FROM staging_songs 36 | """) 37 | 38 | time_table_insert = (""" 39 | SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time), 40 | extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time) 41 | FROM songplays 42 | """) -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from operators.stage_redshift import StageToRedshiftOperator 2 | from operators.load_fact import LoadFactOperator 3 | from operators.load_dimension import LoadDimensionOperator 4 | from operators.data_quality import DataQualityOperator 5 | 6 | __all__ = [ 7 | 'StageToRedshiftOperator', 8 | 'LoadFactOperator', 9 | 'LoadDimensionOperator', 10 | 'DataQualityOperator' 11 | ] 12 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class DataQualityOperator(BaseOperator): 6 | 7 | ui_color = '#89DA59' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | conn_id, 12 | tables, 13 | *args, **kwargs): 14 | 15 | super(DataQualityOperator, self).__init__(*args, **kwargs) 16 | self.conn_id = conn_id 17 | self.tables = tables 18 | 19 | def execute(self, context): 20 | redshift_hook = PostgresHook(self.conn_id) 21 | for table in self.tables: 22 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table}") 23 | if len(records) < 1 or len(records[0]) < 1: 24 | raise ValueError(f"Data quality check failed. {table} returned no results") 25 | if records[0][0] < 1: 26 | raise ValueError(f"Data quality check failed. {table} contained 0 rows") 27 | self.log.info(f"Data quality on table {table} check passed with {records[0][0]} records") 28 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/load_dimension.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadDimensionOperator(BaseOperator): 6 | 7 | ui_color = '#80BD9E' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | conn_id, 12 | table, 13 | query, 14 | truncate = False, 15 | *args, **kwargs): 16 | 17 | super(LoadDimensionOperator, self).__init__(*args, **kwargs) 18 | self.conn_id = conn_id 19 | self.table = table 20 | self.query = query 21 | self.truncate = truncate 22 | 23 | def execute(self, context): 24 | redshift = PostgresHook(postgres_conn_id=self.conn_id) 25 | if self.truncate: 26 | redshift.run(f"TRUNCATE TABLE {self.table}") 27 | redshift.run(f"INSERT INTO {self.table} {self.query}") 28 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/load_fact.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadFactOperator(BaseOperator): 6 | 7 | ui_color = '#F98866' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | conn_id, 12 | table, 13 | query, 14 | *args, **kwargs): 15 | 16 | super(LoadFactOperator, self).__init__(*args, **kwargs) 17 | self.conn_id = conn_id 18 | self.table = table 19 | self.query = query 20 | 21 | def execute(self, context): 22 | redshift = PostgresHook(postgres_conn_id=self.conn_id) 23 | redshift.run(f"INSERT INTO {self.table} {self.query}") 24 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/stage_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | from airflow.contrib.hooks.aws_hook import AwsHook 5 | class StageToRedshiftOperator(BaseOperator): 6 | ui_color = '#358140' 7 | 8 | @apply_defaults 9 | def __init__(self, 10 | conn_id, 11 | aws_credentials_id, 12 | table, 13 | s3_bucket, 14 | s3_key, 15 | json_path="auto", 16 | *args, **kwargs): 17 | 18 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs) 19 | self.conn_id = conn_id 20 | self.table = table 21 | self.s3_bucket = s3_bucket 22 | self.s3_key = s3_key 23 | self.aws_credentials_id = aws_credentials_id 24 | self.json_path = json_path 25 | 26 | def execute(self, context): 27 | aws_hook = AwsHook(self.aws_credentials_id) 28 | credentials = aws_hook.get_credentials() 29 | redshift = PostgresHook(postgres_conn_id=self.conn_id) 30 | redshift.run(f"DELETE FROM {self.table}") 31 | self.s3_key = self.s3_key.format(**context) 32 | s3_path = f"s3://{self.s3_bucket}/{self.s3_key}" 33 | redshift.run(f"COPY {self.table} FROM '{s3_path}' ACCESS_KEY_ID '{credentials.access_key}' \ 34 | SECRET_ACCESS_KEY '{credentials.secret_key}' FORMAT AS JSON '{self.json_path}'") 35 | -------------------------------------------------------------------------------- /Data Warehouse/README.md: 
-------------------------------------------------------------------------------- 1 | # Data Warehouse 2 | 3 | Project Data Warehouse as part of the Udacity Data Engineer Nanodegree. 4 | 5 | ## Project Summary 6 | An implementation of a Data Warehouse leveraging AWS Redshift. This project contains the ETL pipeline that extracts data from S3, stages it in Redshift, and transforms it into a set of dimensional tables for the analytics team. 7 | 8 | The data on S3 contains song and log information from a music streaming app. This solution enables the app's team to process large amounts of information efficiently. 9 | 10 | ## Purpose of this project 11 | This project processes data from different sources (in this case multiple S3 buckets) into a form that can be analyzed easily and efficiently. The startup Sparkify can thus analyze the run-time data of their application with little effort. 12 | 13 | ## Project instructions 14 | 1. Set up a Redshift cluster on AWS and insert the connection details in `dwh.cfg`. 15 | 2. Create the needed database structure by executing `create_tables.py`. 16 | 3. Process the data from the configured S3 data sources by executing `etl.py`. 17 | 18 | ## Database schema 19 | | Table | Description | 20 | | ---- | ---- | 21 | | staging_events | staging table for event data | 22 | | staging_songs | staging table for song data | 23 | | songplays | information on how songs were played, e.g. when, by which user and in which session | 24 | | users | user-related information such as name, gender and level | 25 | | songs | song-related information containing name, artist, year and duration | 26 | | artists | artist name and location (geo-coordinates and textual location) | 27 | | time | time-related info for timestamps | 28 | 29 | ## ETL pipeline 30 | 1. Load both song and log data from S3 buckets. 31 | 2. Stage the loaded data. 32 | 3. Transform the data into the schema described above. 33 | 34 | ## Example queries 35 | 36 | * Find all users at a certain location: ```SELECT DISTINCT users.user_id FROM users JOIN songplays ON songplays.user_id = users.user_id WHERE songplays.location = ``` 37 | * Find all songs by a given artist: ```SELECT songs.song_id FROM songs JOIN artists ON songs.artist_id = artists.artist_id WHERE artists.name = ``` 38 | -------------------------------------------------------------------------------- /Data Warehouse/create_tables.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import create_table_queries, drop_table_queries 4 | 5 | 6 | def drop_tables(cur, conn): 7 | """ 8 | Executes all drop table queries. 9 | :param cur: database cursor 10 | :param conn: database connector 11 | """ 12 | for query in drop_table_queries: 13 | cur.execute(query) 14 | conn.commit() 15 | 16 | 17 | def create_tables(cur, conn): 18 | """ 19 | Executes all create table queries. 
20 | :param cur: database cursor 21 | :param conn: database connector 22 | :return: 23 | """ 24 | for query in create_table_queries: 25 | cur.execute(query) 26 | conn.commit() 27 | 28 | 29 | def main(): 30 | config = configparser.ConfigParser() 31 | config.read('dwh.cfg') 32 | 33 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 34 | cur = conn.cursor() 35 | 36 | drop_tables(cur, conn) 37 | create_tables(cur, conn) 38 | 39 | conn.close() 40 | 41 | 42 | if __name__ == "__main__": 43 | main() -------------------------------------------------------------------------------- /Data Warehouse/dwh.cfg: -------------------------------------------------------------------------------- 1 | [CLUSTER] 2 | HOST= 3 | DB_NAME= 4 | DB_USER= 5 | DB_PASSWORD= 6 | DB_PORT= 7 | 8 | [IAM_ROLE] 9 | ARN='' 10 | 11 | [S3] 12 | LOG_DATA='s3://udacity-dend/log_data' 13 | LOG_JSONPATH='s3://udacity-dend/log_json_path.json' 14 | SONG_DATA='s3://udacity-dend/song_data' -------------------------------------------------------------------------------- /Data Warehouse/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import copy_table_queries, insert_table_queries 4 | 5 | 6 | def load_staging_tables(cur, conn): 7 | """ 8 | Executes all copy table queries. 9 | :param cur: database cursor 10 | :param conn: database connector 11 | """ 12 | for query in copy_table_queries: 13 | cur.execute(query) 14 | conn.commit() 15 | 16 | 17 | def insert_tables(cur, conn): 18 | """ 19 | Executes all insert table queries. 20 | :param cur: database cursor 21 | :param conn: database connector 22 | """ 23 | for query in insert_table_queries: 24 | cur.execute(query) 25 | conn.commit() 26 | 27 | 28 | def main(): 29 | config = configparser.ConfigParser() 30 | config.read('dwh.cfg') 31 | 32 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 33 | cur = conn.cursor() 34 | 35 | load_staging_tables(cur, conn) 36 | insert_tables(cur, conn) 37 | 38 | conn.close() 39 | 40 | 41 | if __name__ == "__main__": 42 | main() -------------------------------------------------------------------------------- /Data Warehouse/sql_queries.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | 3 | # CONFIG 4 | config = configparser.ConfigParser() 5 | config.read('dwh.cfg') 6 | 7 | # DROP TABLES 8 | 9 | staging_events_table_drop = "DROP TABLE IF EXISTS staging_events" 10 | staging_songs_table_drop = "DROP TABLE IF EXISTS staging_songs" 11 | songplay_table_drop = "DROP TABLE IF EXISTS songplays" 12 | user_table_drop = "DROP TABLE IF EXISTS users" 13 | song_table_drop = "DROP TABLE IF EXISTS songs" 14 | artist_table_drop = "DROP TABLE IF EXISTS artists" 15 | time_table_drop = "DROP TABLE IF EXISTS time" 16 | 17 | # CREATE TABLES 18 | 19 | staging_events_table_create = (""" 20 | CREATE TABLE IF NOT EXISTS staging_events ( 21 | event_id BIGINT IDENTITY(0,1), 22 | artist VARCHAR, 23 | auth VARCHAR, 24 | firstName VARCHAR, 25 | gender VARCHAR, 26 | itemInSession VARCHAR, 27 | lastName VARCHAR, 28 | length VARCHAR, 29 | level VARCHAR, 30 | location VARCHAR, 31 | method VARCHAR, 32 | page VARCHAR, 33 | registration VARCHAR, 34 | sessionId INTEGER SORTKEY DISTKEY, 35 | song VARCHAR, 36 | status INTEGER, 37 | ts BIGINT, 38 | userAgent VARCHAR, 39 | userId INTEGER 40 | ); 41 | """) 42 | 43 | 
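# Redshift table design note: staging_events above and staging_songs below mirror the raw S3 JSON and declare SORTKEY/DISTKEY columns, while the small dimension tables further down (users, artists, time) are created with "diststyle all" so every node keeps a full copy and joins avoid redistribution.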
staging_songs_table_create = (""" 44 | CREATE TABLE IF NOT EXISTS staging_songs ( 45 | num_songs INTEGER, 46 | artist_id VARCHAR SORTKEY DISTKEY, 47 | artist_latitude VARCHAR, 48 | artist_longitude VARCHAR, 49 | artist_location VARCHAR(500), 50 | artist_name VARCHAR(500), 51 | song_id VARCHAR, 52 | title VARCHAR(500), 53 | duration DECIMAL(9), 54 | year INTEGER 55 | ); 56 | """) 57 | 58 | songplay_table_create = (""" 59 | CREATE TABLE IF NOT EXISTS songplays ( 60 | songplay_id INTEGER IDENTITY(0,1) NOT NULL SORTKEY, 61 | start_time TIMESTAMP NOT NULL, 62 | user_id VARCHAR(50) NOT NULL DISTKEY, 63 | level VARCHAR(10) NOT NULL, 64 | song_id VARCHAR(40) NOT NULL, 65 | artist_id VARCHAR(50) NOT NULL, 66 | session_id VARCHAR(50) NOT NULL, 67 | location VARCHAR(100) NULL, 68 | user_agent VARCHAR(255) NULL 69 | ); 70 | """) 71 | 72 | user_table_create = (""" 73 | CREATE TABLE IF NOT EXISTS users ( 74 | user_id INTEGER NOT NULL SORTKEY, 75 | first_name VARCHAR(50) NULL, 76 | last_name VARCHAR(80) NULL, 77 | gender VARCHAR(10) NULL, 78 | level VARCHAR(10) NULL 79 | ) diststyle all; 80 | """) 81 | 82 | song_table_create = (""" 83 | CREATE TABLE IF NOT EXISTS songs ( 84 | song_id VARCHAR(50) NOT NULL SORTKEY, 85 | title VARCHAR(500) NOT NULL, 86 | artist_id VARCHAR(50) NOT NULL, 87 | year INTEGER NOT NULL, 88 | duration DECIMAL(9) NOT NULL 89 | ); 90 | """) 91 | 92 | artist_table_create = (""" 93 | CREATE TABLE IF NOT EXISTS artists ( 94 | artist_id VARCHAR(50) NOT NULL SORTKEY, 95 | name VARCHAR(500) NULL, 96 | location VARCHAR(500) NULL, 97 | latitude DECIMAL(9) NULL, 98 | longitude DECIMAL(9) NULL 99 | ) diststyle all; 100 | """) 101 | 102 | time_table_create = (""" 103 | CREATE TABLE IF NOT EXISTS time ( 104 | start_time TIMESTAMP NOT NULL SORTKEY, 105 | hour SMALLINT NULL, 106 | day SMALLINT NULL, 107 | week SMALLINT NULL, 108 | month SMALLINT NULL, 109 | year SMALLINT NULL, 110 | weekday SMALLINT NULL 111 | ) diststyle all; 112 | """) 113 | 114 | # STAGING TABLES 115 | 116 | staging_events_copy = (""" 117 | COPY staging_events FROM {} 118 | credentials 'aws_iam_role={}' 119 | format as json {} 120 | STATUPDATE ON 121 | region 'us-west-2'; 122 | """).format(config.get('S3', 'LOG_DATA'), config.get('IAM_ROLE', 'ARN'), config.get('S3', 'LOG_JSONPATH')) 123 | 124 | staging_songs_copy = (""" 125 | COPY staging_songs FROM {} 126 | credentials 'aws_iam_role={}' 127 | format as json 'auto' 128 | ACCEPTINVCHARS AS '^' 129 | STATUPDATE ON 130 | region 'us-west-2'; 131 | """).format(config.get('S3', 'SONG_DATA'), config.get('IAM_ROLE', 'ARN')) 132 | 133 | # FINAL TABLES 134 | 135 | songplay_table_insert = (""" 136 | INSERT INTO songplays ( 137 | start_time, 138 | user_id, 139 | level, 140 | song_id, 141 | artist_id, 142 | session_id, 143 | location, 144 | user_agent) 145 | SELECT DISTINCT TIMESTAMP 'epoch' + se.ts/1000 \ 146 | * INTERVAL '1 second' AS start_time, 147 | se.userId AS user_id, 148 | se.level AS level, 149 | ss.song_id AS song_id, 150 | ss.artist_id AS artist_id, 151 | se.sessionId AS session_id, 152 | se.location AS location, 153 | se.userAgent AS user_agent 154 | FROM staging_events AS se 155 | JOIN staging_songs AS ss ON (se.song = ss.title AND se.artist = ss.artist_name) 156 | WHERE se.page = 'NextSong'; 157 | """) 158 | 159 | user_table_insert = (""" 160 | INSERT INTO users ( 161 | user_id, 162 | first_name, 163 | last_name, 164 | gender, 165 | level) 166 | SELECT DISTINCT se.userId AS user_id, 167 | se.firstName AS first_name, 168 | se.lastName AS last_name, 169 | se.gender AS gender, 170 | se.level AS level 171 | FROM staging_events AS se 172 | WHERE se.page = 'NextSong'; 173 | """) 174 | 175 | song_table_insert = (""" 176 | INSERT INTO songs ( 177 | song_id, 178 | title, 179 | artist_id, 180 | year, 181 | duration) 182 | SELECT DISTINCT ss.song_id AS song_id, 183 | ss.title AS title, 184 | ss.artist_id AS artist_id, 185 | ss.year AS year, 186 | ss.duration AS duration 187 | FROM staging_songs AS ss; 188 | """) 189 | 190 | artist_table_insert = (""" 191 | INSERT INTO artists ( 192 | artist_id, 193 | name, 194 | location, 195 | latitude, 196 | longitude) 197 | SELECT DISTINCT ss.artist_id AS artist_id, 198 | ss.artist_name AS name, 199 | ss.artist_location AS location, 200 | ss.artist_latitude AS latitude, 201 | ss.artist_longitude AS longitude 202 | FROM staging_songs AS ss; 203 | """) 204 | 205 | time_table_insert = (""" 206 | INSERT INTO time ( 207 | start_time, 208 | hour, 209 | day, 210 | week, 211 | month, 212 | year, 213 | weekday) 214 | SELECT DISTINCT TIMESTAMP 'epoch' + se.ts/1000 \ 215 | * INTERVAL '1 second' AS start_time, 216 | EXTRACT(hour FROM start_time) AS hour, 217 | EXTRACT(day FROM start_time) AS day, 218 | EXTRACT(week FROM start_time) AS week, 219 | EXTRACT(month FROM start_time) AS month, 220 | EXTRACT(year FROM start_time) AS year, 221 | EXTRACT(dayofweek FROM start_time) AS weekday 222 | FROM staging_events AS se 223 | WHERE se.page = 'NextSong'; 224 | """) 225 | 226 | # QUERY LISTS 227 | 228 | create_table_queries = [staging_events_table_create, staging_songs_table_create, songplay_table_create, 229 | user_table_create, song_table_create, artist_table_create, time_table_create] 230 | drop_table_queries = [staging_events_table_drop, staging_songs_table_drop, songplay_table_drop, user_table_drop, 231 | song_table_drop, artist_table_drop, time_table_drop] 232 | copy_table_queries = [staging_events_copy, staging_songs_copy] 233 | insert_table_queries = [songplay_table_insert, user_table_insert, song_table_insert, artist_table_insert, 234 | time_table_insert] 235 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Nanodegree 2 | All projects of Udacity's [Data Engineering Nanodegree](https://www.udacity.com/course/data-engineer-nanodegree--nd027). 3 | 4 | ## Projects 5 | 6 | ### Data Modeling 7 | Learn to create relational and NoSQL data models to fit the diverse needs of data consumers. Use ETL to build databases in PostgreSQL and Apache Cassandra. 8 | 9 | [DATA MODELING WITH POSTGRES](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Modeling%20with%20Postgres) 10 | 11 | [DATA MODELING WITH APACHE CASSANDRA](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Modeling%20with%20Cassandra) 12 | 13 | ### Cloud Data Warehouses 14 | Sharpen your data warehousing skills and deepen your understanding of data infrastructure. Create cloud-based data warehouses on Amazon Web Services (AWS). 15 | 16 | [BUILD A CLOUD DATA WAREHOUSE](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Warehouse) 17 | 18 | ### Spark and Data Lakes 19 | Understand the big data ecosystem and how to use Spark to work with massive datasets. Store big data in a data lake and query it with Spark. 
20 | 21 | [BUILD A DATA LAKE](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Lake) 22 | 23 | ### Data Pipelines with Airflow 24 | Schedule, automate, and monitor data pipelines using Apache Airflow. Run data quality checks, track data lineage, and work with data pipelines in production. 25 | 26 | [DATA PIPELINES WITH AIRFLOW](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Pipeline) 27 | 28 | ### Capstone Project 29 | Combine what you've learned throughout the program to build your own data engineering portfolio project. 30 | 31 | [DATA ENGINEERING CAPSTONE](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Capstone%20Project) 32 | --------------------------------------------------------------------------------