├── Capstone Project ├── Capstone Project.ipynb ├── I94_SAS_Labels_Descriptions.SAS ├── README.md ├── airport-codes_csv.csv ├── create_tables.py ├── immigration_data_sample.csv ├── sql_queries.py └── us-cities-demographics.csv ├── Data Lake ├── README.md ├── dl.cfg └── etl.py ├── Data Modeling with Cassandra ├── .DS_Store ├── Project_1B_ Project_Template.ipynb ├── README.md └── images │ └── image_event_datafile_new.jpg ├── Data Modeling with Postgres ├── README.md ├── create_tables.py ├── etl.ipynb ├── etl.py ├── requirements.txt ├── sql_queries.py └── test.ipynb ├── Data Pipeline ├── .DS_Store ├── README.md ├── create_tables.sql ├── dags │ └── udac_example_dag.py └── plugins │ ├── __init__.py │ ├── helpers │ ├── __init__.py │ └── sql_queries.py │ └── operators │ ├── __init__.py │ ├── data_quality.py │ ├── load_dimension.py │ ├── load_fact.py │ └── stage_redshift.py ├── Data Warehouse ├── README.md ├── create_tables.py ├── dwh.cfg ├── etl.py └── sql_queries.py └── README.md /Capstone Project/I94_SAS_Labels_Descriptions.SAS: -------------------------------------------------------------------------------- 1 | libname library 'Your file location' ; 2 | proc format library=library ; 3 | 4 | /* I94YR - 4 digit year */ 5 | 6 | /* I94MON - Numeric month */ 7 | 8 | /* I94CIT & I94RES - This format shows all the valid and invalid codes for processing */ 9 | value i94cntyl 10 | 582 = 'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)' 11 | 236 = 'AFGHANISTAN' 12 | 101 = 'ALBANIA' 13 | 316 = 'ALGERIA' 14 | 102 = 'ANDORRA' 15 | 324 = 'ANGOLA' 16 | 529 = 'ANGUILLA' 17 | 518 = 'ANTIGUA-BARBUDA' 18 | 687 = 'ARGENTINA ' 19 | 151 = 'ARMENIA' 20 | 532 = 'ARUBA' 21 | 438 = 'AUSTRALIA' 22 | 103 = 'AUSTRIA' 23 | 152 = 'AZERBAIJAN' 24 | 512 = 'BAHAMAS' 25 | 298 = 'BAHRAIN' 26 | 274 = 'BANGLADESH' 27 | 513 = 'BARBADOS' 28 | 104 = 'BELGIUM' 29 | 581 = 'BELIZE' 30 | 386 = 'BENIN' 31 | 509 = 'BERMUDA' 32 | 153 = 'BELARUS' 33 | 242 = 'BHUTAN' 34 | 688 = 'BOLIVIA' 35 | 717 = 'BONAIRE, ST EUSTATIUS, SABA' 36 | 164 = 'BOSNIA-HERZEGOVINA' 37 | 336 = 'BOTSWANA' 38 | 689 = 'BRAZIL' 39 | 525 = 'BRITISH VIRGIN ISLANDS' 40 | 217 = 'BRUNEI' 41 | 105 = 'BULGARIA' 42 | 393 = 'BURKINA FASO' 43 | 243 = 'BURMA' 44 | 375 = 'BURUNDI' 45 | 310 = 'CAMEROON' 46 | 326 = 'CAPE VERDE' 47 | 526 = 'CAYMAN ISLANDS' 48 | 383 = 'CENTRAL AFRICAN REPUBLIC' 49 | 384 = 'CHAD' 50 | 690 = 'CHILE' 51 | 245 = 'CHINA, PRC' 52 | 721 = 'CURACAO' 53 | 270 = 'CHRISTMAS ISLAND' 54 | 271 = 'COCOS ISLANDS' 55 | 691 = 'COLOMBIA' 56 | 317 = 'COMOROS' 57 | 385 = 'CONGO' 58 | 467 = 'COOK ISLANDS' 59 | 575 = 'COSTA RICA' 60 | 165 = 'CROATIA' 61 | 584 = 'CUBA' 62 | 218 = 'CYPRUS' 63 | 140 = 'CZECH REPUBLIC' 64 | 723 = 'FAROE ISLANDS (PART OF DENMARK)' 65 | 108 = 'DENMARK' 66 | 322 = 'DJIBOUTI' 67 | 519 = 'DOMINICA' 68 | 585 = 'DOMINICAN REPUBLIC' 69 | 240 = 'EAST TIMOR' 70 | 692 = 'ECUADOR' 71 | 368 = 'EGYPT' 72 | 576 = 'EL SALVADOR' 73 | 399 = 'EQUATORIAL GUINEA' 74 | 372 = 'ERITREA' 75 | 109 = 'ESTONIA' 76 | 369 = 'ETHIOPIA' 77 | 604 = 'FALKLAND ISLANDS' 78 | 413 = 'FIJI' 79 | 110 = 'FINLAND' 80 | 111 = 'FRANCE' 81 | 601 = 'FRENCH GUIANA' 82 | 411 = 'FRENCH POLYNESIA' 83 | 387 = 'GABON' 84 | 338 = 'GAMBIA' 85 | 758 = 'GAZA STRIP' 86 | 154 = 'GEORGIA' 87 | 112 = 'GERMANY' 88 | 339 = 'GHANA' 89 | 143 = 'GIBRALTAR' 90 | 113 = 'GREECE' 91 | 520 = 'GRENADA' 92 | 507 = 'GUADELOUPE' 93 | 577 = 'GUATEMALA' 94 | 382 = 'GUINEA' 95 | 327 = 'GUINEA-BISSAU' 96 | 603 = 'GUYANA' 97 | 586 = 'HAITI' 98 | 726 = 'HEARD AND MCDONALD IS.' 
99 | 149 = 'HOLY SEE/VATICAN' 100 | 528 = 'HONDURAS' 101 | 206 = 'HONG KONG' 102 | 114 = 'HUNGARY' 103 | 115 = 'ICELAND' 104 | 213 = 'INDIA' 105 | 759 = 'INDIAN OCEAN AREAS (FRENCH)' 106 | 729 = 'INDIAN OCEAN TERRITORY' 107 | 204 = 'INDONESIA' 108 | 249 = 'IRAN' 109 | 250 = 'IRAQ' 110 | 116 = 'IRELAND' 111 | 251 = 'ISRAEL' 112 | 117 = 'ITALY' 113 | 388 = 'IVORY COAST' 114 | 514 = 'JAMAICA' 115 | 209 = 'JAPAN' 116 | 253 = 'JORDAN' 117 | 201 = 'KAMPUCHEA' 118 | 155 = 'KAZAKHSTAN' 119 | 340 = 'KENYA' 120 | 414 = 'KIRIBATI' 121 | 732 = 'KOSOVO' 122 | 272 = 'KUWAIT' 123 | 156 = 'KYRGYZSTAN' 124 | 203 = 'LAOS' 125 | 118 = 'LATVIA' 126 | 255 = 'LEBANON' 127 | 335 = 'LESOTHO' 128 | 370 = 'LIBERIA' 129 | 381 = 'LIBYA' 130 | 119 = 'LIECHTENSTEIN' 131 | 120 = 'LITHUANIA' 132 | 121 = 'LUXEMBOURG' 133 | 214 = 'MACAU' 134 | 167 = 'MACEDONIA' 135 | 320 = 'MADAGASCAR' 136 | 345 = 'MALAWI' 137 | 273 = 'MALAYSIA' 138 | 220 = 'MALDIVES' 139 | 392 = 'MALI' 140 | 145 = 'MALTA' 141 | 472 = 'MARSHALL ISLANDS' 142 | 511 = 'MARTINIQUE' 143 | 389 = 'MAURITANIA' 144 | 342 = 'MAURITIUS' 145 | 760 = 'MAYOTTE (AFRICA - FRENCH)' 146 | 473 = 'MICRONESIA, FED. STATES OF' 147 | 157 = 'MOLDOVA' 148 | 122 = 'MONACO' 149 | 299 = 'MONGOLIA' 150 | 735 = 'MONTENEGRO' 151 | 521 = 'MONTSERRAT' 152 | 332 = 'MOROCCO' 153 | 329 = 'MOZAMBIQUE' 154 | 371 = 'NAMIBIA' 155 | 440 = 'NAURU' 156 | 257 = 'NEPAL' 157 | 123 = 'NETHERLANDS' 158 | 508 = 'NETHERLANDS ANTILLES' 159 | 409 = 'NEW CALEDONIA' 160 | 464 = 'NEW ZEALAND' 161 | 579 = 'NICARAGUA' 162 | 390 = 'NIGER' 163 | 343 = 'NIGERIA' 164 | 470 = 'NIUE' 165 | 275 = 'NORTH KOREA' 166 | 124 = 'NORWAY' 167 | 256 = 'OMAN' 168 | 258 = 'PAKISTAN' 169 | 474 = 'PALAU' 170 | 743 = 'PALESTINE' 171 | 504 = 'PANAMA' 172 | 441 = 'PAPUA NEW GUINEA' 173 | 693 = 'PARAGUAY' 174 | 694 = 'PERU' 175 | 260 = 'PHILIPPINES' 176 | 416 = 'PITCAIRN ISLANDS' 177 | 107 = 'POLAND' 178 | 126 = 'PORTUGAL' 179 | 297 = 'QATAR' 180 | 748 = 'REPUBLIC OF SOUTH SUDAN' 181 | 321 = 'REUNION' 182 | 127 = 'ROMANIA' 183 | 158 = 'RUSSIA' 184 | 376 = 'RWANDA' 185 | 128 = 'SAN MARINO' 186 | 330 = 'SAO TOME AND PRINCIPE' 187 | 261 = 'SAUDI ARABIA' 188 | 391 = 'SENEGAL' 189 | 142 = 'SERBIA AND MONTENEGRO' 190 | 745 = 'SERBIA' 191 | 347 = 'SEYCHELLES' 192 | 348 = 'SIERRA LEONE' 193 | 207 = 'SINGAPORE' 194 | 141 = 'SLOVAKIA' 195 | 166 = 'SLOVENIA' 196 | 412 = 'SOLOMON ISLANDS' 197 | 397 = 'SOMALIA' 198 | 373 = 'SOUTH AFRICA' 199 | 276 = 'SOUTH KOREA' 200 | 129 = 'SPAIN' 201 | 244 = 'SRI LANKA' 202 | 346 = 'ST. HELENA' 203 | 522 = 'ST. KITTS-NEVIS' 204 | 523 = 'ST. LUCIA' 205 | 502 = 'ST. PIERRE AND MIQUELON' 206 | 524 = 'ST. 
VINCENT-GRENADINES' 207 | 716 = 'SAINT BARTHELEMY' 208 | 736 = 'SAINT MARTIN' 209 | 749 = 'SAINT MAARTEN' 210 | 350 = 'SUDAN' 211 | 602 = 'SURINAME' 212 | 351 = 'SWAZILAND' 213 | 130 = 'SWEDEN' 214 | 131 = 'SWITZERLAND' 215 | 262 = 'SYRIA' 216 | 268 = 'TAIWAN' 217 | 159 = 'TAJIKISTAN' 218 | 353 = 'TANZANIA' 219 | 263 = 'THAILAND' 220 | 304 = 'TOGO' 221 | 417 = 'TONGA' 222 | 516 = 'TRINIDAD AND TOBAGO' 223 | 323 = 'TUNISIA' 224 | 264 = 'TURKEY' 225 | 161 = 'TURKMENISTAN' 226 | 527 = 'TURKS AND CAICOS ISLANDS' 227 | 420 = 'TUVALU' 228 | 352 = 'UGANDA' 229 | 162 = 'UKRAINE' 230 | 296 = 'UNITED ARAB EMIRATES' 231 | 135 = 'UNITED KINGDOM' 232 | 695 = 'URUGUAY' 233 | 163 = 'UZBEKISTAN' 234 | 410 = 'VANUATU' 235 | 696 = 'VENEZUELA' 236 | 266 = 'VIETNAM' 237 | 469 = 'WALLIS AND FUTUNA ISLANDS' 238 | 757 = 'WEST INDIES (FRENCH)' 239 | 333 = 'WESTERN SAHARA' 240 | 465 = 'WESTERN SAMOA' 241 | 216 = 'YEMEN' 242 | 139 = 'YUGOSLAVIA' 243 | 301 = 'ZAIRE' 244 | 344 = 'ZAMBIA' 245 | 315 = 'ZIMBABWE' 246 | 403 = 'INVALID: AMERICAN SAMOA' 247 | 712 = 'INVALID: ANTARCTICA' 248 | 700 = 'INVALID: BORN ON BOARD SHIP' 249 | 719 = 'INVALID: BOUVET ISLAND (ANTARCTICA/NORWAY TERR.)' 250 | 574 = 'INVALID: CANADA' 251 | 720 = 'INVALID: CANTON AND ENDERBURY ISLS' 252 | 106 = 'INVALID: CZECHOSLOVAKIA' 253 | 739 = 'INVALID: DRONNING MAUD LAND (ANTARCTICA-NORWAY)' 254 | 394 = 'INVALID: FRENCH SOUTHERN AND ANTARCTIC' 255 | 501 = 'INVALID: GREENLAND' 256 | 404 = 'INVALID: GUAM' 257 | 730 = 'INVALID: INTERNATIONAL WATERS' 258 | 731 = 'INVALID: JOHNSON ISLAND' 259 | 471 = 'INVALID: MARIANA ISLANDS, NORTHERN' 260 | 737 = 'INVALID: MIDWAY ISLANDS' 261 | 753 = 'INVALID: MINOR OUTLYING ISLANDS - USA' 262 | 740 = 'INVALID: NEUTRAL ZONE (S. ARABIA/IRAQ)' 263 | 710 = 'INVALID: NON-QUOTA IMMIGRANT' 264 | 505 = 'INVALID: PUERTO RICO' 265 | 0 = 'INVALID: STATELESS' 266 | 705 = 'INVALID: STATELESS' 267 | 583 = 'INVALID: UNITED STATES' 268 | 407 = 'INVALID: UNITED STATES' 269 | 999 = 'INVALID: UNKNOWN' 270 | 239 = 'INVALID: UNKNOWN COUNTRY' 271 | 134 = 'INVALID: USSR' 272 | 506 = 'INVALID: U.S. 
VIRGIN ISLANDS' 273 | 755 = 'INVALID: WAKE ISLAND' 274 | 311 = 'Collapsed Tanzania (should not show)' 275 | 741 = 'Collapsed Curacao (should not show)' 276 | 54 = 'No Country Code (54)' 277 | 100 = 'No Country Code (100)' 278 | 187 = 'No Country Code (187)' 279 | 190 = 'No Country Code (190)' 280 | 200 = 'No Country Code (200)' 281 | 219 = 'No Country Code (219)' 282 | 238 = 'No Country Code (238)' 283 | 277 = 'No Country Code (277)' 284 | 293 = 'No Country Code (293)' 285 | 300 = 'No Country Code (300)' 286 | 319 = 'No Country Code (319)' 287 | 365 = 'No Country Code (365)' 288 | 395 = 'No Country Code (395)' 289 | 400 = 'No Country Code (400)' 290 | 485 = 'No Country Code (485)' 291 | 503 = 'No Country Code (503)' 292 | 589 = 'No Country Code (589)' 293 | 592 = 'No Country Code (592)' 294 | 791 = 'No Country Code (791)' 295 | 849 = 'No Country Code (849)' 296 | 914 = 'No Country Code (914)' 297 | 944 = 'No Country Code (944)' 298 | 996 = 'No Country Code (996)' ; 299 | 300 | 301 | /* I94PORT - This format shows all the valid and invalid codes for processing */ 302 | value $i94prtl 303 | 'ALC' = 'ALCAN, AK ' 304 | 'ANC' = 'ANCHORAGE, AK ' 305 | 'BAR' = 'BAKER AAF - BAKER ISLAND, AK' 306 | 'DAC' = 'DALTONS CACHE, AK ' 307 | 'PIZ' = 'DEW STATION PT LAY DEW, AK' 308 | 'DTH' = 'DUTCH HARBOR, AK ' 309 | 'EGL' = 'EAGLE, AK ' 310 | 'FRB' = 'FAIRBANKS, AK ' 311 | 'HOM' = 'HOMER, AK ' 312 | 'HYD' = 'HYDER, AK ' 313 | 'JUN' = 'JUNEAU, AK ' 314 | '5KE' = 'KETCHIKAN, AK' 315 | 'KET' = 'KETCHIKAN, AK ' 316 | 'MOS' = 'MOSES POINT INTERMEDIATE, AK' 317 | 'NIK' = 'NIKISKI, AK ' 318 | 'NOM' = 'NOM, AK ' 319 | 'PKC' = 'POKER CREEK, AK ' 320 | 'ORI' = 'PORT LIONS SPB, AK' 321 | 'SKA' = 'SKAGWAY, AK ' 322 | 'SNP' = 'ST. PAUL ISLAND, AK' 323 | 'TKI' = 'TOKEEN, AK' 324 | 'WRA' = 'WRANGELL, AK ' 325 | 'HSV' = 'MADISON COUNTY - HUNTSVILLE, AL' 326 | 'MOB' = 'MOBILE, AL ' 327 | 'LIA' = 'LITTLE ROCK, AR (BPS)' 328 | 'ROG' = 'ROGERS ARPT, AR' 329 | 'DOU' = 'DOUGLAS, AZ ' 330 | 'LUK' = 'LUKEVILLE, AZ ' 331 | 'MAP' = 'MARIPOSA AZ ' 332 | 'NAC' = 'NACO, AZ ' 333 | 'NOG' = 'NOGALES, AZ ' 334 | 'PHO' = 'PHOENIX, AZ ' 335 | 'POR' = 'PORTAL, AZ' 336 | 'SLU' = 'SAN LUIS, AZ ' 337 | 'SAS' = 'SASABE, AZ ' 338 | 'TUC' = 'TUCSON, AZ ' 339 | 'YUI' = 'YUMA, AZ ' 340 | 'AND' = 'ANDRADE, CA ' 341 | 'BUR' = 'BURBANK, CA' 342 | 'CAL' = 'CALEXICO, CA ' 343 | 'CAO' = 'CAMPO, CA ' 344 | 'FRE' = 'FRESNO, CA ' 345 | 'ICP' = 'IMPERIAL COUNTY, CA ' 346 | 'LNB' = 'LONG BEACH, CA ' 347 | 'LOS' = 'LOS ANGELES, CA ' 348 | 'BFL' = 'MEADOWS FIELD - BAKERSFIELD, CA' 349 | 'OAK' = 'OAKLAND, CA ' 350 | 'ONT' = 'ONTARIO, CA' 351 | 'OTM' = 'OTAY MESA, CA ' 352 | 'BLT' = 'PACIFIC, HWY. 
STATION, CA ' 353 | 'PSP' = 'PALM SPRINGS, CA' 354 | 'SAC' = 'SACRAMENTO, CA ' 355 | 'SLS' = 'SALINAS, CA (BPS)' 356 | 'SDP' = 'SAN DIEGO, CA' 357 | 'SFR' = 'SAN FRANCISCO, CA ' 358 | 'SNJ' = 'SAN JOSE, CA ' 359 | 'SLO' = 'SAN LUIS OBISPO, CA ' 360 | 'SLI' = 'SAN LUIS OBISPO, CA (BPS)' 361 | 'SPC' = 'SAN PEDRO, CA ' 362 | 'SYS' = 'SAN YSIDRO, CA ' 363 | 'SAA' = 'SANTA ANA, CA ' 364 | 'STO' = 'STOCKTON, CA (BPS)' 365 | 'TEC' = 'TECATE, CA ' 366 | 'TRV' = 'TRAVIS-AFB, CA ' 367 | 'APA' = 'ARAPAHOE COUNTY, CO' 368 | 'ASE' = 'ASPEN, CO #ARPT' 369 | 'COS' = 'COLORADO SPRINGS, CO' 370 | 'DEN' = 'DENVER, CO ' 371 | 'DRO' = 'LA PLATA - DURANGO, CO' 372 | 'BDL' = 'BRADLEY INTERNATIONAL, CT' 373 | 'BGC' = 'BRIDGEPORT, CT ' 374 | 'GRT' = 'GROTON, CT ' 375 | 'HAR' = 'HARTFORD, CT ' 376 | 'NWH' = 'NEW HAVEN, CT ' 377 | 'NWL' = 'NEW LONDON, CT ' 378 | 'TST' = 'NEWINGTON DATA CENTER TEST, CT' 379 | 'WAS' = 'WASHINGTON DC ' 380 | 'DOV' = 'DOVER AFB, DE' 381 | 'DVD' = 'DOVER-AFB, DE ' 382 | 'WLL' = 'WILMINGTON, DE ' 383 | 'BOC' = 'BOCAGRANDE, FL ' 384 | 'SRQ' = 'BRADENTON - SARASOTA, FL' 385 | 'CAN' = 'CAPE CANAVERAL, FL ' 386 | 'DAB' = 'DAYTONA BEACH INTERNATIONAL, FL' 387 | 'FRN' = 'FERNANDINA, FL ' 388 | 'FTL' = 'FORT LAUDERDALE, FL ' 389 | 'FMY' = 'FORT MYERS, FL ' 390 | 'FPF' = 'FORT PIERCE, FL ' 391 | 'HUR' = 'HURLBURT FIELD, FL' 392 | 'GNV' = 'J R ALISON MUNI - GAINESVILLE, FL' 393 | 'JAC' = 'JACKSONVILLE, FL ' 394 | 'KEY' = 'KEY WEST, FL ' 395 | 'LEE' = 'LEESBURG MUNICIPAL AIRPORT, FL' 396 | 'MLB' = 'MELBOURNE, FL' 397 | 'MIA' = 'MIAMI, FL ' 398 | 'APF' = 'NAPLES, FL #ARPT' 399 | 'OPF' = 'OPA LOCKA, FL' 400 | 'ORL' = 'ORLANDO, FL ' 401 | 'PAN' = 'PANAMA CITY, FL ' 402 | 'PEN' = 'PENSACOLA, FL ' 403 | 'PCF' = 'PORT CANAVERAL, FL ' 404 | 'PEV' = 'PORT EVERGLADES, FL ' 405 | 'PSJ' = 'PORT ST JOE, FL ' 406 | 'SFB' = 'SANFORD, FL ' 407 | 'SGJ' = 'ST AUGUSTINE ARPT, FL' 408 | 'SAU' = 'ST AUGUSTINE, FL ' 409 | 'FPR' = 'ST LUCIE COUNTY, FL' 410 | 'SPE' = 'ST PETERSBURG, FL ' 411 | 'TAM' = 'TAMPA, FL ' 412 | 'WPB' = 'WEST PALM BEACH, FL ' 413 | 'ATL' = 'ATLANTA, GA ' 414 | 'BRU' = 'BRUNSWICK, GA ' 415 | 'AGS' = 'BUSH FIELD - AUGUSTA, GA' 416 | 'SAV' = 'SAVANNAH, GA ' 417 | 'AGA' = 'AGANA, GU ' 418 | 'HHW' = 'HONOLULU, HI ' 419 | 'OGG' = 'KAHULUI - MAUI, HI' 420 | 'KOA' = 'KEAHOLE-KONA, HI ' 421 | 'LIH' = 'LIHUE, HI ' 422 | 'CID' = 'CEDAR RAPIDS/IOWA CITY, IA' 423 | 'DSM' = 'DES MOINES, IA' 424 | 'BOI' = 'AIR TERM. 
(GOWEN FLD) BOISE, ID' 425 | 'EPI' = 'EASTPORT, ID ' 426 | 'IDA' = 'FANNING FIELD - IDAHO FALLS, ID' 427 | 'PTL' = 'PORTHILL, ID ' 428 | 'SPI' = 'CAPITAL - SPRINGFIELD, IL' 429 | 'CHI' = 'CHICAGO, IL ' 430 | 'DPA' = 'DUPAGE COUNTY, IL' 431 | 'PIA' = 'GREATER PEORIA, IL' 432 | 'RFD' = 'GREATER ROCKFORD, IL' 433 | 'UGN' = 'MEMORIAL - WAUKEGAN, IL' 434 | 'GAR' = 'GARY, IN ' 435 | 'HMM' = 'HAMMOND, IN ' 436 | 'INP' = 'INDIANAPOLIS, IN ' 437 | 'MRL' = 'MERRILLVILLE, IN ' 438 | 'SBN' = 'SOUTH BEND, IN' 439 | 'ICT' = 'MID-CONTINENT - WITCHITA, KS' 440 | 'LEX' = 'BLUE GRASS - LEXINGTON, KY' 441 | 'LOU' = 'LOUISVILLE, KY ' 442 | 'BTN' = 'BATON ROUGE, LA ' 443 | 'LKC' = 'LAKE CHARLES, LA ' 444 | 'LAK' = 'LAKE CHARLES, LA (BPS)' 445 | 'MLU' = 'MONROE, LA' 446 | 'MGC' = 'MORGAN CITY, LA ' 447 | 'NOL' = 'NEW ORLEANS, LA ' 448 | 'BOS' = 'BOSTON, MA ' 449 | 'GLO' = 'GLOUCESTER, MA ' 450 | 'BED' = 'HANSCOM FIELD - BEDFORD, MA' 451 | 'LYN' = 'LYNDEN, WA ' 452 | 'ADW' = 'ANDREWS AFB, MD' 453 | 'BAL' = 'BALTIMORE, MD ' 454 | 'MKG' = 'MUSKEGON, MD' 455 | 'PAX' = 'PATUXENT RIVER, MD ' 456 | 'BGM' = 'BANGOR, ME ' 457 | 'BOO' = 'BOOTHBAY HARBOR, ME ' 458 | 'BWM' = 'BRIDGEWATER, ME ' 459 | 'BCK' = 'BUCKPORT, ME ' 460 | 'CLS' = 'CALAIS, ME ' 461 | 'CRB' = 'CARIBOU, ME ' 462 | 'COB' = 'COBURN GORE, ME ' 463 | 'EST' = 'EASTCOURT, ME ' 464 | 'EPT' = 'EASTPORT MUNICIPAL, ME' 465 | 'EPM' = 'EASTPORT, ME ' 466 | 'FOR' = 'FOREST CITY, ME ' 467 | 'FTF' = 'FORT FAIRFIELD, ME ' 468 | 'FTK' = 'FORT KENT, ME ' 469 | 'HML' = 'HAMIIN, ME ' 470 | 'HTM' = 'HOULTON, ME ' 471 | 'JKM' = 'JACKMAN, ME ' 472 | 'KAL' = 'KALISPEL, MT ' 473 | 'LIM' = 'LIMESTONE, ME ' 474 | 'LUB' = 'LUBEC, ME ' 475 | 'MAD' = 'MADAWASKA, ME ' 476 | 'POM' = 'PORTLAND, ME ' 477 | 'RGM' = 'RANGELEY, ME (BPS)' 478 | 'SBR' = 'SOUTH BREWER, ME ' 479 | 'SRL' = 'ST AURELIE, ME ' 480 | 'SPA' = 'ST PAMPILE, ME ' 481 | 'VNB' = 'VAN BUREN, ME ' 482 | 'VCB' = 'VANCEBORO, ME ' 483 | 'AGN' = 'ALGONAC, MI ' 484 | 'ALP' = 'ALPENA, MI ' 485 | 'BCY' = 'BAY CITY, MI ' 486 | 'DET' = 'DETROIT, MI ' 487 | 'GRP' = 'GRAND RAPIDS, MI' 488 | 'GRO' = 'GROSSE ISLE, MI ' 489 | 'ISL' = 'ISLE ROYALE, MI ' 490 | 'MRC' = 'MARINE CITY, MI ' 491 | 'MRY' = 'MARYSVILLE, MI ' 492 | 'PTK' = 'OAKLAND COUNTY - PONTIAC, MI' 493 | 'PHU' = 'PORT HURON, MI ' 494 | 'RBT' = 'ROBERTS LANDING, MI ' 495 | 'SAG' = 'SAGINAW, MI ' 496 | 'SSM' = 'SAULT STE. 
MARIE, MI ' 497 | 'SCL' = 'ST CLAIR, MI ' 498 | 'YIP' = 'WILLOW RUN - YPSILANTI, MI' 499 | 'BAU' = 'BAUDETTE, MN ' 500 | 'CAR' = 'CARIBOU MUNICIPAL AIRPORT, MN' 501 | 'GTF' = 'Collapsed into INT, MN' 502 | 'INL' = 'Collapsed into INT, MN' 503 | 'CRA' = 'CRANE LAKE, MN ' 504 | 'MIC' = 'CRYSTAL MUNICIPAL AIRPORT, MN' 505 | 'DUL' = 'DULUTH, MN ' 506 | 'ELY' = 'ELY, MN ' 507 | 'GPM' = 'GRAND PORTAGE, MN ' 508 | 'SVC' = 'GRANT COUNTY - SILVER CITY, MN' 509 | 'INT' = 'INT''L FALLS, MN ' 510 | 'LAN' = 'LANCASTER, MN ' 511 | 'MSP' = 'MINN./ST PAUL, MN ' 512 | 'LIN' = 'NORTHERN SVC CENTER, MN ' 513 | 'NOY' = 'NOYES, MN ' 514 | 'PIN' = 'PINE CREEK, MN ' 515 | '48Y' = 'PINECREEK BORDER ARPT, MN' 516 | 'RAN' = 'RAINER, MN ' 517 | 'RST' = 'ROCHESTER, MN' 518 | 'ROS' = 'ROSEAU, MN ' 519 | 'SPM' = 'ST PAUL, MN ' 520 | 'WSB' = 'WARROAD INTL, SPB, MN' 521 | 'WAR' = 'WARROAD, MN ' 522 | 'KAN' = 'KANSAS CITY, MO ' 523 | 'SGF' = 'SPRINGFIELD-BRANSON, MO' 524 | 'STL' = 'ST LOUIS, MO ' 525 | 'WHI' = 'WHITETAIL, MT ' 526 | 'WHM' = 'WILD HORSE, MT ' 527 | 'GPT' = 'BILOXI REGIONAL, MS' 528 | 'GTR' = 'GOLDEN TRIANGLE LOWNDES CNTY, MS' 529 | 'GUL' = 'GULFPORT, MS ' 530 | 'PAS' = 'PASCAGOULA, MS ' 531 | 'JAN' = 'THOMPSON FIELD - JACKSON, MS' 532 | 'BIL' = 'BILLINGS, MT ' 533 | 'BTM' = 'BUTTE, MT ' 534 | 'CHF' = 'CHIEF MT, MT ' 535 | 'CTB' = 'CUT BANK MUNICIPAL, MT' 536 | 'CUT' = 'CUT BANK, MT ' 537 | 'DLB' = 'DEL BONITA, MT ' 538 | 'EUR' = 'EUREKA, MT (BPS)' 539 | 'BZN' = 'GALLATIN FIELD - BOZEMAN, MT' 540 | 'FCA' = 'GLACIER NATIONAL PARK, MT' 541 | 'GGW' = 'GLASGOW, MT ' 542 | 'GRE' = 'GREAT FALLS, MT ' 543 | 'HVR' = 'HAVRE, MT ' 544 | 'HEL' = 'HELENA, MT ' 545 | 'LWT' = 'LEWISTON, MT ' 546 | 'MGM' = 'MORGAN, MT ' 547 | 'OPH' = 'OPHEIM, MT ' 548 | 'PIE' = 'PIEGAN, MT ' 549 | 'RAY' = 'RAYMOND, MT ' 550 | 'ROO' = 'ROOSVILLE, MT ' 551 | 'SCO' = 'SCOBEY, MT ' 552 | 'SWE' = 'SWEETGTASS, MT ' 553 | 'TRL' = 'TRIAL CREEK, MT ' 554 | 'TUR' = 'TURNER, MT ' 555 | 'WCM' = 'WILLOW CREEK, MT ' 556 | 'CLT' = 'CHARLOTTE, NC ' 557 | 'FAY' = 'FAYETTEVILLE, NC' 558 | 'MRH' = 'MOREHEAD CITY, NC ' 559 | 'FOP' = 'MORRIS FIELDS AAF, NC' 560 | 'GSO' = 'PIEDMONT TRIAD INTL AIRPORT, NC' 561 | 'RDU' = 'RALEIGH/DURHAM, NC ' 562 | 'SSC' = 'SHAW AFB - SUMTER, NC' 563 | 'WIL' = 'WILMINGTON, NC ' 564 | 'AMB' = 'AMBROSE, ND ' 565 | 'ANT' = 'ANTLER, ND ' 566 | 'CRY' = 'CARBURY, ND ' 567 | 'DNS' = 'DUNSEITH, ND ' 568 | 'FAR' = 'FARGO, ND ' 569 | 'FRT' = 'FORTUNA, ND ' 570 | 'GRF' = 'GRAND FORKS, ND ' 571 | 'HNN' = 'HANNAH, ND ' 572 | 'HNS' = 'HANSBORO, ND ' 573 | 'MAI' = 'MAIDA, ND ' 574 | 'MND' = 'MINOT, ND ' 575 | 'NEC' = 'NECHE, ND ' 576 | 'NOO' = 'NOONAN, ND ' 577 | 'NRG' = 'NORTHGATE, ND ' 578 | 'PEM' = 'PEMBINA, ND ' 579 | 'SAR' = 'SARLES, ND ' 580 | 'SHR' = 'SHERWOOD, ND ' 581 | 'SJO' = 'ST JOHN, ND ' 582 | 'WAL' = 'WALHALLA, ND ' 583 | 'WHO' = 'WESTHOPE, ND ' 584 | 'WND' = 'WILLISTON, ND ' 585 | 'OMA' = 'OMAHA, NE ' 586 | 'LEB' = 'LEBANON, NH ' 587 | 'MHT' = 'MANCHESTER, NH' 588 | 'PNH' = 'PITTSBURG, NH ' 589 | 'PSM' = 'PORTSMOUTH, NH ' 590 | 'BYO' = 'BAYONNE, NJ ' 591 | 'CNJ' = 'CAMDEN, NJ ' 592 | 'HOB' = 'HOBOKEN, NJ ' 593 | 'JER' = 'JERSEY CITY, NJ ' 594 | 'WRI' = 'MC GUIRE AFB - WRIGHTSOWN, NJ' 595 | 'MMU' = 'MORRISTOWN, NJ' 596 | 'NEW' = 'NEWARK/TETERBORO, NJ ' 597 | 'PER' = 'PERTH AMBOY, NJ ' 598 | 'ACY' = 'POMONA FIELD - ATLANTIC CITY, NJ' 599 | 'ALA' = 'ALAMAGORDO, NM (BPS)' 600 | 'ABQ' = 'ALBUQUERQUE, NM ' 601 | 'ANP' = 'ANTELOPE WELLS, NM ' 602 | 'CRL' = 'CARLSBAD, NM ' 603 | 'COL' = 'COLUMBUS, NM ' 604 | 'CDD' = 'CRANE LAKE - ST. 
LOUIS CNTY, NM' 605 | 'DNM' = 'DEMING, NM (BPS)' 606 | 'LAS' = 'LAS CRUCES, NM ' 607 | 'LOB' = 'LORDSBURG, NM (BPS)' 608 | 'RUI' = 'RUIDOSO, NM' 609 | 'STR' = 'SANTA TERESA, NM ' 610 | 'RNO' = 'CANNON INTL - RENO/TAHOE, NV' 611 | 'FLX' = 'FALLON MUNICIPAL AIRPORT, NV' 612 | 'LVG' = 'LAS VEGAS, NV ' 613 | 'REN' = 'RENO, NV ' 614 | 'ALB' = 'ALBANY, NY ' 615 | 'AXB' = 'ALEXANDRIA BAY, NY ' 616 | 'BUF' = 'BUFFALO, NY ' 617 | 'CNH' = 'CANNON CORNERS, NY' 618 | 'CAP' = 'CAPE VINCENT, NY ' 619 | 'CHM' = 'CHAMPLAIN, NY ' 620 | 'CHT' = 'CHATEAUGAY, NY ' 621 | 'CLA' = 'CLAYTON, NY ' 622 | 'FTC' = 'FORT COVINGTON, NY ' 623 | 'LAG' = 'LA GUARDIA, NY ' 624 | 'LEW' = 'LEWISTON, NY ' 625 | 'MAS' = 'MASSENA, NY ' 626 | 'MAG' = 'MCGUIRE AFB, NY ' 627 | 'MOO' = 'MOORES, NY ' 628 | 'MRR' = 'MORRISTOWN, NY ' 629 | 'NYC' = 'NEW YORK, NY ' 630 | 'NIA' = 'NIAGARA FALLS, NY ' 631 | 'OGD' = 'OGDENSBURG, NY ' 632 | 'OSW' = 'OSWEGO, NY ' 633 | 'ELM' = 'REGIONAL ARPT - HORSEHEAD, NY' 634 | 'ROC' = 'ROCHESTER, NY ' 635 | 'ROU' = 'ROUSES POINT, NY ' 636 | 'SWF' = 'STEWART - ORANGE CNTY, NY' 637 | 'SYR' = 'SYRACUSE, NY ' 638 | 'THO' = 'THOUSAND ISLAND BRIDGE, NY' 639 | 'TRO' = 'TROUT RIVER, NY ' 640 | 'WAT' = 'WATERTOWN, NY ' 641 | 'HPN' = 'WESTCHESTER - WHITE PLAINS, NY' 642 | 'WRB' = 'WHIRLPOOL BRIDGE, NY' 643 | 'YOU' = 'YOUNGSTOWN, NY ' 644 | 'AKR' = 'AKRON, OH ' 645 | 'ATB' = 'ASHTABULA, OH ' 646 | 'CIN' = 'CINCINNATI, OH ' 647 | 'CLE' = 'CLEVELAND, OH ' 648 | 'CLM' = 'COLUMBUS, OH ' 649 | 'LOR' = 'LORAIN, OH ' 650 | 'MBO' = 'MARBLE HEADS, OH ' 651 | 'SDY' = 'SANDUSKY, OH ' 652 | 'TOL' = 'TOLEDO, OH ' 653 | 'OKC' = 'OKLAHOMA CITY, OK ' 654 | 'TUL' = 'TULSA, OK' 655 | 'AST' = 'ASTORIA, OR ' 656 | 'COO' = 'COOS BAY, OR ' 657 | 'HIO' = 'HILLSBORO, OR' 658 | 'MED' = 'MEDFORD, OR ' 659 | 'NPT' = 'NEWPORT, OR ' 660 | 'POO' = 'PORTLAND, OR ' 661 | 'PUT' = 'PUT-IN-BAY, OH ' 662 | 'RDM' = 'ROBERTS FIELDS - REDMOND, OR' 663 | 'ERI' = 'ERIE, PA ' 664 | 'MDT' = 'HARRISBURG, PA' 665 | 'HSB' = 'HARRISONBURG, PA ' 666 | 'PHI' = 'PHILADELPHIA, PA ' 667 | 'PIT' = 'PITTSBURG, PA ' 668 | 'AGU' = 'AGUADILLA, PR ' 669 | 'BQN' = 'BORINQUEN - AGUADILLO, PR' 670 | 'JCP' = 'CULEBRA - BENJAMIN RIVERA, PR' 671 | 'ENS' = 'ENSENADA, PR ' 672 | 'FAJ' = 'FAJARDO, PR ' 673 | 'HUM' = 'HUMACAO, PR ' 674 | 'JOB' = 'JOBOS, PR ' 675 | 'MAY' = 'MAYAGUEZ, PR ' 676 | 'PON' = 'PONCE, PR ' 677 | 'PSE' = 'PONCE-MERCEDITA, PR' 678 | 'SAJ' = 'SAN JUAN, PR ' 679 | 'VQS' = 'VIEQUES-ARPT, PR' 680 | 'PRO' = 'PROVIDENCE, RI ' 681 | 'PVD' = 'THEODORE FRANCIS - WARWICK, RI' 682 | 'CHL' = 'CHARLESTON, SC ' 683 | 'CAE' = 'COLUMBIA, SC #ARPT' 684 | 'GEO' = 'GEORGETOWN, SC ' 685 | 'GSP' = 'GREENVILLE, SC' 686 | 'GRR' = 'GREER, SC' 687 | 'MYR' = 'MYRTLE BEACH, SC' 688 | 'SPF' = 'BLACK HILLS, SPEARFISH, SD' 689 | 'HON' = 'HOWES REGIONAL ARPT - HURON, SD' 690 | 'SAI' = 'SAIPAN, SPN ' 691 | 'TYS' = 'MC GHEE TYSON - ALCOA, TN' 692 | 'MEM' = 'MEMPHIS, TN ' 693 | 'NSV' = 'NASHVILLE, TN ' 694 | 'TRI' = 'TRI CITY ARPT, TN' 695 | 'ADS' = 'ADDISON AIRPORT- ADDISON, TX' 696 | 'ADT' = 'AMISTAD DAM, TX ' 697 | 'ANZ' = 'ANZALDUAS, TX' 698 | 'AUS' = 'AUSTIN, TX ' 699 | 'BEA' = 'BEAUMONT, TX ' 700 | 'BBP' = 'BIG BEND PARK, TX (BPS)' 701 | 'SCC' = 'BP SPEC COORD. 
CTR, TX' 702 | 'BTC' = 'BP TACTICAL UNIT, TX ' 703 | 'BOA' = 'BRIDGE OF AMERICAS, TX' 704 | 'BRO' = 'BROWNSVILLE, TX ' 705 | 'CRP' = 'CORPUS CHRISTI, TX ' 706 | 'DAL' = 'DALLAS, TX ' 707 | 'DLR' = 'DEL RIO, TX ' 708 | 'DNA' = 'DONNA, TX' 709 | 'EGP' = 'EAGLE PASS, TX ' 710 | 'ELP' = 'EL PASO, TX ' 711 | 'FAB' = 'FABENS, TX ' 712 | 'FAL' = 'FALCON HEIGHTS, TX ' 713 | 'FTH' = 'FORT HANCOCK, TX ' 714 | 'AFW' = 'FORT WORTH ALLIANCE, TX' 715 | 'FPT' = 'FREEPORT, TX ' 716 | 'GAL' = 'GALVESTON, TX ' 717 | 'HLG' = 'HARLINGEN, TX ' 718 | 'HID' = 'HIDALGO, TX ' 719 | 'HOU' = 'HOUSTON, TX ' 720 | 'SGR' = 'HULL FIELD, SUGAR LAND ARPT, TX' 721 | 'LLB' = 'JUAREZ-LINCOLN BRIDGE, TX' 722 | 'LCB' = 'LAREDO COLUMBIA BRIDGE, TX' 723 | 'LRN' = 'LAREDO NORTH, TX ' 724 | 'LAR' = 'LAREDO, TX ' 725 | 'LSE' = 'LOS EBANOS, TX ' 726 | 'IND' = 'LOS INDIOS, TX' 727 | 'LOI' = 'LOS INDIOS, TX ' 728 | 'MRS' = 'MARFA, TX (BPS)' 729 | 'MCA' = 'MCALLEN, TX ' 730 | 'MAF' = 'ODESSA REGIONAL, TX' 731 | 'PDN' = 'PASO DEL NORTE,TX ' 732 | 'PBB' = 'PEACE BRIDGE, NY ' 733 | 'PHR' = 'PHARR, TX ' 734 | 'PAR' = 'PORT ARTHUR, TX ' 735 | 'ISB' = 'PORT ISABEL, TX ' 736 | 'POE' = 'PORT OF EL PASO, TX ' 737 | 'PRE' = 'PRESIDIO, TX ' 738 | 'PGR' = 'PROGRESO, TX ' 739 | 'RIO' = 'RIO GRANDE CITY, TX ' 740 | 'ROM' = 'ROMA, TX ' 741 | 'SNA' = 'SAN ANTONIO, TX ' 742 | 'SNN' = 'SANDERSON, TX ' 743 | 'VIB' = 'VETERAN INTL BRIDGE, TX' 744 | 'YSL' = 'YSLETA, TX ' 745 | 'CHA' = 'CHARLOTTE AMALIE, VI ' 746 | 'CHR' = 'CHRISTIANSTED, VI ' 747 | 'CRU' = 'CRUZ BAY, ST JOHN, VI ' 748 | 'FRK' = 'FREDERIKSTED, VI ' 749 | 'STT' = 'ST THOMAS, VI ' 750 | 'LGU' = 'CACHE AIRPORT - LOGAN, UT' 751 | 'SLC' = 'SALT LAKE CITY, UT ' 752 | 'CHO' = 'ALBEMARLE CHARLOTTESVILLE, VA' 753 | 'DAA' = 'DAVISON AAF - FAIRFAX CNTY, VA' 754 | 'HOP' = 'HOPEWELL, VA ' 755 | 'HEF' = 'MANASSAS, VA #ARPT' 756 | 'NWN' = 'NEWPORT, VA ' 757 | 'NOR' = 'NORFOLK, VA ' 758 | 'RCM' = 'RICHMOND, VA ' 759 | 'ABS' = 'ALBURG SPRINGS, VT ' 760 | 'ABG' = 'ALBURG, VT ' 761 | 'BEB' = 'BEEBE PLAIN, VT ' 762 | 'BEE' = 'BEECHER FALLS, VT ' 763 | 'BRG' = 'BURLINGTON, VT ' 764 | 'CNA' = 'CANAAN, VT ' 765 | 'DER' = 'DERBY LINE, VT (I-91) ' 766 | 'DLV' = 'DERBY LINE, VT (RT. 
5)' 767 | 'ERC' = 'EAST RICHFORD, VT ' 768 | 'HIG' = 'HIGHGATE SPRINGS, VT ' 769 | 'MOR' = 'MORSES LINE, VT ' 770 | 'NPV' = 'NEWPORT, VT ' 771 | 'NRT' = 'NORTH TROY, VT ' 772 | 'NRN' = 'NORTON, VT ' 773 | 'PIV' = 'PINNACLE ROAD, VT ' 774 | 'RIF' = 'RICHFORT, VT ' 775 | 'STA' = 'ST ALBANS, VT ' 776 | 'SWB' = 'SWANTON, VT (BP - SECTOR HQ)' 777 | 'WBE' = 'WEST BERKSHIRE, VT ' 778 | 'ABE' = 'ABERDEEN, WA ' 779 | 'ANA' = 'ANACORTES, WA ' 780 | 'BEL' = 'BELLINGHAM, WA ' 781 | 'BLI' = 'BELLINGHAM, WASHINGTON #INTL' 782 | 'BLA' = 'BLAINE, WA ' 783 | 'BWA' = 'BOUNDARY, WA ' 784 | 'CUR' = 'CURLEW, WA (BPS)' 785 | 'DVL' = 'DANVILLE, WA ' 786 | 'EVE' = 'EVERETT, WA ' 787 | 'FER' = 'FERRY, WA ' 788 | 'FRI' = 'FRIDAY HARBOR, WA ' 789 | 'FWA' = 'FRONTIER, WA ' 790 | 'KLM' = 'KALAMA, WA ' 791 | 'LAU' = 'LAURIER, WA ' 792 | 'LON' = 'LONGVIEW, WA ' 793 | 'MET' = 'METALINE FALLS, WA ' 794 | 'MWH' = 'MOSES LAKE GRANT COUNTY ARPT, WA' 795 | 'NEA' = 'NEAH BAY, WA ' 796 | 'NIG' = 'NIGHTHAWK, WA ' 797 | 'OLY' = 'OLYMPIA, WA ' 798 | 'ORO' = 'OROVILLE, WA ' 799 | 'PWB' = 'PASCO, WA ' 800 | 'PIR' = 'POINT ROBERTS, WA ' 801 | 'PNG' = 'PORT ANGELES, WA ' 802 | 'PTO' = 'PORT TOWNSEND, WA ' 803 | 'SEA' = 'SEATTLE, WA ' 804 | 'SPO' = 'SPOKANE, WA ' 805 | 'SUM' = 'SUMAS, WA ' 806 | 'TAC' = 'TACOMA, WA ' 807 | 'PSC' = 'TRI-CITIES - PASCO, WA' 808 | 'VAN' = 'VANCOUVER, WA ' 809 | 'AGM' = 'ALGOMA, WI ' 810 | 'BAY' = 'BAYFIELD, WI ' 811 | 'GRB' = 'GREEN BAY, WI ' 812 | 'MNW' = 'MANITOWOC, WI ' 813 | 'MIL' = 'MILWAUKEE, WI ' 814 | 'MSN' = 'TRUAX FIELD - DANE COUNTY, WI' 815 | 'CHS' = 'CHARLESTON, WV ' 816 | 'CLK' = 'CLARKSBURG, WV ' 817 | 'BLF' = 'MERCER COUNTY, WV' 818 | 'CSP' = 'CASPER, WY ' 819 | 'XXX' = 'NOT REPORTED/UNKNOWN ' 820 | '888' = 'UNIDENTIFED AIR / SEAPORT' 821 | 'UNK' = 'UNKNOWN POE ' 822 | 'CLG' = 'CALGARY, CANADA ' 823 | 'EDA' = 'EDMONTON, CANADA ' 824 | 'YHC' = 'HAKAI PASS, CANADA' 825 | 'HAL' = 'Halifax, NS, Canada ' 826 | 'MON' = 'MONTREAL, CANADA ' 827 | 'OTT' = 'OTTAWA, CANADA ' 828 | 'YXE' = 'SASKATOON, CANADA' 829 | 'TOR' = 'TORONTO, CANADA ' 830 | 'VCV' = 'VANCOUVER, CANADA ' 831 | 'VIC' = 'VICTORIA, CANADA ' 832 | 'WIN' = 'WINNIPEG, CANADA ' 833 | 'AMS' = 'AMSTERDAM-SCHIPHOL, NETHERLANDS' 834 | 'ARB' = 'ARUBA, NETH ANTILLES ' 835 | 'BAN' = 'BANKOK, THAILAND ' 836 | 'BEI' = 'BEICA #ARPT, ETHIOPIA' 837 | 'PEK' = 'BEIJING CAPITAL INTL, PRC' 838 | 'BDA' = 'KINDLEY FIELD, BERMUDA' 839 | 'BOG' = 'BOGOTA, EL DORADO #ARPT, COLOMBIA' 840 | 'EZE' = 'BUENOS AIRES, MINISTRO PIST, ARGENTINA' 841 | 'CUN' = 'CANCUN, MEXICO' 842 | 'CRQ' = 'CARAVELAS, BA #ARPT, BRAZIL' 843 | 'MVD' = 'CARRASCO, URUGUAY' 844 | 'DUB' = 'DUBLIN, IRELAND ' 845 | 'FOU' = 'FOUGAMOU #ARPT, GABON' 846 | 'FBA' = 'FREEPORT, BAHAMAS ' 847 | 'MTY' = 'GEN M. 
ESCOBEDO, Monterrey, MX' 848 | 'HMO' = 'GEN PESQUEIRA GARCIA, MX' 849 | 'GCM' = 'GRAND CAYMAN, CAYMAN ISLAND' 850 | 'GDL' = 'GUADALAJARA, MIGUEL HIDAL, MX' 851 | 'HAM' = 'HAMILTON, BERMUDA ' 852 | 'ICN' = 'INCHON, SEOUL KOREA' 853 | 'IWA' = 'INVALID - IWAKUNI, JAPAN' 854 | 'CND' = 'KOGALNICEANU, ROMANIA' 855 | 'LAH' = 'LABUHA ARPT, INDONESIA' 856 | 'DUR' = 'LOUIS BOTHA, SOUTH AFRICA' 857 | 'MAL' = 'MANGOLE ARPT, INDONESIA' 858 | 'MDE' = 'MEDELLIN, COLOMBIA' 859 | 'MEX' = 'JUAREZ INTL, MEXICO CITY, MX' 860 | 'LHR' = 'MIDDLESEX, ENGLAND' 861 | 'NBO' = 'NAIROBI, KENYA ' 862 | 'NAS' = 'NASSAU, BAHAMAS ' 863 | 'NCA' = 'NORTH CAICOS, TURK & CAIMAN' 864 | 'PTY' = 'OMAR TORRIJOS, PANAMA' 865 | 'SPV' = 'PAPUA, NEW GUINEA' 866 | 'UIO' = 'QUITO (MARISCAL SUCR), ECUADOR' 867 | 'RIT' = 'ROME, ITALY ' 868 | 'SNO' = 'SAKON NAKHON #ARPT, THAILAND' 869 | 'SLP' = 'SAN LUIS POTOSI #ARPT, MEXICO' 870 | 'SAN' = 'SAN SALVADOR, EL SALVADOR' 871 | 'SRO' = 'SANTANA RAMOS #ARPT, COLOMBIA' 872 | 'GRU' = 'GUARULHOS INTL, SAO PAULO, BRAZIL' 873 | 'SHA' = 'SHANNON, IRELAND ' 874 | 'HIL' = 'SHILLAVO, ETHIOPIA' 875 | 'TOK' = 'TOROKINA #ARPT, PAPUA, NEW GUINEA' 876 | 'VER' = 'VERACRUZ, MEXICO' 877 | 'LGW' = 'WEST SUSSEX, ENGLAND ' 878 | 'ZZZ' = 'MEXICO Land (Banco de Mexico) ' 879 | 'CHN' = 'No PORT Code (CHN)' 880 | 'CNC' = 'CANNON CORNERS, NY' 881 | 'MAA' = 'Abu Dhabi' 882 | 'AG0' = 'MAGNOLIA, AR' 883 | 'BHM' = 'BAR HARBOR, ME' 884 | 'BHX' = 'BIRMINGHAM, AL' 885 | 'CAK' = 'AKRON, OH' 886 | 'FOK' = 'SUFFOLK COUNTY, NY' 887 | 'LND' = 'LANDER, WY' 888 | 'MAR' = 'MARFA, TX' 889 | 'MLI' = 'MOLINE, IL' 890 | 'RIV' = 'RIVERSIDE, CA' 891 | 'RME' = 'ROME, NY' 892 | 'VNY' = 'VAN NUYS, CA' 893 | 'YUM' = 'YUMA, AZ' 894 | 'FRG' = 'Collapsed (FOK) 06/15' 895 | 'HRL' = 'Collapsed (HLG) 06/15' 896 | 'ISP' = 'Collapsed (FOK) 06/15' 897 | 'JSJ' = 'Collapsed (SAJ) 06/15' 898 | 'BUS' = 'Collapsed (BUF) 06/15' 899 | 'IAG' = 'Collapsed (NIA) 06/15' 900 | 'PHN' = 'Collapsed (PHU) 06/15' 901 | 'STN' = 'Collapsed (STR) 06/15' 902 | 'VMB' = 'Collapsed (VNB) 06/15' 903 | 'T01' = 'Collapsed (SEA) 06/15' 904 | 'PHF' = 'No PORT Code (PHF)' 905 | 'DRV' = 'No PORT Code (DRV)' 906 | 'FTB' = 'No PORT Code (FTB)' 907 | 'GAC' = 'No PORT Code (GAC)' 908 | 'GMT' = 'No PORT Code (GMT)' 909 | 'JFA' = 'No PORT Code (JFA)' 910 | 'JMZ' = 'No PORT Code (JMZ)' 911 | 'NC8' = 'No PORT Code (NC8)' 912 | 'NYL' = 'No PORT Code (NYL)' 913 | 'OAI' = 'No PORT Code (OAI)' 914 | 'PCW' = 'No PORT Code (PCW)' 915 | 'WA5' = 'No PORT Code (WAS)' 916 | 'WTR' = 'No PORT Code (WTR)' 917 | 'X96' = 'No PORT Code (X96)' 918 | 'XNA' = 'No PORT Code (XNA)' 919 | 'YGF' = 'No PORT Code (YGF)' 920 | '5T6' = 'No PORT Code (5T6)' 921 | '060' = 'No PORT Code (60)' 922 | 'SP0' = 'No PORT Code (SP0)' 923 | 'W55' = 'No PORT Code (W55)' 924 | 'X44' = 'No PORT Code (X44)' 925 | 'AUH' = 'No PORT Code (AUH)' 926 | 'RYY' = 'No PORT Code (RYY)' 927 | 'SUS' = 'No PORT Code (SUS)' 928 | '74S' = 'No PORT Code (74S)' 929 | 'ATW' = 'No PORT Code (ATW)' 930 | 'CPX' = 'No PORT Code (CPX)' 931 | 'MTH' = 'No PORT Code (MTH)' 932 | 'PFN' = 'No PORT Code (PFN)' 933 | 'SCH' = 'No PORT Code (SCH)' 934 | 'ASI' = 'No PORT Code (ASI)' 935 | 'BKF' = 'No PORT Code (BKF)' 936 | 'DAY' = 'No PORT Code (DAY)' 937 | 'Y62' = 'No PORT Code (Y62)' 938 | 'AG' = 'No PORT Code (AG)' 939 | 'BCM' = 'No PORT Code (BCM)' 940 | 'DEC' = 'No PORT Code (DEC)' 941 | 'PLB' = 'No PORT Code (PLB)' 942 | 'CXO' = 'No PORT Code (CXO)' 943 | 'JBQ' = 'No PORT Code (JBQ)' 944 | 'JIG' = 'No PORT Code (JIG)' 945 | 'OGS' = 'No PORT Code (OGS)' 946 | 
'TIW' = 'No PORT Code (TIW)' 947 | 'OTS' = 'No PORT Code (OTS)' 948 | 'AMT' = 'No PORT Code (AMT)' 949 | 'EGE' = 'No PORT Code (EGE)' 950 | 'GPI' = 'No PORT Code (GPI)' 951 | 'NGL' = 'No PORT Code (NGL)' 952 | 'OLM' = 'No PORT Code (OLM)' 953 | '.GA' = 'No PORT Code (.GA)' 954 | 'CLX' = 'No PORT Code (CLX)' 955 | 'CP ' = 'No PORT Code (CP)' 956 | 'FSC' = 'No PORT Code (FSC)' 957 | 'NK' = 'No PORT Code (NK)' 958 | 'ADU' = 'No PORT Code (ADU)' 959 | 'AKT' = 'No PORT Code (AKT)' 960 | 'LIT' = 'No PORT Code (LIT)' 961 | 'A2A' = 'No PORT Code (A2A)' 962 | 'OSN' = 'No PORT Code (OSN)' 963 | ; 964 | 965 | 966 | /* ARRDATE is the Arrival Date in the USA. It is a SAS date numeric field that a 967 | permament format has not been applied. Please apply whichever date format 968 | works for you. */ 969 | 970 | 971 | /* I94MODE - There are missing values as well as not reported (9) */ 972 | value i94model 973 | 1 = 'Air' 974 | 2 = 'Sea' 975 | 3 = 'Land' 976 | 9 = 'Not reported' ; 977 | 978 | 979 | /* I94ADDR - There is lots of invalid codes in this variable and the list below 980 | shows what we have found to be valid, everything else goes into 'other' */ 981 | value i94addrl 982 | 'AL'='ALABAMA' 983 | 'AK'='ALASKA' 984 | 'AZ'='ARIZONA' 985 | 'AR'='ARKANSAS' 986 | 'CA'='CALIFORNIA' 987 | 'CO'='COLORADO' 988 | 'CT'='CONNECTICUT' 989 | 'DE'='DELAWARE' 990 | 'DC'='DIST. OF COLUMBIA' 991 | 'FL'='FLORIDA' 992 | 'GA'='GEORGIA' 993 | 'GU'='GUAM' 994 | 'HI'='HAWAII' 995 | 'ID'='IDAHO' 996 | 'IL'='ILLINOIS' 997 | 'IN'='INDIANA' 998 | 'IA'='IOWA' 999 | 'KS'='KANSAS' 1000 | 'KY'='KENTUCKY' 1001 | 'LA'='LOUISIANA' 1002 | 'ME'='MAINE' 1003 | 'MD'='MARYLAND' 1004 | 'MA'='MASSACHUSETTS' 1005 | 'MI'='MICHIGAN' 1006 | 'MN'='MINNESOTA' 1007 | 'MS'='MISSISSIPPI' 1008 | 'MO'='MISSOURI' 1009 | 'MT'='MONTANA' 1010 | 'NC'='N. CAROLINA' 1011 | 'ND'='N. DAKOTA' 1012 | 'NE'='NEBRASKA' 1013 | 'NV'='NEVADA' 1014 | 'NH'='NEW HAMPSHIRE' 1015 | 'NJ'='NEW JERSEY' 1016 | 'NM'='NEW MEXICO' 1017 | 'NY'='NEW YORK' 1018 | 'OH'='OHIO' 1019 | 'OK'='OKLAHOMA' 1020 | 'OR'='OREGON' 1021 | 'PA'='PENNSYLVANIA' 1022 | 'PR'='PUERTO RICO' 1023 | 'RI'='RHODE ISLAND' 1024 | 'SC'='S. CAROLINA' 1025 | 'SD'='S. DAKOTA' 1026 | 'TN'='TENNESSEE' 1027 | 'TX'='TEXAS' 1028 | 'UT'='UTAH' 1029 | 'VT'='VERMONT' 1030 | 'VI'='VIRGIN ISLANDS' 1031 | 'VA'='VIRGINIA' 1032 | 'WV'='W. VIRGINIA' 1033 | 'WA'='WASHINGTON' 1034 | 'WI'='WISCONSON' 1035 | 'WY'='WYOMING' 1036 | '99'='All Other Codes' ; 1037 | 1038 | /* DEPDATE is the Departure Date from the USA. It is a SAS date numeric field that 1039 | a permament format has not been applied. Please apply whichever date format 1040 | works for you. */ 1041 | 1042 | 1043 | /* I94BIR - Age of Respondent in Years */ 1044 | 1045 | 1046 | /* I94VISA - Visa codes collapsed into three categories: 1047 | 1 = Business 1048 | 2 = Pleasure 1049 | 3 = Student 1050 | */ 1051 | 1052 | 1053 | /* COUNT - Used for summary statistics */ 1054 | 1055 | 1056 | /* DTADFILE - Character Date Field - Date added to I-94 Files - CIC does not use */ 1057 | 1058 | 1059 | /* VISAPOST - Department of State where where Visa was issued - CIC does not use */ 1060 | 1061 | 1062 | /* OCCUP - Occupation that will be performed in U.S. - CIC does not use */ 1063 | 1064 | 1065 | /* ENTDEPA - Arrival Flag - admitted or paroled into the U.S. 
- CIC does not use */ 1066 | 1067 | 1068 | /* ENTDEPD - Departure Flag - Departed, lost I-94 or is deceased - CIC does not use */ 1069 | 1070 | 1071 | /* ENTDEPU - Update Flag - Either apprehended, overstayed, adjusted to perm residence - CIC does not use */ 1072 | 1073 | 1074 | /* MATFLAG - Match flag - Match of arrival and departure records */ 1075 | 1076 | 1077 | /* BIRYEAR - 4 digit year of birth */ 1078 | 1079 | 1080 | /* DTADDTO - Character Date Field - Date to which admitted to U.S. (allowed to stay until) - CIC does not use */ 1081 | 1082 | 1083 | /* GENDER - Non-immigrant sex */ 1084 | 1085 | 1086 | /* INSNUM - INS number */ 1087 | 1088 | 1089 | /* AIRLINE - Airline used to arrive in U.S. */ 1090 | 1091 | 1092 | /* ADMNUM - Admission Number */ 1093 | 1094 | 1095 | /* FLTNO - Flight number of Airline used to arrive in U.S. */ 1096 | 1097 | 1098 | /* VISATYPE - Class of admission legally admitting the non-immigrant to temporarily stay in U.S. */ 1099 | run ; 1100 | 1101 | -------------------------------------------------------------------------------- /Capstone Project/README.md: -------------------------------------------------------------------------------- 1 | # Capstone Project 2 | 3 | This project aims to enrich the US I94 immigration data with additional datasets such as US airport data, US city demographics and world temperature data, providing a broader basis for analysis of the immigration data. 4 | 5 | ## Data sources 6 | 7 | ### I94 Immigration Data 8 | This data comes from the US National Tourism and Trade Office. A data dictionary is included in the workspace. [This](https://travel.trade.gov/research/reports/i94/historical/2016.html) is where the data comes from. There is a sample file so you can take a look at the data in CSV format before reading it all in. 9 | 10 | ### World Temperature Data 11 | This dataset came from Kaggle. You can read more about it [here](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data). 12 | 13 | ### U.S. City Demographic Data 14 | This data comes from OpenDataSoft. You can read more about it [here](https://public.opendatasoft.com/explore/dataset/us-cities-demographics/export/). 15 | 16 | ### Airport Code Table 17 | This is a simple table of airport codes and corresponding cities. It comes from [here](https://datahub.io/core/airport-codes#data). 18 | 19 | ## Data cleaning 20 | 21 | * Filter the temperature data to only use US data. 22 | * Remove irregular ports from the I94 data. 23 | * Drop rows with missing IATA codes from the I94 data. We need the IATA codes to join the data with other sources. A sketch of these cleaning steps follows below.
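The cleaning steps above can be prototyped with pandas. The snippet below is an illustrative sketch only: the temperature file name, the column names (`Country`, `i94port`) and the way the valid port codes are parsed out of `I94_SAS_Labels_Descriptions.SAS` are assumptions rather than the exact code used in the project notebook.

```python
import re
import pandas as pd

# Parse the valid I94 port codes out of the SAS labels file (assumed to sit next to this script).
with open("I94_SAS_Labels_Descriptions.SAS") as f:
    labels = f.read()

# Keep only the $i94prtl section; entries look like 'ALC' = 'ALCAN, AK'.
port_section = labels.split("$i94prtl")[1].split(";")[0]
valid_ports = {code.strip() for code in re.findall(r"'([^']+)'\s*=", port_section)}

# 1. Filter the world temperature data down to US rows (file and column names assumed).
temperature = pd.read_csv("GlobalLandTemperaturesByCity.csv")
temperature_us = temperature[temperature["Country"] == "United States"]

# 2. Drop rows with missing IATA/port codes and 3. remove irregular ports from the I94 sample.
immigration = pd.read_csv("immigration_data_sample.csv")
immigration = immigration.dropna(subset=["i94port"])
immigration = immigration[immigration["i94port"].isin(valid_ports)]
```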
24 | 25 | ## Conceptual Data Model 26 | 27 | ### Tables: 28 | | table name | columns | description | type | 29 | | ------- | ---------- | ----------- | ---- | 30 | | airports | iata_code - name - type - local_code - coordinates - city | stores information related to airports | dimension table | 31 | | demographics | city - state - media_age - male_population - female_population - total_population - num_veterans - foreign_born - average_household_size - state_code - race - count | stores demographics data for cities | dimension table | 32 | | immigrations | cicid - year - month - cit - res - iata - arrdate - mode - addr - depdate - bir - visa - count - dtadfile - visapost - occup - entdepa - entdepd - entdepu - matflag - biryear - dtaddto - gender - insnum - airline - admnum - fltno - visatype | stores all I94 immigration data | fact table | 33 | | temperature | timestamp - average_temperature - average_temperature_uncertainty - city - country - latitude - longitude | stores temperature information | dimension table | 34 | 35 | ### Table decision 36 | 37 | The immigrations fact table stores the key information. We can then enrich it with the airports, demographics and temperature data. To do so, we need common identifiers on all tables so they can be joined efficiently: the city and the IATA code (see the example query at the end of this README). 38 | 39 | ## Mapping Out Data Pipelines 40 | 41 | 1. Create tables by executing `create_tables.py`. 42 | 2. Join city to airports data. 43 | 3. Insert data. 44 | 45 | ## Choice of tools and technologies for the project 46 | 47 | Pandas is used to ease data preprocessing and visualisation. It is helpful to efficiently load and manipulate data. At a later stage, instead of pandas dataframes, I recommend using Spark dataframes to allow distributed processing, for example on Amazon Elastic MapReduce (EMR). Also, to perform automated updates, I recommend integrating the ETL pipeline into an Airflow DAG. 48 | 49 | I used a Jupyter Notebook to show the data structure and the need for data cleaning. Python is a widely used programming language and was chosen because it is the language I am most comfortable with. 50 | 51 | ## How often the data should be updated and why 52 | 53 | The I94 data describes immigration events aggregated on a monthly basis, so updating the data monthly is recommended. 54 | 55 | ## FAQ: What would I do if... 56 | * The data was increased by 100x. 57 | * Use Spark to process the data efficiently in a distributed way, e.g. with EMR. If the workload turns out to be write-heavy, I would suggest using a Cassandra database instead of PostgreSQL. 58 | * The data populates a dashboard that must be updated by 7am every day. 59 | * Use Airflow and create a DAG that performs the logic of the described pipeline. If a DAG run fails, I recommend automatically sending emails to the engineering team using Airflow's built-in alerting, so they can fix potential issues promptly. 60 | * The database needed to be accessed by 100+ people. 61 | * Use Redshift so the data is stored in a way that can be accessed efficiently by many people. Alternatively, a database such as PostgreSQL is more cost-efficient but offers somewhat lower performance for highly concurrent access.
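To illustrate how the fact and dimension tables come together, here is a hedged example of an analytical query against the schema defined in `sql_queries.py`. The connection string mirrors `create_tables.py`; the specific aggregation is only an illustration and not part of the project code.

```python
import pandas as pd
import psycopg2

# Same connection parameters as in create_tables.py; adjust to your environment.
conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student")

# Example: number of arrivals per destination city, enriched with demographic context.
query = """
    SELECT a.city,
           COUNT(i.cicid)          AS arrivals,
           MAX(d.total_population) AS total_population
    FROM immigrations i
    JOIN airports a     ON i.iata = a.iata_code
    JOIN demographics d ON a.city = d.city
    GROUP BY a.city
    ORDER BY arrivals DESC
    LIMIT 10;
"""

top_cities = pd.read_sql(query, conn)
print(top_cities)
conn.close()
```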
62 | -------------------------------------------------------------------------------- /Capstone Project/create_tables.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sql_queries import create_table_queries, drop_table_queries 3 | 4 | 5 | def create_database(): 6 | """ 7 | - Creates and connects to the sparkifydb 8 | @return: cursor and connection to sparkifydb 9 | """ 10 | 11 | # connect to default database 12 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") 13 | conn.set_session(autocommit=True) 14 | cur = conn.cursor() 15 | 16 | # create sparkify database with UTF8 encoding 17 | cur.execute("DROP DATABASE IF EXISTS sparkifydb") 18 | cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0") 19 | 20 | # close connection to default database 21 | conn.close() 22 | 23 | # connect to sparkify database 24 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 25 | cur = conn.cursor() 26 | 27 | return cur, conn 28 | 29 | 30 | def drop_tables(cur, conn): 31 | """ 32 | Drops each table using the queries in `drop_table_queries` list. 33 | @param cur: 34 | @param conn: 35 | """ 36 | for query in drop_table_queries: 37 | cur.execute(query) 38 | conn.commit() 39 | 40 | 41 | def create_tables(cur, conn): 42 | """ 43 | Creates each table using the queries in `create_table_queries` list. 44 | @param cur: 45 | @param conn: 46 | """ 47 | for query in create_table_queries: 48 | cur.execute(query) 49 | conn.commit() 50 | 51 | 52 | def main(): 53 | """ 54 | - Drops (if exists) and Creates the sparkify database. 55 | 56 | - Establishes connection with the sparkify database and gets 57 | cursor to it. 58 | 59 | - Drops all the tables. 60 | 61 | - Creates all tables needed. 62 | 63 | - Finally, closes the connection. 
64 | """ 65 | cur, conn = create_database() 66 | 67 | drop_tables(cur, conn) 68 | create_tables(cur, conn) 69 | 70 | conn.close() 71 | 72 | 73 | if __name__ == "__main__": 74 | main() -------------------------------------------------------------------------------- /Capstone Project/sql_queries.py: -------------------------------------------------------------------------------- 1 | create_airports = """ 2 | CREATE TABLE IF NOT EXISTS public.airports ( 3 | iata_code VARCHAR PRIMARY KEY, 4 | name VARCHAR, 5 | type VARCHAR, 6 | local_code VARCHAR, 7 | coordinates VARCHAR, 8 | city VARCHAR, 9 | elevation_ft FLOAT, 10 | continent VARCHAR, 11 | iso_country VARCHAR, 12 | iso_region VARCHAR, 13 | municipality VARCHAR, 14 | gps_code VARCHAR 15 | ); 16 | """ 17 | 18 | drop_airports = "DROP TABLE IF EXISTS airports;" 19 | 20 | airport_insert = """ 21 | INSERT INTO airports (iata_code, name, type, local_code, coordinates, city, elevation_ft, continent, \ 22 | iso_country, iso_region, municipality, gps_code) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""" 23 | 24 | create_demographics = """ 25 | CREATE TABLE IF NOT EXISTS public.demographics ( 26 | city VARCHAR, 27 | state VARCHAR, 28 | media_age FLOAT, 29 | male_population INT, 30 | female_population INT, 31 | total_population INT, 32 | num_veterans INT, 33 | foreign_born INT, 34 | average_household_size FLOAT, 35 | state_code VARCHAR(2), 36 | race VARCHAR, 37 | count INT 38 | ); 39 | """ 40 | 41 | drop_demographics = "DROP TABLE IF EXISTS demographics;" 42 | 43 | demographic_insert = """ 44 | INSERT INTO demographics (city, state, media_age, male_population, female_population, total_population, \ 45 | num_veterans, foreign_born, average_household_size, state_code, race, count) VALUES (%s, %s, %s, %s, \ 46 | %s, %s, %s, %s, %s, %s, %s, %s)""" 47 | 48 | create_immigrations = """ 49 | CREATE TABLE IF NOT EXISTS public.immigrations ( 50 | cicid FLOAT PRIMARY KEY, 51 | year FLOAT, 52 | month FLOAT, 53 | cit FLOAT, 54 | res FLOAT, 55 | iata VARCHAR(3), 56 | arrdate FLOAT, 57 | mode FLOAT, 58 | addr VARCHAR, 59 | depdate FLOAT, 60 | bir FLOAT, 61 | visa FLOAT, 62 | count FLOAT, 63 | dtadfile VARCHAR, 64 | entdepa VARCHAR(1), 65 | entdepd VARCHAR(1), 66 | matflag VARCHAR(1), 67 | biryear FLOAT, 68 | dtaddto VARCHAR, 69 | gender VARCHAR(1), 70 | airline VARCHAR, 71 | admnum FLOAT, 72 | fltno VARCHAR, 73 | visatype VARCHAR 74 | ); 75 | """ 76 | 77 | drop_immigrations = "DROP TABLE IF EXISTS immigrations;" 78 | 79 | immigration_insert = (""" 80 | INSERT INTO immigrations (cicid, year, month, cit, res, iata, arrdate, mode, addr, depdate, bir, visa, count, dtadfile, \ 81 | entdepa, entdepd, matflag, biryear, dtaddto, gender, airline, admnum, fltno, visatype) \ 82 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""") 83 | 84 | create_temperature = """ 85 | CREATE TABLE IF NOT EXISTS temperature ( 86 | timestamp DATE, 87 | average_temperature FLOAT, 88 | average_temperature_uncertainty FLOAT, 89 | city VARCHAR, 90 | country VARCHAR, 91 | latitude VARCHAR, 92 | longitude VARCHAR 93 | ); 94 | """ 95 | 96 | temperature_insert = (""" 97 | INSERT INTO temperature (timestamp, average_temperature, average_temperature_uncertainty, city, country, \ 98 | latitude, longitude) VALUES (%s, %s, %s, %s, %s, %s, %s)""") 99 | 100 | drop_temperature = "DROP TABLE IF EXISTS weather;" 101 | 102 | drop_table_queries = [drop_airports, drop_demographics, drop_immigrations, drop_temperature] 103 | create_table_queries = 
-------------------------------------------------------------------------------- /Data Lake/README.md: -------------------------------------------------------------------------------- 1 | # Data-Lake 2 | Project Data Lake as part of Udacity's Data Engineering Nanodegree 3 | 4 | ## Purpose of this project 5 | 6 | As the startup Sparkify is scaling up quickly, their existing data warehouse can no longer handle the massive data volumes efficiently. They heard about Spark and were curious how it could help them. With this project, they can now analyse their data in a distributed way in memory. This leads to huge speedups compared to the existing approach and allows them to keep track of their clients' behavior easily. 7 | 8 | Moreover, this data lake automates the entire data fusion process, combining multiple data sources from AWS S3 into structured data. The structured data is then stored on AWS S3 again, so Sparkify can use it for further analysis. 9 | 10 | ## Used input data 11 | 12 | The input data resides in two S3 locations: 13 | * Song data: `s3://udacity-dend/song_data` 14 | * Log data: `s3://udacity-dend/log_data` 15 | 16 | The song dataset is a subset of real data from the Million Song Dataset. Each file is in JSON format and contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. For example, here are filepaths to two files in this dataset. 17 | 18 | ``` 19 | song_data/A/B/C/TRABCEI128F424C983.json 20 | song_data/A/A/B/TRAABJL12903CDCF1A.json 21 | ``` 22 | And below is an example of what a single song file, TRAABJL12903CDCF1A.json, looks like. 23 | 24 | ```json 25 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} 26 | ``` 27 | 28 | The log dataset consists of log files in JSON format generated by an event simulator based on the songs in the dataset above. These simulate app activity logs from an imaginary music streaming app based on configuration settings. 29 | 30 | The log files in the dataset are partitioned by year and month. For example, here are filepaths to two files in this dataset. 31 | 32 | ``` 33 | log_data/2018/11/2018-11-12-events.json 34 | log_data/2018/11/2018-11-13-events.json 35 | ``` 36 | 37 | And below is an example of what the data in a log file, 2018-11-12-events.json, looks like. 38 | 39 | ![2018-11-12-events.json structure](https://video.udacity-data.com/topher/2019/February/5c6c3f0a_log-data/log-data.png) 40 | 41 | ## Generated tables 42 | 43 | | name | type | description | columns | 44 | | ---- | ---- | ----------- | ------- | 45 | | songplays | fact table | records in log data associated with song plays i.e. records with page NextSong | songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent | 46 | | users | dimension table | users in the app | user_id, first_name, last_name, gender, level | 47 | | songs | dimension table | songs in music database | song_id, title, artist_id, year, duration | 48 | | artists | dimension table | artists in music database | artist_id, name, location, latitude, longitude | 49 | | time | dimension table | timestamps of records in songplays broken down into specific units | start_time, hour, day, week, month, year, weekday | 50 |
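To make the star schema concrete, here is a hedged sketch of how an analyst could query the generated parquet output with Spark once the pipeline has run. The output bucket path and the aggregation are illustrative assumptions, not part of `etl.py`.

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("sparkify-analysis").getOrCreate()

# Paths assume the output bucket used in etl.py; adjust to your own bucket.
output_data = "s3a://sparkify-data-udend/"
songplays = spark.read.parquet(output_data + "songplays")
users = spark.read.parquet(output_data + "users")

# Example: number of song plays per subscription level and gender.
(songplays
    .join(users.select("user_id", "gender"), "user_id")
    .groupBy("level", "gender")
    .agg(F.count("*").alias("plays"))
    .orderBy(F.desc("plays"))
    .show())
```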
51 | ## ETL pipeline 52 | 53 | The ETL pipeline (see `etl.py`) loads the S3 data sources into Spark dataframes, aggregates and transforms the data into the described schema and writes the data back to S3 in parquet format. 54 | 55 | ## Instructions 56 | 57 | 1. Create an AWS IAM role with S3 read and write access. 58 | 2. Enter the corresponding AWS credentials in the `dl.cfg` configuration file. 59 | 3. Create an S3 bucket (note that the region eu-central-1 may cause issues) and enter the URL to the bucket in `etl.py` as the value of `output_data`. 60 | 4. Run `python3 etl.py` to process the data and store it on your created S3 bucket. A sketch for sanity-checking the output follows below. 61 |
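After step 4, the write can be sanity-checked by reading the parquet output back. This is a hedged helper sketch rather than part of the project; the bucket path is the same assumption as in `etl.py`.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sparkify-output-check").getOrCreate()

output_data = "s3a://sparkify-data-udend/"  # replace with your own bucket

# Print a row count per generated table to confirm every table was written.
for table in ["songplays", "users", "songs", "artists", "time"]:
    count = spark.read.parquet(output_data + table).count()
    print(f"{table}: {count} rows")
```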
-------------------------------------------------------------------------------- /Data Lake/dl.cfg: -------------------------------------------------------------------------------- 1 | [AWS] 2 | AWS_ACCESS_KEY_ID= 3 | AWS_SECRET_ACCESS_KEY= -------------------------------------------------------------------------------- /Data Lake/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | from datetime import datetime 3 | import os 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.functions import udf, col, year, month, dayofweek, hour, weekofyear, dayofmonth, \ 6 | monotonically_increasing_id, from_unixtime 7 | from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, TimestampType 8 | 9 | config = configparser.ConfigParser() 10 | config.read('dl.cfg') 11 | 12 | os.environ['AWS_ACCESS_KEY_ID'] = config.get('AWS', 'AWS_ACCESS_KEY_ID') 13 | os.environ['AWS_SECRET_ACCESS_KEY'] = config.get('AWS', 'AWS_SECRET_ACCESS_KEY') 14 | 15 | 16 | def create_spark_session(): 17 | """ 18 | Creates a new or uses the existing spark session. 19 | """ 20 | spark = SparkSession \ 21 | .builder \ 22 | .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \ 23 | .getOrCreate() 24 | return spark 25 | 26 | 27 | def process_song_data(spark, input_data, output_data): 28 | """ 29 | Processes all song data JSON files in the given input folder and stores them in parquet format in the output folder. 30 | :param spark: spark session 31 | :param input_data: input data path 32 | :param output_data: output data path 33 | """ 34 | song_data = os.path.join(input_data, 'song_data/*/*/*/*.json') 35 | 36 | song_schema = StructType([ 37 | StructField("artist_id", StringType()), 38 | StructField("artist_latitude", DoubleType()), 39 | StructField("artist_location", StringType()), 40 | StructField("artist_longitude", DoubleType()),  # numeric, consistent with artist_latitude 41 | StructField("artist_name", StringType()), 42 | StructField("duration", DoubleType()), 43 | StructField("num_songs", IntegerType()), 44 | StructField("title", StringType()), 45 | StructField("year", IntegerType()), 46 | ]) 47 | 48 | # read song data file 49 | df = spark.read.json(song_data, schema=song_schema) 50 | 51 | # extract columns to create songs table 52 | song_fields = ["title", "artist_id", "year", "duration"] 53 | songs_table = df.select(song_fields).dropDuplicates().withColumn("song_id", monotonically_increasing_id()) 54 | 55 | # write songs table to parquet files partitioned by year and artist 56 | songs_table.write.mode("overwrite").partitionBy("year", "artist_id").parquet(output_data + "songs") 57 | 58 | # extract columns to create artists table 59 | artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", 60 | "artist_longitude as longitude"] 61 | artists_table = df.selectExpr(artists_fields).dropDuplicates() 62 | 63 | # write artists table to parquet files 64 | artists_table.write.mode("overwrite").parquet(output_data + 'artists') 65 | 66 | 67 | def process_log_data(spark, input_data, output_data): 68 | """ 69 | Processes all log data JSON files in the given input folder and stores them in parquet format in the output folder. 70 | :param spark: spark session 71 | :param input_data: input data path 72 | :param output_data: output data path 73 | """ 74 | # get filepath to log data file 75 | log_data = os.path.join(input_data, 'log_data/*/*/*.json') 76 | 77 | # read log data file 78 | log_df = spark.read.json(log_data) 79 | 80 | # filter by actions for song plays 81 | log_df = log_df.filter(log_df.page == 'NextSong') 82 | 83 | # extract columns for users table 84 | users_fields = ["userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level"] 85 | users_table = log_df.selectExpr(users_fields).dropDuplicates() 86 | 87 | # write users table to parquet files 88 | users_table.write.mode("overwrite").parquet(output_data + 'users') 89 | 90 | # create a numeric seconds column from the original millisecond timestamp column 91 | get_timestamp = udf(lambda x: x / 1000.0, DoubleType()) 92 | log_df = log_df.withColumn("timestamp", get_timestamp(log_df.ts)) 93 | 94 | # create datetime column from original timestamp column 95 | get_datetime = udf(lambda x: datetime.fromtimestamp(x), TimestampType()) 96 | log_df = log_df.withColumn("start_time", get_datetime(log_df.timestamp)) 97 | 98 | # extract columns to create time table 99 | log_df = log_df.withColumn("hour", hour("start_time")) \ 100 | .withColumn("day", dayofmonth("start_time")) \ 101 | .withColumn("week", weekofyear("start_time")) \ 102 | .withColumn("month", month("start_time")) \ 103 | .withColumn("year", year("start_time")) \ 104 | .withColumn("weekday", dayofweek("start_time")) 105 | 106 | time_table = log_df.select("start_time", "hour", "day", "week", "month", "year", "weekday") 107 | 108 | # write time table to parquet files partitioned by year and month 109 | time_table.write.mode("overwrite").partitionBy("year",
"month").parquet(output_data + "time") 110 | 111 | # read in song data to use for songplays table 112 | songs_df = spark.read.parquet(os.path.join(output_data, "songs/*/*/*")) 113 | songs_logs = log_df.join(songs_df, (log_df.song == songs_df.title)) 114 | 115 | # extract columns from joined song and log datasets to create songplays table 116 | artists_df = spark.read.parquet(os.path.join(output_data, "artists")) 117 | artists_songs_logs = songs_logs.join(artists_df, (songs_logs.artist == artists_df.name)) 118 | songplays = artists_songs_logs.join( 119 | time_table, 120 | artists_songs_logs.ts == time_table.ts, 'left' 121 | ).drop(artists_songs_logs.year) 122 | 123 | # write songplays table to parquet files partitioned by year and month 124 | songplays_table = songplays.select( 125 | col('start_time'), 126 | col('userId').alias('user_id'), 127 | col('level'), 128 | col('song_id'), 129 | col('artist_id'), 130 | col('sessionId').alias('session_id'), 131 | col('location'), 132 | col('userAgent').alias('user_agent'), 133 | col('year'), 134 | col('month'), 135 | ).repartition("year", "month") 136 | 137 | songplays_table.write.mode("overwrite").partitionBy("year", "month").parquet(output_data, 'songplays') 138 | 139 | 140 | def main(): 141 | spark = create_spark_session() 142 | input_data = "s3a://udacity-dend/" 143 | output_data = "s3a://sparkify-data-udend/" 144 | 145 | process_song_data(spark, input_data, output_data) 146 | process_log_data(spark, input_data, output_data) 147 | 148 | 149 | if __name__ == "__main__": 150 | main() 151 | -------------------------------------------------------------------------------- /Data Modeling with Cassandra/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manuel-lang/Data-Engineering-Nanodegree/330b6b3ce020fb479868c44163aeb70e473dd111/Data Modeling with Cassandra/.DS_Store -------------------------------------------------------------------------------- /Data Modeling with Cassandra/Project_1B_ Project_Template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "editable": true 7 | }, 8 | "source": [ 9 | "# Part I. 
ETL Pipeline for Pre-Processing the Files" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "editable": true 16 | }, 17 | "source": [ 18 | "## PLEASE RUN THE FOLLOWING CODE FOR PRE-PROCESSING THE FILES" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "editable": true 25 | }, 26 | "source": [ 27 | "#### Import Python packages " 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": { 34 | "editable": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# Import Python packages \n", 39 | "import pandas as pd\n", 40 | "import cassandra\n", 41 | "import re\n", 42 | "import os\n", 43 | "import glob\n", 44 | "import numpy as np\n", 45 | "import json\n", 46 | "import csv" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "editable": true 53 | }, 54 | "source": [ 55 | "#### Creating list of filepaths to process original event csv data files" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": { 62 | "editable": true 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "/home/workspace\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "# checking your current working directory\n", 75 | "print(os.getcwd())\n", 76 | "\n", 77 | "# Get your current folder and subfolder event data\n", 78 | "filepath = os.getcwd() + '/event_data'\n", 79 | "\n", 80 | "# Create a for loop to create a list of files and collect each filepath\n", 81 | "for root, dirs, files in os.walk(filepath):\n", 82 | " \n", 83 | "# join the file path and roots with the subdirectories using glob\n", 84 | " file_path_list = glob.glob(os.path.join(root,'*'))\n", 85 | " #print(file_path_list)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "editable": true 92 | }, 93 | "source": [ 94 | "#### Processing the files to create the data file csv that will be used for Apache Casssandra tables" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 3, 100 | "metadata": { 101 | "editable": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# initiating an empty list of rows that will be generated from each file\n", 106 | "full_data_rows_list = [] \n", 107 | " \n", 108 | "# for every filepath in the file path list \n", 109 | "for f in file_path_list:\n", 110 | "\n", 111 | "# reading csv file \n", 112 | " with open(f, 'r', encoding = 'utf8', newline='') as csvfile: \n", 113 | " # creating a csv reader object \n", 114 | " csvreader = csv.reader(csvfile) \n", 115 | " next(csvreader)\n", 116 | " \n", 117 | " # extracting each data row one by one and append it \n", 118 | " for line in csvreader:\n", 119 | " #print(line)\n", 120 | " full_data_rows_list.append(line) \n", 121 | " \n", 122 | "\n", 123 | "# creating a smaller event data csv file called event_datafile_full csv that will be used to insert data into the \\\n", 124 | "# Apache Cassandra tables\n", 125 | "csv.register_dialect('myDialect', quoting=csv.QUOTE_ALL, skipinitialspace=True)\n", 126 | "\n", 127 | "with open('event_datafile_new.csv', 'w', encoding = 'utf8', newline='') as f:\n", 128 | " writer = csv.writer(f, dialect='myDialect')\n", 129 | " writer.writerow(['artist','firstName','gender','itemInSession','lastName','length',\\\n", 130 | " 'level','location','sessionId','song','userId'])\n", 131 | " for row in full_data_rows_list:\n", 132 | " if (row[0] == ''):\n", 133 | " continue\n", 134 | " writer.writerow((row[0], row[2], row[3], 
row[4], row[5], row[6], row[7], row[8], row[12], row[13], row[16]))\n" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 4, 140 | "metadata": { 141 | "editable": true 142 | }, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "6821\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "# check the number of rows in your csv file\n", 154 | "with open('event_datafile_new.csv', 'r', encoding = 'utf8') as f:\n", 155 | " print(sum(1 for line in f))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "editable": true 162 | }, 163 | "source": [ 164 | "# Part II. Complete the Apache Cassandra coding portion of your project. \n", 165 | "\n", 166 | "## Now you are ready to work with the CSV file titled event_datafile_new.csv, located within the Workspace directory. The event_datafile_new.csv contains the following columns: \n", 167 | "- artist \n", 168 | "- firstName of user\n", 169 | "- gender of user\n", 170 | "- item number in session\n", 171 | "- last name of user\n", 172 | "- length of the song\n", 173 | "- level (paid or free song)\n", 174 | "- location of the user\n", 175 | "- sessionId\n", 176 | "- song title\n", 177 | "- userId\n", 178 | "\n", 179 | "The image below is a screenshot of what the denormalized data should appear like in the **event_datafile_new.csv** after the code above is run:
\n", 180 | "\n", 181 | "" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "editable": true 188 | }, 189 | "source": [ 190 | "## Begin writing your Apache Cassandra code in the cells below" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "editable": true 197 | }, 198 | "source": [ 199 | "#### Creating a Cluster" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 5, 205 | "metadata": { 206 | "editable": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "# This should make a connection to a Cassandra instance your local machine \n", 211 | "# (127.0.0.1)\n", 212 | "\n", 213 | "from cassandra.cluster import Cluster\n", 214 | "cluster = Cluster()\n", 215 | "\n", 216 | "# To establish connection and begin executing queries, need a session\n", 217 | "session = cluster.connect()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "editable": true 224 | }, 225 | "source": [ 226 | "#### Create Keyspace" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 6, 232 | "metadata": { 233 | "editable": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "try:\n", 238 | " session.execute(\"\"\"\n", 239 | " CREATE KEYSPACE IF NOT EXISTS udacity \n", 240 | " WITH REPLICATION = \n", 241 | " { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }\"\"\"\n", 242 | ")\n", 243 | "\n", 244 | "except Exception as e:\n", 245 | " print(e)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "editable": true 252 | }, 253 | "source": [ 254 | "#### Set Keyspace" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 7, 260 | "metadata": { 261 | "editable": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "try:\n", 266 | " session.set_keyspace('udacity')\n", 267 | "except Exception as e:\n", 268 | " print(e)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "editable": true 275 | }, 276 | "source": [ 277 | "### Now we need to create tables to run the following queries. Remember, with Apache Cassandra you model the database tables on the queries you want to run." 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": { 283 | "editable": true 284 | }, 285 | "source": [ 286 | "## Create queries to ask the following three questions of the data\n", 287 | "\n", 288 | "### 1. Give me the artist, song title and song's length in the music app history that was heard during sessionId = 338, and itemInSession = 4\n", 289 | "\n", 290 | "\n", 291 | "### 2. Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182\n", 292 | " \n", 293 | "\n", 294 | "### 3. 
Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'\n", 295 | "\n", 296 | "\n" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": { 302 | "editable": true 303 | }, 304 | "source": [ 305 | "#### Delete and create the sessions table to match query 1" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 8, 311 | "metadata": { 312 | "editable": true 313 | }, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "" 319 | ] 320 | }, 321 | "execution_count": 8, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "delete_sessions_table_query = \"DROP TABLE IF EXISTS sessions\"\n", 328 | "session.execute(delete_sessions_table_query)\n", 329 | "\n", 330 | "create_sessions_table_query = \"CREATE TABLE IF NOT EXISTS sessions (artist text, item_in_session int, \\\n", 331 | "length float, session_id int, song_title text, PRIMARY KEY (session_id, item_in_session))\"\n", 332 | "session.execute(create_sessions_table_query)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "editable": true 339 | }, 340 | "source": [ 341 | "#### Insert all sessions from the csv data into the sessions table" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 9, 347 | "metadata": { 348 | "editable": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "file = 'event_datafile_new.csv'\n", 353 | "\n", 354 | "with open(file, encoding = 'utf8') as f:\n", 355 | " csvreader = csv.reader(f)\n", 356 | " next(csvreader) # skip header\n", 357 | " for line in csvreader:\n", 358 | " query = \"INSERT INTO sessions (artist, item_in_session, length, session_id, song_title)\"\n", 359 | " query = query + \" VALUES (%s, %s, %s, %s, %s)\"\n", 360 | " session.execute(query, (line[0], int(line[3]), float(line[5]), int(line[8]), line[9]))" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": { 366 | "editable": true 367 | }, 368 | "source": [ 369 | "#### Do a SELECT to verify that the data have been inserted into the table\n", 370 | "#### Give me the artist, song title and song's length in the music app history that was heard during sessionId = 338, and itemInSession = 4" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 10, 376 | "metadata": { 377 | "editable": true, 378 | "scrolled": true 379 | }, 380 | "outputs": [ 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "Faithless Music Matters (Mark Knight Dub) 495.30731201171875\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "select_session_quert = \"select artist, song_title, length from sessions WHERE session_id = 338 and item_in_session = 4\"\n", 391 | "rows = session.execute(select_session_quert)\n", 392 | "for row in rows:\n", 393 | " print (row.artist, row.song_title, row.length)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": { 399 | "editable": true 400 | }, 401 | "source": [ 402 | "### COPY AND REPEAT THE ABOVE THREE CELLS FOR EACH OF THE THREE QUESTIONS" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "editable": true 409 | }, 410 | "source": [ 411 | "#### Delete and create the users table to match query 2" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 11, 417 | "metadata": { 418 | "editable": true 419 | }, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/plain": [ 
424 | "" 425 | ] 426 | }, 427 | "execution_count": 11, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "delete_users_table_query = \"DROP TABLE IF EXISTS users\"\n", 434 | "session.execute(delete_users_table_query)\n", 435 | "\n", 436 | "create_users_table_query = \"CREATE TABLE IF NOT EXISTS users (artist text, first_name text, \\\n", 437 | "item_in_session int, last_name text, session_id int, song_title text, user_id int, \\\n", 438 | "PRIMARY KEY ((user_id, session_id), item_in_session))\"\n", 439 | "session.execute(create_users_table_query)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "editable": true 446 | }, 447 | "source": [ 448 | "#### Insert all users into the users table" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 12, 454 | "metadata": { 455 | "editable": true 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "file = 'event_datafile_new.csv'\n", 460 | "\n", 461 | "with open(file, encoding = 'utf8') as f:\n", 462 | " csvreader = csv.reader(f)\n", 463 | " next(csvreader) # skip header\n", 464 | " for line in csvreader:\n", 465 | " query = \"INSERT INTO users (artist, first_name, item_in_session, last_name, session_id, song_title, user_id)\"\n", 466 | " query = query + \" VALUES (%s, %s, %s, %s, %s, %s, %s)\"\n", 467 | " session.execute(query, (line[0], line[1], int(line[3]), line[4], int(line[8]), line[9], int(line[10]))) " 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": { 473 | "editable": true 474 | }, 475 | "source": [ 476 | "#### Do a SELECT to verify that the data have been inserted into the table\n", 477 | "#### Give me only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 13, 483 | "metadata": { 484 | "editable": true 485 | }, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "Down To The Bone Keep On Keepin' On Sylvie Cruz\n", 492 | "Three Drives Greece 2000 Sylvie Cruz\n", 493 | "Sebastien Tellier Kilometer Sylvie Cruz\n", 494 | "Lonnie Gordon Catch You Baby (Steve Pitron & Max Sanna Radio Edit) Sylvie Cruz\n" 495 | ] 496 | } 497 | ], 498 | "source": [ 499 | "select_user_query = \"select artist, song_title, first_name, last_name from users WHERE session_id = 182 and user_id = 10\"\n", 500 | "rows = session.execute(select_user_query)\n", 501 | "for row in rows:\n", 502 | " print (row.artist, row.song_title, row.first_name, row.last_name)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "editable": true 509 | }, 510 | "source": [ 511 | "#### Delete and create the song_listens table to match query 3." 
512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 14, 517 | "metadata": { 518 | "editable": true 519 | }, 520 | "outputs": [ 521 | { 522 | "data": { 523 | "text/plain": [ 524 | "" 525 | ] 526 | }, 527 | "execution_count": 14, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "delete_song_listens_table_query = \"DROP TABLE IF EXISTS song_listens\"\n", 534 | "session.execute(delete_song_listens_table_query)\n", 535 | "\n", 536 | "create_song_listens_table_query = \"CREATE TABLE IF NOT EXISTS song_listens (first_name text, last_name text, song_title text, user_id int, \\\n", 537 | "PRIMARY KEY (song_title, user_id))\"\n", 538 | "session.execute(create_song_listens_table_query)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": { 544 | "editable": true 545 | }, 546 | "source": [ 547 | "#### Insert all song_listens into the table" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 15, 553 | "metadata": { 554 | "editable": true 555 | }, 556 | "outputs": [], 557 | "source": [ 558 | "file = 'event_datafile_new.csv'\n", 559 | "\n", 560 | "with open(file, encoding = 'utf8') as f:\n", 561 | " csvreader = csv.reader(f)\n", 562 | " next(csvreader) # skip header\n", 563 | " for line in csvreader:\n", 564 | " query = \"INSERT INTO song_listens (first_name, last_name, song_title, user_id)\"\n", 565 | " query = query + \" VALUES (%s, %s, %s, %s)\"\n", 566 | " session.execute(query, (line[1], line[4], line[9], int(line[10]))) " 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": { 572 | "editable": true 573 | }, 574 | "source": [ 575 | "#### Do a SELECT to verify that the data have been inserted into the table\n", 576 | "#### Give me every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 16, 582 | "metadata": { 583 | "editable": true 584 | }, 585 | "outputs": [ 586 | { 587 | "name": "stdout", 588 | "output_type": "stream", 589 | "text": [ 590 | "Jacqueline Lynch\n", 591 | "Tegan Levine\n", 592 | "Sara Johnson\n" 593 | ] 594 | } 595 | ], 596 | "source": [ 597 | "select_song_listens_query = \"select first_name, last_name from song_listens WHERE song_title = 'All Hands Against His Own'\"\n", 598 | "rows = session.execute(select_song_listens_query)\n", 599 | "for row in rows:\n", 600 | " print (row.first_name, row.last_name)" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": { 606 | "editable": true 607 | }, 608 | "source": [ 609 | "### Drop the tables before closing out the sessions" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 17, 615 | "metadata": { 616 | "editable": true 617 | }, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/plain": [ 622 | "" 623 | ] 624 | }, 625 | "execution_count": 17, 626 | "metadata": {}, 627 | "output_type": "execute_result" 628 | } 629 | ], 630 | "source": [ 631 | "delete_sessions_table_query = \"DROP TABLE IF EXISTS sessions\"\n", 632 | "session.execute(delete_sessions_table_query)\n", 633 | "\n", 634 | "delete_users_table_query = \"DROP TABLE IF EXISTS users\"\n", 635 | "session.execute(delete_users_table_query)\n", 636 | "\n", 637 | "delete_song_listens_table_query = \"DROP TABLE IF EXISTS song_listens\"\n", 638 | "session.execute(delete_song_listens_table_query)" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": { 
644 | "editable": true 645 | }, 646 | "source": [ 647 | "### Close the session and cluster connection¶" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 18, 653 | "metadata": { 654 | "editable": true 655 | }, 656 | "outputs": [], 657 | "source": [ 658 | "session.shutdown()\n", 659 | "cluster.shutdown()" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": { 666 | "editable": true 667 | }, 668 | "outputs": [], 669 | "source": [] 670 | } 671 | ], 672 | "metadata": { 673 | "kernelspec": { 674 | "display_name": "Python 3", 675 | "language": "python", 676 | "name": "python3" 677 | }, 678 | "language_info": { 679 | "codemirror_mode": { 680 | "name": "ipython", 681 | "version": 3 682 | }, 683 | "file_extension": ".py", 684 | "mimetype": "text/x-python", 685 | "name": "python", 686 | "nbconvert_exporter": "python", 687 | "pygments_lexer": "ipython3", 688 | "version": "3.6.3" 689 | } 690 | }, 691 | "nbformat": 4, 692 | "nbformat_minor": 4 693 | } 694 | -------------------------------------------------------------------------------- /Data Modeling with Cassandra/README.md: -------------------------------------------------------------------------------- 1 | # Udactity-Data-Modeling-with-Cassandra 2 | 3 | A startup called Sparkify wants to analyze the data they've been collecting on songs and user activity on their new music streaming app. The analysis team is particularly interested in understanding what songs users are listening to. Currently, there is no easy way to query the data to generate the results, since the data reside in a directory of CSV files on user activity on the app. 4 | 5 | They'd like a data engineer to create an Apache Cassandra database which can create queries on song play data to answer the questions, and wish to bring you on the project. Your role is to create a database for this analysis. You'll be able to test your database by running queries given to you by the analytics team from Sparkify to create the results. 6 | -------------------------------------------------------------------------------- /Data Modeling with Cassandra/images/image_event_datafile_new.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manuel-lang/Data-Engineering-Nanodegree/330b6b3ce020fb479868c44163aeb70e473dd111/Data Modeling with Cassandra/images/image_event_datafile_new.jpg -------------------------------------------------------------------------------- /Data Modeling with Postgres/README.md: -------------------------------------------------------------------------------- 1 | ## Udactity Data Engineer Nanodegree Project: Data Modeling with Postgres 2 | 3 | A startup called Sparkify wants to analyze the data they've been collecting on songs and user activity on their new music streaming app. The analytics team is particularly interested in understanding what songs users are listening to. Currently, they don't have an easy way to query their data, which resides in a directory of JSON logs on user activity on the app, as well as a directory with JSON metadata on the songs in their app. 4 | 5 | They'd like a data engineer to create a Postgres database with tables designed to optimize queries on song play analysis, and bring you on the project. Your role is to create a database schema and ETL pipeline for this analysis. 
You'll be able to test your database and ETL pipeline by running queries given to you by the analytics team from Sparkify and comparing your results with their expected results. 6 | 7 | ### Datasets available 8 | 9 | The song dataset is a subset of real data from the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong/). Each file is in JSON format and contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. 10 | 11 | The log dataset consists of log files in JSON format generated by [this event simulator](https://github.com/Interana/eventsim) based on the songs in the dataset above. These simulate activity logs from a music streaming app based on specified configurations. 12 | 13 | ### Setup Instructions and Steps followed 14 | 15 | * Install requirements with `pip3 install -r requirements.txt`. 16 | * Set up a local PostgreSQL instance on port 5432. Please see detailed instructions in the [PostgreSQL documentation](https://www.postgresql.org/docs/9.1/runtime.html). 17 | 18 | ### Program execution 19 | 20 | * Create the database and its tables by executing `python3 create_tables.py`. 21 | * Load the data and insert it into the database by executing `python3 etl.py`. 22 | 23 | ### Schema Design 24 | 25 | * The fact table `songplays` stores the records in the log data associated with song plays, i.e. records with page `NextSong`. 26 | * The dimension table `users` stores the users in the app. 27 | * The dimension table `songs` stores the songs in the music database. 28 | * The dimension table `artists` stores the artists in the music database. 29 | * The dimension table `time` stores the timestamps of records in songplays broken down into specific units. 30 | 31 | ### Purpose of this database 32 | 33 | This database makes it possible to aggregate all songs, artists, users and songplays in a single database. In this way, the company has all the needed data in one structure and can thus analyze different scenarios easily. For instance, one can analyze the popularity of different songs or artists. It is also possible to perform analyses with geographic information, for instance to determine which songs are popular in which country and region. 34 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/create_tables.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sql_queries import create_table_queries, drop_table_queries 3 | 4 | 5 | def create_database(): 6 | """ 7 | - Creates and connects to the sparkifydb 8 | @return: cursor and connection to sparkifydb 9 | """ 10 | 11 | # connect to default database 12 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") 13 | conn.set_session(autocommit=True) 14 | cur = conn.cursor() 15 | 16 | # create sparkify database with UTF8 encoding 17 | cur.execute("DROP DATABASE IF EXISTS sparkifydb") 18 | cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0") 19 | 20 | # close connection to default database 21 | conn.close() 22 | 23 | # connect to sparkify database 24 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 25 | cur = conn.cursor() 26 | 27 | return cur, conn 28 | 29 | 30 | def drop_tables(cur, conn): 31 | """ 32 | Drops each table using the queries in `drop_table_queries` list.
33 | @param cur: 34 | @param conn: 35 | """ 36 | for query in drop_table_queries: 37 | cur.execute(query) 38 | conn.commit() 39 | 40 | 41 | def create_tables(cur, conn): 42 | """ 43 | Creates each table using the queries in `create_table_queries` list. 44 | @param cur: 45 | @param conn: 46 | """ 47 | for query in create_table_queries: 48 | cur.execute(query) 49 | conn.commit() 50 | 51 | 52 | def main(): 53 | """ 54 | - Drops (if exists) and Creates the sparkify database. 55 | 56 | - Establishes connection with the sparkify database and gets 57 | cursor to it. 58 | 59 | - Drops all the tables. 60 | 61 | - Creates all tables needed. 62 | 63 | - Finally, closes the connection. 64 | """ 65 | cur, conn = create_database() 66 | 67 | drop_tables(cur, conn) 68 | create_tables(cur, conn) 69 | 70 | conn.close() 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/etl.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "editable": true 7 | }, 8 | "source": [ 9 | "# ETL Processes\n", 10 | "Use this notebook to develop the ETL process for each of your tables before completing the `etl.py` file to load the whole datasets." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "editable": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import os\n", 22 | "import glob\n", 23 | "import psycopg2\n", 24 | "import pandas as pd\n", 25 | "from sql_queries import *" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "editable": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "conn = psycopg2.connect(\"host=127.0.0.1 dbname=sparkifydb user=student password=student\")\n", 37 | "cur = conn.cursor()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": { 44 | "editable": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "def get_files(filepath):\n", 49 | " all_files = []\n", 50 | " for root, dirs, files in os.walk(filepath):\n", 51 | " files = glob.glob(os.path.join(root,'*.json'))\n", 52 | " for f in files :\n", 53 | " all_files.append(os.path.abspath(f))\n", 54 | " \n", 55 | " return all_files" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "editable": true 62 | }, 63 | "source": [ 64 | "# Process `song_data`\n", 65 | "In this first part, you'll perform ETL on the first dataset, `song_data`, to create the `songs` and `artists` dimensional tables.\n", 66 | "\n", 67 | "Let's perform ETL on a single song file and load a single record into each table to start.\n", 68 | "- Use the `get_files` function provided above to get a list of all song JSON files in `data/song_data`\n", 69 | "- Select the first song in this list\n", 70 | "- Read the song file and view the data" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": { 77 | "editable": true 78 | }, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "['/home/workspace/data/song_data/A/B/C/TRABCRU128F423F449.json',\n", 84 | " '/home/workspace/data/song_data/A/B/C/TRABCTK128F934B224.json',\n", 85 | " '/home/workspace/data/song_data/A/B/C/TRABCUQ128E0783E2B.json']" 86 | ] 87 | }, 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "song_files = 
get_files(\"./data/song_data\")\n", 95 | "song_files[0:3]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "editable": true 103 | }, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "'/home/workspace/data/song_data/A/B/C/TRABCRU128F423F449.json'" 109 | ] 110 | }, 111 | "execution_count": 5, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "filepath = song_files[0]\n", 118 | "filepath" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "editable": true 126 | }, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "num_songs 1\n", 132 | "artist_id AR8IEZO1187B99055E\n", 133 | "artist_latitude None\n", 134 | "artist_longitude None\n", 135 | "artist_location \n", 136 | "artist_name Marc Shaiman\n", 137 | "song_id SOINLJW12A8C13314C\n", 138 | "title City Slickers\n", 139 | "duration 149.864\n", 140 | "year 2008\n", 141 | "dtype: object" 142 | ] 143 | }, 144 | "execution_count": 6, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "df = pd.read_json(filepath, typ='series')\n", 151 | "df.head(20)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "editable": true 158 | }, 159 | "source": [ 160 | "## #1: `songs` Table\n", 161 | "#### Extract Data for Songs Table\n", 162 | "- Select columns for song ID, title, artist ID, year, and duration\n", 163 | "- Use `df.values` to select just the values from the dataframe\n", 164 | "- Index to select the first (only) record in the dataframe\n", 165 | "- Convert the array to a list and set it to `song_data`" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 7, 171 | "metadata": { 172 | "editable": true 173 | }, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "['SOINLJW12A8C13314C', 'City Slickers', 'AR8IEZO1187B99055E', 2008, 149.86404]" 179 | ] 180 | }, 181 | "execution_count": 7, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "song_data = df[[\"song_id\", \"title\", \"artist_id\", \"year\", \"duration\"]]\n", 188 | "song_data = list(song_data.values)\n", 189 | "song_data" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "editable": true 196 | }, 197 | "source": [ 198 | "#### Insert Record into Song Table\n", 199 | "Implement the `song_table_insert` query in `sql_queries.py` and run the cell below to insert a record for this song into the `songs` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `songs` table in the sparkify database." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 8, 205 | "metadata": { 206 | "editable": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "cur.execute(song_table_insert, song_data)\n", 211 | "conn.commit()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "editable": true 218 | }, 219 | "source": [ 220 | "Run `test.ipynb` to see if you've successfully added a record to this table." 
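The notebook cell above asks for a `song_table_insert` query to be implemented in `sql_queries.py`, but that file's contents are not reproduced at this point in the listing. A minimal sketch of what such a parameterised insert could look like, assuming the `songs` table uses the five columns from the notebook's `song_data` list and that `song_id` is its primary key (the repository's actual `sql_queries.py` may differ), is shown here.

```python
# Hypothetical sketch only; not taken from the repository's sql_queries.py.
# Assumes songs(song_id, title, artist_id, year, duration) with song_id as primary key.
song_table_insert = """
    INSERT INTO songs (song_id, title, artist_id, year, duration)
    VALUES (%s, %s, %s, %s, %s)
    ON CONFLICT (song_id) DO NOTHING;
"""

# Used exactly like the notebook cell:
#     cur.execute(song_table_insert, song_data)
# ON CONFLICT DO NOTHING keeps repeated runs idempotent instead of failing on duplicate song IDs.
```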
221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "editable": true 227 | }, 228 | "source": [ 229 | "## #2: `artists` Table\n", 230 | "#### Extract Data for Artists Table\n", 231 | "- Select columns for artist ID, name, location, latitude, and longitude\n", 232 | "- Use `df.values` to select just the values from the dataframe\n", 233 | "- Index to select the first (only) record in the dataframe\n", 234 | "- Convert the array to a list and set it to `artist_data`" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "metadata": { 241 | "editable": true 242 | }, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "artist_id AR8IEZO1187B99055E\n", 248 | "artist_name Marc Shaiman\n", 249 | "artist_location \n", 250 | "artist_latitude None\n", 251 | "artist_longitude None\n", 252 | "dtype: object" 253 | ] 254 | }, 255 | "execution_count": 9, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "artist_data = df[[\"artist_id\", \"artist_name\", \"artist_location\", \"artist_latitude\", \"artist_longitude\"]]\n", 262 | "artist_data" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "editable": true 269 | }, 270 | "source": [ 271 | "#### Insert Record into Artist Table\n", 272 | "Implement the `artist_table_insert` query in `sql_queries.py` and run the cell below to insert a record for this song's artist into the `artists` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `artists` table in the sparkify database." 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 10, 278 | "metadata": { 279 | "editable": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "cur.execute(artist_table_insert, artist_data)\n", 284 | "conn.commit()" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "editable": true 291 | }, 292 | "source": [ 293 | "Run `test.ipynb` to see if you've successfully added a record to this table." 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "editable": true 300 | }, 301 | "source": [ 302 | "# Process `log_data`\n", 303 | "In this part, you'll perform ETL on the second dataset, `log_data`, to create the `time` and `users` dimensional tables, as well as the `songplays` fact table.\n", 304 | "\n", 305 | "Let's perform ETL on a single log file and load a single record into each table.\n", 306 | "- Use the `get_files` function provided above to get a list of all log JSON files in `data/log_data`\n", 307 | "- Select the first log file in this list\n", 308 | "- Read the log file and view the data" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 11, 314 | "metadata": { 315 | "editable": true 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "log_files = get_files(\"./data/log_data\")" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 12, 325 | "metadata": { 326 | "editable": true 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "filepath = log_files[0]" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 13, 336 | "metadata": { 337 | "editable": true 338 | }, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/html": [ 343 | "
\n", 344 | "\n", 357 | "\n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | "
artistauthfirstNamegenderitemInSessionlastNamelengthlevellocationmethodpageregistrationsessionIdsongstatustsuserAgentuserId
0Sydney YoungbloodLogged InJacobM53Klein238.07955paidTampa-St. Petersburg-Clearwater, FLPUTNextSong1.540558e+12954Ain't No Sunshine2001543449657796\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...73
1Gang StarrLogged InLaylaF88Griffin151.92771paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984My Advice 2 You (Explicit)2001543449690796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
23OH!3Logged InLaylaF89Griffin192.52200paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984My First Kiss (Feat. Ke$ha) [Album Version]2001543449841796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
\n", 447 | "
" 448 | ], 449 | "text/plain": [ 450 | " artist auth firstName gender itemInSession lastName \\\n", 451 | "0 Sydney Youngblood Logged In Jacob M 53 Klein \n", 452 | "1 Gang Starr Logged In Layla F 88 Griffin \n", 453 | "2 3OH!3 Logged In Layla F 89 Griffin \n", 454 | "\n", 455 | " length level location method page \\\n", 456 | "0 238.07955 paid Tampa-St. Petersburg-Clearwater, FL PUT NextSong \n", 457 | "1 151.92771 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 458 | "2 192.52200 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 459 | "\n", 460 | " registration sessionId song \\\n", 461 | "0 1.540558e+12 954 Ain't No Sunshine \n", 462 | "1 1.541057e+12 984 My Advice 2 You (Explicit) \n", 463 | "2 1.541057e+12 984 My First Kiss (Feat. Ke$ha) [Album Version] \n", 464 | "\n", 465 | " status ts userAgent \\\n", 466 | "0 200 1543449657796 \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4... \n", 467 | "1 200 1543449690796 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... \n", 468 | "2 200 1543449841796 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... \n", 469 | "\n", 470 | " userId \n", 471 | "0 73 \n", 472 | "1 24 \n", 473 | "2 24 " 474 | ] 475 | }, 476 | "execution_count": 13, 477 | "metadata": {}, 478 | "output_type": "execute_result" 479 | } 480 | ], 481 | "source": [ 482 | "df = pd.read_json(filepath, lines=True)\n", 483 | "df.head(3)" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": { 489 | "editable": true 490 | }, 491 | "source": [ 492 | "## #3: `time` Table\n", 493 | "#### Extract Data for Time Table\n", 494 | "- Filter records by `NextSong` action\n", 495 | "- Convert the `ts` timestamp column to datetime\n", 496 | " - Hint: the current timestamp is in milliseconds\n", 497 | "- Extract the timestamp, hour, day, week of year, month, year, and weekday from the `ts` column and set `time_data` to a list containing these values in order\n", 498 | " - Hint: use pandas' [`dt` attribute](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.html) to access easily datetimelike properties.\n", 499 | "- Specify labels for these columns and set to `column_labels`\n", 500 | "- Create a dataframe, `time_df,` containing the time data for this file by combining `column_labels` and `time_data` into a dictionary and converting this into a dataframe" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 14, 506 | "metadata": { 507 | "editable": true 508 | }, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/html": [ 513 | "
\n", 514 | "\n", 527 | "\n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | "
artistauthfirstNamegenderitemInSessionlastNamelengthlevellocationmethodpageregistrationsessionIdsongstatustsuserAgentuserId
0Sydney YoungbloodLogged InJacobM53Klein238.07955paidTampa-St. Petersburg-Clearwater, FLPUTNextSong1.540558e+12954Ain't No Sunshine2002018-11-29 00:00:57.796\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...73
1Gang StarrLogged InLaylaF88Griffin151.92771paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984My Advice 2 You (Explicit)2002018-11-29 00:01:30.796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
23OH!3Logged InLaylaF89Griffin192.52200paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984My First Kiss (Feat. Ke$ha) [Album Version]2002018-11-29 00:04:01.796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
3RöyksoppLogged InJacobM54Klein369.81506paidTampa-St. Petersburg-Clearwater, FLPUTNextSong1.540558e+12954The Girl and The Robot2002018-11-29 00:04:55.796\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...73
4KajagoogooLogged InLaylaF90Griffin223.55546paidLake Havasu City-Kingman, AZPUTNextSong1.541057e+12984Too Shy2002018-11-29 00:07:13.796\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...24
\n", 659 | "
" 660 | ], 661 | "text/plain": [ 662 | " artist auth firstName gender itemInSession lastName \\\n", 663 | "0 Sydney Youngblood Logged In Jacob M 53 Klein \n", 664 | "1 Gang Starr Logged In Layla F 88 Griffin \n", 665 | "2 3OH!3 Logged In Layla F 89 Griffin \n", 666 | "3 Röyksopp Logged In Jacob M 54 Klein \n", 667 | "4 Kajagoogoo Logged In Layla F 90 Griffin \n", 668 | "\n", 669 | " length level location method page \\\n", 670 | "0 238.07955 paid Tampa-St. Petersburg-Clearwater, FL PUT NextSong \n", 671 | "1 151.92771 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 672 | "2 192.52200 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 673 | "3 369.81506 paid Tampa-St. Petersburg-Clearwater, FL PUT NextSong \n", 674 | "4 223.55546 paid Lake Havasu City-Kingman, AZ PUT NextSong \n", 675 | "\n", 676 | " registration sessionId song \\\n", 677 | "0 1.540558e+12 954 Ain't No Sunshine \n", 678 | "1 1.541057e+12 984 My Advice 2 You (Explicit) \n", 679 | "2 1.541057e+12 984 My First Kiss (Feat. Ke$ha) [Album Version] \n", 680 | "3 1.540558e+12 954 The Girl and The Robot \n", 681 | "4 1.541057e+12 984 Too Shy \n", 682 | "\n", 683 | " status ts \\\n", 684 | "0 200 2018-11-29 00:00:57.796 \n", 685 | "1 200 2018-11-29 00:01:30.796 \n", 686 | "2 200 2018-11-29 00:04:01.796 \n", 687 | "3 200 2018-11-29 00:04:55.796 \n", 688 | "4 200 2018-11-29 00:07:13.796 \n", 689 | "\n", 690 | " userAgent userId \n", 691 | "0 \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4... 73 \n", 692 | "1 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... 24 \n", 693 | "2 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... 24 \n", 694 | "3 \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4... 73 \n", 695 | "4 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... 24 " 696 | ] 697 | }, 698 | "execution_count": 14, 699 | "metadata": {}, 700 | "output_type": "execute_result" 701 | } 702 | ], 703 | "source": [ 704 | "df[\"ts\"] = pd.to_datetime(df[\"ts\"], unit='ms')\n", 705 | "df.head()" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 15, 711 | "metadata": { 712 | "editable": true 713 | }, 714 | "outputs": [ 715 | { 716 | "data": { 717 | "text/plain": [ 718 | "0 2018-11-29 00:00:57.796\n", 719 | "1 2018-11-29 00:01:30.796\n", 720 | "2 2018-11-29 00:04:01.796\n", 721 | "3 2018-11-29 00:04:55.796\n", 722 | "4 2018-11-29 00:07:13.796\n", 723 | "Name: ts, dtype: datetime64[ns]" 724 | ] 725 | }, 726 | "execution_count": 15, 727 | "metadata": {}, 728 | "output_type": "execute_result" 729 | } 730 | ], 731 | "source": [ 732 | "t = df[\"ts\"]\n", 733 | "t.head()" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 16, 739 | "metadata": { 740 | "editable": true 741 | }, 742 | "outputs": [], 743 | "source": [ 744 | "# timestamp, hour, day, week of year, month, year, and weekday\n", 745 | "timestamps = df[\"ts\"].dt.time\n", 746 | "hours = df[\"ts\"].dt.hour\n", 747 | "days = df[\"ts\"].dt.day\n", 748 | "weeks = df[\"ts\"].dt.week\n", 749 | "months = df[\"ts\"].dt.month\n", 750 | "years = df[\"ts\"].dt.year\n", 751 | "weekdays = df[\"ts\"].dt.weekday\n", 752 | "column_labels = (\"timestamp\", \"hour\", \"day\", \"week of year\", \"month\", \"year\", \"weekday\")\n", 753 | "time_data = pd.DataFrame({\"timestamp\": timestamps, \"hour\": hours, \"day\": days, \"week\": weeks, \"month\": months, \"year\": years, \"weekday\": weekdays})" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 17, 759 | "metadata": { 760 | "editable": true 761 | }, 762 | "outputs": [ 763 | { 764 | 
"data": { 765 | "text/html": [ 766 | "
\n", 767 | "\n", 780 | "\n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | "
timestamphourdayweekmonthyearweekday
000:00:57.796000029481120183
100:01:30.796000029481120183
200:04:01.796000029481120183
300:04:55.796000029481120183
400:07:13.796000029481120183
\n", 846 | "
" 847 | ], 848 | "text/plain": [ 849 | " timestamp hour day week month year weekday\n", 850 | "0 00:00:57.796000 0 29 48 11 2018 3\n", 851 | "1 00:01:30.796000 0 29 48 11 2018 3\n", 852 | "2 00:04:01.796000 0 29 48 11 2018 3\n", 853 | "3 00:04:55.796000 0 29 48 11 2018 3\n", 854 | "4 00:07:13.796000 0 29 48 11 2018 3" 855 | ] 856 | }, 857 | "execution_count": 17, 858 | "metadata": {}, 859 | "output_type": "execute_result" 860 | } 861 | ], 862 | "source": [ 863 | "time_df = time_data\n", 864 | "time_df.head()" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": { 870 | "editable": true 871 | }, 872 | "source": [ 873 | "#### Insert Records into Time Table\n", 874 | "Implement the `time_table_insert` query in `sql_queries.py` and run the cell below to insert records for the timestamps in this log file into the `time` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `time` table in the sparkify database." 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": 18, 880 | "metadata": { 881 | "editable": true 882 | }, 883 | "outputs": [], 884 | "source": [ 885 | "for i, row in time_df.iterrows():\n", 886 | " cur.execute(time_table_insert, list(row))\n", 887 | " conn.commit()" 888 | ] 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "metadata": { 893 | "editable": true 894 | }, 895 | "source": [ 896 | "Run `test.ipynb` to see if you've successfully added records to this table." 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "metadata": { 902 | "editable": true 903 | }, 904 | "source": [ 905 | "## #4: `users` Table\n", 906 | "#### Extract Data for Users Table\n", 907 | "- Select columns for user ID, first name, last name, gender and level and set to `user_df`" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": 19, 913 | "metadata": { 914 | "editable": true 915 | }, 916 | "outputs": [ 917 | { 918 | "data": { 919 | "text/html": [ 920 | "
\n", 921 | "\n", 934 | "\n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | "
userIdfirstNamelastNamegenderlevel
073JacobKleinMpaid
124LaylaGriffinFpaid
2150AvaRobinsonFfree
3554KalebCookMfree
4032LilyBurnsFfree
424AliviaTerrellFfree
4352TheodoreSmithMfree
4614TheodoreHarrisMfree
4898JordynPowellFfree
51101JaydenFoxMfree
5378ChloeRothFfree
5610SylvieCruzFfree
5853CelesteWilliamsFfree
5961SamuelGonzalezMfree
6049ChloeCuevasFpaid
8016RylanGeorgeMpaid
8426RyanSmithMfree
9579JamesMartinMfree
15082AveryMartinezFpaid
15844AleenaKirbyFpaid
19975JosephGutierrezMfree
20439WalterFryeMfree
28680TeganLevineFpaid
31455MartinJohnsonMfree
32812AustinRosalesMfree
3359WyattScottMfree
34622SeanWilsonFfree
35974BradenParkerMfree
36589KynnediSanchezFfree
\n", 1180 | "
" 1181 | ], 1182 | "text/plain": [ 1183 | " userId firstName lastName gender level\n", 1184 | "0 73 Jacob Klein M paid\n", 1185 | "1 24 Layla Griffin F paid\n", 1186 | "21 50 Ava Robinson F free\n", 1187 | "35 54 Kaleb Cook M free\n", 1188 | "40 32 Lily Burns F free\n", 1189 | "42 4 Alivia Terrell F free\n", 1190 | "43 52 Theodore Smith M free\n", 1191 | "46 14 Theodore Harris M free\n", 1192 | "48 98 Jordyn Powell F free\n", 1193 | "51 101 Jayden Fox M free\n", 1194 | "53 78 Chloe Roth F free\n", 1195 | "56 10 Sylvie Cruz F free\n", 1196 | "58 53 Celeste Williams F free\n", 1197 | "59 61 Samuel Gonzalez M free\n", 1198 | "60 49 Chloe Cuevas F paid\n", 1199 | "80 16 Rylan George M paid\n", 1200 | "84 26 Ryan Smith M free\n", 1201 | "95 79 James Martin M free\n", 1202 | "150 82 Avery Martinez F paid\n", 1203 | "158 44 Aleena Kirby F paid\n", 1204 | "199 75 Joseph Gutierrez M free\n", 1205 | "204 39 Walter Frye M free\n", 1206 | "286 80 Tegan Levine F paid\n", 1207 | "314 55 Martin Johnson M free\n", 1208 | "328 12 Austin Rosales M free\n", 1209 | "335 9 Wyatt Scott M free\n", 1210 | "346 22 Sean Wilson F free\n", 1211 | "359 74 Braden Parker M free\n", 1212 | "365 89 Kynnedi Sanchez F free" 1213 | ] 1214 | }, 1215 | "execution_count": 19, 1216 | "metadata": {}, 1217 | "output_type": "execute_result" 1218 | } 1219 | ], 1220 | "source": [ 1221 | "user_df = df[[\"userId\", \"firstName\", \"lastName\", \"gender\", \"level\"]]\n", 1222 | "user_df = user_df.drop_duplicates()\n", 1223 | "user_df = user_df[user_df[\"userId\"]!=\"\"]\n", 1224 | "user_df" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "markdown", 1229 | "metadata": { 1230 | "editable": true 1231 | }, 1232 | "source": [ 1233 | "#### Insert Records into Users Table\n", 1234 | "Implement the `user_table_insert` query in `sql_queries.py` and run the cell below to insert records for the users in this log file into the `users` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `users` table in the sparkify database." 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": 20, 1240 | "metadata": { 1241 | "editable": true 1242 | }, 1243 | "outputs": [], 1244 | "source": [ 1245 | "for i, row in user_df.iterrows():\n", 1246 | " cur.execute(user_table_insert, row)\n", 1247 | " conn.commit()" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "markdown", 1252 | "metadata": { 1253 | "editable": true 1254 | }, 1255 | "source": [ 1256 | "Run `test.ipynb` to see if you've successfully added records to this table." 1257 | ] 1258 | }, 1259 | { 1260 | "cell_type": "markdown", 1261 | "metadata": { 1262 | "editable": true 1263 | }, 1264 | "source": [ 1265 | "## #5: `songplays` Table\n", 1266 | "#### Extract Data and Songplays Table\n", 1267 | "This one is a little more complicated since information from the songs table, artists table, and original log file are all needed for the `songplays` table. 
Since the log file does not specify an ID for either the song or the artist, you'll need to get the song ID and artist ID by querying the songs and artists tables to find matches based on song title, artist name, and song duration time.\n", 1268 | "- Implement the `song_select` query in `sql_queries.py` to find the song ID and artist ID based on the title, artist name, and duration of a song.\n", 1269 | "- Select the timestamp, user ID, level, song ID, artist ID, session ID, location, and user agent and set to `songplay_data`\n", 1270 | "\n", 1271 | "#### Insert Records into Songplays Table\n", 1272 | "- Implement the `songplay_table_insert` query and run the cell below to insert records for the songplay actions in this log file into the `songplays` table. Remember to run `create_tables.py` before running the cell below to ensure you've created/resetted the `songplays` table in the sparkify database." 1273 | ] 1274 | }, 1275 | { 1276 | "cell_type": "code", 1277 | "execution_count": 21, 1278 | "metadata": { 1279 | "editable": true 1280 | }, 1281 | "outputs": [], 1282 | "source": [ 1283 | "for index, row in df.iterrows():\n", 1284 | "\n", 1285 | " # get songid and artistid from song and artist tables\n", 1286 | " cur.execute(song_select, (row.song, row.artist, row.length))\n", 1287 | " results = cur.fetchone()\n", 1288 | " if results:\n", 1289 | " songid, artistid = results\n", 1290 | " else:\n", 1291 | " songid, artistid = None, None\n", 1292 | "\n", 1293 | " # insert songplay record\n", 1294 | " songplay_data = (row.ts, row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent)\n", 1295 | " if row.userId != \"\":\n", 1296 | " cur.execute(songplay_table_insert, songplay_data)\n", 1297 | " conn.commit()" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "markdown", 1302 | "metadata": { 1303 | "editable": true 1304 | }, 1305 | "source": [ 1306 | "Run `test.ipynb` to see if you've successfully added records to this table." 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "markdown", 1311 | "metadata": { 1312 | "editable": true 1313 | }, 1314 | "source": [ 1315 | "# Close Connection to Sparkify Database" 1316 | ] 1317 | }, 1318 | { 1319 | "cell_type": "code", 1320 | "execution_count": 22, 1321 | "metadata": { 1322 | "editable": true 1323 | }, 1324 | "outputs": [], 1325 | "source": [ 1326 | "conn.close()" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "markdown", 1331 | "metadata": { 1332 | "editable": true 1333 | }, 1334 | "source": [ 1335 | "# Implement `etl.py`\n", 1336 | "Use what you've completed in this notebook to implement `etl.py`." 
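The songplays section above depends on a `song_select` lookup implemented in `sql_queries.py`, which is only described here (match on song title, artist name, and duration, and return the song and artist IDs). A rough sketch of that kind of lookup follows, under the assumption that the `songs` table carries `song_id`, `title`, `duration`, and `artist_id`, and the `artists` table carries `artist_id` and `name`; the repository's actual query may be written differently.

```python
# Hypothetical sketch only; not taken from the repository's sql_queries.py.
# Joins songs to artists and matches on title, artist name, and duration,
# returning (song_id, artist_id) for cur.fetchone().
song_select = """
    SELECT s.song_id, a.artist_id
    FROM songs s
    JOIN artists a ON s.artist_id = a.artist_id
    WHERE s.title = %s
      AND a.name = %s
      AND s.duration = %s;
"""

# Called as in the notebook cell and in etl.py:
#     cur.execute(song_select, (row.song, row.artist, row.length))
#     results = cur.fetchone()
```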
1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": null, 1342 | "metadata": { 1343 | "editable": true 1344 | }, 1345 | "outputs": [], 1346 | "source": [] 1347 | } 1348 | ], 1349 | "metadata": { 1350 | "kernelspec": { 1351 | "display_name": "Python 3", 1352 | "language": "python", 1353 | "name": "python3" 1354 | }, 1355 | "language_info": { 1356 | "codemirror_mode": { 1357 | "name": "ipython", 1358 | "version": 3 1359 | }, 1360 | "file_extension": ".py", 1361 | "mimetype": "text/x-python", 1362 | "name": "python", 1363 | "nbconvert_exporter": "python", 1364 | "pygments_lexer": "ipython3", 1365 | "version": "3.6.3" 1366 | } 1367 | }, 1368 | "nbformat": 4, 1369 | "nbformat_minor": 4 1370 | } 1371 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/etl.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import os 5 | import glob 6 | import psycopg2 7 | import pandas as pd 8 | from sql_queries import * 9 | 10 | 11 | def process_song_file(cur: Any, filepath: str): 12 | """ 13 | Processes a single song file. 14 | @param cur: the database cursor 15 | @param filepath: the path to the song file 16 | """ 17 | # open song file 18 | df = pd.read_json(filepath, typ='series') 19 | 20 | # insert song record 21 | song_data = df[["song_id", "title", "artist_id", "year", "duration"]] 22 | song_data = list(song_data.values) 23 | cur.execute(song_table_insert, song_data) 24 | 25 | # insert artist record 26 | artist_data = df[["artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude"]] 27 | cur.execute(artist_table_insert, artist_data) 28 | 29 | 30 | def process_log_file(cur: Any, filepath: str): 31 | """ 32 | Processes a single log file. 
33 | @param cur: the database cursor 34 | @param filepath: the path to the log file 35 | """ 36 | # open log file 37 | df = pd.read_json(filepath, lines=True) 38 | 39 | # convert timestamp column to datetime 40 | df["ts"] = pd.to_datetime(df["ts"], unit='ms') 41 | 42 | # get all the wanted information from the timestamps 43 | timestamps = df["ts"].dt.time 44 | hours = df["ts"].dt.hour 45 | days = df["ts"].dt.day 46 | weeks = df["ts"].dt.week 47 | months = df["ts"].dt.month 48 | years = df["ts"].dt.year 49 | weekdays = df["ts"].dt.weekday 50 | 51 | # create a dataframe with the wanted information 52 | time_df = pd.DataFrame( 53 | {"timestamp": timestamps, "hour": hours, "day": days, "week": weeks, "month": months, "year": years, 54 | "weekday": weekdays}) 55 | 56 | for i, row in time_df.iterrows(): 57 | cur.execute(time_table_insert, list(row)) 58 | 59 | # load user table 60 | user_df = df[["userId", "firstName", "lastName", "gender", "level"]] 61 | user_df = user_df.drop_duplicates() 62 | 63 | # insert user records 64 | for i, row in user_df.iterrows(): 65 | cur.execute(user_table_insert, row) 66 | 67 | # insert songplay records 68 | for index, row in df.iterrows(): 69 | 70 | # get songid and artistid from song and artist tables 71 | cur.execute(song_select, (row.song, row.artist, row.length)) 72 | results = cur.fetchone() 73 | 74 | if results: 75 | songid, artistid = results 76 | else: 77 | songid, artistid = None, None 78 | 79 | # insert songplay record 80 | songplay_data = (row.ts, row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent) 81 | cur.execute(songplay_table_insert, songplay_data) 82 | 83 | 84 | def process_data(cur: Any, conn: Any, filepath: str, func: Any): 85 | """ 86 | Processes either logs or songs depending on the given function. 87 | @param cur: the database cursor 88 | @param conn: the database connection 89 | @param filepath: the path to the data directory 90 | @param func: the function (process songs or logs) 91 | """ 92 | # get all files matching extension from directory 93 | all_files = [] 94 | for root, dirs, files in os.walk(filepath): 95 | files = glob.glob(os.path.join(root, '*.json')) 96 | for f in files: 97 | all_files.append(os.path.abspath(f)) 98 | 99 | # get total number of files found 100 | num_files = len(all_files) 101 | print('{} files found in {}'.format(num_files, filepath)) 102 | 103 | # iterate over files and process 104 | for i, datafile in enumerate(all_files, 1): 105 | func(cur, datafile) 106 | conn.commit() 107 | print('{}/{} files processed.'.format(i, num_files)) 108 | 109 | 110 | def insert_songs_and_logs(): 111 | """ 112 | Inserts songs and logs to our custom database. 
113 | """ 114 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 115 | cur = conn.cursor() 116 | 117 | process_data(cur, conn, filepath='data/song_data', func=process_song_file) 118 | process_data(cur, conn, filepath='data/log_data', func=process_log_file) 119 | 120 | conn.close() 121 | 122 | 123 | if __name__ == "__main__": 124 | insert_songs_and_logs() 125 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | pandas==1.0.3 3 | psycopg2==2.8.5 4 | python-dateutil==2.8.1 5 | pytz==2019.3 6 | six==1.14.0 7 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/sql_queries.py: -------------------------------------------------------------------------------- 1 | # DROP TABLES 2 | 3 | songplay_table_drop = "DROP TABLE IF EXISTS songplays;" 4 | user_table_drop = "DROP TABLE IF EXISTS users;" 5 | song_table_drop = "DROP TABLE IF EXISTS songs;" 6 | artist_table_drop = "DROP TABLE IF EXISTS artists;" 7 | time_table_drop = "DROP TABLE IF EXISTS time;" 8 | 9 | # CREATE TABLES 10 | 11 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplays (songplay_id SERIAL PRIMARY KEY, \ 12 | start_time timestamp NOT NULL, user_id varchar NOT NULL, level varchar, song_id varchar, artist_id varchar, \ 13 | session_id int, location varchar, user_agent varchar);""") 14 | 15 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users (user_id varchar PRIMARY KEY, first_name varchar, \ 16 | last_name varchar, gender varchar, level varchar NOT NULL);""") 17 | 18 | song_table_create = ("""CREATE TABLE IF NOT EXISTS songs (song_id varchar PRIMARY KEY, title varchar, \ 19 | artist_id varchar, year int, duration decimal)""") 20 | 21 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artists (artist_id varchar PRIMARY KEY, name varchar, \ 22 | location varchar, latitude decimal, longitude decimal);""") 23 | 24 | time_table_create = ("""CREATE TABLE IF NOT EXISTS time (start_time time PRIMARY KEY, hour int, day int, week int, \ 25 | month int, year int, weekday int);""") 26 | 27 | # INSERT RECORDS 28 | 29 | songplay_table_insert = ("""INSERT INTO songplays (start_time, user_id, level, song_id, artist_id, session_id, \ 30 | location, user_agent) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)""") 31 | 32 | user_table_insert = ("""INSERT INTO users (user_id, first_name, last_name, gender, level) VALUES (%s, %s, %s, %s, %s) \ 33 | ON CONFLICT (user_id) DO UPDATE SET level=EXCLUDED.level""") 34 | 35 | song_table_insert = ("""INSERT INTO songs (song_id, title, artist_id, year, duration) VALUES (%s, %s, %s, %s, %s) \ 36 | ON CONFLICT (song_id) DO NOTHING""") 37 | 38 | artist_table_insert = ("""INSERT INTO artists (artist_id, name, location, latitude, longitude) \ 39 | VALUES (%s, %s, %s, %s, %s) ON CONFLICT (artist_id) DO NOTHING""") 40 | 41 | time_table_insert = ("""INSERT INTO time (start_time, hour, day, week, month, year, weekday) \ 42 | VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT (start_time) DO NOTHING""") 43 | 44 | # FIND SONGS 45 | 46 | song_select = ("""SELECT song_id, songs.artist_id FROM songs JOIN artists ON songs.artist_id = artists.artist_id \ 47 | WHERE title = %s AND artists.name = %s AND songs.duration = %s""") 48 | 49 | # QUERY LISTS 50 | 51 | create_table_queries = [songplay_table_create, user_table_create, song_table_create, 
artist_table_create, 52 | time_table_create] 53 | drop_table_queries = [songplay_table_drop, user_table_drop, song_table_drop, artist_table_drop, time_table_drop] 54 | -------------------------------------------------------------------------------- /Data Modeling with Postgres/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "editable": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%load_ext sql" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "editable": true 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "'Connected: student@sparkifydb'" 25 | ] 26 | }, 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "%sql postgresql://student:student@127.0.0.1/sparkifydb" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "editable": true 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 48 | "5 rows affected.\n" 49 | ] 50 | }, 51 | { 52 | "data": { 53 | "text/html": [ 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | "
songplay_idstart_timeuser_idlevelsong_idartist_idsession_idlocationuser_agent
12018-11-29 00:00:57.79600073paidNoneNone954Tampa-St. Petersburg-Clearwater, FL"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"
22018-11-29 00:01:30.79600024paidNoneNone984Lake Havasu City-Kingman, AZ"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"
32018-11-29 00:04:01.79600024paidNoneNone984Lake Havasu City-Kingman, AZ"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"
42018-11-29 00:04:55.79600073paidNoneNone954Tampa-St. Petersburg-Clearwater, FL"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"
52018-11-29 00:07:13.79600024paidNoneNone984Lake Havasu City-Kingman, AZ"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"
" 122 | ], 123 | "text/plain": [ 124 | "[(1, datetime.datetime(2018, 11, 29, 0, 0, 57, 796000), 73, 'paid', None, None, 954, 'Tampa-St. Petersburg-Clearwater, FL', '\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2\"'),\n", 125 | " (2, datetime.datetime(2018, 11, 29, 0, 1, 30, 796000), 24, 'paid', None, None, 984, 'Lake Havasu City-Kingman, AZ', '\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36\"'),\n", 126 | " (3, datetime.datetime(2018, 11, 29, 0, 4, 1, 796000), 24, 'paid', None, None, 984, 'Lake Havasu City-Kingman, AZ', '\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36\"'),\n", 127 | " (4, datetime.datetime(2018, 11, 29, 0, 4, 55, 796000), 73, 'paid', None, None, 954, 'Tampa-St. Petersburg-Clearwater, FL', '\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2\"'),\n", 128 | " (5, datetime.datetime(2018, 11, 29, 0, 7, 13, 796000), 24, 'paid', None, None, 984, 'Lake Havasu City-Kingman, AZ', '\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36\"')]" 129 | ] 130 | }, 131 | "execution_count": 3, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "%sql SELECT * FROM songplays LIMIT 5;" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 4, 143 | "metadata": { 144 | "editable": true 145 | }, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 152 | "5 rows affected.\n" 153 | ] 154 | }, 155 | { 156 | "data": { 157 | "text/html": [ 158 | "\n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
user_idfirst_namelast_namegenderlevel
73JacobKleinMpaid
24LaylaGriffinFpaid
50AvaRobinsonFfree
54KalebCookMfree
32LilyBurnsFfree
" 202 | ], 203 | "text/plain": [ 204 | "[(73, 'Jacob', 'Klein', 'M', 'paid'),\n", 205 | " (24, 'Layla', 'Griffin', 'F', 'paid'),\n", 206 | " (50, 'Ava', 'Robinson', 'F', 'free'),\n", 207 | " (54, 'Kaleb', 'Cook', 'M', 'free'),\n", 208 | " (32, 'Lily', 'Burns', 'F', 'free')]" 209 | ] 210 | }, 211 | "execution_count": 4, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "%sql SELECT * FROM users LIMIT 5;" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 5, 223 | "metadata": { 224 | "editable": true 225 | }, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 232 | "1 rows affected.\n" 233 | ] 234 | }, 235 | { 236 | "data": { 237 | "text/html": [ 238 | "\n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | "
song_idtitleartist_idyearduration
SOINLJW12A8C13314CCity SlickersAR8IEZO1187B99055E2008149.86404
" 254 | ], 255 | "text/plain": [ 256 | "[('SOINLJW12A8C13314C', 'City Slickers', 'AR8IEZO1187B99055E', 2008, Decimal('149.86404'))]" 257 | ] 258 | }, 259 | "execution_count": 5, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "%sql SELECT * FROM songs LIMIT 5;" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 6, 271 | "metadata": { 272 | "editable": true 273 | }, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 280 | "1 rows affected.\n" 281 | ] 282 | }, 283 | { 284 | "data": { 285 | "text/html": [ 286 | "\n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | "
artist_idnamelocationlatitudelongitude
AR8IEZO1187B99055EMarc ShaimanNoneNone
" 302 | ], 303 | "text/plain": [ 304 | "[('AR8IEZO1187B99055E', 'Marc Shaiman', '', None, None)]" 305 | ] 306 | }, 307 | "execution_count": 6, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "%sql SELECT * FROM artists LIMIT 5;" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 7, 319 | "metadata": { 320 | "editable": true 321 | }, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | " * postgresql://student:***@127.0.0.1/sparkifydb\n", 328 | "5 rows affected.\n" 329 | ] 330 | }, 331 | { 332 | "data": { 333 | "text/html": [ 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | "
start_timehourdayweekmonthyearweekday
00:00:57.796000029481120183
00:01:30.796000029481120183
00:04:01.796000029481120183
00:04:55.796000029481120183
00:07:13.796000029481120183
" 390 | ], 391 | "text/plain": [ 392 | "[(datetime.time(0, 0, 57, 796000), 0, 29, 48, 11, 2018, 3),\n", 393 | " (datetime.time(0, 1, 30, 796000), 0, 29, 48, 11, 2018, 3),\n", 394 | " (datetime.time(0, 4, 1, 796000), 0, 29, 48, 11, 2018, 3),\n", 395 | " (datetime.time(0, 4, 55, 796000), 0, 29, 48, 11, 2018, 3),\n", 396 | " (datetime.time(0, 7, 13, 796000), 0, 29, 48, 11, 2018, 3)]" 397 | ] 398 | }, 399 | "execution_count": 7, 400 | "metadata": {}, 401 | "output_type": "execute_result" 402 | } 403 | ], 404 | "source": [ 405 | "%sql SELECT * FROM time LIMIT 5;" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": { 411 | "editable": true 412 | }, 413 | "source": [ 414 | "## REMEMBER: Restart this notebook to close connection to `sparkifydb`\n", 415 | "Each time you run the cells above, remember to restart this notebook to close the connection to your database. Otherwise, you won't be able to run your code in `create_tables.py`, `etl.py`, or `etl.ipynb` files since you can't make multiple connections to the same database (in this case, sparkifydb)." 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": { 422 | "editable": true 423 | }, 424 | "outputs": [], 425 | "source": [] 426 | } 427 | ], 428 | "metadata": { 429 | "kernelspec": { 430 | "display_name": "Python 3", 431 | "language": "python", 432 | "name": "python3" 433 | }, 434 | "language_info": { 435 | "codemirror_mode": { 436 | "name": "ipython", 437 | "version": 3 438 | }, 439 | "file_extension": ".py", 440 | "mimetype": "text/x-python", 441 | "name": "python", 442 | "nbconvert_exporter": "python", 443 | "pygments_lexer": "ipython3", 444 | "version": "3.6.3" 445 | } 446 | }, 447 | "nbformat": 4, 448 | "nbformat_minor": 4 449 | } 450 | -------------------------------------------------------------------------------- /Data Pipeline/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manuel-lang/Data-Engineering-Nanodegree/330b6b3ce020fb479868c44163aeb70e473dd111/Data Pipeline/.DS_Store -------------------------------------------------------------------------------- /Data Pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Data-Pipelines-with-Airflow -------------------------------------------------------------------------------- /Data Pipeline/create_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE public.artists ( 2 | artistid varchar(256) NOT NULL, 3 | name varchar(256), 4 | location varchar(256), 5 | lattitude numeric(18,0), 6 | longitude numeric(18,0) 7 | ); 8 | 9 | CREATE TABLE public.songplays ( 10 | playid varchar(32) NOT NULL, 11 | start_time timestamp NOT NULL, 12 | userid int4 NOT NULL, 13 | "level" varchar(256), 14 | songid varchar(256), 15 | artistid varchar(256), 16 | sessionid int4, 17 | location varchar(256), 18 | user_agent varchar(256), 19 | CONSTRAINT songplays_pkey PRIMARY KEY (playid) 20 | ); 21 | 22 | CREATE TABLE public.songs ( 23 | songid varchar(256) NOT NULL, 24 | title varchar(256), 25 | artistid varchar(256), 26 | "year" int4, 27 | duration numeric(18,0), 28 | CONSTRAINT songs_pkey PRIMARY KEY (songid) 29 | ); 30 | 31 | CREATE TABLE public.staging_events ( 32 | artist varchar(256), 33 | auth varchar(256), 34 | firstname varchar(256), 35 | gender varchar(256), 36 | iteminsession int4, 37 | lastname varchar(256), 38 | length numeric(18,0), 39 | "level" 
varchar(256), 40 | location varchar(256), 41 | "method" varchar(256), 42 | page varchar(256), 43 | registration numeric(18,0), 44 | sessionid int4, 45 | song varchar(256), 46 | status int4, 47 | ts int8, 48 | useragent varchar(256), 49 | userid int4 50 | ); 51 | 52 | CREATE TABLE public.staging_songs ( 53 | num_songs int4, 54 | artist_id varchar(256), 55 | artist_name varchar(256), 56 | artist_latitude numeric(18,0), 57 | artist_longitude numeric(18,0), 58 | artist_location varchar(256), 59 | song_id varchar(256), 60 | title varchar(256), 61 | duration numeric(18,0), 62 | "year" int4 63 | ); 64 | 65 | -- time dimension table, loaded by the Load_time_dim_table task 66 | CREATE TABLE public."time" ( 67 | start_time timestamp NOT NULL, 68 | "hour" int4, 69 | "day" int4, 70 | week int4, 71 | "month" int4, 72 | "year" int4, 73 | weekday int4, 74 | CONSTRAINT time_pkey PRIMARY KEY (start_time) 75 | ); 76 | 77 | CREATE TABLE public.users ( 78 | userid int4 NOT NULL, 79 | first_name varchar(256), 80 | last_name varchar(256), 81 | gender varchar(256), 82 | "level" varchar(256), 83 | CONSTRAINT users_pkey PRIMARY KEY (userid) 84 | ); 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /Data Pipeline/dags/udac_example_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import os 3 | from airflow import DAG 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators import (StageToRedshiftOperator, LoadFactOperator, 6 | LoadDimensionOperator, DataQualityOperator) 7 | from helpers import SqlQueries 8 | 9 | # AWS_KEY = os.environ.get('AWS_KEY') 10 | # AWS_SECRET = os.environ.get('AWS_SECRET') 11 | 12 | default_args = { 13 | 'owner': 'udacity', 14 | 'start_date': datetime(2019, 1, 12), 15 | 'depends_on_past': False, 16 | 'retries': 3, 17 | 'retry_delay': timedelta(minutes=5) 18 | } 19 | 20 | dag = DAG('udac_example_dag', 21 | default_args=default_args, 22 | description='Load and transform data in Redshift with Airflow', 23 | schedule_interval='0 * * * *', 24 | catchup=False 25 | ) 26 | 27 | start_operator = DummyOperator(task_id='Begin_execution', dag=dag) 28 | 29 | stage_events_to_redshift = StageToRedshiftOperator( 30 | task_id='Stage_events', 31 | dag=dag, 32 | table="staging_events", 33 | conn_id="redshift", 34 | aws_credentials_id="aws_credentials", 35 | s3_bucket="udacity-dend", 36 | s3_key="log_data", 37 | json_path="s3://udacity-dend/log_json_path.json" 38 | ) 39 | 40 | stage_songs_to_redshift = StageToRedshiftOperator( 41 | task_id='Stage_songs', 42 | dag=dag, 43 | table="staging_songs", 44 | conn_id="redshift", 45 | aws_credentials_id="aws_credentials", 46 | s3_bucket="udacity-dend", 47 | s3_key="song_data" 48 | ) 49 | 50 | load_songplays_table = LoadFactOperator( 51 | task_id='Load_songplays_fact_table', 52 | dag=dag, 53 | conn_id="redshift", 54 | table="songplays", 55 | query=SqlQueries.songplay_table_insert 56 | ) 57 | 58 | load_user_dimension_table = LoadDimensionOperator( 59 | task_id='Load_user_dim_table', 60 | dag=dag, 61 | conn_id="redshift", 62 | table="users", 63 | query=SqlQueries.user_table_insert, 64 | truncate=True 65 | ) 66 | 67 | load_song_dimension_table = LoadDimensionOperator( 68 | task_id='Load_song_dim_table', 69 | dag=dag, 70 | conn_id="redshift", 71 | table="songs", 72 | query=SqlQueries.song_table_insert, 73 | truncate=True 74 | ) 75 | 76 | load_artist_dimension_table = LoadDimensionOperator( 77 | task_id='Load_artist_dim_table', 78 | dag=dag, 79 | conn_id="redshift", 80 | table="artists", 81 | query=SqlQueries.artist_table_insert, 82 | truncate=True 83 | ) 84 | 85 | load_time_dimension_table = LoadDimensionOperator( 86 | task_id='Load_time_dim_table', 87 | dag=dag, 88 | conn_id="redshift", 89 | table="time", 90 | query=SqlQueries.time_table_insert, 91 | truncate=True 92 | ) 93 | 94 | run_quality_checks = DataQualityOperator( 95 | task_id='Run_data_quality_checks', 96 | dag=dag, 97 | conn_id="redshift", 98 | tables=["songplays", "users", "songs", "artists", "time"] 99 | ) 100 | 101 | end_operator = DummyOperator(task_id='Stop_execution', dag=dag) 102 | 103 | start_operator >> stage_events_to_redshift 104 | start_operator >> stage_songs_to_redshift 105 | 106 | stage_events_to_redshift >> load_songplays_table 107 | stage_songs_to_redshift >> load_songplays_table 108 | 109 | load_songplays_table >> load_song_dimension_table 110 | load_songplays_table >> load_user_dimension_table 111 | load_songplays_table >> load_artist_dimension_table 112 | load_songplays_table >> load_time_dimension_table 113 | 114 | load_song_dimension_table >> run_quality_checks 115 | load_user_dimension_table >> run_quality_checks 116 | load_artist_dimension_table >> run_quality_checks 117 | load_time_dimension_table >> run_quality_checks 118 | 119 | run_quality_checks >> end_operator 120 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from airflow.plugins_manager import AirflowPlugin 4 | 5 | import operators 6 | import helpers 7 | 8 | # Defining the plugin class 9 | class UdacityPlugin(AirflowPlugin): 10 | name = "udacity_plugin" 11 | operators = [ 12 | operators.StageToRedshiftOperator, 13 | operators.LoadFactOperator, 14 | operators.LoadDimensionOperator, 15 | operators.DataQualityOperator 16 | ] 17 | helpers = [ 18 | helpers.SqlQueries 19 | ] 20 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.sql_queries import SqlQueries 2 | 3 | __all__ = [ 4 | 'SqlQueries', 5 | ] -------------------------------------------------------------------------------- /Data Pipeline/plugins/helpers/sql_queries.py: -------------------------------------------------------------------------------- 1 | class SqlQueries: 2 | songplay_table_insert = (""" 3 | SELECT 4 | md5(events.sessionid || events.start_time) songplay_id, 5 | events.start_time, 6 | events.userid, 7 | events.level, 8 | songs.song_id, 9 | songs.artist_id, 10 | events.sessionid, 11 | events.location, 12 | events.useragent 13 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, * 14 | FROM staging_events 15 | WHERE page='NextSong') events 16 | LEFT JOIN staging_songs songs 17 | ON events.song = songs.title 18 | AND events.artist = songs.artist_name 19 | AND events.length = songs.duration 20 | """) 21 | 22 | user_table_insert = (""" 23 | SELECT distinct userid, firstname, lastname, gender, level 24 | FROM staging_events 25 | WHERE page='NextSong' 26 | """) 27 | 28 | song_table_insert = (""" 29 | SELECT distinct song_id, title, artist_id, year, duration 30 | FROM staging_songs 31 | """) 32 | 33 | artist_table_insert = (""" 34 | SELECT distinct artist_id, artist_name, artist_location,
artist_latitude, artist_longitude 35 | FROM staging_songs 36 | """) 37 | 38 | time_table_insert = (""" 39 | SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time), 40 | extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time) 41 | FROM songplays 42 | """) -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from operators.stage_redshift import StageToRedshiftOperator 2 | from operators.load_fact import LoadFactOperator 3 | from operators.load_dimension import LoadDimensionOperator 4 | from operators.data_quality import DataQualityOperator 5 | 6 | __all__ = [ 7 | 'StageToRedshiftOperator', 8 | 'LoadFactOperator', 9 | 'LoadDimensionOperator', 10 | 'DataQualityOperator' 11 | ] 12 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/data_quality.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class DataQualityOperator(BaseOperator): 6 | 7 | ui_color = '#89DA59' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | conn_id, 12 | tables, 13 | *args, **kwargs): 14 | 15 | super(DataQualityOperator, self).__init__(*args, **kwargs) 16 | self.conn_id = conn_id 17 | self.tables = tables 18 | 19 | def execute(self, context): 20 | redshift_hook = PostgresHook(self.conn_id) 21 | for table in self.tables: 22 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table}") 23 | if len(records) < 1 or len(records[0]) < 1: 24 | raise ValueError(f"Data quality check failed. {table} returned no results") 25 | if records[0][0] < 1: 26 | raise ValueError(f"Data quality check failed. {table} contained 0 rows") 27 | self.log.info(f"Data quality on table {table} check passed with {records[0][0]} records") 28 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/load_dimension.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadDimensionOperator(BaseOperator): 6 | 7 | ui_color = '#80BD9E' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | conn_id, 12 | table, 13 | query, 14 | truncate = False, 15 | *args, **kwargs): 16 | 17 | super(LoadDimensionOperator, self).__init__(*args, **kwargs) 18 | self.conn_id = conn_id 19 | self.table = table 20 | self.query = query 21 | self.truncate = truncate 22 | 23 | def execute(self, context): 24 | redshift = PostgresHook(postgres_conn_id=self.conn_id) 25 | if self.truncate: 26 | redshift.run(f"TRUNCATE TABLE {self.table}") 27 | redshift.run(f"INSERT INTO {self.table} {self.query}") 28 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/load_fact.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | class LoadFactOperator(BaseOperator): 6 | 7 | ui_color = '#F98866' 8 | 9 | @apply_defaults 10 | def __init__(self, 11 | conn_id, 12 | table, 13 | query, 14 | *args, **kwargs): 15 | 16 | super(LoadFactOperator, self).__init__(*args, **kwargs) 17 | self.conn_id = conn_id 18 | self.table = table 19 | self.query = query 20 | 21 | def execute(self, context): 22 | redshift = PostgresHook(postgres_conn_id=self.conn_id) 23 | redshift.run(f"INSERT INTO {self.table} {self.query}") 24 | -------------------------------------------------------------------------------- /Data Pipeline/plugins/operators/stage_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | from airflow.contrib.hooks.aws_hook import AwsHook 5 | class StageToRedshiftOperator(BaseOperator): 6 | ui_color = '#358140' 7 | 8 | @apply_defaults 9 | def __init__(self, 10 | conn_id, 11 | aws_credentials_id, 12 | table, 13 | s3_bucket, 14 | s3_key, 15 | json_path="auto", 16 | *args, **kwargs): 17 | 18 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs) 19 | self.conn_id = conn_id 20 | self.table = table 21 | self.s3_bucket = s3_bucket 22 | self.s3_key = s3_key 23 | self.aws_credentials_id = aws_credentials_id 24 | self.json_path = json_path 25 | 26 | def execute(self, context): 27 | aws_hook = AwsHook(self.aws_credentials_id) 28 | credentials = aws_hook.get_credentials() 29 | redshift = PostgresHook(postgres_conn_id=self.conn_id) 30 | redshift.run(f"DELETE FROM {self.table}") 31 | self.s3_key = self.s3_key.format(**context) 32 | s3_path = f"s3://{self.s3_bucket}/{self.s3_key}" 33 | redshift.run(f"COPY {self.table} FROM '{s3_path}' ACCESS_KEY_ID '{credentials.access_key}' \ 34 | SECRET_ACCESS_KEY '{credentials.secret_key}' FORMAT AS JSON '{self.json_path}'") 35 | -------------------------------------------------------------------------------- /Data Warehouse/README.md: 
-------------------------------------------------------------------------------- 1 | # Data Warehouse 2 | 3 | Project Data Warehouse as part of the Udacity Data Engineer Nanodegree. 4 | 5 | ## Project Summary 6 | An implementation of a Data Warehouse leveraging AWS Redshift. This project contains the ETL pipeline that extracts data from S3, stages it in Redshift, and transforms it into a set of dimensional tables for the analytics team. 7 | 8 | The data on S3 contains song and log information from a music streaming app. This solution enables the app's team to process large amounts of information efficiently. 9 | 10 | ## Purpose of this project 11 | This project processes data from different sources (in this case multiple S3 buckets) into a form that can be analyzed easily and efficiently. The startup Sparkify can thus analyze the run-time data of their application with little effort. 12 | 13 | ## Project instructions 14 | 1. Set up a Redshift cluster on AWS and insert the connection details in `dwh.cfg`. 15 | 2. Create the needed database structure by executing `create_tables.py`. 16 | 3. Process the data from the configured S3 data sources by executing `etl.py`. 17 | 18 | ## Database schema 19 | | Table | Description | 20 | | ---- | ---- | 21 | | staging_events | staging table for event data | 22 | | staging_songs | staging table for song data | 23 | | songplays | information on how songs were played, e.g. when, by which user and in which session | 24 | | users | user-related information such as name, gender and level | 25 | | songs | song-related information containing name, artist, year and duration | 26 | | artists | artist name and location (geo-coordinates and textual location) | 27 | | time | time-related info for timestamps | 28 | 29 | ## ETL pipeline 30 | 1. Load both song and log data from S3 buckets. 31 | 2. Stage the loaded data. 32 | 3. Transform the data into the schema described above. 33 | 34 | ## Example queries 35 | 36 | * Find all users at a certain location: ```SELECT DISTINCT users.user_id FROM users JOIN songplays ON songplays.user_id = users.user_id WHERE songplays.location = ``` 37 | * Find all songs by a given artist: ```SELECT songs.song_id FROM songs JOIN artists ON songs.artist_id = artists.artist_id WHERE artists.name = ``` 38 | -------------------------------------------------------------------------------- /Data Warehouse/create_tables.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import create_table_queries, drop_table_queries 4 | 5 | 6 | def drop_tables(cur, conn): 7 | """ 8 | Executes all drop table queries. 9 | :param cur: database cursor 10 | :param conn: database connector 11 | """ 12 | for query in drop_table_queries: 13 | cur.execute(query) 14 | conn.commit() 15 | 16 | 17 | def create_tables(cur, conn): 18 | """ 19 | Executes all create table queries. 
20 | :param cur: database cursor 21 | :param conn: database connector 22 | :return: 23 | """ 24 | for query in create_table_queries: 25 | cur.execute(query) 26 | conn.commit() 27 | 28 | 29 | def main(): 30 | config = configparser.ConfigParser() 31 | config.read('dwh.cfg') 32 | 33 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 34 | cur = conn.cursor() 35 | 36 | drop_tables(cur, conn) 37 | create_tables(cur, conn) 38 | 39 | conn.close() 40 | 41 | 42 | if __name__ == "__main__": 43 | main() -------------------------------------------------------------------------------- /Data Warehouse/dwh.cfg: -------------------------------------------------------------------------------- 1 | [CLUSTER] 2 | HOST= 3 | DB_NAME= 4 | DB_USER= 5 | DB_PASSWORD= 6 | DB_PORT= 7 | 8 | [IAM_ROLE] 9 | ARN='' 10 | 11 | [S3] 12 | LOG_DATA='s3://udacity-dend/log_data' 13 | LOG_JSONPATH='s3://udacity-dend/log_json_path.json' 14 | SONG_DATA='s3://udacity-dend/song_data' -------------------------------------------------------------------------------- /Data Warehouse/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import copy_table_queries, insert_table_queries 4 | 5 | 6 | def load_staging_tables(cur, conn): 7 | """ 8 | Executes all copy table queries. 9 | :param cur: database cursor 10 | :param conn: database connector 11 | """ 12 | for query in copy_table_queries: 13 | cur.execute(query) 14 | conn.commit() 15 | 16 | 17 | def insert_tables(cur, conn): 18 | """ 19 | Executes all insert table queries. 20 | :param cur: database cursor 21 | :param conn: database connector 22 | """ 23 | for query in insert_table_queries: 24 | cur.execute(query) 25 | conn.commit() 26 | 27 | 28 | def main(): 29 | config = configparser.ConfigParser() 30 | config.read('dwh.cfg') 31 | 32 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 33 | cur = conn.cursor() 34 | 35 | load_staging_tables(cur, conn) 36 | insert_tables(cur, conn) 37 | 38 | conn.close() 39 | 40 | 41 | if __name__ == "__main__": 42 | main() -------------------------------------------------------------------------------- /Data Warehouse/sql_queries.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | 3 | # CONFIG 4 | config = configparser.ConfigParser() 5 | config.read('dwh.cfg') 6 | 7 | # DROP TABLES 8 | 9 | staging_events_table_drop = "DROP TABLE IF EXISTS staging_events" 10 | staging_songs_table_drop = "DROP TABLE IF EXISTS staging_songs" 11 | songplay_table_drop = "DROP TABLE IF EXISTS songplays" 12 | user_table_drop = "DROP TABLE IF EXISTS users" 13 | song_table_drop = "DROP TABLE IF EXISTS songs" 14 | artist_table_drop = "DROP TABLE IF EXISTS artists" 15 | time_table_drop = "DROP TABLE IF EXISTS time" 16 | 17 | # CREATE TABLES 18 | 19 | staging_events_table_create = (""" 20 | CREATE TABLE IF NOT EXISTS staging_events ( 21 | event_id BIGINT IDENTITY(0,1), 22 | artist VARCHAR, 23 | auth VARCHAR, 24 | firstName VARCHAR, 25 | gender VARCHAR, 26 | itemInSession VARCHAR, 27 | lastName VARCHAR, 28 | length VARCHAR, 29 | level VARCHAR, 30 | location VARCHAR, 31 | method VARCHAR, 32 | page VARCHAR, 33 | registration VARCHAR, 34 | sessionId INTEGER SORTKEY DISTKEY, 35 | song VARCHAR, 36 | status INTEGER, 37 | ts BIGINT, 38 | userAgent VARCHAR, 39 | userId INTEGER 40 | ); 41 | """) 42 | 43 | 
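# Redshift table design note: staging_events above and staging_songs below mirror the raw S3 JSON and declare SORTKEY/DISTKEY columns, while the small dimension tables further down (users, artists, time) are created with "diststyle all" so every node keeps a full copy and joins avoid redistribution.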
staging_songs_table_create = (""" 44 | CREATE TABLE IF NOT EXISTS staging_songs ( 45 | num_songs INTEGER, 46 | artist_id VARCHAR SORTKEY DISTKEY, 47 | artist_latitude VARCHAR, 48 | artist_longitude VARCHAR, 49 | artist_location VARCHAR(500), 50 | artist_name VARCHAR(500), 51 | song_id VARCHAR, 52 | title VARCHAR(500), 53 | duration DECIMAL(9), 54 | year INTEGER 55 | ); 56 | """) 57 | 58 | songplay_table_create = (""" 59 | CREATE TABLE IF NOT EXISTS songplays ( 60 | songplay_id INTEGER IDENTITY(0,1) NOT NULL SORTKEY, 61 | start_time TIMESTAMP NOT NULL, 62 | user_id VARCHAR(50) NOT NULL DISTKEY, 63 | level VARCHAR(10) NOT NULL, 64 | song_id VARCHAR(40) NOT NULL, 65 | artist_id VARCHAR(50) NOT NULL, 66 | session_id VARCHAR(50) NOT NULL, 67 | location VARCHAR(100) NULL, 68 | user_agent VARCHAR(255) NULL 69 | ); 70 | """) 71 | 72 | user_table_create = (""" 73 | CREATE TABLE IF NOT EXISTS users ( 74 | user_id INTEGER NOT NULL SORTKEY, 75 | first_name VARCHAR(50) NULL, 76 | last_name VARCHAR(80) NULL, 77 | gender VARCHAR(10) NULL, 78 | level VARCHAR(10) NULL 79 | ) diststyle all; 80 | """) 81 | 82 | song_table_create = (""" 83 | CREATE TABLE IF NOT EXISTS songs ( 84 | song_id VARCHAR(50) NOT NULL SORTKEY, 85 | title VARCHAR(500) NOT NULL, 86 | artist_id VARCHAR(50) NOT NULL, 87 | year INTEGER NOT NULL, 88 | duration DECIMAL(9) NOT NULL 89 | ); 90 | """) 91 | 92 | artist_table_create = (""" 93 | CREATE TABLE IF NOT EXISTS artists ( 94 | artist_id VARCHAR(50) NOT NULL SORTKEY, 95 | name VARCHAR(500) NULL, 96 | location VARCHAR(500) NULL, 97 | latitude DECIMAL(9) NULL, 98 | longitude DECIMAL(9) NULL 99 | ) diststyle all; 100 | """) 101 | 102 | time_table_create = (""" 103 | CREATE TABLE IF NOT EXISTS time ( 104 | start_time TIMESTAMP NOT NULL SORTKEY, 105 | hour SMALLINT NULL, 106 | day SMALLINT NULL, 107 | week SMALLINT NULL, 108 | month SMALLINT NULL, 109 | year SMALLINT NULL, 110 | weekday SMALLINT NULL 111 | ) diststyle all; 112 | """) 113 | 114 | # STAGING TABLES 115 | 116 | staging_events_copy = (""" 117 | COPY staging_events FROM {} 118 | credentials 'aws_iam_role={}' 119 | format as json {} 120 | STATUPDATE ON 121 | region 'us-west-2'; 122 | """).format(config.get('S3', 'LOG_DATA'), config.get('IAM_ROLE', 'ARN'), config.get('S3', 'LOG_JSONPATH')) 123 | 124 | staging_songs_copy = (""" 125 | COPY staging_songs FROM {} 126 | credentials 'aws_iam_role={}' 127 | format as json 'auto' 128 | ACCEPTINVCHARS AS '^' 129 | STATUPDATE ON 130 | region 'us-west-2'; 131 | """).format(config.get('S3', 'SONG_DATA'), config.get('IAM_ROLE', 'ARN')) 132 | 133 | # FINAL TABLES 134 | 135 | songplay_table_insert = (""" 136 | INSERT INTO songplays ( 137 | start_time, 138 | user_id, 139 | level, 140 | song_id, 141 | artist_id, 142 | session_id, 143 | location, 144 | user_agent) 145 | SELECT DISTINCT TIMESTAMP 'epoch' + se.ts/1000 \ 146 | * INTERVAL '1 second' AS start_time, 147 | se.userId AS user_id, 148 | se.level AS level, 149 | ss.song_id AS song_id, 150 | ss.artist_id AS artist_id, 151 | se.sessionId AS session_id, 152 | se.location AS location, 153 | se.userAgent AS user_agent 154 | FROM staging_events AS se 155 | JOIN staging_songs AS ss ON (se.song = ss.title AND se.artist = ss.artist_name) 156 | WHERE se.page = 'NextSong'; 157 | """) 158 | 159 | user_table_insert = (""" 160 | INSERT INTO users ( 161 | user_id, 162 | first_name, 163 | last_name, 164 | gender, 165 | level) 166 | SELECT DISTINCT se.userId AS user_id, 167 | se.firstName AS first_name, 168 | se.lastName AS last_name, 169 | se.gender AS gender, 170 | se.level AS level 171 | FROM staging_events AS se 172 | WHERE se.page = 'NextSong'; 173 | """) 174 | 175 | song_table_insert = (""" 176 | INSERT INTO songs ( 177 | song_id, 178 | title, 179 | artist_id, 180 | year, 181 | duration) 182 | SELECT DISTINCT ss.song_id AS song_id, 183 | ss.title AS title, 184 | ss.artist_id AS artist_id, 185 | ss.year AS year, 186 | ss.duration AS duration 187 | FROM staging_songs AS ss; 188 | """) 189 | 190 | artist_table_insert = (""" 191 | INSERT INTO artists ( 192 | artist_id, 193 | name, 194 | location, 195 | latitude, 196 | longitude) 197 | SELECT DISTINCT ss.artist_id AS artist_id, 198 | ss.artist_name AS name, 199 | ss.artist_location AS location, 200 | ss.artist_latitude AS latitude, 201 | ss.artist_longitude AS longitude 202 | FROM staging_songs AS ss; 203 | """) 204 | 205 | time_table_insert = (""" 206 | INSERT INTO time ( 207 | start_time, 208 | hour, 209 | day, 210 | week, 211 | month, 212 | year, 213 | weekday) 214 | SELECT DISTINCT TIMESTAMP 'epoch' + se.ts/1000 \ 215 | * INTERVAL '1 second' AS start_time, 216 | EXTRACT(hour FROM start_time) AS hour, 217 | EXTRACT(day FROM start_time) AS day, 218 | EXTRACT(week FROM start_time) AS week, 219 | EXTRACT(month FROM start_time) AS month, 220 | EXTRACT(year FROM start_time) AS year, 221 | EXTRACT(dayofweek FROM start_time) AS weekday 222 | FROM staging_events AS se 223 | WHERE se.page = 'NextSong'; 224 | """) 225 | 226 | # QUERY LISTS 227 | 228 | create_table_queries = [staging_events_table_create, staging_songs_table_create, songplay_table_create, 229 | user_table_create, song_table_create, artist_table_create, time_table_create] 230 | drop_table_queries = [staging_events_table_drop, staging_songs_table_drop, songplay_table_drop, user_table_drop, 231 | song_table_drop, artist_table_drop, time_table_drop] 232 | copy_table_queries = [staging_events_copy, staging_songs_copy] 233 | insert_table_queries = [songplay_table_insert, user_table_insert, song_table_insert, artist_table_insert, 234 | time_table_insert] 235 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Nanodegree 2 | All projects of Udacity's [Data Engineering Nanodegree](https://www.udacity.com/course/data-engineer-nanodegree--nd027). 3 | 4 | ## Projects 5 | 6 | ### Data Modeling 7 | Learn to create relational and NoSQL data models to fit the diverse needs of data consumers. Use ETL to build databases in PostgreSQL and Apache Cassandra. 8 | 9 | [DATA MODELING WITH POSTGRES](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Modeling%20with%20Postgres) 10 | 11 | [DATA MODELING WITH APACHE CASSANDRA](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Modeling%20with%20Cassandra) 12 | 13 | ### Cloud Data Warehouses 14 | Sharpen your data warehousing skills and deepen your understanding of data infrastructure. Create cloud-based data warehouses on Amazon Web Services (AWS). 15 | 16 | [BUILD A CLOUD DATA WAREHOUSE](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Warehouse) 17 | 18 | ### Spark and Data Lakes 19 | Understand the big data ecosystem and how to use Spark to work with massive datasets. Store big data in a data lake and query it with Spark. 
20 | 21 | [BUILD A DATA LAKE](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Lake) 22 | 23 | ### Data Pipelines with Airflow 24 | Schedule, automate, and monitor data pipelines using Apache Airflow. Run data quality checks, track data lineage, and work with data pipelines in production. 25 | 26 | [DATA PIPELINES WITH AIRFLOW](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Data%20Pipeline) 27 | 28 | ### Capstone Project 29 | Combine what you've learned throughout the program to build your own data engineering portfolio project. 30 | 31 | [DATA ENGINEERING CAPSTONE](https://github.com/manuel-lang/Udacity-Data-Engineering-Nanodegree/tree/master/Capstone%20Project) 32 | --------------------------------------------------------------------------------