├── data_lake_spark ├── data │ ├── Icon │ ├── log-data.zip │ └── song-data.zip ├── images │ ├── logo.png │ └── star_schema.png ├── README.md ├── etl.py └── test_data.ipynb ├── _config.yml ├── .gitattributes ├── data_pipelines_airflow ├── images │ ├── logo.png │ ├── airflow.png │ └── star_schema.png ├── plugins │ ├── helpers │ │ ├── __init__.py │ │ └── sql_queries.py │ ├── operators │ │ ├── __init__.py │ │ ├── load_fact.py │ │ ├── load_dimension.py │ │ ├── data_quality.py │ │ └── stage_redshift.py │ └── __init__.py ├── create_tables.sql ├── README.md └── dags │ └── etl.py ├── data_warehouse_redshift ├── images │ ├── logo.png │ └── star_schema.png ├── create_tables.py ├── etl.py ├── README.md └── sql_queries.py ├── nosql_db_modeling_apache_cassandra ├── images │ ├── logo.png │ └── image_event_datafile_new.jpg ├── README.md ├── event_data │ ├── 2018-11-01-events.csv │ ├── 2018-11-25-events.csv │ ├── 2018-11-11-events.csv │ ├── 2018-11-22-events.csv │ ├── 2018-11-10-events.csv │ └── 2018-11-03-events.csv └── nosql_db_modeling.ipynb ├── relational_db_modeling_postgresql ├── images │ ├── logo.png │ ├── star_schema.png │ ├── database_schema_diagram.png │ └── generate_graph.ipynb ├── data │ ├── song_data │ │ └── A │ │ │ ├── B │ │ │ ├── C │ │ │ │ ├── TRABCTK128F934B224.json │ │ │ │ ├── TRABCPZ128F4275C32.json │ │ │ │ ├── TRABCAJ12903CDFCC2.json │ │ │ │ ├── TRABCEI128F424C983.json │ │ │ │ ├── TRABCRU128F423F449.json │ │ │ │ ├── TRABCUQ128E0783E2B.json │ │ │ │ ├── TRABCEC128F426456E.json │ │ │ │ ├── TRABCXB128F4286BD3.json │ │ │ │ ├── TRABCYE128F934CE1D.json │ │ │ │ ├── TRABCKL128F423A778.json │ │ │ │ ├── TRABCFL128F149BB0D.json │ │ │ │ └── TRABCIX128F4265903.json │ │ │ ├── A │ │ │ │ ├── TRABATO128F42627E9.json │ │ │ │ ├── TRABAZH128F930419A.json │ │ │ │ ├── TRABACN128F425B784.json │ │ │ │ ├── TRABAXL128F424FC50.json │ │ │ │ ├── TRABAXV128F92F6AE3.json │ │ │ │ ├── TRABAFJ128F42AF24E.json │ │ │ │ ├── TRABAWW128F4250A31.json │ │ │ │ ├── TRABAVQ12903CBF7E0.json │ │ │ │ ├── 
TRABAIO128F42938F9.json │ │ │ │ ├── TRABAXR128F426515F.json │ │ │ │ └── TRABAFP128F931E9A1.json │ │ │ └── B │ │ │ │ ├── TRABBNP128F932546F.json │ │ │ │ ├── TRABBBV128F42967D7.json │ │ │ │ ├── TRABBZN12903CD9297.json │ │ │ │ ├── TRABBVJ128F92F7EAA.json │ │ │ │ ├── TRABBXU128F92FEF48.json │ │ │ │ ├── TRABBTA128F933D304.json │ │ │ │ ├── TRABBAM128F429D223.json │ │ │ │ ├── TRABBJE12903CDB442.json │ │ │ │ ├── TRABBLU128F93349CF.json │ │ │ │ ├── TRABBKX128F4285205.json │ │ │ │ ├── TRABBOP128F931B50D.json │ │ │ │ └── TRABBOR128F4286200.json │ │ │ └── A │ │ │ ├── A │ │ │ ├── TRAAAFD128F92F423A.json │ │ │ ├── TRAAADZ128F9348C2E.json │ │ │ ├── TRAAARJ128F9320760.json │ │ │ ├── TRAAAMQ128F1460CD3.json │ │ │ ├── TRAAAVG12903CFA543.json │ │ │ ├── TRAAAAW128F429D538.json │ │ │ ├── TRAAAEF128F4273421.json │ │ │ ├── TRAAABD128F429CF47.json │ │ │ ├── TRAAAVO128F93133D4.json │ │ │ ├── TRAAAPK128E0786D96.json │ │ │ └── TRAAAMO128F1481E7F.json │ │ │ ├── B │ │ │ ├── TRAABDL12903CAABBA.json │ │ │ ├── TRAABJL12903CDCF1A.json │ │ │ ├── TRAABVM128F92CA9DC.json │ │ │ ├── TRAABXG128F9318EBD.json │ │ │ ├── TRAABRB128F9306DD5.json │ │ │ ├── TRAABYW128F4244559.json │ │ │ ├── TRAABCL128F4286650.json │ │ │ ├── TRAABNV128F425CEE1.json │ │ │ ├── TRAABJV128F1460C49.json │ │ │ ├── TRAABLR128F423B7E3.json │ │ │ └── TRAABYN12903CFD305.json │ │ │ └── C │ │ │ ├── TRAACIW12903CC0F6D.json │ │ │ ├── TRAACCG128F92E8A55.json │ │ │ ├── TRAACHN128F1489601.json │ │ │ ├── TRAACQT128F9331780.json │ │ │ ├── TRAACZK128F4243829.json │ │ │ ├── TRAACSL128F93462F4.json │ │ │ ├── TRAACOW128F933E35F.json │ │ │ ├── TRAACPE128F421C1B9.json │ │ │ ├── TRAACFV128F935E50B.json │ │ │ ├── TRAACNS128F14A2DF5.json │ │ │ ├── TRAACTB12903CAAF15.json │ │ │ ├── TRAACLV128F427E123.json │ │ │ ├── TRAACER128F4290F96.json │ │ │ └── TRAACVS128E078BE39.json │ └── log_data │ │ └── 2018 │ │ └── 11 │ │ └── 2018-11-01-events.json ├── create_tables.py ├── README.md ├── etl.py └── sql_queries.py ├── .gitignore └── README.md 
/data_lake_spark/data/Icon : -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * linguist-vendored 2 | *.py linguist-vendored=false 3 | 4 | -------------------------------------------------------------------------------- /data_lake_spark/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/data_lake_spark/images/logo.png -------------------------------------------------------------------------------- /data_lake_spark/data/log-data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/data_lake_spark/data/log-data.zip -------------------------------------------------------------------------------- /data_lake_spark/data/song-data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/data_lake_spark/data/song-data.zip -------------------------------------------------------------------------------- /data_lake_spark/images/star_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/data_lake_spark/images/star_schema.png -------------------------------------------------------------------------------- /data_pipelines_airflow/images/logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/data_pipelines_airflow/images/logo.png -------------------------------------------------------------------------------- /data_pipelines_airflow/plugins/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.sql_queries import SqlQueries 2 | 3 | __all__ = [ 4 | 'SqlQueries', 5 | ] -------------------------------------------------------------------------------- /data_pipelines_airflow/images/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/data_pipelines_airflow/images/airflow.png -------------------------------------------------------------------------------- /data_warehouse_redshift/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/data_warehouse_redshift/images/logo.png -------------------------------------------------------------------------------- /data_pipelines_airflow/images/star_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/data_pipelines_airflow/images/star_schema.png -------------------------------------------------------------------------------- /data_warehouse_redshift/images/star_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/data_warehouse_redshift/images/star_schema.png -------------------------------------------------------------------------------- /nosql_db_modeling_apache_cassandra/images/logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/nosql_db_modeling_apache_cassandra/images/logo.png -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/relational_db_modeling_postgresql/images/logo.png -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/images/star_schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/relational_db_modeling_postgresql/images/star_schema.png -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/images/database_schema_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/relational_db_modeling_postgresql/images/database_schema_diagram.png -------------------------------------------------------------------------------- /nosql_db_modeling_apache_cassandra/images/image_event_datafile_new.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chandu-muthyala/udacity-dend/master/nosql_db_modeling_apache_cassandra/images/image_event_datafile_new.jpg -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCTK128F934B224.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR558FS1187FB45658", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "40 Grit", "song_id": "SOGDBUF12A8C140FAA", "title": 
"Intro", "duration": 75.67628, "year": 2003} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAAFD128F92F423A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARXR32B1187FB57099", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gob", "song_id": "SOFSOCN12A8C143F5D", "title": "Face the Ashes", "duration": 209.60608, "year": 2007} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABDL12903CAABBA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARL7K851187B99ACD2", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Andy Andy", "song_id": "SOMUYGI12AB0188633", "title": "La Culpa", "duration": 226.35057, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACIW12903CC0F6D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOZQDIU12A58A7BCF6", "title": "Superconfidential", "duration": 338.31138, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABJL12903CDCF1A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARHHO3O1187B989413", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Bob Azzam", "song_id": "SORAMLE12AB017C8B0", "title": "Auguri Cha Cha", "duration": 
191.84281, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABVM128F92CA9DC.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARYKCQI1187FB3B18F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Tesla", "song_id": "SOXLBJT12A8C140925", "title": "Caught In A Dream", "duration": 290.29832, "year": 2004} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABXG128F9318EBD.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNPAGP1241B9C7FD4", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "lextrical", "song_id": "SOZVMJI12AB01808AF", "title": "Synthetic Dream", "duration": 165.69424, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABATO128F42627E9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROGWRA122988FEE45", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Christos Dantis", "song_id": "SOSLAVG12A8C13397F", "title": "Den Pai Alo", "duration": 243.82649, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABAZH128F930419A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7ZKHQ1187B98DD73", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Glad", "song_id": "SOTUKVB12AB0181477", "title": "Blessed Assurance", "duration": 270.602, 
"year": 1993} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBNP128F932546F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR62SOJ1187FB47BB5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Chase & Status", "song_id": "SOGVQGJ12AB017F169", "title": "Ten Tonne", "duration": 337.68444, "year": 2005} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCPZ128F4275C32.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR051KA1187B98B2FF", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Wilks", "song_id": "SOLYIBD12A8C135045", "title": "Music is what we love", "duration": 261.51138, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAADZ128F9348C2E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKRRTF1187B9984DA", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sonora Santanera", "song_id": "SOXVLOJ12AB0189215", "title": "Amor De Cabaret", "duration": 177.47546, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAARJ128F9320760.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8ZCNI1187B9A069B", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Planet P Project", "song_id": "SOIAZJW12AB01853F1", "title": "Pink World", "duration": 269.81832, 
"year": 1984} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABACN128F425B784.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOQLGFP12A58A7800E", "title": "OAKtown", "duration": 259.44771, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABAXL128F424FC50.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKULSX1187FB45F84", "artist_latitude": 39.49974, "artist_longitude": -111.54732, "artist_location": "Utah", "artist_name": "Trafik", "song_id": "SOQVMXR12A81C21483", "title": "Salt In NYC", "duration": 424.12363, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBBV128F42967D7.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7SMBG1187B9B9066", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Los Manolos", "song_id": "SOBCOSW12A8C13D398", "title": "Rumba De Barcelona", "duration": 218.38322, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCAJ12903CDFCC2.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARULZCI1241B9C8611", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Luna Orbit Project", "song_id": "SOSWKAV12AB018FC91", "title": "Midnight Star", "duration": 
335.51628, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCEI128F424C983.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCRU128F423F449.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8IEZO1187B99055E", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Marc Shaiman", "song_id": "SOINLJW12A8C13314C", "title": "City Slickers", "duration": 149.86404, "year": 2008} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCUQ128E0783E2B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARVBRGZ1187FB4675A", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gwen Stefani", "song_id": "SORRZGD12A6310DBC3", "title": "Harajuku Girls", "duration": 290.55955, "year": 2004} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAAMQ128F1460CD3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD0S291187B9B7BF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "Ohio", "artist_name": "Rated R", "song_id": "SOMJBYD12A6D4F8557", "title": "Keepin It Real (Skit)", 
"duration": 114.78159, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAAVG12903CFA543.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOUDSGM12AC9618304", "title": "Insatiable (Instrumental Version)", "duration": 266.39628, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABRB128F9306DD5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1ZHYZ1187FB3C717", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Faiz Ali Faiz", "song_id": "SOILPQQ12AB017E82A", "title": "Sohna Nee Sohna Data", "duration": 599.24853, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACCG128F92E8A55.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR5KOSW1187FB35FF4", "artist_latitude": 49.80388, "artist_longitude": 15.47491, "artist_location": "Dubai UAE", "artist_name": "Elena", "song_id": "SOZCTXZ12AB0182364", "title": "Setanta matins", "duration": 269.58322, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACHN128F1489601.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGIWFO1187B9B55B7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Five Bolt Main", "song_id": "SOPSWQW12A6D4F8781", 
"title": "Made Like This (Live)", "duration": 225.09669, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAAAW128F429D538.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOMZWCG12A8C13C480", "title": "I Didn't Mean To", "duration": 218.93179, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAAEF128F4273421.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7G5I41187FB4CE6C", "artist_latitude": null, "artist_longitude": null, "artist_location": "London, England", "artist_name": "Adam Ant", "song_id": "SONHOTT12A8C13493C", "title": "Something Girls", "duration": 233.40363, "year": 1982} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABYW128F4244559.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARI3BMM1187FB4255E", "artist_latitude": 38.8991, "artist_longitude": -77.029, "artist_location": "Washington", "artist_name": "Alice Stuart", "song_id": "SOBEBDG12A58A76D60", "title": "Kassie Jones", "duration": 220.78649, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBZN12903CD9297.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGSAFR1269FB35070", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": 
"Blingtones", "song_id": "SOTCKKY12AB018A141", "title": "Sonnerie lalaleul\u00e9 hi houuu", "duration": 29.54404, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCEC128F426456E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR0IAWL1187B9A96D0", "artist_latitude": 8.4177, "artist_longitude": -80.11278, "artist_location": "Panama", "artist_name": "Danilo Perez", "song_id": "SONSKXP12A8C13A2C9", "title": "Native Soul", "duration": 197.19791, "year": 2003} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCXB128F4286BD3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARWB3G61187FB49404", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamilton, Ohio", "artist_name": "Steve Morse", "song_id": "SODAUVL12A8C13D184", "title": "Prognosis", "duration": 363.85914, "year": 2000} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCYE128F934CE1D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREVWGE1187B9B890A", "artist_latitude": -13.442, "artist_longitude": -41.9952, "artist_location": "Noci (BA)", "artist_name": "Bitter End", "song_id": "SOFCHDR12AB01866EF", "title": "Living Hell", "duration": 282.43546, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAABD128F429CF47.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMJAGH1187FB546F3", "artist_latitude": 35.14968, 
"artist_longitude": -90.04892, "artist_location": "Memphis, TN", "artist_name": "The Box Tops", "song_id": "SOCIWDW12A8C13D406", "title": "Soul Deep", "duration": 148.03546, "year": 1969} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABCL128F4286650.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARC43071187B990240", "artist_latitude": null, "artist_longitude": null, "artist_location": "Wisner, LA", "artist_name": "Wayne Watson", "song_id": "SOKEJEJ12A8C13E0D0", "title": "The Urgency (LP Version)", "duration": 245.21098, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABNV128F425CEE1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIG6O41187B988BDD", "artist_latitude": 37.16793, "artist_longitude": -95.84502, "artist_location": "United States", "artist_name": "Richard Souther", "song_id": "SOUQQEA12A8C134B1B", "title": "High Tide", "duration": 228.5971, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACQT128F9331780.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1Y2PT1187FB5B9CE", "artist_latitude": 27.94017, "artist_longitude": -82.32547, "artist_location": "Brandon", "artist_name": "John Wesley", "song_id": "SOLLHMX12AB01846DC", "title": "The Emperor Falls", "duration": 484.62322, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACZK128F4243829.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 
1, "artist_id": "ARGUVEV1187B98BA17", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sierra Maestra", "song_id": "SOGOSOV12AF72A285E", "title": "\u00bfD\u00f3nde va Chichi?", "duration": 313.12934, "year": 1997} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABAXV128F92F6AE3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDBBQ1187B98AFF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Eddie Calvert", "song_id": "SOBBXLX12A58A79DDA", "title": "Erica (2005 Digital Remaster)", "duration": 138.63138, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBVJ128F92F7EAA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDL271187FB40F44", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Soul Mekanik", "song_id": "SOPEGZN12AB0181B3D", "title": "Get Your Head Stuck On Your Neck", "duration": 45.66159, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBXU128F92FEF48.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARP6N5A1187B99D1A3", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamtramck, MI", "artist_name": "Mitch Ryder", "song_id": "SOXILUQ12A58A7C72A", "title": "Jenny Take a Ride", "duration": 207.43791, "year": 2004} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAAVO128F93133D4.json: 
-------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGSJW91187B9B1D6B", "artist_latitude": 35.21962, "artist_longitude": -80.01955, "artist_location": "North Carolina", "artist_name": "JennyAnyKind", "song_id": "SOQHXMF12AB0182363", "title": "Young Boy Blues", "duration": 218.77506, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACSL128F93462F4.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARAJPHH1187FB5566A", "artist_latitude": 40.7038, "artist_longitude": -73.83168, "artist_location": "Queens, NY", "artist_name": "The Shangri-Las", "song_id": "SOYTPEP12AB0180E7B", "title": "Twist and Shout", "duration": 164.80608, "year": 1964} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBTA128F933D304.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARAGB2O1187FB3A161", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Pucho & His Latin Soul Brothers", "song_id": "SOLEYHO12AB0188A85", "title": "Got My Mojo Workin", "duration": 338.23302, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABJV128F1460C49.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIK43K1187B9AE54C", "artist_latitude": null, "artist_longitude": null, "artist_location": "Beverly Hills, CA", "artist_name": "Lionel Richie", "song_id": "SOBONFF12A6D4F84D8", "title": "Tonight Will Be Alright", "duration": 307.3824, "year": 1986} 
-------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABLR128F423B7E3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD842G1187B997376", "artist_latitude": 43.64856, "artist_longitude": -79.38533, "artist_location": "Toronto, Ontario, Canada", "artist_name": "Blue Rodeo", "song_id": "SOHUOAP12A8AE488E9", "title": "Floating", "duration": 491.12771, "year": 1987} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACOW128F933E35F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARH4Z031187B9A71F2", "artist_latitude": 40.73197, "artist_longitude": -74.17418, "artist_location": "Newark, NJ", "artist_name": "Faye Adams", "song_id": "SOVYKGO12AB0187199", "title": "Crazy Mixed Up World", "duration": 156.39465, "year": 1961} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABAFJ128F42AF24E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR3JMC51187B9AE49D", "artist_latitude": 28.53823, "artist_longitude": -81.37739, "artist_location": "Orlando, FL", "artist_name": "Backstreet Boys", "song_id": "SOPVXLX12A8C1402D5", "title": "Larger Than Life", "duration": 236.25098, "year": 1999} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABAWW128F4250A31.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQ9BO41187FB5CF1F", "artist_latitude": 40.99471, "artist_longitude": -77.60454, "artist_location": "Pennsylvania", "artist_name": "John Davis", 
"song_id": "SOMVWWT12A58A7AE05", "title": "Knocked Out Of The Park", "duration": 183.17016, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBAM128F429D223.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBGXIG122988F409D", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "Steel Rain", "song_id": "SOOJPRH12A8C141995", "title": "Loaded Like A Gun", "duration": 173.19138, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBJE12903CDB442.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGCY1Y1187B9A4FA5", "artist_latitude": 36.16778, "artist_longitude": -86.77836, "artist_location": "Nashville, TN.", "artist_name": "Gloriana", "song_id": "SOQOTLQ12AB01868D0", "title": "Clementina Santaf\u00e8", "duration": 153.33832, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBLU128F93349CF.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNNKDK1187B98BBD5", "artist_latitude": 45.80726, "artist_longitude": 15.9676, "artist_location": "Zagreb Croatia", "artist_name": "Jinx", "song_id": "SOFNOQK12AB01840FC", "title": "Kutt Free (DJ Volume Remix)", "duration": 407.37914, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAAPK128E0786D96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR10USD1187B99F3F1", "artist_latitude": 
null, "artist_longitude": null, "artist_location": "Burlington, Ontario, Canada", "artist_name": "Tweeterfriendly Music", "song_id": "SOHKNRJ12A6701D1F8", "title": "Drop of Rain", "duration": 189.57016, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/B/TRAABYN12903CFD305.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQGYP71187FB44566", "artist_latitude": 34.31109, "artist_longitude": -94.02978, "artist_location": "Mineola, AR", "artist_name": "Jimmy Wakely", "song_id": "SOWTBJW12AC468AC6E", "title": "Broken-Down Merry-Go-Round", "duration": 151.84934, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACPE128F421C1B9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARB29H41187B98F0EF", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago", "artist_name": "Terry Callier", "song_id": "SOGNCJP12A58A80271", "title": "Do You Finally Need A Friend", "duration": 342.56934, "year": 1972} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABAVQ12903CBF7E0.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMBR4Y1187B9990EB", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "David Martin", "song_id": "SOTTDKS12AB018D69B", "title": "It Wont Be Christmas", "duration": 241.47546, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBKX128F4285205.json: 
-------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR36F9J1187FB406F1", "artist_latitude": 56.27609, "artist_longitude": 9.51695, "artist_location": "Denmark", "artist_name": "Bombay Rockers", "song_id": "SOBKWDJ12A8C13B2F3", "title": "Wild Rose (Back 2 Basics Mix)", "duration": 230.71302, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/B/TRABBOP128F931B50D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBEBBY1187B9B43DB", "artist_latitude": null, "artist_longitude": null, "artist_location": "Gainesville, FL", "artist_name": "Tom Petty", "song_id": "SOFFKZS12AB017F194", "title": "A Higher Place (Album Version)", "duration": 236.17261, "year": 1994} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/A/TRAAAMO128F1481E7F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKFYS91187B98E58F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Jeff And Sheri Easter", "song_id": "SOYMRWW12A6D4FAB14", "title": "The Moon And I (Ordinary Day Album Version)", "duration": 267.7024, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACFV128F935E50B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR47JEX1187B995D81", "artist_latitude": 37.83721, "artist_longitude": -94.35868, "artist_location": "Nevada, MO", "artist_name": "SUE THOMPSON", "song_id": "SOBLGCN12AB0183212", "title": "James (Hold The Ladder Steady)", "duration": 124.86485, "year": 1985} 
-------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACNS128F14A2DF5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROUOZZ1187B9ABE51", "artist_latitude": 40.79195, "artist_longitude": -73.94512, "artist_location": "New York, NY [Spanish Harlem]", "artist_name": "Willie Bobo", "song_id": "SOBZBAZ12A6D4F8742", "title": "Spanish Grease", "duration": 168.25424, "year": 1997} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACTB12903CAAF15.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR0RCMP1187FB3F427", "artist_latitude": 30.08615, "artist_longitude": -94.10158, "artist_location": "Beaumont, TX", "artist_name": "Billie Jo Spears", "song_id": "SOGXHEG12AB018653E", "title": "It Makes No Difference Now", "duration": 133.32853, "year": 1992} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABAIO128F42938F9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR9AWNF1187B9AB0B4", "artist_latitude": null, "artist_longitude": null, "artist_location": "Seattle, Washington USA", "artist_name": "Kenny G featuring Daryl Hall", "song_id": "SOZHPGD12A8C1394FE", "title": "Baby Come To Me", "duration": 236.93016, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABAXR128F426515F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARI2JSK1187FB496EF", "artist_latitude": 51.50632, "artist_longitude": -0.12714, "artist_location": "London, 
England", "artist_name": "Nick Ingman;Gavyn Wright", "song_id": "SODUJBS12A8C132150", "title": "Wessex Loses a Bride", "duration": 111.62077, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCKL128F423A778.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPFHN61187FB575F6", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago, IL", "artist_name": "Lupe Fiasco", "song_id": "SOWQTQZ12A58A7B63E", "title": "Streets On Fire (Explicit Album Version)", "duration": 279.97995, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACLV128F427E123.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARDNS031187B9924F0", "artist_latitude": 32.67828, "artist_longitude": -83.22295, "artist_location": "Georgia", "artist_name": "Tim Wilson", "song_id": "SONYPOM12A8C13B2D7", "title": "I Think My Wife Is Running Around On Me (Taco Hell)", "duration": 186.48771, "year": 2005} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCFL128F149BB0D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARLTWXK1187FB5A3F8", "artist_latitude": 32.74863, "artist_longitude": -97.32925, "artist_location": "Fort Worth, TX", "artist_name": "King Curtis", "song_id": "SODREIN12A58A7F2E5", "title": "A Whiter Shade Of Pale (Live @ Fillmore West)", "duration": 326.00771, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/C/TRABCIX128F4265903.json: 
-------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNF6401187FB57032", "artist_latitude": 40.79086, "artist_longitude": -73.96644, "artist_location": "New York, NY [Manhattan]", "artist_name": "Sophie B. Hawkins", "song_id": "SONWXQJ12A8C134D94", "title": "The Ballad Of Sleeping Beauty", "duration": 305.162, "year": 1994} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACER128F4290F96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMAC4T1187FB3FA4C", "artist_latitude": 40.82624, "artist_longitude": -74.47995, "artist_location": "Morris Plains, NJ", "artist_name": "The Dillinger Escape Plan", "song_id": "SOBBUGU12A8C13E95D", "title": "Setting Fire to Sleeping Giants", "duration": 207.77751, "year": 2004} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/A/C/TRAACVS128E078BE39.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREBBGV1187FB523D2", "artist_latitude": null, "artist_longitude": null, "artist_location": "Houston, TX", "artist_name": "Mike Jones (Featuring CJ_ Mello & Lil' Bran)", "song_id": "SOOLYAZ12A6701F4A6", "title": "Laws Patrolling (Album Version)", "duration": 173.66159, "year": 0} -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/song_data/A/B/A/TRABAFP128F931E9A1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPBNLO1187FB3D52F", "artist_latitude": 40.71455, "artist_longitude": -74.00712, "artist_location": "New York, NY", "artist_name": "Tiny Tim", "song_id": "SOAOIBZ12AB01815BE", "title": "I Hold Your Hand In Mine [Live 
class UdacityPlugin(AirflowPlugin):
    """Airflow plugin that registers this project's operators and SQL helpers."""

    name = "udacity_plugin"

    # Each right-hand side is evaluated before the class attribute of the same
    # name is bound, so `operators` / `helpers` still refer to the modules
    # imported at the top of the file.
    operators = [
        operators.StageToRedshiftOperator,
        operators.LoadFactOperator,
        operators.LoadDimensionOperator,
        operators.DataQualityOperator,
    ]
    helpers = [helpers.SqlQueries]
def drop_tables(cur, conn):
    """
    Run every statement in ``drop_table_queries`` on the given connection.

    Each statement is committed on its own. A failing statement is reported
    and rolled back so that the remaining statements can still run.

    Parameters:
    ----------
    cur: cursor
        open cursor belonging to ``conn``
    conn: connection
        open psycopg2 connection to the cluster
    """
    for idx, query in enumerate(drop_table_queries):
        try:
            cur.execute(query)
            conn.commit()
            print("Success: Dropping Table {}".format(idx))
        except psycopg2.Error as e:
            # A failed statement leaves the transaction aborted; without a
            # rollback every following statement would fail as well.
            conn.rollback()
            print("Error: Dropping Table {}".format(idx))
            print(e)


def create_tables(cur, conn):
    """
    Run every statement in ``create_table_queries`` on the given connection.

    Each statement is committed on its own. A failing statement is reported
    and rolled back so that the remaining statements can still run.

    Parameters:
    ----------
    cur: cursor
        open cursor belonging to ``conn``
    conn: connection
        open psycopg2 connection to the cluster
    """
    for idx, query in enumerate(create_table_queries):
        try:
            cur.execute(query)
            conn.commit()
            print("Success: Creating Table {}".format(idx))
        except psycopg2.Error as e:
            # Reset the aborted transaction before moving on to the next table.
            conn.rollback()
            print("Error: Creating Table {}".format(idx))
            print(e)


def main():
    """
    Connect to the cluster described in ``dwh.cfg`` and rebuild all tables.
    """
    config = configparser.ConfigParser()
    config.read('dwh.cfg')

    # The [CLUSTER] values are consumed positionally, so the section must list
    # them in the order host, dbname, user, password, port.
    conn = psycopg2.connect(
        "host={} dbname={} user={} password={} port={}".format(
            *config['CLUSTER'].values()))
    try:
        cur = conn.cursor()
        drop_tables(cur, conn)
        create_tables(cur, conn)
    finally:
        # Release the connection even if one of the steps raises.
        conn.close()
def insert_tables(cur, conn):
    """
    Run every statement in ``insert_table_queries`` on the given connection.

    Each statement is committed on its own. A failing statement is reported
    and rolled back so that the remaining statements can still run.

    Parameters:
    ----------
    cur: cursor
        open cursor belonging to ``conn``
    conn: connection
        open psycopg2 connection to the cluster
    """
    for idx, query in enumerate(insert_table_queries):
        try:
            cur.execute(query)
            conn.commit()
            print("Success: Inserting Table {}".format(idx))
        except psycopg2.Error as e:
            # A failed statement leaves the transaction aborted; without a
            # rollback every following insert would fail as well.
            conn.rollback()
            print("Error: Inserting into table {}".format(idx))
            print(e)


def main():
    """
    Connect to the cluster described in ``dwh.cfg``, load the staging tables
    and then run the insert queries.
    """
    config = configparser.ConfigParser()
    config.read('dwh.cfg')

    # The [CLUSTER] values are consumed positionally, so the section must list
    # them in the order host, dbname, user, password, port.
    conn = psycopg2.connect(
        "host={} dbname={} user={} password={} port={}".format(
            *config['CLUSTER'].values()))
    try:
        cur = conn.cursor()
        load_staging_tables(cur, conn)
        insert_tables(cur, conn)
    finally:
        # Release the connection even if one of the steps raises.
        conn.close()
33 | "name": "python2" 34 | }, 35 | "language_info": { 36 | "codemirror_mode": { 37 | "name": "ipython", 38 | "version": 2 39 | }, 40 | "file_extension": ".py", 41 | "mimetype": "text/x-python", 42 | "name": "python", 43 | "nbconvert_exporter": "python", 44 | "pygments_lexer": "ipython2", 45 | "version": "2.7.14" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 2 50 | } 51 | -------------------------------------------------------------------------------- /nosql_db_modeling_apache_cassandra/README.md: -------------------------------------------------------------------------------- 1 | # Project 2: NoSQL Data Modeling with Apache Cassandra 2 | 3 |

4 | 5 | ## Introduction 6 | As a data engineer, I was responsible for developing a NoSQL database to empower the analytics team at Sparkify. The analytics team is particularly interested in understanding what songs users are listening to. Currently, there is no easy way to query the data to generate the results, since the data reside in a directory of CSV files on user activity on the app. 7 | 8 | ### Achievements 9 | Data modeling with Apache Cassandra and building an ETL pipeline using Python. I defined the table schemas based on the analytics team's query requirements, and wrote an ETL pipeline that transfers data from a set of CSV files within a directory to create a streamlined CSV file to model and insert data into Apache Cassandra tables using Python and SQL. 10 | Skills include: 11 | * Building out an ETL pipeline using Python 12 | * Creating a database schema and ETL pipeline for this analysis 13 | * Creating an Apache Cassandra database with denormalized tables designed to optimize queries on event data. 14 | * Defining robust Partition Keys, Clustering Columns and Composite Primary Keys. 15 | * Testing the database and ETL pipeline by running queries provided by the analytics team from Sparkify and comparing results with their expected results. 16 | 17 | ### Schema for Data Analysis 18 | Process the event_datafile_new.csv dataset to create a denormalized dataset. 
class LoadFactOperator(BaseOperator):
    """
    Append the rows produced by a SELECT statement to a fact table.

    Parameters:
    ----------
    redshift_conn_id: string
        airflow connection to redshift cluster
    table: string
        target table located in redshift cluster
    sql_stmt: string
        SQL command to generate insert data.
    """
    ui_color = '#F98866'

    @apply_defaults
    def __init__(self,
                 redshift_conn_id="redshift",
                 table="",
                 sql_stmt="",
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.sql_stmt = sql_stmt
        self.table = table
        self.redshift_conn_id = redshift_conn_id

    def execute(self, context):
        """
        Insert data into the fact table from staging events and song data.

        Fact tables are typically large, so rows are only appended; the
        target table is never emptied first.
        """
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        target, select_stmt = self.table, self.sql_stmt
        insert_sql = f"INSERT INTO {target} ({select_stmt})"
        redshift.run(insert_sql)
        self.log.info(f"Success: {self.task_id}")
class DataQualityOperator(BaseOperator):
    """
    Perform data quality checks on resulting fact and dimension tables.

    Parameters:
    ----------
    redshift_conn_id: string
        airflow connection to redshift cluster
    table: string
        table located in redshift cluster
    test_stmt: string
        optional test SQL command to check validity of target table
    result: object
        expected outcome of test_stmt; either the full first row or, for a
        single-column query, the bare value of that column
    """
    ui_color = '#89DA59'

    @apply_defaults
    def __init__(self,
                 redshift_conn_id="",
                 table="",
                 test_stmt=None,
                 result=None,
                 *args, **kwargs):

        super(DataQualityOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table
        self.test_stmt = test_stmt
        self.result = result

    def execute(self, context):
        """
        Fail the task when the table is empty or the optional test query does
        not return the expected result.

        Raises:
        ------
        ValueError
            if the row count query returns nothing, the table has no rows,
            or the output of test_stmt does not match result
        """
        # Keyword form, matching how the other operators build their hook.
        pg_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        records = pg_hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(f"Fail: No results for {self.table}")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(f"Fail: 0 rows in {self.table}")

        if self.test_stmt:
            output = pg_hook.get_first(self.test_stmt)
            # get_first hands back the whole first row, so a scalar `result`
            # can never equal it directly. Accept an exact row match (the
            # previous behaviour) or a match on the only column of the row.
            single_value_match = (
                output is not None
                and len(output) == 1
                and output[0] == self.result
            )
            if output != self.result and not single_value_match:
                raise ValueError(f"Fail: {output} != {self.result}")
        self.log.info(f"Success: {self.table} has {num_records} records")
def create_database(default_dbname="studentdb"):
    """
    Drop and re-create sparkifydb, then connect to it.

    Parameters:
    ----------
    default_dbname: string
        existing database used for the administrative connection. It must not
        be sparkifydb itself: PostgreSQL refuses to drop the database the
        session is connected to, and sparkifydb may not exist yet on a
        first run.

    Returns:
    -------
    (cur, conn): cursor and connection bound to the fresh sparkifydb
    """
    # connect to default database
    conn = psycopg2.connect(
        "host=127.0.0.1 dbname={} user=student password=student".format(
            default_dbname))
    # DROP/CREATE DATABASE cannot run inside a transaction block.
    conn.set_session(autocommit=True)
    cur = conn.cursor()

    # create sparkify database with UTF8 encoding
    try:
        cur.execute("DROP DATABASE IF EXISTS sparkifydb")
    except psycopg2.Error as e:
        print("Error: Dropping Database")
        print(e)

    try:
        cur.execute("CREATE DATABASE sparkifydb WITH ENCODING "
                    "'utf8' TEMPLATE template0;")
    except psycopg2.Error as e:
        print("Error: Creating Database")
        print(e)
    # close connection to default database
    conn.close()

    # connect to sparkify database
    conn = psycopg2.connect(
        "host=127.0.0.1 dbname=sparkifydb user=student password=student")
    cur = conn.cursor()

    return cur, conn


def drop_tables(cur, conn):
    """
    Run every statement in ``drop_table_queries`` on the given connection.

    A failing statement is reported and rolled back so that the remaining
    statements can still run.
    """
    for query in drop_table_queries:
        try:
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            # A failed statement leaves the transaction aborted; without a
            # rollback every following statement would fail as well.
            conn.rollback()
            print("Error: Dropping table")
            print(e)


def create_tables(cur, conn):
    """
    Run every statement in ``create_table_queries`` on the given connection.

    A failing statement is reported and rolled back so that the remaining
    statements can still run.
    """
    for query in create_table_queries:
        try:
            cur.execute(query)
            conn.commit()
        except psycopg2.Error as e:
            # Reset the aborted transaction before moving on to the next table.
            conn.rollback()
            print("Error: Creating table")
            print(e)


def main():
    """
    Rebuild sparkifydb from scratch and create all of its tables.
    """
    cur, conn = create_database()
    try:
        drop_tables(cur, conn)
        create_tables(cur, conn)
    finally:
        # Release the connection even if one of the steps raises.
        conn.close()
# C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | env/ 22 | build/ 23 | conda-dist 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # dotenv 94 | .env 95 | 96 | # virtualenv 97 | .venv 98 | venv/ 99 | ENV/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | 114 | .vscode 115 | *.swp 116 | 117 | # osx generated files 118 | .DS_Store 119 | .DS_Store? 
-- Schema for the Airflow pipeline: two staging tables, one fact table
-- (songplays) and four dimension tables (artists, songs, time, users).
-- Every statement uses IF NOT EXISTS, so the script can be re-run safely.

-- Artist dimension.
-- NOTE(review): unlike the other dimension tables this one declares no
-- PRIMARY KEY, and the column is spelled "lattitude"; renaming it would be a
-- schema change, so it is only flagged here.
-- NOTE(review): numeric(18,0) has scale 0, i.e. no fractional digits are
-- kept for the coordinates -- confirm this is intended.
CREATE TABLE IF NOT EXISTS public.artists (
    artistid varchar(256) NOT NULL,
    name varchar(256),
    location varchar(256),
    lattitude numeric(18,0),
    longitude numeric(18,0)
);

-- Fact table: one row per song play, keyed by playid.
CREATE TABLE IF NOT EXISTS public.songplays (
    playid varchar(32) NOT NULL,
    start_time timestamp NOT NULL,
    userid int4 NOT NULL,
    "level" varchar(256),
    songid varchar(256),
    artistid varchar(256),
    sessionid int4,
    location varchar(256),
    user_agent varchar(256),
    CONSTRAINT songplays_pkey PRIMARY KEY (playid)
);

-- Song dimension, keyed by songid.
CREATE TABLE IF NOT EXISTS public.songs (
    songid varchar(256) NOT NULL,
    title varchar(256),
    artistid varchar(256),
    "year" int4,
    duration numeric(18,0),
    CONSTRAINT songs_pkey PRIMARY KEY (songid)
);

-- Staging table for the raw event log; no constraints.
-- ts is presumably epoch milliseconds (the insert queries divide it by 1000)
-- -- verify against the log data.
CREATE TABLE IF NOT EXISTS public.staging_events (
    artist varchar(256),
    auth varchar(256),
    firstname varchar(256),
    gender varchar(256),
    iteminsession int4,
    lastname varchar(256),
    length numeric(18,0),
    "level" varchar(256),
    location varchar(256),
    "method" varchar(256),
    page varchar(256),
    registration numeric(18,0),
    sessionid int4,
    song varchar(256),
    status int4,
    ts int8,
    useragent varchar(256),
    userid int4
);

-- Staging table for the raw song metadata; no constraints.
CREATE TABLE IF NOT EXISTS public.staging_songs (
    num_songs int4,
    artist_id varchar(256),
    artist_name varchar(256),
    artist_latitude numeric(18,0),
    artist_longitude numeric(18,0),
    artist_location varchar(256),
    song_id varchar(256),
    title varchar(256),
    duration numeric(18,0),
    "year" int4
);

-- Time dimension, keyed by start_time. "month" and weekday are stored as
-- varchar while the other date parts are int4.
CREATE TABLE IF NOT EXISTS public."time" (
    start_time timestamp NOT NULL,
    "hour" int4,
    "day" int4,
    week int4,
    "month" varchar(256),
    "year" int4,
    weekday varchar(256),
    CONSTRAINT time_pkey PRIMARY KEY (start_time)
) ;

-- User dimension, keyed by userid.
CREATE TABLE IF NOT EXISTS public.users (
    userid int4 NOT NULL,
    first_name varchar(256),
    last_name varchar(256),
    gender varchar(256),
    "level" varchar(256),
    CONSTRAINT users_pkey PRIMARY KEY (userid)
);
class StageToRedshiftOperator(BaseOperator):
    """
    Copy data from an S3 location into a Redshift staging table.

    The operator builds a Redshift ``COPY`` statement from ``copy_sql`` and
    runs it through a ``PostgresHook``; the AWS keys placed in the statement
    are read from an ``S3Hook``.

    Parameters:
    ----------
    redshift_conn_id: string
        airflow connection to redshift cluster
    aws_credentials_id: string
        airflow connection to AWS
    table: string
        target table located in redshift cluster
    s3_bucket: string
        bucket location of staging data
    s3_key: string
        path location of staging data (listed in ``template_fields``)
    file_format: string
        file format to copy data, 'CSV' (any letter case) or JSON;
        anything other than CSV is loaded as JSON, default JSON
    """
    ui_color = '#358140'
    template_fields = ("s3_key",)
    # Placeholders: {0} table, {1} S3 path, {2}/{3} AWS key pair,
    # {4} format clause, {5} delimiter clause, {6} header clause.
    copy_sql = """
        COPY {0}
        FROM '{1}'
        ACCESS_KEY_ID '{2}'
        SECRET_ACCESS_KEY '{3}'
        {4}
        {5}
        {6}
        TIMEFORMAT as 'epochmillisecs'
        TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL
    """

    @apply_defaults
    def __init__(self,
                 redshift_conn_id="redshift",
                 aws_credentials_id="aws_credentials",
                 table="staging_temp",
                 s3_bucket="dend",
                 s3_key="temp",
                 file_format="JSON",
                 *args, **kwargs):

        super(StageToRedshiftOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.aws_credentials_id = aws_credentials_id
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.file_format = file_format

    def execute(self, context):
        """
        Copy data from S3 buckets to redshift cluster into staging tables.

        Parameters:
        ----------
        context: dict
            airflow task context (not used by this method)
        """
        self.log.info(f"Begin {self.table} StageToRedshiftOperator")
        pg_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        s3_hook = S3Hook(aws_conn_id=self.aws_credentials_id)
        credentials = s3_hook.get_credentials()

        self.log.info(f"Copying {self.table} from S3 to Redshift")
        s3_path = "s3://{}/{}/".format(self.s3_bucket, self.s3_key)

        # (format clause, delimiter clause, header clause) of the COPY
        # statement; the comparison ignores letter case so 'csv' also works.
        if str(self.file_format).upper() == 'CSV':
            copy_options = ("CSV", "DELIMITER ','", "IGNOREHEADER 1")
        else:
            copy_options = ("FORMAT AS JSON 'auto'", "", "")

        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            *copy_options
        )

        # Log a copy of the statement with the AWS keys masked so that the
        # secrets never end up in the task log.
        # NOTE(review): the hook may log the statement it runs on its own -
        # confirm for the installed Airflow version.
        redacted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table,
            s3_path,
            "***",
            "***",
            *copy_options
        )
        self.log.info(f"QUERY: {redacted_sql}")
        pg_hook.run(formatted_sql)
        self.log.info(f"Success: Copying {self.table} from S3 to Redshift")

4 | 5 | ## Introduction 6 | As a data engineer, I was responsible for developing a relational database to empower the analytics team at Sparkify, a recent startup. They were interested in analyzing data collected on songs and user activity through their new music streaming app. Currently, the data resides in a directory of JSON logs on user activity on the app, as well as a directory with JSON metadata on the songs in their app. The analytics team have no easy way to query their data. 7 | 8 | ### Achievements 9 | Data modeling with PostgreSQL and building an ETL pipeline using Python. Define fact and dimension tables for a star schema for a particular analytic focus, and write an ETL pipeline that transfers data from files in two local directories into these tables in PostgreSQL using Python and SQL. 10 | Skills include: 11 | * Building out an ETL pipeline using Python 12 | * Creating a database schema and ETL pipeline for this analysis 13 | * Creating a PostgreSQL database with tables designed to optimize queries on song play analysis 14 | * Testing the database and ETL pipeline by running queries given to you by the analytics team from Sparkify and comparing results with their expected results. 15 | 16 | # Available Data 17 | ### Song Dataset 18 | The first dataset is a subset of real data from the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong). Each file is in JSON format and contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. For example, here are filepaths to two files in this dataset. 19 | 20 | ``` 21 | song_data/A/B/C/TRABCEI128F424C983.json 22 | song_data/A/A/B/TRAABJL12903CDCF1A.json 23 | ``` 24 | And below is an example of what a single song file, TRAABJL12903CDCF1A.json, looks like. 
25 | ``` 26 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} 27 | ``` 28 | ### Log Dataset 29 | The second dataset consists of log files in JSON format generated by this event simulator based on the songs in the dataset above. These simulate app activity logs from a music streaming app based on specified configurations. 30 | 31 | The log files in the dataset you'll be working with are partitioned by year and month. For example, here are filepaths to two files in this dataset. 32 | 33 | ``` 34 | log_data/2018/11/2018-11-12-events.json 35 | log_data/2018/11/2018-11-13-events.json 36 | ``` 37 | 38 | # Schema for Song Play Analysis 39 | Using the song and log datasets, you'll need to create a star schema optimized for queries on song play analysis. This includes the following tables. 40 | 41 | 42 | 43 | #### Fact Table 44 | 1. songplays - records in log data associated with song plays i.e. records with page `NextSong` 45 | * songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent 46 | 47 | #### Dimension Tables 48 | 2. users - users in the app 49 | * user_id, first_name, last_name, gender, level 50 | 3. songs - songs in music database 51 | * song_id, title, artist_id, year, duration 52 | 4. artists - artists in music database 53 | * artist_id, name, location, lattitude, longitude 54 | 5. time - timestamps of records in songplays broken down into specific units 55 | * start_time, hour, day, week, month, year, weekday 56 | -------------------------------------------------------------------------------- /data_lake_spark/README.md: -------------------------------------------------------------------------------- 1 | # Project 4: Data Lake & Spark 2 | 3 |

4 | 5 | ## Introduction 6 | As a data engineer, I was responsible for developing a data lake for the analytics team at Sparkify. After considerable growth in user base and song database it was time to move the data warehouse to a data lake and enhance data processing through Spark. 7 | 8 | ### Achievements 9 | As their data engineer, I was responsible for building out an ETL pipeline, extracting data from S3 buckets, processing it through Spark and transforming into a star schema stored in S3 buckets with parquet formatting and efficient partitioning. The database and ETL pipeline were validated by running queries provided by the analytics team and compared expected results. 10 | Skills include: 11 | * Building out an ETL pipeline using Spark, Python, Hadoop Clusters (EMR). 12 | * Fast-tracking the data lake buildout using (serverless) AWS Lambda and cataloging tables with an AWS Glue Crawler 13 | * Setting up IAM Roles, Hadoop Clusters, EMR, Config files and security groups. 14 | * Scaling up the data analysis process through the use of a data lake and Spark, in order to further optimize queries on song play analysis 15 | 16 | # Run The Scripts 17 | The primary file in this repo is the `etl.py`, which will read in files from S3 buckets, process them using Spark and store them as parquet files in S3 buckets, partitioned appropriately. 18 | 19 | # Available Data 20 | ### Song Dataset 21 | The first dataset is a subset of real data from the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong). Each file is in JSON format and contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. For example, here are filepaths to two files in this dataset. 22 | 23 | ``` 24 | song_data/A/B/C/TRABCEI128F424C983.json 25 | song_data/A/A/B/TRAABJL12903CDCF1A.json 26 | ``` 27 | And below is an example of what a single song file, TRAABJL12903CDCF1A.json, looks like. 
28 | ``` 29 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} 30 | ``` 31 | ### Log Dataset 32 | The second dataset consists of log files in JSON format generated by this event simulator based on the songs in the dataset above. These simulate app activity logs from a music streaming app based on specified configurations. 33 | 34 | The log files in the dataset you'll be working with are partitioned by year and month. For example, here are filepaths to two files in this dataset. 35 | 36 | ``` 37 | log_data/2018/11/2018-11-12-events.json 38 | log_data/2018/11/2018-11-13-events.json 39 | ``` 40 | 41 | # Schema for Song Play Analysis 42 | Using the song and log datasets, you'll need to create a star schema optimized for queries on song play analysis. This includes the following tables. 43 | 44 | 45 | 46 | #### Fact Table 47 | 1. songplays - records in log data associated with song plays i.e. records with page `NextSong` 48 | * songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent 49 | 50 | #### Dimension Tables 51 | 2. users - users in the app 52 | * user_id, first_name, last_name, gender, level 53 | 3. songs - songs in music database 54 | * song_id, title, artist_id, year, duration 55 | 4. artists - artists in music database 56 | * artist_id, name, location, lattitude, longitude 57 | 5. 
time - timestamps of records in songplays broken down into specific units 58 | * start_time, hour, day, week, month, year, weekday 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Nanodegree 2 | 3 | Projects and resources developed in the [DEND Nanodegree](https://www.udacity.com/course/data-engineer-nanodegree--nd027) from Udacity. 4 | 5 | ## Project 1: [Relational Databases - Data Modeling with PostgreSQL](https://github.com/danieldiamond/udacity-dend/tree/master/relational_db_modeling_postgresql) 6 | 7 |

8 | 9 | Developed a relational database using PostgreSQL to model user activity data for a music streaming app. Skills include: 10 | * Created a relational database using PostgreSQL 11 | * Developed a Star Schema database using optimized definitions of Fact and Dimension tables. Normalization of tables. 12 | * Built out an ETL pipeline to optimize queries in order to understand what songs users listen to. 13 | 14 | Proficiencies include: Python, PostgreSql, Star Schema, ETL pipelines, Normalization 15 | 16 | 17 | ## Project 2: [NoSQL Databases - Data Modeling with Apache Cassandra](https://github.com/danieldiamond/udacity-dend/tree/master/nosql_db_modeling_apache_cassandra) 18 | 19 |

20 | 21 | Designed a NoSQL database using Apache Cassandra based on the original schema outlined in project one. Skills include: 22 | * Created a NoSQL database using Apache Cassandra (both locally and with docker containers) 23 | * Developed denormalized tables optimized for a specific set of queries and business needs 24 | 25 | Proficiencies used: Python, Apache Cassandra, Denormalization 26 | 27 | 28 | ## Project 3: [Data Warehouse - Amazon Redshift](https://github.com/danieldiamond/udacity-dend/tree/master/data_warehouse_redshift) 29 | 30 |

31 | 32 | Created a data warehouse utilizing Amazon Redshift. Skills include: 33 | * Creating a Redshift cluster, IAM roles and security groups. 34 | * Developing an ETL pipeline that copies data from S3 buckets into staging tables to be processed into a star schema. 35 | * Developing a star schema optimized for the specific queries required by the data analytics team. 36 | 37 | Proficiencies used: Python, Amazon Redshift, aws cli, Amazon SDK, SQL, PostgreSQL 38 | 39 | ## Project 4: [Data Lake - Spark](https://github.com/danieldiamond/udacity-dend/tree/master/data_lake_spark) 40 | 41 |

42 | 43 | Scaled up the current ETL pipeline by moving the data warehouse to a data lake. Skills include: 44 | * Create an EMR Hadoop Cluster 45 | * Further develop the ETL Pipeline copying datasets from S3 buckets, data processing using Spark and writing to S3 buckets using efficient partitioning and parquet formatting. 46 | * Fast-tracking the data lake buildout using (serverless) AWS Lambda and cataloging tables with AWS Glue Crawler. 47 | 48 | Technologies used: Spark, S3, EMR, Athena, Amazon Glue, Parquet. 49 | 50 | ## Project 5: [Data Pipelines - Airflow](https://github.com/danieldiamond/udacity-dend/tree/master/data_pipelines_airflow) 51 | 52 |

53 | 54 | Automated the ETL pipeline and the creation of the data warehouse using Apache Airflow. Skills include: 55 | * Automating ETL pipelines using Airflow, Python and Amazon Redshift. 56 | * Writing custom operators to perform tasks such as staging data, filling the data warehouse, and validation through data quality checks. 57 | * Transforming data from various sources into a star schema optimized for the analytics team's use cases. 58 | 59 | Technologies used: Apache Airflow, S3, Amazon Redshift, Python. 60 | -------------------------------------------------------------------------------- /data_warehouse_redshift/README.md: -------------------------------------------------------------------------------- 1 | # Project 3: Data Warehouse Management With Amazon Redshift 2 | 3 |

4 | 5 | ## Introduction 6 | As a data engineer, I was responsible for developing a data warehouse for the analytics team at Sparkify. After recent growth in their user base and song database they want to move their processes and data onto the cloud. Data currently resides in S3 buckets, in a directory of JSON logs on user activity on the app, as well as a directory with JSON metadata on the songs in their app. 7 | 8 | ### Achievements 9 | As their data engineer, I was responsible for building out an ETL pipeline, extracting data from S3, staging in Redshift, and transforming the data into a set of dimensional tables for their analytics team to continue finding insights in what songs their users are listening to. The database and ETL pipeline were validated by running queries provided by the analytics team and compared expected results. 10 | Skills include: 11 | * Building out an ETL pipeline using AWS SDK, Redshift, Python and PostgreSQL 12 | * Setting up IAM Roles, Redshift Clusters, Config files and security groups. 13 | * Developing seamless pipeline to connect to Redshift cluster and `COPY` data from S3 buckets to redshift staging tables 14 | * Creating a database with tables designed to optimize queries on song play analysis 15 | * Testing the database and ETL pipeline by running queries given to you by the analytics team from Sparkify and comparing results with their expected results. 16 | 17 | # Run The Scripts 18 | Python files in this repo include `create_tables.py`, `etl.py` and `sql_queries.py`. The process will start by running `python create_tables.py` in your terminal to drop any existing tables and create new tables with the correct column data types and conditions. `etl.py` will the execute the COPY and INSERT SQL queries that are outlined in `sql_queries.py`. 
This will copy the log-data and song-data from the `udacity-dend` S3 bucket, into corresponding staging tables and from there the data will be inserted into various tables that follow a star schema and are optimized for song query analysis (see below). 19 | 20 | There is also a `dwh.cfg` config file, which contains the AWS configuration details to connect to the redshift cluster. This has been excluded from the repo. 21 | 22 | # Available Data 23 | ### Song Dataset 24 | The first dataset is a subset of real data from the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong). Each file is in JSON format and contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. For example, here are filepaths to two files in this dataset. 25 | 26 | ``` 27 | song_data/A/B/C/TRABCEI128F424C983.json 28 | song_data/A/A/B/TRAABJL12903CDCF1A.json 29 | ``` 30 | And below is an example of what a single song file, TRAABJL12903CDCF1A.json, looks like. 31 | ``` 32 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} 33 | ``` 34 | ### Log Dataset 35 | The second dataset consists of log files in JSON format generated by this event simulator based on the songs in the dataset above. These simulate app activity logs from a music streaming app based on specified configurations. 36 | 37 | The log files in the dataset you'll be working with are partitioned by year and month. For example, here are filepaths to two files in this dataset. 38 | 39 | ``` 40 | log_data/2018/11/2018-11-12-events.json 41 | log_data/2018/11/2018-11-13-events.json 42 | ``` 43 | 44 | # Schema for Song Play Analysis 45 | Using the song and log datasets, you'll need to create a star schema optimized for queries on song play analysis. 
This includes the following tables. 46 | 47 | 48 | 49 | #### Fact Table 50 | 1. songplays - records in log data associated with song plays i.e. records with page `NextSong` 51 | * songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent 52 | 53 | #### Dimension Tables 54 | 2. users - users in the app 55 | * user_id, first_name, last_name, gender, level 56 | 3. songs - songs in music database 57 | * song_id, title, artist_id, year, duration 58 | 4. artists - artists in music database 59 | * artist_id, name, location, lattitude, longitude 60 | 5. time - timestamps of records in songplays broken down into specific units 61 | * start_time, hour, day, week, month, year, weekday 62 | -------------------------------------------------------------------------------- /data_pipelines_airflow/README.md: -------------------------------------------------------------------------------- 1 | # Project 5: Data Pipelines with Airflow 2 | 3 |

4 | 5 | ## Introduction 6 | As a data engineer, I was responsible for automating and monitoring the data warehouse ETL pipelines at Sparkify. Specifically, I delivered a dynamic and high grade data pipeline, which not only allowed for scheduled backfilling but also monitoring to ensure data quality. Data quality plays a big part in the analytics team at Sparkify, thus after the automated ETL processes, analyses were executed on top of the data warehouse with tests run to catch any discrepancies in the datasets. 7 | 8 | The source data resides in S3 and needed to be processed in Sparkify's data warehouse in Amazon Redshift. The source datasets consist of CSV logs that tell about user activity in the application and JSON metadata about the songs the users listen to. 9 | 10 | ### Achievements 11 | As their data engineer, I was responsible for automating the ETL pipelines through Airflow, extracting data from S3, loading data into staging tables and transforming the data into a star schema stored in Amazon Redshift. The data warehouse (automatically generated by the Airflow tasks) were then validated using custom analyses to detect any discrepancies in the databases. 12 | Skills include: 13 | * Using Airflow to automate ETL pipelines using Airflow, Python, Amazon Redshift. 14 | * Transforming data from various sources into a star schema optimized for the analytics team's use cases. 15 | * Writing custom operators to perform tasks such as staging data, filling the data warehouse, and validation through data quality checks. 16 | * Setting up IAM Roles, Redshift Clusters, Airflow Connections. 17 | 18 | # Run The Scripts 19 | The primary file in this repo is the `etl.py`, which generates the DAG with all necessary tasks to read in files from S3 buckets, load into staging tables and transform into a star schema which is stored in Redshift. 20 | 21 | ## DAGS 22 | 23 | 24 | ## Operators 25 | 1. 
Staging Operator: Using Airflow's PostgreSQL & S3 hooks, data is read and copied to staging tables in redshift. 26 | 2. Fact & Dimension Operators: Using Airflow's PostgreSQL hook and variable SQL statements, staging data is transformed into a star schema database and stored in appropriate tables in redshift. 27 | 3. Data Quality Operator: Using Airflow's PostgreSQL hook to access the newly transformed data, custom SQL commands are run against the tables to detect discrepancies within the newly formed data warehouse. 28 | 29 | 30 | # Available Data 31 | ### Song Dataset 32 | The first dataset is a subset of real data from the [Million Song Dataset](https://labrosa.ee.columbia.edu/millionsong). Each file is in JSON format and contains metadata about a song and the artist of that song. The files are partitioned by the first three letters of each song's track ID. For example, here are filepaths to two files in this dataset. 33 | 34 | ``` 35 | song_data/A/B/C/TRABCEI128F424C983.json 36 | song_data/A/A/B/TRAABJL12903CDCF1A.json 37 | ``` 38 | And below is an example of what a single song file, TRAABJL12903CDCF1A.json, looks like. 39 | ``` 40 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} 41 | ``` 42 | ### Log Dataset 43 | The second dataset consists of log files in JSON format generated by this event simulator based on the songs in the dataset above. These simulate app activity logs from a music streaming app based on specified configurations. 44 | 45 | The log files in the dataset you'll be working with are partitioned by year and month. For example, here are filepaths to two files in this dataset. 
def process_song_file(cur, filepath):
    """
    Process song_data and construct dimension tables: song & artist.

    Only the first record of the file is inserted. An empty file is skipped.

    PARAMETERS
    ------------
    cur : object
        postgres cursor
    filepath : string
        path pertaining to single song data.
    """
    # open song file
    df = pd.read_json(filepath, lines=True)

    # nothing to insert; asking for the first record would raise
    if df.empty:
        return

    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']
    artist_columns = ['artist_id', 'artist_name', 'artist_location',
                      'artist_latitude', 'artist_longitude']

    # insert song record (first row of the file, as a plain list)
    song_data = list(next(df[song_columns].itertuples(index=False,
                                                      name=None)))
    cur.execute(song_table_insert, song_data)

    # insert artist record (first row of the file, as a plain list)
    artist_data = list(next(df[artist_columns].itertuples(index=False,
                                                          name=None)))
    cur.execute(artist_table_insert, artist_data)


def process_log_file(cur, filepath):
    """
    Process log_data and construct dimension tables: time & user as well as
    fact table: songplay.

    Only events whose page is 'NextSong' are loaded. An empty file is
    skipped.

    PARAMETERS
    ------------
    cur : object
        postgres cursor
    filepath : string
        path pertaining to single log data.
    """
    # open log file
    df = pd.read_json(filepath, lines=True)

    # an empty file has no 'page' column to filter on
    if df.empty:
        return

    # filter by NextSong action; copy so that the 'ts' assignment below
    # modifies an independent frame instead of a slice of the original
    df = df[df['page'] == 'NextSong'].copy()

    # convert timestamp column (epoch milliseconds) to datetime
    df['ts'] = pd.to_datetime(df['ts'], unit='ms')

    # insert time data records: the timestamp followed by its hour, day,
    # week, month, year and weekday
    for ts in df['ts']:
        cur.execute(time_table_insert,
                    [ts, ts.hour, ts.day, ts.week,
                     ts.month, ts.year, ts.dayofweek])

    # insert user records; rows are handed to the driver as plain tuples
    # rather than pandas Series objects
    user_columns = ['userId', 'firstName', 'lastName', 'gender', 'level']
    for user_row in df[user_columns].itertuples(index=False, name=None):
        cur.execute(user_table_insert, user_row)

    # insert songplay records
    # NOTE(review): the row's index label within this file is passed as the
    # first parameter (the songplay id), and labels restart for every file -
    # check how the insert statement treats ids that repeat across files.
    for index, row in df.iterrows():

        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row['song'], row['artist'], row['length']))
        results = cur.fetchone()

        # no matching song: store the play without song/artist ids
        songid, artistid = results if results else (None, None)

        # insert songplay record
        songplay_data = (index, row['ts'], row['userId'], row['level'],
                         songid, artistid, row['sessionId'], row['location'],
                         row['userAgent'])
        cur.execute(songplay_table_insert, songplay_data)


def process_data(cur, conn, filepath, func):
    """
    Generic function to process both song_data and log_data.

    Every ``*.json`` file found under ``filepath`` (searched recursively) is
    passed to ``func`` and the transaction is committed after each file.
    Files are processed in sorted path order.

    PARAMETERS
    ------------
    cur : object
        postgres cursor
    conn : object
        connection to sparkify db
    filepath : string
        path pertaining to song or log data directory.
    func : function
        function to process either song or log data.
    """
    # get all files matching extension from directory
    all_files = []
    for root, _dirs, _files in os.walk(filepath):
        for match in glob.glob(os.path.join(root, '*.json')):
            all_files.append(os.path.abspath(match))

    # sorted so that runs are reproducible regardless of directory order
    all_files.sort()

    # get total number of files found
    num_files = len(all_files)
    print('{} files found in {}'.format(num_files, filepath))

    # iterate over files and process
    for i, datafile in enumerate(all_files, 1):
        func(cur, datafile)
        conn.commit()
        print('{}/{} files processed.'.format(i, num_files))


def main():
    """
    Connect to the sparkify database and load the song and log data.

    The cursor and the connection are closed even when processing fails.
    """
    conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb "
                            "user=student password=student")
    try:
        cur = conn.cursor()
        try:
            process_data(cur, conn, filepath='data/song_data',
                         func=process_song_file)
            process_data(cur, conn, filepath='data/log_data',
                         func=process_log_file)
        finally:
            cur.close()
    finally:
        conn.close()
3 | Using the song and log datasets, you'll need to create a star schema 4 | optimized for queries on song play analysis. 5 | """ 6 | 7 | # DROP TABLES 8 | 9 | songplay_table_drop = "DROP table IF EXISTS songplays" 10 | user_table_drop = "DROP table IF EXISTS users" 11 | song_table_drop = "DROP table IF EXISTS songs" 12 | artist_table_drop = "DROP table IF EXISTS artists" 13 | time_table_drop = "DROP table IF EXISTS time" 14 | 15 | # CREATE TABLES 16 | """ 17 | Fact Table 18 | table: songplays 19 | Records in log data associated with song plays 20 | i.e. records with page NextSong 21 | 22 | attributes: songplay_id, start_time, user_id, level, song_id, 23 | artist_id, session_id, location, user_agent 24 | """ 25 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplays 26 | (songplay_id SERIAL PRIMARY KEY, 27 | start_time timestamp, 28 | user_id int NOT NULL REFERENCES users(user_id), 29 | level varchar, 30 | song_id varchar REFERENCES songs(song_id), 31 | artist_id varchar REFERENCES artists(artist_id), 32 | session_id int, 33 | location varchar, 34 | user_agent varchar)""") 35 | 36 | """ 37 | Dimension Tables 38 | table: users 39 | users in the app 40 | attributes: user_id, first_name, last_name, gender, level 41 | 42 | table: songs 43 | songs in music database 44 | attributes: song_id, title, artist_id, year, duration 45 | 46 | table: artists 47 | artists in music database 48 | attributes: artist_id, name, location, lattitude, longitude 49 | 50 | table: time 51 | timestamps of records in songplays broken down into specific units 52 | attributes: start_time, hour, day, week, month, year, weekday 53 | """ 54 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users 55 | (user_id int PRIMARY KEY, 56 | first_name varchar, 57 | last_name varchar, 58 | gender varchar, 59 | level varchar)""") 60 | 61 | song_table_create = ("""CREATE TABLE IF NOT EXISTS songs 62 | (song_id varchar PRIMARY KEY, 63 | title varchar, 64 | artist_id varchar, 65 | year numeric, 66 | 
duration numeric)""") 67 | 68 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artists 69 | (artist_id varchar PRIMARY KEY, 70 | artist_name varchar, 71 | artist_location varchar, 72 | artist_latitude numeric, 73 | artist_longitude numeric)""") 74 | 75 | time_table_create = ("""CREATE TABLE IF NOT EXISTS time 76 | (start_time timestamp PRIMARY KEY, 77 | hour int, 78 | day int, 79 | week int, 80 | month int, 81 | year int, 82 | weekday int)""") 83 | 84 | # INSERT RECORDS 85 | 86 | songplay_table_insert = ("""INSERT INTO songplays (songplay_id, start_time, 87 | user_id, level, song_id, artist_id, session_id, 88 | location, user_agent) 89 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) 90 | ON CONFLICT (songplay_id) DO NOTHING; 91 | """) 92 | 93 | user_table_insert = ("""INSERT INTO users (user_id, first_name, 94 | last_name, gender, level) 95 | VALUES (%s, %s, %s, %s, %s) 96 | ON CONFLICT (user_id) 97 | DO UPDATE SET level=excluded.level; 98 | """) 99 | 100 | song_table_insert = ("""INSERT INTO songs (song_id, title, 101 | artist_id, year, duration) 102 | VALUES (%s, %s, %s, %s, %s) 103 | ON CONFLICT (song_id) DO NOTHING; 104 | """) 105 | 106 | artist_table_insert = ("""INSERT INTO artists (artist_id, artist_name, 107 | artist_location, 108 | artist_latitude, artist_longitude) 109 | VALUES (%s, %s, %s, %s, %s) 110 | ON CONFLICT (artist_id) DO NOTHING; 111 | """) 112 | 113 | 114 | time_table_insert = ("""INSERT INTO time (start_time, hour, day, 115 | week, month, year, weekday) 116 | VALUES (%s, %s, %s, %s, %s, %s, %s) 117 | ON CONFLICT (start_time) DO NOTHING; 118 | """) 119 | 120 | # FIND SONGS 121 | 122 | song_select = ("""SELECT songs.song_id, artists.artist_id 123 | FROM songs JOIN artists 124 | ON artists.artist_id = songs.artist_id 125 | WHERE songs.title = %s 126 | AND artists.artist_name = %s 127 | AND songs.duration = %s""") 128 | 129 | # QUERY LISTS 130 | 131 | create_table_queries = [user_table_create, 132 | song_table_create, artist_table_create, 133 
| time_table_create, songplay_table_create] 134 | drop_table_queries = [songplay_table_drop, user_table_drop, song_table_drop, 135 | artist_table_drop, time_table_drop] 136 | -------------------------------------------------------------------------------- /data_pipelines_airflow/dags/etl.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from datetime import datetime, timedelta 4 | 5 | from airflow import DAG 6 | from airflow.operators.dummy_operator import DummyOperator 7 | from airflow.operators.postgres_operator import PostgresOperator 8 | from airflow.operators import (StageToRedshiftOperator, LoadFactOperator, 9 | LoadDimensionOperator, DataQualityOperator) 10 | 11 | from helpers import SqlQueries 12 | 13 | # Global Variables 14 | TABLES = ["staging_events", "staging_songs", "users", "songs", 15 | "artists", "time", "songplays"] 16 | 17 | # Convert SQL commands from create_tables file into dictionary of commands 18 | sql_commands = {} 19 | filename = os.path.join(Path(__file__).parents[1], 'create_tables.sql') 20 | with open(filename, 'r') as sqlfile: 21 | commands = s = " ".join(sqlfile.readlines()) 22 | for idx, sql_stmt in enumerate(commands.split(';')[:-1]): 23 | table = sql_stmt.split('.')[-1].split(' ')[0].strip('"').strip('\n') 24 | sql_commands[table] = sql_stmt 25 | 26 | 27 | default_args = { 28 | 'owner': 'danieldiamond', 29 | 'depends_on_past': False, 30 | 'catchup': False, 31 | 'start_date': datetime(2019, 1, 12), 32 | 'retries': 3, 33 | 'retry_delay': timedelta(minutes=5) 34 | } 35 | 36 | dag = DAG('etl_dag', 37 | default_args=default_args, 38 | description='Load and transform data in Redshift with Airflow', 39 | schedule_interval='0 * * * *' 40 | ) 41 | 42 | start_operator = DummyOperator(task_id='start_etl', dag=dag) 43 | load_staging_tables = DummyOperator(task_id='load_staging_tables', dag=dag) 44 | etl_success = DummyOperator(task_id='etl_success', dag=dag) 45 | 
46 | # Drop & Create Tables 47 | for table in TABLES: 48 | # Drop Table 49 | drop_table_task = PostgresOperator( 50 | task_id=f"drop_{table}", 51 | postgres_conn_id="redshift", 52 | sql=f"DROP table IF EXISTS {table}", 53 | dag=dag 54 | ) 55 | 56 | # Create Table 57 | create_table_task = PostgresOperator( 58 | task_id=f"create_{table}", 59 | postgres_conn_id="redshift", 60 | sql=sql_commands[table], 61 | dag=dag 62 | ) 63 | 64 | start_operator >> drop_table_task 65 | drop_table_task >> create_table_task 66 | create_table_task >> load_staging_tables 67 | 68 | # Stage Events Data 69 | stage_events_to_redshift = StageToRedshiftOperator( 70 | task_id='stage_events', 71 | dag=dag, 72 | redshift_conn_id="redshift", 73 | aws_credentials_id="aws_credentials", 74 | table="staging_events", 75 | s3_bucket="dend", 76 | s3_key="log_data", 77 | file_format="CSV" 78 | ) 79 | 80 | # Stage Song Data 81 | stage_songs_to_redshift = StageToRedshiftOperator( 82 | task_id='stage_songs', 83 | dag=dag, 84 | redshift_conn_id="redshift", 85 | aws_credentials_id="aws_credentials", 86 | table="staging_songs", 87 | s3_bucket="dend", 88 | s3_key="song_data", 89 | file_format="JSON" 90 | ) 91 | 92 | # Insert Fact Tables 93 | load_songplays_table = LoadFactOperator( 94 | task_id=f'load_songplays', 95 | redshift_conn_id="redshift", 96 | table='songplays', 97 | sql_stmt=SqlQueries.songplay_table_insert, 98 | dag=dag 99 | ) 100 | 101 | # Insert DIM Tables 102 | load_users_table = LoadDimensionOperator( 103 | task_id=f'load_users', 104 | redshift_conn_id="redshift", 105 | table='users', 106 | truncate=True, 107 | sql_stmt=SqlQueries.user_table_insert, 108 | dag=dag 109 | ) 110 | 111 | load_songs_table = LoadDimensionOperator( 112 | task_id=f'load_songs', 113 | redshift_conn_id="redshift", 114 | table='songs', 115 | truncate=True, 116 | sql_stmt=SqlQueries.song_table_insert, 117 | dag=dag 118 | ) 119 | 120 | load_artists_table = LoadDimensionOperator( 121 | task_id=f'load_artists', 122 | 
redshift_conn_id="redshift", 123 | table='artists', 124 | truncate=True, 125 | sql_stmt=SqlQueries.artist_table_insert, 126 | dag=dag 127 | ) 128 | 129 | load_time_table = LoadDimensionOperator( 130 | task_id=f'load_time', 131 | redshift_conn_id="redshift", 132 | table='time', 133 | truncate=True, 134 | sql_stmt=SqlQueries.time_table_insert, 135 | dag=dag 136 | ) 137 | 138 | songplays_data_quality = DataQualityOperator( 139 | task_id=f"data_quality_check_on_songplays", 140 | redshift_conn_id="redshift", 141 | table='songplays', 142 | dag=dag 143 | ) 144 | 145 | artists_data_quality = DataQualityOperator( 146 | task_id=f"data_quality_check_on_artists", 147 | redshift_conn_id="redshift", 148 | table='artists', 149 | test_stmt=SqlQueries.artist_table_data_quality, 150 | result=(1,), 151 | dag=dag 152 | ) 153 | 154 | users_data_quality = DataQualityOperator( 155 | task_id=f"data_quality_check_on_users", 156 | redshift_conn_id="redshift", 157 | table='users', 158 | dag=dag 159 | ) 160 | 161 | songs_data_quality = DataQualityOperator( 162 | task_id=f"data_quality_check_on_songs", 163 | redshift_conn_id="redshift", 164 | table='songs', 165 | dag=dag 166 | ) 167 | 168 | time_data_quality = DataQualityOperator( 169 | task_id=f"data_quality_check_on_time", 170 | redshift_conn_id="redshift", 171 | table='time', 172 | dag=dag 173 | ) 174 | 175 | load_staging_tables >> [stage_events_to_redshift, stage_songs_to_redshift] 176 | [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table 177 | stage_songs_to_redshift >> [load_songs_table, load_artists_table] 178 | stage_events_to_redshift >> [load_users_table, load_time_table] 179 | load_songplays_table >> songplays_data_quality 180 | load_users_table >> users_data_quality 181 | load_artists_table >> artists_data_quality 182 | load_songs_table >> songs_data_quality 183 | load_time_table >> time_data_quality 184 | 185 | [songplays_data_quality, 186 | users_data_quality, 187 | artists_data_quality, 188 | 
songs_data_quality, 189 | time_data_quality] >> etl_success 190 | -------------------------------------------------------------------------------- /relational_db_modeling_postgresql/data/log_data/2018/11/2018-11-01-events.json: -------------------------------------------------------------------------------- 1 | {"artist":null,"auth":"Logged In","firstName":"Walter","gender":"M","itemInSession":0,"lastName":"Frye","length":null,"level":"free","location":"San Francisco-Oakland-Hayward, CA","method":"GET","page":"Home","registration":1540919166796.0,"sessionId":38,"song":null,"status":200,"ts":1541105830796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"39"} 2 | {"artist":null,"auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":0,"lastName":"Summers","length":null,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"GET","page":"Home","registration":1540344794796.0,"sessionId":139,"song":null,"status":200,"ts":1541106106796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 3 | {"artist":"Des'ree","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":1,"lastName":"Summers","length":246.30812,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"You Gotta Be","status":200,"ts":1541106106796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 4 | {"artist":null,"auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":2,"lastName":"Summers","length":null,"level":"free","location":"Phoenix-Mesa-Scottsdale, 
AZ","method":"GET","page":"Upgrade","registration":1540344794796.0,"sessionId":139,"song":null,"status":200,"ts":1541106132796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 5 | {"artist":"Mr Oizo","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":3,"lastName":"Summers","length":144.03873,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Flat 55","status":200,"ts":1541106352796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 6 | {"artist":"Tamba Trio","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":4,"lastName":"Summers","length":177.18812,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Quem Quiser Encontrar O Amor","status":200,"ts":1541106496796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 7 | {"artist":"The Mars Volta","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":5,"lastName":"Summers","length":380.42077,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Eriatarka","status":200,"ts":1541106673796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 8 | {"artist":"Infected Mushroom","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":6,"lastName":"Summers","length":440.2673,"level":"free","location":"Phoenix-Mesa-Scottsdale, 
AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Becoming Insane","status":200,"ts":1541107053796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 9 | {"artist":"Blue October \/ Imogen Heap","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":7,"lastName":"Summers","length":241.3971,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Congratulations","status":200,"ts":1541107493796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 10 | {"artist":"Girl Talk","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":8,"lastName":"Summers","length":160.15628,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"PUT","page":"NextSong","registration":1540344794796.0,"sessionId":139,"song":"Once again","status":200,"ts":1541107734796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"} 11 | {"artist":"Black Eyed Peas","auth":"Logged In","firstName":"Sylvie","gender":"F","itemInSession":0,"lastName":"Cruz","length":214.93506,"level":"free","location":"Washington-Arlington-Alexandria, DC-VA-MD-WV","method":"PUT","page":"NextSong","registration":1540266185796.0,"sessionId":9,"song":"Pump It","status":200,"ts":1541108520796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.77.4 (KHTML, like Gecko) Version\/7.0.5 Safari\/537.77.4\"","userId":"10"} 12 | {"artist":null,"auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":0,"lastName":"Smith","length":null,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, 
CA","method":"GET","page":"Home","registration":1541016707796.0,"sessionId":169,"song":null,"status":200,"ts":1541109015796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 13 | {"artist":"Fall Out Boy","auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":1,"lastName":"Smith","length":200.72444,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"PUT","page":"NextSong","registration":1541016707796.0,"sessionId":169,"song":"Nobody Puts Baby In The Corner","status":200,"ts":1541109125796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 14 | {"artist":"M.I.A.","auth":"Logged In","firstName":"Ryan","gender":"M","itemInSession":2,"lastName":"Smith","length":233.7171,"level":"free","location":"San Jose-Sunnyvale-Santa Clara, CA","method":"PUT","page":"NextSong","registration":1541016707796.0,"sessionId":169,"song":"Mango Pickle Down River (With The Wilcannia Mob)","status":200,"ts":1541109325796,"userAgent":"\"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) Ubuntu Chromium\/36.0.1985.125 Chrome\/36.0.1985.125 Safari\/537.36\"","userId":"26"} 15 | {"artist":"Survivor","auth":"Logged In","firstName":"Jayden","gender":"M","itemInSession":0,"lastName":"Fox","length":245.36771,"level":"free","location":"New Orleans-Metairie, LA","method":"PUT","page":"NextSong","registration":1541033612796.0,"sessionId":100,"song":"Eye Of The Tiger","status":200,"ts":1541110994796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.3; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"101"} -------------------------------------------------------------------------------- /data_lake_spark/etl.py: 
-------------------------------------------------------------------------------- 1 | import configparser 2 | import os 3 | 4 | from pyspark.sql import SparkSession 5 | import pyspark.sql.functions as F 6 | 7 | config = configparser.ConfigParser() 8 | config.read_file(open('/Users/danieldiamond/.aws/credentials')) 9 | 10 | os.environ["AWS_ACCESS_KEY_ID"] = config.get('sparkifyuser', 11 | 'AWS_ACCESS_KEY_ID') 12 | os.environ["AWS_SECRET_ACCESS_KEY"] = config.get('sparkifyuser', 13 | 'AWS_SECRET_ACCESS_KEY') 14 | 15 | 16 | def create_spark_session(): 17 | spark = SparkSession \ 18 | .builder \ 19 | .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \ 20 | .getOrCreate() 21 | return spark 22 | 23 | 24 | def process_song_data(spark, input_data, output_data): 25 | """ 26 | Description: This function can be used to read the song data in the 27 | filepath (bucket/song_data) to get the song and artist info and 28 | used to populate the songs and artists dim tables. 29 | 30 | Parameters: 31 | spark: the cursor object. 32 | input_path: the path to the bucket containing song data. 33 | output_path: the path to destination bucket where the parquet files 34 | will be stored. 
35 | 36 | Returns: 37 | None 38 | """ 39 | # get filepath to song data file 40 | song_data = f'{input_data}/song_data/*/*/*/*.json' 41 | 42 | # read song data file 43 | df = spark.read.json(song_data) 44 | print('Success: Read song_data from S3') 45 | 46 | # extract columns to create songs table 47 | songs_table = df.select('song_id', 'title', 'artist_id', 48 | 'year', 'duration').dropDuplicates() 49 | 50 | # write songs table to parquet files partitioned by year and artist 51 | songs_table.write.parquet(f'{output_data}/songs_table', 52 | mode='overwrite', 53 | partitionBy=['year', 'artist_id']) 54 | print('Success: Wrote songs_table to parquet') 55 | 56 | # extract columns to create artists table 57 | artists_table = df.select('artist_id', 'artist_name', 58 | 'artist_location', 'artist_latitude', 59 | 'artist_longitude').dropDuplicates() 60 | 61 | # write artists table to parquet files 62 | artists_table.write.parquet(f'{output_data}/artists_table', 63 | mode='overwrite') 64 | print('Success: Wrote artists_table to parquet') 65 | 66 | 67 | def process_log_data(spark, input_data, output_data): 68 | """ 69 | Description: This function can be used to read the log data in the 70 | filepath (bucket/log_data) to get the info to populate the 71 | user, time and song dim tables as well as the songplays fact table. 72 | 73 | Parameters: 74 | spark: the cursor object. 75 | input_path: the path to the bucket containing song data. 76 | output_path: the path to destination bucket where the parquet files 77 | will be stored. 
78 | 79 | Returns: 80 | None 81 | """ 82 | # get filepath to log data file 83 | log_data = f'{input_data}/log_data/*/*/*.json' 84 | 85 | # read log data file 86 | df = spark.read.json(log_data) 87 | print('Success: Read log_data from S3') 88 | 89 | # filter by actions for song plays 90 | df = df.filter(df['page'] == 'NextSong') 91 | 92 | # extract columns for users table 93 | user_table = df.select('userId', 'firstName', 'lastName', 94 | 'gender', 'level').dropDuplicates() 95 | 96 | # write users table to parquet files 97 | user_table.write.parquet(f'{output_data}/user_table', mode='overwrite') 98 | print('Success: Wrote user_table to parquet') 99 | 100 | # create timestamp column from original timestamp column 101 | df = df.withColumn('start_time', F.from_unixtime(F.col('ts')/1000)) 102 | print('Success: Convert ts to timestamp') 103 | 104 | # create datetime column from original timestamp column 105 | time_table = df.select('ts', 'start_time') \ 106 | .withColumn('year', F.year('start_time')) \ 107 | .withColumn('month', F.month('start_time')) \ 108 | .withColumn('week', F.weekofyear('start_time')) \ 109 | .withColumn('weekday', F.dayofweek('start_time')) \ 110 | .withColumn('day', F.dayofyear('start_time')) \ 111 | .withColumn('hour', F.hour('start_time')).dropDuplicates() 112 | print('Success: Extract DateTime Columns') 113 | 114 | # write time table to parquet files partitioned by year and month 115 | time_table.write.parquet(f'{output_data}/time_table', 116 | mode='overwrite', 117 | partitionBy=['year', 'month']) 118 | print('Success: Wrote time_table to parquet') 119 | 120 | # read in song data to use for songplays table 121 | song_data = f'{input_data}/song_data/A/A/A/*.json' 122 | song_dataset = spark.read.json(song_data) 123 | print('Success: Read song_dataset from S3') 124 | 125 | # join & extract cols from song and log datasets to create songplays table 126 | song_dataset.createOrReplaceTempView('song_dataset') 127 | 
time_table.createOrReplaceTempView('time_table') 128 | df.createOrReplaceTempView('log_dataset') 129 | 130 | songplays_table = spark.sql("""SELECT DISTINCT 131 | l.ts as ts, 132 | t.year as year, 133 | t.month as month, 134 | l.userId as user_id, 135 | l.level as level, 136 | s.song_id as song_id, 137 | s.artist_id as artist_id, 138 | l.sessionId as session_id, 139 | s.artist_location as artist_location, 140 | l.userAgent as user_agent 141 | FROM song_dataset s 142 | JOIN log_dataset l 143 | ON s.artist_name = l.artist 144 | AND s.title = l.song 145 | AND s.duration = l.length 146 | JOIN time_table t 147 | ON t.ts = l.ts 148 | """).dropDuplicates() 149 | print('Success: SQL Query') 150 | 151 | # write songplays table to parquet files partitioned by year and month 152 | songplays_table.write.parquet(f'{output_data}/songplays_table', 153 | mode='overwrite', 154 | partitionBy=['year', 'month']) 155 | print('Success: Wrote songplays_table to parquet') 156 | 157 | 158 | def main(): 159 | spark = create_spark_session() 160 | input_data = "s3a://udacity-dend" 161 | output_data = "s3a://sparkify-bucket" 162 | 163 | process_song_data(spark, input_data, output_data) 164 | process_log_data(spark, input_data, output_data) 165 | 166 | 167 | if __name__ == "__main__": 168 | main() 169 | -------------------------------------------------------------------------------- /data_warehouse_redshift/sql_queries.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | 3 | # CONFIG 4 | config = configparser.ConfigParser() 5 | config.read('dwh.cfg') 6 | 7 | # DROP TABLES 8 | 9 | staging_events_table_drop = "DROP table IF EXISTS staging_events" 10 | staging_songs_table_drop = "DROP table IF EXISTS staging_songs" 11 | songplay_table_drop = "DROP table IF EXISTS songplays" 12 | user_table_drop = "DROP table IF EXISTS users" 13 | song_table_drop = "DROP table IF EXISTS songs" 14 | artist_table_drop = "DROP table IF EXISTS artists" 15 | 
time_table_drop = "DROP table IF EXISTS time" 16 | 17 | # CREATE TABLES 18 | 19 | staging_events_table_create = ("""CREATE TABLE IF NOT EXISTS staging_events 20 | (artist varchar(100), 21 | auth varchar(10), 22 | firstName varchar(32), 23 | gender varchar(2), 24 | itemInSession int, 25 | lastName varchar(32), 26 | length numeric, 27 | level varchar(16), 28 | location varchar(64), 29 | method varchar(3), 30 | page varchar(16), 31 | registration DOUBLE PRECISION, 32 | sessionId int, 33 | song varchar, 34 | status int, 35 | ts timestamp, 36 | userAgent varchar(140), 37 | userId int);""") 38 | 39 | staging_songs_table_create = ("""CREATE TABLE IF NOT EXISTS staging_songs 40 | (song_id varchar(18), 41 | num_songs int, 42 | title varchar(256), 43 | artist_name varchar(256), 44 | artist_latitude numeric, 45 | year int, 46 | duration numeric, 47 | artist_id varchar, 48 | artist_longitude numeric, 49 | artist_location varchar(100));""") 50 | 51 | 52 | songplay_table_create = ("""CREATE TABLE IF NOT EXISTS songplays 53 | (songplay_id int IDENTITY(0,1) PRIMARY KEY, 54 | start_time timestamp, 55 | user_id int NOT NULL REFERENCES users(user_id), 56 | level varchar, 57 | song_id varchar REFERENCES songs(song_id), 58 | artist_id varchar REFERENCES artists(artist_id), 59 | session_id int, 60 | location varchar, 61 | user_agent varchar)""") 62 | 63 | user_table_create = ("""CREATE TABLE IF NOT EXISTS users 64 | (user_id int PRIMARY KEY, 65 | first_name varchar, 66 | last_name varchar, 67 | gender varchar, 68 | level varchar)""") 69 | 70 | song_table_create = ("""CREATE TABLE IF NOT EXISTS songs 71 | (song_id varchar PRIMARY KEY, 72 | title varchar, 73 | artist_id varchar, 74 | year numeric, 75 | duration numeric)""") 76 | 77 | artist_table_create = ("""CREATE TABLE IF NOT EXISTS artists 78 | (artist_id varchar PRIMARY KEY, 79 | artist_name varchar, 80 | artist_location varchar, 81 | artist_latitude numeric, 82 | artist_longitude numeric)""") 83 | 84 | time_table_create = ("""CREATE 
TABLE IF NOT EXISTS time 85 | (start_time timestamp PRIMARY KEY, 86 | hour int, 87 | day int, 88 | week int, 89 | month int, 90 | year int, 91 | weekday int)""") 92 | 93 | # STAGING TABLES 94 | staging_events_copy = ("""copy staging_events from {} 95 | credentials 'aws_iam_role={}' 96 | compupdate off region 'us-west-2' FORMAT AS JSON {} 97 | TIMEFORMAT as 'epochmillisecs' 98 | TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL; 99 | """).format(config['S3'].get('LOG_DATA'), 100 | config['IAM_ROLE'].get('ARN').strip("'"), 101 | config['S3'].get('LOG_JSONPATH')) 102 | 103 | staging_songs_copy = ("""copy staging_songs 104 | from {} 105 | credentials 'aws_iam_role={}' 106 | compupdate off region 'us-west-2' 107 | FORMAT AS JSON 'auto' 108 | TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL; 109 | """).format(config['S3'].get('SONG_DATA'), 110 | config['IAM_ROLE'].get('ARN').strip("'")) 111 | 112 | songplay_table_insert = ("""INSERT INTO songplays (start_time, 113 | user_id, level, song_id, artist_id, session_id, 114 | location, user_agent) 115 | SELECT DISTINCT to_timestamp(to_char(e.ts, '9999-99-99 99:99:99'),'YYYY-MM-DD HH24:MI:SS'), 116 | e.userid, e.level, s.song_id, 117 | s.artist_id, e.sessionid, s.artist_location, 118 | e.useragent 119 | FROM staging_songs s 120 | JOIN staging_events e 121 | ON s.title = e.song 122 | AND s.artist_name = e.artist 123 | AND s.duration = e.length;""") 124 | 125 | user_table_insert = ("""INSERT INTO users (user_id, first_name, 126 | last_name, gender, level) 127 | SELECT DISTINCT userid, firstname, 128 | lastname, gender, level 129 | FROM staging_events 130 | WHERE userid IS NOT NULL; 131 | """) 132 | 133 | song_table_insert = ("""INSERT INTO songs (song_id, title, 134 | artist_id, year, duration) 135 | SELECT DISTINCT song_id, title, artist_id, year, duration 136 | FROM staging_songs 137 | WHERE song_id IS NOT NULL; 138 | """) 139 | 140 | artist_table_insert = ("""INSERT INTO artists (artist_id, artist_name, 141 | artist_location, 142 | 
artist_latitude, artist_longitude) 143 | SELECT DISTINCT artist_id, artist_name, artist_location, 144 | artist_latitude, artist_longitude 145 | FROM staging_songs 146 | WHERE artist_id IS NOT NULL; 147 | """) 148 | 149 | 150 | time_table_insert = ("""INSERT INTO time (start_time, hour, day, 151 | week, month, year, weekday) 152 | SELECT DISTINCT ts, extract(hour from ts), extract(day from ts), 153 | extract(week from ts), extract(month from ts), 154 | extract(year from ts), extract(weekday from ts) 155 | FROM staging_events 156 | WHERE ts IS NOT NULL; 157 | """) 158 | 159 | # QUERY LISTS 160 | 161 | create_table_queries = [staging_events_table_create, 162 | staging_songs_table_create, 163 | user_table_create, 164 | song_table_create, 165 | artist_table_create, 166 | time_table_create, 167 | songplay_table_create] 168 | drop_table_queries = [staging_events_table_drop, 169 | staging_songs_table_drop, 170 | songplay_table_drop, 171 | user_table_drop, 172 | song_table_drop, 173 | artist_table_drop, 174 | time_table_drop] 175 | copy_table_queries = [staging_events_copy, staging_songs_copy] 176 | insert_table_queries = [songplay_table_insert, 177 | user_table_insert, 178 | song_table_insert, 179 | artist_table_insert, 180 | time_table_insert] 181 | -------------------------------------------------------------------------------- /nosql_db_modeling_apache_cassandra/event_data/2018-11-25-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | matchbox twenty,Logged In,Jayden,F,0,Duffy,177.65832,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,846,Argue (LP Version),200,1.54311E+12,76 3 | The Lonely Island / T-Pain,Logged In,Jayden,F,1,Duffy,156.23791,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,846,I'm On A Boat,200,1.54311E+12,76 4 | ,Logged 
In,Jayden,F,2,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",GET,Home,1.54015E+12,846,,200,1.54311E+12,76 5 | ,Logged In,Jayden,F,3,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",GET,Settings,1.54015E+12,846,,200,1.54311E+12,76 6 | ,Logged In,Jayden,F,4,Duffy,,free,"Seattle-Tacoma-Bellevue, WA",PUT,Save Settings,1.54015E+12,846,,307,1.54311E+12,76 7 | John Mayer,Logged In,Wyatt,M,0,Scott,275.27791,free,"Eureka-Arcata-Fortuna, CA",PUT,NextSong,1.54087E+12,856,All We Ever Do Is Say Goodbye,200,1.54311E+12,9 8 | ,Logged In,Wyatt,M,1,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1.54087E+12,856,,200,1.54311E+12,9 9 | 10_000 Maniacs,Logged In,Wyatt,M,2,Scott,251.8722,free,"Eureka-Arcata-Fortuna, CA",PUT,NextSong,1.54087E+12,856,Gun Shy (LP Version),200,1.54311E+12,9 10 | Leona Lewis,Logged In,Chloe,F,0,Cuevas,203.88526,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Forgive Me,200,1.54312E+12,49 11 | Nine Inch Nails,Logged In,Chloe,F,1,Cuevas,277.83791,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,La Mer,200,1.54312E+12,49 12 | Audioslave,Logged In,Chloe,F,2,Cuevas,334.91546,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Am The Highway,200,1.54312E+12,49 13 | Kid Rock,Logged In,Chloe,F,3,Cuevas,296.95955,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,All Summer Long (Album Version),200,1.54312E+12,49 14 | The Jets,Logged In,Chloe,F,4,Cuevas,220.89098,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Do You,200,1.54312E+12,49 15 | The Gerbils,Logged In,Chloe,F,5,Cuevas,27.01016,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,(iii),200,1.54312E+12,49 16 | Damian Marley / Stephen Marley / Yami Bolo,Logged In,Chloe,F,6,Cuevas,304.69179,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Still Searching,200,1.54312E+12,49 17 | ,Logged In,Chloe,F,7,Cuevas,,paid,"San Francisco-Oakland-Hayward, 
CA",GET,Home,1.54094E+12,916,,200,1.54312E+12,49 18 | The Bloody Beetroots,Logged In,Chloe,F,8,Cuevas,201.97832,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Warp 1.9 (feat. Steve Aoki),200,1.54312E+12,49 19 | ,Logged In,Chloe,F,9,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54313E+12,49 20 | The Specials,Logged In,Chloe,F,10,Cuevas,188.81261,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Rat Race,200,1.54313E+12,49 21 | The Lively Ones,Logged In,Chloe,F,11,Cuevas,142.52363,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Walkin' The Board (LP Version),200,1.54313E+12,49 22 | Katie Melua,Logged In,Chloe,F,12,Cuevas,252.78649,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Blues In The Night,200,1.54313E+12,49 23 | Jason Mraz,Logged In,Chloe,F,13,Cuevas,243.48689,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I'm Yours (Album Version),200,1.54313E+12,49 24 | Fisher,Logged In,Chloe,F,14,Cuevas,133.98159,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Rianna,200,1.54313E+12,49 25 | Zee Avi,Logged In,Chloe,F,15,Cuevas,160.62649,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,No Christmas For Me,200,1.54313E+12,49 26 | Black Eyed Peas,Logged In,Chloe,F,16,Cuevas,289.12281,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,I Gotta Feeling,200,1.54313E+12,49 27 | Emiliana Torrini,Logged In,Chloe,F,17,Cuevas,184.29342,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,Sunny Road,200,1.54313E+12,49 28 | ,Logged In,Chloe,F,18,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,916,,200,1.54313E+12,49 29 | Days Of The New,Logged In,Chloe,F,19,Cuevas,258.5073,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,The Down Town,200,1.54313E+12,49 30 | Julio Iglesias duet with Willie Nelson,Logged 
In,Chloe,F,20,Cuevas,212.16608,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,916,To All The Girls I've Loved Before (With Julio Iglesias),200,1.54313E+12,49 31 | ,Logged In,Jacqueline,F,0,Lynch,,paid,"Atlanta-Sandy Springs-Roswell, GA",GET,Home,1.54022E+12,914,,200,1.54313E+12,29 32 | Jason Mraz & Colbie Caillat,Logged In,Chloe,F,0,Roth,189.6224,free,"Indianapolis-Carmel-Anderson, IN",PUT,NextSong,1.5407E+12,704,Lucky (Album Version),200,1.54314E+12,78 33 | ,Logged In,Anabelle,F,0,Simpson,,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",GET,Home,1.54104E+12,901,,200,1.54315E+12,69 34 | R. Kelly,Logged In,Anabelle,F,1,Simpson,234.39628,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,901,The World's Greatest,200,1.54315E+12,69 35 | ,Logged In,Kynnedi,F,0,Sanchez,,free,"Cedar Rapids, IA",GET,Home,1.54108E+12,804,,200,1.54315E+12,89 36 | Jacky Terrasson,Logged In,Marina,F,0,Sutton,342.7522,free,"Salinas, CA",PUT,NextSong,1.54106E+12,373,Le Jardin d'Hiver,200,1.54315E+12,48 37 | Papa Roach,Logged In,Theodore,M,0,Harris,202.1873,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,813,Alive,200,1.54316E+12,14 38 | Burt Bacharach,Logged In,Theodore,M,1,Harris,156.96934,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,813,Casino Royale Theme (Main Title),200,1.54316E+12,14 39 | ,Logged In,Chloe,F,0,Cuevas,,paid,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,923,,200,1.54316E+12,49 40 | Floetry,Logged In,Chloe,F,1,Cuevas,254.48444,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Sunshine,200,1.54316E+12,49 41 | The Rakes,Logged In,Chloe,F,2,Cuevas,225.2273,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Leave The City And Come Home,200,1.54316E+12,49 42 | Dwight Yoakam,Logged In,Chloe,F,3,Cuevas,239.3073,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,You're The One,200,1.54316E+12,49 43 | Ween,Logged In,Chloe,F,4,Cuevas,228.10077,paid,"San 
Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,Voodoo Lady,200,1.54316E+12,49 44 | Café Quijano,Logged In,Chloe,F,5,Cuevas,197.32853,paid,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,923,La Lola,200,1.54316E+12,49 45 | ,Logged In,Chloe,F,0,Roth,,free,"Indianapolis-Carmel-Anderson, IN",GET,Home,1.5407E+12,925,,200,1.54317E+12,78 46 | Parov Stelar,Logged In,Chloe,F,1,Roth,203.65016,free,"Indianapolis-Carmel-Anderson, IN",PUT,NextSong,1.5407E+12,925,Good Bye Emily (feat. Gabriella Hanninen),200,1.54317E+12,78 47 | ,Logged In,Chloe,F,2,Roth,,free,"Indianapolis-Carmel-Anderson, IN",GET,Home,1.5407E+12,925,,200,1.54317E+12,78 48 | ,Logged In,Tegan,F,0,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,915,,200,1.54317E+12,80 49 | Bryan Adams,Logged In,Tegan,F,1,Levine,166.29506,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,I Will Always Return,200,1.54317E+12,80 50 | KT Tunstall,Logged In,Tegan,F,2,Levine,192.31302,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,White Bird,200,1.54317E+12,80 51 | Technicolour,Logged In,Tegan,F,3,Levine,235.12771,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Turn Away,200,1.54317E+12,80 52 | The Dears,Logged In,Tegan,F,4,Levine,289.95873,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Lost In The Plot,200,1.54317E+12,80 53 | Go West,Logged In,Tegan,F,5,Levine,259.49995,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,915,Never Let Them See You Sweat,200,1.54317E+12,80 54 | ,Logged In,Tegan,F,6,Levine,,paid,"Portland-South Portland, ME",PUT,Logout,1.54079E+12,915,,307,1.54317E+12,80 55 | ,Logged In,Sylvie,F,0,Cruz,,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",GET,Home,1.54027E+12,912,,200,1.54317E+12,10 56 | ,Logged Out,,,7,,,paid,,GET,Home,,915,,200,1.54317E+12, 57 | Gondwana,Logged In,Jordan,F,0,Hicks,262.5824,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,Mi Princesa,200,1.54319E+12,37 58 | ,Logged 
In,Kevin,M,0,Arellano,,free,"Harrisburg-Carlisle, PA",GET,Home,1.54001E+12,855,,200,1.54319E+12,66 59 | Ella Fitzgerald,Logged In,Jordan,F,1,Hicks,427.15383,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,On Green Dolphin Street (Medley) (1999 Digital Remaster),200,1.54319E+12,37 60 | Creedence Clearwater Revival,Logged In,Jordan,F,2,Hicks,184.73751,free,"Salinas, CA",PUT,NextSong,1.54001E+12,814,Run Through The Jungle,200,1.54319E+12,37 61 | -------------------------------------------------------------------------------- /data_lake_spark/test_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession, functions as F\n", 10 | "import os\n", 11 | "import configparser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "config = configparser.ConfigParser()\n", 21 | "\n", 22 | "config.read_file(open('/Users/danieldiamond/.aws/credentials'))\n", 23 | "os.environ[\"AWS_ACCESS_KEY_ID\"] = config.get('sparkifyuser',\n", 24 | " 'AWS_ACCESS_KEY_ID')\n", 25 | "os.environ[\"AWS_SECRET_ACCESS_KEY\"] = config.get('sparkifyuser',\n", 26 | " 'AWS_SECRET_ACCESS_KEY')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "input_data = \"s3a://udacity-dend\"\n", 36 | "output_data = \"s3a://sparkify-bucket\"" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "spark = SparkSession.builder\\\n", 46 | " .config(\"spark.jars.packages\",\n", 47 | " \"org.apache.hadoop:hadoop-aws:2.7.0\")\\\n", 48 | " .getOrCreate()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": 
[ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "root\n", 61 | " |-- artist_id: string (nullable = true)\n", 62 | " |-- artist_latitude: double (nullable = true)\n", 63 | " |-- artist_location: string (nullable = true)\n", 64 | " |-- artist_longitude: double (nullable = true)\n", 65 | " |-- artist_name: string (nullable = true)\n", 66 | " |-- duration: double (nullable = true)\n", 67 | " |-- num_songs: long (nullable = true)\n", 68 | " |-- song_id: string (nullable = true)\n", 69 | " |-- title: string (nullable = true)\n", 70 | " |-- year: long (nullable = true)\n", 71 | "\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "song_path = './data/song_data/*/*/*/*.json'\n", 77 | "song_data = spark.read.json(song_path)\n", 78 | "song_data.printSchema()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "root\n", 91 | " |-- artist: string (nullable = true)\n", 92 | " |-- auth: string (nullable = true)\n", 93 | " |-- firstName: string (nullable = true)\n", 94 | " |-- gender: string (nullable = true)\n", 95 | " |-- itemInSession: long (nullable = true)\n", 96 | " |-- lastName: string (nullable = true)\n", 97 | " |-- length: double (nullable = true)\n", 98 | " |-- level: string (nullable = true)\n", 99 | " |-- location: string (nullable = true)\n", 100 | " |-- method: string (nullable = true)\n", 101 | " |-- page: string (nullable = true)\n", 102 | " |-- registration: double (nullable = true)\n", 103 | " |-- sessionId: long (nullable = true)\n", 104 | " |-- song: string (nullable = true)\n", 105 | " |-- status: long (nullable = true)\n", 106 | " |-- ts: long (nullable = true)\n", 107 | " |-- userAgent: string (nullable = true)\n", 108 | " |-- userId: string (nullable = true)\n", 109 | "\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "log_path = './data/log-data/*.json'\n", 115 | "log_data = 
spark.read.json(log_path)\n", 116 | "log_data.printSchema()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "root\n", 129 | " |-- ts: long (nullable = true)\n", 130 | " |-- start_time: string (nullable = true)\n", 131 | " |-- year: integer (nullable = true)\n", 132 | " |-- month: integer (nullable = true)\n", 133 | " |-- week: integer (nullable = true)\n", 134 | " |-- weekday: integer (nullable = true)\n", 135 | " |-- day: integer (nullable = true)\n", 136 | " |-- hour: integer (nullable = true)\n", 137 | "\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "time_data = log_data.withColumn('start_time', \n", 143 | " F.from_unixtime(F.col('ts')/1000))\n", 144 | "time_data = time_data.select('ts', 'start_time') \\\n", 145 | " .withColumn('year', F.year('start_time')) \\\n", 146 | " .withColumn('month', F.month('start_time')) \\\n", 147 | " .withColumn('week', F.weekofyear('start_time')) \\\n", 148 | " .withColumn('weekday', F.dayofweek('start_time')) \\\n", 149 | " .withColumn('day', F.dayofyear('start_time')) \\\n", 150 | " .withColumn('hour', F.hour('start_time')).dropDuplicates()\n", 151 | "time_data.printSchema()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 9, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "song_data.createOrReplaceTempView('song_data')\n", 161 | "log_data.createOrReplaceTempView('log_data')\n", 162 | "time_data.createOrReplaceTempView('time_data')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 10, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "songplays_table = spark.sql(\"\"\"SELECT DISTINCT\n", 172 | " l.ts as ts,\n", 173 | " t.year as year,\n", 174 | " t.month as month,\n", 175 | " l.userId as user_id,\n", 176 | " l.level as level,\n", 177 | " s.song_id as song_id,\n", 178 | " 
s.artist_id as artist_id,\n", 179 | " l.sessionId as session_id,\n", 180 | " s.artist_location as artist_location,\n", 181 | " l.userAgent as user_agent\n", 182 | " FROM song_data s\n", 183 | " JOIN log_data l\n", 184 | " ON s.artist_name = l.artist\n", 185 | " AND s.title = l.song\n", 186 | " AND s.duration = l.length\n", 187 | " JOIN time_data t\n", 188 | " ON t.ts = l.ts\n", 189 | " \"\"\")" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 11, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "1" 201 | ] 202 | }, 203 | "execution_count": 11, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "songplays_table.count()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 12, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "-RECORD 0-------------------------------\n", 222 | " ts | 1542837407796 \n", 223 | " year | 2018 \n", 224 | " month | 11 \n", 225 | " user_id | 15 \n", 226 | " level | paid \n", 227 | " song_id | SOZCTXZ12AB0182364 \n", 228 | " artist_id | AR5KOSW1187FB35FF4 \n", 229 | " session_id | 818 \n", 230 | " artist_location | Dubai UAE \n", 231 | " user_agent | \"Mozilla/5.0 (X11... 
\n", 232 | "\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "songplays_table.show(1, vertical=True)" 238 | ] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "spark", 244 | "language": "python", 245 | "name": "spark" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 3 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython3", 257 | "version": "3.7.3" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 2 262 | } 263 | -------------------------------------------------------------------------------- /nosql_db_modeling_apache_cassandra/event_data/2018-11-11-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | Frumpies,Logged In,Anabelle,F,0,Simpson,134.47791,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,455,Hello Kitty,200,1.5419E+12,69 3 | Kenny G with Peabo Bryson,Logged In,Anabelle,F,1,Simpson,264.75057,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,455,By The Time This Night Is Over,200,1.5419E+12,69 4 | Biffy Clyro,Logged In,Anabelle,F,2,Simpson,189.83138,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,455,God & Satan,200,1.5419E+12,69 5 | ,Logged In,Lily,F,0,Burns,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.54062E+12,456,,200,1.54191E+12,32 6 | HIM,Logged In,Lily,F,1,Burns,212.06159,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.54062E+12,456,Beautiful,200,1.54191E+12,32 7 | Matmos,Logged In,Joseph,M,0,Gutierrez,1449.11628,free,"Columbia, SC",PUT,NextSong,1.54081E+12,284,Supreme Balloon,200,1.54191E+12,75 8 | Gary Allan,Logged 
In,Ryann,F,0,Smith,259.83955,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,The One,200,1.54193E+12,92 9 | Miracle Fortress,Logged In,Ryann,F,1,Smith,200.9073,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,Five Roses,200,1.54193E+12,92 10 | Don Omar,Logged In,Ryann,F,2,Smith,261.35465,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,Cuentale,200,1.54193E+12,92 11 | Jay-Z,Logged In,Ryann,F,3,Smith,212.27057,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,D'Evils,200,1.54193E+12,92 12 | Red Hot Chili Peppers,Logged In,Ryann,F,4,Smith,231.33995,free,"Palestine, TX",PUT,NextSong,1.54069E+12,328,Easily (Album Version),200,1.54193E+12,92 13 | ,Logged In,Chloe,F,0,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,437,,200,1.54193E+12,49 14 | Flogging Molly,Logged In,Chloe,F,1,Cuevas,361.9522,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Rebels of the Sacred Heart,200,1.54193E+12,49 15 | Reverend Horton Heat,Logged In,Chloe,F,2,Cuevas,158.64118,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Now_ Right Now,200,1.54193E+12,49 16 | Sea Wolf,Logged In,Chloe,F,3,Cuevas,232.61995,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,I Made A Resolution,200,1.54193E+12,49 17 | Jason Mraz & Colbie Caillat,Logged In,Chloe,F,4,Cuevas,189.6224,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Lucky (Album Version),200,1.54193E+12,49 18 | Jamie Lidell,Logged In,Chloe,F,5,Cuevas,175.25506,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Enoughs Enough,200,1.54193E+12,49 19 | Feist,Logged In,Chloe,F,6,Cuevas,212.79302,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,Mushaboom (Postal Service Mix),200,1.54193E+12,49 20 | ,Logged In,Chloe,F,7,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",PUT,Logout,1.54094E+12,437,,307,1.54193E+12,49 21 | ,Logged Out,,,8,,,free,,GET,Home,,437,,200,1.54193E+12, 22 | ,Logged 
Out,,,9,,,free,,GET,Home,,437,,200,1.54193E+12, 23 | ,Logged Out,,,10,,,free,,PUT,Login,,437,,307,1.54193E+12, 24 | ,Logged In,Chloe,F,11,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,437,,200,1.54193E+12,49 25 | Sex Slaves,Logged In,Chloe,F,12,Cuevas,175.51628,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,437,We're Going Out Tonight,200,1.54193E+12,49 26 | ,Logged In,Chloe,F,0,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,469,,200,1.54194E+12,49 27 | Rise Against,Logged In,Chloe,F,1,Cuevas,169.482,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,469,To Them These Streets Belong,200,1.54194E+12,49 28 | ,Logged In,Mohammad,M,0,Rodriguez,,free,"Sacramento--Roseville--Arden-Arcade, CA",GET,Home,1.54051E+12,441,,200,1.54194E+12,88 29 | Beyoncé,Logged In,Mohammad,M,1,Rodriguez,359.54893,free,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1.54051E+12,441,Get Me Bodied,200,1.54194E+12,88 30 | Nate Dogg,Logged In,Mohammad,M,2,Rodriguez,356.38812,free,"Sacramento--Roseville--Arden-Arcade, CA",PUT,NextSong,1.54051E+12,441,Never Leave Me Alone,200,1.54194E+12,88 31 | ,Logged In,Cierra,F,0,Finley,,free,"Richmond, VA",GET,Home,1.54101E+12,443,,200,1.54195E+12,96 32 | Taylor Swift,Logged In,Cierra,F,1,Finley,233.89995,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Love Story,200,1.54195E+12,96 33 | Lynyrd Skynyrd,Logged In,Ryan,M,0,Smith,216.60689,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,452,Sweet home Alabama,200,1.54195E+12,26 34 | Kelis,Logged In,Cierra,F,2,Finley,293.58975,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Caught Out There (Explicit),200,1.54195E+12,96 35 | The Kills,Logged In,Cierra,F,3,Finley,203.38893,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Last Day Of Magic,200,1.54195E+12,96 36 | ,Logged In,Aleena,F,0,Kirby,,paid,"Waterloo-Cedar Falls, IA",GET,Home,1.54102E+12,448,,200,1.54195E+12,44 37 | Collie Buddz featuring Paul Wall,Logged 
In,Aleena,F,1,Kirby,271.62077,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,What A Feeling,200,1.54195E+12,44 38 | Charttraxx Karaoke,Logged In,Cierra,F,4,Finley,225.17506,free,"Richmond, VA",PUT,NextSong,1.54101E+12,443,Fireflies,200,1.54195E+12,96 39 | Band Of Horses,Logged In,Aleena,F,2,Kirby,321.14893,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,The Funeral (Album Version),200,1.54195E+12,44 40 | Coldplay,Logged In,Aleena,F,3,Kirby,307.51302,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Clocks,200,1.54195E+12,44 41 | Bon Jovi,Logged In,Aleena,F,4,Kirby,228.75383,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Have A Nice Day,200,1.54195E+12,44 42 | P.O.D.,Logged In,Aleena,F,5,Kirby,203.7024,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Alive (2006 Remastered Album Version),200,1.54195E+12,44 43 | Bloc Party,Logged In,Aleena,F,6,Kirby,222.04036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Plans (Replanned by Mogwai),200,1.54195E+12,44 44 | Los Prisioneros,Logged In,Aleena,F,7,Kirby,211.12118,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Pa Pa Pa,200,1.54195E+12,44 45 | Octopus Project,Logged In,Aleena,F,8,Kirby,175.25506,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Lots More Stairs,200,1.54195E+12,44 46 | Roudoudou,Logged In,Aleena,F,9,Kirby,18.41587,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Ecoute Ce Scratch,200,1.54195E+12,44 47 | Africando,Logged In,Aleena,F,10,Kirby,253.54404,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Tierra Tradicional,200,1.54195E+12,44 48 | RUN-DMC Featuring Method Man_ Kenny Cash_ Mike Ransom_ and Jamel Simmons,Logged In,Aleena,F,11,Kirby,266.52689,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Simmons Incorporated,200,1.54195E+12,44 49 | ,Logged In,Colm,M,0,Santana,,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",GET,Home,1.54086E+12,414,,200,1.54195E+12,67 50 | 
Graham Coxon,Logged In,Colm,M,1,Santana,197.14567,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,I'm Goin' Away,200,1.54195E+12,67 51 | Queens Of The Stone Age,Logged In,Aleena,F,12,Kirby,231.02649,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,In The Fade,200,1.54195E+12,44 52 | Dance Gavin Dance,Logged In,Colm,M,2,Santana,193.30567,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Strawberry André (Album Version),200,1.54195E+12,67 53 | ,Logged In,Colm,M,3,Santana,,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",GET,Home,1.54086E+12,414,,200,1.54195E+12,67 54 | Passion Pit,Logged In,Aleena,F,13,Kirby,243.69587,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,448,Eyes As Candles,200,1.54195E+12,44 55 | ,Logged In,Aleena,F,14,Kirby,,paid,"Waterloo-Cedar Falls, IA",GET,Home,1.54102E+12,448,,200,1.54195E+12,44 56 | Black Eyed Peas,Logged In,Colm,M,4,Santana,229.61587,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Let's Get It Started,200,1.54195E+12,67 57 | Plastic Bertrand,Logged In,Colm,M,5,Santana,180.00934,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Ca plane pour moi,200,1.54195E+12,67 58 | Cream,Logged In,Colm,M,6,Santana,166.5824,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Strange Brew,200,1.54195E+12,67 59 | Coldplay,Logged In,Colm,M,7,Santana,284.39465,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,A Message,200,1.54195E+12,67 60 | Cute Is What We Aim For,Logged In,Colm,M,8,Santana,172.22485,free,"Nashville-Davidson--Murfreesboro--Franklin, TN",PUT,NextSong,1.54086E+12,414,Sweat the Battle Before the Battle Sweats You (Album Version),200,1.54195E+12,67 61 | Metallica,Logged In,Connar,M,0,Moreno,256.9922,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,Of Wolf And Man,200,1.54195E+12,62 62 | 
The Kills,Logged In,Connar,M,1,Moreno,217.70404,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,Tape Song,200,1.54195E+12,62 63 | Foo Fighters,Logged In,Connar,M,2,Moreno,271.38567,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,The Pretender,200,1.54195E+12,62 64 | Plaid,Logged In,Connar,M,3,Moreno,260.96281,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,218,Eyen [Chosen by fans on Warp20.net],200,1.54195E+12,62 65 | ,Logged In,Brayden,M,0,Clark,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.54103E+12,120,,200,1.54195E+12,41 66 | ,Logged In,Theodore,M,0,Harris,,free,"Red Bluff, CA",GET,Home,1.5411E+12,462,,200,1.54196E+12,14 67 | The Van Pelt,Logged In,Theodore,M,1,Harris,208.71791,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,462,It's New To Me,200,1.54196E+12,14 68 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,472,,200,1.54196E+12,26 69 | 44,Logged In,Ryan,M,1,Smith,224.57424,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,472,Make You Smile,200,1.54196E+12,26 70 | ,Logged In,Rylan,M,0,George,,free,"Birmingham-Hoover, AL",GET,Home,1.54102E+12,446,,200,1.54196E+12,16 71 | Chris Brown,Logged In,Rylan,M,1,George,275.1473,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,446,I May Never Find,200,1.54196E+12,16 72 | KT Tunstall,Logged In,Ryan,M,2,Smith,170.47465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,472,Black Horse And The Cherry Tree (Radio Version),200,1.54196E+12,26 73 | Cascada,Logged In,Rylan,M,2,George,184.39791,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,446,Kids In America,200,1.54196E+12,16 74 | Incubus,Logged In,Ryan,M,3,Smith,293.38077,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,472,Black Heart Inertia,200,1.54196E+12,26 75 | ,Logged In,Ryan,M,4,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Help,1.54102E+12,472,,200,1.54196E+12,26 76 | 
,Logged In,Ryan,M,5,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,472,,200,1.54196E+12,26 77 | ,Logged In,Tegan,F,0,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,435,,200,1.54197E+12,80 78 | Miike Snow,Logged In,Tegan,F,1,Levine,220.83873,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Black & Blue,200,1.54197E+12,80 79 | Cartola,Logged In,Tegan,F,2,Levine,208.92689,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Sala De Recepção,200,1.54197E+12,80 80 | Kill The Client,Logged In,Tegan,F,3,Levine,70.68689,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Commander In Thief,200,1.54197E+12,80 81 | ,Logged In,Tegan,F,4,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,435,,200,1.54197E+12,80 82 | Wolfmother,Logged In,Tegan,F,5,Levine,175.82975,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Woman,200,1.54197E+12,80 83 | Old Crow Medicine Show,Logged In,Tegan,F,6,Levine,231.73179,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Wagon Wheel,200,1.54197E+12,80 84 | Architecture In Helsinki,Logged In,Tegan,F,7,Levine,173.73995,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Debbie,200,1.54197E+12,80 85 | Charlie Louvin,Logged In,Tegan,F,8,Levine,170.86649,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,I Think I'll Live,200,1.54197E+12,80 86 | Miguel Morales,Logged In,Tegan,F,9,Levine,270.78485,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,La Derrota de Un Don Juan,200,1.54197E+12,80 87 | Dominique A,Logged In,Tegan,F,10,Levine,153.20771,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Le Courage Des Oiseaux,200,1.54197E+12,80 88 | Cock Sparrer,Logged In,Tegan,F,11,Levine,203.25832,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Run With The Blind,200,1.54197E+12,80 89 | Jimmy Wakely,Logged In,Tegan,F,12,Levine,165.74649,paid,"Portland-South Portland, 
ME",PUT,NextSong,1.54079E+12,435,I Love You So Much It Hurts,200,1.54197E+12,80 90 | Peter Doherty,Logged In,Tegan,F,13,Levine,217.02485,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,A Little Death Around the Eyes,200,1.54197E+12,80 91 | Katy Perry,Logged In,Tegan,F,14,Levine,246.41261,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,435,Thinking Of You,200,1.54197E+12,80 92 | Sidewalk Prophets,Logged In,Molly,F,0,Taylor,260.62322,free,"St. Louis, MO-IL",PUT,NextSong,1.54099E+12,464,You Love Me Anyway (Album),200,1.54197E+12,35 93 | Rise Against,Logged In,Molly,F,1,Taylor,221.17832,free,"St. Louis, MO-IL",PUT,NextSong,1.54099E+12,464,Torches,200,1.54197E+12,35 94 | K'Naan,Logged In,Molly,F,2,Taylor,220.49914,free,"St. Louis, MO-IL",PUT,NextSong,1.54099E+12,464,Wavin' Flag,200,1.54197E+12,35 95 | Patrick Jumpen,Logged In,Ryan,M,0,Smith,208.87465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,480,Holiday,200,1.54198E+12,26 96 | Alicia Keys,Logged In,Ryan,M,1,Smith,216.47628,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,480,Empire State Of Mind (Part II) Broken Down,200,1.54198E+12,26 97 | -------------------------------------------------------------------------------- /nosql_db_modeling_apache_cassandra/event_data/2018-11-22-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | Dee Dee Bridgewater,Logged In,Lily,F,38,Koch,318.64118,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,La Vie En Rose,200,1.54285E+12,15 3 | Tim O'brien,Logged In,Lily,F,39,Koch,176.14322,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Think About Last Night,200,1.54285E+12,15 4 | Nirvana,Logged In,Lily,F,40,Koch,215.11791,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Love 
Buzz,200,1.54285E+12,15 5 | Weezer,Logged In,Lily,F,41,Koch,479.32036,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Only In Dreams,200,1.54285E+12,15 6 | Nightwish,Logged In,Lily,F,42,Koch,286.1971,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,She Is My Sin,200,1.54285E+12,15 7 | California Swag District,Logged In,Lily,F,43,Koch,239.17669,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Teach Me How To Dougie,200,1.54285E+12,15 8 | Miike Snow,Logged In,Lily,F,44,Koch,385.35791,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Silvia,200,1.54285E+12,15 9 | Katy Perry,Logged In,Lily,F,45,Koch,179.40853,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,I Kissed A Girl,200,1.54285E+12,15 10 | Sikth,Logged In,Lily,F,46,Koch,250.53995,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Peep Show,200,1.54285E+12,15 11 | Lily Allen,Logged In,Lily,F,47,Koch,199.88853,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Not Fair,200,1.54285E+12,15 12 | The Presidents of the United States of America,Logged In,Lily,F,48,Koch,495.77751,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Lump,200,1.54285E+12,15 13 | Wordsworth,Logged In,Lily,F,49,Koch,253.1522,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Right Now (Produced by Ayatollah),200,1.54285E+12,15 14 | Rihanna,Logged In,Lily,F,50,Koch,229.04118,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Take A Bow,200,1.54285E+12,15 15 | Tomas Bodin,Logged In,Lily,F,51,Koch,396.53832,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Back To The African Garden,200,1.54285E+12,15 16 | Black Eyed Peas,Logged In,Lily,F,52,Koch,326.86975,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,My Humps,200,1.54285E+12,15 17 | Carolina Liar,Logged 
In,Lily,F,53,Koch,240.45669,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Show Me What I'm Looking For (Album Version),200,1.54285E+12,15 18 | Kansas,Logged In,Lily,F,54,Koch,202.29179,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Dust in The Wind,200,1.54285E+12,15 19 | Onar,Logged In,Lily,F,55,Koch,306.6771,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Xehasmeni Melodia,200,1.54285E+12,15 20 | Live,Logged In,Lily,F,56,Koch,286.98077,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Lakini's Juice,200,1.54285E+12,15 21 | Abstract Rude,Logged In,Lily,F,57,Koch,196.85832,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Nuff Fire,200,1.54285E+12,15 22 | Johnny Horton,Logged In,Lily,F,58,Koch,131.81342,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Mean Mean Son Of A Gun,200,1.54285E+12,15 23 | The Men They Couldn't Hang,Logged In,Lily,F,59,Koch,251.14077,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Ironmasters,200,1.54285E+12,15 24 | Rilo Kiley,Logged In,Lily,F,60,Koch,234.03057,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,The Absence Of God (Album Version),200,1.54285E+12,15 25 | Shwayze,Logged In,Lily,F,61,Koch,201.63873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Lost My Mind,200,1.54285E+12,15 26 | Bram Vermeulen,Logged In,Lily,F,62,Koch,251.42812,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Mamma,200,1.54285E+12,15 27 | Death Cab for Cutie,Logged In,Lily,F,63,Koch,189.3873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,I Will Follow You into the Dark (Album Version),200,1.54285E+12,15 28 | Dwight Yoakam,Logged In,Lily,F,64,Koch,239.3073,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,You're The One,200,1.54285E+12,15 29 | Jadakiss / Ghostface Killah / Raekwon,Logged 
In,Lily,F,65,Koch,173.76608,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Cartel Gathering,200,1.54285E+12,15 30 | Rosana,Logged In,Lily,F,66,Koch,256.31302,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Si tu no estas,200,1.54285E+12,15 31 | ,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,775,,200,1.54285E+12,8 32 | ,Logged In,Kaylee,F,1,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1.54034E+12,775,,200,1.54285E+12,8 33 | ,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,775,,200,1.54285E+12,8 34 | The Killers,Logged In,Lily,F,67,Koch,230.39955,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,The Ballad of Michael Valentine,200,1.54285E+12,15 35 | Alliance Ethnik,Logged In,Lily,F,68,Koch,195.94404,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Sincerité Et Jalousie,200,1.54285E+12,15 36 | Enya,Logged In,Lily,F,69,Koch,289.802,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,China Roses,200,1.54285E+12,15 37 | Aya RL,Logged In,Lily,F,70,Koch,225.43628,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,818,Jazz,200,1.54285E+12,15 38 | ,Logged In,Lily,F,71,Koch,,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,Logout,1.54105E+12,818,,307,1.54285E+12,15 39 | ,Logged Out,,,72,,,paid,,GET,Home,,818,,200,1.54285E+12, 40 | ,Logged Out,,,73,,,paid,,GET,About,,818,,200,1.54285E+12, 41 | Clor,Logged In,Ryan,M,0,Smith,227.68281,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,820,Love + Pain,200,1.54286E+12,26 42 | Alejandro Fernandez,Logged In,Ryan,M,1,Smith,262.84363,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,820,Solitario Y Solo,200,1.54286E+12,26 43 | Yonder Mountain String Band,Logged In,Ryan,M,2,Smith,152.18893,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,820,Midwest Gospel 
Radio,200,1.54286E+12,26 44 | K'Naan,Logged In,Ava,F,0,Robinson,220.49914,free,"New Haven-Milford, CT",PUT,NextSong,1.54093E+12,824,Wavin' Flag,200,1.54287E+12,50 45 | Cradle Of Filth,Logged In,Kate,F,0,Harrell,453.09342,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Her Ghost In The Fog,200,1.54288E+12,97 46 | Amanda Marshall,Logged In,Kate,F,1,Harrell,274.28526,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Let It Rain,200,1.54288E+12,97 47 | Rammstein,Logged In,Kate,F,2,Harrell,272.40444,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Sonne,200,1.54288E+12,97 48 | Cat Stevens,Logged In,Kate,F,3,Harrell,167.6273,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,If You Want To Sing Out_ Sing Out,200,1.54288E+12,97 49 | Emma Shapplin,Logged In,Kate,F,4,Harrell,267.62404,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Spente Le Stelle,200,1.54289E+12,97 50 | Modest Mouse,Logged In,Kate,F,5,Harrell,209.52771,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Float On,200,1.54289E+12,97 51 | Flaco Jimenez,Logged In,Kate,F,6,Harrell,155.81995,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,En El Cielo No Hay Cerveza (In Heaven There Is No Beer),200,1.54289E+12,97 52 | Modest Mouse,Logged In,Kate,F,7,Harrell,209.52771,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Float On,200,1.54289E+12,97 53 | Cedric Gervais feat. 
Second Sun,Logged In,Kate,F,8,Harrell,230.32118,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Pills (Radio Edit) (Radio Edit),200,1.54289E+12,97 54 | Sheena Easton,Logged In,Kate,F,9,Harrell,239.62077,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Strut (1993 Digital Remaster),200,1.54289E+12,97 55 | Everything But The Girl,Logged In,Kate,F,10,Harrell,218.74893,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,My Baby Don't Love Me,200,1.54289E+12,97 56 | Florence + The Machine,Logged In,Kate,F,11,Harrell,219.66322,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Dog Days Are Over (Radio Edit),200,1.54289E+12,97 57 | BoDeans,Logged In,Kate,F,12,Harrell,354.01098,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Naked (Live),200,1.54289E+12,97 58 | OneRepublic,Logged In,Kate,F,13,Harrell,208.14322,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Apologize,200,1.54289E+12,97 59 | Miley Cyrus,Logged In,Kate,F,14,Harrell,194.45506,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Full Circle,200,1.54289E+12,97 60 | Coldplay,Logged In,Kate,F,15,Harrell,139.12771,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Don't Panic,200,1.54289E+12,97 61 | Atreyu,Logged In,Kate,F,16,Harrell,308.37506,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,You Were The King_ Now You're Unconscious (Album Version),200,1.54289E+12,97 62 | Bruce Springsteen,Logged In,Kate,F,17,Harrell,270.54975,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Born To Run,200,1.54289E+12,97 63 | Björk,Logged In,Kate,F,18,Harrell,348.57751,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Undo,200,1.54289E+12,97 64 | Big Shug,Logged In,Kate,F,19,Harrell,140.56444,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,It Just Don't Stop,200,1.54289E+12,97 65 | The Wallflowers,Logged In,Kate,F,20,Harrell,315.24526,paid,"Lansing-East Lansing, 
MI",PUT,NextSong,1.54047E+12,828,Be Your Own Girl,200,1.54289E+12,97 66 | Chris Brown,Logged In,Kate,F,21,Harrell,203.80689,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Ain't No Way (You Won't Love Me),200,1.54289E+12,97 67 | Charly García,Logged In,Kate,F,22,Harrell,231.73179,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Filosofia Barata Y Zapatos De Goma,200,1.54289E+12,97 68 | N.W.A ft. Eazy-E,Logged In,Kate,F,23,Harrell,338.18077,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Boyz-N-The-Hood,200,1.54289E+12,97 69 | The Mighty Mighty Bosstones,Logged In,Kate,F,24,Harrell,158.87628,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,1/2/2008,200,1.54289E+12,97 70 | Beastie Boys,Logged In,Kate,F,25,Harrell,211.722,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Unite (2009 Digital Remaster),200,1.54289E+12,97 71 | Yuksek,Logged In,Kate,F,26,Harrell,218.95791,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Take A Ride,200,1.54289E+12,97 72 | Fernando Ubiergo,Logged In,Kate,F,27,Harrell,218.74893,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Cuando Agosto Era 21,200,1.54289E+12,97 73 | Phoenix,Logged In,Kate,F,28,Harrell,192.86159,paid,"Lansing-East Lansing, MI",PUT,NextSong,1.54047E+12,828,Napoleon Says,200,1.54289E+12,97 74 | Radney Foster,Logged In,Jayden,M,0,Fox,288.96608,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,790,Sweet And Wild,200,1.54289E+12,101 75 | Neneh Cherry,Logged In,Jayden,M,1,Fox,232.202,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,790,Manchild,200,1.54289E+12,101 76 | Hooligans,Logged In,Ayla,F,0,Johnson,189.98812,free,"Santa Rosa, CA",PUT,NextSong,1.54088E+12,785,Szex & KV,200,1.54289E+12,63 77 | Kid Cudi / MGMT / Ratatat,Logged In,Lily,F,0,Burns,295.67955,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1.54062E+12,786,Pursuit Of Happiness (nightmare),200,1.5429E+12,32 78 | Foals,Logged 
In,Morris,M,0,Gilmore,316.89098,free,"Raleigh, NC",PUT,NextSong,1.54097E+12,351,Blue Blood,200,1.5429E+12,23 79 | 'N Sync/Phil Collins,Logged In,Morris,M,1,Gilmore,143.64689,free,"Raleigh, NC",PUT,NextSong,1.54097E+12,351,Trashin' The Camp (Phil And 'N Sync Version),200,1.5429E+12,23 80 | Kristian Stanfill,Logged In,Jayden,M,0,Fox,287.50322,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,I Need You,200,1.5429E+12,101 81 | Enrique Iglesias,Logged In,Jayden,M,1,Fox,241.42322,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,Tired Of Being Sorry,200,1.5429E+12,101 82 | Michael Cretu,Logged In,Jayden,M,2,Fox,301.06077,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,The Invisible Man,200,1.5429E+12,101 83 | Tommy Emmanuel,Logged In,Jayden,M,3,Fox,168.14975,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,838,Windy & Warm,200,1.5429E+12,101 84 | ,Logged In,Jayden,M,4,Fox,,free,"New Orleans-Metairie, LA",PUT,Logout,1.54103E+12,838,,307,1.5429E+12,101 85 | ,Logged Out,,,5,,,free,,GET,Home,,838,,200,1.5429E+12, 86 | ,Logged Out,,,6,,,free,,PUT,Login,,838,,307,1.5429E+12, 87 | ,Logged In,Jayden,M,7,Fox,,free,"New Orleans-Metairie, LA",GET,Home,1.54103E+12,838,,200,1.5429E+12,101 88 | ,Logged In,Jordan,F,0,Rodriguez,,free,"Los Angeles-Long Beach-Anaheim, CA",GET,Home,1.54099E+12,523,,200,1.5429E+12,68 89 | Cherise,Logged In,Stefany,F,0,White,229.69424,free,"Lubbock, TX",PUT,NextSong,1.54071E+12,772,No Good 4 You,200,1.54291E+12,83 90 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,835,,200,1.54291E+12,26 91 | Anna Waronker,Logged In,Jayden,F,0,Duffy,189.6224,free,"Seattle-Tacoma-Bellevue, WA",PUT,NextSong,1.54015E+12,662,Nothing Personal,200,1.54291E+12,76 92 | King Changó,Logged In,Cecilia,F,0,Owens,340.74077,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,763,Confesión,200,1.54292E+12,6 93 | Gang Of Four,Logged In,Cecilia,F,1,Owens,193.14893,free,"Atlanta-Sandy 
Springs-Roswell, GA",PUT,NextSong,1.54103E+12,763,I Found That Essence Rare,200,1.54292E+12,6 94 | Line Renaud,Logged In,Cecilia,F,2,Owens,176.16934,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,763,Le Soir,200,1.54292E+12,6 95 | ,Logged Out,,,0,,,paid,,PUT,Login,,823,,307,1.54292E+12, 96 | ,Logged In,Tegan,F,1,Levine,,paid,"Portland-South Portland, ME",GET,Home,1.54079E+12,823,,200,1.54292E+12,80 97 | the bird and the bee,Logged In,Tegan,F,2,Levine,198.1122,paid,"Portland-South Portland, ME",PUT,NextSong,1.54079E+12,823,Last Day Of Our Love,200,1.54292E+12,80 98 | ,Logged Out,,,0,,,paid,,GET,Home,,831,,200,1.54293E+12, 99 | ,Logged Out,,,1,,,paid,,GET,Home,,831,,200,1.54293E+12, 100 | -------------------------------------------------------------------------------- /nosql_db_modeling_apache_cassandra/event_data/2018-11-10-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | Hoobastank,Logged In,Cierra,F,0,Finley,241.3971,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,Say The Same,200,1.54181E+12,96 3 | Mark Knopfler,Logged In,Cierra,F,1,Finley,249.3122,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,Why Aye Man,200,1.54181E+12,96 4 | Mogwai,Logged In,Cierra,F,2,Finley,341.28934,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,We're No Here,200,1.54181E+12,96 5 | The Casualties,Logged In,Cierra,F,3,Finley,181.49832,free,"Richmond, VA",PUT,NextSong,1.54101E+12,132,Punx Unite,200,1.54181E+12,96 6 | ,Logged In,Cecilia,F,0,Owens,,free,"Atlanta-Sandy Springs-Roswell, GA",GET,Home,1.54103E+12,424,,200,1.54181E+12,6 7 | The Living End,Logged In,Ryan,M,0,Smith,188.62975,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,433,Roll On (Album Version),200,1.54182E+12,26 8 | Aloe Blacc,Logged In,Rylan,M,0,George,244.1922,free,"Birmingham-Hoover, 
AL",PUT,NextSong,1.54102E+12,402,I Need A Dollar,200,1.54183E+12,16 9 | Faith No More,Logged In,Rylan,M,1,George,326.50404,free,"Birmingham-Hoover, AL",PUT,NextSong,1.54102E+12,402,Helpless,200,1.54183E+12,16 10 | Chris Cornell,Logged In,Aleena,F,0,Kirby,353.69751,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Sunshower (Great Expectations Soundtrack),200,1.54184E+12,44 11 | Weezer,Logged In,Aleena,F,1,Kirby,203.93751,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,My Name Is Jonas,200,1.54184E+12,44 12 | Stream of Passion feat. Ayreon,Logged In,Aleena,F,2,Kirby,257.56689,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Valley Of The Queens,200,1.54184E+12,44 13 | Lupe Fiasco,Logged In,Aleena,F,3,Kirby,273.94567,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Shining Down [feat. Matthew Santos] (Amended Album Version),200,1.54184E+12,44 14 | Tom Petty,Logged In,Aleena,F,4,Kirby,263.23546,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Runnin' Down A Dream,200,1.54184E+12,44 15 | The Killers,Logged In,Aleena,F,5,Kirby,220.89098,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,When You Were Young,200,1.54184E+12,44 16 | Afghan Whigs,Logged In,Aleena,F,6,Kirby,179.40853,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,I'm Her Slave (Album),200,1.54184E+12,44 17 | CSS,Logged In,Aleena,F,7,Kirby,213.75955,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Let's Make Love And Listen To Death From Above [Dan Carey Mix] (remastered album version),200,1.54184E+12,44 18 | Mos Def / Talib Kweli,Logged In,Aleena,F,8,Kirby,141.37424,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,History,200,1.54184E+12,44 19 | Ryan Leslie,Logged In,Aleena,F,9,Kirby,203.96363,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,How It Was Supposed To Be,200,1.54184E+12,44 20 | Mark Lowry,Logged In,Aleena,F,10,Kirby,168.28036,paid,"Waterloo-Cedar Falls, 
IA",PUT,NextSong,1.54102E+12,350,Get Together With The Lord (The Best Of Mark Lowry - Volume 2 Version),200,1.54184E+12,44 21 | Beirut,Logged In,Aleena,F,11,Kirby,230.19057,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Nantes,200,1.54184E+12,44 22 | MODESELEKTOR FEAT. PUPPETMASTAZ,Logged In,Aleena,F,12,Kirby,52.79302,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,THE DARK SIDE OF THE FROG,200,1.54184E+12,44 23 | Kid Cudi / Kanye West / Common,Logged In,Aleena,F,13,Kirby,237.76608,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Make Her Say,200,1.54184E+12,44 24 | Julie Ruin,Logged In,Aleena,F,14,Kirby,142.47138,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Breakout A-Town,200,1.54184E+12,44 25 | Sons And Daughters,Logged In,Aleena,F,15,Kirby,165.90322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,The Bell,200,1.54184E+12,44 26 | Children 18:3,Logged In,Aleena,F,16,Kirby,178.52036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Mock The Music,200,1.54184E+12,44 27 | Chris Cagle,Logged In,Aleena,F,17,Kirby,232.85506,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Miss Me Baby,200,1.54184E+12,44 28 | John Waite,Logged In,Aleena,F,18,Kirby,269.76608,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Missing You,200,1.54184E+12,44 29 | Basshunter,Logged In,Aleena,F,19,Kirby,223.32036,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Walk On Water,200,1.54184E+12,44 30 | Jay-Z / Lil Wayne,Logged In,Aleena,F,20,Kirby,236.01587,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Hello Brooklyn 2.0,200,1.54184E+12,44 31 | Snow Patrol,Logged In,Aleena,F,21,Kirby,273.6322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,It's Beginning To Get To Me,200,1.54184E+12,44 32 | Coldcut,Logged In,Aleena,F,22,Kirby,203.07546,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Autumn Leaves,200,1.54184E+12,44 33 | Magic Dirt,Logged 
In,Aleena,F,23,Kirby,251.79383,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Plastic Loveless Letter,200,1.54184E+12,44 34 | J. Karjalainen & Mustat Lasit,Logged In,Aleena,F,24,Kirby,336.74404,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Sinisten tähtien alla,200,1.54184E+12,44 35 | OneRepublic,Logged In,Aleena,F,25,Kirby,224.67873,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Secrets,200,1.54184E+12,44 36 | Nirvana,Logged In,Aleena,F,26,Kirby,219.08853,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Come As You Are,200,1.54184E+12,44 37 | ,Logged In,Theodore,M,0,Smith,,free,"Houston-The Woodlands-Sugar Land, TX",GET,Home,1.54031E+12,359,,200,1.54184E+12,52 38 | Joyce Cooling,Logged In,Aleena,F,27,Kirby,248.11057,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,It's Time I Go (Jazz),200,1.54184E+12,44 39 | Beastie Boys,Logged In,Aleena,F,28,Kirby,211.722,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Unite (2009 Digital Remaster),200,1.54184E+12,44 40 | Usher Featuring Lil' Jon & Ludacris,Logged In,Aleena,F,29,Kirby,250.38322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Yeah!,200,1.54184E+12,44 41 | Nelly / Paul Wall / Ali & Gipp,Logged In,Aleena,F,30,Kirby,272.50893,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Grillz,200,1.54184E+12,44 42 | The Audition,Logged In,Aleena,F,31,Kirby,207.20281,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,The Running Man,200,1.54184E+12,44 43 | Savage Garden,Logged In,Aleena,F,32,Kirby,277.26322,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Truly Madly Deeply,200,1.54184E+12,44 44 | Adam Green,Logged In,Aleena,F,33,Kirby,141.00853,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Festival Song,200,1.54184E+12,44 45 | Tom Petty,Logged In,Aleena,F,34,Kirby,204.82567,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Square One (Album Version),200,1.54184E+12,44 46 | 
Muse,Logged In,Aleena,F,35,Kirby,209.34485,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1.54102E+12,350,Supermassive Black Hole (Album Version),200,1.54184E+12,44 47 | The Gerbils,Logged In,Jordan,F,0,Hicks,27.01016,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,(iii),200,1.54184E+12,37 48 | Robert Plant,Logged In,Jordan,F,1,Hicks,265.66485,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Dancing In Heaven (2006 Remastered LP Version),200,1.54184E+12,37 49 | Metallica,Logged In,Jordan,F,2,Hicks,387.02975,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Welcome Home (Sanitarium),200,1.54184E+12,37 50 | Infected Mushroom,Logged In,Jordan,F,3,Hicks,506.51383,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Deeply Disturbed,200,1.54184E+12,37 51 | Eliza Doolittle,Logged In,Jordan,F,4,Hicks,184.60689,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Rollerblades,200,1.54185E+12,37 52 | Alvin And The Chipmunks,Logged In,Jordan,F,5,Hicks,162.63791,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,Ain't No Party,200,1.54185E+12,37 53 | Chromeo,Logged In,Jordan,F,6,Hicks,348.65587,free,"Salinas, CA",PUT,NextSong,1.54001E+12,304,You're So Gangsta,200,1.54185E+12,37 54 | Keisha White,Logged In,Kevin,M,0,Arellano,251.42812,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1.54001E+12,387,Brother,200,1.54185E+12,66 55 | Juanes,Logged In,Kevin,M,1,Arellano,247.37914,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1.54001E+12,387,Damelo,200,1.54185E+12,66 56 | ,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54092E+12,180,,200,1.54185E+12,39 57 | Karnivool,Logged In,Ryan,M,0,Smith,470.80444,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,445,Umbra,200,1.54186E+12,26 58 | WES,Logged In,Cecilia,F,0,Owens,221.57016,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,444,Alane,200,1.54186E+12,6 59 | Asia 2001,Logged In,Cecilia,F,1,Owens,150.30812,free,"Atlanta-Sandy Springs-Roswell, 
GA",PUT,NextSong,1.54103E+12,444,Epilogue,200,1.54186E+12,6 60 | Spike Milligan,Logged In,Samuel,M,0,Gonzalez,220.39465,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54049E+12,384,Nothing At All,200,1.54186E+12,61 61 | Laura Izibor,Logged In,Anabelle,F,0,Simpson,211.56526,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,378,Carousel (PSILY Album Version),200,1.54187E+12,69 62 | ,Logged In,Lily,F,0,Burns,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1.54062E+12,426,,200,1.54187E+12,32 63 | Ryan Adams,Logged In,Braden,M,0,Parker,248.5024,free,"Youngstown-Warren-Boardman, OH-PA",PUT,NextSong,1.541E+12,246,Wonderwall,200,1.54187E+12,74 64 | ,Logged In,Adelyn,F,0,Jordan,,free,"Chicago-Naperville-Elgin, IL-IN-WI",GET,Home,1.54013E+12,391,,200,1.54187E+12,7 65 | Method Man,Logged In,Adelyn,F,1,Jordan,204.64281,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54013E+12,391,The Motto,200,1.54187E+12,7 66 | The Stanley Brothers,Logged In,Adelyn,F,2,Jordan,179.69587,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54013E+12,391,I'm A Man Of Constant Sorrow,200,1.54187E+12,7 67 | Dexter Freebish,Logged In,Adelyn,F,3,Jordan,210.54649,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54013E+12,391,Deeper,200,1.54187E+12,7 68 | Jamiroquai,Logged In,Jacob,M,0,Rogers,362.05669,free,"San Diego-Carlsbad, CA",PUT,NextSong,1.54098E+12,432,Talullah,200,1.54187E+12,18 69 | Michael Cera & Ellen Page,Logged In,Matthew,M,0,Jones,116.71465,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Anyone Else But You,200,1.54188E+12,36 70 | The Cat Empire,Logged In,Matthew,M,1,Jones,218.22649,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,How To Explain,200,1.54188E+12,36 71 | Bryn Terfel / Berliner Philharmoniker / Claudio Abbado,Logged In,Matthew,M,2,Jones,967.36608,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Wotan's Farewell & Magic Fire Music,200,1.54188E+12,36 72 | The Fugees,Logged 
In,Matthew,M,3,Jones,281.20771,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Ready Or Not,200,1.54188E+12,36 73 | Hardline,Logged In,Matthew,M,4,Jones,234.73587,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Everything,200,1.54188E+12,36 74 | The Funky Lowlives,Logged In,Matthew,M,5,Jones,280.34567,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Sail Into the Sun,200,1.54188E+12,36 75 | DL Incognito,Logged In,Matthew,M,6,Jones,221.07383,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Proof,200,1.54188E+12,36 76 | ,Logged In,Theodore,M,0,Smith,,free,"Houston-The Woodlands-Sugar Land, TX",GET,Home,1.54031E+12,447,,200,1.54188E+12,52 77 | Justice,Logged In,Matthew,M,7,Jones,243.40853,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,TTHHEE PPAARRTTYY,200,1.54188E+12,36 78 | Earth_ Wind & Fire,Logged In,Theodore,M,1,Smith,178.20689,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54031E+12,447,Night Dreamin',200,1.54188E+12,52 79 | Strawbs,Logged In,Matthew,M,8,Jones,255.81669,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Sheep,200,1.54188E+12,36 80 | Angus & Julia Stone,Logged In,Matthew,M,9,Jones,172.85179,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Wasted,200,1.54188E+12,36 81 | Sara Bareilles,Logged In,Matthew,M,10,Jones,260.8322,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Love Song,200,1.54188E+12,36 82 | Bruna Caram,Logged In,Matthew,M,11,Jones,198.63465,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Meus Sonhos,200,1.54188E+12,36 83 | Nando Reis,Logged In,Matthew,M,12,Jones,239.82975,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,O Segundo Sol,200,1.54188E+12,36 84 | The Black Keys,Logged In,Matthew,M,13,Jones,189.28281,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Same Old Thing,200,1.54188E+12,36 85 | Kreator,Logged In,Matthew,M,14,Jones,294.53016,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Riot Of 
Violence,200,1.54188E+12,36 86 | Audioslave,Logged In,Matthew,M,15,Jones,277.83791,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Show Me How To Live,200,1.54188E+12,36 87 | Red Hot Chili Peppers,Logged In,Matthew,M,16,Jones,269.34812,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Parallel Universe (Album Version),200,1.54188E+12,36 88 | Manu Chao,Logged In,Matthew,M,17,Jones,288.15628,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Me Quedo Contigo [Si Me Das A Elegir],200,1.54188E+12,36 89 | Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner,Logged In,Matthew,M,18,Jones,277.15873,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile),200,1.54188E+12,36 90 | Ron Carter,Logged In,Matthew,M,19,Jones,497.13587,paid,"Janesville-Beloit, WI",PUT,NextSong,1.54106E+12,439,I CAN'T GET STARTED,200,1.54188E+12,36 91 | ,Logged In,Theodore,M,0,Harris,,free,"Red Bluff, CA",GET,Home,1.5411E+12,440,,200,1.54188E+12,14 92 | ,Logged In,Theodore,M,1,Harris,,free,"Red Bluff, CA",GET,Home,1.5411E+12,440,,200,1.54188E+12,14 93 | Lifehouse,Logged In,Theodore,M,2,Harris,195.47383,free,"Red Bluff, CA",PUT,NextSong,1.5411E+12,440,You And Me (Wedding Version),200,1.54188E+12,14 94 | Yann Tiersen,Logged In,Kaylee,F,0,Summers,158.71955,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,333,La Valse D'Amélie (Version Piano),200,1.54189E+12,8 95 | ISRAEL & NEW BREED,Logged In,Kaylee,F,1,Summers,176.48281,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,333,Awesome Medley,200,1.54189E+12,8 96 | ,Logged In,Molly,F,0,Taylor,,free,"St. Louis, MO-IL",GET,Home,1.54099E+12,396,,200,1.54189E+12,35 97 | Stellar Kart,Logged In,Molly,F,1,Taylor,186.17424,free,"St. 
Louis, MO-IL",PUT,NextSong,1.54099E+12,396,Jesus Loves You (Album Version),200,1.54189E+12,35 98 | -------------------------------------------------------------------------------- /nosql_db_modeling_apache_cassandra/event_data/2018-11-03-events.csv: -------------------------------------------------------------------------------- 1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | ,Logged Out,,,0,,,free,,PUT,Login,,52,,307,1.54121E+12, 3 | ,Logged In,Celeste,F,1,Williams,,free,"Klamath Falls, OR",GET,Home,1.54108E+12,52,,200,1.54121E+12,53 4 | Mynt,Logged In,Celeste,F,2,Williams,166.94812,free,"Klamath Falls, OR",PUT,NextSong,1.54108E+12,52,Playa Haters,200,1.54121E+12,53 5 | Taylor Swift,Logged In,Celeste,F,3,Williams,230.47791,free,"Klamath Falls, OR",PUT,NextSong,1.54108E+12,52,You Belong With Me,200,1.54121E+12,53 6 | Amy Winehouse,Logged In,Celeste,F,4,Williams,229.85098,free,"Klamath Falls, OR",PUT,NextSong,1.54108E+12,52,Valerie,200,1.54121E+12,53 7 | Jimmy Eat World,Logged In,Celeste,F,5,Williams,285.83138,free,"Klamath Falls, OR",PUT,NextSong,1.54108E+12,52,Dizzy,200,1.54121E+12,53 8 | ,Logged Out,,,0,,,free,,GET,Home,,18,,200,1.54124E+12, 9 | Maldita Nerea,Logged In,Anabelle,F,0,Simpson,241.162,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1.54104E+12,158,Supelicula,200,1.54125E+12,69 10 | Fluke,Logged In,Connar,M,0,Moreno,478.92853,free,"Houston-The Woodlands-Sugar Land, TX",PUT,NextSong,1.54082E+12,168,Bermuda,200,1.54126E+12,62 11 | Habib Koité,Logged In,Jayden,M,0,Fox,285.1522,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,185,Din Din Wo,200,1.54126E+12,101 12 | The Kooks,Logged In,Sara,F,0,Johnson,132.25751,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Eddie's Gun,200,1.54126E+12,95 13 | Blues Traveler,Logged In,Sara,F,1,Johnson,290.24608,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Hook,200,1.54126E+12,95 14 | 
Coldplay,Logged In,Sara,F,2,Johnson,298.762,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Shiver,200,1.54126E+12,95 15 | Tom Petty And The Heartbreakers,Logged In,Sara,F,3,Johnson,183.01342,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,The Wild One_ Forever,200,1.54126E+12,95 16 | Girl Talk,Logged In,Sara,F,4,Johnson,173.24363,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Give and Go,200,1.54126E+12,95 17 | Florence + The Machine,Logged In,Sara,F,5,Johnson,258.55955,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Cosmic Love,200,1.54126E+12,95 18 | Three Drives,Logged In,Sara,F,6,Johnson,411.6371,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Greece 2000,200,1.54126E+12,95 19 | Jonas Brothers,Logged In,Sara,F,7,Johnson,192.36526,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Sorry,200,1.54126E+12,95 20 | Tevin Campbell,Logged In,Sara,F,8,Johnson,293.04118,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Round And Round (Soul Mix Edit),200,1.54126E+12,95 21 | Sting,Logged In,Sara,F,9,Johnson,257.12281,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Windmills Of Your Mind,200,1.54126E+12,95 22 | The Champs,Logged In,Sara,F,10,Johnson,132.0224,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Tequila,200,1.54126E+12,95 23 | Röyksopp,Logged In,Sara,F,11,Johnson,214.93506,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Poor Leno Jakatta Radio Mix,200,1.54126E+12,95 24 | Avenged Sevenfold,Logged In,Sara,F,12,Johnson,312.11057,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Bat Country (Album Version),200,1.54126E+12,95 25 | ,Logged In,Sylvie,F,0,Cruz,,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",GET,Home,1.54027E+12,210,,200,1.54126E+12,10 26 | Sam Cooke,Logged In,Sara,F,13,Johnson,122.04363,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Ain't Misbehavin,200,1.54126E+12,95 27 | Apulanta,Logged In,Sara,F,14,Johnson,219.53261,paid,"Winston-Salem, 
NC",PUT,NextSong,1.54081E+12,152,Hallaa,200,1.54126E+12,95 28 | Era,Logged In,Sara,F,15,Johnson,200.56771,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Cathar Rhythm,200,1.54126E+12,95 29 | Klaus Badelt,Logged In,Sylvie,F,1,Cruz,128.62649,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",PUT,NextSong,1.54027E+12,210,Moonlight Serenade,200,1.54126E+12,10 30 | Parov Stelar,Logged In,Sara,F,16,Johnson,281.5473,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,KissKiss,200,1.54126E+12,95 31 | Florence + The Machine,Logged In,Lily,F,0,Koch,290.48118,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Rabbit Heart (Raise It Up),200,1.54126E+12,15 32 | De La Soul,Logged In,Sara,F,17,Johnson,221.72689,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Me_ Myself & I,200,1.54126E+12,95 33 | Lil Wayne / T-Pain,Logged In,Sara,F,18,Johnson,244.58404,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Got Money,200,1.54126E+12,95 34 | Cher,Logged In,Lily,F,1,Koch,167.41832,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Milord,200,1.54126E+12,15 35 | Edge Of Dawn,Logged In,Lily,F,2,Koch,253.72689,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Losing Ground,200,1.54126E+12,15 36 | The Black Keys,Logged In,Sara,F,19,Johnson,223.84281,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Elevator,200,1.54126E+12,95 37 | Smokie Norful,Logged In,Lily,F,3,Koch,337.84118,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,I Need You Now (Build A Bridge Version),200,1.54126E+12,15 38 | Adam Lambert,Logged In,Sara,F,20,Johnson,227.39546,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Whataya Want From Me,200,1.54127E+12,95 39 | Pulp,Logged In,Sara,F,21,Johnson,208.95302,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Little Girl (With Blue Eyes),200,1.54127E+12,95 40 | Alliance Ethnik,Logged In,Lily,F,4,Koch,265.76934,paid,"Chicago-Naperville-Elgin, 
IL-IN-WI",PUT,NextSong,1.54105E+12,199,Creil City,200,1.54127E+12,15 41 | Tricky,Logged In,Sara,F,22,Johnson,212.79302,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Ponderosa,200,1.54127E+12,95 42 | John Butler Trio,Logged In,Lily,F,5,Koch,223.68608,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Betterman (full-length/album version),200,1.54127E+12,15 43 | Culture Club,Logged In,Sara,F,23,Johnson,282.48771,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Believe (Demo),200,1.54127E+12,95 44 | The Temper Trap,Logged In,Lily,F,6,Koch,192.67873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Fader,200,1.54127E+12,15 45 | La Mosca Tse-Tse,Logged In,Sara,F,24,Johnson,195.082,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Patadas En El Corazon,200,1.54127E+12,95 46 | Cute Is What We Aim For,Logged In,Lily,F,7,Koch,201.22077,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Finger Twist & Split (Album Version),200,1.54127E+12,15 47 | Jack Johnson,Logged In,Sara,F,25,Johnson,236.72118,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Bubble Toes,200,1.54127E+12,95 48 | Siriusmo,Logged In,Lily,F,8,Koch,272.61342,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Simple,200,1.54127E+12,15 49 | Creedence Clearwater Revival,Logged In,Sara,F,26,Johnson,312.89424,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Feelin' Blue,200,1.54127E+12,95 50 | Alberto Plaza,Logged In,Lily,F,9,Koch,271.3073,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,No Seas Cruel (vivo),200,1.54127E+12,15 51 | Brand New,Logged In,Sara,F,27,Johnson,188.49914,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Not The Sun,200,1.54127E+12,95 52 | Sean Lennon,Logged In,Lily,F,10,Koch,202.97098,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Into The Sun,200,1.54127E+12,15 53 | K-OS,Logged In,Sara,F,28,Johnson,211.33016,paid,"Winston-Salem, 
NC",PUT,NextSong,1.54081E+12,152,EMCEE Murdah,200,1.54127E+12,95 54 | Nana Caymmi,Logged In,Lily,F,11,Koch,251.0624,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Acercate Mas (2000 Digital Remaster),200,1.54127E+12,15 55 | Casino Versus Japan,Logged In,Sara,F,29,Johnson,86.30812,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Moonlupe,200,1.54127E+12,95 56 | the bird and the bee,Logged In,Sara,F,30,Johnson,281.23383,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Ray Gun,200,1.54127E+12,95 57 | Skyforger,Logged In,Lily,F,12,Koch,38.94812,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,In the Yard of the Father's Son,200,1.54127E+12,15 58 | Bryan Ferry,Logged In,Lily,F,13,Koch,265.87383,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Slave To Love (1999 Digital Remaster),200,1.54127E+12,15 59 | The Black Keys,Logged In,Sara,F,31,Johnson,196.91057,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,All Hands Against His Own,200,1.54127E+12,95 60 | Foxy Shazam,Logged In,Lily,F,14,Koch,201.74322,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Wanna-be Angel (Album Version),200,1.54127E+12,15 61 | The Ruts,Logged In,Sara,F,32,Johnson,338.96444,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,West One (Shine On Me),200,1.54127E+12,95 62 | Old 97's,Logged In,Lily,F,15,Koch,231.28771,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Victoria (LP Version),200,1.54127E+12,15 63 | Lonnie Gordon,Logged In,Lily,F,16,Koch,181.21098,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Catch You Baby (Steve Pitron & Max Sanna Radio Edit),200,1.54127E+12,15 64 | Bon Iver,Logged In,Sara,F,33,Johnson,220.70812,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,For Emma,200,1.54127E+12,95 65 | Florence + The Machine,Logged In,Lily,F,17,Koch,219.66322,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Dog Days Are 
Over (Radio Edit),200,1.54127E+12,15 66 | Glen Washington,Logged In,Sara,F,34,Johnson,193.82812,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,One Of These Days,200,1.54127E+12,95 67 | Mad Cobra,Logged In,Lily,F,18,Koch,230.68689,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Flex,200,1.54127E+12,15 68 | Dropout Year,Logged In,Sara,F,35,Johnson,227.36934,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,This Notebook,200,1.54127E+12,95 69 | Flogging Molly,Logged In,Lily,F,19,Koch,260.75383,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Punch Drunk Grinning Soul,200,1.54127E+12,15 70 | Young Rebel Set,Logged In,Sara,F,36,Johnson,248.55465,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,If I Was,200,1.54127E+12,95 71 | Sneaker Pimps,Logged In,Lily,F,20,Koch,260.91057,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Spin Spin Sugar,200,1.54127E+12,15 72 | Righteous Brothers,Logged In,Sara,F,37,Johnson,215.90159,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Unchained Melody,200,1.54127E+12,95 73 | Me First And The Gimme Gimmes,Logged In,Sara,F,38,Johnson,64.39138,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Jonny's Blessing,200,1.54127E+12,95 74 | Texas In July,Logged In,Lily,F,21,Koch,190.69342,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Aurora,200,1.54127E+12,15 75 | Alice In Chains,Logged In,Sara,F,39,Johnson,230.60853,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,God Smack,200,1.54127E+12,95 76 | ,Logged In,Sara,F,40,Johnson,,paid,"Winston-Salem, NC",GET,Downgrade,1.54081E+12,152,,200,1.54127E+12,95 77 | 10 Years,Logged In,Lily,F,22,Koch,229.95546,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Wasteland,200,1.54127E+12,15 78 | Juanes,Logged In,Lily,F,23,Koch,233.40363,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,No Siento Penas,200,1.54127E+12,15 79 | ,Logged 
In,Sara,F,41,Johnson,,paid,"Winston-Salem, NC",GET,Downgrade,1.54081E+12,152,,200,1.54127E+12,95 80 | ,Logged In,Sara,F,42,Johnson,,paid,"Winston-Salem, NC",GET,Home,1.54081E+12,152,,200,1.54127E+12,95 81 | Cat Stevens,Logged In,Sara,F,43,Johnson,200.202,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Wild World,200,1.54127E+12,95 82 | Passion Pit,Logged In,Lily,F,24,Koch,174.75873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Sleepyhead,200,1.54127E+12,15 83 | The Temper Trap,Logged In,Sara,F,44,Johnson,192.67873,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Fader,200,1.54127E+12,95 84 | OneRepublic,Logged In,Lily,F,25,Koch,224.67873,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Secrets,200,1.54127E+12,15 85 | Diplo,Logged In,Sara,F,45,Johnson,96.86159,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Florida,200,1.54127E+12,95 86 | Afro Celt Sound System,Logged In,Sara,F,46,Johnson,425.01179,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Even In My Dreams,200,1.54127E+12,95 87 | PeterLicht,Logged In,Lily,F,26,Koch,306.80771,paid,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1.54105E+12,199,Heiterkeit,200,1.54127E+12,15 88 | ,Logged In,Lily,F,27,Koch,,paid,"Chicago-Naperville-Elgin, IL-IN-WI",GET,Downgrade,1.54105E+12,199,,200,1.54127E+12,15 89 | M.A. 
Numminen,Logged In,Sara,F,47,Johnson,166.55628,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,TULENLIEKKI,200,1.54127E+12,95 90 | ,Logged In,Sara,F,48,Johnson,,paid,"Winston-Salem, NC",GET,Home,1.54081E+12,152,,200,1.54127E+12,95 91 | Limp Bizkit,Logged In,Sara,F,49,Johnson,214.5171,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Rollin' (Air Raid Vehicle),200,1.54127E+12,95 92 | Downhere,Logged In,Sara,F,50,Johnson,230.71302,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,In America (Album Version),200,1.54127E+12,95 93 | Roots Manuva,Logged In,Sara,F,51,Johnson,233.92608,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,No Love,200,1.54127E+12,95 94 | Lykke Li,Logged In,Sara,F,52,Johnson,162.19383,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Let It Fall,200,1.54127E+12,95 95 | Plan B,Logged In,Sara,F,53,Johnson,222.82404,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Free,200,1.54127E+12,95 96 | Octopus Project,Logged In,Sara,F,54,Johnson,498.33751,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Hypnopaedia,200,1.54127E+12,95 97 | OneRepublic,Logged In,Sara,F,55,Johnson,224.67873,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Secrets,200,1.54127E+12,95 98 | Gilberto Santa Rosa,Logged In,Sara,F,56,Johnson,272.16934,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,La Sigo Amando Tanto,200,1.54127E+12,95 99 | Eric Clapton,Logged In,Sara,F,57,Johnson,271.80363,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Tears In Heaven,200,1.54127E+12,95 100 | ARRESTED DEVELOPMENT,Logged In,Sara,F,58,Johnson,200.98567,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Fountain Of Youth,200,1.54127E+12,95 101 | ,Logged In,Sara,F,59,Johnson,,paid,"Winston-Salem, NC",GET,Home,1.54081E+12,152,,200,1.54127E+12,95 102 | Future Rock,Logged In,Sara,F,60,Johnson,239.90812,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Gears,200,1.54127E+12,95 103 | Steppenwolf,Logged 
In,Sara,F,61,Johnson,208.14322,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Born To Be Wild,200,1.54127E+12,95 104 | Jason Mraz & Colbie Caillat,Logged In,Sara,F,62,Johnson,189.6224,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,Lucky (Album Version),200,1.54127E+12,95 105 | Leonard Cohen,Logged In,Sara,F,63,Johnson,298.57914,paid,"Winston-Salem, NC",PUT,NextSong,1.54081E+12,152,The Stranger Song,200,1.54127E+12,95 106 | Bright Eyes,Logged In,Ayla,F,0,Johnson,279.09179,free,"Santa Rosa, CA",PUT,NextSong,1.54088E+12,135,Kathy with a K's Song,200,1.54128E+12,63 107 | ,Logged In,Chloe,F,0,Cuevas,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54094E+12,195,,200,1.54128E+12,49 108 | The Smiths,Logged In,Chloe,F,1,Cuevas,196.67546,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,195,The Boy With The Thorn In His Side,200,1.54128E+12,49 109 | Quique Gonzalez,Logged In,Chloe,F,2,Cuevas,214.20363,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,195,Cuando Eramos Reyes,200,1.54128E+12,49 110 | Muse,Logged In,Chloe,F,3,Cuevas,210.46812,free,"San Francisco-Oakland-Hayward, CA",PUT,NextSong,1.54094E+12,195,Pink Ego Box,200,1.54128E+12,49 111 | Sugarland,Logged In,Cecilia,F,0,Owens,247.77098,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,5,Just Might (Make Me Believe),200,1.54129E+12,6 112 | A Hope For Home,Logged In,Cecilia,F,1,Owens,388.38812,free,"Atlanta-Sandy Springs-Roswell, GA",PUT,NextSong,1.54103E+12,5,Absolution: Of Flight and Failure,200,1.54129E+12,6 113 | -------------------------------------------------------------------------------- /nosql_db_modeling_apache_cassandra/nosql_db_modeling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part I. 
ETL Pipeline for Pre-Processing the Files" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Import Python packages " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Import Python packages \n", 24 | "import pandas as pd\n", 25 | "import cassandra\n", 26 | "import re\n", 27 | "import os\n", 28 | "import glob\n", 29 | "import numpy as np\n", 30 | "import json\n", 31 | "import csv" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "#### Creating list of filepaths to process original event csv data files" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "/Users/danieldiamond/Desktop/DEND_gdrive/DEND/project2_nosql_db_cassandra\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "# checking current working directory\n", 56 | "print(os.getcwd())\n", 57 | "\n", 58 | "# Get path to subfolder event data\n", 59 | "filepath = os.getcwd() + '/event_data'\n", 60 | "\n", 61 | "fnames = [os.path.join(filepath, f) for f in os.listdir(filepath) if 'csv' in f]" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "#### Processing the files to create the data file csv that will be used for Apache Cassandra tables" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# initiating an empty list of rows that will be generated from each file\n", 78 | "full_data_rows_list = [] \n", 79 | " \n", 80 | "# for every filepath in the fnames list \n", 81 | "for f in fnames:\n", 82 | "\n", 83 | " # reading csv file as dataframe\n", 84 | " df = pd.read_csv(f)\n", 85 | " \n", 86 | " # extracting each data row one by one and append it \n", 87 | " for
index, row in df.iterrows():\n", 88 | " full_data_rows_list.append(row.tolist()) \n", 89 | " \n", 90 | "# creating a smaller event data csv file called event_datafile_new.csv that will be used to insert data into the \\\n", 91 | "# Apache Cassandra tables\n", 92 | "df = pd.DataFrame(full_data_rows_list)[[0, 2, 3, 4, 5, 6, 7, 8, 12, 13, 16]]\n", 93 | "df.columns = ['artist','firstName','gender','itemInSession','lastName','length',\\\n", 94 | " 'level','location','sessionId','song','userId']\n", 95 | "\n", 96 | "# Drop any missing artists and save to csv file\n", 97 | "df[df['artist'].notnull()].to_csv('event_datafile_new.csv', index=False)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "6820" 109 | ] 110 | }, 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "# check the number of rows in your csv file\n", 118 | "pd.read_csv('event_datafile_new.csv').shape[0]" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "# Part II. Complete the Apache Cassandra coding portion of your project. \n", 126 | "\n", 127 | "## Now you are ready to work with the CSV file titled event_datafile_new.csv, located within the Workspace directory. The event_datafile_new.csv contains the following columns: \n", 128 | "- artist \n", 129 | "- firstName of user\n", 130 | "- gender of user\n", 131 | "- item number in session\n", 132 | "- last name of user\n", 133 | "- length of the song\n", 134 | "- level (paid or free song)\n", 135 | "- location of the user\n", 136 | "- sessionId\n", 137 | "- song title\n", 138 | "- userId\n", 139 | "\n", 140 | "The image below is a screenshot of what the denormalized data should look like in the **event_datafile_new.csv** after the code above is run:
\n", 141 | "\n", 142 | "" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "## Begin writing your Apache Cassandra code in the cells below" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "#### Creating a Cluster" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "from cassandra.cluster import Cluster\n", 166 | "cluster = Cluster(['127.0.0.1'])\n", 167 | "\n", 168 | "# To establish connection and begin executing queries, need a session\n", 169 | "session = cluster.connect()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "#### Create Keyspace" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 8, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "try:\n", 186 | " session.execute(\"\"\"\n", 187 | " CREATE KEYSPACE IF NOT EXISTS sparkifydb\n", 188 | " WITH REPLICATION =\n", 189 | " { 'class' : 'SimpleStrategy',\n", 190 | " 'replication_factor' : 1 }\"\"\"\n", 191 | " )\n", 192 | "except Exception as e:\n", 193 | " print (e)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "#### Set Keyspace" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 9, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "try:\n", 210 | " session.set_keyspace('sparkifydb')\n", 211 | "except Exception as e:\n", 212 | " print (e)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "### Now we need to create tables to run the following queries. Remember, with Apache Cassandra you model the database tables on the queries you want to run." 
220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "#### Query 1\n", 227 | "Return the artist, song title and song's length in the music app history that was heard during sessionId = 338, and itemInSession = 4\n", 228 | "- Partition Key: sessionId\n", 229 | "- Cluster Column: itemInSession\n", 230 | "- Composite Primary Key: (sessionId, itemInSession)\n", 231 | "- Additional Columns: artist, song, length" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 10, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "# Create Table\n", 241 | "query = \"CREATE TABLE IF NOT EXISTS session_library \"\n", 242 | "query = query + \"(sessionId int, itemInSession int, \\\n", 243 | " artist varchar, song varchar, length float, \\\n", 244 | " PRIMARY KEY (sessionId, itemInSession))\"\n", 245 | "try:\n", 246 | " session.execute(query)\n", 247 | "except Exception as e:\n", 248 | " print (e)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 11, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# Insert Data from CSV\n", 258 | "filename = 'event_datafile_new.csv'\n", 259 | "\n", 260 | "df = pd.read_csv(filename)\n", 261 | "\n", 262 | "for index, row in df.iterrows():\n", 263 | " query = \"INSERT INTO session_library \\\n", 264 | " (sessionId, itemInSession, artist, song, length)\"\n", 265 | " query = query + \"VALUES (%s, %s, %s, %s, %s)\"\n", 266 | " session.execute(query, (int(row.sessionId), int(row.itemInSession),\n", 267 | " row.artist, row.song, float(row.length)))" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "(u'Faithless', u'Music Matters (Mark Knight Dub)', 495.30731201171875)\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "# Select data according to query 1\n", 285 | "\n", 
286 | "query = \"\"\"SELECT artist, song, length from session_library \n", 287 | " WHERE sessionId = 338 and itemInSession = 4 ;\"\"\"\n", 288 | "try:\n", 289 | " rows = session.execute(query)\n", 290 | "except Exception as e:\n", 291 | " print (e)\n", 292 | " \n", 293 | "for row in rows:\n", 294 | " print (row.artist, row.song, row.length)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 13, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "# Drop Table\n", 304 | "session.execute('DROP TABLE session_library');" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "### Query 2\n", 312 | "Return only the following: name of artist, song (sorted by itemInSession) and user (first and last name) for userid = 10, sessionid = 182\n", 313 | "- Partition Key: userId\n", 314 | "- Cluster Column: sessionId, itemInSession\n", 315 | "- Composite Primary Key: (userId, sessionId, itemInSession)\n", 316 | "- Additional Columns: firstName, lastName, artist, song" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 14, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "# Create Table\n", 326 | "query = \"CREATE TABLE IF NOT EXISTS song_playlist_session \"\n", 327 | "query = query + \"(userId int, sessionId int, itemInSession int, \\\n", 328 | " firstName varchar, lastName varchar, \\\n", 329 | " artist varchar, song varchar, PRIMARY KEY \\\n", 330 | " (userId, sessionId, itemInSession))\"\n", 331 | "try:\n", 332 | " session.execute(query)\n", 333 | "except Exception as e:\n", 334 | " print (e)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 15, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Insert Data From Previous Dataframe (csv)\n", 344 | "for index, row in df.iterrows():\n", 345 | " query = \"INSERT INTO song_playlist_session \\\n", 346 | " (userId, sessionId, itemInSession, \\\n", 347 | " 
firstName, lastName, artist, song) \"\n", 348 | " query = query + \"VALUES (%s, %s, %s, %s, %s, %s, %s)\"\n", 349 | " session.execute(query, (int(row.userId), int(row.sessionId), \n", 350 | " int(row.itemInSession), row.firstName, \n", 351 | " row.lastName, row.artist, row.song))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 16, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "(u'Down To The Bone', u\"Keep On Keepin' On\", u'Sylvie', u'Cruz')\n", 364 | "(u'Three Drives', u'Greece 2000', u'Sylvie', u'Cruz')\n", 365 | "(u'Sebastien Tellier', u'Kilometer', u'Sylvie', u'Cruz')\n", 366 | "(u'Lonnie Gordon', u'Catch You Baby (Steve Pitron & Max Sanna Radio Edit)', u'Sylvie', u'Cruz')\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "# Select data according to query 2\n", 372 | "query = \"\"\"SELECT artist, song, firstName, lastName from song_playlist_session \n", 373 | " WHERE userId = 10 and sessionId = 182 ;\"\"\"\n", 374 | "try:\n", 375 | " rows = session.execute(query)\n", 376 | "except Exception as e:\n", 377 | " print (e)\n", 378 | " \n", 379 | "for row in rows:\n", 380 | " print (row.artist, row.song, row.firstname, row.lastname)" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 17, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "# Drop Table\n", 390 | "session.execute('DROP TABLE song_playlist_session');" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "### Query 3\n", 398 | "Return every user name (first and last) in my music app history who listened to the song 'All Hands Against His Own'\n", 399 | "- Partition Key: song\n", 400 | "- Cluster Column: userId\n", 401 | "- Composite Primary Key: (song, userId)\n", 402 | "- Additional Columns: firstName, lastName" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 18, 408 | 
"metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "# Create Table\n", 412 | "query = \"CREATE TABLE IF NOT EXISTS user_songchoice_table \"\n", 413 | "query = query + \"(song varchar, userId int, \\\n", 414 | " firstName varchar, lastName varchar, \\\n", 415 | " PRIMARY KEY (song, userId))\"\n", 416 | "try:\n", 417 | " session.execute(query)\n", 418 | "except Exception as e:\n", 419 | " print (e)\n", 420 | " " 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 19, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "# Insert Data From Previous Dataframe (csv)\n", 430 | "for index, row in df.iterrows():\n", 431 | " query = \"INSERT INTO user_songchoice_table \\\n", 432 | " (song, userId, \\\n", 433 | " firstName, lastName) \"\n", 434 | " query = query + \"VALUES (%s, %s, %s, %s)\"\n", 435 | " session.execute(query, (row.song, int(row.userId), \n", 436 | " row.firstName, row.lastName))" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 20, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "name": "stdout", 446 | "output_type": "stream", 447 | "text": [ 448 | "(u'Jacqueline', u'Lynch')\n", 449 | "(u'Tegan', u'Levine')\n", 450 | "(u'Sara', u'Johnson')\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "# Select data according to query 3\n", 456 | "query = \"\"\"SELECT firstName, lastName from user_songchoice_table\n", 457 | " WHERE song = 'All Hands Against His Own';\"\"\"\n", 458 | "try:\n", 459 | " rows = session.execute(query)\n", 460 | "except Exception as e:\n", 461 | " print (e)\n", 462 | " \n", 463 | "for row in rows:\n", 464 | " print (row.firstname, row.lastname)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 21, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "# Drop Table\n", 474 | "session.execute('DROP TABLE user_songchoice_table');" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 22, 480 | 
"metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "# Close session and cluster connection\n", 484 | "session.shutdown()\n", 485 | "cluster.shutdown()" 486 | ] 487 | } 488 | ], 489 | "metadata": { 490 | "kernelspec": { 491 | "display_name": "Python 3", 492 | "language": "python", 493 | "name": "python3" 494 | }, 495 | "language_info": { 496 | "codemirror_mode": { 497 | "name": "ipython", 498 | "version": 2 499 | }, 500 | "file_extension": ".py", 501 | "mimetype": "text/x-python", 502 | "name": "python", 503 | "nbconvert_exporter": "python", 504 | "pygments_lexer": "ipython2", 505 | "version": "2.7.8" 506 | } 507 | }, 508 | "nbformat": 4, 509 | "nbformat_minor": 2 510 | } 511 | --------------------------------------------------------------------------------