├── .gitignore ├── .idea ├── Data-engineering-nanodegree.iml ├── markdown-navigator.xml ├── markdown-navigator │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── vcs.xml └── workspace.xml ├── .vscode └── settings.json ├── 1_dend_data_modeling ├── Data Modeling.pdf ├── P1_Postgres_Data_Modeling_and_ETL │ ├── README.md │ ├── create_tables.py │ ├── data │ │ ├── log_data │ │ │ └── 2018 │ │ │ │ └── 11 │ │ │ │ ├── 2018-11-01-events.json │ │ │ │ ├── 2018-11-02-events.json │ │ │ │ ├── 2018-11-03-events.json │ │ │ │ ├── 2018-11-04-events.json │ │ │ │ ├── 2018-11-05-events.json │ │ │ │ ├── 2018-11-06-events.json │ │ │ │ ├── 2018-11-07-events.json │ │ │ │ ├── 2018-11-08-events.json │ │ │ │ ├── 2018-11-09-events.json │ │ │ │ ├── 2018-11-10-events.json │ │ │ │ ├── 2018-11-11-events.json │ │ │ │ ├── 2018-11-12-events.json │ │ │ │ ├── 2018-11-13-events.json │ │ │ │ ├── 2018-11-14-events.json │ │ │ │ ├── 2018-11-15-events.json │ │ │ │ ├── 2018-11-16-events.json │ │ │ │ ├── 2018-11-17-events.json │ │ │ │ ├── 2018-11-18-events.json │ │ │ │ ├── 2018-11-19-events.json │ │ │ │ ├── 2018-11-20-events.json │ │ │ │ ├── 2018-11-21-events.json │ │ │ │ ├── 2018-11-22-events.json │ │ │ │ ├── 2018-11-23-events.json │ │ │ │ ├── 2018-11-24-events.json │ │ │ │ ├── 2018-11-25-events.json │ │ │ │ ├── 2018-11-26-events.json │ │ │ │ ├── 2018-11-27-events.json │ │ │ │ ├── 2018-11-28-events.json │ │ │ │ ├── 2018-11-29-events.json │ │ │ │ └── 2018-11-30-events.json │ │ └── song_data │ │ │ └── A │ │ │ ├── A │ │ │ ├── A │ │ │ │ ├── TRAAAAW128F429D538.json │ │ │ │ ├── TRAAABD128F429CF47.json │ │ │ │ ├── TRAAADZ128F9348C2E.json │ │ │ │ ├── TRAAAEF128F4273421.json │ │ │ │ ├── TRAAAFD128F92F423A.json │ │ │ │ ├── TRAAAMO128F1481E7F.json │ │ │ │ ├── TRAAAMQ128F1460CD3.json │ │ │ │ ├── TRAAAPK128E0786D96.json │ │ │ │ ├── TRAAARJ128F9320760.json │ │ │ │ ├── TRAAAVG12903CFA543.json │ │ │ │ └── TRAAAVO128F93133D4.json │ │ │ ├── B │ │ │ │ ├── TRAABCL128F4286650.json │ │ │ │ ├── TRAABDL12903CAABBA.json │ │ │ │ ├── TRAABJL12903CDCF1A.json │ │ │ │ ├── TRAABJV128F1460C49.json │ │ │ │ ├── TRAABLR128F423B7E3.json │ │ │ │ ├── TRAABNV128F425CEE1.json │ │ │ │ ├── TRAABRB128F9306DD5.json │ │ │ │ ├── TRAABVM128F92CA9DC.json │ │ │ │ ├── TRAABXG128F9318EBD.json │ │ │ │ ├── TRAABYN12903CFD305.json │ │ │ │ └── TRAABYW128F4244559.json │ │ │ └── C │ │ │ │ ├── TRAACCG128F92E8A55.json │ │ │ │ ├── TRAACER128F4290F96.json │ │ │ │ ├── TRAACFV128F935E50B.json │ │ │ │ ├── TRAACHN128F1489601.json │ │ │ │ ├── TRAACIW12903CC0F6D.json │ │ │ │ ├── TRAACLV128F427E123.json │ │ │ │ ├── TRAACNS128F14A2DF5.json │ │ │ │ ├── TRAACOW128F933E35F.json │ │ │ │ ├── TRAACPE128F421C1B9.json │ │ │ │ ├── TRAACQT128F9331780.json │ │ │ │ ├── TRAACSL128F93462F4.json │ │ │ │ ├── TRAACTB12903CAAF15.json │ │ │ │ ├── TRAACVS128E078BE39.json │ │ │ │ └── TRAACZK128F4243829.json │ │ │ └── B │ │ │ ├── A │ │ │ ├── TRABACN128F425B784.json │ │ │ ├── TRABAFJ128F42AF24E.json │ │ │ ├── TRABAFP128F931E9A1.json │ │ │ ├── TRABAIO128F42938F9.json │ │ │ ├── TRABATO128F42627E9.json │ │ │ ├── TRABAVQ12903CBF7E0.json │ │ │ ├── TRABAWW128F4250A31.json │ │ │ ├── TRABAXL128F424FC50.json │ │ │ ├── TRABAXR128F426515F.json │ │ │ ├── TRABAXV128F92F6AE3.json │ │ │ └── TRABAZH128F930419A.json │ │ │ ├── B │ │ │ ├── TRABBAM128F429D223.json │ │ │ ├── TRABBBV128F42967D7.json │ │ │ ├── TRABBJE12903CDB442.json │ │ │ ├── TRABBKX128F4285205.json │ │ │ ├── TRABBLU128F93349CF.json │ │ │ ├── TRABBNP128F932546F.json │ │ │ ├── TRABBOP128F931B50D.json │ │ │ ├── TRABBOR128F4286200.json │ │ │ ├── TRABBTA128F933D304.json │ │ 
│ ├── TRABBVJ128F92F7EAA.json │ │ │ ├── TRABBXU128F92FEF48.json │ │ │ └── TRABBZN12903CD9297.json │ │ │ └── C │ │ │ ├── TRABCAJ12903CDFCC2.json │ │ │ ├── TRABCEC128F426456E.json │ │ │ ├── TRABCEI128F424C983.json │ │ │ ├── TRABCFL128F149BB0D.json │ │ │ ├── TRABCIX128F4265903.json │ │ │ ├── TRABCKL128F423A778.json │ │ │ ├── TRABCPZ128F4275C32.json │ │ │ ├── TRABCRU128F423F449.json │ │ │ ├── TRABCTK128F934B224.json │ │ │ ├── TRABCUQ128E0783E2B.json │ │ │ ├── TRABCXB128F4286BD3.json │ │ │ └── TRABCYE128F934CE1D.json │ ├── etl.ipynb │ ├── etl.py │ ├── sql_queries.py │ └── test.ipynb ├── P2_Cassandra_Data_Modeling_and_ETL │ ├── Project_1B_ Project_Template.ipynb │ ├── event_data │ │ ├── 2018-11-01-events.csv │ │ ├── 2018-11-02-events.csv │ │ ├── 2018-11-03-events.csv │ │ ├── 2018-11-04-events.csv │ │ ├── 2018-11-05-events.csv │ │ ├── 2018-11-06-events.csv │ │ ├── 2018-11-07-events.csv │ │ ├── 2018-11-08-events.csv │ │ ├── 2018-11-09-events.csv │ │ ├── 2018-11-10-events.csv │ │ ├── 2018-11-11-events.csv │ │ ├── 2018-11-12-events.csv │ │ ├── 2018-11-13-events.csv │ │ ├── 2018-11-14-events.csv │ │ ├── 2018-11-15-events.csv │ │ ├── 2018-11-16-events.csv │ │ ├── 2018-11-17-events.csv │ │ ├── 2018-11-18-events.csv │ │ ├── 2018-11-19-events.csv │ │ ├── 2018-11-20-events.csv │ │ ├── 2018-11-21-events.csv │ │ ├── 2018-11-22-events.csv │ │ ├── 2018-11-23-events.csv │ │ ├── 2018-11-24-events.csv │ │ ├── 2018-11-25-events.csv │ │ ├── 2018-11-26-events.csv │ │ ├── 2018-11-27-events.csv │ │ ├── 2018-11-28-events.csv │ │ ├── 2018-11-29-events.csv │ │ └── 2018-11-30-events.csv │ ├── event_datafile_new.csv │ └── images │ │ └── image_event_datafile_new.jpg └── notebooks │ ├── L1-D0-creating-a-table-with-postgres.ipynb │ ├── L1-D1-creating-a-table-with-postgres.ipynb │ ├── L1-D2-creating-a-table-with-apache-cassandra.ipynb │ ├── L2-D1-creating-normalized-tables.ipynb │ ├── L2-D2-creating-denormalized-tables.ipynb │ ├── L2-D3-creating-fact-and-dimension-tables-with-star-schema.ipynb │ ├── L3-D1-2-queries-2-tables.ipynb │ ├── L3-D2-primary-key.ipynb │ ├── L3-D3-clustering-column.ipynb │ └── L3-D4-using-the-where-clause.ipynb ├── 2_dend_cloud_data_warehouses ├── Data warehousing in the cloud.pdf ├── P3_Data_Warehouse_Project │ ├── .vscode │ │ └── settings.json │ ├── README.md │ ├── analytics.py │ ├── create_cluster.py │ ├── create_tables.py │ ├── etl.py │ ├── requirements.txt │ └── sql_queries.py ├── infrastructure_as_code.py ├── log-data.csv │ └── log_data.csv ├── notebooks │ ├── Data │ │ ├── README │ │ ├── pagila-data.sql │ │ ├── pagila-insert-data.sql │ │ └── pagila-schema.sql │ ├── L1 E1 - Step 1 and 2.ipynb │ ├── L1 E1 - Step 3.ipynb │ ├── L1 E1 - Step 4.ipynb │ ├── L1 E1 - Step 5.ipynb │ ├── L1 E1 - Step 6.ipynb │ ├── L1 E2 - 1 - Slicing and Dicing.ipynb │ ├── L1 E2 - 2 - Roll up and Drill Down.ipynb │ ├── L1 E2 - 3 - Grouping Sets.ipynb │ ├── L1 E2 - 4 - CUBE.ipynb │ ├── L1 E3 - Columnar Vs Row Storage.ipynb │ ├── L3 Exercise 2 - IaC - Solution.ipynb │ ├── L3 Exercise 2 - IaC - Solution.py │ ├── L3 Exercise 3 - Parallel ETL - Solution.ipynb │ ├── L3 Exercise 3 - Parallel ETL - Solution.py │ ├── L3 Exercise 4 - Table Design - Solution.ipynb │ ├── L3 Exercise 4 - Table Design - Solution.py │ └── pagila-star.png └── notes │ └── AWS.md ├── 3_dend_spark_data_lakes ├── Data Lakes with Spark.pdf ├── P4_Data_Lake │ ├── README.md │ └── etl.py ├── data │ ├── log-data.png │ ├── log-data.zip │ ├── song-data.zip │ └── sparkify_log_small.json ├── notebooks │ ├── 1_procedural_vs_functional_in_python.ipynb │ ├── 
2_spark_maps_and_lazy_evaluation.ipynb │ ├── 3_data_inputs_and_outputs.ipynb │ ├── 4_data_wrangling.ipynb │ ├── 5_dataframe_quiz.ipynb │ ├── 6_dataframe_quiz_solution.ipynb │ ├── 7_data_wrangling-sql.ipynb │ ├── 8_spark_sql_quiz.ipynb │ ├── 9_spark_sql_quiz_solution.ipynb │ ├── Exercise 1 - Schema On Read.ipynb │ ├── Exercise 2 - Advanced Analytics NLP.ipynb │ ├── Exercise 3 - Data Lake on S3.ipynb │ └── mapreduce_practice.ipynb └── spark.md ├── 4_dend_airflow_data_pipelines ├── P5_Data_Pipelines │ ├── README.md │ ├── __init__.py │ ├── airflow.db │ ├── dags │ │ ├── __init__.py │ │ ├── sparkify_dend_dag.py │ │ └── sparkify_dend_dimesions_subdag.py │ ├── imgs │ │ ├── airflow-details-dag.png │ │ ├── airflow-running-dag.png │ │ ├── dag-code.png │ │ └── dag.png │ └── plugins │ │ ├── __init__.py │ │ ├── helpers │ │ ├── __init__.py │ │ └── sql_queries.py │ │ └── operators │ │ ├── __init__.py │ │ ├── create_tables.py │ │ ├── create_tables.sql │ │ ├── data_quality.py │ │ ├── load_dimension.py │ │ ├── load_fact.py │ │ └── stage_redshift.py ├── data_pipelines.md ├── exercises │ ├── __init__.py │ ├── dags │ │ ├── 1_ex1_hello_world.py │ │ ├── 1_ex2_scheduler.py │ │ ├── 1_ex3_dependencies.py │ │ ├── 1_ex4_connections.py │ │ ├── 1_ex5_context.py │ │ ├── 1_ex6_redshift_queries.py │ │ ├── 2_ex1_data_lineage.py │ │ ├── 2_ex2_schedule_backfilling.py │ │ ├── 2_ex3_data_partitioning.py │ │ ├── 2_ex4_data_quality.py │ │ ├── 3_ex1_plugins.py │ │ ├── 3_ex2_refactoring.py │ │ ├── 3_ex3_subdags │ │ │ ├── __init__.py │ │ │ ├── dag.py │ │ │ └── subdag.py │ │ ├── 3_ex4_full_dag.py │ │ ├── __init__.py │ │ └── sql_statements.py │ └── plugins │ │ ├── __init__.py │ │ └── operators │ │ ├── __init__.py │ │ ├── facts_calculator.py │ │ ├── has_rows.py │ │ └── s3_to_redshift.py └── glossary-data-pipelines-in-airflow.pdf ├── DEND.code-workspace ├── LICENSE ├── README.md ├── _config.yml ├── cheatsheets ├── Data-Science-Books-for-2018.pdf ├── Pandas DataFrame Notes.pdf ├── Pandas_Cheat_Sheet.pdf └── linux cheatsheet.jpg ├── data-engineering.jpg └── environment.yml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.DS_Store 6 | .idea/* 7 | 8 | # C extensions 9 | *.so 10 | *.cfg 11 | .idea/ 12 | .vscode/ 13 | credentials.csv 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | -------------------------------------------------------------------------------- /.idea/Data-engineering-nanodegree.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/markdown-navigator.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 36 | 37 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /.idea/markdown-navigator/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/f.silvestre/anaconda3/envs/pasi-server/bin/python" 3 | } -------------------------------------------------------------------------------- /1_dend_data_modeling/Data Modeling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/1_dend_data_modeling/Data Modeling.pdf -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/create_tables.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from sql_queries import create_table_queries, 
drop_table_queries 3 | 4 | 5 | def create_database(): 6 | '''Creates and connects to sparkifydb database. Returns cursor and connection to DB''' 7 | # connect to default database 8 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student") 9 | conn.set_session(autocommit=True) 10 | cur = conn.cursor() 11 | 12 | # create sparkify database with UTF8 encoding 13 | cur.execute("DROP DATABASE IF EXISTS sparkifydb") 14 | cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0") 15 | 16 | # close connection to default database 17 | conn.close() 18 | 19 | # connect to sparkify database 20 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 21 | cur = conn.cursor() 22 | 23 | return cur, conn 24 | 25 | 26 | def drop_tables(cur, conn): 27 | '''Drops all tables created on the database''' 28 | for query in drop_table_queries: 29 | cur.execute(query) 30 | conn.commit() 31 | 32 | 33 | def create_tables(cur, conn): 34 | '''Created tables defined on the sql_queries script: [songplays, users, songs, artists, time]''' 35 | for query in create_table_queries: 36 | cur.execute(query) 37 | conn.commit() 38 | 39 | 40 | def main(): 41 | """ Function to drop and re create sparkifydb database and all related tables. 42 | Usage: python create_tables.py 43 | """ 44 | cur, conn = create_database() 45 | 46 | drop_tables(cur, conn) 47 | create_tables(cur, conn) 48 | 49 | conn.close() 50 | 51 | 52 | if __name__ == "__main__": 53 | main() -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAAW128F429D538.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOMZWCG12A8C13C480", "title": "I Didn't Mean To", "duration": 218.93179, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAABD128F429CF47.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMJAGH1187FB546F3", "artist_latitude": 35.14968, "artist_longitude": -90.04892, "artist_location": "Memphis, TN", "artist_name": "The Box Tops", "song_id": "SOCIWDW12A8C13D406", "title": "Soul Deep", "duration": 148.03546, "year": 1969} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAADZ128F9348C2E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKRRTF1187B9984DA", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sonora Santanera", "song_id": "SOXVLOJ12AB0189215", "title": "Amor De Cabaret", "duration": 177.47546, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAEF128F4273421.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7G5I41187FB4CE6C", "artist_latitude": null, "artist_longitude": null, "artist_location": "London, England", "artist_name": "Adam Ant", 
"song_id": "SONHOTT12A8C13493C", "title": "Something Girls", "duration": 233.40363, "year": 1982} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAFD128F92F423A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARXR32B1187FB57099", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gob", "song_id": "SOFSOCN12A8C143F5D", "title": "Face the Ashes", "duration": 209.60608, "year": 2007} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAMO128F1481E7F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKFYS91187B98E58F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Jeff And Sheri Easter", "song_id": "SOYMRWW12A6D4FAB14", "title": "The Moon And I (Ordinary Day Album Version)", "duration": 267.7024, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAMQ128F1460CD3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD0S291187B9B7BF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "Ohio", "artist_name": "Rated R", "song_id": "SOMJBYD12A6D4F8557", "title": "Keepin It Real (Skit)", "duration": 114.78159, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAPK128E0786D96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR10USD1187B99F3F1", "artist_latitude": null, "artist_longitude": null, "artist_location": "Burlington, Ontario, Canada", "artist_name": "Tweeterfriendly Music", "song_id": "SOHKNRJ12A6701D1F8", "title": "Drop of Rain", "duration": 189.57016, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAARJ128F9320760.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8ZCNI1187B9A069B", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Planet P Project", "song_id": "SOIAZJW12AB01853F1", "title": "Pink World", "duration": 269.81832, "year": 1984} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAVG12903CFA543.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOUDSGM12AC9618304", "title": "Insatiable (Instrumental Version)", "duration": 266.39628, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAVO128F93133D4.json: 
-------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGSJW91187B9B1D6B", "artist_latitude": 35.21962, "artist_longitude": -80.01955, "artist_location": "North Carolina", "artist_name": "JennyAnyKind", "song_id": "SOQHXMF12AB0182363", "title": "Young Boy Blues", "duration": 218.77506, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABCL128F4286650.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARC43071187B990240", "artist_latitude": null, "artist_longitude": null, "artist_location": "Wisner, LA", "artist_name": "Wayne Watson", "song_id": "SOKEJEJ12A8C13E0D0", "title": "The Urgency (LP Version)", "duration": 245.21098, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABDL12903CAABBA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARL7K851187B99ACD2", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Andy Andy", "song_id": "SOMUYGI12AB0188633", "title": "La Culpa", "duration": 226.35057, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABJL12903CDCF1A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARHHO3O1187B989413", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Bob Azzam", "song_id": "SORAMLE12AB017C8B0", "title": "Auguri Cha Cha", "duration": 191.84281, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABJV128F1460C49.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIK43K1187B9AE54C", "artist_latitude": null, "artist_longitude": null, "artist_location": "Beverly Hills, CA", "artist_name": "Lionel Richie", "song_id": "SOBONFF12A6D4F84D8", "title": "Tonight Will Be Alright", "duration": 307.3824, "year": 1986} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABLR128F423B7E3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD842G1187B997376", "artist_latitude": 43.64856, "artist_longitude": -79.38533, "artist_location": "Toronto, Ontario, Canada", "artist_name": "Blue Rodeo", "song_id": "SOHUOAP12A8AE488E9", "title": "Floating", "duration": 491.12771, "year": 1987} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABNV128F425CEE1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARIG6O41187B988BDD", "artist_latitude": 37.16793, "artist_longitude": -95.84502, "artist_location": "United States", "artist_name": "Richard Souther", "song_id": "SOUQQEA12A8C134B1B", "title": "High Tide", "duration": 228.5971, "year": 
0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABRB128F9306DD5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1ZHYZ1187FB3C717", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Faiz Ali Faiz", "song_id": "SOILPQQ12AB017E82A", "title": "Sohna Nee Sohna Data", "duration": 599.24853, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABVM128F92CA9DC.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARYKCQI1187FB3B18F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Tesla", "song_id": "SOXLBJT12A8C140925", "title": "Caught In A Dream", "duration": 290.29832, "year": 2004} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABXG128F9318EBD.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNPAGP1241B9C7FD4", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "lextrical", "song_id": "SOZVMJI12AB01808AF", "title": "Synthetic Dream", "duration": 165.69424, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABYN12903CFD305.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQGYP71187FB44566", "artist_latitude": 34.31109, "artist_longitude": -94.02978, "artist_location": "Mineola, AR", "artist_name": "Jimmy Wakely", "song_id": "SOWTBJW12AC468AC6E", "title": "Broken-Down Merry-Go-Round", "duration": 151.84934, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABYW128F4244559.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARI3BMM1187FB4255E", "artist_latitude": 38.8991, "artist_longitude": -77.029, "artist_location": "Washington", "artist_name": "Alice Stuart", "song_id": "SOBEBDG12A58A76D60", "title": "Kassie Jones", "duration": 220.78649, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACCG128F92E8A55.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR5KOSW1187FB35FF4", "artist_latitude": 49.80388, "artist_longitude": 15.47491, "artist_location": "Dubai UAE", "artist_name": "Elena", "song_id": "SOZCTXZ12AB0182364", "title": "Setanta matins", "duration": 269.58322, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACER128F4290F96.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMAC4T1187FB3FA4C", "artist_latitude": 40.82624, "artist_longitude": 
-74.47995, "artist_location": "Morris Plains, NJ", "artist_name": "The Dillinger Escape Plan", "song_id": "SOBBUGU12A8C13E95D", "title": "Setting Fire to Sleeping Giants", "duration": 207.77751, "year": 2004} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACFV128F935E50B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR47JEX1187B995D81", "artist_latitude": 37.83721, "artist_longitude": -94.35868, "artist_location": "Nevada, MO", "artist_name": "SUE THOMPSON", "song_id": "SOBLGCN12AB0183212", "title": "James (Hold The Ladder Steady)", "duration": 124.86485, "year": 1985} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACHN128F1489601.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGIWFO1187B9B55B7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Five Bolt Main", "song_id": "SOPSWQW12A6D4F8781", "title": "Made Like This (Live)", "duration": 225.09669, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACIW12903CC0F6D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOZQDIU12A58A7BCF6", "title": "Superconfidential", "duration": 338.31138, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACLV128F427E123.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARDNS031187B9924F0", "artist_latitude": 32.67828, "artist_longitude": -83.22295, "artist_location": "Georgia", "artist_name": "Tim Wilson", "song_id": "SONYPOM12A8C13B2D7", "title": "I Think My Wife Is Running Around On Me (Taco Hell)", "duration": 186.48771, "year": 2005} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACNS128F14A2DF5.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROUOZZ1187B9ABE51", "artist_latitude": 40.79195, "artist_longitude": -73.94512, "artist_location": "New York, NY [Spanish Harlem]", "artist_name": "Willie Bobo", "song_id": "SOBZBAZ12A6D4F8742", "title": "Spanish Grease", "duration": 168.25424, "year": 1997} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACOW128F933E35F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARH4Z031187B9A71F2", "artist_latitude": 40.73197, "artist_longitude": -74.17418, "artist_location": "Newark, NJ", "artist_name": "Faye Adams", "song_id": "SOVYKGO12AB0187199", "title": "Crazy Mixed Up World", "duration": 156.39465, "year": 1961} 
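
The song-metadata files in this directory all share the same flat, single-line JSON schema (num_songs, artist_id, artist latitude/longitude/location, artist_name, song_id, title, duration, year). As a quick illustration — a minimal sketch that is not part of the repository — the snippet below loads one of the files listed above using the same `pd.read_json(..., lines=True)` call that `etl.py` (further down) relies on, and separates the fields that end up in the `songs` and `artists` dimension tables.

```python
import pandas as pd

# Minimal sketch: read one of the song-metadata files shown above.
# Each file holds a single JSON object on one line, which is why
# etl.py reads it with lines=True.
df = pd.read_json("data/song_data/A/A/C/TRAACOW128F933E35F.json", lines=True)
record = df.iloc[0]

# Fields destined for the songs dimension table
song_fields = record[["song_id", "title", "artist_id", "year", "duration"]].tolist()

# Fields destined for the artists dimension table
artist_fields = record[
    ["artist_id", "artist_name", "artist_location",
     "artist_latitude", "artist_longitude"]
].tolist()

print(song_fields)
print(artist_fields)
```

Selecting the fields by column name, as above, avoids depending on whatever column order `pd.read_json` happens to produce, which positional unpacking (the approach used in `etl.py`) implicitly relies on.
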
-------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACPE128F421C1B9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARB29H41187B98F0EF", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago", "artist_name": "Terry Callier", "song_id": "SOGNCJP12A58A80271", "title": "Do You Finally Need A Friend", "duration": 342.56934, "year": 1972} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACQT128F9331780.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR1Y2PT1187FB5B9CE", "artist_latitude": 27.94017, "artist_longitude": -82.32547, "artist_location": "Brandon", "artist_name": "John Wesley", "song_id": "SOLLHMX12AB01846DC", "title": "The Emperor Falls", "duration": 484.62322, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACSL128F93462F4.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARAJPHH1187FB5566A", "artist_latitude": 40.7038, "artist_longitude": -73.83168, "artist_location": "Queens, NY", "artist_name": "The Shangri-Las", "song_id": "SOYTPEP12AB0180E7B", "title": "Twist and Shout", "duration": 164.80608, "year": 1964} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACTB12903CAAF15.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR0RCMP1187FB3F427", "artist_latitude": 30.08615, "artist_longitude": -94.10158, "artist_location": "Beaumont, TX", "artist_name": "Billie Jo Spears", "song_id": "SOGXHEG12AB018653E", "title": "It Makes No Difference Now", "duration": 133.32853, "year": 1992} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACVS128E078BE39.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREBBGV1187FB523D2", "artist_latitude": null, "artist_longitude": null, "artist_location": "Houston, TX", "artist_name": "Mike Jones (Featuring CJ_ Mello & Lil' Bran)", "song_id": "SOOLYAZ12A6701F4A6", "title": "Laws Patrolling (Album Version)", "duration": 173.66159, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACZK128F4243829.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGUVEV1187B98BA17", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sierra Maestra", "song_id": "SOGOSOV12AF72A285E", "title": "\u00bfD\u00f3nde va Chichi?", "duration": 313.12934, "year": 1997} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABACN128F425B784.json: 
-------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOQLGFP12A58A7800E", "title": "OAKtown", "duration": 259.44771, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAFJ128F42AF24E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR3JMC51187B9AE49D", "artist_latitude": 28.53823, "artist_longitude": -81.37739, "artist_location": "Orlando, FL", "artist_name": "Backstreet Boys", "song_id": "SOPVXLX12A8C1402D5", "title": "Larger Than Life", "duration": 236.25098, "year": 1999} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAFP128F931E9A1.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPBNLO1187FB3D52F", "artist_latitude": 40.71455, "artist_longitude": -74.00712, "artist_location": "New York, NY", "artist_name": "Tiny Tim", "song_id": "SOAOIBZ12AB01815BE", "title": "I Hold Your Hand In Mine [Live At Royal Albert Hall]", "duration": 43.36281, "year": 2000} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAIO128F42938F9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR9AWNF1187B9AB0B4", "artist_latitude": null, "artist_longitude": null, "artist_location": "Seattle, Washington USA", "artist_name": "Kenny G featuring Daryl Hall", "song_id": "SOZHPGD12A8C1394FE", "title": "Baby Come To Me", "duration": 236.93016, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABATO128F42627E9.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AROGWRA122988FEE45", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Christos Dantis", "song_id": "SOSLAVG12A8C13397F", "title": "Den Pai Alo", "duration": 243.82649, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAVQ12903CBF7E0.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARMBR4Y1187B9990EB", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "David Martin", "song_id": "SOTTDKS12AB018D69B", "title": "It Wont Be Christmas", "duration": 241.47546, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAWW128F4250A31.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARQ9BO41187FB5CF1F", "artist_latitude": 40.99471, "artist_longitude": -77.60454, "artist_location": "Pennsylvania", "artist_name": "John Davis", "song_id": 
"SOMVWWT12A58A7AE05", "title": "Knocked Out Of The Park", "duration": 183.17016, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAXL128F424FC50.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARKULSX1187FB45F84", "artist_latitude": 39.49974, "artist_longitude": -111.54732, "artist_location": "Utah", "artist_name": "Trafik", "song_id": "SOQVMXR12A81C21483", "title": "Salt In NYC", "duration": 424.12363, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAXR128F426515F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARI2JSK1187FB496EF", "artist_latitude": 51.50632, "artist_longitude": -0.12714, "artist_location": "London, England", "artist_name": "Nick Ingman;Gavyn Wright", "song_id": "SODUJBS12A8C132150", "title": "Wessex Loses a Bride", "duration": 111.62077, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAXV128F92F6AE3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDBBQ1187B98AFF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Eddie Calvert", "song_id": "SOBBXLX12A58A79DDA", "title": "Erica (2005 Digital Remaster)", "duration": 138.63138, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAZH128F930419A.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7ZKHQ1187B98DD73", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Glad", "song_id": "SOTUKVB12AB0181477", "title": "Blessed Assurance", "duration": 270.602, "year": 1993} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBAM128F429D223.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBGXIG122988F409D", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "Steel Rain", "song_id": "SOOJPRH12A8C141995", "title": "Loaded Like A Gun", "duration": 173.19138, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBBV128F42967D7.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR7SMBG1187B9B9066", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Los Manolos", "song_id": "SOBCOSW12A8C13D398", "title": "Rumba De Barcelona", "duration": 218.38322, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBJE12903CDB442.json: 
-------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGCY1Y1187B9A4FA5", "artist_latitude": 36.16778, "artist_longitude": -86.77836, "artist_location": "Nashville, TN.", "artist_name": "Gloriana", "song_id": "SOQOTLQ12AB01868D0", "title": "Clementina Santaf\u00e8", "duration": 153.33832, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBKX128F4285205.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR36F9J1187FB406F1", "artist_latitude": 56.27609, "artist_longitude": 9.51695, "artist_location": "Denmark", "artist_name": "Bombay Rockers", "song_id": "SOBKWDJ12A8C13B2F3", "title": "Wild Rose (Back 2 Basics Mix)", "duration": 230.71302, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBLU128F93349CF.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNNKDK1187B98BBD5", "artist_latitude": 45.80726, "artist_longitude": 15.9676, "artist_location": "Zagreb Croatia", "artist_name": "Jinx", "song_id": "SOFNOQK12AB01840FC", "title": "Kutt Free (DJ Volume Remix)", "duration": 407.37914, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBNP128F932546F.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR62SOJ1187FB47BB5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Chase & Status", "song_id": "SOGVQGJ12AB017F169", "title": "Ten Tonne", "duration": 337.68444, "year": 2005} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBOP128F931B50D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARBEBBY1187B9B43DB", "artist_latitude": null, "artist_longitude": null, "artist_location": "Gainesville, FL", "artist_name": "Tom Petty", "song_id": "SOFFKZS12AB017F194", "title": "A Higher Place (Album Version)", "duration": 236.17261, "year": 1994} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBOR128F4286200.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARDR4AC1187FB371A1", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Montserrat Caball\u00e9;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti", "song_id": "SOBAYLL12A8C138AF9", "title": "Sono andati? 
Fingevo di dormire", "duration": 511.16363, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBTA128F933D304.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARAGB2O1187FB3A161", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Pucho & His Latin Soul Brothers", "song_id": "SOLEYHO12AB0188A85", "title": "Got My Mojo Workin", "duration": 338.23302, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBVJ128F92F7EAA.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREDL271187FB40F44", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Soul Mekanik", "song_id": "SOPEGZN12AB0181B3D", "title": "Get Your Head Stuck On Your Neck", "duration": 45.66159, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBXU128F92FEF48.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARP6N5A1187B99D1A3", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamtramck, MI", "artist_name": "Mitch Ryder", "song_id": "SOXILUQ12A58A7C72A", "title": "Jenny Take a Ride", "duration": 207.43791, "year": 2004} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBZN12903CD9297.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARGSAFR1269FB35070", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Blingtones", "song_id": "SOTCKKY12AB018A141", "title": "Sonnerie lalaleul\u00e9 hi houuu", "duration": 29.54404, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCAJ12903CDFCC2.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARULZCI1241B9C8611", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Luna Orbit Project", "song_id": "SOSWKAV12AB018FC91", "title": "Midnight Star", "duration": 335.51628, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCEC128F426456E.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR0IAWL1187B9A96D0", "artist_latitude": 8.4177, "artist_longitude": -80.11278, "artist_location": "Panama", "artist_name": "Danilo Perez", "song_id": "SONSKXP12A8C13A2C9", "title": "Native Soul", "duration": 197.19791, "year": 2003} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCEI128F424C983.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, 
"artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCFL128F149BB0D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARLTWXK1187FB5A3F8", "artist_latitude": 32.74863, "artist_longitude": -97.32925, "artist_location": "Fort Worth, TX", "artist_name": "King Curtis", "song_id": "SODREIN12A58A7F2E5", "title": "A Whiter Shade Of Pale (Live @ Fillmore West)", "duration": 326.00771, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCIX128F4265903.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARNF6401187FB57032", "artist_latitude": 40.79086, "artist_longitude": -73.96644, "artist_location": "New York, NY [Manhattan]", "artist_name": "Sophie B. Hawkins", "song_id": "SONWXQJ12A8C134D94", "title": "The Ballad Of Sleeping Beauty", "duration": 305.162, "year": 1994} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCKL128F423A778.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARPFHN61187FB575F6", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago, IL", "artist_name": "Lupe Fiasco", "song_id": "SOWQTQZ12A58A7B63E", "title": "Streets On Fire (Explicit Album Version)", "duration": 279.97995, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCPZ128F4275C32.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR051KA1187B98B2FF", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Wilks", "song_id": "SOLYIBD12A8C135045", "title": "Music is what we love", "duration": 261.51138, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCRU128F423F449.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR8IEZO1187B99055E", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Marc Shaiman", "song_id": "SOINLJW12A8C13314C", "title": "City Slickers", "duration": 149.86404, "year": 2008} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCTK128F934B224.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AR558FS1187FB45658", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "40 Grit", "song_id": "SOGDBUF12A8C140FAA", "title": "Intro", "duration": 75.67628, "year": 2003} 
-------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCUQ128E0783E2B.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARVBRGZ1187FB4675A", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gwen Stefani", "song_id": "SORRZGD12A6310DBC3", "title": "Harajuku Girls", "duration": 290.55955, "year": 2004} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCXB128F4286BD3.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "ARWB3G61187FB49404", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamilton, Ohio", "artist_name": "Steve Morse", "song_id": "SODAUVL12A8C13D184", "title": "Prognosis", "duration": 363.85914, "year": 2000} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCYE128F934CE1D.json: -------------------------------------------------------------------------------- 1 | {"num_songs": 1, "artist_id": "AREVWGE1187B9B890A", "artist_latitude": -13.442, "artist_longitude": -41.9952, "artist_location": "Noci (BA)", "artist_name": "Bitter End", "song_id": "SOFCHDR12AB01866EF", "title": "Living Hell", "duration": 282.43546, "year": 0} -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/etl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import psycopg2 4 | import pandas as pd 5 | from sql_queries import * 6 | 7 | 8 | def process_song_file(cur, filepath): 9 | """Reads songs log file row by row, selects needed fields and inserts them into song and artist tables. 10 | 11 | Parameters: 12 | cur (psycopg2.cursor()): Cursor of the sparkifydb database 13 | filepath (str): Filepath of the file to be analyzed 14 | """ 15 | # open song file 16 | df = pd.read_json(filepath, lines=True) 17 | 18 | for value in df.values: 19 | artist_id, artist_latitude, artist_location, artist_longitude, artist_name, duration, num_songs, song_id, title, year = value 20 | 21 | # insert artist record 22 | artist_data = [artist_id, artist_name, artist_location, artist_longitude, artist_latitude] 23 | cur.execute(artist_table_insert, artist_data) 24 | 25 | # insert song record 26 | song_data = [song_id, title, artist_id, year, duration] 27 | cur.execute(song_table_insert, song_data) 28 | 29 | 30 | def process_log_file(cur, filepath): 31 | """Reads user activity log file row by row, filters by NexSong, selects needed fields, transforms them and inserts 32 | them into time, user and songplay tables. 
33 | 34 | Parameters: 35 | cur (psycopg2.cursor()): Cursor of the sparkifydb database 36 | filepath (str): Filepath of the file to be analyzed 37 | """ 38 | # open log file 39 | df = pd.read_json(filepath, lines=True) 40 | 41 | # filter by NextSong action 42 | df = df[df['page']=='NextSong'] 43 | 44 | # convert timestamp column to datetime 45 | t = pd.to_datetime(df['ts'], unit='ms') 46 | 47 | # insert time data records 48 | time_data = [] 49 | for line in t: 50 | time_data.append([line, line.hour, line.day, line.week, line.month, line.year, line.day_name()]) 51 | column_labels = ('start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday') 52 | time_df = pd.DataFrame.from_records(time_data, columns=column_labels) 53 | 54 | for i, row in time_df.iterrows(): 55 | cur.execute(time_table_insert, list(row)) 56 | 57 | # load user table 58 | user_df = df[['userId', 'firstName', 'lastName', 'gender', 'level']] 59 | 60 | # insert user records 61 | for i, row in user_df.iterrows(): 62 | cur.execute(user_table_insert, row) 63 | 64 | # insert songplay records 65 | for index, row in df.iterrows(): 66 | 67 | # get songid and artistid from song and artist tables 68 | cur.execute(song_select, (row.song, row.artist, row.length)) 69 | results = cur.fetchone() 70 | 71 | if results: 72 | songid, artistid = results 73 | else: 74 | songid, artistid = None, None 75 | 76 | # insert songplay record 77 | songplay_data = (index, pd.to_datetime(row.ts, unit='ms'), int(row.userId), row.level, songid, artistid, row.sessionId, row.location, row.userAgent) 78 | cur.execute(songplay_table_insert, songplay_data) 79 | 80 | 81 | def process_data(cur, conn, filepath, func): 82 | """Walks through all files nested under filepath, and processes all logs found. 83 | 84 | Parameters: 85 | cur (psycopg2.cursor()): Cursor of the sparkifydb database 86 | conn (psycopg2.connect()): Connectio to the sparkifycdb database 87 | filepath (str): Filepath parent of the logs to be analyzed 88 | func (python function): Function to be used to process each log 89 | 90 | Returns: 91 | Name of files processed 92 | """ 93 | # get all files matching extension from directory 94 | all_files = [] 95 | for root, dirs, files in os.walk(filepath): 96 | files = glob.glob(os.path.join(root,'*.json')) 97 | for f in files : 98 | all_files.append(os.path.abspath(f)) 99 | 100 | # get total number of files found 101 | num_files = len(all_files) 102 | print('{} files found in {}'.format(num_files, filepath)) 103 | 104 | # iterate over files and process 105 | for i, datafile in enumerate(all_files, 1): 106 | func(cur, datafile) 107 | conn.commit() 108 | print('{}/{} files processed.'.format(i, num_files)) 109 | 110 | return all_files 111 | 112 | 113 | def main(): 114 | """Function used to extract, transform all data from song and user activity logs and load it into a PostgreSQL DB 115 | Usage: python etl.py 116 | """ 117 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student") 118 | cur = conn.cursor() 119 | 120 | process_data(cur, conn, filepath='data/song_data', func=process_song_file) 121 | process_data(cur, conn, filepath='data/log_data', func=process_log_file) 122 | 123 | conn.close() 124 | 125 | 126 | if __name__ == "__main__": 127 | main() -------------------------------------------------------------------------------- /1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/sql_queries.py: -------------------------------------------------------------------------------- 1 | # DROP TABLES 2 | 3 | songplay_table_drop = 
"DROP TABLE IF EXISTS songplays" 4 | user_table_drop = "DROP TABLE IF EXISTS users" 5 | song_table_drop = "DROP TABLE IF EXISTS songs" 6 | artist_table_drop = "DROP TABLE IF EXISTS artists" 7 | time_table_drop = "DROP TABLE IF EXISTS time" 8 | 9 | # CREATE TABLES 10 | 11 | songplay_table_create = (""" 12 | CREATE TABLE IF NOT EXISTS songplays 13 | (songplay_id int PRIMARY KEY, 14 | start_time date REFERENCES time(start_time), 15 | user_id int NOT NULL REFERENCES users(user_id), 16 | level text, 17 | song_id text REFERENCES songs(song_id), 18 | artist_id text REFERENCES artists(artist_id), 19 | session_id int, 20 | location text, 21 | user_agent text) 22 | """) 23 | 24 | user_table_create = (""" 25 | CREATE TABLE IF NOT EXISTS users 26 | (user_id int PRIMARY KEY, 27 | first_name text NOT NULL, 28 | last_name text NOT NULL, 29 | gender text, 30 | level text) 31 | """) 32 | 33 | song_table_create = (""" 34 | CREATE TABLE IF NOT EXISTS songs 35 | (song_id text PRIMARY KEY, 36 | title text NOT NULL, 37 | artist_id text NOT NULL REFERENCES artists(artist_id), 38 | year int, 39 | duration float NOT NULL) 40 | """) 41 | 42 | artist_table_create = (""" 43 | CREATE TABLE IF NOT EXISTS artists 44 | (artist_id text PRIMARY KEY, 45 | name text NOT NULL, 46 | location text, 47 | lattitude float, 48 | longitude float) 49 | """) 50 | 51 | time_table_create = (""" 52 | CREATE TABLE IF NOT EXISTS time 53 | (start_time date PRIMARY KEY, 54 | hour int, 55 | day int, 56 | week int, 57 | month int, 58 | year int, 59 | weekday text) 60 | """) 61 | 62 | # INSERT RECORDS 63 | 64 | songplay_table_insert = (""" 65 | INSERT INTO songplays 66 | (songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent) 67 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) 68 | ON CONFLICT (songplay_id) DO NOTHING; 69 | """) 70 | 71 | user_table_insert = (""" 72 | INSERT INTO users 73 | (user_id, first_name, last_name, gender, level) 74 | VALUES (%s, %s, %s, %s, %s) 75 | ON CONFLICT (user_id) DO NOTHING; 76 | """) 77 | 78 | song_table_insert = (""" 79 | INSERT INTO songs 80 | (song_id, title, artist_id, year, duration) 81 | VALUES (%s, %s, %s, %s, %s) 82 | ON CONFLICT (song_id) DO NOTHING; 83 | """) 84 | 85 | artist_table_insert = (""" 86 | INSERT INTO artists 87 | (artist_id, name, location, lattitude, longitude) 88 | VALUES (%s, %s, %s, %s, %s) 89 | ON CONFLICT (artist_id) DO NOTHING; 90 | """) 91 | 92 | 93 | time_table_insert = (""" 94 | INSERT INTO time 95 | (start_time, hour, day, week, month, year, weekday) 96 | VALUES (%s, %s, %s, %s, %s, %s, %s) 97 | ON CONFLICT (start_time) DO NOTHING; 98 | """) 99 | 100 | # FIND SONGS 101 | 102 | song_select = (""" 103 | SELECT song_id, artists.artist_id 104 | FROM songs JOIN artists ON songs.artist_id = artists.artist_id 105 | WHERE songs.title = %s 106 | AND artists.name = %s 107 | AND songs.duration = %s 108 | """) 109 | 110 | # QUERY LISTS 111 | 112 | create_table_queries = [user_table_create, artist_table_create, song_table_create, time_table_create, songplay_table_create] 113 | drop_table_queries = [user_table_drop, artist_table_drop, song_table_drop, time_table_drop, songplay_table_drop] -------------------------------------------------------------------------------- /1_dend_data_modeling/P2_Cassandra_Data_Modeling_and_ETL/event_data/2018-11-01-events.csv: -------------------------------------------------------------------------------- 1 | 
artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId 2 | ,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54092E+12,38,,200,1.54111E+12,39 3 | ,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,139,,200,1.54111E+12,8 4 | Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,You Gotta Be,200,1.54111E+12,8 5 | ,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1.54034E+12,139,,200,1.54111E+12,8 6 | Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Flat 55,200,1.54111E+12,8 7 | Tamba Trio,Logged In,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Quem Quiser Encontrar O Amor,200,1.54111E+12,8 8 | The Mars Volta,Logged In,Kaylee,F,5,Summers,380.42077,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Eriatarka,200,1.54111E+12,8 9 | Infected Mushroom,Logged In,Kaylee,F,6,Summers,440.2673,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Becoming Insane,200,1.54111E+12,8 10 | Blue October / Imogen Heap,Logged In,Kaylee,F,7,Summers,241.3971,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Congratulations,200,1.54111E+12,8 11 | Girl Talk,Logged In,Kaylee,F,8,Summers,160.15628,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Once again,200,1.54111E+12,8 12 | Black Eyed Peas,Logged In,Sylvie,F,0,Cruz,214.93506,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",PUT,NextSong,1.54027E+12,9,Pump It,200,1.54111E+12,10 13 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,169,,200,1.54111E+12,26 14 | Fall Out Boy,Logged In,Ryan,M,1,Smith,200.72444,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Nobody Puts Baby In The Corner,200,1.54111E+12,26 15 | M.I.A.,Logged In,Ryan,M,2,Smith,233.7171,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Mango Pickle Down River (With The Wilcannia Mob),200,1.54111E+12,26 16 | Survivor,Logged In,Jayden,M,0,Fox,245.36771,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,100,Eye Of The Tiger,200,1.54111E+12,101 17 | -------------------------------------------------------------------------------- /1_dend_data_modeling/P2_Cassandra_Data_Modeling_and_ETL/images/image_event_datafile_new.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/1_dend_data_modeling/P2_Cassandra_Data_Modeling_and_ETL/images/image_event_datafile_new.jpg -------------------------------------------------------------------------------- /1_dend_data_modeling/notebooks/L1-D0-creating-a-table-with-postgres.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lesson 1 Demo 0: PostgreSQL and AutoCommits\n", 8 | "\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Walk through the basics of PostgreSQL autocommits " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "## import postgreSQL adapter for the Python\n", 26 | 
"import psycopg2" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "### Create a connection to the database\n", 34 | "1. Connect to the local instance of PostgreSQL (*127.0.0.1*)\n", 35 | "2. Use the database/schema from the instance. \n", 36 | "3. The connection reaches out to the database (*studentdb*) and use the correct privilages to connect to the database (*user and password = student*)." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Use the connection to get a cursor that will be used to execute queries." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "cur = conn.cursor()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Create a database to work in" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "cur.execute(\"select * from test\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "### Error occurs, but it was to be expected because table has not been created as yet. To fix the error, create the table. " 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "cur.execute(\"CREATE TABLE test (col1 int, col2 int, col3 int);\")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "### Error indicates we cannot execute this query. Since we have not committed the transaction and had an error in the transaction block, we are blocked until we restart the connection." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")\n", 110 | "cur = conn.cursor()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "In our exercises instead of worrying about commiting each transaction or getting a strange error when we hit something unexpected, let's set autocommit to true. **This says after each call during the session commit that one action and do not hold open the transaction for any other actions. One action = one transaction.**" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "In this demo we will use automatic commit so each action is commited without having to call `conn.commit()` after each command. 
**The ability to rollback and commit transactions are a feature of Relational Databases.**" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "conn.set_session(autocommit=True)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "cur.execute(\"select * from test\")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "cur.execute(\"CREATE TABLE test (col1 int, col2 int, col3 int);\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### Once autocommit is set to true, we execute this code successfully. There were no issues with transaction blocks and we did not need to restart our connection. " 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "cur.execute(\"select * from test\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "cur.execute(\"select count(*) from test\")\n", 177 | "print(cur.fetchall())" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.7.1" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/Data warehousing in the cloud.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/2_dend_cloud_data_warehouses/Data warehousing in the cloud.pdf -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "jira-plugin.workingProject": "" 3 | } -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/README.md: -------------------------------------------------------------------------------- 1 | # Project Datawarehouse 2 | 3 | ## Project description 4 | 5 | Sparkify is a music streaming startup with a growing user base and song database. 6 | 7 | Their user activity and songs metadata data resides in json files in S3. The goal of the current project is to build an ETL pipeline that extracts their data from S3, stages them in Redshift, and transforms data into a set of dimensional tables for their analytics team to continue finding insights in what songs their users are listening to. 8 | 9 | ## How to run 10 | 11 | 1. 
To run this project you will need to fill in the following information and save it as *dwh.cfg* in the project root folder.
12 | 
13 | ```
14 | [CLUSTER]
15 | HOST=''
16 | DB_NAME=''
17 | DB_USER=''
18 | DB_PASSWORD=''
19 | DB_PORT=5439
20 | 
21 | [IAM_ROLE]
22 | ARN=
23 | 
24 | [S3]
25 | LOG_DATA='s3://udacity-dend/log_data'
26 | LOG_JSONPATH='s3://udacity-dend/log_json_path.json'
27 | SONG_DATA='s3://udacity-dend/song_data'
28 | 
29 | [AWS]
30 | KEY=
31 | SECRET=
32 | 
33 | [DWH]
34 | DWH_CLUSTER_TYPE = multi-node
35 | DWH_NUM_NODES = 4
36 | DWH_NODE_TYPE = dc2.large
37 | DWH_CLUSTER_IDENTIFIER = 
38 | DWH_DB = 
39 | DWH_DB_USER = 
40 | DWH_DB_PASSWORD = 
41 | DWH_PORT = 5439
42 | DWH_IAM_ROLE_NAME = 
43 | ```
44 | 
45 | 2. Create a Python environment with the dependencies listed in *requirements.txt*.
46 | 3. Run the *create_cluster* script to set up the needed infrastructure for this project.
47 | 
48 |     `$ python create_cluster.py`
49 | 
50 | 4. Run the *create_tables* script to set up the database staging and analytical tables.
51 | 
52 |     `$ python create_tables.py`
53 | 
54 | 5. Finally, run the *etl* script to extract data from the files in S3, stage it in Redshift, and store it in the dimensional tables.
55 | 
56 |     `$ python etl.py`
57 | 
58 | 
59 | ## Project structure
60 | 
61 | This project includes five Python scripts, plus a README and a requirements file:
62 | 
63 | - analytics.py runs a few queries on the created star schema to validate that the project has been completed successfully.
64 | - create_cluster.py is where the AWS components for this project are created programmatically.
65 | - create_tables.py is where the fact and dimension tables for the star schema in Redshift are created.
66 | - etl.py is where data gets loaded from S3 into staging tables on Redshift and then processed into the analytics tables on Redshift.
67 | - sql_queries.py is where the SQL statements are defined; they are then used by etl.py, create_tables.py and analytics.py.
68 | - README.md is the current file.
69 | - requirements.txt lists the Python dependencies needed to run the project.
70 | 
71 | ## Database schema design
72 | The pipeline first copies the raw JSON files from S3 into two staging tables, then transforms them into a star schema (one fact table and four dimension tables) optimized for song play analysis.
73 | 
74 | #### Staging Tables
75 | - staging_events
76 | - staging_songs
77 | 
78 | #### Fact Table
79 | - songplays - records in event data associated with song plays i.e. records with page NextSong - 
80 | *songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent*
81 | 
82 | #### Dimension Tables
83 | - users - users in the app - 
84 | *user_id, first_name, last_name, gender, level*
85 | - songs - songs in music database - 
86 | *song_id, title, artist_id, year, duration*
87 | - artists - artists in music database - 
88 | *artist_id, name, location, lattitude, longitude*
89 | - time - timestamps of records in songplays broken down into specific units - 
90 | *start_time, hour, day, week, month, year, weekday*
91 | 
92 | 
93 | ## Queries and Results
94 | 
95 | Number of rows in each table:
96 | 
97 | | Table | rows |
98 | |--- | --: |
99 | | staging_events | 8056 |
100 | | staging_songs | 14896 |
101 | | artists | 10025 |
102 | | songplays | 333 |
103 | | songs | 14896 |
104 | | time | 8023 |
105 | | users | 105 |
106 | 
107 | 
108 | ### Steps followed on this project
109 | 
110 | 1. 
Create Table Schemas 111 | - Design schemas for your fact and dimension tables 112 | - Write a SQL CREATE statement for each of these tables in sql_queries.py 113 | - Complete the logic in create_tables.py to connect to the database and create these tables 114 | - Write SQL DROP statements to drop tables in the beginning of - create_tables.py if the tables already exist. This way, you can run create_tables.py whenever you want to reset your database and test your ETL pipeline. 115 | - Launch a redshift cluster and create an IAM role that has read access to S3. 116 | - Add redshift database and IAM role info to dwh.cfg. 117 | - Test by running create_tables.py and checking the table schemas in your redshift database. You can use Query Editor in the AWS Redshift console for this. 118 | 119 | 2. Build ETL Pipeline 120 | - Implement the logic in etl.py to load data from S3 to staging tables on Redshift. 121 | - Implement the logic in etl.py to load data from staging tables to analytics tables on Redshift. 122 | - Test by running etl.py after running create_tables.py and running the analytic queries on your Redshift database to compare your results with the expected results. 123 | - Delete your redshift cluster when finished. 124 | 125 | 3. Document Process 126 | Do the following steps in your README.md file. 127 | 128 | - Discuss the purpose of this database in context of the startup, Sparkify, and their analytical goals. 129 | - State and justify your database schema design and ETL pipeline. 130 | - [Optional] Provide example queries and results for song play analysis. 131 | -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/analytics.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import select_number_rows_queries 4 | 5 | 6 | def get_results(cur, conn): 7 | """ 8 | Get the number of rows stored into each table 9 | """ 10 | for query in select_number_rows_queries: 11 | print('Running ' + query) 12 | cur.execute(query) 13 | results = cur.fetchone() 14 | 15 | for row in results: 16 | print(" ", row) 17 | 18 | 19 | def main(): 20 | """ 21 | Run queries on the staging and dimensional tables to validate that the project has been created successfully 22 | """ 23 | config = configparser.ConfigParser() 24 | config.read('dwh.cfg') 25 | 26 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 27 | cur = conn.cursor() 28 | 29 | get_results(cur, conn) 30 | 31 | conn.close() 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/create_cluster.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import boto3 3 | import json 4 | import psycopg2 5 | 6 | from botocore.exceptions import ClientError 7 | import configparser 8 | 9 | 10 | def create_iam_role(iam, DWH_IAM_ROLE_NAME): 11 | ''' 12 | Creates IAM Role for Redshift, to allow it to use AWS services 13 | ''' 14 | 15 | try: 16 | print("1.1 Creating a new IAM Role") 17 | dwhRole = iam.create_role( 18 | Path='/', 19 | RoleName=DWH_IAM_ROLE_NAME, 20 | Description = "Allows Redshift clusters to call AWS services on your behalf.", 21 | AssumeRolePolicyDocument=json.dumps( 22 | {'Statement': [{'Action': 'sts:AssumeRole', 23 
| 'Effect': 'Allow', 24 | 'Principal': {'Service': 'redshift.amazonaws.com'}}], 25 | 'Version': '2012-10-17'}) 26 | ) 27 | except Exception as e: 28 | print(e) 29 | 30 | 31 | print("1.2 Attaching Policy") 32 | 33 | iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME, 34 | PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" 35 | )['ResponseMetadata']['HTTPStatusCode'] 36 | 37 | print("1.3 Get the IAM role ARN") 38 | roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn'] 39 | 40 | print(roleArn) 41 | return roleArn 42 | 43 | 44 | def create_cluster(redshift, roleArn, DWH_CLUSTER_TYPE, DWH_NODE_TYPE, DWH_NUM_NODES, DWH_DB, DWH_CLUSTER_IDENTIFIER, DWH_DB_USER, DWH_DB_PASSWORD): 45 | ''' 46 | Creates Redshift cluster 47 | ''' 48 | 49 | try: 50 | response = redshift.create_cluster( 51 | #HW 52 | ClusterType=DWH_CLUSTER_TYPE, 53 | NodeType=DWH_NODE_TYPE, 54 | NumberOfNodes=int(DWH_NUM_NODES), 55 | 56 | #Identifiers & Credentials 57 | DBName=DWH_DB, 58 | ClusterIdentifier=DWH_CLUSTER_IDENTIFIER, 59 | MasterUsername=DWH_DB_USER, 60 | MasterUserPassword=DWH_DB_PASSWORD, 61 | 62 | #Roles (for s3 access) 63 | IamRoles=[roleArn] 64 | ) 65 | except Exception as e: 66 | print(e) 67 | 68 | 69 | def get_cluster_props(redshift, DWH_CLUSTER_IDENTIFIER): 70 | ''' 71 | Retrieve Redshift clusters properties 72 | ''' 73 | 74 | def prettyRedshiftProps(props): 75 | pd.set_option('display.max_colwidth', -1) 76 | keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId'] 77 | x = [(k, v) for k,v in props.items() if k in keysToShow] 78 | return pd.DataFrame(data=x, columns=["Key", "Value"]) 79 | 80 | myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0] 81 | prettyRedshiftProps(myClusterProps) 82 | 83 | DWH_ENDPOINT = myClusterProps['Endpoint']['Address'] 84 | DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn'] 85 | print("DWH_ENDPOINT :: ", DWH_ENDPOINT) 86 | print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN) 87 | return myClusterProps, DWH_ENDPOINT, DWH_ROLE_ARN 88 | 89 | 90 | def open_ports(ec2, myClusterProps, DWH_PORT): 91 | ''' 92 | Update clusters security group to allow access through redshift port 93 | ''' 94 | 95 | try: 96 | vpc = ec2.Vpc(id=myClusterProps['VpcId']) 97 | defaultSg = list(vpc.security_groups.all())[0] 98 | print(defaultSg) 99 | defaultSg.authorize_ingress( 100 | GroupName=defaultSg.group_name, 101 | CidrIp='0.0.0.0/0', 102 | IpProtocol='TCP', 103 | FromPort=int(DWH_PORT), 104 | ToPort=int(DWH_PORT) 105 | ) 106 | except Exception as e: 107 | print(e) 108 | 109 | 110 | def main(): 111 | 112 | config = configparser.ConfigParser() 113 | config.read_file(open('dwh.cfg')) 114 | 115 | KEY = config.get('AWS','KEY') 116 | SECRET = config.get('AWS','SECRET') 117 | 118 | DWH_CLUSTER_TYPE = config.get("DWH","DWH_CLUSTER_TYPE") 119 | DWH_NUM_NODES = config.get("DWH","DWH_NUM_NODES") 120 | DWH_NODE_TYPE = config.get("DWH","DWH_NODE_TYPE") 121 | 122 | DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER") 123 | DWH_DB = config.get("DWH","DWH_DB") 124 | DWH_DB_USER = config.get("DWH","DWH_DB_USER") 125 | DWH_DB_PASSWORD = config.get("DWH","DWH_DB_PASSWORD") 126 | DWH_PORT = config.get("DWH","DWH_PORT") 127 | 128 | DWH_IAM_ROLE_NAME = config.get("DWH", "DWH_IAM_ROLE_NAME") 129 | 130 | (DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB) 131 | 132 | df = pd.DataFrame({"Param": 133 | ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", 
"DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"], 134 | "Value": 135 | [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME] 136 | }) 137 | 138 | print(df) 139 | 140 | 141 | ec2 = boto3.resource('ec2', 142 | region_name="us-west-2", 143 | aws_access_key_id=KEY, 144 | aws_secret_access_key=SECRET 145 | ) 146 | 147 | s3 = boto3.resource('s3', 148 | region_name="us-west-2", 149 | aws_access_key_id=KEY, 150 | aws_secret_access_key=SECRET 151 | ) 152 | 153 | iam = boto3.client('iam',aws_access_key_id=KEY, 154 | aws_secret_access_key=SECRET, 155 | region_name='us-west-2' 156 | ) 157 | 158 | redshift = boto3.client('redshift', 159 | region_name="us-west-2", 160 | aws_access_key_id=KEY, 161 | aws_secret_access_key=SECRET 162 | ) 163 | 164 | roleArn = create_iam_role(iam, DWH_IAM_ROLE_NAME) 165 | 166 | create_cluster(redshift, roleArn, DWH_CLUSTER_TYPE, DWH_NODE_TYPE, DWH_NUM_NODES, DWH_DB, DWH_CLUSTER_IDENTIFIER, DWH_DB_USER, DWH_DB_PASSWORD) 167 | 168 | myClusterProps = get_cluster_props(redshift, DWH_CLUSTER_IDENTIFIER) 169 | 170 | open_ports(ec2, myClusterProps, DWH_PORT) 171 | 172 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 173 | cur = conn.cursor() 174 | 175 | print('Connected') 176 | 177 | conn.close() 178 | 179 | 180 | if __name__ == "__main__": 181 | main() -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/create_tables.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import create_table_queries, drop_table_queries 4 | 5 | 6 | def drop_tables(cur, conn): 7 | """ 8 | Delete pre-existing tables to be able to create them from scratch 9 | """ 10 | print('Droping tables') 11 | for query in drop_table_queries: 12 | cur.execute(query) 13 | conn.commit() 14 | 15 | 16 | def create_tables(cur, conn): 17 | """ 18 | Create staging and dimensional tables declared on sql_queries script 19 | """ 20 | for query in create_table_queries: 21 | print('Running ' + query + ' ') 22 | cur.execute(query) 23 | conn.commit() 24 | 25 | 26 | def main(): 27 | """ 28 | Set up the database tables, create needed tables with the appropriate columns and constricts 29 | """ 30 | config = configparser.ConfigParser() 31 | config.read('dwh.cfg') 32 | 33 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 34 | cur = conn.cursor() 35 | 36 | print('Connected to the cluster') 37 | 38 | drop_tables(cur, conn) 39 | create_tables(cur, conn) 40 | 41 | conn.close() 42 | 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import psycopg2 3 | from sql_queries import copy_table_queries, insert_table_queries 4 | 5 | 6 | def load_staging_tables(cur, conn): 7 | """ 8 | Load data from files stored in S3 to the staging tables using the queries declared on the sql_queries script 9 | """ 10 | print('Inserting data from json files stored in S3 buckets into staging tables') 11 | for query in copy_table_queries: 12 | print('Running ' + query) 13 | cur.execute(query) 14 | conn.commit() 15 | 16 
| 17 | def insert_tables(cur, conn): 18 | """ 19 | Select and Transform data from staging tables into the dimensional tables using the queries declared on the sql_queries script 20 | """ 21 | print('Inserting data from staging tables into analytics tables') 22 | for query in insert_table_queries: 23 | print('Running ' + query) 24 | cur.execute(query) 25 | conn.commit() 26 | 27 | 28 | def main(): 29 | """ 30 | Extract songs metadata and user activity data from S3, transform it using a staging table, and load it into dimensional tables for analysis 31 | """ 32 | config = configparser.ConfigParser() 33 | config.read('dwh.cfg') 34 | 35 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values())) 36 | cur = conn.cursor() 37 | 38 | load_staging_tables(cur, conn) 39 | insert_tables(cur, conn) 40 | 41 | conn.close() 42 | 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/requirements.txt: -------------------------------------------------------------------------------- 1 | awscli==1.16.140 2 | boto3==1.9.164 3 | botocore==1.12.164 4 | pandas==0.23.4 5 | psycopg2==2.7.7 6 | psycopg2-binary==2.8.2 7 | -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/infrastructure_as_code.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import boto3 3 | import json 4 | 5 | from botocore.exceptions import ClientError 6 | import configparser 7 | 8 | 9 | config = configparser.ConfigParser() 10 | config.read_file(open('/home/f.silvestre/Documents/Projects/Data-engineering-nanodegree/2_dend_cloud_data_warehouses/dhw.cfg')) 11 | 12 | KEY = config.get('AWS','KEY') 13 | SECRET = config.get('AWS','SECRET') 14 | 15 | DWH_CLUSTER_TYPE = config.get("DWH","DWH_CLUSTER_TYPE") 16 | DWH_NUM_NODES = config.get("DWH","DWH_NUM_NODES") 17 | DWH_NODE_TYPE = config.get("DWH","DWH_NODE_TYPE") 18 | 19 | DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER") 20 | DWH_DB = config.get("DWH","DWH_DB") 21 | DWH_DB_USER = config.get("DWH","DWH_DB_USER") 22 | DWH_DB_PASSWORD = config.get("DWH","DWH_DB_PASSWORD") 23 | DWH_PORT = config.get("DWH","DWH_PORT") 24 | 25 | DWH_IAM_ROLE_NAME = config.get("DWH", "DWH_IAM_ROLE_NAME") 26 | 27 | (DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB) 28 | 29 | df = pd.DataFrame({"Param": 30 | ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"], 31 | "Value": 32 | [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME] 33 | }) 34 | 35 | print(df) 36 | 37 | # Create clients 38 | ec2 = boto3.resource('ec2', 39 | region_name='us-west-2', 40 | aws_access_key_id=KEY, 41 | aws_secret_access_key=SECRET) 42 | 43 | s3 = boto3.resource('s3', 44 | region_name='us-west-2', 45 | aws_access_key_id=KEY, 46 | aws_secret_access_key=SECRET) 47 | 48 | iam = boto3.resource('iam', 49 | region_name='us-west-2', 50 | aws_access_key_id=KEY, 51 | aws_secret_access_key=SECRET) 52 | 53 | redshift = boto3.resource('redshift', 54 | region_name='us-west-2', 55 | aws_access_key_id=KEY, 56 | aws_secret_access_key=SECRET) 57 | 58 | 59 | # Connect to S3 60 | sampleDbBucket = s3.Bucket("awssampledbuswest2") 61 | 62 | 63 | try: 64 | print("Creating IAM 
Role") 65 | dwhRole=iam.create_role( 66 | Path='/', 67 | RoleName=DWH_IAM_ROLE_NAME, 68 | Description="Allows Redshift clusters to call AWS services on your behalf", 69 | AssumeRolePolicyDocument=json.dumps( 70 | {'Statement': [{'Action':'sts:AssumeRole', 71 | 'Effect':'Allow', 72 | 'Principal':{'Service': 'redshift.amazonaws.com'}}], 73 | 'Version':'2012-10-17'} 74 | ) 75 | ) 76 | except Exception as e: 77 | print(e) 78 | 79 | print("Attaching policy") 80 | 81 | iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME, 82 | PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" 83 | )['ResponseMetadata']['HTTPStatusCode'] 84 | 85 | print("Get IAM Role") 86 | roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn'] 87 | 88 | print(roleArn) 89 | 90 | # Create Readshift cluster 91 | try: 92 | response = redshift.create_cluster( 93 | ClusterType=DWH_CLUSTER_TYPE, 94 | NodeType=DWH_NODE_TYPE, 95 | NumberOfNodes=int(DWH_NUM_NODES), 96 | DBName=DWH_DB, 97 | ClusterIdentifier=DWH_CLUSTER_IDENTIFIER, 98 | MasterUsername=DWH_DB_USER, 99 | MasterUserPassword=DWH_DB_PASSWORD, 100 | 101 | #Roles (for s3 access) 102 | IamRoles=[roleArn] 103 | ) 104 | 105 | except Exception as e: 106 | print(e) 107 | 108 | 109 | # Describe cluster and status 110 | def prettyRedshiftProps(props): 111 | pd.set_option('display.max_colwidth', -1) 112 | keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId'] 113 | x = [(k, v) for k,v in props.items() if k in keysToShow] 114 | return pd.DataFrame(data=x, columns=["Key", "Value"]) 115 | 116 | myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0] 117 | prettyRedshiftProps(myClusterProps) 118 | -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/notebooks/Data/README: -------------------------------------------------------------------------------- 1 | Pagila 2 | ====== 3 | 4 | Pagila is a port of the Sakila example database available for MySQL, which was 5 | originally developed by Mike Hillyer of the MySQL AB documentation team. It 6 | is intended to provide a standard schema that can be used for examples in 7 | books, tutorials, articles, samples, etc. 8 | 9 | All the tables, data, views, and functions have been ported; some of the changes made were: 10 | 11 | * Changed char(1) true/false fields to true boolean fields 12 | * The last_update columns were set with triggers to update them 13 | * Added foreign keys 14 | * Removed 'DEFAULT 0' on foreign keys since it's pointless with real FK's 15 | * Used PostgreSQL built-in fulltext searching for fulltext index. Removed the need for the 16 | film_text table. 17 | * The rewards_report function was ported to a simple SRF 18 | 19 | The schema and data for the Sakila database were made available under the BSD license 20 | which can be found at http://www.opensource.org/licenses/bsd-license.php. The pagila 21 | database is made available under this license as well. 22 | 23 | 24 | FULLTEXT SEARCH 25 | --------------- 26 | 27 | In older versions of pagila, the fulltext search capabilities were split into a 28 | seperate file, so they could be loaded into only databases that support fulltext. 29 | Starting in PostgreSQL 8.3, fulltext functionality is built in, so now these 30 | parts of the schema exist in the main schema file. 
31 | 32 | Example usage: 33 | 34 | SELECT * FROM film WHERE fulltext @@ to_tsquery('fate&india'); 35 | 36 | 37 | PARTITIONED TABLES 38 | ------------------ 39 | 40 | The payment table is designed as a partitioned table with a 6 month timespan for the date ranges. 41 | If you want to take full advantage of table partitioning, you need to make sure constraint_exclusion 42 | is turned on in your database. You can do this by setting "constraint_exclusion = on" in your 43 | postgresql.conf, or by issuing the command "ALTER DATABASE pagila SET constraint_exclusion = on" 44 | (substitute pagila for your database name if installing into a database with a different name) 45 | 46 | 47 | INSTALL NOTE 48 | ------------ 49 | 50 | The pagila-data.sql file and the pagila-insert-data.sql both contain the same 51 | data, the former using COPY commands, the latter using INSERT commands, so you 52 | only need to install one of them. Both formats are provided for those who have 53 | trouble using one version or another. 54 | 55 | 56 | ARTICLES 57 | -------------- 58 | 59 | The following articles make use of pagila to showcase various PostgreSQL features: 60 | 61 | * Showcasing REST in PostgreSQL - The PreQuel 62 | http://www.postgresonline.com/journal/index.php?/archives/32-Showcasing-REST-in-PostgreSQL-The-PreQuel.html#extended 63 | 64 | * PostgreSQL 8.3 Features: Enum Datatype 65 | http://people.planetpostgresql.org/xzilla/index.php?/archives/320-PostgreSQL-8.3-Features-Enum-Datatype.html 66 | 67 | * Email Validation with pl/PHP 68 | http://people.planetpostgresql.org/xzilla/index.php?/archives/261-Re-inventing-Gregs-method-to-prevent-re-inventing.html 69 | 70 | * Getting Started with PostgreSQL for Windows 71 | http://www.charltonlopez.com/index.php?option=com_content&task=view&id=56&Itemid=38 72 | 73 | * RATIO_TO_REPORT in PostgreSQL 74 | http://people.planetpostgresql.org/xzilla/index.php?/search/pagila/P3.html 75 | 76 | * The postmaster and postgres Processes 77 | http://www.charltonlopez.com/index.php?option=com_content&task=view&id=57&Itemid=38 78 | 79 | * Building Rails to Legacy Applications :: Take Control of Active Record 80 | http://people.planetpostgresql.org/xzilla/index.php?/archives/220-Building-Rails-to-Legacy-Applications-Take-Control-of-Active-Record.html 81 | 82 | * Building Rails to Legacy Applications :: Masking the Database 83 | http://people.planetpostgresql.org/xzilla/index.php?/archives/213-Building-Rails-to-Legacy-Applications-Masking-the-Database.html 84 | 85 | 86 | VERSION HISTORY 87 | --------------- 88 | 89 | Version 0.10.1 90 | * Add pagila-data-insert.sql file, added articles section 91 | 92 | Version 0.10 93 | * Support for built-in fulltext. Add enum example 94 | 95 | Version 0.9 96 | * Add table partitioning example 97 | 98 | Version 0.8 99 | * First release of pagila 100 | 101 | 102 | -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/notebooks/L3 Exercise 3 - Parallel ETL - Solution.py: -------------------------------------------------------------------------------- 1 | #%% Change working directory from the workspace root to the ipynb file location. 
Turn this addition off with the DataScience.changeDirOnImportExport setting 2 | # ms-python.python added 3 | import os 4 | try: 5 | os.chdir(os.path.join(os.getcwd(), '2_dend_cloud_data_warehouses/notebooks')) 6 | print(os.getcwd()) 7 | except: 8 | pass 9 | #%% [markdown] 10 | # # Exercise 3: Parallel ETL 11 | 12 | #%% 13 | get_ipython().run_line_magic('load_ext', 'sql') 14 | 15 | 16 | #%% 17 | from time import time 18 | import configparser 19 | import matplotlib.pyplot as plt 20 | import pandas as pd 21 | 22 | #%% [markdown] 23 | # # STEP 1: Get the params of the created redshift cluster 24 | # - We need: 25 | # - The redshift cluster endpoint 26 | # - The IAM role ARN that give access to Redshift to read from S3 27 | 28 | #%% 29 | config = configparser.ConfigParser() 30 | config.read_file(open('dwh.cfg')) 31 | KEY=config.get('AWS','key') 32 | SECRET= config.get('AWS','secret') 33 | 34 | DWH_DB= config.get("DWH","DWH_DB") 35 | DWH_DB_USER= config.get("DWH","DWH_DB_USER") 36 | DWH_DB_PASSWORD= config.get("DWH","DWH_DB_PASSWORD") 37 | DWH_PORT = config.get("DWH","DWH_PORT") 38 | 39 | 40 | #%% 41 | # FILL IN THE REDSHIFT ENPOINT HERE 42 | # e.g. DWH_ENDPOINT="redshift-cluster-1.csmamz5zxmle.us-west-2.redshift.amazonaws.com" 43 | DWH_ENDPOINT="" 44 | 45 | #FILL IN THE IAM ROLE ARN you got in step 2.2 of the previous exercise 46 | #e.g DWH_ROLE_ARN="arn:aws:iam::988332130976:role/dwhRole" 47 | DWH_ROLE_ARN="" 48 | 49 | #%% [markdown] 50 | # # STEP 2: Connect to the Redshift Cluster 51 | 52 | #%% 53 | conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB) 54 | print(conn_string) 55 | get_ipython().run_line_magic('sql', '$conn_string') 56 | 57 | 58 | #%% 59 | import boto3 60 | 61 | s3 = boto3.resource('s3', 62 | region_name="us-west-2", 63 | aws_access_key_id=KEY, 64 | aws_secret_access_key=SECRET 65 | ) 66 | 67 | sampleDbBucket = s3.Bucket("udacity-labs") 68 | 69 | for obj in sampleDbBucket.objects.filter(Prefix="tickets"): 70 | print(obj) 71 | 72 | #%% [markdown] 73 | # # STEP 3: Create Tables 74 | 75 | #%% 76 | get_ipython().run_cell_magic('sql', '', 'DROP TABLE IF EXISTS "sporting_event_ticket";\nCREATE TABLE "sporting_event_ticket" (\n "id" double precision DEFAULT nextval(\'sporting_event_ticket_seq\') NOT NULL,\n "sporting_event_id" double precision NOT NULL,\n "sport_location_id" double precision NOT NULL,\n "seat_level" numeric(1,0) NOT NULL,\n "seat_section" character varying(15) NOT NULL,\n "seat_row" character varying(10) NOT NULL,\n "seat" character varying(10) NOT NULL,\n "ticketholder_id" double precision,\n "ticket_price" numeric(8,2) NOT NULL\n);') 77 | 78 | #%% [markdown] 79 | # # STEP 4: Load Partitioned data into the cluster 80 | 81 | #%% 82 | get_ipython().run_cell_magic('time', '', 'qry = """\n copy sporting_event_ticket from \'s3://udacity-labs/tickets/split/part\'\n credentials \'aws_iam_role={}\'\n gzip delimiter \';\' compupdate off region \'us-west-2\';\n""".format(DWH_ROLE_ARN)\n\n%sql $qry') 83 | 84 | #%% [markdown] 85 | # # STEP 4: Create Tables for the non-partitioned data 86 | 87 | #%% 88 | get_ipython().run_cell_magic('sql', '', 'DROP TABLE IF EXISTS "sporting_event_ticket_full";\nCREATE TABLE "sporting_event_ticket_full" (\n "id" double precision DEFAULT nextval(\'sporting_event_ticket_seq\') NOT NULL,\n "sporting_event_id" double precision NOT NULL,\n "sport_location_id" double precision NOT NULL,\n "seat_level" numeric(1,0) NOT NULL,\n "seat_section" character varying(15) NOT NULL,\n "seat_row" character 
varying(10) NOT NULL,\n "seat" character varying(10) NOT NULL,\n "ticketholder_id" double precision,\n "ticket_price" numeric(8,2) NOT NULL\n);') 89 | 90 | #%% [markdown] 91 | # # STEP 5: Load non-partitioned data into the cluster 92 | # - Note how it's slower than loading partitioned data 93 | 94 | #%% 95 | get_ipython().run_cell_magic('time', '', '\nqry = """\n copy sporting_event_ticket_full from \'s3://udacity-labs/tickets/full/full.csv.gz\' \n credentials \'aws_iam_role={}\' \n gzip delimiter \';\' compupdate off region \'us-west-2\';\n""".format(DWH_ROLE_ARN)\n\n%sql $qry') 96 | 97 | 98 | #%% 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/notebooks/pagila-star.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/2_dend_cloud_data_warehouses/notebooks/pagila-star.png -------------------------------------------------------------------------------- /2_dend_cloud_data_warehouses/notes/AWS.md: -------------------------------------------------------------------------------- 1 | ## What Is Cloud Computing? 2 | *Cloud computing: the practice of using a network of remote servers hosted on the Internet to store, manage, and process data, rather than a local server or a personal computer.* 3 | 4 | The arrival of cloud computing completely changed the way we deploy our technology, providing powerful access to instant and scalable computing power to enterprises, startups, and developers alike. Whether you need servers to host a web application, reliable storage for your data, or machines to train machine learning models, it's easy to see the advantage of relying on the cloud rather than utilizing your personal computer or local servers. 5 | 6 | For one, you no longer have to invest in lots of hardware upfront. No need to worry about whether you are paying for more than you'll need or what to do if you need to scale a lot more later on. Cloud computing makes this as easy and clicking a few buttons to scale your resources up or down. 7 | 8 | It's significantly faster provisioning the resources you need through the cloud versus the time it would take to gather and build up the hardware you'd need to provide the same support. This allows you and your team, or company, to develop and experiment at a much faster rate. 9 | 10 | Lastly, you can provide efficient access to your applications around the world by spreading your deployments to multiple regions. 11 | 12 | ## Amazon Web Services 13 | Amazon Web Services is one of the largest providers in the cloud computing industry, with over 140 services in compute, storage, databases, networking, developer tools, security, and more. 14 | Services provided in AWS can be accessed in three different ways: the AWS Management Console, the Command Line Interface, or Software Development Kits, which can be used in combination. 
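
To make the SDK option above concrete, here is a minimal boto3 sketch (not part of the course code; the region is an arbitrary example and credentials are assumed to come from the standard AWS credential chain):

```python
import boto3

# Create an S3 client; credentials are resolved from the usual chain
# (environment variables, ~/.aws/credentials, or an attached IAM role).
s3 = boto3.client("s3", region_name="us-west-2")

# List the buckets visible to these credentials.
response = s3.list_buckets()
for bucket in response["Buckets"]:
    print(bucket["Name"])
```

The same operation could be done through the Management Console or with `aws s3 ls` on the CLI, which is what makes the three access methods interchangeable for simple tasks.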
15 | -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/Data Lakes with Spark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/3_dend_spark_data_lakes/Data Lakes with Spark.pdf -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/P4_Data_Lake/README.md: -------------------------------------------------------------------------------- 1 | # Project: Data Lake 2 | 3 | ## Introduction 4 | 5 | *A music streaming startup, Sparkify, has grown their user base and song database even more and want to move their data warehouse to a data lake. Their data resides in S3, in a directory of JSON logs on user activity on the app, as well as a directory with JSON metadata on the songs in their app.* 6 | 7 | In this project we will build an ETL pipeline that extracts their data from the data lake hosted on S3, processes them using Spark which will be deployed on an EMR cluster using AWS, and load the data back into S3 as a set of dimensional tables in parquet format. 8 | 9 | From this tables we will be able to find insights in what songs their users are listening to. 10 | 11 | ## How to run 12 | 13 | *To run this project in local mode*, create a file `dl.cfg` in the root of this project with the following data: 14 | 15 | ``` 16 | KEY=YOUR_AWS_ACCESS_KEY 17 | SECRET=YOUR_AWS_SECRET_KEY 18 | ``` 19 | 20 | Create an S3 Bucket named `sparkify-dend` where output results will be stored. 21 | 22 | Finally, run the following command: 23 | 24 | `python etl.py` 25 | 26 | *To run on an Jupyter Notebook powered by an EMR cluster*, import the notebook found in this project. 27 | 28 | ## Project structure 29 | 30 | The files found at this project are the following: 31 | 32 | - dl.cfg: *not uploaded to github - you need to create this file yourself* File with AWS credentials. 33 | - etl.py: Program that extracts songs and log data from S3, transforms it using Spark, and loads the dimensional tables created in parquet format back to S3. 34 | - README.md: Current file, contains detailed information about the project. 35 | 36 | ## ETL pipeline 37 | 38 | 1. Load credentials 39 | 2. Read data from S3 40 | - Song data: `s3://udacity-dend/song_data` 41 | - Log data: `s3://udacity-dend/log_data` 42 | 43 | The script reads song_data and load_data from S3. 44 | 45 | 3. Process data using spark 46 | 47 | Transforms them to create five different tables listed under `Dimension Tables and Fact Table`. 48 | Each table includes the right columns and data types. Duplicates are addressed where appropriate. 49 | 50 | 4. Load it back to S3 51 | 52 | Writes them to partitioned parquet files in table directories on S3. 53 | 54 | Each of the five tables are written to parquet files in a separate analytics directory on S3. Each table has its own folder within the directory. Songs table files are partitioned by year and then artist. Time table files are partitioned by year and month. Songplays table files are partitioned by year and month. 55 | 56 | ### Source Data 57 | - **Song datasets**: all json files are nested in subdirectories under *s3a://udacity-dend/song_data*. 
A sample of these files is:
58 | 
59 | ```
60 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0}
61 | ```
62 | 
63 | - **Log datasets**: all json files are nested in subdirectories under *s3a://udacity-dend/log_data*. A sample of a single row of each file is:
64 | 
65 | ```
66 | {"artist":"Slipknot","auth":"Logged In","firstName":"Aiden","gender":"M","itemInSession":0,"lastName":"Ramirez","length":192.57424,"level":"paid","location":"New York-Newark-Jersey City, NY-NJ-PA","method":"PUT","page":"NextSong","registration":1540283578796.0,"sessionId":19,"song":"Opium Of The People (Album Version)","status":200,"ts":1541639510796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"20"}
67 | ```
68 | 
69 | ### Dimension Tables and Fact Table
70 | 
71 | **songplays** - Fact table - records in log data associated with song plays i.e. records with page NextSong
72 | - songplay_id (INT) PRIMARY KEY: ID of each user song play
73 | - start_time (DATE) NOT NULL: Timestamp of the beginning of user activity
74 | - user_id (INT) NOT NULL: ID of user
75 | - level (TEXT): User level {free | paid}
76 | - song_id (TEXT) NOT NULL: ID of Song played
77 | - artist_id (TEXT) NOT NULL: ID of Artist of the song played
78 | - session_id (INT): ID of the user Session
79 | - location (TEXT): User location
80 | - user_agent (TEXT): Agent used by the user to access the Sparkify platform
81 | 
82 | **users** - users in the app
83 | - user_id (INT) PRIMARY KEY: ID of user
84 | - first_name (TEXT) NOT NULL: First name of user
85 | - last_name (TEXT) NOT NULL: Last name of user
86 | - gender (TEXT): Gender of user {M | F}
87 | - level (TEXT): User level {free | paid}
88 | 
89 | **songs** - songs in music database
90 | - song_id (TEXT) PRIMARY KEY: ID of Song
91 | - title (TEXT) NOT NULL: Title of Song
92 | - artist_id (TEXT) NOT NULL: ID of song Artist
93 | - year (INT): Year of song release
94 | - duration (FLOAT) NOT NULL: Song duration in seconds
95 | 
96 | **artists** - artists in music database
97 | - artist_id (TEXT) PRIMARY KEY: ID of Artist
98 | - name (TEXT) NOT NULL: Name of Artist
99 | - location (TEXT): Name of Artist city
100 | - latitude (FLOAT): Latitude location of artist
101 | - longitude (FLOAT): Longitude location of artist
102 | 
103 | **time** - timestamps of records in songplays broken down into specific units
104 | - start_time (DATE) PRIMARY KEY: Timestamp of row
105 | - hour (INT): Hour associated with start_time
106 | - day (INT): Day associated with start_time
107 | - week (INT): Week of year associated with start_time
108 | - month (INT): Month associated with start_time
109 | - year (INT): Year associated with start_time
110 | - weekday (TEXT): Name of the week day associated with start_time
111 | 
112 | 
113 | ## Authors
114 | 
115 | * **Florencia Silvestre** - [Github](https://github.com/Flor91) - [LinkedIn](https://www.linkedin.com/in/florencia-silvestre-2683587b/)
-------------------------------------------------------------------------------- /3_dend_spark_data_lakes/P4_Data_Lake/etl.py: --------------------------------------------------------------------------------
1 | import configparser
2 | from datetime import datetime
3 | import os
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql.functions import udf, col
6 | from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
7 | from pyspark.sql.functions import monotonically_increasing_id
8 | from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Dat, TimestampType
9 | 
10 | 
11 | config = configparser.ConfigParser()
12 | config.read('dl.cfg')
13 | 
14 | os.environ['AWS_ACCESS_KEY_ID']=config['AWS_ACCESS_KEY_ID']
15 | os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS_SECRET_ACCESS_KEY']
16 | 
17 | 
18 | def create_spark_session():
19 |     """
20 |     Create or retrieve a Spark Session
21 |     """
22 |     spark = SparkSession \
23 |         .builder \
24 |         .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
25 |         .getOrCreate()
26 |     return spark
27 | 
28 | 
29 | def process_song_data(spark, input_data, output_data):
30 |     """
31 |     Description: Loads song_data from S3, extracts the songs and artists tables,
32 |     and writes them back to S3 as partitioned parquet files
33 | 
34 |     Parameters:
35 |         spark : Spark Session
36 |         input_data : location of the song_data json files with the songs metadata
37 |         output_data : S3 bucket where the dimensional tables will be stored in parquet format
38 |     """
39 | 
40 |     song_data = input_data + 'song_data/*/*/*/*.json'
41 | 
42 |     songSchema = R([
43 |         Fld("artist_id", Str()),
44 |         Fld("artist_latitude", Dbl()),
45 |         Fld("artist_location", Str()),
46 |         Fld("artist_longitude", Dbl()),
47 |         Fld("artist_name", Str()),
48 |         Fld("duration", Dbl()),
49 |         Fld("num_songs", Int()),
50 |         Fld("title", Str()),
51 |         Fld("year", Int()),
52 |     ])
53 | 
54 |     df = spark.read.json(song_data, schema=songSchema)
55 | 
56 |     song_fields = ["title", "artist_id", "year", "duration"]
57 | 
58 |     songs_table = df.select(song_fields).dropDuplicates().withColumn("song_id", monotonically_increasing_id())
59 | 
60 |     songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')
61 | 
62 |     artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude"]
63 | 
64 |     artists_table = df.selectExpr(artists_fields).dropDuplicates()
65 | 
66 |     artists_table.write.parquet(output_data + 'artists/')
67 | 
68 | 
69 | def process_log_data(spark, input_data, output_data):
70 |     """
71 |     Description: Loads log_data from S3, extracts the users, time and songplays tables,
72 |     and writes them back to S3 as parquet files.
The songs and artists tables written by process_song_data are also read back from S3 to build the songplays table.
73 | 
74 |     Parameters:
75 |         spark : Spark Session
76 |         input_data : location of the log_data json files with the events data
77 |         output_data : S3 bucket where the dimensional tables will be stored in parquet format
78 | 
79 |     """
80 | 
81 |     log_data = input_data + 'log_data/*/*/*.json'
82 | 
83 |     df = spark.read.json(log_data)
84 | 
85 |     df = df.filter(df.page == 'NextSong')
86 | 
87 |     users_fields = ["userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level"]
88 |     users_table = df.selectExpr(users_fields).dropDuplicates()
89 | 
90 |     users_table.write.parquet(output_data + 'users/')
91 | 
92 |     # convert the epoch-millisecond ts column into a timestamp column
93 |     get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0), TimestampType())
94 |     df = df.withColumn("start_time", get_datetime('ts'))
95 | 
96 |     time_table = df.select("start_time").dropDuplicates() \
97 |         .withColumn("hour", hour(col("start_time"))) \
98 |         .withColumn("day", dayofmonth(col("start_time"))) \
99 |         .withColumn("week", weekofyear(col("start_time"))) \
100 |         .withColumn("month", month(col("start_time"))) \
101 |         .withColumn("year", year(col("start_time"))) \
102 |         .withColumn("weekday", date_format(col("start_time"), 'E'))
103 | 
104 |     time_table.write.partitionBy("year", "month").parquet(output_data + 'time/')
105 | 
106 |     # read the songs and artists tables back from their base paths so partition columns are recovered
107 |     df_songs = spark.read.parquet(output_data + 'songs/')
108 |     df_artists = spark.read.parquet(output_data + 'artists/')
109 | 
110 |     songs_logs = df.join(df_songs, (df.song == df_songs.title))
111 |     # drop the duplicated artist columns so artist_id and location stay unambiguous
112 |     artists_songs_logs = songs_logs.join(df_artists, (songs_logs.artist == df_artists.name)) \
113 |         .drop(df_artists.artist_id).drop(df_artists.location)
114 | 
115 |     # year and month are derived directly from start_time for partitioning
116 |     songplays_table = artists_songs_logs.select(
117 |         col('start_time'),
118 |         col('userId').alias('user_id'),
119 |         col('level'),
120 |         col('song_id'),
121 |         col('artist_id'),
122 |         col('sessionId').alias('session_id'),
123 |         col('location'),
124 |         col('userAgent').alias('user_agent'),
125 |         year(col('start_time')).alias('year'),
126 |         month(col('start_time')).alias('month'),
127 |     ).withColumn('songplay_id', monotonically_increasing_id()).repartition("year", "month")
128 | 
129 |     songplays_table.write.partitionBy("year", "month").parquet(output_data + 'songplays/')
130 | 
131 | 
132 | def main():
133 |     """
134 |     Extract the songs and events data from S3, transform it into dimensional tables, and load it back to S3 in parquet format
135 |     """
136 |     spark = create_spark_session()
137 |     input_data = "s3a://udacity-dend/"
138 |     output_data = "s3a://sparkify-dend/"
139 | 
140 |     process_song_data(spark, input_data, output_data)
141 |     process_log_data(spark, input_data, output_data)
142 | 
143 | 
144 | if __name__ == "__main__":
145 |     main()
-------------------------------------------------------------------------------- /3_dend_spark_data_lakes/data/log-data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/3_dend_spark_data_lakes/data/log-data.png
-------------------------------------------------------------------------------- /3_dend_spark_data_lakes/data/log-data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/3_dend_spark_data_lakes/data/log-data.zip
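
As a quick sanity check of the P4 pipeline above, the parquet tables it writes can be queried directly with Spark. The sketch below is not part of the repository; it assumes the `s3a://sparkify-dend/` output bucket configured in etl.py and simply counts song plays per month using the partition columns written by the script.

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("sparkify-analysis").getOrCreate()

# Path assumed to match the output_data bucket used in etl.py above.
songplays = spark.read.parquet("s3a://sparkify-dend/songplays/")

# Count song plays per year and month (the partition columns written by etl.py).
songplays.groupBy("year", "month") \
    .agg(F.count("*").alias("plays")) \
    .orderBy("year", "month") \
    .show()
```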
-------------------------------------------------------------------------------- /3_dend_spark_data_lakes/data/song-data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/3_dend_spark_data_lakes/data/song-data.zip -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/notebooks/1_procedural_vs_functional_in_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Procedural Programming\n", 8 | "\n", 9 | "This notebook contains the code from the previous screencast. The code counts the number of times a song appears in the log_of_songs variable. \n", 10 | "\n", 11 | "You'll notice that the first time you run `count_plays(\"Despacito\")`, you get the correct count. However, when you run the same code again `count_plays(\"Despacito\")`, the results are no longer correct.This is because the global variable `play_count` stores the results outside of the count_plays function. \n", 12 | "\n", 13 | "\n", 14 | "# Instructions\n", 15 | "\n", 16 | "Run the code cells in this notebook to see the problem with " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "log_of_songs = [\n", 26 | " \"Despacito\",\n", 27 | " \"Nice for what\",\n", 28 | " \"No tears left to cry\",\n", 29 | " \"Despacito\",\n", 30 | " \"Havana\",\n", 31 | " \"In my feelings\",\n", 32 | " \"Nice for what\",\n", 33 | " \"Despacito\",\n", 34 | " \"All the stars\"\n", 35 | "]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "play_count = 0" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "def count_plays(song_title):\n", 54 | " global play_count\n", 55 | " for song in log_of_songs:\n", 56 | " if song == song_title:\n", 57 | " play_count = play_count + 1\n", 58 | " return play_count" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "count_plays(\"Despacito\")" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "count_plays(\"Despacito\")" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "# How to Solve the Issue\n", 84 | "\n", 85 | "How might you solve this issue? You could get rid of the global variable and instead use play_count as an input to the function:\n", 86 | "\n", 87 | "```python\n", 88 | "def count_plays(song_title, play_count):\n", 89 | " for song in log_of_songs:\n", 90 | " if song == song_title:\n", 91 | " play_count = play_count + 1\n", 92 | " return play_count\n", 93 | "\n", 94 | "```\n", 95 | "\n", 96 | "How would this work with parallel programming? Spark splits up data onto multiple machines. If your songs list were split onto two machines, Machine A would first need to finish counting, and then return its own result to Machine B. And then Machine B could use the output from Machine A and add to the count.\n", 97 | "\n", 98 | "However, that isn't parallel computing. 
Machine B would have to wait until Machine A finishes. You'll see in the next parts of the lesson how Spark solves this issue with a functional programming paradigm.\n", 99 | "\n", 100 | "In Spark, if your data is split onto two different machines, machine A will run a function to count how many times 'Despacito' appears on machine A. Machine B will simultaneously run a function to count how many times 'Despacito' appears on machine B. After they finish counting individually, they'll combine their results together. You'll see how this works in the next parts of the lesson." 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.6.3" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 2 125 | } 126 | -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/notebooks/2_spark_maps_and_lazy_evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Maps\n", 8 | "\n", 9 | "In Spark, maps take data as input and then transform that data with whatever function you put in the map. They are like directions for the data telling how each input should get to the output.\n", 10 | "\n", 11 | "The first code cell creates a SparkContext object. With the SparkContext, you can input a dataset and parallelize the data across a cluster (since you are currently using Spark in local mode on a single machine, technically the dataset isn't distributed yet).\n", 12 | "\n", 13 | "Run the code cell below to instantiate a SparkContext object and then read in the log_of_songs list into Spark. " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "### \n", 23 | "# You might have noticed this code in the screencast.\n", 24 | "#\n", 25 | "# import findspark\n", 26 | "# findspark.init('spark-2.3.2-bin-hadoop2.7')\n", 27 | "#\n", 28 | "# The findspark Python module makes it easier to install\n", 29 | "# Spark in local mode on your computer. This is convenient\n", 30 | "# for practicing Spark syntax locally. \n", 31 | "# However, the workspaces already have Spark installed and you do not\n", 32 | "# need to use the findspark module\n", 33 | "#\n", 34 | "###\n", 35 | "\n", 36 | "import pyspark\n", 37 | "sc = pyspark.SparkContext(appName=\"maps_and_lazy_evaluation_example\")\n", 38 | "\n", 39 | "log_of_songs = [\n", 40 | " \"Despacito\",\n", 41 | " \"Nice for what\",\n", 42 | " \"No tears left to cry\",\n", 43 | " \"Despacito\",\n", 44 | " \"Havana\",\n", 45 | " \"In my feelings\",\n", 46 | " \"Nice for what\",\n", 47 | " \"despacito\",\n", 48 | " \"All the stars\"\n", 49 | "]\n", 50 | "\n", 51 | "# parallelize the log_of_songs to use with Spark\n", 52 | "distributed_song_log = sc.parallelize(log_of_songs)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "This next code cell defines a function that converts a song title to lowercase. Then there is an example converting the word \"Havana\" to \"havana\"." 
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "def convert_song_to_lowercase(song):\n", 69 | " return song.lower()\n", 70 | "\n", 71 | "convert_song_to_lowercase(\"Havana\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "The following code cells demonstrate how to apply this function using a map step. The map step will go through each song in the list and apply the convert_song_to_lowercase() function. " 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "distributed_song_log.map(convert_song_to_lowercase)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "You'll notice that this code cell ran quite quickly. This is because of lazy evaluation. Spark does not actually execute the map step unless it needs to.\n", 95 | "\n", 96 | "\"RDD\" in the output refers to resilient distributed dataset. RDDs are exactly what they say they are: fault-tolerant datasets distributed across a cluster. This is how Spark stores data. \n", 97 | "\n", 98 | "To get Spark to actually run the map step, you need to use an \"action\". One available action is the collect method. The collect() method takes the results from all of the clusters and \"collects\" them into a single list on the master node." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "distributed_song_log.map(convert_song_to_lowercase).collect()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Note as well that Spark is not changing the original data set: Spark is merely making a copy. You can see this by running collect() on the original dataset." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "distributed_song_log.collect()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "You do not always have to write a custom function for the map step. You can also use anonymous (lambda) functions as well as built-in Python functions like string.lower(). \n", 131 | "\n", 132 | "Anonymous functions are actually a Python feature for writing functional style programs." 
133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "distributed_song_log.map(lambda song: song.lower()).collect()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "distributed_song_log.map(lambda x: x.lower()).collect()" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.6.3" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 2 175 | } 176 | -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/notebooks/3_data_inputs_and_outputs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Reading and Writing Data with Spark\n", 8 | "\n", 9 | "This notebook contains the code from the previous screencast. The only difference is that instead of reading in a dataset from a remote cluster, the data set is read in from a local file. You can see the file by clicking on the \"jupyter\" icon and opening the folder titled \"data\".\n", 10 | "\n", 11 | "Run the code cell to see how everything works. \n", 12 | "\n", 13 | "First let's import SparkConf and SparkSession" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pyspark\n", 23 | "from pyspark import SparkConf\n", 24 | "from pyspark.sql import SparkSession" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "Since we're using Spark locally we already have both a sparkcontext and a sparksession running. We can update some of the parameters, such our application's name. Let's just call it \"Our first Python Spark SQL example\"" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "spark = SparkSession \\\n", 41 | " .builder \\\n", 42 | " .appName(\"Our first Python Spark SQL example\") \\\n", 43 | " .getOrCreate()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Let's check if the change went through" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "spark.sparkContext.getConf().getAll()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "spark" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "As you can see the app name is exactly how we set it\n", 76 | "\n", 77 | "Let's create our first dataframe from a fairly small sample data set. Througout the course we'll work with a log file data set that describes user interactions with a music streaming service. The records describe events such as logging in to the site, visiting a page, listening to the next song, seeing an ad." 
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "path = \"data/sparkify_log_small.json\"\n", 87 | "user_log = spark.read.json(path)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "user_log.printSchema()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "user_log.describe()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "user_log.show(n=1)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "user_log.take(5)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "out_path = \"data/sparkify_log_small.csv\"" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "user_log.write.save(out_path, format=\"csv\", header=True)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "user_log_2 = spark.read.csv(out_path, header=True)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "user_log_2.printSchema()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "user_log_2.take(2)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "user_log_2.select(\"userID\").show()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "user_log_2.take(1)" 187 | ] 188 | } 189 | ], 190 | "metadata": { 191 | "kernelspec": { 192 | "display_name": "Python 3", 193 | "language": "python", 194 | "name": "python3" 195 | }, 196 | "language_info": { 197 | "codemirror_mode": { 198 | "name": "ipython", 199 | "version": 3 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | "pygments_lexer": "ipython3", 206 | "version": "3.6.3" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 1 211 | } 212 | -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/notebooks/5_dataframe_quiz.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Wrangling with DataFrames Coding Quiz\n", 8 | "\n", 9 | "Use this Jupyter notebook to find the answers to the quiz in the previous section. There is an answer key in the next part of the lesson." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from pyspark.sql import SparkSession\n", 19 | "\n", 20 | "# TODOS: \n", 21 | "# 1) import any other libraries you might need\n", 22 | "# 2) instantiate a Spark session \n", 23 | "# 3) read in the data set located at the path \"data/sparkify_log_small.json\"\n", 24 | "# 4) write code to answer the quiz questions " 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# Question 1\n", 32 | "\n", 33 | "Which page did user id \"\" (empty string) NOT visit?" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# TODO: write your code to answer question 1" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Question 2 - Reflect\n", 50 | "\n", 51 | "What type of user does the empty string user id most likely refer to?\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# TODO: use this space to explore the behavior of the user with an empty string\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "# Question 3\n", 68 | "\n", 69 | "How many female users do we have in the data set?" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# TODO: write your code to answer question 3" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "# Question 4\n", 86 | "\n", 87 | "How many songs were played from the most played artist?" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# TODO: write your code to answer question 4" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "# Question 5 (challenge)\n", 104 | "\n", 105 | "How many songs do users listen to on average between visiting our home page? 
Please round your answer to the closest integer.\n", 106 | "\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# TODO: write your code to answer question 5" 116 | ] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.6.3" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 2 140 | } 141 | -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/notebooks/6_dataframe_quiz_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Answer Key to the Data Wrangling with DataFrames Coding Quiz\n", 8 | "\n", 9 | "Helpful resources:\n", 10 | "http://spark.apache.org/docs/latest/api/python/pyspark.sql.html" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from pyspark.sql import SparkSession\n", 20 | "from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, avg\n", 21 | "from pyspark.sql.functions import sum as Fsum\n", 22 | "from pyspark.sql.window import Window\n", 23 | "from pyspark.sql.types import IntegerType" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# 1) import any other libraries you might need\n", 33 | "# 2) instantiate a Spark session \n", 34 | "# 3) read in the data set located at the path \"data/sparkify_log_small.json\"\n", 35 | "# 4) write code to answer the quiz questions \n", 36 | "\n", 37 | "spark = SparkSession \\\n", 38 | " .builder \\\n", 39 | " .appName(\"Data Frames practice\") \\\n", 40 | " .getOrCreate()\n", 41 | "\n", 42 | "df = spark.read.json(\"data/sparkify_log_small.json\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Question 1\n", 50 | "\n", 51 | "Which page did user id \"\" (empty string) NOT visit?" 
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df.printSchema()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# filter for users with blank user id\n", 70 | "blank_pages = df.filter(df.userId == '') \\\n", 71 | " .select(col('page') \\\n", 72 | " .alias('blank_pages')) \\\n", 73 | " .dropDuplicates()\n", 74 | "\n", 75 | "# get a list of possible pages that could be visited\n", 76 | "all_pages = df.select('page').dropDuplicates()\n", 77 | "\n", 78 | "# find values in all_pages that are not in blank_pages\n", 79 | "# these are the pages that the blank user did not go to\n", 80 | "for row in set(all_pages.collect()) - set(blank_pages.collect()):\n", 81 | " print(row.page)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "# Question 2 - Reflect\n", 89 | "\n", 90 | "What type of user does the empty string user id most likely refer to?\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "Perhaps it represents users who have not signed up yet or who are signed out and are about to log in." 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Question 3\n", 105 | "\n", 106 | "How many female users do we have in the data set?" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "df.filter(df.gender == 'F') \\\n", 116 | " .select('userId', 'gender') \\\n", 117 | " .dropDuplicates() \\\n", 118 | " .count()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "# Question 4\n", 126 | "\n", 127 | "How many songs were played from the most played artist?" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "df.filter(df.page == 'NextSong') \\\n", 137 | " .select('Artist') \\\n", 138 | " .groupBy('Artist') \\\n", 139 | " .agg({'Artist':'count'}) \\\n", 140 | " .withColumnRenamed('count(Artist)', 'Artistcount') \\\n", 141 | " .sort(desc('Artistcount')) \\\n", 142 | " .show(1)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# Question 5 (challenge)\n", 150 | "\n", 151 | "How many songs do users listen to on average between visiting our home page? 
Please round your answer to the closest integer.\n", 152 | "\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# TODO: filter out 0 sum and max sum to get more exact answer\n", 162 | "\n", 163 | "function = udf(lambda ishome : int(ishome == 'Home'), IntegerType())\n", 164 | "\n", 165 | "user_window = Window \\\n", 166 | " .partitionBy('userID') \\\n", 167 | " .orderBy(desc('ts')) \\\n", 168 | " .rangeBetween(Window.unboundedPreceding, 0)\n", 169 | "\n", 170 | "cusum = df.filter((df.page == 'NextSong') | (df.page == 'Home')) \\\n", 171 | " .select('userID', 'page', 'ts') \\\n", 172 | " .withColumn('homevisit', function(col('page'))) \\\n", 173 | " .withColumn('period', Fsum('homevisit').over(user_window))\n", 174 | "\n", 175 | "cusum.filter((cusum.page == 'NextSong')) \\\n", 176 | " .groupBy('userID', 'period') \\\n", 177 | " .agg({'period':'count'}) \\\n", 178 | " .agg({'count(period)':'avg'}).show()" 179 | ] 180 | } 181 | ], 182 | "metadata": { 183 | "kernelspec": { 184 | "display_name": "Python 3", 185 | "language": "python", 186 | "name": "python3" 187 | }, 188 | "language_info": { 189 | "codemirror_mode": { 190 | "name": "ipython", 191 | "version": 3 192 | }, 193 | "file_extension": ".py", 194 | "mimetype": "text/x-python", 195 | "name": "python", 196 | "nbconvert_exporter": "python", 197 | "pygments_lexer": "ipython3", 198 | "version": "3.6.3" 199 | } 200 | }, 201 | "nbformat": 4, 202 | "nbformat_minor": 2 203 | } 204 | -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/notebooks/8_spark_sql_quiz.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Wrangling with Spark SQL Quiz\n", 8 | "\n", 9 | "This quiz uses the same dataset and most of the same questions from the earlier \"Quiz - Data Wrangling with Data Frames Jupyter Notebook.\" For this quiz, however, use Spark SQL instead of Spark Data Frames." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from pyspark.sql import SparkSession\n", 19 | "\n", 20 | "# TODOS: \n", 21 | "# 1) import any other libraries you might need\n", 22 | "# 2) instantiate a Spark session \n", 23 | "# 3) read in the data set located at the path \"data/sparkify_log_small.json\"\n", 24 | "# 4) create a view to use with your SQL queries\n", 25 | "# 5) write code to answer the quiz questions " 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Question 1\n", 33 | "\n", 34 | "Which page did user id \"\"(empty string) NOT visit?" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# TODO: write your code to answer question 1" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Question 2 - Reflect\n", 51 | "\n", 52 | "Why might you prefer to use SQL over data frames? Why might you prefer data frames over SQL?" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# Question 3\n", 60 | "\n", 61 | "How many female users do we have in the data set?" 
62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# TODO: write your code to answer question 3" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "# Question 4\n", 78 | "\n", 79 | "How many songs were played from the most played artist?" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# TODO: write your code to answer question 4" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "# Question 5 (challenge)\n", 96 | "\n", 97 | "How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# TODO: write your code to answer question 5" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.6.3" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 2 131 | } 132 | -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/notebooks/mapreduce_practice.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MapReduce\n", 8 | "\n", 9 | "The MapReduce programming technique was designed to analyze massive data sets across a cluster. In this Jupyter notebook, you'll get a sense for how Hadoop MapReduce works; however, this notebook will run locally rather than on a cluster.\n", 10 | "\n", 11 | "The biggest difference between Hadoop and Spark is that Spark tries to do as many calculations as possible in memory, which avoids moving data back and forth across a cluster. Hadoop writes intermediate calculations out to disk, which can be less efficient. Hadoop is an older technology than Spark and one of the cornerstone big data technologies.\n", 12 | "\n", 13 | "If you click on the Jupyter notebook logo at the top of the workspace, you'll be taken to the workspace directory. There you will see a file called \"songplays.txt\". This is a text file where each line represents a song that was played in the Sparkify app. The MapReduce code will count how many times each song was played. In other words, the code counts how many times the song title appears in the list.\n", 14 | "\n", 15 | "\n", 16 | "# MapReduce versus Hadoop MapReduce\n", 17 | "\n", 18 | "Don't get confused by the terminology! MapReduce is a programming technique. Hadoop MapReduce is a specific implementation of the programming technique.\n", 19 | "\n", 20 | "Some of the syntax will look a bit funny, so be sure to read the explanation and comments for each section. You'll learn more about the syntax in later lessons. \n", 21 | "\n", 22 | "Run each of the code cells below to see the output." 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Install mrjob library. This package is for running MapReduce jobs with Python\n", 32 | "# In Jupyter notebooks, \"!\" runs terminal commands from inside notebooks \n", 33 | "\n", 34 | "! pip install mrjob" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "%%file wordcount.py\n", 44 | "# %%file is an Ipython magic function that saves the code cell as a file\n", 45 | "\n", 46 | "from mrjob.job import MRJob # import the mrjob library\n", 47 | "\n", 48 | "class MRSongCount(MRJob):\n", 49 | " \n", 50 | " # the map step: each line in the txt file is read as a key, value pair\n", 51 | " # in this case, each line in the txt file only contains a value but no key\n", 52 | " # _ means that in this case, there is no key for each line\n", 53 | " def mapper(self, _, song):\n", 54 | " # output each line as a tuple of (song_names, 1) \n", 55 | " yield (song, 1)\n", 56 | "\n", 57 | " # the reduce step: combine all tuples with the same key\n", 58 | " # in this case, the key is the song name\n", 59 | " # then sum all the values of the tuple, which will give the total song plays\n", 60 | " def reducer(self, key, values):\n", 61 | " yield (key, sum(values))\n", 62 | " \n", 63 | "if __name__ == \"__main__\":\n", 64 | " MRSongCount.run()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# run the code as a terminal command\n", 74 | "! python wordcount.py songplays.txt" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "# Summary of what happens in the code.\n", 82 | "\n", 83 | "There is a list of songs in songplays.txt that looks like the following:\n", 84 | "\n", 85 | "Deep Dreams\n", 86 | "Data House Rock\n", 87 | "Deep Dreams\n", 88 | "Data House Rock\n", 89 | "Broken Networks\n", 90 | "Data House Rock\n", 91 | "etc.....\n", 92 | "\n", 93 | "During the map step, the code reads in the txt file one line at a time. The map steps outputs a set of tuples that look like this:\n", 94 | "\n", 95 | "(Deep Dreams, 1) \n", 96 | "(Data House Rock, 1) \n", 97 | "(Deep Dreams, 1) \n", 98 | "(Data House Rock, 1) \n", 99 | "(Broken Networks, 1) \n", 100 | "(Data House Rock, 1) \n", 101 | "etc.....\n", 102 | "\n", 103 | "Finally, the reduce step combines all of the values by keys and sums the values: \n", 104 | "\n", 105 | "(Deep Dreams, \\[1, 1, 1, 1, 1, 1, ... 
\\]) \n", 106 | "(Data House Rock, \\[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\\]) \n", 107 | "(Broken Networks, \\[1, 1, 1, ...\\] \n", 108 | "\n", 109 | "With the output \n", 110 | "\n", 111 | "(Deep Dreams, 1131) \n", 112 | "(Data House Rock, 510) \n", 113 | "(Broken Networks, 828) " 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.6.3" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /3_dend_spark_data_lakes/spark.md: -------------------------------------------------------------------------------- 1 | ## General functions 2 | We have used the following general functions that are quite similar to methods of Pandas dataframes: 3 | 4 | - select(): returns a new dataframe with the selected columns 5 | - filter(): filters rows using the given condition 6 | - where(): is just an alias for filter() 7 | - groupBy(): groups the DataFrame using the specified columns, so we can run aggregation on them 8 | - sort(): returns a new DataFrame sorted by the specified column(s). By default the second parameter 'ascending' is True 9 | - dropDuplicates(): returns a new dataframe with unique rows based on all or just a subset of columns 10 | - withColumn(): returns a new DataFrame by adding a column or replacing the existing column that has the same name. The first parameter is the name of the new column, the second is an expression of how to compute it 11 | 12 | ## Aggregate functions 13 | Spark SQL provides built-in methods for the most common aggregations such as count(), countDistinct(), avg(), max(), min(), etc. in the pyspark.sql.functions module. These methods are not the same as the built-in methods in the Python Standard Library, where we can find min() for example as well, hence you need to be careful not to try to use them interchangeably. 14 | 15 | In many cases, there are multiple ways to express the same aggregations. For example, if we would like to compute one type of aggregate for one or more columns of the dataframe we can just simply chain the aggregate method after a groupBy(). If we would like to use different functions on different columns agg() comes in handy. For example agg({"salary": "avg", "age": "max"}) computes the average salary and maximum age. 16 | 17 | ## User defined functions (UDF) 18 | In Spark SQL we can define our own functions with the udf method from the pyspark.sql.functions module. The default type of the returned variable for UDFs is string. If we would like to return an other type we need to explicitly do so by using the different types from the pyspark.sql.types module. 19 | 20 | ## Window functions 21 | Window functions are a way of combining the values of ranges of rows in a dataframe. When defining the window we can choose how to sort and group (with the partitionBy method) the rows and how wide of a window we'd like to use (described by rangeBetween or rowsBetween). 
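The UDF and window-function descriptions above map directly onto a few lines of PySpark. Here is a minimal sketch against the `sparkify_log_small.json` sample used in the notebooks (column names such as `userId`, `page` and `ts` come from that file):

```python
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import udf, col, desc, sum as Fsum
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("spark_md_examples").getOrCreate()
df = spark.read.json("data/sparkify_log_small.json")

# UDF returning an explicit IntegerType instead of the default string
is_home = udf(lambda page: int(page == "Home"), IntegerType())

# Window: partition by user, order by timestamp, unbounded preceding -> current row
user_window = Window.partitionBy("userId").orderBy(desc("ts")) \
    .rangeBetween(Window.unboundedPreceding, 0)

# cumulative count of Home visits per user, i.e. a running "listening period" id
df.filter((df.page == "NextSong") | (df.page == "Home")) \
  .withColumn("homevisit", is_home(col("page"))) \
  .withColumn("period", Fsum("homevisit").over(user_window)) \
  .select("userId", "page", "ts", "period") \
  .show(5)
```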
22 | 23 | For further information see the [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html) and the [Spark Python API Docs](https://spark.apache.org/docs/latest/api/python/index.html) . -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/__init__.py -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/airflow.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/airflow.db -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/dags/__init__.py -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/dags/sparkify_dend_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.dummy_operator import DummyOperator 4 | from airflow.operators.subdag_operator import SubDagOperator 5 | from airflow.operators import (StageToRedshiftOperator, LoadFactOperator, 6 | DataQualityOperator, CreateTablesOperator) 7 | from helpers import SqlQueries 8 | from sparkify_dend_dimesions_subdag import load_dimensional_tables_dag 9 | 10 | 11 | start_date = datetime.utcnow() 12 | 13 | default_args = { 14 | 'owner': 'florencia', 15 | 'start_date': datetime(2018, 5, 1), 16 | 'end_date': datetime(2018, 11, 30), 17 | 'depends_on_past': False, 18 | 'retries': 3, 19 | 'retry_delay': timedelta(minutes=5), 20 | 'catchup': False, 21 | 'email_on_retry': False 22 | } 23 | 24 | dag_name='sparkify_dend_dag' 25 | dag = DAG(dag_name, 26 | default_args=default_args, 27 | description='Load and transform data in Redshift with Airflow', 28 | schedule_interval='0 * * * *', 29 | max_active_runs=3 30 | ) 31 | 32 | start_operator = DummyOperator(task_id='Begin_execution', dag=dag) 33 | 34 | create_redshift_tables = CreateTablesOperator( 35 | task_id='Create_tables', 36 | dag=dag, 37 | redshift_conn_id="redshift" 38 | ) 39 | 40 | stage_events_to_redshift = StageToRedshiftOperator( 41 | task_id='Stage_events', 42 | dag=dag, 43 | provide_context=True, 44 | table="events", 45 | redshift_conn_id="redshift", 46 | aws_credentials_id="aws_credentials", 47 | s3_bucket="udacity-dend", 48 | s3_key="log_data", 49 | region="us-west-2", 50 | file_format="JSON", 51 | execution_date=start_date 52 | ) 53 | 54 | stage_songs_to_redshift = StageToRedshiftOperator( 55 | task_id='Stage_songs', 56 | dag=dag, 57 | provide_context=True, 58 | table="songs", 59 | redshift_conn_id="redshift", 60 | aws_credentials_id="aws_credentials", 61 | s3_bucket="udacity-dend", 62 | s3_key="song_data", 63 | 
region="us-west-2", 64 | data_format="JSON", 65 | execution_date=start_date 66 | ) 67 | 68 | load_songplays_table = LoadFactOperator( 69 | task_id='Load_songplays_fact_table', 70 | dag=dag, 71 | provide_context=True, 72 | aws_credentials_id="aws_credentials", 73 | redshift_conn_id='redshift', 74 | sql_query=SqlQueries.songplay_table_insert 75 | ) 76 | 77 | load_user_dimension_table_task_id='Load_user_dim_table' 78 | load_user_dimension_table = SubDagOperator( 79 | subdag=load_dimensional_tables_dag( 80 | parent_dag_name=dag_name, 81 | task_id=load_user_dimension_table_task_id, 82 | redshift_conn_id="redshift", 83 | aws_credentials_id="aws_credentials", 84 | start_date= datetime(2018, 5, 1), 85 | table="users", 86 | sql_query=SqlQueries.user_table_insert, 87 | ), 88 | task_id=load_user_dimension_table_task_id, 89 | dag=dag, 90 | ) 91 | 92 | load_song_dimension_table_task_id='Load_song_dim_table' 93 | load_song_dimension_table = SubDagOperator( 94 | subdag=load_dimensional_tables_dag( 95 | parent_dag_name=dag_name, 96 | task_id=load_song_dimension_table_task_id, 97 | redshift_conn_id="redshift", 98 | aws_credentials_id="aws_credentials", 99 | start_date= datetime(2018, 5, 1), 100 | table="users", 101 | sql_query=SqlQueries.song_table_insert, 102 | ), 103 | task_id=load_song_dimension_table_task_id, 104 | dag=dag, 105 | ) 106 | 107 | load_artist_dimension_table_task_id='Load_artist_dim_table' 108 | load_artist_dimension_table = SubDagOperator( 109 | subdag=load_dimensional_tables_dag( 110 | parent_dag_name=dag_name, 111 | task_id=load_artist_dimension_table_task_id, 112 | redshift_conn_id="redshift", 113 | aws_credentials_id="aws_credentials", 114 | table="users", 115 | start_date= datetime(2018, 5, 1), 116 | sql_query=SqlQueries.artist_table_insert, 117 | ), 118 | task_id=load_artist_dimension_table_task_id, 119 | dag=dag, 120 | ) 121 | 122 | load_time_dimension_table_task_id='Load_artist_dim_table' 123 | load_time_dimension_table = SubDagOperator( 124 | subdag=load_dimensional_tables_dag( 125 | parent_dag_name=dag_name, 126 | task_id=load_artist_dimension_table_task_id, 127 | redshift_conn_id="redshift", 128 | aws_credentials_id="aws_credentials", 129 | table="users", 130 | start_date= datetime(2018, 5, 1), 131 | sql_query=SqlQueries.artist_table_insert, 132 | ), 133 | task_id=load_artist_dimension_table_task_id, 134 | dag=dag, 135 | ) 136 | 137 | 138 | run_quality_checks = DataQualityOperator( 139 | task_id='Run_data_quality_checks', 140 | dag=dag, 141 | provide_context=True, 142 | aws_credentials_id="aws_credentials", 143 | redshift_conn_id='redshift', 144 | tables=["songplay", "users", "song", "artist", "time"] 145 | ) 146 | 147 | end_operator = DummyOperator(task_id='Stop_execution', dag=dag) 148 | 149 | # Setting tasks dependencies 150 | 151 | start_operator >> create_redshift_tables >> [stage_songs_to_redshift, stage_events_to_redshift] 152 | 153 | [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table 154 | 155 | load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table, 156 | load_time_dimension_table] >> run_quality_checks 157 | 158 | run_quality_checks >> end_operator 159 | 160 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/dags/sparkify_dend_dimesions_subdag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from 
airflow.operators.dummy_operator import DummyOperator 4 | from airflow.operators import LoadDimensionOperator 5 | from helpers import SqlQueries 6 | 7 | 8 | def load_dimensional_tables_dag( 9 | parent_dag_name, 10 | task_id, 11 | redshift_conn_id, 12 | aws_credentials_id, 13 | table, 14 | sql_query, 15 | *args, **kwargs): 16 | dag = DAG( 17 | f"{parent_dag_name}.{task_id}", 18 | **kwargs 19 | ) 20 | """ 21 | Returns a DAG inserts data into a dimensional redshift table from staging tables. 22 | """ 23 | 24 | load_dimension_table = LoadDimensionOperator( 25 | task_id=f"load_{table}_dim_table", 26 | dag=dag, 27 | table=table, 28 | redshift_conn_id=redshift_conn_id, 29 | aws_credentials_id=aws_credentials_id, 30 | sql_query=sql_query 31 | ) 32 | 33 | load_dimension_table 34 | 35 | return dag -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/airflow-details-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/airflow-details-dag.png -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/airflow-running-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/airflow-running-dag.png -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/dag-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/dag-code.png -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/dag.png -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, absolute_import, print_function 2 | 3 | from airflow.plugins_manager import AirflowPlugin 4 | 5 | import operators 6 | import helpers 7 | 8 | # Defining the plugin class 9 | class UdacityPlugin(AirflowPlugin): 10 | name = "udacity_plugin" 11 | operators = [ 12 | operators.StageToRedshiftOperator, 13 | operators.LoadFactOperator, 14 | operators.LoadDimensionOperator, 15 | operators.DataQualityOperator, 16 | operators.CreateTablesOperator 17 | ] 18 | helpers = [ 19 | helpers.SqlQueries 20 | ] 21 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.sql_queries import SqlQueries 2 | 3 | __all__ = [ 4 | 'SqlQueries', 5 
| ]
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/helpers/sql_queries.py:
--------------------------------------------------------------------------------
1 | class SqlQueries:
2 | 
3 |     songplay_table_insert = ("""
4 |         INSERT INTO songplays (
5 |             playid,
6 |             start_time,
7 |             userid,
8 |             level,
9 |             songid,
10 |             artistid,
11 |             sessionid,
12 |             location,
13 |             user_agent
14 |         )
15 | 
16 |         SELECT
17 |             md5(events.sessionid || events.start_time) playid,
18 |             events.start_time,
19 |             events.userid,
20 |             events.level,
21 |             songs.song_id,
22 |             songs.artist_id,
23 |             events.sessionid,
24 |             events.location,
25 |             events.useragent
26 |         FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, *
27 |             FROM staging_events
28 |             WHERE page='NextSong') events
29 |         LEFT JOIN staging_songs songs
30 |         ON events.song = songs.title
31 |             AND events.artist = songs.artist_name
32 |             AND events.length = songs.duration
33 |     """)
34 | 
35 |     user_table_insert = ("""
36 |         INSERT INTO users (
37 |             userid,
38 |             first_name,
39 |             last_name,
40 |             gender,
41 |             level
42 |         )
43 |         SELECT distinct userid, firstname, lastname, gender, level
44 |         FROM staging_events
45 |         WHERE page='NextSong'
46 |     """)
47 | 
48 |     song_table_insert = ("""
49 |         INSERT INTO songs (
50 |             songid,
51 |             title,
52 |             artistid,
53 |             year,
54 |             duration
55 |         )
56 |         SELECT distinct song_id, title, artist_id, year, duration
57 |         FROM staging_songs
58 |     """)
59 | 
60 |     artist_table_insert = ("""
61 |         INSERT INTO artists (
62 |             artistid,
63 |             name,
64 |             location,
65 |             lattitude,
66 |             longitude
67 |         )
68 |         SELECT distinct artist_id, artist_name, artist_location, artist_latitude, artist_longitude
69 |         FROM staging_songs
70 |     """)
71 | 
72 |     time_table_insert = ("""
73 |         INSERT INTO time (
74 |             start_time,
75 |             hour,
76 |             day,
77 |             week,
78 |             month,
79 |             year,
80 |             weekday
81 |         )
82 |         SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time),
83 |                extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time)
84 |         FROM songplays
85 |     """)
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/__init__.py:
--------------------------------------------------------------------------------
1 | from operators.stage_redshift import StageToRedshiftOperator
2 | from operators.load_fact import LoadFactOperator
3 | from operators.load_dimension import LoadDimensionOperator
4 | from operators.data_quality import DataQualityOperator
5 | from operators.create_tables import CreateTablesOperator
6 | 
7 | __all__ = [
8 |     'StageToRedshiftOperator',
9 |     'LoadFactOperator',
10 |     'LoadDimensionOperator',
11 |     'DataQualityOperator',
12 |     'CreateTablesOperator'
13 | ]
14 | 
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/create_tables.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.contrib.hooks.aws_hook import AwsHook
3 | from airflow.models import BaseOperator
4 | from airflow.utils.decorators import apply_defaults
5 | 
6 | 
7 | class CreateTablesOperator(BaseOperator):
8 |     ui_color = '#358140'
9 |     sql_statement_file='create_tables.sql'
10 | 
11 |     @apply_defaults
12 |     def __init__(self,
13 |                  redshift_conn_id="",
14 |                  *args, **kwargs):
15 | 
16 |         super(CreateTablesOperator, self).__init__(*args, **kwargs)
17 |         self.redshift_conn_id = redshift_conn_id
18 | 
19 |     def execute(self, context):
20 |         redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
21 | 
22 |         self.log.info("Creating Redshift tables")
23 | 
24 |         fd = open(CreateTablesOperator.sql_statement_file, 'r')
25 |         sql_file = fd.read()
26 |         fd.close()
27 | 
28 |         sql_commands = sql_file.split(';')
29 | 
30 |         for command in sql_commands:
31 |             if command.rstrip() != '':
32 |                 redshift.run(command)
33 | 
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/create_tables.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS public.artists (
2 |     artistid varchar(256) NOT NULL,
3 |     name varchar(256),
4 |     location varchar(256),
5 |     lattitude numeric(18,0),
6 |     longitude numeric(18,0)
7 | );
8 | 
9 | CREATE TABLE IF NOT EXISTS public.songplays (
10 |     playid varchar(32) NOT NULL,
11 |     start_time timestamp NOT NULL,
12 |     userid int4 NOT NULL,
13 |     "level" varchar(256),
14 |     songid varchar(256),
15 |     artistid varchar(256),
16 |     sessionid int4,
17 |     location varchar(256),
18 |     user_agent varchar(256),
19 |     CONSTRAINT songplays_pkey PRIMARY KEY (playid)
20 | );
21 | 
22 | CREATE TABLE IF NOT EXISTS public.songs (
23 |     songid varchar(256) NOT NULL,
24 |     title varchar(256),
25 |     artistid varchar(256),
26 |     "year" int4,
27 |     duration numeric(18,0),
28 |     CONSTRAINT songs_pkey PRIMARY KEY (songid)
29 | );
30 | 
31 | CREATE TABLE IF NOT EXISTS public.staging_events (
32 |     artist varchar(256),
33 |     auth varchar(256),
34 |     firstname varchar(256),
35 |     gender varchar(256),
36 |     iteminsession int4,
37 |     lastname varchar(256),
38 |     length numeric(18,0),
39 |     "level" varchar(256),
40 |     location varchar(256),
41 |     "method" varchar(256),
42 |     page varchar(256),
43 |     registration numeric(18,0),
44 |     sessionid int4,
45 |     song varchar(256),
46 |     status int4,
47 |     ts int8,
48 |     useragent varchar(256),
49 |     userid int4
50 | );
51 | 
52 | CREATE TABLE IF NOT EXISTS public.staging_songs (
53 |     num_songs int4,
54 |     artist_id varchar(256),
55 |     artist_name varchar(256),
56 |     artist_latitude numeric(18,0),
57 |     artist_longitude numeric(18,0),
58 |     artist_location varchar(256),
59 |     song_id varchar(256),
60 |     title varchar(256),
61 |     duration numeric(18,0),
62 |     "year" int4
63 | );
64 | 
65 | CREATE TABLE IF NOT EXISTS public."time" (
66 |     start_time timestamp NOT NULL,
67 |     "hour" int4,
68 |     "day" int4,
69 |     week int4,
70 |     "month" varchar(256),
71 |     "year" int4,
72 |     weekday varchar(256),
73 |     CONSTRAINT time_pkey PRIMARY KEY (start_time)
74 | ) ;
75 | 
76 | CREATE TABLE IF NOT EXISTS public.users (
77 |     userid int4 NOT NULL,
78 |     first_name varchar(256),
79 |     last_name varchar(256),
80 |     gender varchar(256),
81 |     "level" varchar(256),
82 |     CONSTRAINT users_pkey PRIMARY KEY (userid)
83 | );
84 | 
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/data_quality.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 | 
5 | 
6 | class DataQualityOperator(BaseOperator):
7 | 
8 |     ui_color = '#89DA59'
9 | 
10 |     @apply_defaults
11 |     def __init__(self,
12 |                  aws_credentials_id="",
13 |                  redshift_conn_id="",
14 |                  tables=[],
15 |                  *args, **kwargs):
16 |         super(DataQualityOperator, self).__init__(*args, **kwargs)
17 |         self.aws_credentials_id = aws_credentials_id
18 |         self.redshift_conn_id = redshift_conn_id
19 |         self.tables = tables
20 | 
21 |     def execute(self, context):
22 |         redshift_hook = PostgresHook(self.redshift_conn_id)
23 |         for table in self.tables:
24 |             records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table}")
25 |             if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
26 |                 self.log.error(f"Data quality check failed. {table} returned no results")
27 |                 raise ValueError(f"Data quality check failed. {table} returned no results")
28 |             self.log.info(f"Data quality on table {table} check passed with {records[0][0]} records")
29 | 
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/load_dimension.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 | 
5 | 
6 | class LoadDimensionOperator(BaseOperator):
7 | 
8 |     ui_color = '#80BD9E'
9 | 
10 |     @apply_defaults
11 |     def __init__(self,
12 |                  redshift_conn_id="",
13 |                  sql_query="",
14 |                  table="",
15 |                  truncate=False,
16 |                  *args, **kwargs):
17 |         super(LoadDimensionOperator, self).__init__(*args, **kwargs)
18 |         self.redshift_conn_id = redshift_conn_id
19 |         self.sql_query = sql_query
20 |         self.table = table
21 |         self.truncate = truncate
22 | 
23 |     def execute(self, context):
24 |         """
25 |         Insert data into dimensional tables from staging events and song data.
26 |         Uses a truncate-insert method to empty the target table prior to load when truncate is set.
27 | """ 28 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 29 | if self.truncate: 30 | redshift.run(f"TRUNCATE TABLE {self.table}") 31 | formatted_sql = self.sql_query.format(self.table) 32 | redshift.run(formatted_sql) 33 | self.log.info(f"Success: {self.task_id}") 34 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/load_fact.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.models import BaseOperator 3 | from airflow.utils.decorators import apply_defaults 4 | 5 | 6 | class LoadFactOperator(BaseOperator): 7 | 8 | ui_color = '#F98866' 9 | 10 | @apply_defaults 11 | def __init__(self, 12 | aws_credentials_id="", 13 | redshift_conn_id="", 14 | sql_query="", 15 | *args, **kwargs): 16 | 17 | super(LoadFactOperator, self).__init__(*args, **kwargs) 18 | self.aws_credentials_id = aws_credentials_id, 19 | self.redshift_conn_id = redshift_conn_id, 20 | self.sql_query = sql_query, 21 | 22 | def execute(self, context): 23 | redshift_hook = PostgresHook(self.redshift_conn_id) 24 | redshift_hook.run(str(self.sql_query)) 25 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/stage_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.hooks.postgres_hook import PostgresHook 2 | from airflow.contrib.hooks.aws_hook import AwsHook 3 | from airflow.models import BaseOperator 4 | from airflow.utils.decorators import apply_defaults 5 | 6 | 7 | class StageToRedshiftOperator(BaseOperator): 8 | ui_color = '#358140' 9 | template_fields = ("s3_key",) 10 | copy_sql = """ 11 | COPY {} 12 | FROM '{}' 13 | ACCESS_KEY_ID '{}' 14 | SECRET_ACCESS_KEY '{}' 15 | REGION '{}' 16 | TIMEFORMAT as 'epochmillisecs' 17 | TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL 18 | {} 'auto' 19 | {} 20 | """ 21 | 22 | @apply_defaults 23 | def __init__(self, 24 | redshift_conn_id="", 25 | aws_credentials_id="", 26 | table="", 27 | s3_bucket="", 28 | s3_key="", 29 | region="", 30 | file_format="JSON", 31 | *args, **kwargs): 32 | 33 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs) 34 | self.table = table 35 | self.redshift_conn_id = redshift_conn_id 36 | self.s3_bucket = s3_bucket 37 | self.s3_key = s3_key 38 | self.region= region 39 | self.file_format = file_format 40 | self.aws_credentials_id = aws_credentials_id 41 | self.execution_date = kwargs.get('execution_date') 42 | 43 | def execute(self, context): 44 | """ 45 | Copy data from S3 buckets to redshift cluster into staging tables. 
46 | - redshift_conn_id: redshift cluster connection 47 | - aws_credentials_id: AWS connection 48 | - table: redshift cluster table name 49 | - s3_bucket: S3 bucket name holding source data 50 | - s3_key: S3 key files of source data 51 | - file_format: source file format - options JSON, CSV 52 | """ 53 | aws_hook = AwsHook(self.aws_credentials_id) 54 | credentials = aws_hook.get_credentials() 55 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 56 | 57 | self.log.info("Clearing data from destination Redshift table") 58 | redshift.run("DELETE FROM {}".format(self.table)) 59 | 60 | self.log.info("Copying data from S3 to Redshift") 61 | 62 | s3_path = "s3://{}".format(self.s3_bucket) 63 | if self.execution_date: 64 | # Backfill a specific date 65 | year = self.execution_date.strftime("%Y") 66 | month = self.execution_date.strftime("%m") 67 | day = self.execution_date.strftime("%d") 68 | s3_path = '/'.join([s3_path, str(year), str(month), str(day)]) 69 | s3_path = s3_path + '/' + self.s3_key 70 | 71 | additional = "" 72 | if self.file_format == 'CSV': 73 | additional = " DELIMITER ',' IGNOREHEADER 1 " 74 | 75 | formatted_sql = StageToRedshiftOperator.copy_sql.format( 76 | self.table, 77 | s3_path, 78 | credentials.access_key, 79 | credentials.secret_key, 80 | self.region, 81 | self.file_format, 82 | additional 83 | ) 84 | redshift.run(formatted_sql) 85 | 86 | self.log.info(f"Success: Copying {self.table} from S3 to Redshift") 87 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/exercises/__init__.py -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/1_ex1_hello_world.py: -------------------------------------------------------------------------------- 1 | # Define a function that uses the python logger to log a message. 2 | # Then finish filling in the details of the DAG down below. 3 | # Once you’ve done that, run the "/opt/airflow/start.sh" command to start the web server. 4 | # Once the Airflow web server is ready, open the Airflow UI using the "Access Airflow" button. 5 | # Turn your DAG “On”, and then Run your DAG. 6 | 7 | import datetime 8 | import logging 9 | 10 | from airflow import DAG 11 | from airflow.operators.python_operator import PythonOperator 12 | 13 | 14 | def hello_world(): 15 | logging.info("Hello Flor!") 16 | 17 | 18 | dag = DAG( 19 | 'lesson1.solution1', 20 | start_date=datetime.datetime.now()) 21 | 22 | greet_task = PythonOperator( 23 | task_id="hello_world_task", 24 | python_callable=hello_world, 25 | dag=dag 26 | ) 27 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/1_ex2_scheduler.py: -------------------------------------------------------------------------------- 1 | # Instructions 2 | # Complete the TODOs in this DAG so that it runs once a day. 3 | # Once you’ve done that, open the Airflow UI using the "Access Airflow" button. 4 | # Go to the Airflow UI and turn the last exercise off, then turn this exercise on. 5 | # Wait a moment and refresh the UI to see Airflow automatically run your DAG.
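# Note: "@daily" (used in the DAG below) is one of Airflow's schedule presets and is
# equivalent to the cron expression "0 0 * * *"; schedule_interval also accepts a raw
# cron string, so schedule_interval="0 0 * * *" would give the same once-a-day cadence.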
6 | 7 | import datetime 8 | import logging 9 | 10 | from airflow import DAG 11 | from airflow.operators.python_operator import PythonOperator 12 | 13 | 14 | def hello_world(): 15 | logging.info("Hello World") 16 | 17 | dag = DAG( 18 | "lesson1.exercise2", 19 | start_date=datetime.datetime.now() - datetime.timedelta(days=2), 20 | schedule_interval="@daily") 21 | 22 | task = PythonOperator( 23 | task_id="hello_world_task", 24 | python_callable=hello_world, 25 | dag=dag) 26 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/1_ex3_dependencies.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | 4 | from airflow import DAG 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | 8 | def hello_world(): 9 | logging.info("Hello World") 10 | 11 | 12 | def addition(): 13 | logging.info(f"2 + 2 = {2+2}") 14 | 15 | 16 | def subtraction(): 17 | logging.info(f"6 -2 = {6-2}") 18 | 19 | 20 | def division(): 21 | logging.info(f"10 / 2 = {int(10/2)}") 22 | 23 | 24 | dag = DAG( 25 | "lesson1.solution3", 26 | schedule_interval='@hourly', 27 | start_date=datetime.datetime.now() - datetime.timedelta(days=1)) 28 | 29 | hello_world_task = PythonOperator( 30 | task_id="hello_world", 31 | python_callable=hello_world, 32 | dag=dag) 33 | 34 | addition_task = PythonOperator( 35 | task_id="addition", 36 | python_callable=addition, 37 | dag=dag) 38 | 39 | subtraction_task = PythonOperator( 40 | task_id="subtraction", 41 | python_callable=subtraction, 42 | dag=dag) 43 | 44 | division_task = PythonOperator( 45 | task_id="division", 46 | python_callable=division, 47 | dag=dag) 48 | 49 | # Configure Task Dependencies 50 | hello_world_task >> addition_task 51 | hello_world_task >> subtraction_task 52 | 53 | subtraction_task >> division_task 54 | addition_task >> division_task 55 | 56 | 57 | # -> addition_task 58 | # / \ 59 | # hello_world_task -> division_task 60 | # \ / 61 | # ->subtraction_task 62 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/1_ex4_connections.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | 4 | from airflow import DAG 5 | from airflow.models import Variable 6 | from airflow.operators.python_operator import PythonOperator 7 | from airflow.hooks.S3_hook import S3Hook 8 | 9 | # 10 | # We're going to create a connection and a variable. 11 | # 1. Open your browser to localhost:8080 and open Admin->Variables 12 | # 2. Click "Create" 13 | # 3. Set "Key" equal to "s3_bucket" and set "Val" equal to "udacity-dend" 14 | # 4. Set "Key" equal to "s3_prefix" and set "Val" equal to "data-pipelines" 15 | # 5. Click save 16 | # 6. Open Admin->Connections 17 | # 7. Click "Create" 18 | # 8. Set "Conn Id" to "aws_credentials", "Conn Type" to "Amazon Web Services" 19 | # 9. Set "Login" to your aws_access_key_id and "Password" to your aws_secret_key 20 | # 10. Click save 21 | # 11. 
Run the DAG 22 | 23 | def list_keys(): 24 | hook = S3Hook(aws_conn_id='aws_credentials') 25 | bucket = Variable.get('s3_bucket') 26 | prefix = Variable.get('s3_prefix') 27 | logging.info(f"Listing Keys from {bucket}/{prefix}") 28 | keys = hook.list_keys(bucket, prefix=prefix) 29 | for key in keys: 30 | logging.info(f"- s3://{bucket}/{key}") 31 | 32 | 33 | dag = DAG( 34 | 'lesson1.exercise4', 35 | start_date=datetime.datetime.now()) 36 | 37 | list_task = PythonOperator( 38 | task_id="list_keys", 39 | python_callable=list_keys, 40 | dag=dag 41 | ) 42 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/1_ex5_context.py: -------------------------------------------------------------------------------- 1 | # Instructions 2 | # Use the Airflow context in the pythonoperator to complete the TODOs below. Once you are done, run your DAG and check the logs to see the context in use. 3 | 4 | import datetime 5 | import logging 6 | 7 | from airflow import DAG 8 | from airflow.models import Variable 9 | from airflow.operators.python_operator import PythonOperator 10 | from airflow.hooks.S3_hook import S3Hook 11 | 12 | 13 | # TODO: Extract ds, run_id, prev_ds, and next_ds from the kwargs, and log them 14 | # NOTE: Look here for context variables passed in on kwargs: 15 | # https://airflow.apache.org/code.html#macros 16 | def log_details(*args, **kwargs): 17 | logging.info(f"Execution date is {kwargs['ds']}") 18 | logging.info(f"My run id is {kwargs['run_id']}") 19 | previous_ds = kwargs.get('prev_ds') 20 | if previous_ds: 21 | logging.info(f"My previous run was on {previous_ds}") 22 | next_ds = kwargs.get('next_ds') 23 | if next_ds: 24 | logging.info(f"My next run will be {next_ds}") 25 | 26 | dag = DAG( 27 | 'lesson1.solution5', 28 | schedule_interval="@daily", 29 | start_date=datetime.datetime.now() - datetime.timedelta(days=2) 30 | ) 31 | 32 | list_task = PythonOperator( 33 | task_id="log_details", 34 | python_callable=log_details, 35 | provide_context=True, 36 | dag=dag 37 | ) 38 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/1_ex6_redshift_queries.py: -------------------------------------------------------------------------------- 1 | # Instructions 2 | # Similar to what you saw in the demo, copy and populate the trips table. 3 | # Then, add another operator which creates a traffic analysis table from the trips table you created. 4 | # Note, in this class, we won’t be writing SQL -- all of the SQL statements we run against Redshift are predefined and included in your lesson. 
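# Note on the pattern below: SQL that is fully known up front (the CREATE TABLE and the
# traffic analysis) runs through PostgresOperator, while the COPY statement is wrapped in
# a PythonOperator so AWS credentials can be pulled from an AwsHook at runtime and
# formatted into the statement before it is executed.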
5 | 6 | import datetime 7 | import logging 8 | 9 | from airflow import DAG 10 | from airflow.contrib.hooks.aws_hook import AwsHook 11 | from airflow.hooks.postgres_hook import PostgresHook 12 | from airflow.operators.postgres_operator import PostgresOperator 13 | from airflow.operators.python_operator import PythonOperator 14 | 15 | import sql_statements 16 | 17 | 18 | def load_data_to_redshift(*args, **kwargs): 19 | aws_hook = AwsHook("aws_credentials") 20 | credentials = aws_hook.get_credentials() 21 | redshift_hook = PostgresHook("redshift") 22 | redshift_hook.run(sql_statements.COPY_ALL_TRIPS_SQL.format(credentials.access_key, credentials.secret_key)) 23 | 24 | 25 | dag = DAG( 26 | 'lesson1.solution6', 27 | start_date=datetime.datetime.now() 28 | ) 29 | 30 | create_table = PostgresOperator( 31 | task_id="create_table", 32 | dag=dag, 33 | postgres_conn_id="redshift", 34 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL 35 | ) 36 | 37 | copy_task = PythonOperator( 38 | task_id='load_from_s3_to_redshift', 39 | dag=dag, 40 | python_callable=load_data_to_redshift 41 | ) 42 | 43 | location_traffic_task = PostgresOperator( 44 | task_id="calculate_location_traffic", 45 | dag=dag, 46 | postgres_conn_id="redshift", 47 | sql=sql_statements.LOCATION_TRAFFIC_SQL 48 | ) 49 | 50 | create_table >> copy_task 51 | copy_task >> location_traffic_task 52 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/2_ex1_data_lineage.py: -------------------------------------------------------------------------------- 1 | #Instructions 2 | #1 - Run the DAG as it is first, and observe the Airflow UI 3 | #2 - Next, open up the DAG and add the copy and load tasks as directed in the TODOs 4 | #3 - Reload the Airflow UI and run the DAG once more, observing the Airflow UI 5 | 6 | import datetime 7 | import logging 8 | 9 | from airflow import DAG 10 | from airflow.contrib.hooks.aws_hook import AwsHook 11 | from airflow.hooks.postgres_hook import PostgresHook 12 | from airflow.operators.postgres_operator import PostgresOperator 13 | from airflow.operators.python_operator import PythonOperator 14 | 15 | import sql_statements 16 | 17 | 18 | def load_trip_data_to_redshift(*args, **kwargs): 19 | aws_hook = AwsHook("aws_credentials") 20 | credentials = aws_hook.get_credentials() 21 | redshift_hook = PostgresHook("redshift") 22 | sql_stmt = sql_statements.COPY_ALL_TRIPS_SQL.format( 23 | credentials.access_key, 24 | credentials.secret_key, 25 | ) 26 | redshift_hook.run(sql_stmt) 27 | 28 | 29 | def load_station_data_to_redshift(*args, **kwargs): 30 | aws_hook = AwsHook("aws_credentials") 31 | credentials = aws_hook.get_credentials() 32 | redshift_hook = PostgresHook("redshift") 33 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format( 34 | credentials.access_key, 35 | credentials.secret_key, 36 | ) 37 | redshift_hook.run(sql_stmt) 38 | 39 | 40 | dag = DAG( 41 | 'lesson2.exercise1', 42 | start_date=datetime.datetime.now() 43 | ) 44 | 45 | create_trips_table = PostgresOperator( 46 | task_id="create_trips_table", 47 | dag=dag, 48 | postgres_conn_id="redshift", 49 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL 50 | ) 51 | 52 | copy_trips_task = PythonOperator( 53 | task_id='load_trips_from_s3_to_redshift', 54 | dag=dag, 55 | python_callable=load_trip_data_to_redshift, 56 | ) 57 | 58 | create_stations_table = PostgresOperator( 59 | task_id="create_stations_table", 60 | dag=dag, 61 | postgres_conn_id="redshift", 62 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL, 63 | ) 64 
| 65 | copy_stations_task = PythonOperator( 66 | task_id='load_stations_from_s3_to_redshift', 67 | dag=dag, 68 | python_callable=load_station_data_to_redshift, 69 | ) 70 | 71 | create_trips_table >> copy_trips_task 72 | create_stations_table >> copy_stations_task 73 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/2_ex2_schedule_backfilling.py: -------------------------------------------------------------------------------- 1 | #Instructions 2 | #1 - Revisit our bikeshare traffic 3 | #2 - Update our DAG with 4 | # a - @monthly schedule_interval 5 | # b - max_active_runs of 1 6 | # c - start_date of 2018/01/01 7 | # d - end_date of 2018/02/01 8 | # Use Airflow’s backfill capabilities to analyze our trip data on a monthly basis over 2 historical runs 9 | 10 | import datetime 11 | import logging 12 | 13 | from airflow import DAG 14 | from airflow.contrib.hooks.aws_hook import AwsHook 15 | from airflow.hooks.postgres_hook import PostgresHook 16 | from airflow.operators.postgres_operator import PostgresOperator 17 | from airflow.operators.python_operator import PythonOperator 18 | 19 | import sql_statements 20 | 21 | 22 | def load_trip_data_to_redshift(*args, **kwargs): 23 | aws_hook = AwsHook("aws_credentials") 24 | credentials = aws_hook.get_credentials() 25 | redshift_hook = PostgresHook("redshift") 26 | sql_stmt = sql_statements.COPY_ALL_TRIPS_SQL.format( 27 | credentials.access_key, 28 | credentials.secret_key, 29 | ) 30 | redshift_hook.run(sql_stmt) 31 | 32 | 33 | def load_station_data_to_redshift(*args, **kwargs): 34 | aws_hook = AwsHook("aws_credentials") 35 | credentials = aws_hook.get_credentials() 36 | redshift_hook = PostgresHook("redshift") 37 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format( 38 | credentials.access_key, 39 | credentials.secret_key, 40 | ) 41 | redshift_hook.run(sql_stmt) 42 | 43 | 44 | dag = DAG( 45 | 'lesson2.exercise2', 46 | start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0), 47 | end_date=datetime.datetime(2018, 2, 1, 0, 0, 0, 0), 48 | schedule_interval='@monthly', 49 | max_active_runs=1 50 | ) 51 | 52 | create_trips_table = PostgresOperator( 53 | task_id="create_trips_table", 54 | dag=dag, 55 | postgres_conn_id="redshift", 56 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL 57 | ) 58 | 59 | copy_trips_task = PythonOperator( 60 | task_id='load_trips_from_s3_to_redshift', 61 | dag=dag, 62 | python_callable=load_trip_data_to_redshift, 63 | provide_context=True, 64 | ) 65 | 66 | create_stations_table = PostgresOperator( 67 | task_id="create_stations_table", 68 | dag=dag, 69 | postgres_conn_id="redshift", 70 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL, 71 | ) 72 | 73 | copy_stations_task = PythonOperator( 74 | task_id='load_stations_from_s3_to_redshift', 75 | dag=dag, 76 | python_callable=load_station_data_to_redshift, 77 | ) 78 | 79 | create_trips_table >> copy_trips_task 80 | create_stations_table >> copy_stations_task 81 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/2_ex3_data_partitioning.py: -------------------------------------------------------------------------------- 1 | #Instructions 2 | #1 - Modify the bikeshare DAG to load data month by month, instead of loading it all at once, every time. 3 | #2 - Use time partitioning to parallelize the execution of the DAG. 
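# How the monthly partition is picked below: with provide_context=True the callable
# receives the run's execution_date in kwargs, and its year/month are formatted into
# COPY_MONTHLY_TRIPS_SQL. A rough sketch with a made-up date:
#
#   execution_date = datetime.datetime(2018, 3, 1)
#   sql_statements.COPY_MONTHLY_TRIPS_SQL.format(
#       credentials.access_key, credentials.secret_key,
#       year=execution_date.year, month=execution_date.month)
#   # -> COPY ... FROM 's3://udacity-dend/data-pipelines/divvy/partitioned/2018/3/divvy_trips.csv'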
4 | 5 | import datetime 6 | import logging 7 | 8 | from airflow import DAG 9 | from airflow.contrib.hooks.aws_hook import AwsHook 10 | from airflow.hooks.postgres_hook import PostgresHook 11 | from airflow.operators.postgres_operator import PostgresOperator 12 | from airflow.operators.python_operator import PythonOperator 13 | 14 | import sql_statements 15 | 16 | 17 | def load_trip_data_to_redshift(*args, **kwargs): 18 | aws_hook = AwsHook("aws_credentials") 19 | credentials = aws_hook.get_credentials() 20 | redshift_hook = PostgresHook("redshift") 21 | execution_date = kwargs["execution_date"] 22 | sql_stmt = sql_statements.COPY_MONTHLY_TRIPS_SQL.format( 23 | credentials.access_key, 24 | credentials.secret_key, 25 | year=execution_date.year, 26 | month=execution_date.month 27 | ) 28 | redshift_hook.run(sql_stmt) 29 | 30 | 31 | def load_station_data_to_redshift(*args, **kwargs): 32 | aws_hook = AwsHook("aws_credentials") 33 | credentials = aws_hook.get_credentials() 34 | redshift_hook = PostgresHook("redshift") 35 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format( 36 | credentials.access_key, 37 | credentials.secret_key, 38 | ) 39 | redshift_hook.run(sql_stmt) 40 | 41 | 42 | dag = DAG( 43 | 'lesson2.exercise3', 44 | start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0), 45 | end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0), 46 | schedule_interval='@monthly', 47 | max_active_runs=1 48 | ) 49 | 50 | create_trips_table = PostgresOperator( 51 | task_id="create_trips_table", 52 | dag=dag, 53 | postgres_conn_id="redshift", 54 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL 55 | ) 56 | 57 | copy_trips_task = PythonOperator( 58 | task_id='load_trips_from_s3_to_redshift', 59 | dag=dag, 60 | python_callable=load_trip_data_to_redshift, 61 | provide_context=True, 62 | ) 63 | 64 | create_stations_table = PostgresOperator( 65 | task_id="create_stations_table", 66 | dag=dag, 67 | postgres_conn_id="redshift", 68 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL, 69 | ) 70 | 71 | copy_stations_task = PythonOperator( 72 | task_id='load_stations_from_s3_to_redshift', 73 | dag=dag, 74 | python_callable=load_station_data_to_redshift, 75 | ) 76 | 77 | create_trips_table >> copy_trips_task 78 | create_stations_table >> copy_stations_task 79 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/2_ex4_data_quality.py: -------------------------------------------------------------------------------- 1 | # Instructions 2 | # 1 - Set an SLA on our bikeshare traffic calculation operator 3 | # 2 - Add data verification step after the load step from s3 to redshift 4 | # 3 - Add data verification step after we calculate our output table 5 | 6 | import datetime 7 | import logging 8 | 9 | from airflow import DAG 10 | from airflow.contrib.hooks.aws_hook import AwsHook 11 | from airflow.hooks.postgres_hook import PostgresHook 12 | from airflow.operators.postgres_operator import PostgresOperator 13 | from airflow.operators.python_operator import PythonOperator 14 | 15 | import sql_statements 16 | 17 | 18 | def load_trip_data_to_redshift(*args, **kwargs): 19 | aws_hook = AwsHook("aws_credentials") 20 | credentials = aws_hook.get_credentials() 21 | redshift_hook = PostgresHook("redshift") 22 | execution_date = kwargs["execution_date"] 23 | sql_stmt = sql_statements.COPY_MONTHLY_TRIPS_SQL.format( 24 | credentials.access_key, 25 | credentials.secret_key, 26 | year=execution_date.year, 27 | month=execution_date.month 28 | ) 29 | redshift_hook.run(sql_stmt) 30 
| 31 | 32 | def load_station_data_to_redshift(*args, **kwargs): 33 | aws_hook = AwsHook("aws_credentials") 34 | credentials = aws_hook.get_credentials() 35 | redshift_hook = PostgresHook("redshift") 36 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format( 37 | credentials.access_key, 38 | credentials.secret_key, 39 | ) 40 | redshift_hook.run(sql_stmt) 41 | 42 | 43 | def check_greater_than_zero(*args, **kwargs): 44 | table = kwargs["params"]["table"] 45 | redshift_hook = PostgresHook("redshift") 46 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table}") 47 | if len(records) < 1 or len(records[0]) < 1: 48 | raise ValueError(f"Data quality check failed. {table} returned no results") 49 | num_records = records[0][0] 50 | if num_records < 1: 51 | raise ValueError(f"Data quality check failed. {table} contained 0 rows") 52 | logging.info(f"Data quality on table {table} check passed with {records[0][0]} records") 53 | 54 | 55 | dag = DAG( 56 | 'lesson2.exercise4', 57 | start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0), 58 | end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0), 59 | schedule_interval='@monthly', 60 | max_active_runs=1 61 | ) 62 | 63 | create_trips_table = PostgresOperator( 64 | task_id="create_trips_table", 65 | dag=dag, 66 | postgres_conn_id="redshift", 67 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL 68 | ) 69 | 70 | copy_trips_task = PythonOperator( 71 | task_id='load_trips_from_s3_to_redshift', 72 | dag=dag, 73 | python_callable=load_trip_data_to_redshift, 74 | provide_context=True, 75 | ) 76 | 77 | check_trips = PythonOperator( 78 | task_id='check_trips_data', 79 | dag=dag, 80 | python_callable=check_greater_than_zero, 81 | provide_context=True, 82 | params={ 83 | 'table': 'trips', 84 | } 85 | ) 86 | 87 | create_stations_table = PostgresOperator( 88 | task_id="create_stations_table", 89 | dag=dag, 90 | postgres_conn_id="redshift", 91 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL, 92 | ) 93 | 94 | copy_stations_task = PythonOperator( 95 | task_id='load_stations_from_s3_to_redshift', 96 | dag=dag, 97 | python_callable=load_station_data_to_redshift, 98 | ) 99 | 100 | check_stations = PythonOperator( 101 | task_id='check_stations_data', 102 | dag=dag, 103 | python_callable=check_greater_than_zero, 104 | provide_context=True, 105 | params={ 106 | 'table': 'stations', 107 | } 108 | ) 109 | 110 | create_trips_table >> copy_trips_task 111 | create_stations_table >> copy_stations_task 112 | copy_stations_task >> check_stations 113 | copy_trips_task >> check_trips 114 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/3_ex1_plugins.py: -------------------------------------------------------------------------------- 1 | # Instructions 2 | # In this exercise, we’ll consolidate repeated code into Operator Plugins 3 | # 1 - Move the data quality check logic into a custom operator 4 | # 2 - Replace the data quality check PythonOperators with our new custom operator 5 | # 3 - Consolidate both the S3 to RedShift functions into a custom operator 6 | # 4 - Replace the S3 to RedShift PythonOperators with our new custom operator 7 | # 5 - Execute the DAG 8 | 9 | import datetime 10 | import logging 11 | 12 | from airflow import DAG 13 | from airflow.contrib.hooks.aws_hook import AwsHook 14 | from airflow.hooks.postgres_hook import PostgresHook 15 | 16 | from airflow.operators import ( 17 | HasRowsOperator, 18 | PostgresOperator, 19 | PythonOperator, 20 | S3ToRedshiftOperator 21 | ) 22 | 23 | import 
sql_statements 24 | 25 | 26 | dag = DAG( 27 | "lesson3.exercise1", 28 | start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0), 29 | end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0), 30 | schedule_interval="@monthly", 31 | max_active_runs=1 32 | ) 33 | 34 | create_trips_table = PostgresOperator( 35 | task_id="create_trips_table", 36 | dag=dag, 37 | postgres_conn_id="redshift", 38 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL 39 | ) 40 | 41 | copy_trips_task = S3ToRedshiftOperator( 42 | task_id="load_trips_from_s3_to_redshift", 43 | dag=dag, 44 | table="trips", 45 | redshift_conn_id="redshift", 46 | aws_credentials_id="aws_credentials", 47 | s3_bucket="udac-data-pipelines", 48 | s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv" 49 | ) 50 | 51 | check_trips = HasRowsOperator( 52 | task_id='check_trips_data', 53 | dag=dag, 54 | redshift_conn_id="redshift", 55 | table='trips', 56 | provide_context=True 57 | ) 58 | 59 | create_stations_table = PostgresOperator( 60 | task_id="create_stations_table", 61 | dag=dag, 62 | postgres_conn_id="redshift", 63 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL, 64 | ) 65 | 66 | copy_stations_task = S3ToRedshiftOperator( 67 | task_id="load_stations_from_s3_to_redshift", 68 | dag=dag, 69 | redshift_conn_id="redshift", 70 | aws_credentials_id="aws_credentials", 71 | s3_bucket="udac-data-pipelines", 72 | s3_key="divvy/unpartitioned/divvy_stations_2017.csv", 73 | table="stations" 74 | ) 75 | 76 | check_stations = HasRowsOperator( 77 | task_id='check_stations_data', 78 | dag=dag, 79 | redshift_conn_id="redshift", 80 | table='stations', 81 | provide_context=True 82 | ) 83 | 84 | create_trips_table >> copy_trips_task 85 | create_stations_table >> copy_stations_task 86 | copy_stations_task >> check_stations 87 | copy_trips_task >> check_trips 88 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/3_ex2_refactoring.py: -------------------------------------------------------------------------------- 1 | # Instructions 2 | # In this exercise, we’ll refactor a DAG with a single overloaded task into a DAG with several tasks with well-defined boundaries 3 | # 1 - Read through the DAG and identify points in the DAG that could be split apart 4 | # 2 - Split the DAG into multiple PythonOperators 5 | # 3 - Run the DAG 6 | 7 | import datetime 8 | import logging 9 | 10 | from airflow import DAG 11 | from airflow.hooks.postgres_hook import PostgresHook 12 | 13 | from airflow.operators.postgres_operator import PostgresOperator 14 | from airflow.operators.python_operator import PythonOperator 15 | 16 | 17 | def log_oldest(): 18 | redshift_hook = PostgresHook("redshift") 19 | records = redshift_hook.get_records(""" 20 | SELECT birthyear FROM older_riders ORDER BY birthyear ASC LIMIT 1 21 | """) 22 | if len(records) > 0 and len(records[0]) > 0: 23 | logging.info(f"Oldest rider was born in {records[0][0]}") 24 | 25 | 26 | def log_youngest(): 27 | redshift_hook = PostgresHook("redshift") 28 | records = redshift_hook.get_records(""" 29 | SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1 30 | """) 31 | if len(records) > 0 and len(records[0]) > 0: 32 | logging.info(f"Youngest rider was born in {records[0][0]}") 33 | 34 | 35 | dag = DAG( 36 | "lesson3.exercise2", 37 | start_date=datetime.datetime.utcnow() 38 | ) 39 | 40 | # Find all trips taken by the oldest riders (born in 1945 or earlier) 41 | create_oldest_task = PostgresOperator( 42 | task_id="create_oldest", 43 | dag=dag, 44 |
sql=""" 45 | BEGIN; 46 | DROP TABLE IF EXISTS older_riders; 47 | CREATE TABLE older_riders AS ( 48 | SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945 49 | ); 50 | COMMIT; 51 | """, 52 | postgres_conn_id="redshift" 53 | ) 54 | 55 | log_oldest_task = PythonOperator( 56 | task_id="log_oldest", 57 | dag=dag, 58 | python_callable=log_oldest 59 | ) 60 | 61 | create_youngest_task = PostgresOperator( 62 | task_id="create_youngest", 63 | dag=dag, 64 | sql=""" 65 | BEGIN; 66 | DROP TABLE IF EXISTS younger_riders; 67 | CREATE TABLE younger_riders AS ( 68 | SELECT * FROM trips WHERE birthyear > 2000 69 | ); 70 | COMMIT; 71 | """, 72 | postgres_conn_id="redshift" 73 | ) 74 | 75 | log_youngest_task = PythonOperator( 76 | task_id="log_youngest", 77 | dag=dag, 78 | python_callable=log_youngest 79 | ) 80 | 81 | create_lifetime_task = PostgresOperator( 82 | task_id="create_lifetime", 83 | dag=dag, 84 | sql=""" 85 | BEGIN; 86 | DROP TABLE IF EXISTS lifetime_rides; 87 | CREATE TABLE lifetime_rides AS ( 88 | SELECT bikeid, COUNT(bikeid) 89 | FROM trips 90 | GROUP BY bikeid 91 | ); 92 | COMMIT; 93 | """, 94 | postgres_conn_id="redshift" 95 | ) 96 | 97 | create_city_stations_task = PostgresOperator( 98 | task_id="create_city_stations", 99 | dag=dag, 100 | sql=""" 101 | BEGIN; 102 | DROP TABLE IF EXISTS city_station_counts; 103 | CREATE TABLE city_station_counts AS( 104 | SELECT city, COUNT(city) 105 | FROM stations 106 | GROUP BY city 107 | ); 108 | COMMIT; 109 | """, 110 | postgres_conn_id="redshift" 111 | ) 112 | 113 | create_oldest_task >> log_oldest_task 114 | create_youngest_task >> log_youngest_task 115 | 116 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/3_ex3_subdags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/exercises/dags/3_ex3_subdags/__init__.py -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/3_ex3_subdags/dag.py: -------------------------------------------------------------------------------- 1 | # Instructions 2 | # In this exercise, we’ll place our S3 to RedShift Copy operations into a SubDag. 
3 | # 1 - Consolidate HasRowsOperator into the SubDag 4 | # 2 - Reorder the tasks to take advantage of the SubDag Operators 5 | 6 | import datetime 7 | 8 | from airflow import DAG 9 | from airflow.operators.postgres_operator import PostgresOperator 10 | from airflow.operators.subdag_operator import SubDagOperator 11 | from airflow.operators.udacity_plugin import HasRowsOperator 12 | 13 | from lesson3.exercise3.subdag import get_s3_to_redshift_dag 14 | import sql_statements 15 | 16 | 17 | start_date = datetime.datetime.utcnow() 18 | 19 | dag = DAG( 20 | "lesson3.exercise3", 21 | start_date=start_date, 22 | ) 23 | 24 | trips_task_id = "trips_subdag" 25 | trips_subdag_task = SubDagOperator( 26 | subdag=get_s3_to_redshift_dag( 27 | "lesson3.exercise3", 28 | trips_task_id, 29 | "redshift", 30 | "aws_credentials", 31 | "trips", 32 | sql_statements.CREATE_TRIPS_TABLE_SQL, 33 | s3_bucket="udac-data-pipelines", 34 | s3_key="divvy/unpartitioned/divvy_trips_2018.csv", 35 | start_date=start_date, 36 | ), 37 | task_id=trips_task_id, 38 | dag=dag, 39 | ) 40 | 41 | stations_task_id = "stations_subdag" 42 | stations_subdag_task = SubDagOperator( 43 | subdag=get_s3_to_redshift_dag( 44 | "lesson3.exercise3", 45 | stations_task_id, 46 | "redshift", 47 | "aws_credentials", 48 | "stations", 49 | sql_statements.CREATE_STATIONS_TABLE_SQL, 50 | s3_bucket="udac-data-pipelines", 51 | s3_key="divvy/unpartitioned/divvy_stations_2017.csv", 52 | start_date=start_date, 53 | ), 54 | task_id=stations_task_id, 55 | dag=dag, 56 | ) 57 | 58 | # 59 | # TODO: Consolidate check_trips and check_stations into a single check in the subdag 60 | # as we did with the create and copy in the demo 61 | # 62 | check_trips = HasRowsOperator( 63 | task_id="check_trips_data", 64 | dag=dag, 65 | redshift_conn_id="redshift", 66 | table="trips" 67 | ) 68 | 69 | check_stations = HasRowsOperator( 70 | task_id="check_stations_data", 71 | dag=dag, 72 | redshift_conn_id="redshift", 73 | table="stations" 74 | ) 75 | 76 | location_traffic_task = PostgresOperator( 77 | task_id="calculate_location_traffic", 78 | dag=dag, 79 | postgres_conn_id="redshift", 80 | sql=sql_statements.LOCATION_TRAFFIC_SQL 81 | ) 82 | 83 | # 84 | # TODO: Reorder the Graph once you have moved the checks 85 | # 86 | trips_subdag_task >> check_trips 87 | stations_subdag_task >> check_stations 88 | check_stations >> location_traffic_task 89 | check_trips >> location_traffic_task 90 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/3_ex3_subdags/subdag.py: -------------------------------------------------------------------------------- 1 | # Instructions 2 | # In this exercise, we’ll place our S3 to RedShift Copy operations into a SubDag. 3 | # 1 - Consolidate HasRowsOperator into the SubDag 4 | # 2 - Reorder the tasks to take advantage of the SubDag Operators 5 | 6 | import datetime 7 | 8 | from airflow import DAG 9 | from airflow.operators.postgres_operator import PostgresOperator 10 | from airflow.operators.udacity_plugin import HasRowsOperator 11 | from airflow.operators.udacity_plugin import S3ToRedshiftOperator 12 | 13 | import sql_statements 14 | 15 | 16 | # Returns a DAG which creates a table if it does not exist, and then proceeds 17 | # to load data into that table from S3. When the load is complete, a data 18 | # quality check is performed to assert that at least one row of data is 19 | # present.
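# Note: Airflow expects a subDAG's dag_id to be "<parent_dag_id>.<subdag_task_id>" of the
# SubDagOperator that wraps it, which is why the factory below builds the child DAG as
# f"{parent_dag_name}.{task_id}" and dag.py passes the same task_id (e.g. "trips_subdag")
# to both this function and the corresponding SubDagOperator.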
20 | def get_s3_to_redshift_dag( 21 | parent_dag_name, 22 | task_id, 23 | redshift_conn_id, 24 | aws_credentials_id, 25 | table, 26 | create_sql_stmt, 27 | s3_bucket, 28 | s3_key, 29 | *args, **kwargs): 30 | dag = DAG( 31 | f"{parent_dag_name}.{task_id}", 32 | **kwargs 33 | ) 34 | 35 | create_task = PostgresOperator( 36 | task_id=f"create_{table}_table", 37 | dag=dag, 38 | postgres_conn_id=redshift_conn_id, 39 | sql=create_sql_stmt 40 | ) 41 | 42 | copy_task = S3ToRedshiftOperator( 43 | task_id=f"load_{table}_from_s3_to_redshift", 44 | dag=dag, 45 | table=table, 46 | redshift_conn_id=redshift_conn_id, 47 | aws_credentials_id=aws_credentials_id, 48 | s3_bucket=s3_bucket, 49 | s3_key=s3_key 50 | ) 51 | 52 | # 53 | # TODO: Move the HasRowsOperator task here from the DAG 54 | # 55 | 56 | create_task >> copy_task 57 | # 58 | # TODO: Use DAG ordering to place the check task 59 | # 60 | 61 | return dag 62 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/3_ex4_full_dag.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from airflow import DAG 4 | 5 | from airflow.operators import ( 6 | FactsCalculatorOperator, 7 | HasRowsOperator, 8 | S3ToRedshiftOperator 9 | ) 10 | 11 | # This DAG performs the following functions 12 | # 1. Loads Trip data from S3 to RedShift 13 | # 2. Performs a data quality check on the Trips table in RedShift 14 | # 3. Uses the FactsCalculatorOperator to create a Facts table in Redshift 15 | # a. **NOTE**: to complete this step you must complete the FactsCalcuatorOperator 16 | # skeleton defined in plugins/operators/facts_calculator.py 17 | # 18 | dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow()) 19 | 20 | copy_trips_task = S3ToRedshiftOperator( 21 | task_id="load_trips_from_s3_to_redshift", 22 | dag=dag, 23 | table="trips", 24 | redshift_conn_id="redshift", 25 | aws_credentials_id="aws_credentials", 26 | s3_bucket="udacity-dend", 27 | s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv" 28 | ) 29 | 30 | check_trips = HasRowsOperator( 31 | task_id="trips_has_rows", 32 | dag=dag, 33 | redshift_conn_id="redshift", 34 | table="trips", 35 | provide_context=True 36 | ) 37 | 38 | calculate_facts = FactsCalculatorOperator( 39 | task_id="calculate_facts", 40 | dag=dag, 41 | postgres_conn_id="redshift", 42 | origin_table="trips", 43 | destination_table="trips_facts", 44 | fact_column="tripduration", 45 | groupbycolumn="bikeid" 46 | ) 47 | 48 | copy_trips_task >> check_trips 49 | check_trips >> calculate_facts 50 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/exercises/dags/__init__.py -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/dags/sql_statements.py: -------------------------------------------------------------------------------- 1 | CREATE_TRIPS_TABLE_SQL = """ 2 | CREATE TABLE IF NOT EXISTS trips ( 3 | trip_id INTEGER NOT NULL, 4 | start_time TIMESTAMP NOT NULL, 5 | end_time TIMESTAMP NOT NULL, 6 | bikeid INTEGER NOT NULL, 7 | tripduration DECIMAL(16,2) NOT NULL, 8 | from_station_id INTEGER NOT NULL, 9 | from_station_name 
VARCHAR(100) NOT NULL, 10 | to_station_id INTEGER NOT NULL, 11 | to_station_name VARCHAR(100) NOT NULL, 12 | usertype VARCHAR(20), 13 | gender VARCHAR(6), 14 | birthyear INTEGER, 15 | PRIMARY KEY(trip_id)) 16 | DISTSTYLE ALL; 17 | """ 18 | 19 | CREATE_STATIONS_TABLE_SQL = """ 20 | CREATE TABLE IF NOT EXISTS stations ( 21 | id INTEGER NOT NULL, 22 | name VARCHAR(250) NOT NULL, 23 | city VARCHAR(100) NOT NULL, 24 | latitude DECIMAL(9, 6) NOT NULL, 25 | longitude DECIMAL(9, 6) NOT NULL, 26 | dpcapacity INTEGER NOT NULL, 27 | online_date TIMESTAMP NOT NULL, 28 | PRIMARY KEY(id)) 29 | DISTSTYLE ALL; 30 | """ 31 | 32 | COPY_SQL = """ 33 | COPY {} 34 | FROM '{}' 35 | ACCESS_KEY_ID '{{}}' 36 | SECRET_ACCESS_KEY '{{}}' 37 | IGNOREHEADER 1 38 | DELIMITER ',' 39 | """ 40 | 41 | COPY_MONTHLY_TRIPS_SQL = COPY_SQL.format( 42 | "trips", 43 | "s3://udacity-dend/data-pipelines/divvy/partitioned/{year}/{month}/divvy_trips.csv" 44 | ) 45 | 46 | COPY_ALL_TRIPS_SQL = COPY_SQL.format( 47 | "trips", 48 | "s3://udacity-dend/data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv" 49 | ) 50 | 51 | COPY_STATIONS_SQL = COPY_SQL.format( 52 | "stations", 53 | "s3://udacity-dend/data-pipelines/divvy/unpartitioned/divvy_stations_2017.csv" 54 | ) 55 | 56 | LOCATION_TRAFFIC_SQL = """ 57 | BEGIN; 58 | DROP TABLE IF EXISTS station_traffic; 59 | CREATE TABLE station_traffic AS 60 | SELECT 61 | DISTINCT(t.from_station_id) AS station_id, 62 | t.from_station_name AS station_name, 63 | num_departures, 64 | num_arrivals 65 | FROM trips t 66 | JOIN ( 67 | SELECT 68 | from_station_id, 69 | COUNT(from_station_id) AS num_departures 70 | FROM trips 71 | GROUP BY from_station_id 72 | ) AS fs ON t.from_station_id = fs.from_station_id 73 | JOIN ( 74 | SELECT 75 | to_station_id, 76 | COUNT(to_station_id) AS num_arrivals 77 | FROM trips 78 | GROUP BY to_station_id 79 | ) AS ts ON t.from_station_id = ts.to_station_id 80 | """ 81 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from airflow.plugins_manager import AirflowPlugin 2 | 3 | import operators 4 | 5 | 6 | # Defining the plugin class 7 | class UdacityPlugin(AirflowPlugin): 8 | name = "udacity_plugin" 9 | operators = [ 10 | operators.FactsCalculatorOperator, 11 | operators.HasRowsOperator, 12 | operators.S3ToRedshiftOperator 13 | ] 14 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/plugins/operators/__init__.py: -------------------------------------------------------------------------------- 1 | from operators.facts_calculator import FactsCalculatorOperator 2 | from operators.has_rows import HasRowsOperator 3 | from operators.s3_to_redshift import S3ToRedshiftOperator 4 | 5 | __all__ = [ 6 | 'FactsCalculatorOperator', 7 | 'HasRowsOperator', 8 | 'S3ToRedshiftOperator' 9 | ] 10 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/plugins/operators/facts_calculator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow.hooks.postgres_hook import PostgresHook 4 | from airflow.models import BaseOperator 5 | from airflow.utils.decorators import apply_defaults 6 | 7 | 8 | class FactsCalculatorOperator(BaseOperator): 9 | facts_sql_template = """ 10 | DROP TABLE IF EXISTS {destination_table}; 11 | CREATE 
TABLE {destination_table} AS 12 | SELECT 13 | {groupby_column}, 14 | MAX({fact_column}) AS max_{fact_column}, 15 | MIN({fact_column}) AS min_{fact_column}, 16 | AVG({fact_column}) AS average_{fact_column} 17 | FROM {origin_table} 18 | GROUP BY {groupby_column}; 19 | """ 20 | 21 | @apply_defaults 22 | def __init__(self, 23 | redshift_conn_id="", 24 | origin_table="", 25 | destination_table="", 26 | fact_column="", 27 | groupby_column="", 28 | *args, **kwargs): 29 | 30 | super(FactsCalculatorOperator, self).__init__(*args, **kwargs) 31 | self.redshift_conn_id = redshift_conn_id 32 | self.origin_table = origin_table 33 | self.destination_table = destination_table 34 | self.fact_column = fact_column 35 | self.groupby_column = groupby_column 36 | 37 | def execute(self, context): 38 | redshift_hook = PostgresHook(self.redshift_conn_id) 39 | formatted_sql = FactsCalculatorOperator.facts_sql_template.format( 40 | origin_table=self.origin_table, 41 | destination_table=self.destination_table, 42 | groupby_column=self.groupby_column, 43 | fact_column=self.fact_column 44 | ) 45 | redshift_hook.run(formatted_sql) 46 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/plugins/operators/has_rows.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow.hooks.postgres_hook import PostgresHook 4 | from airflow.models import BaseOperator 5 | from airflow.utils.decorators import apply_defaults 6 | 7 | 8 | class HasRowsOperator(BaseOperator): 9 | 10 | @apply_defaults 11 | def __init__(self, 12 | redshift_conn_id="", 13 | table="", 14 | *args, **kwargs): 15 | 16 | super(HasRowsOperator, self).__init__(*args, **kwargs) 17 | self.table = table 18 | self.redshift_conn_id = redshift_conn_id 19 | 20 | def execute(self, context): 21 | redshift_hook = PostgresHook(self.redshift_conn_id) 22 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {self.table}") 23 | if len(records) < 1 or len(records[0]) < 1: 24 | raise ValueError(f"Data quality check failed. {self.table} returned no results") 25 | num_records = records[0][0] 26 | if num_records < 1: 27 | raise ValueError(f"Data quality check failed. 
{self.table} contained 0 rows") 28 | logging.info(f"Data quality on table {self.table} check passed with {records[0][0]} records") 29 | 30 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/exercises/plugins/operators/s3_to_redshift.py: -------------------------------------------------------------------------------- 1 | from airflow.contrib.hooks.aws_hook import AwsHook 2 | from airflow.hooks.postgres_hook import PostgresHook 3 | from airflow.models import BaseOperator 4 | from airflow.utils.decorators import apply_defaults 5 | 6 | 7 | class S3ToRedshiftOperator(BaseOperator): 8 | # We are telling airflow that we want this param to be templetable 9 | template_fields = ("s3_key",) 10 | copy_sql = """ 11 | COPY {} 12 | FROM '{}' 13 | ACCESS_KEY_ID '{}' 14 | SECRET_ACCESS_KEY '{}' 15 | IGNOREHEADER {} 16 | DELIMITER '{}' 17 | """ 18 | 19 | 20 | @apply_defaults 21 | def __init__(self, 22 | redshift_conn_id="", 23 | aws_credentials_id="", 24 | table="", 25 | s3_bucket="", 26 | s3_key="", # renders this value from context variables (reason: see line 8) 27 | delimiter=",", 28 | ignore_headers=1, 29 | *args, **kwargs): 30 | 31 | super(S3ToRedshiftOperator, self).__init__(*args, **kwargs) 32 | self.table = table 33 | self.redshift_conn_id = redshift_conn_id 34 | self.s3_bucket = s3_bucket 35 | self.s3_key = s3_key 36 | self.delimiter = delimiter 37 | self.ignore_headers = ignore_headers 38 | self.aws_credentials_id = aws_credentials_id 39 | 40 | def execute(self, context): 41 | aws_hook = AwsHook(self.aws_credentials_id) 42 | credentials = aws_hook.get_credentials() 43 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) 44 | 45 | self.log.info("Clearing data from destination Redshift table") 46 | redshift.run("DELETE FROM {}".format(self.table)) 47 | 48 | self.log.info("Copying data from S3 to Redshift") 49 | rendered_key = self.s3_key.format(**context) 50 | s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) 51 | formatted_sql = S3ToRedshiftOperator.copy_sql.format( 52 | self.table, 53 | s3_path, 54 | credentials.access_key, 55 | credentials.secret_key, 56 | self.ignore_headers, 57 | self.delimiter 58 | ) 59 | redshift.run(formatted_sql) 60 | -------------------------------------------------------------------------------- /4_dend_airflow_data_pipelines/glossary-data-pipelines-in-airflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/glossary-data-pipelines-in-airflow.pdf -------------------------------------------------------------------------------- /DEND.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 
5 | }, 6 | { 7 | "path": "3_dend_spark_data_lakes/P4_Data_Lake" 8 | } 9 | ], 10 | "settings": { 11 | "jira-plugin.workingProject": "", 12 | "python.condaPath": "/home/f.silvestre/anaconda3/envs/dend/bin/python" 13 | } 14 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Florencia Silvestre 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-engineering-nanodegree 2 | Projects done in the [Data Engineering Nanodegree by Udacity.com](https://www.udacity.com/course/data-engineer-nanodegree--nd027) 3 | 4 | ![Icon](data-engineering.jpg) 5 | 6 | ## Course 1: Data Modeling 7 | ### Introduction to Data Modeling 8 | ➔ Understand the purpose of data modeling 9 | 10 | ➔ Identify the strengths and weaknesses of different types of databases and data storage techniques 11 | 12 | ➔ Create a table in Postgres and Apache Cassandra 13 | 14 | ### Relational Data Models 15 | ➔ Understand when to use a relational database 16 | 17 | ➔ Understand the difference between OLAP and OLTP databases 18 | 19 | ➔ Create normalized data tables 20 | 21 | ➔ Implement denormalized schemas (e.g. STAR, Snowflake) 22 | 23 | ### NoSQL Data Models 24 | ➔ Understand when to use NoSQL databases and how they differ from relational databases 25 | 26 | ➔ Select the appropriate primary key and clustering columns for a given use case 27 | 28 | ➔ Create a NoSQL database in Apache Cassandra 29 | 30 | 31 | #### Project: Data Modeling with Postgres and Apache Cassandra 32 | 33 | ## Course 2: Cloud Data Warehouses 34 | ### Introduction to the Data Warehouses 35 | ➔ Understand Data Warehousing architecture 36 | 37 | ➔ Run an ETL process to denormalize a database (3NF to Star) 38 | 39 | ➔ Create an OLAP cube from facts and dimensions 40 | 41 | ➔ Compare columnar vs. 
row oriented approaches 42 | 43 | ### Introduction to the Cloud with AWS 44 | ➔ Understand cloud computing 45 | 46 | ➔ Create an AWS account and understand their services 47 | 48 | ➔ Set up Amazon S3, IAM, VPC, EC2, RDS PostgreSQL 49 | 50 | ### Implementing Data Warehouses on AWS 51 | ➔ Identify components of the Redshift architecture 52 | 53 | ➔ Run ETL process to extract data from S3 into Redshift 54 | 55 | ➔ Set up AWS infrastructure using Infrastructure as Code (IaC) 56 | 57 | ➔ Design an optimized table by selecting the appropriate distribution style and sorting key 58 | 59 | #### Project 2: Data Infrastructure on the Cloud 60 | 61 | ## Course 3: Data Lakes with Spark 62 | ### The Power of Spark 63 | ➔ Understand the big data ecosystem 64 | 65 | ➔ Understand when to use Spark and when not to use it 66 | 67 | ### Data Wrangling with Spark 68 | ➔ Manipulate data with SparkSQL and Spark Dataframes 69 | 70 | ➔ Use Spark for ETL purposes 71 | 72 | ### Debugging and Optimization 73 | ➔ Troubleshoot common errors and optimize their code using the Spark WebUI 74 | 75 | ### Introduction to Data Lakes 76 | ➔ Understand the purpose and evolution of data lakes 77 | 78 | ➔ Implement data lakes on Amazon S3, EMR, Athena, and Amazon Glue 79 | 80 | ➔ Use Spark to run ELT processes and analytics on data of diverse sources, structures, and vintages 81 | 82 | ➔ Understand the components and issues of data lakes 83 | 84 | #### Project 3: Big Data with Spark 85 | 86 | ## Course 4: Automate Data Pipelines 87 | ### Data Pipelines 88 | ➔ Create data pipelines with Apache Airflow 89 | 90 | ➔ Set up task dependencies 91 | 92 | ➔ Create data connections using hooks 93 | 94 | ### Data Quality 95 | ➔ Track data lineage 96 | 97 | ➔ Set up data pipeline schedules 98 | 99 | ➔ Partition data to optimize pipelines 100 | 101 | ➔ Write tests to ensure data quality 102 | 103 | ➔ Backfill data 104 | 105 | ### Production Data Pipelines 106 | ➔ Build reusable and maintainable pipelines 107 | 108 | ➔ Build your own Apache Airflow plugins 109 | 110 | ➔ Implement subDAGs 111 | 112 | ➔ Set up task boundaries 113 | 114 | ➔ Monitor data pipelines 115 | 116 | #### Project: Data Pipelines with Airflow 117 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-dinky -------------------------------------------------------------------------------- /cheatsheets/Data-Science-Books-for-2018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/cheatsheets/Data-Science-Books-for-2018.pdf -------------------------------------------------------------------------------- /cheatsheets/Pandas DataFrame Notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/cheatsheets/Pandas DataFrame Notes.pdf -------------------------------------------------------------------------------- /cheatsheets/Pandas_Cheat_Sheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/cheatsheets/Pandas_Cheat_Sheet.pdf -------------------------------------------------------------------------------- 
/cheatsheets/linux cheatsheet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/cheatsheets/linux cheatsheet.jpg -------------------------------------------------------------------------------- /data-engineering.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/data-engineering.jpg -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: dend 2 | channels: 3 | - defaults 4 | prefix: /home/f.silvestre/anaconda3/envs/dend 5 | 6 | --------------------------------------------------------------------------------