├── .gitignore
├── .idea
├── Data-engineering-nanodegree.iml
├── markdown-navigator.xml
├── markdown-navigator
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
├── vcs.xml
└── workspace.xml
├── .vscode
└── settings.json
├── 1_dend_data_modeling
├── Data Modeling.pdf
├── P1_Postgres_Data_Modeling_and_ETL
│ ├── README.md
│ ├── create_tables.py
│ ├── data
│ │ ├── log_data
│ │ │ └── 2018
│ │ │ │ └── 11
│ │ │ │ ├── 2018-11-01-events.json
│ │ │ │ ├── 2018-11-02-events.json
│ │ │ │ ├── 2018-11-03-events.json
│ │ │ │ ├── 2018-11-04-events.json
│ │ │ │ ├── 2018-11-05-events.json
│ │ │ │ ├── 2018-11-06-events.json
│ │ │ │ ├── 2018-11-07-events.json
│ │ │ │ ├── 2018-11-08-events.json
│ │ │ │ ├── 2018-11-09-events.json
│ │ │ │ ├── 2018-11-10-events.json
│ │ │ │ ├── 2018-11-11-events.json
│ │ │ │ ├── 2018-11-12-events.json
│ │ │ │ ├── 2018-11-13-events.json
│ │ │ │ ├── 2018-11-14-events.json
│ │ │ │ ├── 2018-11-15-events.json
│ │ │ │ ├── 2018-11-16-events.json
│ │ │ │ ├── 2018-11-17-events.json
│ │ │ │ ├── 2018-11-18-events.json
│ │ │ │ ├── 2018-11-19-events.json
│ │ │ │ ├── 2018-11-20-events.json
│ │ │ │ ├── 2018-11-21-events.json
│ │ │ │ ├── 2018-11-22-events.json
│ │ │ │ ├── 2018-11-23-events.json
│ │ │ │ ├── 2018-11-24-events.json
│ │ │ │ ├── 2018-11-25-events.json
│ │ │ │ ├── 2018-11-26-events.json
│ │ │ │ ├── 2018-11-27-events.json
│ │ │ │ ├── 2018-11-28-events.json
│ │ │ │ ├── 2018-11-29-events.json
│ │ │ │ └── 2018-11-30-events.json
│ │ └── song_data
│ │ │ └── A
│ │ │ ├── A
│ │ │ ├── A
│ │ │ │ ├── TRAAAAW128F429D538.json
│ │ │ │ ├── TRAAABD128F429CF47.json
│ │ │ │ ├── TRAAADZ128F9348C2E.json
│ │ │ │ ├── TRAAAEF128F4273421.json
│ │ │ │ ├── TRAAAFD128F92F423A.json
│ │ │ │ ├── TRAAAMO128F1481E7F.json
│ │ │ │ ├── TRAAAMQ128F1460CD3.json
│ │ │ │ ├── TRAAAPK128E0786D96.json
│ │ │ │ ├── TRAAARJ128F9320760.json
│ │ │ │ ├── TRAAAVG12903CFA543.json
│ │ │ │ └── TRAAAVO128F93133D4.json
│ │ │ ├── B
│ │ │ │ ├── TRAABCL128F4286650.json
│ │ │ │ ├── TRAABDL12903CAABBA.json
│ │ │ │ ├── TRAABJL12903CDCF1A.json
│ │ │ │ ├── TRAABJV128F1460C49.json
│ │ │ │ ├── TRAABLR128F423B7E3.json
│ │ │ │ ├── TRAABNV128F425CEE1.json
│ │ │ │ ├── TRAABRB128F9306DD5.json
│ │ │ │ ├── TRAABVM128F92CA9DC.json
│ │ │ │ ├── TRAABXG128F9318EBD.json
│ │ │ │ ├── TRAABYN12903CFD305.json
│ │ │ │ └── TRAABYW128F4244559.json
│ │ │ └── C
│ │ │ │ ├── TRAACCG128F92E8A55.json
│ │ │ │ ├── TRAACER128F4290F96.json
│ │ │ │ ├── TRAACFV128F935E50B.json
│ │ │ │ ├── TRAACHN128F1489601.json
│ │ │ │ ├── TRAACIW12903CC0F6D.json
│ │ │ │ ├── TRAACLV128F427E123.json
│ │ │ │ ├── TRAACNS128F14A2DF5.json
│ │ │ │ ├── TRAACOW128F933E35F.json
│ │ │ │ ├── TRAACPE128F421C1B9.json
│ │ │ │ ├── TRAACQT128F9331780.json
│ │ │ │ ├── TRAACSL128F93462F4.json
│ │ │ │ ├── TRAACTB12903CAAF15.json
│ │ │ │ ├── TRAACVS128E078BE39.json
│ │ │ │ └── TRAACZK128F4243829.json
│ │ │ └── B
│ │ │ ├── A
│ │ │ ├── TRABACN128F425B784.json
│ │ │ ├── TRABAFJ128F42AF24E.json
│ │ │ ├── TRABAFP128F931E9A1.json
│ │ │ ├── TRABAIO128F42938F9.json
│ │ │ ├── TRABATO128F42627E9.json
│ │ │ ├── TRABAVQ12903CBF7E0.json
│ │ │ ├── TRABAWW128F4250A31.json
│ │ │ ├── TRABAXL128F424FC50.json
│ │ │ ├── TRABAXR128F426515F.json
│ │ │ ├── TRABAXV128F92F6AE3.json
│ │ │ └── TRABAZH128F930419A.json
│ │ │ ├── B
│ │ │ ├── TRABBAM128F429D223.json
│ │ │ ├── TRABBBV128F42967D7.json
│ │ │ ├── TRABBJE12903CDB442.json
│ │ │ ├── TRABBKX128F4285205.json
│ │ │ ├── TRABBLU128F93349CF.json
│ │ │ ├── TRABBNP128F932546F.json
│ │ │ ├── TRABBOP128F931B50D.json
│ │ │ ├── TRABBOR128F4286200.json
│ │ │ ├── TRABBTA128F933D304.json
│ │ │ ├── TRABBVJ128F92F7EAA.json
│ │ │ ├── TRABBXU128F92FEF48.json
│ │ │ └── TRABBZN12903CD9297.json
│ │ │ └── C
│ │ │ ├── TRABCAJ12903CDFCC2.json
│ │ │ ├── TRABCEC128F426456E.json
│ │ │ ├── TRABCEI128F424C983.json
│ │ │ ├── TRABCFL128F149BB0D.json
│ │ │ ├── TRABCIX128F4265903.json
│ │ │ ├── TRABCKL128F423A778.json
│ │ │ ├── TRABCPZ128F4275C32.json
│ │ │ ├── TRABCRU128F423F449.json
│ │ │ ├── TRABCTK128F934B224.json
│ │ │ ├── TRABCUQ128E0783E2B.json
│ │ │ ├── TRABCXB128F4286BD3.json
│ │ │ └── TRABCYE128F934CE1D.json
│ ├── etl.ipynb
│ ├── etl.py
│ ├── sql_queries.py
│ └── test.ipynb
├── P2_Cassandra_Data_Modeling_and_ETL
│ ├── Project_1B_ Project_Template.ipynb
│ ├── event_data
│ │ ├── 2018-11-01-events.csv
│ │ ├── 2018-11-02-events.csv
│ │ ├── 2018-11-03-events.csv
│ │ ├── 2018-11-04-events.csv
│ │ ├── 2018-11-05-events.csv
│ │ ├── 2018-11-06-events.csv
│ │ ├── 2018-11-07-events.csv
│ │ ├── 2018-11-08-events.csv
│ │ ├── 2018-11-09-events.csv
│ │ ├── 2018-11-10-events.csv
│ │ ├── 2018-11-11-events.csv
│ │ ├── 2018-11-12-events.csv
│ │ ├── 2018-11-13-events.csv
│ │ ├── 2018-11-14-events.csv
│ │ ├── 2018-11-15-events.csv
│ │ ├── 2018-11-16-events.csv
│ │ ├── 2018-11-17-events.csv
│ │ ├── 2018-11-18-events.csv
│ │ ├── 2018-11-19-events.csv
│ │ ├── 2018-11-20-events.csv
│ │ ├── 2018-11-21-events.csv
│ │ ├── 2018-11-22-events.csv
│ │ ├── 2018-11-23-events.csv
│ │ ├── 2018-11-24-events.csv
│ │ ├── 2018-11-25-events.csv
│ │ ├── 2018-11-26-events.csv
│ │ ├── 2018-11-27-events.csv
│ │ ├── 2018-11-28-events.csv
│ │ ├── 2018-11-29-events.csv
│ │ └── 2018-11-30-events.csv
│ ├── event_datafile_new.csv
│ └── images
│ │ └── image_event_datafile_new.jpg
└── notebooks
│ ├── L1-D0-creating-a-table-with-postgres.ipynb
│ ├── L1-D1-creating-a-table-with-postgres.ipynb
│ ├── L1-D2-creating-a-table-with-apache-cassandra.ipynb
│ ├── L2-D1-creating-normalized-tables.ipynb
│ ├── L2-D2-creating-denormalized-tables.ipynb
│ ├── L2-D3-creating-fact-and-dimension-tables-with-star-schema.ipynb
│ ├── L3-D1-2-queries-2-tables.ipynb
│ ├── L3-D2-primary-key.ipynb
│ ├── L3-D3-clustering-column.ipynb
│ └── L3-D4-using-the-where-clause.ipynb
├── 2_dend_cloud_data_warehouses
├── Data warehousing in the cloud.pdf
├── P3_Data_Warehouse_Project
│ ├── .vscode
│ │ └── settings.json
│ ├── README.md
│ ├── analytics.py
│ ├── create_cluster.py
│ ├── create_tables.py
│ ├── etl.py
│ ├── requirements.txt
│ └── sql_queries.py
├── infrastructure_as_code.py
├── log-data.csv
│ └── log_data.csv
├── notebooks
│ ├── Data
│ │ ├── README
│ │ ├── pagila-data.sql
│ │ ├── pagila-insert-data.sql
│ │ └── pagila-schema.sql
│ ├── L1 E1 - Step 1 and 2.ipynb
│ ├── L1 E1 - Step 3.ipynb
│ ├── L1 E1 - Step 4.ipynb
│ ├── L1 E1 - Step 5.ipynb
│ ├── L1 E1 - Step 6.ipynb
│ ├── L1 E2 - 1 - Slicing and Dicing.ipynb
│ ├── L1 E2 - 2 - Roll up and Drill Down.ipynb
│ ├── L1 E2 - 3 - Grouping Sets.ipynb
│ ├── L1 E2 - 4 - CUBE.ipynb
│ ├── L1 E3 - Columnar Vs Row Storage.ipynb
│ ├── L3 Exercise 2 - IaC - Solution.ipynb
│ ├── L3 Exercise 2 - IaC - Solution.py
│ ├── L3 Exercise 3 - Parallel ETL - Solution.ipynb
│ ├── L3 Exercise 3 - Parallel ETL - Solution.py
│ ├── L3 Exercise 4 - Table Design - Solution.ipynb
│ ├── L3 Exercise 4 - Table Design - Solution.py
│ └── pagila-star.png
└── notes
│ └── AWS.md
├── 3_dend_spark_data_lakes
├── Data Lakes with Spark.pdf
├── P4_Data_Lake
│ ├── README.md
│ └── etl.py
├── data
│ ├── log-data.png
│ ├── log-data.zip
│ ├── song-data.zip
│ └── sparkify_log_small.json
├── notebooks
│ ├── 1_procedural_vs_functional_in_python.ipynb
│ ├── 2_spark_maps_and_lazy_evaluation.ipynb
│ ├── 3_data_inputs_and_outputs.ipynb
│ ├── 4_data_wrangling.ipynb
│ ├── 5_dataframe_quiz.ipynb
│ ├── 6_dataframe_quiz_solution.ipynb
│ ├── 7_data_wrangling-sql.ipynb
│ ├── 8_spark_sql_quiz.ipynb
│ ├── 9_spark_sql_quiz_solution.ipynb
│ ├── Exercise 1 - Schema On Read.ipynb
│ ├── Exercise 2 - Advanced Analytics NLP.ipynb
│ ├── Exercise 3 - Data Lake on S3.ipynb
│ └── mapreduce_practice.ipynb
└── spark.md
├── 4_dend_airflow_data_pipelines
├── P5_Data_Pipelines
│ ├── README.md
│ ├── __init__.py
│ ├── airflow.db
│ ├── dags
│ │ ├── __init__.py
│ │ ├── sparkify_dend_dag.py
│ │ └── sparkify_dend_dimesions_subdag.py
│ ├── imgs
│ │ ├── airflow-details-dag.png
│ │ ├── airflow-running-dag.png
│ │ ├── dag-code.png
│ │ └── dag.png
│ └── plugins
│ │ ├── __init__.py
│ │ ├── helpers
│ │ ├── __init__.py
│ │ └── sql_queries.py
│ │ └── operators
│ │ ├── __init__.py
│ │ ├── create_tables.py
│ │ ├── create_tables.sql
│ │ ├── data_quality.py
│ │ ├── load_dimension.py
│ │ ├── load_fact.py
│ │ └── stage_redshift.py
├── data_pipelines.md
├── exercises
│ ├── __init__.py
│ ├── dags
│ │ ├── 1_ex1_hello_world.py
│ │ ├── 1_ex2_scheduler.py
│ │ ├── 1_ex3_dependencies.py
│ │ ├── 1_ex4_connections.py
│ │ ├── 1_ex5_context.py
│ │ ├── 1_ex6_redshift_queries.py
│ │ ├── 2_ex1_data_lineage.py
│ │ ├── 2_ex2_schedule_backfilling.py
│ │ ├── 2_ex3_data_partitioning.py
│ │ ├── 2_ex4_data_quality.py
│ │ ├── 3_ex1_plugins.py
│ │ ├── 3_ex2_refactoring.py
│ │ ├── 3_ex3_subdags
│ │ │ ├── __init__.py
│ │ │ ├── dag.py
│ │ │ └── subdag.py
│ │ ├── 3_ex4_full_dag.py
│ │ ├── __init__.py
│ │ └── sql_statements.py
│ └── plugins
│ │ ├── __init__.py
│ │ └── operators
│ │ ├── __init__.py
│ │ ├── facts_calculator.py
│ │ ├── has_rows.py
│ │ └── s3_to_redshift.py
└── glossary-data-pipelines-in-airflow.pdf
├── DEND.code-workspace
├── LICENSE
├── README.md
├── _config.yml
├── cheatsheets
├── Data-Science-Books-for-2018.pdf
├── Pandas DataFrame Notes.pdf
├── Pandas_Cheat_Sheet.pdf
└── linux cheatsheet.jpg
├── data-engineering.jpg
└── environment.yml
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.DS_Store
6 | .idea/*
7 |
8 | # C extensions
9 | *.so
10 | *.cfg
11 | .idea/
12 | .vscode/
13 | credentials.csv
14 |
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 |
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 |
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 |
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # pyenv
82 | .python-version
83 |
84 | # celery beat schedule file
85 | celerybeat-schedule
86 |
87 | # SageMath parsed files
88 | *.sage.py
89 |
90 | # Environments
91 | .env
92 | .venv
93 | env/
94 | venv/
95 | ENV/
96 | env.bak/
97 | venv.bak/
98 |
99 | # Spyder project settings
100 | .spyderproject
101 | .spyproject
102 |
103 | # Rope project settings
104 | .ropeproject
105 |
106 | # mkdocs documentation
107 | /site
108 |
109 | # mypy
110 | .mypy_cache/
111 |
--------------------------------------------------------------------------------
/.idea/Data-engineering-nanodegree.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/markdown-navigator.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/home/f.silvestre/anaconda3/envs/pasi-server/bin/python"
3 | }
--------------------------------------------------------------------------------
/1_dend_data_modeling/Data Modeling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/1_dend_data_modeling/Data Modeling.pdf
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/create_tables.py:
--------------------------------------------------------------------------------
1 | import psycopg2
2 | from sql_queries import create_table_queries, drop_table_queries
3 |
4 |
5 | def create_database():
6 | '''Creates the sparkifydb database and connects to it. Returns the cursor and connection to the DB.'''
7 | # connect to default database
8 | conn = psycopg2.connect("host=127.0.0.1 dbname=studentdb user=student password=student")
9 | conn.set_session(autocommit=True)
10 | cur = conn.cursor()
11 |
12 | # create sparkify database with UTF8 encoding
13 | cur.execute("DROP DATABASE IF EXISTS sparkifydb")
14 | cur.execute("CREATE DATABASE sparkifydb WITH ENCODING 'utf8' TEMPLATE template0")
15 |
16 | # close connection to default database
17 | conn.close()
18 |
19 | # connect to sparkify database
20 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student")
21 | cur = conn.cursor()
22 |
23 | return cur, conn
24 |
25 |
26 | def drop_tables(cur, conn):
27 | '''Drops all tables created on the database'''
28 | for query in drop_table_queries:
29 | cur.execute(query)
30 | conn.commit()
31 |
32 |
33 | def create_tables(cur, conn):
34 | '''Creates the tables defined in the sql_queries script: [songplays, users, songs, artists, time]'''
35 | for query in create_table_queries:
36 | cur.execute(query)
37 | conn.commit()
38 |
39 |
40 | def main():
41 | """Drops and re-creates the sparkifydb database and all related tables.
42 | Usage: python create_tables.py
43 | """
44 | cur, conn = create_database()
45 |
46 | drop_tables(cur, conn)
47 | create_tables(cur, conn)
48 |
49 | conn.close()
50 |
51 |
52 | if __name__ == "__main__":
53 | main()
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAAW128F429D538.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOMZWCG12A8C13C480", "title": "I Didn't Mean To", "duration": 218.93179, "year": 0}
--------------------------------------------------------------------------------
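A minimal sketch (not part of the repository) of how a song record like the one above is loaded: per etl.py further below, process_song_file reads each file with pd.read_json(..., lines=True), which turns the single JSON object into a one-row DataFrame.

    import pandas as pd

    # One JSON object per line -> one DataFrame row holding the song record.
    df = pd.read_json("data/song_data/A/A/A/TRAAAAW128F429D538.json", lines=True)

    # The fields the ETL later splits into the songs and artists tables.
    print(df[["song_id", "title", "artist_id", "artist_name", "duration", "year"]])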
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAABD128F429CF47.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARMJAGH1187FB546F3", "artist_latitude": 35.14968, "artist_longitude": -90.04892, "artist_location": "Memphis, TN", "artist_name": "The Box Tops", "song_id": "SOCIWDW12A8C13D406", "title": "Soul Deep", "duration": 148.03546, "year": 1969}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAADZ128F9348C2E.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARKRRTF1187B9984DA", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sonora Santanera", "song_id": "SOXVLOJ12AB0189215", "title": "Amor De Cabaret", "duration": 177.47546, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAEF128F4273421.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR7G5I41187FB4CE6C", "artist_latitude": null, "artist_longitude": null, "artist_location": "London, England", "artist_name": "Adam Ant", "song_id": "SONHOTT12A8C13493C", "title": "Something Girls", "duration": 233.40363, "year": 1982}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAFD128F92F423A.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARXR32B1187FB57099", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gob", "song_id": "SOFSOCN12A8C143F5D", "title": "Face the Ashes", "duration": 209.60608, "year": 2007}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAMO128F1481E7F.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARKFYS91187B98E58F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Jeff And Sheri Easter", "song_id": "SOYMRWW12A6D4FAB14", "title": "The Moon And I (Ordinary Day Album Version)", "duration": 267.7024, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAMQ128F1460CD3.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARD0S291187B9B7BF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "Ohio", "artist_name": "Rated R", "song_id": "SOMJBYD12A6D4F8557", "title": "Keepin It Real (Skit)", "duration": 114.78159, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAPK128E0786D96.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR10USD1187B99F3F1", "artist_latitude": null, "artist_longitude": null, "artist_location": "Burlington, Ontario, Canada", "artist_name": "Tweeterfriendly Music", "song_id": "SOHKNRJ12A6701D1F8", "title": "Drop of Rain", "duration": 189.57016, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAARJ128F9320760.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR8ZCNI1187B9A069B", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Planet P Project", "song_id": "SOIAZJW12AB01853F1", "title": "Pink World", "duration": 269.81832, "year": 1984}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAVG12903CFA543.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOUDSGM12AC9618304", "title": "Insatiable (Instrumental Version)", "duration": 266.39628, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/A/TRAAAVO128F93133D4.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARGSJW91187B9B1D6B", "artist_latitude": 35.21962, "artist_longitude": -80.01955, "artist_location": "North Carolina", "artist_name": "JennyAnyKind", "song_id": "SOQHXMF12AB0182363", "title": "Young Boy Blues", "duration": 218.77506, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABCL128F4286650.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARC43071187B990240", "artist_latitude": null, "artist_longitude": null, "artist_location": "Wisner, LA", "artist_name": "Wayne Watson", "song_id": "SOKEJEJ12A8C13E0D0", "title": "The Urgency (LP Version)", "duration": 245.21098, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABDL12903CAABBA.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARL7K851187B99ACD2", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Andy Andy", "song_id": "SOMUYGI12AB0188633", "title": "La Culpa", "duration": 226.35057, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABJL12903CDCF1A.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARHHO3O1187B989413", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Bob Azzam", "song_id": "SORAMLE12AB017C8B0", "title": "Auguri Cha Cha", "duration": 191.84281, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABJV128F1460C49.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARIK43K1187B9AE54C", "artist_latitude": null, "artist_longitude": null, "artist_location": "Beverly Hills, CA", "artist_name": "Lionel Richie", "song_id": "SOBONFF12A6D4F84D8", "title": "Tonight Will Be Alright", "duration": 307.3824, "year": 1986}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABLR128F423B7E3.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARD842G1187B997376", "artist_latitude": 43.64856, "artist_longitude": -79.38533, "artist_location": "Toronto, Ontario, Canada", "artist_name": "Blue Rodeo", "song_id": "SOHUOAP12A8AE488E9", "title": "Floating", "duration": 491.12771, "year": 1987}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABNV128F425CEE1.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARIG6O41187B988BDD", "artist_latitude": 37.16793, "artist_longitude": -95.84502, "artist_location": "United States", "artist_name": "Richard Souther", "song_id": "SOUQQEA12A8C134B1B", "title": "High Tide", "duration": 228.5971, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABRB128F9306DD5.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR1ZHYZ1187FB3C717", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Faiz Ali Faiz", "song_id": "SOILPQQ12AB017E82A", "title": "Sohna Nee Sohna Data", "duration": 599.24853, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABVM128F92CA9DC.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARYKCQI1187FB3B18F", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Tesla", "song_id": "SOXLBJT12A8C140925", "title": "Caught In A Dream", "duration": 290.29832, "year": 2004}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABXG128F9318EBD.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARNPAGP1241B9C7FD4", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "lextrical", "song_id": "SOZVMJI12AB01808AF", "title": "Synthetic Dream", "duration": 165.69424, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABYN12903CFD305.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARQGYP71187FB44566", "artist_latitude": 34.31109, "artist_longitude": -94.02978, "artist_location": "Mineola, AR", "artist_name": "Jimmy Wakely", "song_id": "SOWTBJW12AC468AC6E", "title": "Broken-Down Merry-Go-Round", "duration": 151.84934, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/B/TRAABYW128F4244559.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARI3BMM1187FB4255E", "artist_latitude": 38.8991, "artist_longitude": -77.029, "artist_location": "Washington", "artist_name": "Alice Stuart", "song_id": "SOBEBDG12A58A76D60", "title": "Kassie Jones", "duration": 220.78649, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACCG128F92E8A55.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR5KOSW1187FB35FF4", "artist_latitude": 49.80388, "artist_longitude": 15.47491, "artist_location": "Dubai UAE", "artist_name": "Elena", "song_id": "SOZCTXZ12AB0182364", "title": "Setanta matins", "duration": 269.58322, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACER128F4290F96.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARMAC4T1187FB3FA4C", "artist_latitude": 40.82624, "artist_longitude": -74.47995, "artist_location": "Morris Plains, NJ", "artist_name": "The Dillinger Escape Plan", "song_id": "SOBBUGU12A8C13E95D", "title": "Setting Fire to Sleeping Giants", "duration": 207.77751, "year": 2004}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACFV128F935E50B.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR47JEX1187B995D81", "artist_latitude": 37.83721, "artist_longitude": -94.35868, "artist_location": "Nevada, MO", "artist_name": "SUE THOMPSON", "song_id": "SOBLGCN12AB0183212", "title": "James (Hold The Ladder Steady)", "duration": 124.86485, "year": 1985}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACHN128F1489601.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARGIWFO1187B9B55B7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Five Bolt Main", "song_id": "SOPSWQW12A6D4F8781", "title": "Made Like This (Live)", "duration": 225.09669, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACIW12903CC0F6D.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARNTLGG11E2835DDB9", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Clp", "song_id": "SOZQDIU12A58A7BCF6", "title": "Superconfidential", "duration": 338.31138, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACLV128F427E123.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARDNS031187B9924F0", "artist_latitude": 32.67828, "artist_longitude": -83.22295, "artist_location": "Georgia", "artist_name": "Tim Wilson", "song_id": "SONYPOM12A8C13B2D7", "title": "I Think My Wife Is Running Around On Me (Taco Hell)", "duration": 186.48771, "year": 2005}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACNS128F14A2DF5.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AROUOZZ1187B9ABE51", "artist_latitude": 40.79195, "artist_longitude": -73.94512, "artist_location": "New York, NY [Spanish Harlem]", "artist_name": "Willie Bobo", "song_id": "SOBZBAZ12A6D4F8742", "title": "Spanish Grease", "duration": 168.25424, "year": 1997}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACOW128F933E35F.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARH4Z031187B9A71F2", "artist_latitude": 40.73197, "artist_longitude": -74.17418, "artist_location": "Newark, NJ", "artist_name": "Faye Adams", "song_id": "SOVYKGO12AB0187199", "title": "Crazy Mixed Up World", "duration": 156.39465, "year": 1961}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACPE128F421C1B9.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARB29H41187B98F0EF", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago", "artist_name": "Terry Callier", "song_id": "SOGNCJP12A58A80271", "title": "Do You Finally Need A Friend", "duration": 342.56934, "year": 1972}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACQT128F9331780.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR1Y2PT1187FB5B9CE", "artist_latitude": 27.94017, "artist_longitude": -82.32547, "artist_location": "Brandon", "artist_name": "John Wesley", "song_id": "SOLLHMX12AB01846DC", "title": "The Emperor Falls", "duration": 484.62322, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACSL128F93462F4.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARAJPHH1187FB5566A", "artist_latitude": 40.7038, "artist_longitude": -73.83168, "artist_location": "Queens, NY", "artist_name": "The Shangri-Las", "song_id": "SOYTPEP12AB0180E7B", "title": "Twist and Shout", "duration": 164.80608, "year": 1964}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACTB12903CAAF15.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR0RCMP1187FB3F427", "artist_latitude": 30.08615, "artist_longitude": -94.10158, "artist_location": "Beaumont, TX", "artist_name": "Billie Jo Spears", "song_id": "SOGXHEG12AB018653E", "title": "It Makes No Difference Now", "duration": 133.32853, "year": 1992}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACVS128E078BE39.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AREBBGV1187FB523D2", "artist_latitude": null, "artist_longitude": null, "artist_location": "Houston, TX", "artist_name": "Mike Jones (Featuring CJ_ Mello & Lil' Bran)", "song_id": "SOOLYAZ12A6701F4A6", "title": "Laws Patrolling (Album Version)", "duration": 173.66159, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/A/C/TRAACZK128F4243829.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARGUVEV1187B98BA17", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Sierra Maestra", "song_id": "SOGOSOV12AF72A285E", "title": "\u00bfD\u00f3nde va Chichi?", "duration": 313.12934, "year": 1997}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABACN128F425B784.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARD7TVE1187B99BFB1", "artist_latitude": null, "artist_longitude": null, "artist_location": "California - LA", "artist_name": "Casual", "song_id": "SOQLGFP12A58A7800E", "title": "OAKtown", "duration": 259.44771, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAFJ128F42AF24E.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR3JMC51187B9AE49D", "artist_latitude": 28.53823, "artist_longitude": -81.37739, "artist_location": "Orlando, FL", "artist_name": "Backstreet Boys", "song_id": "SOPVXLX12A8C1402D5", "title": "Larger Than Life", "duration": 236.25098, "year": 1999}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAFP128F931E9A1.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARPBNLO1187FB3D52F", "artist_latitude": 40.71455, "artist_longitude": -74.00712, "artist_location": "New York, NY", "artist_name": "Tiny Tim", "song_id": "SOAOIBZ12AB01815BE", "title": "I Hold Your Hand In Mine [Live At Royal Albert Hall]", "duration": 43.36281, "year": 2000}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAIO128F42938F9.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR9AWNF1187B9AB0B4", "artist_latitude": null, "artist_longitude": null, "artist_location": "Seattle, Washington USA", "artist_name": "Kenny G featuring Daryl Hall", "song_id": "SOZHPGD12A8C1394FE", "title": "Baby Come To Me", "duration": 236.93016, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABATO128F42627E9.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AROGWRA122988FEE45", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Christos Dantis", "song_id": "SOSLAVG12A8C13397F", "title": "Den Pai Alo", "duration": 243.82649, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAVQ12903CBF7E0.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARMBR4Y1187B9990EB", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "David Martin", "song_id": "SOTTDKS12AB018D69B", "title": "It Wont Be Christmas", "duration": 241.47546, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAWW128F4250A31.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARQ9BO41187FB5CF1F", "artist_latitude": 40.99471, "artist_longitude": -77.60454, "artist_location": "Pennsylvania", "artist_name": "John Davis", "song_id": "SOMVWWT12A58A7AE05", "title": "Knocked Out Of The Park", "duration": 183.17016, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAXL128F424FC50.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARKULSX1187FB45F84", "artist_latitude": 39.49974, "artist_longitude": -111.54732, "artist_location": "Utah", "artist_name": "Trafik", "song_id": "SOQVMXR12A81C21483", "title": "Salt In NYC", "duration": 424.12363, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAXR128F426515F.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARI2JSK1187FB496EF", "artist_latitude": 51.50632, "artist_longitude": -0.12714, "artist_location": "London, England", "artist_name": "Nick Ingman;Gavyn Wright", "song_id": "SODUJBS12A8C132150", "title": "Wessex Loses a Bride", "duration": 111.62077, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAXV128F92F6AE3.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AREDBBQ1187B98AFF5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Eddie Calvert", "song_id": "SOBBXLX12A58A79DDA", "title": "Erica (2005 Digital Remaster)", "duration": 138.63138, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/A/TRABAZH128F930419A.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR7ZKHQ1187B98DD73", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Glad", "song_id": "SOTUKVB12AB0181477", "title": "Blessed Assurance", "duration": 270.602, "year": 1993}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBAM128F429D223.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARBGXIG122988F409D", "artist_latitude": 37.77916, "artist_longitude": -122.42005, "artist_location": "California - SF", "artist_name": "Steel Rain", "song_id": "SOOJPRH12A8C141995", "title": "Loaded Like A Gun", "duration": 173.19138, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBBV128F42967D7.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR7SMBG1187B9B9066", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Los Manolos", "song_id": "SOBCOSW12A8C13D398", "title": "Rumba De Barcelona", "duration": 218.38322, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBJE12903CDB442.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARGCY1Y1187B9A4FA5", "artist_latitude": 36.16778, "artist_longitude": -86.77836, "artist_location": "Nashville, TN.", "artist_name": "Gloriana", "song_id": "SOQOTLQ12AB01868D0", "title": "Clementina Santaf\u00e8", "duration": 153.33832, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBKX128F4285205.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR36F9J1187FB406F1", "artist_latitude": 56.27609, "artist_longitude": 9.51695, "artist_location": "Denmark", "artist_name": "Bombay Rockers", "song_id": "SOBKWDJ12A8C13B2F3", "title": "Wild Rose (Back 2 Basics Mix)", "duration": 230.71302, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBLU128F93349CF.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARNNKDK1187B98BBD5", "artist_latitude": 45.80726, "artist_longitude": 15.9676, "artist_location": "Zagreb Croatia", "artist_name": "Jinx", "song_id": "SOFNOQK12AB01840FC", "title": "Kutt Free (DJ Volume Remix)", "duration": 407.37914, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBNP128F932546F.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR62SOJ1187FB47BB5", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Chase & Status", "song_id": "SOGVQGJ12AB017F169", "title": "Ten Tonne", "duration": 337.68444, "year": 2005}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBOP128F931B50D.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARBEBBY1187B9B43DB", "artist_latitude": null, "artist_longitude": null, "artist_location": "Gainesville, FL", "artist_name": "Tom Petty", "song_id": "SOFFKZS12AB017F194", "title": "A Higher Place (Album Version)", "duration": 236.17261, "year": 1994}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBOR128F4286200.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARDR4AC1187FB371A1", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Montserrat Caball\u00e9;Placido Domingo;Vicente Sardinero;Judith Blegen;Sherrill Milnes;Georg Solti", "song_id": "SOBAYLL12A8C138AF9", "title": "Sono andati? Fingevo di dormire", "duration": 511.16363, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBTA128F933D304.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARAGB2O1187FB3A161", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Pucho & His Latin Soul Brothers", "song_id": "SOLEYHO12AB0188A85", "title": "Got My Mojo Workin", "duration": 338.23302, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBVJ128F92F7EAA.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AREDL271187FB40F44", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Soul Mekanik", "song_id": "SOPEGZN12AB0181B3D", "title": "Get Your Head Stuck On Your Neck", "duration": 45.66159, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBXU128F92FEF48.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARP6N5A1187B99D1A3", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamtramck, MI", "artist_name": "Mitch Ryder", "song_id": "SOXILUQ12A58A7C72A", "title": "Jenny Take a Ride", "duration": 207.43791, "year": 2004}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/B/TRABBZN12903CD9297.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARGSAFR1269FB35070", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Blingtones", "song_id": "SOTCKKY12AB018A141", "title": "Sonnerie lalaleul\u00e9 hi houuu", "duration": 29.54404, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCAJ12903CDFCC2.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARULZCI1241B9C8611", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Luna Orbit Project", "song_id": "SOSWKAV12AB018FC91", "title": "Midnight Star", "duration": 335.51628, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCEC128F426456E.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR0IAWL1187B9A96D0", "artist_latitude": 8.4177, "artist_longitude": -80.11278, "artist_location": "Panama", "artist_name": "Danilo Perez", "song_id": "SONSKXP12A8C13A2C9", "title": "Native Soul", "duration": 197.19791, "year": 2003}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCEI128F424C983.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCFL128F149BB0D.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARLTWXK1187FB5A3F8", "artist_latitude": 32.74863, "artist_longitude": -97.32925, "artist_location": "Fort Worth, TX", "artist_name": "King Curtis", "song_id": "SODREIN12A58A7F2E5", "title": "A Whiter Shade Of Pale (Live @ Fillmore West)", "duration": 326.00771, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCIX128F4265903.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARNF6401187FB57032", "artist_latitude": 40.79086, "artist_longitude": -73.96644, "artist_location": "New York, NY [Manhattan]", "artist_name": "Sophie B. Hawkins", "song_id": "SONWXQJ12A8C134D94", "title": "The Ballad Of Sleeping Beauty", "duration": 305.162, "year": 1994}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCKL128F423A778.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARPFHN61187FB575F6", "artist_latitude": 41.88415, "artist_longitude": -87.63241, "artist_location": "Chicago, IL", "artist_name": "Lupe Fiasco", "song_id": "SOWQTQZ12A58A7B63E", "title": "Streets On Fire (Explicit Album Version)", "duration": 279.97995, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCPZ128F4275C32.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR051KA1187B98B2FF", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Wilks", "song_id": "SOLYIBD12A8C135045", "title": "Music is what we love", "duration": 261.51138, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCRU128F423F449.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR8IEZO1187B99055E", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Marc Shaiman", "song_id": "SOINLJW12A8C13314C", "title": "City Slickers", "duration": 149.86404, "year": 2008}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCTK128F934B224.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AR558FS1187FB45658", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "40 Grit", "song_id": "SOGDBUF12A8C140FAA", "title": "Intro", "duration": 75.67628, "year": 2003}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCUQ128E0783E2B.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARVBRGZ1187FB4675A", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Gwen Stefani", "song_id": "SORRZGD12A6310DBC3", "title": "Harajuku Girls", "duration": 290.55955, "year": 2004}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCXB128F4286BD3.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "ARWB3G61187FB49404", "artist_latitude": null, "artist_longitude": null, "artist_location": "Hamilton, Ohio", "artist_name": "Steve Morse", "song_id": "SODAUVL12A8C13D184", "title": "Prognosis", "duration": 363.85914, "year": 2000}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/data/song_data/A/B/C/TRABCYE128F934CE1D.json:
--------------------------------------------------------------------------------
1 | {"num_songs": 1, "artist_id": "AREVWGE1187B9B890A", "artist_latitude": -13.442, "artist_longitude": -41.9952, "artist_location": "Noci (BA)", "artist_name": "Bitter End", "song_id": "SOFCHDR12AB01866EF", "title": "Living Hell", "duration": 282.43546, "year": 0}
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/etl.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import psycopg2
4 | import pandas as pd
5 | from sql_queries import *
6 |
7 |
8 | def process_song_file(cur, filepath):
9 | """Reads a song metadata file, selects the needed fields and inserts them into the songs and artists tables.
10 |
11 | Parameters:
12 | cur (psycopg2.cursor()): Cursor of the sparkifydb database
13 | filepath (str): Filepath of the file to be analyzed
14 | """
15 | # open song file
16 | df = pd.read_json(filepath, lines=True)
17 |
18 | for value in df.values:
19 | artist_id, artist_latitude, artist_location, artist_longitude, artist_name, duration, num_songs, song_id, title, year = value
20 |
21 | # insert artist record
22 | artist_data = [artist_id, artist_name, artist_location, artist_longitude, artist_latitude]
23 | cur.execute(artist_table_insert, artist_data)
24 |
25 | # insert song record
26 | song_data = [song_id, title, artist_id, year, duration]
27 | cur.execute(song_table_insert, song_data)
28 |
29 |
30 | def process_log_file(cur, filepath):
31 | """Reads a user activity log file, filters for NextSong events, selects the needed fields, transforms them and inserts
32 | them into the time, users and songplays tables.
33 |
34 | Parameters:
35 | cur (psycopg2.cursor()): Cursor of the sparkifydb database
36 | filepath (str): Filepath of the file to be analyzed
37 | """
38 | # open log file
39 | df = pd.read_json(filepath, lines=True)
40 |
41 | # filter by NextSong action
42 | df = df[df['page']=='NextSong']
43 |
44 | # convert timestamp column to datetime
45 | t = pd.to_datetime(df['ts'], unit='ms')
46 |
47 | # insert time data records
48 | time_data = []
49 | for line in t:
50 | time_data.append([line, line.hour, line.day, line.week, line.month, line.year, line.day_name()])
51 | column_labels = ('start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday')
52 | time_df = pd.DataFrame.from_records(time_data, columns=column_labels)
53 |
54 | for i, row in time_df.iterrows():
55 | cur.execute(time_table_insert, list(row))
56 |
57 | # load user table
58 | user_df = df[['userId', 'firstName', 'lastName', 'gender', 'level']]
59 |
60 | # insert user records
61 | for i, row in user_df.iterrows():
62 | cur.execute(user_table_insert, row)
63 |
64 | # insert songplay records
65 | for index, row in df.iterrows():
66 |
67 | # get songid and artistid from song and artist tables
68 | cur.execute(song_select, (row.song, row.artist, row.length))
69 | results = cur.fetchone()
70 |
71 | if results:
72 | songid, artistid = results
73 | else:
74 | songid, artistid = None, None
75 |
76 | # insert songplay record
77 | songplay_data = (index, pd.to_datetime(row.ts, unit='ms'), int(row.userId), row.level, songid, artistid, row.sessionId, row.location, row.userAgent)
78 | cur.execute(songplay_table_insert, songplay_data)
79 |
80 |
81 | def process_data(cur, conn, filepath, func):
82 | """Walks through all files nested under filepath, and processes all logs found.
83 |
84 | Parameters:
85 | cur (psycopg2.cursor()): Cursor of the sparkifydb database
86 | conn (psycopg2.connect()): Connection to the sparkifydb database
87 | filepath (str): Parent directory of the log files to be processed
88 | func (python function): Function to be used to process each log
89 |
90 | Returns:
91 | all_files (list): Paths of all files processed
92 | """
93 | # get all files matching extension from directory
94 | all_files = []
95 | for root, dirs, files in os.walk(filepath):
96 | files = glob.glob(os.path.join(root,'*.json'))
97 | for f in files :
98 | all_files.append(os.path.abspath(f))
99 |
100 | # get total number of files found
101 | num_files = len(all_files)
102 | print('{} files found in {}'.format(num_files, filepath))
103 |
104 | # iterate over files and process
105 | for i, datafile in enumerate(all_files, 1):
106 | func(cur, datafile)
107 | conn.commit()
108 | print('{}/{} files processed.'.format(i, num_files))
109 |
110 | return all_files
111 |
112 |
113 | def main():
114 | """Function used to extract, transform all data from song and user activity logs and load it into a PostgreSQL DB
115 | Usage: python etl.py
116 | """
117 | conn = psycopg2.connect("host=127.0.0.1 dbname=sparkifydb user=student password=student")
118 | cur = conn.cursor()
119 |
120 | process_data(cur, conn, filepath='data/song_data', func=process_song_file)
121 | process_data(cur, conn, filepath='data/log_data', func=process_log_file)
122 |
123 | conn.close()
124 |
125 |
126 | if __name__ == "__main__":
127 | main()
--------------------------------------------------------------------------------
/1_dend_data_modeling/P1_Postgres_Data_Modeling_and_ETL/sql_queries.py:
--------------------------------------------------------------------------------
1 | # DROP TABLES
2 |
3 | songplay_table_drop = "DROP TABLE IF EXISTS songplays"
4 | user_table_drop = "DROP TABLE IF EXISTS users"
5 | song_table_drop = "DROP TABLE IF EXISTS songs"
6 | artist_table_drop = "DROP TABLE IF EXISTS artists"
7 | time_table_drop = "DROP TABLE IF EXISTS time"
8 |
9 | # CREATE TABLES
10 |
11 | songplay_table_create = ("""
12 | CREATE TABLE IF NOT EXISTS songplays
13 | (songplay_id int PRIMARY KEY,
14 | start_time date REFERENCES time(start_time),
15 | user_id int NOT NULL REFERENCES users(user_id),
16 | level text,
17 | song_id text REFERENCES songs(song_id),
18 | artist_id text REFERENCES artists(artist_id),
19 | session_id int,
20 | location text,
21 | user_agent text)
22 | """)
23 |
24 | user_table_create = ("""
25 | CREATE TABLE IF NOT EXISTS users
26 | (user_id int PRIMARY KEY,
27 | first_name text NOT NULL,
28 | last_name text NOT NULL,
29 | gender text,
30 | level text)
31 | """)
32 |
33 | song_table_create = ("""
34 | CREATE TABLE IF NOT EXISTS songs
35 | (song_id text PRIMARY KEY,
36 | title text NOT NULL,
37 | artist_id text NOT NULL REFERENCES artists(artist_id),
38 | year int,
39 | duration float NOT NULL)
40 | """)
41 |
42 | artist_table_create = ("""
43 | CREATE TABLE IF NOT EXISTS artists
44 | (artist_id text PRIMARY KEY,
45 | name text NOT NULL,
46 | location text,
47 | lattitude float,
48 | longitude float)
49 | """)
50 |
51 | time_table_create = ("""
52 | CREATE TABLE IF NOT EXISTS time
53 | (start_time date PRIMARY KEY,
54 | hour int,
55 | day int,
56 | week int,
57 | month int,
58 | year int,
59 | weekday text)
60 | """)
61 |
62 | # INSERT RECORDS
63 |
64 | songplay_table_insert = ("""
65 | INSERT INTO songplays
66 | (songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent)
67 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
68 | ON CONFLICT (songplay_id) DO NOTHING;
69 | """)
70 |
71 | user_table_insert = ("""
72 | INSERT INTO users
73 | (user_id, first_name, last_name, gender, level)
74 | VALUES (%s, %s, %s, %s, %s)
75 | ON CONFLICT (user_id) DO NOTHING;
76 | """)
77 |
78 | song_table_insert = ("""
79 | INSERT INTO songs
80 | (song_id, title, artist_id, year, duration)
81 | VALUES (%s, %s, %s, %s, %s)
82 | ON CONFLICT (song_id) DO NOTHING;
83 | """)
84 |
85 | artist_table_insert = ("""
86 | INSERT INTO artists
87 | (artist_id, name, location, lattitude, longitude)
88 | VALUES (%s, %s, %s, %s, %s)
89 | ON CONFLICT (artist_id) DO NOTHING;
90 | """)
91 |
92 |
93 | time_table_insert = ("""
94 | INSERT INTO time
95 | (start_time, hour, day, week, month, year, weekday)
96 | VALUES (%s, %s, %s, %s, %s, %s, %s)
97 | ON CONFLICT (start_time) DO NOTHING;
98 | """)
99 |
100 | # FIND SONGS
101 |
102 | song_select = ("""
103 | SELECT song_id, artists.artist_id
104 | FROM songs JOIN artists ON songs.artist_id = artists.artist_id
105 | WHERE songs.title = %s
106 | AND artists.name = %s
107 | AND songs.duration = %s
108 | """)
109 |
110 | # QUERY LISTS
111 |
112 | create_table_queries = [user_table_create, artist_table_create, song_table_create, time_table_create, songplay_table_create]
113 | drop_table_queries = [songplay_table_drop, song_table_drop, user_table_drop, artist_table_drop, time_table_drop]
--------------------------------------------------------------------------------
/1_dend_data_modeling/P2_Cassandra_Data_Modeling_and_ETL/event_data/2018-11-01-events.csv:
--------------------------------------------------------------------------------
1 | artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId
2 | ,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1.54092E+12,38,,200,1.54111E+12,39
3 | ,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1.54034E+12,139,,200,1.54111E+12,8
4 | Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,You Gotta Be,200,1.54111E+12,8
5 | ,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1.54034E+12,139,,200,1.54111E+12,8
6 | Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Flat 55,200,1.54111E+12,8
7 | Tamba Trio,Logged In,Kaylee,F,4,Summers,177.18812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Quem Quiser Encontrar O Amor,200,1.54111E+12,8
8 | The Mars Volta,Logged In,Kaylee,F,5,Summers,380.42077,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Eriatarka,200,1.54111E+12,8
9 | Infected Mushroom,Logged In,Kaylee,F,6,Summers,440.2673,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Becoming Insane,200,1.54111E+12,8
10 | Blue October / Imogen Heap,Logged In,Kaylee,F,7,Summers,241.3971,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Congratulations,200,1.54111E+12,8
11 | Girl Talk,Logged In,Kaylee,F,8,Summers,160.15628,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1.54034E+12,139,Once again,200,1.54111E+12,8
12 | Black Eyed Peas,Logged In,Sylvie,F,0,Cruz,214.93506,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV",PUT,NextSong,1.54027E+12,9,Pump It,200,1.54111E+12,10
13 | ,Logged In,Ryan,M,0,Smith,,free,"San Jose-Sunnyvale-Santa Clara, CA",GET,Home,1.54102E+12,169,,200,1.54111E+12,26
14 | Fall Out Boy,Logged In,Ryan,M,1,Smith,200.72444,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Nobody Puts Baby In The Corner,200,1.54111E+12,26
15 | M.I.A.,Logged In,Ryan,M,2,Smith,233.7171,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1.54102E+12,169,Mango Pickle Down River (With The Wilcannia Mob),200,1.54111E+12,26
16 | Survivor,Logged In,Jayden,M,0,Fox,245.36771,free,"New Orleans-Metairie, LA",PUT,NextSong,1.54103E+12,100,Eye Of The Tiger,200,1.54111E+12,101
17 |
--------------------------------------------------------------------------------
/1_dend_data_modeling/P2_Cassandra_Data_Modeling_and_ETL/images/image_event_datafile_new.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/1_dend_data_modeling/P2_Cassandra_Data_Modeling_and_ETL/images/image_event_datafile_new.jpg
--------------------------------------------------------------------------------
/1_dend_data_modeling/notebooks/L1-D0-creating-a-table-with-postgres.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lesson 1 Demo 0: PostgreSQL and AutoCommits\n",
8 | "\n",
9 | "
"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Walk through the basics of PostgreSQL autocommits "
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "## import postgreSQL adapter for the Python\n",
26 | "import psycopg2"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### Create a connection to the database\n",
34 | "1. Connect to the local instance of PostgreSQL (*127.0.0.1*)\n",
35 | "2. Use the database/schema from the instance. \n",
36 | "3. The connection reaches out to the database (*studentdb*) and use the correct privilages to connect to the database (*user and password = student*)."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "### Use the connection to get a cursor that will be used to execute queries."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "cur = conn.cursor()"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### Create a database to work in"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "cur.execute(\"select * from test\")"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "### Error occurs, but it was to be expected because table has not been created as yet. To fix the error, create the table. "
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "cur.execute(\"CREATE TABLE test (col1 int, col2 int, col3 int);\")"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "### Error indicates we cannot execute this query. Since we have not committed the transaction and had an error in the transaction block, we are blocked until we restart the connection."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "conn = psycopg2.connect(\"host=127.0.0.1 dbname=studentdb user=student password=student\")\n",
110 | "cur = conn.cursor()"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "In our exercises instead of worrying about commiting each transaction or getting a strange error when we hit something unexpected, let's set autocommit to true. **This says after each call during the session commit that one action and do not hold open the transaction for any other actions. One action = one transaction.**"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "In this demo we will use automatic commit so each action is commited without having to call `conn.commit()` after each command. **The ability to rollback and commit transactions are a feature of Relational Databases.**"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "conn.set_session(autocommit=True)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "cur.execute(\"select * from test\")"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "cur.execute(\"CREATE TABLE test (col1 int, col2 int, col3 int);\")"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "### Once autocommit is set to true, we execute this code successfully. There were no issues with transaction blocks and we did not need to restart our connection. "
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "metadata": {},
165 | "outputs": [],
166 | "source": [
167 | "cur.execute(\"select * from test\")"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "cur.execute(\"select count(*) from test\")\n",
177 | "print(cur.fetchall())"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": []
186 | }
187 | ],
188 | "metadata": {
189 | "kernelspec": {
190 | "display_name": "Python 3",
191 | "language": "python",
192 | "name": "python3"
193 | },
194 | "language_info": {
195 | "codemirror_mode": {
196 | "name": "ipython",
197 | "version": 3
198 | },
199 | "file_extension": ".py",
200 | "mimetype": "text/x-python",
201 | "name": "python",
202 | "nbconvert_exporter": "python",
203 | "pygments_lexer": "ipython3",
204 | "version": "3.7.1"
205 | }
206 | },
207 | "nbformat": 4,
208 | "nbformat_minor": 2
209 | }
210 |
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/Data warehousing in the cloud.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/2_dend_cloud_data_warehouses/Data warehousing in the cloud.pdf
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "jira-plugin.workingProject": ""
3 | }
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/README.md:
--------------------------------------------------------------------------------
1 | # Project: Data Warehouse
2 |
3 | ## Project description
4 |
5 | Sparkify is a music streaming startup with a growing user base and song database.
6 |
7 | Their user activity and song metadata reside in JSON files in S3. The goal of this project is to build an ETL pipeline that extracts the data from S3, stages it in Redshift, and transforms it into a set of dimensional tables so their analytics team can continue finding insights into what songs their users are listening to.
8 |
9 | ## How to run
10 |
11 | 1. To run this project you will need to fill in the following information, and save it as *dwh.cfg* in the project root folder.
12 |
13 | ```
14 | [CLUSTER]
15 | HOST=''
16 | DB_NAME=''
17 | DB_USER=''
18 | DB_PASSWORD=''
19 | DB_PORT=5439
20 |
21 | [IAM_ROLE]
22 | ARN=
23 |
24 | [S3]
25 | LOG_DATA='s3://udacity-dend/log_data'
26 | LOG_JSONPATH='s3://udacity-dend/log_json_path.json'
27 | SONG_DATA='s3://udacity-dend/song_data'
28 |
29 | [AWS]
30 | KEY=
31 | SECRET=
32 |
33 | [DWH]
34 | DWH_CLUSTER_TYPE = multi-node
35 | DWH_NUM_NODES = 4
36 | DWH_NODE_TYPE = dc2.large
37 | DWH_CLUSTER_IDENTIFIER =
38 | DWH_DB =
39 | DWH_DB_USER =
40 | DWH_DB_PASSWORD =
41 | DWH_PORT = 5439
42 | DWH_IAM_ROLE_NAME =
43 | ```
44 |
45 | 2. Create a python environment with the dependencies listed on *requirements.txt*
46 | 3. Run the *create_cluster* script to set up the needed infrastructure for this project.
47 |
48 | `$ python create_cluster.py`
49 |
50 | 4. Run the *create_tables* script to set up the database staging and analytical tables
51 |
52 | `$ python create_tables.py`
53 |
54 | 5. Finally, run the *etl* script to extract data from the files in S3, stage it in Redshift, and store it in the dimensional tables.
55 |
56 | `$ python etl.py`
57 |
58 |
59 | ## Project structure
60 |
61 | The project workspace includes the following files:
62 |
63 | - analytics.py runs a few queries on the created star schema to validate that the project has been completed successfully.
64 | - create_cluster.py is where the AWS components for this project are created programmatically
65 | - create_tables.py is where fact and dimension tables for the star schema in Redshift are created.
66 | - etl.py is where data gets loaded from S3 into staging tables on Redshift and then processed into the analytics tables on Redshift.
67 | - sql_queries.py is where the SQL statements are defined, which are then used by etl.py, create_tables.py and analytics.py.
68 | - README.md is the current file.
69 | - requirements.txt with python dependencies needed to run the project
70 |
71 | ## Database schema design
72 | The database uses a star schema optimized for song play analysis: raw JSON data is first loaded from S3 into two staging tables, and is then transformed into one fact table (songplays) and four dimension tables (users, songs, artists, time).
73 |
74 | #### Staging Tables
75 | - staging_events
76 | - staging_songs
77 |
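The staging tables are loaded straight from S3 with Redshift `COPY` commands. The exact statements live in *sql_queries.py*; the following is a minimal sketch (not the project's literal code) of their shape, using the S3 paths and IAM role ARN from *dwh.cfg*, and assuming the staging columns mirror the raw JSON fields:

```
import configparser

# Sketch only -- the project's real COPY statements live in sql_queries.py.
# Note: the S3 values in dwh.cfg already include single quotes; the ARN does not.
config = configparser.ConfigParser()
config.read('dwh.cfg')

staging_events_copy = """
    COPY staging_events FROM {}
    IAM_ROLE '{}'
    REGION 'us-west-2'
    FORMAT AS JSON {};
""".format(config['S3']['LOG_DATA'], config['IAM_ROLE']['ARN'], config['S3']['LOG_JSONPATH'])

staging_songs_copy = """
    COPY staging_songs FROM {}
    IAM_ROLE '{}'
    REGION 'us-west-2'
    FORMAT AS JSON 'auto';
""".format(config['S3']['SONG_DATA'], config['IAM_ROLE']['ARN'])
```

Staging with `COPY` lets Redshift ingest the JSON files in parallel, which is much faster than inserting the raw data row by row.
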
78 | #### Fact Table
79 | - songplays - records in event data associated with song plays i.e. records with page NextSong -
80 | *songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent*
81 |
82 | #### Dimension Tables
83 | - users - users in the app -
84 | *user_id, first_name, last_name, gender, level*
85 | - songs - songs in music database -
86 | *song_id, title, artist_id, year, duration*
87 | - artists - artists in music database -
88 | *artist_id, name, location, lattitude, longitude*
89 | - time - timestamps of records in songplays broken down into specific units -
90 | *start_time, hour, day, week, month, year, weekday*
91 |
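Once the staging tables are populated, the fact table is filled with an `INSERT ... SELECT` that joins the two staging tables on song title, artist name and duration. A sketch of its shape (the production query lives in *sql_queries.py*; the staging column names below are assumed to mirror the raw event fields):

```
# Sketch only -- the production query lives in sql_queries.py; staging column names
# (song, artist, length, page, ts, userId, ...) are assumed to mirror the raw log fields.
songplay_table_insert = """
    INSERT INTO songplays (start_time, user_id, level, song_id, artist_id, session_id, location, user_agent)
    SELECT TIMESTAMP 'epoch' + e.ts / 1000 * INTERVAL '1 second' AS start_time,
           e.userId, e.level, s.song_id, s.artist_id, e.sessionId, e.location, e.userAgent
    FROM staging_events e
    JOIN staging_songs s
      ON e.song = s.title AND e.artist = s.artist_name AND e.length = s.duration
    WHERE e.page = 'NextSong';
"""
```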
92 |
93 | ## Queries and Results
94 |
95 | Number of rows in each table:
96 |
97 | | Table | rows |
98 | |--- | --: |
99 | | staging_events | 8056 |
100 | | staging_songs | 14896 |
101 | | artists | 10025 |
102 | | songplays | 333 |
103 | | songs | 14896 |
104 | | time | 8023 |
105 | | users | 105 |
106 |
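These counts come from *analytics.py*, which loops over `select_number_rows_queries` imported from *sql_queries.py*; that list is presumably just one `COUNT(*)` query per table, along the lines of:

```
# Sketch of the row-count queries analytics.py iterates over
# (the real list is defined in sql_queries.py).
select_number_rows_queries = [
    "SELECT COUNT(*) FROM staging_events",
    "SELECT COUNT(*) FROM staging_songs",
    "SELECT COUNT(*) FROM songplays",
    "SELECT COUNT(*) FROM users",
    "SELECT COUNT(*) FROM songs",
    "SELECT COUNT(*) FROM artists",
    "SELECT COUNT(*) FROM time",
]
```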
107 |
108 | ### Steps followed on this project
109 |
110 | 1. Create Table Schemas
111 | - Design schemas for your fact and dimension tables
112 | - Write a SQL CREATE statement for each of these tables in sql_queries.py
113 | - Complete the logic in create_tables.py to connect to the database and create these tables
114 | - Write SQL DROP statements to drop tables in the beginning of - create_tables.py if the tables already exist. This way, you can run create_tables.py whenever you want to reset your database and test your ETL pipeline.
115 | - Launch a redshift cluster and create an IAM role that has read access to S3.
116 | - Add redshift database and IAM role info to dwh.cfg.
117 | - Test by running create_tables.py and checking the table schemas in your redshift database. You can use Query Editor in the AWS Redshift console for this.
118 |
119 | 2. Build ETL Pipeline
120 | - Implement the logic in etl.py to load data from S3 to staging tables on Redshift.
121 | - Implement the logic in etl.py to load data from staging tables to analytics tables on Redshift.
122 | - Test by running etl.py after running create_tables.py and running the analytic queries on your Redshift database to compare your results with the expected results.
123 | - Delete your redshift cluster when finished.
124 |
125 | 3. Document Process
126 | Do the following steps in your README.md file.
127 |
128 | - Discuss the purpose of this database in context of the startup, Sparkify, and their analytical goals.
129 | - State and justify your database schema design and ETL pipeline.
130 | - [Optional] Provide example queries and results for song play analysis.
131 |
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/analytics.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | import psycopg2
3 | from sql_queries import select_number_rows_queries
4 |
5 |
6 | def get_results(cur, conn):
7 | """
8 | Get the number of rows stored into each table
9 | """
10 | for query in select_number_rows_queries:
11 | print('Running ' + query)
12 | cur.execute(query)
13 | results = cur.fetchone()
14 |
15 | for row in results:
16 | print(" ", row)
17 |
18 |
19 | def main():
20 | """
21 | Run queries on the staging and dimensional tables to validate that the project has been created successfully
22 | """
23 | config = configparser.ConfigParser()
24 | config.read('dwh.cfg')
25 |
26 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
27 | cur = conn.cursor()
28 |
29 | get_results(cur, conn)
30 |
31 | conn.close()
32 |
33 |
34 | if __name__ == "__main__":
35 | main()
36 |
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/create_cluster.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import boto3
3 | import json
4 | import psycopg2
5 |
6 | from botocore.exceptions import ClientError
7 | import configparser
8 |
9 |
10 | def create_iam_role(iam, DWH_IAM_ROLE_NAME):
11 | '''
12 | Creates IAM Role for Redshift, to allow it to use AWS services
13 | '''
14 |
15 | try:
16 | print("1.1 Creating a new IAM Role")
17 | dwhRole = iam.create_role(
18 | Path='/',
19 | RoleName=DWH_IAM_ROLE_NAME,
20 | Description = "Allows Redshift clusters to call AWS services on your behalf.",
21 | AssumeRolePolicyDocument=json.dumps(
22 | {'Statement': [{'Action': 'sts:AssumeRole',
23 | 'Effect': 'Allow',
24 | 'Principal': {'Service': 'redshift.amazonaws.com'}}],
25 | 'Version': '2012-10-17'})
26 | )
27 | except Exception as e:
28 | print(e)
29 |
30 |
31 | print("1.2 Attaching Policy")
32 |
33 | iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
34 | PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
35 | )['ResponseMetadata']['HTTPStatusCode']
36 |
37 | print("1.3 Get the IAM role ARN")
38 | roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']
39 |
40 | print(roleArn)
41 | return roleArn
42 |
43 |
44 | def create_cluster(redshift, roleArn, DWH_CLUSTER_TYPE, DWH_NODE_TYPE, DWH_NUM_NODES, DWH_DB, DWH_CLUSTER_IDENTIFIER, DWH_DB_USER, DWH_DB_PASSWORD):
45 | '''
46 | Creates Redshift cluster
47 | '''
48 |
49 | try:
50 | response = redshift.create_cluster(
51 | #HW
52 | ClusterType=DWH_CLUSTER_TYPE,
53 | NodeType=DWH_NODE_TYPE,
54 | NumberOfNodes=int(DWH_NUM_NODES),
55 |
56 | #Identifiers & Credentials
57 | DBName=DWH_DB,
58 | ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
59 | MasterUsername=DWH_DB_USER,
60 | MasterUserPassword=DWH_DB_PASSWORD,
61 |
62 | #Roles (for s3 access)
63 | IamRoles=[roleArn]
64 | )
65 | except Exception as e:
66 | print(e)
67 |
68 |
69 | def get_cluster_props(redshift, DWH_CLUSTER_IDENTIFIER):
70 | '''
71 | Retrieve Redshift clusters properties
72 | '''
73 |
74 | def prettyRedshiftProps(props):
75 | pd.set_option('display.max_colwidth', -1)
76 | keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
77 | x = [(k, v) for k,v in props.items() if k in keysToShow]
78 | return pd.DataFrame(data=x, columns=["Key", "Value"])
79 |
80 | myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
81 | prettyRedshiftProps(myClusterProps)
82 |
83 | DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
84 | DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
85 | print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
86 | print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)
87 | return myClusterProps, DWH_ENDPOINT, DWH_ROLE_ARN
88 |
89 |
90 | def open_ports(ec2, myClusterProps, DWH_PORT):
91 | '''
92 | Update clusters security group to allow access through redshift port
93 | '''
94 |
95 | try:
96 | vpc = ec2.Vpc(id=myClusterProps['VpcId'])
97 | defaultSg = list(vpc.security_groups.all())[0]
98 | print(defaultSg)
99 | defaultSg.authorize_ingress(
100 | GroupName=defaultSg.group_name,
101 | CidrIp='0.0.0.0/0',
102 | IpProtocol='TCP',
103 | FromPort=int(DWH_PORT),
104 | ToPort=int(DWH_PORT)
105 | )
106 | except Exception as e:
107 | print(e)
108 |
109 |
110 | def main():
111 |
112 | config = configparser.ConfigParser()
113 | config.read_file(open('dwh.cfg'))
114 |
115 | KEY = config.get('AWS','KEY')
116 | SECRET = config.get('AWS','SECRET')
117 |
118 | DWH_CLUSTER_TYPE = config.get("DWH","DWH_CLUSTER_TYPE")
119 | DWH_NUM_NODES = config.get("DWH","DWH_NUM_NODES")
120 | DWH_NODE_TYPE = config.get("DWH","DWH_NODE_TYPE")
121 |
122 | DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
123 | DWH_DB = config.get("DWH","DWH_DB")
124 | DWH_DB_USER = config.get("DWH","DWH_DB_USER")
125 | DWH_DB_PASSWORD = config.get("DWH","DWH_DB_PASSWORD")
126 | DWH_PORT = config.get("DWH","DWH_PORT")
127 |
128 | DWH_IAM_ROLE_NAME = config.get("DWH", "DWH_IAM_ROLE_NAME")
129 |
130 | (DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)
131 |
132 | df = pd.DataFrame({"Param":
133 | ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
134 | "Value":
135 | [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
136 | })
137 |
138 | print(df)
139 |
140 |
141 | ec2 = boto3.resource('ec2',
142 | region_name="us-west-2",
143 | aws_access_key_id=KEY,
144 | aws_secret_access_key=SECRET
145 | )
146 |
147 | s3 = boto3.resource('s3',
148 | region_name="us-west-2",
149 | aws_access_key_id=KEY,
150 | aws_secret_access_key=SECRET
151 | )
152 |
153 | iam = boto3.client('iam',aws_access_key_id=KEY,
154 | aws_secret_access_key=SECRET,
155 | region_name='us-west-2'
156 | )
157 |
158 | redshift = boto3.client('redshift',
159 | region_name="us-west-2",
160 | aws_access_key_id=KEY,
161 | aws_secret_access_key=SECRET
162 | )
163 |
164 | roleArn = create_iam_role(iam, DWH_IAM_ROLE_NAME)
165 |
166 | create_cluster(redshift, roleArn, DWH_CLUSTER_TYPE, DWH_NODE_TYPE, DWH_NUM_NODES, DWH_DB, DWH_CLUSTER_IDENTIFIER, DWH_DB_USER, DWH_DB_PASSWORD)
167 |
168 | myClusterProps, DWH_ENDPOINT, DWH_ROLE_ARN = get_cluster_props(redshift, DWH_CLUSTER_IDENTIFIER)
169 |
170 | open_ports(ec2, myClusterProps, DWH_PORT)
171 |
172 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
173 | cur = conn.cursor()
174 |
175 | print('Connected')
176 |
177 | conn.close()
178 |
179 |
180 | if __name__ == "__main__":
181 | main()
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/create_tables.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | import psycopg2
3 | from sql_queries import create_table_queries, drop_table_queries
4 |
5 |
6 | def drop_tables(cur, conn):
7 | """
8 | Delete pre-existing tables to be able to create them from scratch
9 | """
10 | print('Dropping tables')
11 | for query in drop_table_queries:
12 | cur.execute(query)
13 | conn.commit()
14 |
15 |
16 | def create_tables(cur, conn):
17 | """
18 | Create staging and dimensional tables declared on sql_queries script
19 | """
20 | for query in create_table_queries:
21 | print('Running ' + query + ' ')
22 | cur.execute(query)
23 | conn.commit()
24 |
25 |
26 | def main():
27 | """
28 | Set up the database tables, creating the needed tables with the appropriate columns and constraints
29 | """
30 | config = configparser.ConfigParser()
31 | config.read('dwh.cfg')
32 |
33 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
34 | cur = conn.cursor()
35 |
36 | print('Connected to the cluster')
37 |
38 | drop_tables(cur, conn)
39 | create_tables(cur, conn)
40 |
41 | conn.close()
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/etl.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | import psycopg2
3 | from sql_queries import copy_table_queries, insert_table_queries
4 |
5 |
6 | def load_staging_tables(cur, conn):
7 | """
8 | Load data from files stored in S3 to the staging tables using the queries declared on the sql_queries script
9 | """
10 | print('Inserting data from json files stored in S3 buckets into staging tables')
11 | for query in copy_table_queries:
12 | print('Running ' + query)
13 | cur.execute(query)
14 | conn.commit()
15 |
16 |
17 | def insert_tables(cur, conn):
18 | """
19 | Select and Transform data from staging tables into the dimensional tables using the queries declared on the sql_queries script
20 | """
21 | print('Inserting data from staging tables into analytics tables')
22 | for query in insert_table_queries:
23 | print('Running ' + query)
24 | cur.execute(query)
25 | conn.commit()
26 |
27 |
28 | def main():
29 | """
30 | Extract songs metadata and user activity data from S3, transform it using a staging table, and load it into dimensional tables for analysis
31 | """
32 | config = configparser.ConfigParser()
33 | config.read('dwh.cfg')
34 |
35 | conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(*config['CLUSTER'].values()))
36 | cur = conn.cursor()
37 |
38 | load_staging_tables(cur, conn)
39 | insert_tables(cur, conn)
40 |
41 | conn.close()
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/P3_Data_Warehouse_Project/requirements.txt:
--------------------------------------------------------------------------------
1 | awscli==1.16.140
2 | boto3==1.9.164
3 | botocore==1.12.164
4 | pandas==0.23.4
5 | psycopg2==2.7.7
6 | psycopg2-binary==2.8.2
7 |
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/infrastructure_as_code.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import boto3
3 | import json
4 |
5 | from botocore.exceptions import ClientError
6 | import configparser
7 |
8 |
9 | config = configparser.ConfigParser()
10 | config.read_file(open('/home/f.silvestre/Documents/Projects/Data-engineering-nanodegree/2_dend_cloud_data_warehouses/dhw.cfg'))
11 |
12 | KEY = config.get('AWS','KEY')
13 | SECRET = config.get('AWS','SECRET')
14 |
15 | DWH_CLUSTER_TYPE = config.get("DWH","DWH_CLUSTER_TYPE")
16 | DWH_NUM_NODES = config.get("DWH","DWH_NUM_NODES")
17 | DWH_NODE_TYPE = config.get("DWH","DWH_NODE_TYPE")
18 |
19 | DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
20 | DWH_DB = config.get("DWH","DWH_DB")
21 | DWH_DB_USER = config.get("DWH","DWH_DB_USER")
22 | DWH_DB_PASSWORD = config.get("DWH","DWH_DB_PASSWORD")
23 | DWH_PORT = config.get("DWH","DWH_PORT")
24 |
25 | DWH_IAM_ROLE_NAME = config.get("DWH", "DWH_IAM_ROLE_NAME")
26 |
27 | (DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)
28 |
29 | df = pd.DataFrame({"Param":
30 | ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
31 | "Value":
32 | [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
33 | })
34 |
35 | print(df)
36 |
37 | # Create clients
38 | ec2 = boto3.resource('ec2',
39 | region_name='us-west-2',
40 | aws_access_key_id=KEY,
41 | aws_secret_access_key=SECRET)
42 |
43 | s3 = boto3.resource('s3',
44 | region_name='us-west-2',
45 | aws_access_key_id=KEY,
46 | aws_secret_access_key=SECRET)
47 |
48 | iam = boto3.client('iam',
49 | region_name='us-west-2',
50 | aws_access_key_id=KEY,
51 | aws_secret_access_key=SECRET)
52 |
53 | redshift = boto3.client('redshift',
54 | region_name='us-west-2',
55 | aws_access_key_id=KEY,
56 | aws_secret_access_key=SECRET)
57 |
58 |
59 | # Connect to S3
60 | sampleDbBucket = s3.Bucket("awssampledbuswest2")
61 |
62 |
63 | try:
64 | print("Creating IAM Role")
65 | dwhRole=iam.create_role(
66 | Path='/',
67 | RoleName=DWH_IAM_ROLE_NAME,
68 | Description="Allows Redshift clusters to call AWS services on your behalf",
69 | AssumeRolePolicyDocument=json.dumps(
70 | {'Statement': [{'Action':'sts:AssumeRole',
71 | 'Effect':'Allow',
72 | 'Principal':{'Service': 'redshift.amazonaws.com'}}],
73 | 'Version':'2012-10-17'}
74 | )
75 | )
76 | except Exception as e:
77 | print(e)
78 |
79 | print("Attaching policy")
80 |
81 | iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
82 | PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
83 | )['ResponseMetadata']['HTTPStatusCode']
84 |
85 | print("Get IAM Role")
86 | roleArn = iam.get_role(RoleName=DWH_IAM_ROLE_NAME)['Role']['Arn']
87 |
88 | print(roleArn)
89 |
90 | # Create Redshift cluster
91 | try:
92 | response = redshift.create_cluster(
93 | ClusterType=DWH_CLUSTER_TYPE,
94 | NodeType=DWH_NODE_TYPE,
95 | NumberOfNodes=int(DWH_NUM_NODES),
96 | DBName=DWH_DB,
97 | ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,
98 | MasterUsername=DWH_DB_USER,
99 | MasterUserPassword=DWH_DB_PASSWORD,
100 |
101 | #Roles (for s3 access)
102 | IamRoles=[roleArn]
103 | )
104 |
105 | except Exception as e:
106 | print(e)
107 |
108 |
109 | # Describe cluster and status
110 | def prettyRedshiftProps(props):
111 | pd.set_option('display.max_colwidth', -1)
112 | keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
113 | x = [(k, v) for k,v in props.items() if k in keysToShow]
114 | return pd.DataFrame(data=x, columns=["Key", "Value"])
115 |
116 | myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
117 | prettyRedshiftProps(myClusterProps)
118 |
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/notebooks/Data/README:
--------------------------------------------------------------------------------
1 | Pagila
2 | ======
3 |
4 | Pagila is a port of the Sakila example database available for MySQL, which was
5 | originally developed by Mike Hillyer of the MySQL AB documentation team. It
6 | is intended to provide a standard schema that can be used for examples in
7 | books, tutorials, articles, samples, etc.
8 |
9 | All the tables, data, views, and functions have been ported; some of the changes made were:
10 |
11 | * Changed char(1) true/false fields to true boolean fields
12 | * The last_update columns were set with triggers to update them
13 | * Added foreign keys
14 | * Removed 'DEFAULT 0' on foreign keys since it's pointless with real FK's
15 | * Used PostgreSQL built-in fulltext searching for fulltext index. Removed the need for the
16 | film_text table.
17 | * The rewards_report function was ported to a simple SRF
18 |
19 | The schema and data for the Sakila database were made available under the BSD license
20 | which can be found at http://www.opensource.org/licenses/bsd-license.php. The pagila
21 | database is made available under this license as well.
22 |
23 |
24 | FULLTEXT SEARCH
25 | ---------------
26 |
27 | In older versions of pagila, the fulltext search capabilities were split into a
28 | separate file, so they could be loaded only into databases that support fulltext.
29 | Starting in PostgreSQL 8.3, fulltext functionality is built in, so now these
30 | parts of the schema exist in the main schema file.
31 |
32 | Example usage:
33 |
34 | SELECT * FROM film WHERE fulltext @@ to_tsquery('fate&india');
35 |
36 |
37 | PARTITIONED TABLES
38 | ------------------
39 |
40 | The payment table is designed as a partitioned table with a 6 month timespan for the date ranges.
41 | If you want to take full advantage of table partitioning, you need to make sure constraint_exclusion
42 | is turned on in your database. You can do this by setting "constraint_exclusion = on" in your
43 | postgresql.conf, or by issuing the command "ALTER DATABASE pagila SET constraint_exclusion = on"
44 | (substitute pagila for your database name if installing into a database with a different name)
45 |
46 |
47 | INSTALL NOTE
48 | ------------
49 |
50 | The pagila-data.sql file and the pagila-insert-data.sql both contain the same
51 | data, the former using COPY commands, the latter using INSERT commands, so you
52 | only need to install one of them. Both formats are provided for those who have
53 | trouble using one version or another.
54 |
55 |
56 | ARTICLES
57 | --------------
58 |
59 | The following articles make use of pagila to showcase various PostgreSQL features:
60 |
61 | * Showcasing REST in PostgreSQL - The PreQuel
62 | http://www.postgresonline.com/journal/index.php?/archives/32-Showcasing-REST-in-PostgreSQL-The-PreQuel.html#extended
63 |
64 | * PostgreSQL 8.3 Features: Enum Datatype
65 | http://people.planetpostgresql.org/xzilla/index.php?/archives/320-PostgreSQL-8.3-Features-Enum-Datatype.html
66 |
67 | * Email Validation with pl/PHP
68 | http://people.planetpostgresql.org/xzilla/index.php?/archives/261-Re-inventing-Gregs-method-to-prevent-re-inventing.html
69 |
70 | * Getting Started with PostgreSQL for Windows
71 | http://www.charltonlopez.com/index.php?option=com_content&task=view&id=56&Itemid=38
72 |
73 | * RATIO_TO_REPORT in PostgreSQL
74 | http://people.planetpostgresql.org/xzilla/index.php?/search/pagila/P3.html
75 |
76 | * The postmaster and postgres Processes
77 | http://www.charltonlopez.com/index.php?option=com_content&task=view&id=57&Itemid=38
78 |
79 | * Building Rails to Legacy Applications :: Take Control of Active Record
80 | http://people.planetpostgresql.org/xzilla/index.php?/archives/220-Building-Rails-to-Legacy-Applications-Take-Control-of-Active-Record.html
81 |
82 | * Building Rails to Legacy Applications :: Masking the Database
83 | http://people.planetpostgresql.org/xzilla/index.php?/archives/213-Building-Rails-to-Legacy-Applications-Masking-the-Database.html
84 |
85 |
86 | VERSION HISTORY
87 | ---------------
88 |
89 | Version 0.10.1
90 | * Add pagila-data-insert.sql file, added articles section
91 |
92 | Version 0.10
93 | * Support for built-in fulltext. Add enum example
94 |
95 | Version 0.9
96 | * Add table partitioning example
97 |
98 | Version 0.8
99 | * First release of pagila
100 |
101 |
102 |
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/notebooks/L3 Exercise 3 - Parallel ETL - Solution.py:
--------------------------------------------------------------------------------
1 | #%% Change working directory from the workspace root to the ipynb file location. Turn this addition off with the DataScience.changeDirOnImportExport setting
2 | # ms-python.python added
3 | import os
4 | try:
5 | os.chdir(os.path.join(os.getcwd(), '2_dend_cloud_data_warehouses/notebooks'))
6 | print(os.getcwd())
7 | except:
8 | pass
9 | #%% [markdown]
10 | # # Exercise 3: Parallel ETL
11 |
12 | #%%
13 | get_ipython().run_line_magic('load_ext', 'sql')
14 |
15 |
16 | #%%
17 | from time import time
18 | import configparser
19 | import matplotlib.pyplot as plt
20 | import pandas as pd
21 |
22 | #%% [markdown]
23 | # # STEP 1: Get the params of the created redshift cluster
24 | # - We need:
25 | # - The redshift cluster endpoint
26 | # - The IAM role ARN that give access to Redshift to read from S3
27 |
28 | #%%
29 | config = configparser.ConfigParser()
30 | config.read_file(open('dwh.cfg'))
31 | KEY=config.get('AWS','key')
32 | SECRET= config.get('AWS','secret')
33 |
34 | DWH_DB= config.get("DWH","DWH_DB")
35 | DWH_DB_USER= config.get("DWH","DWH_DB_USER")
36 | DWH_DB_PASSWORD= config.get("DWH","DWH_DB_PASSWORD")
37 | DWH_PORT = config.get("DWH","DWH_PORT")
38 |
39 |
40 | #%%
41 | # FILL IN THE REDSHIFT ENDPOINT HERE
42 | # e.g. DWH_ENDPOINT="redshift-cluster-1.csmamz5zxmle.us-west-2.redshift.amazonaws.com"
43 | DWH_ENDPOINT=""
44 |
45 | #FILL IN THE IAM ROLE ARN you got in step 2.2 of the previous exercise
46 | #e.g DWH_ROLE_ARN="arn:aws:iam::988332130976:role/dwhRole"
47 | DWH_ROLE_ARN=""
48 |
49 | #%% [markdown]
50 | # # STEP 2: Connect to the Redshift Cluster
51 |
52 | #%%
53 | conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
54 | print(conn_string)
55 | get_ipython().run_line_magic('sql', '$conn_string')
56 |
57 |
58 | #%%
59 | import boto3
60 |
61 | s3 = boto3.resource('s3',
62 | region_name="us-west-2",
63 | aws_access_key_id=KEY,
64 | aws_secret_access_key=SECRET
65 | )
66 |
67 | sampleDbBucket = s3.Bucket("udacity-labs")
68 |
69 | for obj in sampleDbBucket.objects.filter(Prefix="tickets"):
70 | print(obj)
71 |
72 | #%% [markdown]
73 | # # STEP 3: Create Tables
74 |
75 | #%%
76 | get_ipython().run_cell_magic('sql', '', 'DROP TABLE IF EXISTS "sporting_event_ticket";\nCREATE TABLE "sporting_event_ticket" (\n "id" double precision DEFAULT nextval(\'sporting_event_ticket_seq\') NOT NULL,\n "sporting_event_id" double precision NOT NULL,\n "sport_location_id" double precision NOT NULL,\n "seat_level" numeric(1,0) NOT NULL,\n "seat_section" character varying(15) NOT NULL,\n "seat_row" character varying(10) NOT NULL,\n "seat" character varying(10) NOT NULL,\n "ticketholder_id" double precision,\n "ticket_price" numeric(8,2) NOT NULL\n);')
77 |
78 | #%% [markdown]
79 | # # STEP 4: Load Partitioned data into the cluster
80 |
81 | #%%
82 | get_ipython().run_cell_magic('time', '', 'qry = """\n copy sporting_event_ticket from \'s3://udacity-labs/tickets/split/part\'\n credentials \'aws_iam_role={}\'\n gzip delimiter \';\' compupdate off region \'us-west-2\';\n""".format(DWH_ROLE_ARN)\n\n%sql $qry')
83 |
84 | #%% [markdown]
85 | # # STEP 4: Create Tables for the non-partitioned data
86 |
87 | #%%
88 | get_ipython().run_cell_magic('sql', '', 'DROP TABLE IF EXISTS "sporting_event_ticket_full";\nCREATE TABLE "sporting_event_ticket_full" (\n "id" double precision DEFAULT nextval(\'sporting_event_ticket_seq\') NOT NULL,\n "sporting_event_id" double precision NOT NULL,\n "sport_location_id" double precision NOT NULL,\n "seat_level" numeric(1,0) NOT NULL,\n "seat_section" character varying(15) NOT NULL,\n "seat_row" character varying(10) NOT NULL,\n "seat" character varying(10) NOT NULL,\n "ticketholder_id" double precision,\n "ticket_price" numeric(8,2) NOT NULL\n);')
89 |
90 | #%% [markdown]
91 | # # STEP 5: Load non-partitioned data into the cluster
92 | # - Note how it's slower than loading partitioned data
93 |
94 | #%%
95 | get_ipython().run_cell_magic('time', '', '\nqry = """\n copy sporting_event_ticket_full from \'s3://udacity-labs/tickets/full/full.csv.gz\' \n credentials \'aws_iam_role={}\' \n gzip delimiter \';\' compupdate off region \'us-west-2\';\n""".format(DWH_ROLE_ARN)\n\n%sql $qry')
96 |
97 |
98 | #%%
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/notebooks/pagila-star.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/2_dend_cloud_data_warehouses/notebooks/pagila-star.png
--------------------------------------------------------------------------------
/2_dend_cloud_data_warehouses/notes/AWS.md:
--------------------------------------------------------------------------------
1 | ## What Is Cloud Computing?
2 | *Cloud computing: the practice of using a network of remote servers hosted on the Internet to store, manage, and process data, rather than a local server or a personal computer.*
3 |
4 | The arrival of cloud computing completely changed the way we deploy our technology, providing powerful access to instant and scalable computing power to enterprises, startups, and developers alike. Whether you need servers to host a web application, reliable storage for your data, or machines to train machine learning models, it's easy to see the advantage of relying on the cloud rather than utilizing your personal computer or local servers.
5 |
6 | For one, you no longer have to invest in lots of hardware upfront. No need to worry about whether you are paying for more than you'll need or what to do if you need to scale a lot more later on. Cloud computing makes this as easy as clicking a few buttons to scale your resources up or down.
7 |
8 | Provisioning the resources you need through the cloud is significantly faster than the time it would take to gather and build up the hardware needed to provide the same support. This allows you and your team, or company, to develop and experiment at a much faster rate.
9 |
10 | Lastly, you can provide efficient access to your applications around the world by spreading your deployments to multiple regions.
11 |
12 | ## Amazon Web Services
13 | Amazon Web Services is one of the largest providers in the cloud computing industry, with over 140 services in compute, storage, databases, networking, developer tools, security, and more.
14 | Services provided in AWS can be accessed in three different ways: the AWS Management Console, the Command Line Interface, or Software Development Kits, which can be used in combination.
15 |
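As a small illustration of the SDK route, the project scripts in this repo use `boto3` to create clients for the services they need. A minimal sketch (the region and credentials below are placeholders; the real values come from dwh.cfg):

```
import boto3

# Placeholder region/credentials -- the project scripts read the real values from dwh.cfg.
redshift = boto3.client(
    'redshift',
    region_name='us-west-2',
    aws_access_key_id='YOUR_KEY',
    aws_secret_access_key='YOUR_SECRET',
)

# List any existing clusters and their status.
for cluster in redshift.describe_clusters().get('Clusters', []):
    print(cluster['ClusterIdentifier'], cluster['ClusterStatus'])
```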
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/Data Lakes with Spark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/3_dend_spark_data_lakes/Data Lakes with Spark.pdf
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/P4_Data_Lake/README.md:
--------------------------------------------------------------------------------
1 | # Project: Data Lake
2 |
3 | ## Introduction
4 |
5 | *A music streaming startup, Sparkify, has grown their user base and song database even more and want to move their data warehouse to a data lake. Their data resides in S3, in a directory of JSON logs on user activity on the app, as well as a directory with JSON metadata on the songs in their app.*
6 |
7 | In this project we will build an ETL pipeline that extracts the data from the data lake hosted on S3, processes it using Spark deployed on an AWS EMR cluster, and loads the data back into S3 as a set of dimensional tables in parquet format.
8 |
9 | From these tables we will be able to find insights into what songs their users are listening to.
10 |
11 | ## How to run
12 |
13 | *To run this project in local mode*, create a file `dl.cfg` in the root of this project with the following data:
14 |
15 | ```
16 | KEY=YOUR_AWS_ACCESS_KEY
17 | SECRET=YOUR_AWS_SECRET_KEY
18 | ```
19 |
20 | Create an S3 Bucket named `sparkify-dend` where output results will be stored.
21 |
22 | Finally, run the following command:
23 |
24 | `python etl.py`
25 |
26 | *To run this on a Jupyter Notebook powered by an EMR cluster*, import the notebook found in this project.
27 |
28 | ## Project structure
29 |
30 | The files found at this project are the following:
31 |
32 | - dl.cfg: *not uploaded to github - you need to create this file yourself* File with AWS credentials.
33 | - etl.py: Program that extracts songs and log data from S3, transforms it using Spark, and loads the dimensional tables created in parquet format back to S3.
34 | - README.md: Current file, contains detailed information about the project.
35 |
36 | ## ETL pipeline
37 |
38 | 1. Load credentials
39 | 2. Read data from S3
40 | - Song data: `s3://udacity-dend/song_data`
41 | - Log data: `s3://udacity-dend/log_data`
42 |
43 | The script reads song_data and log_data from S3.
44 |
45 | 3. Process data using spark
46 |
47 | Transforms them to create five different tables listed under `Dimension Tables and Fact Table`.
48 | Each table includes the right columns and data types. Duplicates are addressed where appropriate.
49 |
50 | 4. Load it back to S3
51 |
52 | Writes them to partitioned parquet files in table directories on S3.
53 |
54 | Each of the five tables is written to parquet files in a separate analytics directory on S3. Each table has its own folder within the directory. Songs table files are partitioned by year and then artist. Time table files are partitioned by year and month. Songplays table files are partitioned by year and month.
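
A minimal, self-contained sketch of the partitioned write pattern used in *etl.py* (writing a toy DataFrame to a local path here instead of the S3 bucket):

```
from pyspark.sql import SparkSession

# Illustration only: etl.py writes the real tables to s3a://sparkify-dend/ instead of a local path.
spark = SparkSession.builder.appName("partition-demo").getOrCreate()

# Tiny stand-in for the songs table (columns as described under "Dimension Tables" below).
songs_table = spark.createDataFrame(
    [("SOUPIRU12A6D4FA1E1", "Der Kleine Dompfaff", "ARJIE2Y1187B994AB7", 0, 152.92036)],
    ["song_id", "title", "artist_id", "year", "duration"],
)

# Partition columns become nested folders, e.g. output/songs/year=0/artist_id=ARJIE2Y1187B994AB7/
songs_table.write.mode("overwrite").partitionBy("year", "artist_id").parquet("output/songs/")
```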
55 |
56 | ### Source Data
57 | - **Song datasets**: all json files are nested in subdirectories under *s3a://udacity-dend/song_data*. A sample of these files is:
58 |
59 | ```
60 | {"num_songs": 1, "artist_id": "ARJIE2Y1187B994AB7", "artist_latitude": null, "artist_longitude": null, "artist_location": "", "artist_name": "Line Renaud", "song_id": "SOUPIRU12A6D4FA1E1", "title": "Der Kleine Dompfaff", "duration": 152.92036, "year": 0}
61 | ```
62 |
63 | - **Log datasets**: all json files are nested in subdirectories under *s3a://udacity-dend/log_data*. A sample of a single row of one of these files is:
64 |
65 | ```
66 | {"artist":"Slipknot","auth":"Logged In","firstName":"Aiden","gender":"M","itemInSession":0,"lastName":"Ramirez","length":192.57424,"level":"paid","location":"New York-Newark-Jersey City, NY-NJ-PA","method":"PUT","page":"NextSong","registration":1540283578796.0,"sessionId":19,"song":"Opium Of The People (Album Version)","status":200,"ts":1541639510796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"20"}
67 | ```
68 |
69 | ### Dimension Tables and Fact Table
70 |
71 | **songplays** - Fact table - records in log data associated with song plays i.e. records with page NextSong
72 | - songplay_id (INT) PRIMARY KEY: ID of each user song play
73 | - start_time (DATE) NOT NULL: Timestamp of beginning of user activity
74 | - user_id (INT) NOT NULL: ID of user
75 | - level (TEXT): User level {free | paid}
76 | - song_id (TEXT) NOT NULL: ID of Song played
77 | - artist_id (TEXT) NOT NULL: ID of Artist of the song played
78 | - session_id (INT): ID of the user Session
79 | - location (TEXT): User location
80 | - user_agent (TEXT): Agent used by user to access Sparkify platform
81 |
82 | **users** - users in the app
83 | - user_id (INT) PRIMARY KEY: ID of user
84 | - first_name (TEXT) NOT NULL: Name of user
85 | - last_name (TEXT) NOT NULL: Last Name of user
86 | - gender (TEXT): Gender of user {M | F}
87 | - level (TEXT): User level {free | paid}
88 |
89 | **songs** - songs in music database
90 | - song_id (TEXT) PRIMARY KEY: ID of Song
91 | - title (TEXT) NOT NULL: Title of Song
92 | - artist_id (TEXT) NOT NULL: ID of song Artist
93 | - year (INT): Year of song release
94 | - duration (FLOAT) NOT NULL: Song duration in seconds
95 |
96 | **artists** - artists in music database
97 | - artist_id (TEXT) PRIMARY KEY: ID of Artist
98 | - name (TEXT) NOT NULL: Name of Artist
99 | - location (TEXT): Name of Artist city
100 | - lattitude (FLOAT): Lattitude location of artist
101 | - longitude (FLOAT): Longitude location of artist
102 |
103 | **time** - timestamps of records in songplays broken down into specific units
104 | - start_time (DATE) PRIMARY KEY: Timestamp of row
105 | - hour (INT): Hour associated to start_time
106 | - day (INT): Day associated to start_time
107 | - week (INT): Week of year associated to start_time
108 | - month (INT): Month associated to start_time
109 | - year (INT): Year associated to start_time
110 | - weekday (TEXT): Name of week day associated to start_time
111 |
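All of the `time` columns are derived from the raw `ts` epoch-millisecond field in the log data. A minimal, self-contained sketch of that derivation (mirroring what etl.py does, with the `ts` value taken from the log sample above):

```
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, hour, dayofmonth, weekofyear, month, year, date_format
from pyspark.sql.types import TimestampType

# Sketch of how etl.py turns the epoch-millisecond `ts` field into the time table columns.
spark = SparkSession.builder.appName("time-table-demo").getOrCreate()
df = spark.createDataFrame([(1541639510796,)], ["ts"])

get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0), TimestampType())

time_table = (df.withColumn("start_time", get_datetime("ts"))
                .select("start_time")
                .withColumn("hour", hour(col("start_time")))
                .withColumn("day", dayofmonth(col("start_time")))
                .withColumn("week", weekofyear(col("start_time")))
                .withColumn("month", month(col("start_time")))
                .withColumn("year", year(col("start_time")))
                .withColumn("weekday", date_format(col("start_time"), "E")))

time_table.show(truncate=False)
```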
112 |
113 | ## Authors
114 |
115 | * **Florencia Silvestre** - [Github](https://github.com/Flor91) - [LinkedIn](https://www.linkedin.com/in/florencia-silvestre-2683587b/)
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/P4_Data_Lake/etl.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | from datetime import datetime
3 | import os
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql.functions import udf, col
6 | from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
7 | from pyspark.sql.functions import monotonically_increasing_id
8 | from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Dat, TimestampType
9 |
10 |
11 | config = configparser.ConfigParser()
12 | # dl.cfg is a flat KEY/SECRET file (see README), so read it under a synthetic [AWS] section
13 | config.read_string('[AWS]\n' + open('dl.cfg').read())
14 | os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['KEY']
15 | os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['SECRET']
16 |
17 |
18 | def create_spark_session():
19 | """
20 | Create or retrieve a Spark Session
21 | """
22 | spark = SparkSession \
23 | .builder \
24 | .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
25 | .getOrCreate()
26 | return spark
27 |
28 |
29 | def process_song_data(spark, input_data, output_data):
30 | """
31 | Description: This function loads song_data from S3 and processes it by extracting the songs and artist tables
32 | which are then loaded back to S3
33 |
34 | Parameters:
35 | spark : Spark Session
36 | input_data : location of song_data json files with the songs metadata
37 | output_data : S3 bucket where the dimensional tables will be stored in parquet format
38 | """
39 |
40 | song_data = input_data + 'song_data/*/*/*/*.json'
41 |
42 | songSchema = R([
43 | Fld("artist_id",Str()),
44 | Fld("artist_latitude",Dbl()),
45 | Fld("artist_location",Str()),
46 | Fld("artist_longitude",Dbl()),
47 | Fld("artist_name",Str()),
48 | Fld("duration",Dbl()),
49 | Fld("num_songs",Int()),
50 | Fld("title",Str()),
51 | Fld("year",Int()),
52 | ])
53 |
54 | df = spark.read.json(song_data, schema=songSchema)
55 |
56 | song_fields = ["title", "artist_id","year", "duration"]
57 |
58 | songs_table = df.select(song_fields).dropDuplicates().withColumn("song_id", monotonically_increasing_id())
59 |
60 | songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')
61 |
62 | artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude"]
63 |
64 | artists_table = df.selectExpr(artists_fields).dropDuplicates()
65 |
66 | artists_table.write.parquet(output_data + 'artists/')
67 |
68 |
69 | def process_log_data(spark, input_data, output_data):
70 | """
71 | Description: This function loads log_data from S3 and processes it by extracting the users, time and songplays tables,
72 | which are then loaded back to S3. The songs and artists parquet output of process_song_data is read back in to build songplays.
73 |
74 | Parameters:
75 | spark : Spark Session
76 | input_data : location of log_data json files with the events data
77 | output_data : S3 bucket where the dimensional tables will be stored in parquet format
78 |
79 | """
80 |
81 | log_data = input_data + 'log_data/*/*/*.json'
82 |
83 | df = spark.read.json(log_data)
84 |
85 | df = df.filter(df.page == 'NextSong')
86 |
87 | users_fields = ["userdId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level"]
88 | users_table = df.selectExpr(users_fields).dropDuplicates()
89 |
90 | users_table.write.parquet(output_data + 'users/')
91 |
92 | get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0), TimestampType())
93 | df = df.withColumn("start_time", get_datetime('ts'))
94 |
95 | time_table = df.select("start_time").dropDuplicates() \
96 | .withColumn("hour", hour(col("start_time"))).withColumn("day", dayofmonth(col("start_time"))) \
97 | .withColumn("week", weekofyear(col("start_time"))).withColumn("month", month(col("start_time"))) \
98 | .withColumn("year", year(col("start_time"))).withColumn("weekday", date_format(col("start_time"), 'E'))
99 |
100 | time_table.write.partitionBy("year", "month").parquet(output_data + 'time/')
101 |
102 | df_songs = spark.read.parquet(output_data + 'songs/*/*/*')
103 |
104 | df_artists = spark.read.parquet(output_data + 'artists/*')
105 |
106 | songs_logs = df.join(df_songs, (df.song == df_songs.title))
107 | artists_songs_logs = songs_logs.join(df_artists, (songs_logs.artist == df_artists.name))
108 |
109 | songplays = artists_songs_logs.join(
110 | time_table,
111 | artists_songs_logs.start_time == time_table.start_time, 'left'
112 | ).drop(artists_songs_logs.year).drop(time_table.start_time)
113 |
114 | songplays_table = songplays.select(
115 | col('start_time').alias('start_time'),
116 | col('userId').alias('user_id'),
117 | col('level').alias('level'),
118 | col('song_id').alias('song_id'),
119 | col('artist_id').alias('artist_id'),
120 | col('sessionId').alias('session_id'),
121 | col('location').alias('location'),
122 | col('userAgent').alias('user_agent'),
123 | col('year').alias('year'),
124 | col('month').alias('month'),
125 | ).repartition("year", "month")
126 |
127 | songplays_table.write.partitionBy("year", "month").parquet(output_data + 'songplays/')
128 |
129 |
130 | def main():
131 | """
132 |     Extract songs and events data from S3, transform them into dimensional tables, and load them back to S3 in parquet format
133 | """
134 | spark = create_spark_session()
135 | input_data = "s3a://udacity-dend/"
136 | output_data = "s3a://sparkify-dend/"
137 |
138 | process_song_data(spark, input_data, output_data)
139 | process_log_data(spark, input_data, output_data)
140 |
141 |
142 | if __name__ == "__main__":
143 | main()
144 |
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/data/log-data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/3_dend_spark_data_lakes/data/log-data.png
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/data/log-data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/3_dend_spark_data_lakes/data/log-data.zip
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/data/song-data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/3_dend_spark_data_lakes/data/song-data.zip
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/notebooks/1_procedural_vs_functional_in_python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Procedural Programming\n",
8 | "\n",
9 | "This notebook contains the code from the previous screencast. The code counts the number of times a song appears in the log_of_songs variable. \n",
10 | "\n",
11 | "You'll notice that the first time you run `count_plays(\"Despacito\")`, you get the correct count. However, when you run the same code again `count_plays(\"Despacito\")`, the results are no longer correct.This is because the global variable `play_count` stores the results outside of the count_plays function. \n",
12 | "\n",
13 | "\n",
14 | "# Instructions\n",
15 | "\n",
16 | "Run the code cells in this notebook to see the problem with "
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "log_of_songs = [\n",
26 | " \"Despacito\",\n",
27 | " \"Nice for what\",\n",
28 | " \"No tears left to cry\",\n",
29 | " \"Despacito\",\n",
30 | " \"Havana\",\n",
31 | " \"In my feelings\",\n",
32 | " \"Nice for what\",\n",
33 | " \"Despacito\",\n",
34 | " \"All the stars\"\n",
35 | "]"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "play_count = 0"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "def count_plays(song_title):\n",
54 | " global play_count\n",
55 | " for song in log_of_songs:\n",
56 | " if song == song_title:\n",
57 | " play_count = play_count + 1\n",
58 | " return play_count"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "count_plays(\"Despacito\")"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "count_plays(\"Despacito\")"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "# How to Solve the Issue\n",
84 | "\n",
85 | "How might you solve this issue? You could get rid of the global variable and instead use play_count as an input to the function:\n",
86 | "\n",
87 | "```python\n",
88 | "def count_plays(song_title, play_count):\n",
89 | " for song in log_of_songs:\n",
90 | " if song == song_title:\n",
91 | " play_count = play_count + 1\n",
92 | " return play_count\n",
93 | "\n",
94 | "```\n",
95 | "\n",
96 | "How would this work with parallel programming? Spark splits up data onto multiple machines. If your songs list were split onto two machines, Machine A would first need to finish counting, and then return its own result to Machine B. And then Machine B could use the output from Machine A and add to the count.\n",
97 | "\n",
98 | "However, that isn't parallel computing. Machine B would have to wait until Machine A finishes. You'll see in the next parts of the lesson how Spark solves this issue with a functional programming paradigm.\n",
99 | "\n",
100 | "In Spark, if your data is split onto two different machines, machine A will run a function to count how many times 'Despacito' appears on machine A. Machine B will simultaneously run a function to count how many times 'Despacito' appears on machine B. After they finish counting individually, they'll combine their results together. You'll see how this works in the next parts of the lesson."
101 | ]
102 | }
103 | ],
104 | "metadata": {
105 | "kernelspec": {
106 | "display_name": "Python 3",
107 | "language": "python",
108 | "name": "python3"
109 | },
110 | "language_info": {
111 | "codemirror_mode": {
112 | "name": "ipython",
113 | "version": 3
114 | },
115 | "file_extension": ".py",
116 | "mimetype": "text/x-python",
117 | "name": "python",
118 | "nbconvert_exporter": "python",
119 | "pygments_lexer": "ipython3",
120 | "version": "3.6.3"
121 | }
122 | },
123 | "nbformat": 4,
124 | "nbformat_minor": 2
125 | }
126 |
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/notebooks/2_spark_maps_and_lazy_evaluation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Maps\n",
8 | "\n",
9 | "In Spark, maps take data as input and then transform that data with whatever function you put in the map. They are like directions for the data telling how each input should get to the output.\n",
10 | "\n",
11 | "The first code cell creates a SparkContext object. With the SparkContext, you can input a dataset and parallelize the data across a cluster (since you are currently using Spark in local mode on a single machine, technically the dataset isn't distributed yet).\n",
12 | "\n",
13 | "Run the code cell below to instantiate a SparkContext object and then read in the log_of_songs list into Spark. "
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "### \n",
23 | "# You might have noticed this code in the screencast.\n",
24 | "#\n",
25 | "# import findspark\n",
26 | "# findspark.init('spark-2.3.2-bin-hadoop2.7')\n",
27 | "#\n",
28 | "# The findspark Python module makes it easier to install\n",
29 | "# Spark in local mode on your computer. This is convenient\n",
30 | "# for practicing Spark syntax locally. \n",
31 | "# However, the workspaces already have Spark installed and you do not\n",
32 | "# need to use the findspark module\n",
33 | "#\n",
34 | "###\n",
35 | "\n",
36 | "import pyspark\n",
37 | "sc = pyspark.SparkContext(appName=\"maps_and_lazy_evaluation_example\")\n",
38 | "\n",
39 | "log_of_songs = [\n",
40 | " \"Despacito\",\n",
41 | " \"Nice for what\",\n",
42 | " \"No tears left to cry\",\n",
43 | " \"Despacito\",\n",
44 | " \"Havana\",\n",
45 | " \"In my feelings\",\n",
46 | " \"Nice for what\",\n",
47 | " \"despacito\",\n",
48 | " \"All the stars\"\n",
49 | "]\n",
50 | "\n",
51 | "# parallelize the log_of_songs to use with Spark\n",
52 | "distributed_song_log = sc.parallelize(log_of_songs)"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "This next code cell defines a function that converts a song title to lowercase. Then there is an example converting the word \"Havana\" to \"havana\"."
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "def convert_song_to_lowercase(song):\n",
69 | " return song.lower()\n",
70 | "\n",
71 | "convert_song_to_lowercase(\"Havana\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "The following code cells demonstrate how to apply this function using a map step. The map step will go through each song in the list and apply the convert_song_to_lowercase() function. "
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "distributed_song_log.map(convert_song_to_lowercase)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "You'll notice that this code cell ran quite quickly. This is because of lazy evaluation. Spark does not actually execute the map step unless it needs to.\n",
95 | "\n",
96 | "\"RDD\" in the output refers to resilient distributed dataset. RDDs are exactly what they say they are: fault-tolerant datasets distributed across a cluster. This is how Spark stores data. \n",
97 | "\n",
98 | "To get Spark to actually run the map step, you need to use an \"action\". One available action is the collect method. The collect() method takes the results from all of the clusters and \"collects\" them into a single list on the master node."
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "distributed_song_log.map(convert_song_to_lowercase).collect()"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "Note as well that Spark is not changing the original data set: Spark is merely making a copy. You can see this by running collect() on the original dataset."
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "distributed_song_log.collect()"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "You do not always have to write a custom function for the map step. You can also use anonymous (lambda) functions as well as built-in Python functions like string.lower(). \n",
131 | "\n",
132 | "Anonymous functions are actually a Python feature for writing functional style programs."
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "distributed_song_log.map(lambda song: song.lower()).collect()"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "distributed_song_log.map(lambda x: x.lower()).collect()"
151 | ]
152 | }
153 | ],
154 | "metadata": {
155 | "kernelspec": {
156 | "display_name": "Python 3",
157 | "language": "python",
158 | "name": "python3"
159 | },
160 | "language_info": {
161 | "codemirror_mode": {
162 | "name": "ipython",
163 | "version": 3
164 | },
165 | "file_extension": ".py",
166 | "mimetype": "text/x-python",
167 | "name": "python",
168 | "nbconvert_exporter": "python",
169 | "pygments_lexer": "ipython3",
170 | "version": "3.6.3"
171 | }
172 | },
173 | "nbformat": 4,
174 | "nbformat_minor": 2
175 | }
176 |
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/notebooks/3_data_inputs_and_outputs.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Reading and Writing Data with Spark\n",
8 | "\n",
9 | "This notebook contains the code from the previous screencast. The only difference is that instead of reading in a dataset from a remote cluster, the data set is read in from a local file. You can see the file by clicking on the \"jupyter\" icon and opening the folder titled \"data\".\n",
10 | "\n",
11 | "Run the code cell to see how everything works. \n",
12 | "\n",
13 | "First let's import SparkConf and SparkSession"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import pyspark\n",
23 | "from pyspark import SparkConf\n",
24 | "from pyspark.sql import SparkSession"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "Since we're using Spark locally we already have both a sparkcontext and a sparksession running. We can update some of the parameters, such our application's name. Let's just call it \"Our first Python Spark SQL example\""
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "spark = SparkSession \\\n",
41 | " .builder \\\n",
42 | " .appName(\"Our first Python Spark SQL example\") \\\n",
43 | " .getOrCreate()"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "Let's check if the change went through"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "spark.sparkContext.getConf().getAll()"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "spark"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "As you can see the app name is exactly how we set it\n",
76 | "\n",
77 | "Let's create our first dataframe from a fairly small sample data set. Througout the course we'll work with a log file data set that describes user interactions with a music streaming service. The records describe events such as logging in to the site, visiting a page, listening to the next song, seeing an ad."
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "path = \"data/sparkify_log_small.json\"\n",
87 | "user_log = spark.read.json(path)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "user_log.printSchema()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "user_log.describe()"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "user_log.show(n=1)"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "user_log.take(5)"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "out_path = \"data/sparkify_log_small.csv\""
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "user_log.write.save(out_path, format=\"csv\", header=True)"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "user_log_2 = spark.read.csv(out_path, header=True)"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "user_log_2.printSchema()"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "user_log_2.take(2)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "user_log_2.select(\"userID\").show()"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "user_log_2.take(1)"
187 | ]
188 | }
189 | ],
190 | "metadata": {
191 | "kernelspec": {
192 | "display_name": "Python 3",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.6.3"
207 | }
208 | },
209 | "nbformat": 4,
210 | "nbformat_minor": 1
211 | }
212 |
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/notebooks/5_dataframe_quiz.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Wrangling with DataFrames Coding Quiz\n",
8 | "\n",
9 | "Use this Jupyter notebook to find the answers to the quiz in the previous section. There is an answer key in the next part of the lesson."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "from pyspark.sql import SparkSession\n",
19 | "\n",
20 | "# TODOS: \n",
21 | "# 1) import any other libraries you might need\n",
22 | "# 2) instantiate a Spark session \n",
23 | "# 3) read in the data set located at the path \"data/sparkify_log_small.json\"\n",
24 | "# 4) write code to answer the quiz questions "
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "# Question 1\n",
32 | "\n",
33 | "Which page did user id \"\" (empty string) NOT visit?"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# TODO: write your code to answer question 1"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "# Question 2 - Reflect\n",
50 | "\n",
51 | "What type of user does the empty string user id most likely refer to?\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 2,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# TODO: use this space to explore the behavior of the user with an empty string\n"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "# Question 3\n",
68 | "\n",
69 | "How many female users do we have in the data set?"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 3,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "# TODO: write your code to answer question 3"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "# Question 4\n",
86 | "\n",
87 | "How many songs were played from the most played artist?"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "# TODO: write your code to answer question 4"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "# Question 5 (challenge)\n",
104 | "\n",
105 | "How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.\n",
106 | "\n"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "# TODO: write your code to answer question 5"
116 | ]
117 | }
118 | ],
119 | "metadata": {
120 | "kernelspec": {
121 | "display_name": "Python 3",
122 | "language": "python",
123 | "name": "python3"
124 | },
125 | "language_info": {
126 | "codemirror_mode": {
127 | "name": "ipython",
128 | "version": 3
129 | },
130 | "file_extension": ".py",
131 | "mimetype": "text/x-python",
132 | "name": "python",
133 | "nbconvert_exporter": "python",
134 | "pygments_lexer": "ipython3",
135 | "version": "3.6.3"
136 | }
137 | },
138 | "nbformat": 4,
139 | "nbformat_minor": 2
140 | }
141 |
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/notebooks/6_dataframe_quiz_solution.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Answer Key to the Data Wrangling with DataFrames Coding Quiz\n",
8 | "\n",
9 | "Helpful resources:\n",
10 | "http://spark.apache.org/docs/latest/api/python/pyspark.sql.html"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "from pyspark.sql import SparkSession\n",
20 | "from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, avg\n",
21 | "from pyspark.sql.functions import sum as Fsum\n",
22 | "from pyspark.sql.window import Window\n",
23 | "from pyspark.sql.types import IntegerType"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# 1) import any other libraries you might need\n",
33 | "# 2) instantiate a Spark session \n",
34 | "# 3) read in the data set located at the path \"data/sparkify_log_small.json\"\n",
35 | "# 4) write code to answer the quiz questions \n",
36 | "\n",
37 | "spark = SparkSession \\\n",
38 | " .builder \\\n",
39 | " .appName(\"Data Frames practice\") \\\n",
40 | " .getOrCreate()\n",
41 | "\n",
42 | "df = spark.read.json(\"data/sparkify_log_small.json\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "# Question 1\n",
50 | "\n",
51 | "Which page did user id \"\" (empty string) NOT visit?"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "df.printSchema()"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "# filter for users with blank user id\n",
70 | "blank_pages = df.filter(df.userId == '') \\\n",
71 | " .select(col('page') \\\n",
72 | " .alias('blank_pages')) \\\n",
73 | " .dropDuplicates()\n",
74 | "\n",
75 | "# get a list of possible pages that could be visited\n",
76 | "all_pages = df.select('page').dropDuplicates()\n",
77 | "\n",
78 | "# find values in all_pages that are not in blank_pages\n",
79 | "# these are the pages that the blank user did not go to\n",
80 | "for row in set(all_pages.collect()) - set(blank_pages.collect()):\n",
81 | " print(row.page)"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "# Question 2 - Reflect\n",
89 | "\n",
90 | "What type of user does the empty string user id most likely refer to?\n"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "Perhaps it represents users who have not signed up yet or who are signed out and are about to log in."
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "# Question 3\n",
105 | "\n",
106 | "How many female users do we have in the data set?"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "df.filter(df.gender == 'F') \\\n",
116 | " .select('userId', 'gender') \\\n",
117 | " .dropDuplicates() \\\n",
118 | " .count()"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "# Question 4\n",
126 | "\n",
127 | "How many songs were played from the most played artist?"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "df.filter(df.page == 'NextSong') \\\n",
137 | " .select('Artist') \\\n",
138 | " .groupBy('Artist') \\\n",
139 | " .agg({'Artist':'count'}) \\\n",
140 | " .withColumnRenamed('count(Artist)', 'Artistcount') \\\n",
141 | " .sort(desc('Artistcount')) \\\n",
142 | " .show(1)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "# Question 5 (challenge)\n",
150 | "\n",
151 | "How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.\n",
152 | "\n"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "# TODO: filter out 0 sum and max sum to get more exact answer\n",
162 | "\n",
163 | "function = udf(lambda ishome : int(ishome == 'Home'), IntegerType())\n",
164 | "\n",
165 | "user_window = Window \\\n",
166 | " .partitionBy('userID') \\\n",
167 | " .orderBy(desc('ts')) \\\n",
168 | " .rangeBetween(Window.unboundedPreceding, 0)\n",
169 | "\n",
170 | "cusum = df.filter((df.page == 'NextSong') | (df.page == 'Home')) \\\n",
171 | " .select('userID', 'page', 'ts') \\\n",
172 | " .withColumn('homevisit', function(col('page'))) \\\n",
173 | " .withColumn('period', Fsum('homevisit').over(user_window))\n",
174 | "\n",
175 | "cusum.filter((cusum.page == 'NextSong')) \\\n",
176 | " .groupBy('userID', 'period') \\\n",
177 | " .agg({'period':'count'}) \\\n",
178 | " .agg({'count(period)':'avg'}).show()"
179 | ]
180 | }
181 | ],
182 | "metadata": {
183 | "kernelspec": {
184 | "display_name": "Python 3",
185 | "language": "python",
186 | "name": "python3"
187 | },
188 | "language_info": {
189 | "codemirror_mode": {
190 | "name": "ipython",
191 | "version": 3
192 | },
193 | "file_extension": ".py",
194 | "mimetype": "text/x-python",
195 | "name": "python",
196 | "nbconvert_exporter": "python",
197 | "pygments_lexer": "ipython3",
198 | "version": "3.6.3"
199 | }
200 | },
201 | "nbformat": 4,
202 | "nbformat_minor": 2
203 | }
204 |
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/notebooks/8_spark_sql_quiz.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Wrangling with Spark SQL Quiz\n",
8 | "\n",
9 | "This quiz uses the same dataset and most of the same questions from the earlier \"Quiz - Data Wrangling with Data Frames Jupyter Notebook.\" For this quiz, however, use Spark SQL instead of Spark Data Frames."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "from pyspark.sql import SparkSession\n",
19 | "\n",
20 | "# TODOS: \n",
21 | "# 1) import any other libraries you might need\n",
22 | "# 2) instantiate a Spark session \n",
23 | "# 3) read in the data set located at the path \"data/sparkify_log_small.json\"\n",
24 | "# 4) create a view to use with your SQL queries\n",
25 | "# 5) write code to answer the quiz questions "
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "# Question 1\n",
33 | "\n",
34 | "Which page did user id \"\"(empty string) NOT visit?"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# TODO: write your code to answer question 1"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "# Question 2 - Reflect\n",
51 | "\n",
52 | "Why might you prefer to use SQL over data frames? Why might you prefer data frames over SQL?"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "# Question 3\n",
60 | "\n",
61 | "How many female users do we have in the data set?"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# TODO: write your code to answer question 3"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "# Question 4\n",
78 | "\n",
79 | "How many songs were played from the most played artist?"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "# TODO: write your code to answer question 4"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "# Question 5 (challenge)\n",
96 | "\n",
97 | "How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer."
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "# TODO: write your code to answer question 5"
107 | ]
108 | }
109 | ],
110 | "metadata": {
111 | "kernelspec": {
112 | "display_name": "Python 3",
113 | "language": "python",
114 | "name": "python3"
115 | },
116 | "language_info": {
117 | "codemirror_mode": {
118 | "name": "ipython",
119 | "version": 3
120 | },
121 | "file_extension": ".py",
122 | "mimetype": "text/x-python",
123 | "name": "python",
124 | "nbconvert_exporter": "python",
125 | "pygments_lexer": "ipython3",
126 | "version": "3.6.3"
127 | }
128 | },
129 | "nbformat": 4,
130 | "nbformat_minor": 2
131 | }
132 |
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/notebooks/mapreduce_practice.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# MapReduce\n",
8 | "\n",
9 | "The MapReduce programming technique was designed to analyze massive data sets across a cluster. In this Jupyter notebook, you'll get a sense for how Hadoop MapReduce works; however, this notebook will run locally rather than on a cluster.\n",
10 | "\n",
11 | "The biggest difference between Hadoop and Spark is that Spark tries to do as many calculations as possible in memory, which avoids moving data back and forth across a cluster. Hadoop writes intermediate calculations out to disk, which can be less efficient. Hadoop is an older technology than Spark and one of the cornerstone big data technologies.\n",
12 | "\n",
13 | "If you click on the Jupyter notebook logo at the top of the workspace, you'll be taken to the workspace directory. There you will see a file called \"songplays.txt\". This is a text file where each line represents a song that was played in the Sparkify app. The MapReduce code will count how many times each song was played. In other words, the code counts how many times the song title appears in the list.\n",
14 | "\n",
15 | "\n",
16 | "# MapReduce versus Hadoop MapReduce\n",
17 | "\n",
18 | "Don't get confused by the terminology! MapReduce is a programming technique. Hadoop MapReduce is a specific implementation of the programming technique.\n",
19 | "\n",
20 | "Some of the syntax will look a bit funny, so be sure to read the explanation and comments for each section. You'll learn more about the syntax in later lessons. \n",
21 | "\n",
22 | "Run each of the code cells below to see the output."
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# Install mrjob library. This package is for running MapReduce jobs with Python\n",
32 | "# In Jupyter notebooks, \"!\" runs terminal commands from inside notebooks \n",
33 | "\n",
34 | "! pip install mrjob"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "%%file wordcount.py\n",
44 | "# %%file is an Ipython magic function that saves the code cell as a file\n",
45 | "\n",
46 | "from mrjob.job import MRJob # import the mrjob library\n",
47 | "\n",
48 | "class MRSongCount(MRJob):\n",
49 | " \n",
50 | " # the map step: each line in the txt file is read as a key, value pair\n",
51 | " # in this case, each line in the txt file only contains a value but no key\n",
52 | " # _ means that in this case, there is no key for each line\n",
53 | " def mapper(self, _, song):\n",
54 | " # output each line as a tuple of (song_names, 1) \n",
55 | " yield (song, 1)\n",
56 | "\n",
57 | " # the reduce step: combine all tuples with the same key\n",
58 | " # in this case, the key is the song name\n",
59 | " # then sum all the values of the tuple, which will give the total song plays\n",
60 | " def reducer(self, key, values):\n",
61 | " yield (key, sum(values))\n",
62 | " \n",
63 | "if __name__ == \"__main__\":\n",
64 | " MRSongCount.run()"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "# run the code as a terminal command\n",
74 | "! python wordcount.py songplays.txt"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "# Summary of what happens in the code.\n",
82 | "\n",
83 | "There is a list of songs in songplays.txt that looks like the following:\n",
84 | "\n",
85 | "Deep Dreams\n",
86 | "Data House Rock\n",
87 | "Deep Dreams\n",
88 | "Data House Rock\n",
89 | "Broken Networks\n",
90 | "Data House Rock\n",
91 | "etc.....\n",
92 | "\n",
93 | "During the map step, the code reads in the txt file one line at a time. The map steps outputs a set of tuples that look like this:\n",
94 | "\n",
95 | "(Deep Dreams, 1) \n",
96 | "(Data House Rock, 1) \n",
97 | "(Deep Dreams, 1) \n",
98 | "(Data House Rock, 1) \n",
99 | "(Broken Networks, 1) \n",
100 | "(Data House Rock, 1) \n",
101 | "etc.....\n",
102 | "\n",
103 | "Finally, the reduce step combines all of the values by keys and sums the values: \n",
104 | "\n",
105 | "(Deep Dreams, \\[1, 1, 1, 1, 1, 1, ... \\]) \n",
106 | "(Data House Rock, \\[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\\]) \n",
107 | "(Broken Networks, \\[1, 1, 1, ...\\] \n",
108 | "\n",
109 | "With the output \n",
110 | "\n",
111 | "(Deep Dreams, 1131) \n",
112 | "(Data House Rock, 510) \n",
113 | "(Broken Networks, 828) "
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {},
120 | "outputs": [],
121 | "source": []
122 | }
123 | ],
124 | "metadata": {
125 | "kernelspec": {
126 | "display_name": "Python 3",
127 | "language": "python",
128 | "name": "python3"
129 | },
130 | "language_info": {
131 | "codemirror_mode": {
132 | "name": "ipython",
133 | "version": 3
134 | },
135 | "file_extension": ".py",
136 | "mimetype": "text/x-python",
137 | "name": "python",
138 | "nbconvert_exporter": "python",
139 | "pygments_lexer": "ipython3",
140 | "version": "3.6.3"
141 | }
142 | },
143 | "nbformat": 4,
144 | "nbformat_minor": 2
145 | }
146 |
--------------------------------------------------------------------------------
/3_dend_spark_data_lakes/spark.md:
--------------------------------------------------------------------------------
1 | ## General functions
2 | We have used the following general functions, which are quite similar to methods of Pandas dataframes (a short sketch follows the list):
3 |
4 | - select(): returns a new dataframe with the selected columns
5 | - filter(): filters rows using the given condition
6 | - where(): is just an alias for filter()
7 | - groupBy(): groups the DataFrame using the specified columns, so we can run aggregation on them
8 | - sort(): returns a new DataFrame sorted by the specified column(s). By default the second parameter 'ascending' is True
9 | - dropDuplicates(): returns a new dataframe with unique rows based on all or just a subset of columns
10 | - withColumn(): returns a new DataFrame by adding a column or replacing the existing column that has the same name. The first parameter is the name of the new column, the second is an expression of how to compute it
11 |
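As a quick illustration, here is a minimal sketch (not part of the project code) that assumes the `sparkify_log_small.json` sample used in the notebooks; column names such as `page`, `userId` and `level` come from that sample.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc

spark = SparkSession.builder.appName("general_functions_sketch").getOrCreate()
user_log = spark.read.json("data/sparkify_log_small.json")

songs_played = (
    user_log
    .where(col("page") == "NextSong")               # where() is an alias for filter()
    .select("userId", "artist", "song", "level")    # keep only the columns we need
    .dropDuplicates()                               # unique rows across the selected columns
    .withColumn("is_paid", col("level") == "paid")  # add a derived boolean column
)

# groupBy() + count(), sorted so the most active users come first
plays_per_user = songs_played.groupBy("userId").count().sort(desc("count"))
plays_per_user.show(5)
```
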
12 | ## Aggregate functions
13 | Spark SQL provides built-in methods for the most common aggregations such as count(), countDistinct(), avg(), max(), min(), etc. in the pyspark.sql.functions module. These methods are not the same as Python's built-in functions (min(), for example, exists in both), so be careful not to use them interchangeably.
14 |
15 | In many cases, there are multiple ways to express the same aggregations. For example, if we would like to compute one type of aggregate for one or more columns of the dataframe we can just simply chain the aggregate method after a groupBy(). If we would like to use different functions on different columns agg() comes in handy. For example agg({"salary": "avg", "age": "max"}) computes the average salary and maximum age.
16 |
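For example, reusing the `user_log` dataframe from the sketch above (column names come from the same sample log), the two styles look like this:

```python
from pyspark.sql.functions import avg, countDistinct

# one aggregate chained directly after groupBy()
user_log.groupBy("level").count().show()

# different aggregates on different columns via agg() with a dict
user_log.groupBy("level").agg({"length": "avg", "itemInSession": "max"}).show()

# the same idea with explicit functions from pyspark.sql.functions
user_log.groupBy("level").agg(avg("length"), countDistinct("userId")).show()
```
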
17 | ## User defined functions (UDF)
18 | In Spark SQL we can define our own functions with the udf method from the pyspark.sql.functions module. The default return type of a UDF is string. If we would like to return another type, we need to say so explicitly by using one of the types from the pyspark.sql.types module.
19 |
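A minimal sketch of a UDF with a non-string return type, again assuming the sample log's `ts` column (epoch milliseconds) and the `user_log` dataframe from the first sketch:

```python
import datetime

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# extract the hour from the epoch-millisecond timestamp; the return type is declared explicitly
get_hour = udf(lambda ts: datetime.datetime.fromtimestamp(ts / 1000.0).hour, IntegerType())

user_log = user_log.withColumn("hour", get_hour(user_log.ts))
user_log.select("ts", "hour").show(5)
```
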
20 | ## Window functions
21 | Window functions are a way of combining the values of ranges of rows in a dataframe. When defining the window we can choose how to group the rows (with the partitionBy method), how to sort them (with orderBy), and how wide a window we'd like to use (described by rangeBetween or rowsBetween).
22 |
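Below is a condensed version of the window used in the quiz solution notebook: a per-user cumulative count of Home visits ordered by timestamp (a sketch reusing the `user_log` dataframe from the first sketch, not part of the project code).

```python
from pyspark.sql import Window
from pyspark.sql.functions import col, udf, sum as Fsum
from pyspark.sql.types import IntegerType

is_home = udf(lambda page: int(page == "Home"), IntegerType())

user_window = (
    Window
    .partitionBy("userId")                       # group rows per user
    .orderBy(col("ts").desc())                   # sort by timestamp, newest first
    .rangeBetween(Window.unboundedPreceding, 0)  # everything up to the current row
)

periods = (
    user_log
    .withColumn("homevisit", is_home(col("page")))
    .withColumn("period", Fsum("homevisit").over(user_window))
)
periods.select("userId", "page", "ts", "period").show(5)
```
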
23 | For further information see the [Spark SQL, DataFrames and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html) and the [Spark Python API Docs](https://spark.apache.org/docs/latest/api/python/index.html) .
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/__init__.py
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/airflow.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/airflow.db
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/dags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/dags/__init__.py
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/dags/sparkify_dend_dag.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from airflow import DAG
3 | from airflow.operators.dummy_operator import DummyOperator
4 | from airflow.operators.subdag_operator import SubDagOperator
5 | from airflow.operators import (StageToRedshiftOperator, LoadFactOperator,
6 | DataQualityOperator, CreateTablesOperator)
7 | from helpers import SqlQueries
8 | from sparkify_dend_dimesions_subdag import load_dimensional_tables_dag
9 |
10 |
11 | start_date = datetime.utcnow()
12 |
13 | default_args = {
14 | 'owner': 'florencia',
15 | 'start_date': datetime(2018, 5, 1),
16 | 'end_date': datetime(2018, 11, 30),
17 | 'depends_on_past': False,
18 | 'retries': 3,
19 | 'retry_delay': timedelta(minutes=5),
20 | 'catchup': False,
21 | 'email_on_retry': False
22 | }
23 |
24 | dag_name='sparkify_dend_dag'
25 | dag = DAG(dag_name,
26 | default_args=default_args,
27 | description='Load and transform data in Redshift with Airflow',
28 | schedule_interval='0 * * * *',
29 | max_active_runs=3
30 | )
31 |
32 | start_operator = DummyOperator(task_id='Begin_execution', dag=dag)
33 |
34 | create_redshift_tables = CreateTablesOperator(
35 | task_id='Create_tables',
36 | dag=dag,
37 | redshift_conn_id="redshift"
38 | )
39 |
40 | stage_events_to_redshift = StageToRedshiftOperator(
41 | task_id='Stage_events',
42 | dag=dag,
43 | provide_context=True,
44 | table="events",
45 | redshift_conn_id="redshift",
46 | aws_credentials_id="aws_credentials",
47 | s3_bucket="udacity-dend",
48 | s3_key="log_data",
49 | region="us-west-2",
50 | file_format="JSON",
51 | execution_date=start_date
52 | )
53 |
54 | stage_songs_to_redshift = StageToRedshiftOperator(
55 | task_id='Stage_songs',
56 | dag=dag,
57 | provide_context=True,
58 | table="songs",
59 | redshift_conn_id="redshift",
60 | aws_credentials_id="aws_credentials",
61 | s3_bucket="udacity-dend",
62 | s3_key="song_data",
63 | region="us-west-2",
64 |     file_format="JSON",
65 | execution_date=start_date
66 | )
67 |
68 | load_songplays_table = LoadFactOperator(
69 | task_id='Load_songplays_fact_table',
70 | dag=dag,
71 | provide_context=True,
72 | aws_credentials_id="aws_credentials",
73 | redshift_conn_id='redshift',
74 | sql_query=SqlQueries.songplay_table_insert
75 | )
76 |
77 | load_user_dimension_table_task_id='Load_user_dim_table'
78 | load_user_dimension_table = SubDagOperator(
79 | subdag=load_dimensional_tables_dag(
80 | parent_dag_name=dag_name,
81 | task_id=load_user_dimension_table_task_id,
82 | redshift_conn_id="redshift",
83 | aws_credentials_id="aws_credentials",
84 | start_date= datetime(2018, 5, 1),
85 | table="users",
86 | sql_query=SqlQueries.user_table_insert,
87 | ),
88 | task_id=load_user_dimension_table_task_id,
89 | dag=dag,
90 | )
91 |
92 | load_song_dimension_table_task_id='Load_song_dim_table'
93 | load_song_dimension_table = SubDagOperator(
94 | subdag=load_dimensional_tables_dag(
95 | parent_dag_name=dag_name,
96 | task_id=load_song_dimension_table_task_id,
97 | redshift_conn_id="redshift",
98 | aws_credentials_id="aws_credentials",
99 | start_date= datetime(2018, 5, 1),
100 |         table="songs",
101 | sql_query=SqlQueries.song_table_insert,
102 | ),
103 | task_id=load_song_dimension_table_task_id,
104 | dag=dag,
105 | )
106 |
107 | load_artist_dimension_table_task_id='Load_artist_dim_table'
108 | load_artist_dimension_table = SubDagOperator(
109 | subdag=load_dimensional_tables_dag(
110 | parent_dag_name=dag_name,
111 | task_id=load_artist_dimension_table_task_id,
112 | redshift_conn_id="redshift",
113 | aws_credentials_id="aws_credentials",
114 |         table="artists",
115 | start_date= datetime(2018, 5, 1),
116 | sql_query=SqlQueries.artist_table_insert,
117 | ),
118 | task_id=load_artist_dimension_table_task_id,
119 | dag=dag,
120 | )
121 |
122 | load_time_dimension_table_task_id='Load_time_dim_table'
123 | load_time_dimension_table = SubDagOperator(
124 |     subdag=load_dimensional_tables_dag(
125 |         parent_dag_name=dag_name,
126 |         task_id=load_time_dimension_table_task_id,
127 |         redshift_conn_id="redshift",
128 |         aws_credentials_id="aws_credentials",
129 |         table="time",
130 |         start_date= datetime(2018, 5, 1),
131 |         sql_query=SqlQueries.time_table_insert,
132 |     ),
133 |     task_id=load_time_dimension_table_task_id,
134 |     dag=dag,
135 | )
136 |
137 |
138 | run_quality_checks = DataQualityOperator(
139 | task_id='Run_data_quality_checks',
140 | dag=dag,
141 | provide_context=True,
142 | aws_credentials_id="aws_credentials",
143 | redshift_conn_id='redshift',
144 |     tables=["songplays", "users", "songs", "artists", "time"]
145 | )
146 |
147 | end_operator = DummyOperator(task_id='Stop_execution', dag=dag)
148 |
149 | # Setting tasks dependencies
150 |
151 | start_operator >> create_redshift_tables >> [stage_songs_to_redshift, stage_events_to_redshift]
152 |
153 | [stage_events_to_redshift, stage_songs_to_redshift] >> load_songplays_table
154 |
155 | load_songplays_table >> [load_user_dimension_table, load_song_dimension_table, load_artist_dimension_table,
156 | load_time_dimension_table] >> run_quality_checks
157 |
158 | run_quality_checks >> end_operator
159 |
160 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/dags/sparkify_dend_dimesions_subdag.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from airflow import DAG
3 | from airflow.operators.dummy_operator import DummyOperator
4 | from airflow.operators import LoadDimensionOperator
5 | from helpers import SqlQueries
6 |
7 |
8 | def load_dimensional_tables_dag(
9 | parent_dag_name,
10 | task_id,
11 | redshift_conn_id,
12 | aws_credentials_id,
13 | table,
14 | sql_query,
15 | *args, **kwargs):
16 |     """
17 |     Returns a DAG that inserts data into a dimensional Redshift table from the staging tables.
18 |     """
19 |     dag = DAG(
20 |         f"{parent_dag_name}.{task_id}",
21 |         **kwargs
22 |     )
23 |
24 | load_dimension_table = LoadDimensionOperator(
25 | task_id=f"load_{table}_dim_table",
26 | dag=dag,
27 | table=table,
28 | redshift_conn_id=redshift_conn_id,
29 | aws_credentials_id=aws_credentials_id,
30 | sql_query=sql_query
31 | )
32 |
33 | load_dimension_table
34 |
35 | return dag
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/airflow-details-dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/airflow-details-dag.png
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/airflow-running-dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/airflow-running-dag.png
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/dag-code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/dag-code.png
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/P5_Data_Pipelines/imgs/dag.png
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, absolute_import, print_function
2 |
3 | from airflow.plugins_manager import AirflowPlugin
4 |
5 | import operators
6 | import helpers
7 |
8 | # Defining the plugin class
9 | class UdacityPlugin(AirflowPlugin):
10 | name = "udacity_plugin"
11 | operators = [
12 | operators.StageToRedshiftOperator,
13 | operators.LoadFactOperator,
14 | operators.LoadDimensionOperator,
15 | operators.DataQualityOperator,
16 | operators.CreateTablesOperator
17 | ]
18 | helpers = [
19 | helpers.SqlQueries
20 | ]
21 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/helpers/__init__.py:
--------------------------------------------------------------------------------
1 | from helpers.sql_queries import SqlQueries
2 |
3 | __all__ = [
4 | 'SqlQueries',
5 | ]
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/helpers/sql_queries.py:
--------------------------------------------------------------------------------
1 | class SqlQueries:
2 |
3 | songplay_table_insert = ("""
4 | INSERT INTO songplays (
5 | songplay_id,
6 | start_time,
7 | userid,
8 | level,
9 | song_id,
10 | artist_id,
11 | sessionid,
12 | location,
13 |             useragent
15 | )
16 | SELECT
17 | md5(events.sessionid || events.start_time) songplay_id,
18 | events.start_time,
19 | events.userid,
20 | events.level,
21 | songs.song_id,
22 | songs.artist_id,
23 | events.sessionid,
24 | events.location,
25 | events.useragent
26 | FROM (SELECT TIMESTAMP 'epoch' + ts/1000 * interval '1 second' AS start_time, *
27 | FROM staging_events
28 | WHERE page='NextSong') events
29 | LEFT JOIN staging_songs songs
30 | ON events.song = songs.title
31 | AND events.artist = songs.artist_name
32 | AND events.length = songs.duration
33 | """)
34 |
35 | user_table_insert = ("""
36 | INSERT INTO users (
37 | userid,
38 | firstname,
39 | lastname,
40 | gender,
41 | level
42 | )
43 | SELECT distinct userid, firstname, lastname, gender, level
44 | FROM staging_events
45 | WHERE page='NextSong'
46 | """)
47 |
48 | song_table_insert = ("""
49 | INSERT INTO songs (
50 | song_id,
51 | title,
52 | artist_id,
53 | year,
54 | duration
55 | )
56 | SELECT distinct song_id, title, artist_id, year, duration
57 | FROM staging_songs
58 | """)
59 |
60 | artist_table_insert = ("""
61 | INSERT INTO artists (
62 | artist_id,
63 | artist_name,
64 | artist_location,
65 | artist_latitude,
66 | artist_longitude
67 | )
68 | SELECT distinct artist_id, artist_name, artist_location, artist_latitude, artist_longitude
69 | FROM staging_songs
70 | """)
71 |
72 | time_table_insert = ("""
73 | INSERT INTO time (
74 | start_time,
75 | hour,
76 | day,
77 | week,
78 | month,
79 | year,
80 | weekday
81 | )
82 | SELECT start_time, extract(hour from start_time), extract(day from start_time), extract(week from start_time),
83 | extract(month from start_time), extract(year from start_time), extract(dayofweek from start_time)
84 | FROM songplays
85 | """)
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/__init__.py:
--------------------------------------------------------------------------------
1 | from operators.stage_redshift import StageToRedshiftOperator
2 | from operators.load_fact import LoadFactOperator
3 | from operators.load_dimension import LoadDimensionOperator
4 | from operators.data_quality import DataQualityOperator
5 | from operators.create_tables import CreateTablesOperator
6 |
7 | __all__ = [
8 | 'StageToRedshiftOperator',
9 | 'LoadFactOperator',
10 | 'LoadDimensionOperator',
11 | 'DataQualityOperator',
12 | 'CreateTablesOperator'
13 | ]
14 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/create_tables.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.contrib.hooks.aws_hook import AwsHook
3 | from airflow.models import BaseOperator
4 | from airflow.utils.decorators import apply_defaults
5 |
6 |
7 | class CreateTablesOperator(BaseOperator):
8 | ui_color = '#358140'
9 | sql_statement_file='create_tables.sql'
10 |
11 | @apply_defaults
12 | def __init__(self,
13 | redshift_conn_id="",
14 | *args, **kwargs):
15 |
16 | super(CreateTablesOperator, self).__init__(*args, **kwargs)
17 | self.redshift_conn_id = redshift_conn_id
18 |
19 | def execute(self, context):
20 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
21 |
22 | self.log.info("Creating Redshift tables ")
23 |
24 | fd = open(CreateTablesOperator.sql_statement_file, 'r')
25 | sql_file = fd.read()
26 | fd.close()
27 |
28 | sql_commands = sql_file.split(';')
29 |
30 | for command in sql_commands:
31 | if command.rstrip() != '':
32 | redshift.run(command)
33 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/create_tables.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS public.artists (
2 | artistid varchar(256) NOT NULL,
3 | name varchar(256),
4 | location varchar(256),
5 | lattitude numeric(18,0),
6 | longitude numeric(18,0)
7 | );
8 |
9 | CREATE TABLE IF NOT EXISTS public.songplays (
10 | playid varchar(32) NOT NULL,
11 | start_time timestamp NOT NULL,
12 | userid int4 NOT NULL,
13 | "level" varchar(256),
14 | songid varchar(256),
15 | artistid varchar(256),
16 | sessionid int4,
17 | location varchar(256),
18 | user_agent varchar(256),
19 | CONSTRAINT songplays_pkey PRIMARY KEY (playid)
20 | );
21 |
22 | CREATE TABLE IF NOT EXISTS public.songs (
23 | songid varchar(256) NOT NULL,
24 | title varchar(256),
25 | artistid varchar(256),
26 | "year" int4,
27 | duration numeric(18,0),
28 | CONSTRAINT songs_pkey PRIMARY KEY (songid)
29 | );
30 |
31 | CREATE TABLE IF NOT EXISTS public.staging_events (
32 | artist varchar(256),
33 | auth varchar(256),
34 | firstname varchar(256),
35 | gender varchar(256),
36 | iteminsession int4,
37 | lastname varchar(256),
38 | length numeric(18,0),
39 | "level" varchar(256),
40 | location varchar(256),
41 | "method" varchar(256),
42 | page varchar(256),
43 | registration numeric(18,0),
44 | sessionid int4,
45 | song varchar(256),
46 | status int4,
47 | ts int8,
48 | useragent varchar(256),
49 | userid int4
50 | );
51 |
52 | CREATE TABLE IF NOT EXISTS public.staging_songs (
53 | num_songs int4,
54 | artist_id varchar(256),
55 | artist_name varchar(256),
56 | artist_latitude numeric(18,0),
57 | artist_longitude numeric(18,0),
58 | artist_location varchar(256),
59 | song_id varchar(256),
60 | title varchar(256),
61 | duration numeric(18,0),
62 | "year" int4
63 | );
64 |
65 | CREATE TABLE IF NOT EXIST public."time" (
66 | start_time timestamp NOT NULL,
67 | "hour" int4,
68 | "day" int4,
69 | week int4,
70 | "month" varchar(256),
71 | "year" int4,
72 | weekday varchar(256),
73 | CONSTRAINT time_pkey PRIMARY KEY (start_time)
74 | ) ;
75 |
76 | CREATE TABLE IF NOT EXISTS public.users (
77 | userid int4 NOT NULL,
78 | first_name varchar(256),
79 | last_name varchar(256),
80 | gender varchar(256),
81 | "level" varchar(256),
82 | CONSTRAINT users_pkey PRIMARY KEY (userid)
83 | );
84 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/data_quality.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 |
5 |
6 | class DataQualityOperator(BaseOperator):
7 |
8 | ui_color = '#89DA59'
9 |
10 | @apply_defaults
11 | def __init__(self,
12 | aws_credentials_id="",
13 | redshift_conn_id="",
14 | tables=[],
15 | *args, **kwargs):
16 | super(DataQualityOperator, self).__init__(*args, **kwargs)
17 |         self.aws_credentials_id = aws_credentials_id
18 |         self.redshift_conn_id = redshift_conn_id
19 | self.tables = tables
20 |
21 | def execute(self, context):
22 | redshift_hook = PostgresHook(self.redshift_conn_id)
23 | for table in self.tables:
24 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table}")
25 | if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
26 | self.log.error(f"Data quality check failed. {table} returned no results")
27 | raise ValueError(f"Data quality check failed. {table} returned no results")
28 | self.log.info(f"Data quality on table {table} check passed with {records[0][0]} records")
29 |
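A minimal usage sketch (not taken from this repo): how this operator might be wired into the project's DAG. The import path, DAG name, and connection id below are assumptions.

    import datetime
    from airflow import DAG
    from airflow.operators.udacity_plugin import DataQualityOperator  # assumed plugin import path

    dag = DAG("data_quality_example", start_date=datetime.datetime(2019, 1, 1), schedule_interval=None)

    run_quality_checks = DataQualityOperator(
        task_id="run_data_quality_checks",
        dag=dag,
        redshift_conn_id="redshift",  # Airflow connection pointing at the Redshift cluster
        tables=["songplays", "users", "songs", "artists", "time"],  # tables from create_tables.sql
    )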
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/load_dimension.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 |
5 |
6 | class LoadDimensionOperator(BaseOperator):
7 |
8 | ui_color = '#80BD9E'
9 |
10 | @apply_defaults
11 | def __init__(self,
12 | redshift_conn_id="",
13 | sql_query="",
14 | table="",
15 | truncate=False,
16 | *args, **kwargs):
17 | super(LoadDimensionOperator, self).__init__(*args, **kwargs)
18 | self.redshift_conn_id = redshift_conn_id
19 | self.sql_query = sql_query
20 | self.table = table
21 | self.truncate = truncate
22 |
23 | def execute(self, context):
24 | """
25 | Insert data into dimensional tables from staging events and song data.
26 | When truncate is set, the target table is emptied prior to the load (truncate-insert).
27 | """
28 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
29 | if self.truncate:
30 | redshift.run(f"TRUNCATE TABLE {self.table}")
31 | formatted_sql = self.sql_query.format(self.table)
32 | redshift.run(formatted_sql)
33 | self.log.info(f"Success: {self.task_id}")
34 |
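A hedged sketch of the truncate-insert usage described in the docstring. The plugin import path and the SqlQueries helper are assumptions (the project's SQL helper module is not shown in this file); note that execute() formats sql_query with the table name, so the statement is expected to contain a placeholder for it.

    import datetime
    from airflow import DAG
    from airflow.operators.udacity_plugin import LoadDimensionOperator  # assumed plugin import path
    from helpers import SqlQueries  # assumed helper module holding the INSERT/SELECT statements

    dag = DAG("load_dimension_example", start_date=datetime.datetime(2019, 1, 1), schedule_interval=None)

    load_user_dim_table = LoadDimensionOperator(
        task_id="load_user_dim_table",
        dag=dag,
        redshift_conn_id="redshift",
        table="users",
        sql_query=SqlQueries.user_table_insert,  # assumed attribute; formatted with the table name in execute()
        truncate=True,  # empty the target table before inserting (truncate-insert)
    )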
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/load_fact.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.models import BaseOperator
3 | from airflow.utils.decorators import apply_defaults
4 |
5 |
6 | class LoadFactOperator(BaseOperator):
7 |
8 | ui_color = '#F98866'
9 |
10 | @apply_defaults
11 | def __init__(self,
12 | aws_credentials_id="",
13 | redshift_conn_id="",
14 | sql_query="",
15 | *args, **kwargs):
16 |
17 | super(LoadFactOperator, self).__init__(*args, **kwargs)
18 | self.aws_credentials_id = aws_credentials_id
19 | self.redshift_conn_id = redshift_conn_id
20 | self.sql_query = sql_query
21 |
22 | def execute(self, context):
23 | redshift_hook = PostgresHook(self.redshift_conn_id)
24 | redshift_hook.run(self.sql_query)
25 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/P5_Data_Pipelines/plugins/operators/stage_redshift.py:
--------------------------------------------------------------------------------
1 | from airflow.hooks.postgres_hook import PostgresHook
2 | from airflow.contrib.hooks.aws_hook import AwsHook
3 | from airflow.models import BaseOperator
4 | from airflow.utils.decorators import apply_defaults
5 |
6 |
7 | class StageToRedshiftOperator(BaseOperator):
8 | ui_color = '#358140'
9 | template_fields = ("s3_key",)
10 | copy_sql = """
11 | COPY {}
12 | FROM '{}'
13 | ACCESS_KEY_ID '{}'
14 | SECRET_ACCESS_KEY '{}'
15 | REGION '{}'
16 | TIMEFORMAT as 'epochmillisecs'
17 | TRUNCATECOLUMNS BLANKSASNULL EMPTYASNULL
18 | {} 'auto'
19 | {}
20 | """
21 |
22 | @apply_defaults
23 | def __init__(self,
24 | redshift_conn_id="",
25 | aws_credentials_id="",
26 | table="",
27 | s3_bucket="",
28 | s3_key="",
29 | region="",
30 | file_format="JSON",
31 | *args, **kwargs):
32 |
33 | super(StageToRedshiftOperator, self).__init__(*args, **kwargs)
34 | self.table = table
35 | self.redshift_conn_id = redshift_conn_id
36 | self.s3_bucket = s3_bucket
37 | self.s3_key = s3_key
38 | self.region = region
39 | self.file_format = file_format
40 | self.aws_credentials_id = aws_credentials_id
41 | self.execution_date = kwargs.get('execution_date')
42 |
43 | def execute(self, context):
44 | """
45 | Copy data from S3 into staging tables on the Redshift cluster.
46 | - redshift_conn_id: redshift cluster connection
47 | - aws_credentials_id: AWS connection
48 | - table: redshift cluster table name
49 | - s3_bucket: S3 bucket name holding source data
50 | - s3_key: S3 key files of source data
51 | - file_format: source file format - options JSON, CSV
52 | """
53 | aws_hook = AwsHook(self.aws_credentials_id)
54 | credentials = aws_hook.get_credentials()
55 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
56 |
57 | self.log.info("Clearing data from destination Redshift table")
58 | redshift.run("DELETE FROM {}".format(self.table))
59 |
60 | self.log.info("Copying data from S3 to Redshift")
61 |
62 | s3_path = "s3://{}".format(self.s3_bucket)
63 | if self.execution_date:
64 | # Backfill a specific date
65 | year = self.execution_date.strftime("%Y")
66 | month = self.execution_date.strftime("%m")
67 | day = self.execution_date.strftime("%d")
68 | s3_path = '/'.join([s3_path, str(year), str(month), str(day)])
69 | s3_path = s3_path + '/' + self.s3_key
70 |
71 | additional = ""
72 | if self.file_format == 'CSV':
73 | additional = " DELIMITER ',' IGNOREHEADER 1 "
74 |
75 | formatted_sql = StageToRedshiftOperator.copy_sql.format(
76 | self.table,
77 | s3_path,
78 | credentials.access_key,
79 | credentials.secret_key,
80 | self.region,
81 | self.file_format,
82 | additional
83 | )
84 | redshift.run(formatted_sql)
85 |
86 | self.log.info(f"Success: Copying {self.table} from S3 to Redshift")
87 |
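A hedged usage sketch for staging the event logs with this operator. The bucket name appears elsewhere in this repo, but the key, region, DAG name, and import path are assumptions.

    import datetime
    from airflow import DAG
    from airflow.operators.udacity_plugin import StageToRedshiftOperator  # assumed plugin import path

    dag = DAG("stage_redshift_example", start_date=datetime.datetime(2019, 1, 1), schedule_interval="@hourly")

    stage_events_to_redshift = StageToRedshiftOperator(
        task_id="stage_events",
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table="staging_events",
        s3_bucket="udacity-dend",  # bucket used in the exercises; assumed for the project
        s3_key="log_data",         # prefix under the bucket; illustrative
        region="us-west-2",        # assumed region of the source bucket
        file_format="JSON",        # JSON 'auto' COPY; pass "CSV" for delimited files
    )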
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/exercises/__init__.py
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/1_ex1_hello_world.py:
--------------------------------------------------------------------------------
1 | # Define a function that uses the Python logger to log a message.
2 | # Then finish filling in the details of the DAG down below.
3 | # Once you’ve done that, run "/opt/airflow/start.sh" command to start the web server.
4 | # Once the Airflow web server is ready, open the Airflow UI using the "Access Airflow" button.
5 | # Turn your DAG “On”, and then Run your DAG.
6 |
7 | import datetime
8 | import logging
9 |
10 | from airflow import DAG
11 | from airflow.operators.python_operator import PythonOperator
12 |
13 |
14 | def hello_world():
15 | logging.info("Hello Flor!")
16 |
17 |
18 | dag = DAG(
19 | 'lesson1.solution1',
20 | start_date=datetime.datetime.now())
21 |
22 | greet_task = PythonOperator(
23 | task_id="hello_world_task",
24 | python_callable=hello_world,
25 | dag=dag
26 | )
27 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/1_ex2_scheduler.py:
--------------------------------------------------------------------------------
1 | # Instructions
2 | # Complete the TODOs in this DAG so that it runs once a day.
3 | # Once you’ve done that, open the Airflow UI using the "Access Airflow" button.
4 | # Go to the Airflow UI, turn the last exercise off, then turn this exercise on.
5 | # Wait a moment and refresh the UI to see Airflow automatically run your DAG.
6 |
7 | import datetime
8 | import logging
9 |
10 | from airflow import DAG
11 | from airflow.operators.python_operator import PythonOperator
12 |
13 |
14 | def hello_world():
15 | logging.info("Hello World")
16 |
17 | dag = DAG(
18 | "lesson1.exercise2",
19 | start_date=datetime.datetime.now() - datetime.timedelta(days=2),
20 | schedule_interval="@daily")
21 |
22 | task = PythonOperator(
23 | task_id="hello_world_task",
24 | python_callable=hello_world,
25 | dag=dag)
26 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/1_ex3_dependencies.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 |
4 | from airflow import DAG
5 | from airflow.operators.python_operator import PythonOperator
6 |
7 |
8 | def hello_world():
9 | logging.info("Hello World")
10 |
11 |
12 | def addition():
13 | logging.info(f"2 + 2 = {2+2}")
14 |
15 |
16 | def subtraction():
17 | logging.info(f"6 - 2 = {6-2}")
18 |
19 |
20 | def division():
21 | logging.info(f"10 / 2 = {int(10/2)}")
22 |
23 |
24 | dag = DAG(
25 | "lesson1.solution3",
26 | schedule_interval='@hourly',
27 | start_date=datetime.datetime.now() - datetime.timedelta(days=1))
28 |
29 | hello_world_task = PythonOperator(
30 | task_id="hello_world",
31 | python_callable=hello_world,
32 | dag=dag)
33 |
34 | addition_task = PythonOperator(
35 | task_id="addition",
36 | python_callable=addition,
37 | dag=dag)
38 |
39 | subtraction_task = PythonOperator(
40 | task_id="subtraction",
41 | python_callable=subtraction,
42 | dag=dag)
43 |
44 | division_task = PythonOperator(
45 | task_id="division",
46 | python_callable=division,
47 | dag=dag)
48 |
49 | # Configure Task Dependencies
50 | hello_world_task >> addition_task
51 | hello_world_task >> subtraction_task
52 |
53 | subtraction_task >> division_task
54 | addition_task >> division_task
55 |
56 |
57 | # -> addition_task
58 | # / \
59 | # hello_world_task -> division_task
60 | # \ /
61 | # ->subtraction_task
62 |
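The same diamond-shaped graph can be wired more compactly with Airflow's list form of the bitshift operators (supported on recent 1.10.x releases); this sketch assumes the four task objects defined above are in scope.

    # Equivalent to the four dependency statements above.
    hello_world_task >> [addition_task, subtraction_task]
    [addition_task, subtraction_task] >> division_task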
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/1_ex4_connections.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 |
4 | from airflow import DAG
5 | from airflow.models import Variable
6 | from airflow.operators.python_operator import PythonOperator
7 | from airflow.hooks.S3_hook import S3Hook
8 |
9 | #
10 | # We're going to create a connection and a variable.
11 | # 1. Open your browser to localhost:8080 and open Admin->Variables
12 | # 2. Click "Create"
13 | # 3. Set "Key" equal to "s3_bucket" and set "Val" equal to "udacity-dend"
14 | # 4. Set "Key" equal to "s3_prefix" and set "Val" equal to "data-pipelines"
15 | # 5. Click save
16 | # 6. Open Admin->Connections
17 | # 7. Click "Create"
18 | # 8. Set "Conn Id" to "aws_credentials", "Conn Type" to "Amazon Web Services"
19 | # 9. Set "Login" to your aws_access_key_id and "Password" to your aws_secret_key
20 | # 10. Click save
21 | # 11. Run the DAG
22 |
23 | def list_keys():
24 | hook = S3Hook(aws_conn_id='aws_credentials')
25 | bucket = Variable.get('s3_bucket')
26 | prefix = Variable.get('s3_prefix')
27 | logging.info(f"Listing Keys from {bucket}/{prefix}")
28 | keys = hook.list_keys(bucket, prefix=prefix)
29 | for key in keys:
30 | logging.info(f"- s3://{bucket}/{key}")
31 |
32 |
33 | dag = DAG(
34 | 'lesson1.exercise4',
35 | start_date=datetime.datetime.now())
36 |
37 | list_task = PythonOperator(
38 | task_id="list_keys",
39 | python_callable=list_keys,
40 | dag=dag
41 | )
42 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/1_ex5_context.py:
--------------------------------------------------------------------------------
1 | # Instructions
2 | # Use the Airflow context in the PythonOperator to complete the TODOs below. Once you are done, run your DAG and check the logs to see the context in use.
3 |
4 | import datetime
5 | import logging
6 |
7 | from airflow import DAG
8 | from airflow.models import Variable
9 | from airflow.operators.python_operator import PythonOperator
10 | from airflow.hooks.S3_hook import S3Hook
11 |
12 |
13 | # TODO: Extract ds, run_id, prev_ds, and next_ds from the kwargs, and log them
14 | # NOTE: Look here for context variables passed in on kwargs:
15 | # https://airflow.apache.org/code.html#macros
16 | def log_details(*args, **kwargs):
17 | logging.info(f"Execution date is {kwargs['ds']}")
18 | logging.info(f"My run id is {kwargs['run_id']}")
19 | previous_ds = kwargs.get('prev_ds')
20 | if previous_ds:
21 | logging.info(f"My previous run was on {previous_ds}")
22 | next_ds = kwargs.get('next_ds')
23 | if next_ds:
24 | logging.info(f"My next run will be {next_ds}")
25 |
26 | dag = DAG(
27 | 'lesson1.solution5',
28 | schedule_interval="@daily",
29 | start_date=datetime.datetime.now() - datetime.timedelta(days=2)
30 | )
31 |
32 | list_task = PythonOperator(
33 | task_id="log_details",
34 | python_callable=log_details,
35 | provide_context=True,
36 | dag=dag
37 | )
38 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/1_ex6_redshift_queries.py:
--------------------------------------------------------------------------------
1 | # Instructions
2 | # Similar to what you saw in the demo, copy and populate the trips table.
3 | # Then, add another operator which creates a traffic analysis table from the trips table you created.
4 | # Note, in this class, we won’t be writing SQL -- all of the SQL statements we run against Redshift are predefined and included in your lesson.
5 |
6 | import datetime
7 | import logging
8 |
9 | from airflow import DAG
10 | from airflow.contrib.hooks.aws_hook import AwsHook
11 | from airflow.hooks.postgres_hook import PostgresHook
12 | from airflow.operators.postgres_operator import PostgresOperator
13 | from airflow.operators.python_operator import PythonOperator
14 |
15 | import sql_statements
16 |
17 |
18 | def load_data_to_redshift(*args, **kwargs):
19 | aws_hook = AwsHook("aws_credentials")
20 | credentials = aws_hook.get_credentials()
21 | redshift_hook = PostgresHook("redshift")
22 | redshift_hook.run(sql_statements.COPY_ALL_TRIPS_SQL.format(credentials.access_key, credentials.secret_key))
23 |
24 |
25 | dag = DAG(
26 | 'lesson1.solution6',
27 | start_date=datetime.datetime.now()
28 | )
29 |
30 | create_table = PostgresOperator(
31 | task_id="create_table",
32 | dag=dag,
33 | postgres_conn_id="redshift",
34 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
35 | )
36 |
37 | copy_task = PythonOperator(
38 | task_id='load_from_s3_to_redshift',
39 | dag=dag,
40 | python_callable=load_data_to_redshift
41 | )
42 |
43 | location_traffic_task = PostgresOperator(
44 | task_id="calculate_location_traffic",
45 | dag=dag,
46 | postgres_conn_id="redshift",
47 | sql=sql_statements.LOCATION_TRAFFIC_SQL
48 | )
49 |
50 | create_table >> copy_task
51 | copy_task >> location_traffic_task
52 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/2_ex1_data_lineage.py:
--------------------------------------------------------------------------------
1 | #Instructions
2 | #1 - Run the DAG as it is first, and observe the Airflow UI
3 | #2 - Next, open up the DAG and add the copy and load tasks as directed in the TODOs
4 | #3 - Reload the Airflow UI and run the DAG once more, observing the Airflow UI
5 |
6 | import datetime
7 | import logging
8 |
9 | from airflow import DAG
10 | from airflow.contrib.hooks.aws_hook import AwsHook
11 | from airflow.hooks.postgres_hook import PostgresHook
12 | from airflow.operators.postgres_operator import PostgresOperator
13 | from airflow.operators.python_operator import PythonOperator
14 |
15 | import sql_statements
16 |
17 |
18 | def load_trip_data_to_redshift(*args, **kwargs):
19 | aws_hook = AwsHook("aws_credentials")
20 | credentials = aws_hook.get_credentials()
21 | redshift_hook = PostgresHook("redshift")
22 | sql_stmt = sql_statements.COPY_ALL_TRIPS_SQL.format(
23 | credentials.access_key,
24 | credentials.secret_key,
25 | )
26 | redshift_hook.run(sql_stmt)
27 |
28 |
29 | def load_station_data_to_redshift(*args, **kwargs):
30 | aws_hook = AwsHook("aws_credentials")
31 | credentials = aws_hook.get_credentials()
32 | redshift_hook = PostgresHook("redshift")
33 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format(
34 | credentials.access_key,
35 | credentials.secret_key,
36 | )
37 | redshift_hook.run(sql_stmt)
38 |
39 |
40 | dag = DAG(
41 | 'lesson2.exercise1',
42 | start_date=datetime.datetime.now()
43 | )
44 |
45 | create_trips_table = PostgresOperator(
46 | task_id="create_trips_table",
47 | dag=dag,
48 | postgres_conn_id="redshift",
49 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
50 | )
51 |
52 | copy_trips_task = PythonOperator(
53 | task_id='load_trips_from_s3_to_redshift',
54 | dag=dag,
55 | python_callable=load_trip_data_to_redshift,
56 | )
57 |
58 | create_stations_table = PostgresOperator(
59 | task_id="create_stations_table",
60 | dag=dag,
61 | postgres_conn_id="redshift",
62 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
63 | )
64 |
65 | copy_stations_task = PythonOperator(
66 | task_id='load_stations_from_s3_to_redshift',
67 | dag=dag,
68 | python_callable=load_station_data_to_redshift,
69 | )
70 |
71 | create_trips_table >> copy_trips_task
72 | create_stations_table >> copy_stations_task
73 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/2_ex2_schedule_backfilling.py:
--------------------------------------------------------------------------------
1 | #Instructions
2 | #1 - Revisit our bikeshare traffic
3 | #2 - Update our DAG with
4 | # a - @monthly schedule_interval
5 | # b - max_active_runs of 1
6 | # c - start_date of 2018/01/01
7 | # d - end_date of 2018/02/01
8 | # Use Airflow’s backfill capabilities to analyze our trip data on a monthly basis over 2 historical runs
9 |
10 | import datetime
11 | import logging
12 |
13 | from airflow import DAG
14 | from airflow.contrib.hooks.aws_hook import AwsHook
15 | from airflow.hooks.postgres_hook import PostgresHook
16 | from airflow.operators.postgres_operator import PostgresOperator
17 | from airflow.operators.python_operator import PythonOperator
18 |
19 | import sql_statements
20 |
21 |
22 | def load_trip_data_to_redshift(*args, **kwargs):
23 | aws_hook = AwsHook("aws_credentials")
24 | credentials = aws_hook.get_credentials()
25 | redshift_hook = PostgresHook("redshift")
26 | sql_stmt = sql_statements.COPY_ALL_TRIPS_SQL.format(
27 | credentials.access_key,
28 | credentials.secret_key,
29 | )
30 | redshift_hook.run(sql_stmt)
31 |
32 |
33 | def load_station_data_to_redshift(*args, **kwargs):
34 | aws_hook = AwsHook("aws_credentials")
35 | credentials = aws_hook.get_credentials()
36 | redshift_hook = PostgresHook("redshift")
37 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format(
38 | credentials.access_key,
39 | credentials.secret_key,
40 | )
41 | redshift_hook.run(sql_stmt)
42 |
43 |
44 | dag = DAG(
45 | 'lesson2.exercise2',
46 | start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
47 | end_date=datetime.datetime(2018, 2, 1, 0, 0, 0, 0),
48 | schedule_interval='@monthly',
49 | max_active_runs=1
50 | )
51 |
52 | create_trips_table = PostgresOperator(
53 | task_id="create_trips_table",
54 | dag=dag,
55 | postgres_conn_id="redshift",
56 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
57 | )
58 |
59 | copy_trips_task = PythonOperator(
60 | task_id='load_trips_from_s3_to_redshift',
61 | dag=dag,
62 | python_callable=load_trip_data_to_redshift,
63 | provide_context=True,
64 | )
65 |
66 | create_stations_table = PostgresOperator(
67 | task_id="create_stations_table",
68 | dag=dag,
69 | postgres_conn_id="redshift",
70 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
71 | )
72 |
73 | copy_stations_task = PythonOperator(
74 | task_id='load_stations_from_s3_to_redshift',
75 | dag=dag,
76 | python_callable=load_station_data_to_redshift,
77 | )
78 |
79 | create_trips_table >> copy_trips_task
80 | create_stations_table >> copy_stations_task
81 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/2_ex3_data_partitioning.py:
--------------------------------------------------------------------------------
1 | #Instructions
2 | #1 - Modify the bikeshare DAG to load data month by month, instead of loading it all at once, every time.
3 | #2 - Use time partitioning to parallelize the execution of the DAG.
4 |
5 | import datetime
6 | import logging
7 |
8 | from airflow import DAG
9 | from airflow.contrib.hooks.aws_hook import AwsHook
10 | from airflow.hooks.postgres_hook import PostgresHook
11 | from airflow.operators.postgres_operator import PostgresOperator
12 | from airflow.operators.python_operator import PythonOperator
13 |
14 | import sql_statements
15 |
16 |
17 | def load_trip_data_to_redshift(*args, **kwargs):
18 | aws_hook = AwsHook("aws_credentials")
19 | credentials = aws_hook.get_credentials()
20 | redshift_hook = PostgresHook("redshift")
21 | execution_date = kwargs["execution_date"]
22 | sql_stmt = sql_statements.COPY_MONTHLY_TRIPS_SQL.format(
23 | credentials.access_key,
24 | credentials.secret_key,
25 | year=execution_date.year,
26 | month=execution_date.month
27 | )
28 | redshift_hook.run(sql_stmt)
29 |
30 |
31 | def load_station_data_to_redshift(*args, **kwargs):
32 | aws_hook = AwsHook("aws_credentials")
33 | credentials = aws_hook.get_credentials()
34 | redshift_hook = PostgresHook("redshift")
35 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format(
36 | credentials.access_key,
37 | credentials.secret_key,
38 | )
39 | redshift_hook.run(sql_stmt)
40 |
41 |
42 | dag = DAG(
43 | 'lesson2.exercise3',
44 | start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
45 | end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
46 | schedule_interval='@monthly',
47 | max_active_runs=1
48 | )
49 |
50 | create_trips_table = PostgresOperator(
51 | task_id="create_trips_table",
52 | dag=dag,
53 | postgres_conn_id="redshift",
54 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
55 | )
56 |
57 | copy_trips_task = PythonOperator(
58 | task_id='load_trips_from_s3_to_redshift',
59 | dag=dag,
60 | python_callable=load_trip_data_to_redshift,
61 | provide_context=True,
62 | )
63 |
64 | create_stations_table = PostgresOperator(
65 | task_id="create_stations_table",
66 | dag=dag,
67 | postgres_conn_id="redshift",
68 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
69 | )
70 |
71 | copy_stations_task = PythonOperator(
72 | task_id='load_stations_from_s3_to_redshift',
73 | dag=dag,
74 | python_callable=load_station_data_to_redshift,
75 | )
76 |
77 | create_trips_table >> copy_trips_task
78 | create_stations_table >> copy_stations_task
79 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/2_ex4_data_quality.py:
--------------------------------------------------------------------------------
1 | # Instructions
2 | # 1 - Set an SLA on our bikeshare traffic calculation operator
3 | # 2 - Add data verification step after the load step from s3 to redshift
4 | # 3 - Add data verification step after we calculate our output table
5 |
6 | import datetime
7 | import logging
8 |
9 | from airflow import DAG
10 | from airflow.contrib.hooks.aws_hook import AwsHook
11 | from airflow.hooks.postgres_hook import PostgresHook
12 | from airflow.operators.postgres_operator import PostgresOperator
13 | from airflow.operators.python_operator import PythonOperator
14 |
15 | import sql_statements
16 |
17 |
18 | def load_trip_data_to_redshift(*args, **kwargs):
19 | aws_hook = AwsHook("aws_credentials")
20 | credentials = aws_hook.get_credentials()
21 | redshift_hook = PostgresHook("redshift")
22 | execution_date = kwargs["execution_date"]
23 | sql_stmt = sql_statements.COPY_MONTHLY_TRIPS_SQL.format(
24 | credentials.access_key,
25 | credentials.secret_key,
26 | year=execution_date.year,
27 | month=execution_date.month
28 | )
29 | redshift_hook.run(sql_stmt)
30 |
31 |
32 | def load_station_data_to_redshift(*args, **kwargs):
33 | aws_hook = AwsHook("aws_credentials")
34 | credentials = aws_hook.get_credentials()
35 | redshift_hook = PostgresHook("redshift")
36 | sql_stmt = sql_statements.COPY_STATIONS_SQL.format(
37 | credentials.access_key,
38 | credentials.secret_key,
39 | )
40 | redshift_hook.run(sql_stmt)
41 |
42 |
43 | def check_greater_than_zero(*args, **kwargs):
44 | table = kwargs["params"]["table"]
45 | redshift_hook = PostgresHook("redshift")
46 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table}")
47 | if len(records) < 1 or len(records[0]) < 1:
48 | raise ValueError(f"Data quality check failed. {table} returned no results")
49 | num_records = records[0][0]
50 | if num_records < 1:
51 | raise ValueError(f"Data quality check failed. {table} contained 0 rows")
52 | logging.info(f"Data quality on table {table} check passed with {records[0][0]} records")
53 |
54 |
55 | dag = DAG(
56 | 'lesson2.exercise4',
57 | start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
58 | end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
59 | schedule_interval='@monthly',
60 | max_active_runs=1
61 | )
62 |
63 | create_trips_table = PostgresOperator(
64 | task_id="create_trips_table",
65 | dag=dag,
66 | postgres_conn_id="redshift",
67 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
68 | )
69 |
70 | copy_trips_task = PythonOperator(
71 | task_id='load_trips_from_s3_to_redshift',
72 | dag=dag,
73 | python_callable=load_trip_data_to_redshift,
74 | provide_context=True,
75 | )
76 |
77 | check_trips = PythonOperator(
78 | task_id='check_trips_data',
79 | dag=dag,
80 | python_callable=check_greater_than_zero,
81 | provide_context=True,
82 | params={
83 | 'table': 'trips',
84 | }
85 | )
86 |
87 | create_stations_table = PostgresOperator(
88 | task_id="create_stations_table",
89 | dag=dag,
90 | postgres_conn_id="redshift",
91 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
92 | )
93 |
94 | copy_stations_task = PythonOperator(
95 | task_id='load_stations_from_s3_to_redshift',
96 | dag=dag,
97 | python_callable=load_station_data_to_redshift,
98 | )
99 |
100 | check_stations = PythonOperator(
101 | task_id='check_stations_data',
102 | dag=dag,
103 | python_callable=check_greater_than_zero,
104 | provide_context=True,
105 | params={
106 | 'table': 'stations',
107 | }
108 | )
109 |
110 | create_trips_table >> copy_trips_task
111 | create_stations_table >> copy_stations_task
112 | copy_stations_task >> check_stations
113 | copy_trips_task >> check_trips
114 |
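Step 1 of the instructions above (an SLA on the traffic-calculation operator) is not implemented in this DAG. A hedged sketch of how it could look, reusing LOCATION_TRAFFIC_SQL and the names already imported in this file; the task id, the one-hour window, and the dependency wiring are illustrative.

    location_traffic_task = PostgresOperator(
        task_id="calculate_location_traffic",
        dag=dag,
        postgres_conn_id="redshift",
        sql=sql_statements.LOCATION_TRAFFIC_SQL,
        sla=datetime.timedelta(hours=1),  # flag the task if it has not completed within an hour
    )

    # Run the traffic calculation only after both quality checks pass (illustrative ordering).
    check_trips >> location_traffic_task
    check_stations >> location_traffic_task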
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/3_ex1_plugins.py:
--------------------------------------------------------------------------------
1 | # Instructions
2 | # In this exercise, we’ll consolidate repeated code into Operator Plugins
3 | # 1 - Move the data quality check logic into a custom operator
4 | # 2 - Replace the data quality check PythonOperators with our new custom operator
5 | # 3 - Consolidate both the S3 to RedShift functions into a custom operator
6 | # 4 - Replace the S3 to RedShift PythonOperators with our new custom operator
7 | # 5 - Execute the DAG
8 |
9 | import datetime
10 | import logging
11 |
12 | from airflow import DAG
13 | from airflow.contrib.hooks.aws_hook import AwsHook
14 | from airflow.hooks.postgres_hook import PostgresHook
15 |
16 | from airflow.operators import (
17 | HasRowsOperator,
18 | PostgresOperator,
19 | PythonOperator,
20 | S3ToRedshiftOperator
21 | )
22 |
23 | import sql_statements
24 |
25 |
26 | dag = DAG(
27 | "lesson3.exercise1",
28 | start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
29 | end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
30 | schedule_interval="@monthly",
31 | max_active_runs=1
32 | )
33 |
34 | create_trips_table = PostgresOperator(
35 | task_id="create_trips_table",
36 | dag=dag,
37 | postgres_conn_id="redshift",
38 | sql=sql_statements.CREATE_TRIPS_TABLE_SQL
39 | )
40 |
41 | copy_trips_task = S3ToRedshiftOperator(
42 | task_id="load_trips_from_s3_to_redshift",
43 | dag=dag,
44 | table="trips",
45 | redshift_conn_id="redshift",
46 | aws_credentials_id="aws_credentials",
47 | s3_bucket="udac-data-pipelines",
48 | s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
49 | )
50 |
51 | check_trips = HasRowsOperator(
52 | task_id='check_trips_data',
53 | dag=dag,
54 | redshift_conn_id="redshift",
55 | table='trips',
56 | provide_context=True
57 | )
58 |
59 | create_stations_table = PostgresOperator(
60 | task_id="create_stations_table",
61 | dag=dag,
62 | postgres_conn_id="redshift",
63 | sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
64 | )
65 |
66 | copy_stations_task = S3ToRedshiftOperator(
67 | task_id="load_stations_from_s3_to_redshift",
68 | dag=dag,
69 | redshift_conn_id="redshift",
70 | aws_credentials_id="aws_credentials",
71 | s3_bucket="udac-data-pipelines",
72 | s3_key="divvy/unpartitioned/divvy_stations_2017.csv",
73 | table="stations"
74 | )
75 |
76 | check_stations = HasRowsOperator(
77 | task_id='check_stations_data',
78 | dag=dag,
79 | redshift_conn_id="redshift",
80 | table='stations',
81 | provide_context=True
82 | )
83 |
84 | create_trips_table >> copy_trips_task
85 | create_stations_table >> copy_stations_task
86 | copy_stations_task >> check_stations
87 | copy_trips_task >> check_trips
88 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/3_ex2_refactoring.py:
--------------------------------------------------------------------------------
1 | # Instructions
2 | # In this exercise, we’ll refactor a DAG with a single overloaded task into a DAG with several tasks with well-defined boundaries
3 | # 1 - Read through the DAG and identify points in the DAG that could be split apart
4 | # 2 - Split the DAG into multiple PythonOperators
5 | # 3 - Run the DAG
6 |
7 | import datetime
8 | import logging
9 |
10 | from airflow import DAG
11 | from airflow.hooks.postgres_hook import PostgresHook
12 |
13 | from airflow.operators.postgres_operator import PostgresOperator
14 | from airflow.operators.python_operator import PythonOperator
15 |
16 |
17 | def log_oldest():
18 | redshift_hook = PostgresHook("redshift")
19 | records = redshift_hook.get_records("""
20 | SELECT birthyear FROM older_riders ORDER BY birthyear ASC LIMIT 1
21 | """)
22 | if len(records) > 0 and len(records[0]) > 0:
23 | logging.info(f"Oldest rider was born in {records[0][0]}")
24 |
25 |
26 | def log_youngest():
27 | redshift_hook = PostgresHook("redshift")
28 | records = redshift_hook.get_records("""
29 | SELECT birthyear FROM younger_riders ORDER BY birthyear DESC LIMIT 1
30 | """)
31 | if len(records) > 0 and len(records[0]) > 0:
32 | logging.info(f"Youngest rider was born in {records[0][0]}")
33 |
34 |
35 | dag = DAG(
36 | "lesson3.exercise2",
37 | start_date=datetime.datetime.utcnow()
38 | )
39 |
40 | # Find all trips taken by older riders (born in or before 1945)
41 | create_oldest_task = PostgresOperator(
42 | task_id="create_oldest",
43 | dag=dag,
44 | sql="""
45 | BEGIN;
46 | DROP TABLE IF EXISTS older_riders;
47 | CREATE TABLE older_riders AS (
48 | SELECT * FROM trips WHERE birthyear > 0 AND birthyear <= 1945
49 | );
50 | COMMIT;
51 | """,
52 | postgres_conn_id="redshift"
53 | )
54 |
55 | log_oldest_task = PythonOperator(
56 | task_id="log_oldest",
57 | dag=dag,
58 | python_callable=log_oldest
59 | )
60 |
61 | create_youngest_task = PostgresOperator(
62 | task_id="create_youngest",
63 | dag=dag,
64 | sql="""
65 | BEGIN;
66 | DROP TABLE IF EXISTS younger_riders;
67 | CREATE TABLE younger_riders AS (
68 | SELECT * FROM trips WHERE birthyear > 2000
69 | );
70 | COMMIT;
71 | """,
72 | postgres_conn_id="redshift"
73 | )
74 |
75 | log_youngest_task = PythonOperator(
76 | task_id="log_youngest",
77 | dag=dag,
78 | python_callable=log_youngest
79 | )
80 |
81 | create_lifetime_task = PostgresOperator(
82 | task_id="create_lifetime",
83 | dag=dag,
84 | sql="""
85 | BEGIN;
86 | DROP TABLE IF EXISTS lifetime_rides;
87 | CREATE TABLE lifetime_rides AS (
88 | SELECT bikeid, COUNT(bikeid)
89 | FROM trips
90 | GROUP BY bikeid
91 | );
92 | COMMIT;
93 | """,
94 | postgres_conn_id="redshift"
95 | )
96 |
97 | create_city_stations_task = PostgresOperator(
98 | task_id="create_city_stations",
99 | dag=dag,
100 | sql="""
101 | BEGIN;
102 | DROP TABLE IF EXISTS city_station_counts;
103 | CREATE TABLE city_station_counts AS(
104 | SELECT city, COUNT(city)
105 | FROM stations
106 | GROUP BY city
107 | );
108 | COMMIT;
109 | """,
110 | postgres_conn_id="redshift"
111 | )
112 |
113 | create_oldest_task >> log_oldest_task
114 | create_youngest_task >> log_youngest_task
115 |
116 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/3_ex3_subdags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/exercises/dags/3_ex3_subdags/__init__.py
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/3_ex3_subdags/dag.py:
--------------------------------------------------------------------------------
1 | # Instructions
2 | # In this exercise, we’ll place our S3 to RedShift Copy operations into a SubDag.
3 | # 1 - Consolidate HasRowsOperator into the SubDag
4 | # 2 - Reorder the tasks to take advantage of the SubDag Operators
5 |
6 | import datetime
7 |
8 | from airflow import DAG
9 | from airflow.operators.postgres_operator import PostgresOperator
10 | from airflow.operators.subdag_operator import SubDagOperator
11 | from airflow.operators.udacity_plugin import HasRowsOperator
12 |
13 | from lesson3.exercise3.subdag import get_s3_to_redshift_dag
14 | import sql_statements
15 |
16 |
17 | start_date = datetime.datetime.utcnow()
18 |
19 | dag = DAG(
20 | "lesson3.exercise3",
21 | start_date=start_date,
22 | )
23 |
24 | trips_task_id = "trips_subdag"
25 | trips_subdag_task = SubDagOperator(
26 | subdag=get_s3_to_redshift_dag(
27 | "lesson3.exercise3",
28 | trips_task_id,
29 | "redshift",
30 | "aws_credentials",
31 | "trips",
32 | sql_statements.CREATE_TRIPS_TABLE_SQL,
33 | s3_bucket="udac-data-pipelines",
34 | s3_key="divvy/unpartitioned/divvy_trips_2018.csv",
35 | start_date=start_date,
36 | ),
37 | task_id=trips_task_id,
38 | dag=dag,
39 | )
40 |
41 | stations_task_id = "stations_subdag"
42 | stations_subdag_task = SubDagOperator(
43 | subdag=get_s3_to_redshift_dag(
44 | "lesson3.exercise3",
45 | stations_task_id,
46 | "redshift",
47 | "aws_credentials",
48 | "stations",
49 | sql_statements.CREATE_STATIONS_TABLE_SQL,
50 | s3_bucket="udac-data-pipelines",
51 | s3_key="divvy/unpartitioned/divvy_stations_2017.csv",
52 | start_date=start_date,
53 | ),
54 | task_id=stations_task_id,
55 | dag=dag,
56 | )
57 |
58 | #
59 | # TODO: Consolidate check_trips and check_stations into a single check in the subdag
60 | # as we did with the create and copy in the demo
61 | #
62 | check_trips = HasRowsOperator(
63 | task_id="check_trips_data",
64 | dag=dag,
65 | redshift_conn_id="redshift",
66 | table="trips"
67 | )
68 |
69 | check_stations = HasRowsOperator(
70 | task_id="check_stations_data",
71 | dag=dag,
72 | redshift_conn_id="redshift",
73 | table="stations"
74 | )
75 |
76 | location_traffic_task = PostgresOperator(
77 | task_id="calculate_location_traffic",
78 | dag=dag,
79 | postgres_conn_id="redshift",
80 | sql=sql_statements.LOCATION_TRAFFIC_SQL
81 | )
82 |
83 | #
84 | # TODO: Reorder the Graph once you have moved the checks
85 | #
86 | trips_subdag_task >> check_trips
87 | stations_subdag_task >> check_stations
88 | check_stations >> location_traffic_task
89 | check_trips >> location_traffic_task
90 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/3_ex3_subdags/subdag.py:
--------------------------------------------------------------------------------
1 | # Instructions
2 | # In this exercise, we’ll place our S3 to RedShift Copy operations into a SubDag.
3 | # 1 - Consolidate HasRowsOperator into the SubDag
4 | # 2 - Reorder the tasks to take advantage of the SubDag Operators
5 |
6 | import datetime
7 |
8 | from airflow import DAG
9 | from airflow.operators.postgres_operator import PostgresOperator
10 | from airflow.operators.udacity_plugin import HasRowsOperator
11 | from airflow.operators.udacity_plugin import S3ToRedshiftOperator
12 |
13 | import sql_statements
14 |
15 |
16 | # Returns a DAG which creates a table if it does not exist, and then proceeds
17 | # to load data into that table from S3. When the load is complete, a data
18 | # quality check is performed to assert that at least one row of data is
19 | # present.
20 | def get_s3_to_redshift_dag(
21 | parent_dag_name,
22 | task_id,
23 | redshift_conn_id,
24 | aws_credentials_id,
25 | table,
26 | create_sql_stmt,
27 | s3_bucket,
28 | s3_key,
29 | *args, **kwargs):
30 | dag = DAG(
31 | f"{parent_dag_name}.{task_id}",
32 | **kwargs
33 | )
34 |
35 | create_task = PostgresOperator(
36 | task_id=f"create_{table}_table",
37 | dag=dag,
38 | postgres_conn_id=redshift_conn_id,
39 | sql=create_sql_stmt
40 | )
41 |
42 | copy_task = S3ToRedshiftOperator(
43 | task_id=f"load_{table}_from_s3_to_redshift",
44 | dag=dag,
45 | table=table,
46 | redshift_conn_id=redshift_conn_id,
47 | aws_credentials_id=aws_credentials_id,
48 | s3_bucket=s3_bucket,
49 | s3_key=s3_key
50 | )
51 |
52 | #
53 | # TODO: Move the HasRowsOperator task here from the DAG
54 | #
55 |
56 | create_task >> copy_task
57 | #
58 | # TODO: Use DAG ordering to place the check task
59 | #
60 |
61 | return dag
62 |
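One possible completion of the TODOs above (a sketch, not the author's final solution): create the row-count check inside the factory, just before the return statement, and chain it after the copy so every subdag run validates its own load.

    # Placed inside get_s3_to_redshift_dag, before `return dag` (illustrative only).
    check_task = HasRowsOperator(
        task_id=f"check_{table}_data",
        dag=dag,
        redshift_conn_id=redshift_conn_id,
        table=table,
    )
    create_task >> copy_task >> check_task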
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/3_ex4_full_dag.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from airflow import DAG
4 |
5 | from airflow.operators import (
6 | FactsCalculatorOperator,
7 | HasRowsOperator,
8 | S3ToRedshiftOperator
9 | )
10 |
11 | # This DAG performs the following functions
12 | # 1. Loads Trip data from S3 to RedShift
13 | # 2. Performs a data quality check on the Trips table in RedShift
14 | # 3. Uses the FactsCalculatorOperator to create a Facts table in Redshift
15 | # a. **NOTE**: to complete this step you must complete the FactsCalculatorOperator
16 | # skeleton defined in plugins/operators/facts_calculator.py
17 | #
18 | dag = DAG("lesson3.exercise4", start_date=datetime.datetime.utcnow())
19 |
20 | copy_trips_task = S3ToRedshiftOperator(
21 | task_id="load_trips_from_s3_to_redshift",
22 | dag=dag,
23 | table="trips",
24 | redshift_conn_id="redshift",
25 | aws_credentials_id="aws_credentials",
26 | s3_bucket="udacity-dend",
27 | s3_key="data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
28 | )
29 |
30 | check_trips = HasRowsOperator(
31 | task_id="trips_has_rows",
32 | dag=dag,
33 | redshift_conn_id="redshift",
34 | table="trips",
35 | provide_context=True
36 | )
37 |
38 | calculate_facts = FactsCalculatorOperator(
39 | task_id="calculate_facts",
40 | dag=dag,
41 | postgres_conn_id="redshift",
42 | origin_table="trips",
43 | destination_table="trips_facts",
44 | fact_column="tripduration",
45 | groupby_column="bikeid"
46 | )
47 |
48 | copy_trips_task >> check_trips
49 | check_trips >> calculate_facts
50 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/exercises/dags/__init__.py
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/dags/sql_statements.py:
--------------------------------------------------------------------------------
1 | CREATE_TRIPS_TABLE_SQL = """
2 | CREATE TABLE IF NOT EXISTS trips (
3 | trip_id INTEGER NOT NULL,
4 | start_time TIMESTAMP NOT NULL,
5 | end_time TIMESTAMP NOT NULL,
6 | bikeid INTEGER NOT NULL,
7 | tripduration DECIMAL(16,2) NOT NULL,
8 | from_station_id INTEGER NOT NULL,
9 | from_station_name VARCHAR(100) NOT NULL,
10 | to_station_id INTEGER NOT NULL,
11 | to_station_name VARCHAR(100) NOT NULL,
12 | usertype VARCHAR(20),
13 | gender VARCHAR(6),
14 | birthyear INTEGER,
15 | PRIMARY KEY(trip_id))
16 | DISTSTYLE ALL;
17 | """
18 |
19 | CREATE_STATIONS_TABLE_SQL = """
20 | CREATE TABLE IF NOT EXISTS stations (
21 | id INTEGER NOT NULL,
22 | name VARCHAR(250) NOT NULL,
23 | city VARCHAR(100) NOT NULL,
24 | latitude DECIMAL(9, 6) NOT NULL,
25 | longitude DECIMAL(9, 6) NOT NULL,
26 | dpcapacity INTEGER NOT NULL,
27 | online_date TIMESTAMP NOT NULL,
28 | PRIMARY KEY(id))
29 | DISTSTYLE ALL;
30 | """
31 |
32 | COPY_SQL = """
33 | COPY {}
34 | FROM '{}'
35 | ACCESS_KEY_ID '{{}}'
36 | SECRET_ACCESS_KEY '{{}}'
37 | IGNOREHEADER 1
38 | DELIMITER ','
39 | """
40 |
41 | COPY_MONTHLY_TRIPS_SQL = COPY_SQL.format(
42 | "trips",
43 | "s3://udacity-dend/data-pipelines/divvy/partitioned/{year}/{month}/divvy_trips.csv"
44 | )
45 |
46 | COPY_ALL_TRIPS_SQL = COPY_SQL.format(
47 | "trips",
48 | "s3://udacity-dend/data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv"
49 | )
50 |
51 | COPY_STATIONS_SQL = COPY_SQL.format(
52 | "stations",
53 | "s3://udacity-dend/data-pipelines/divvy/unpartitioned/divvy_stations_2017.csv"
54 | )
55 |
56 | LOCATION_TRAFFIC_SQL = """
57 | BEGIN;
58 | DROP TABLE IF EXISTS station_traffic;
59 | CREATE TABLE station_traffic AS
60 | SELECT
61 | DISTINCT(t.from_station_id) AS station_id,
62 | t.from_station_name AS station_name,
63 | num_departures,
64 | num_arrivals
65 | FROM trips t
66 | JOIN (
67 | SELECT
68 | from_station_id,
69 | COUNT(from_station_id) AS num_departures
70 | FROM trips
71 | GROUP BY from_station_id
72 | ) AS fs ON t.from_station_id = fs.from_station_id
73 | JOIN (
74 | SELECT
75 | to_station_id,
76 | COUNT(to_station_id) AS num_arrivals
77 | FROM trips
78 | GROUP BY to_station_id
79 | ) AS ts ON t.from_station_id = ts.to_station_id
80 | """
81 |
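COPY_SQL escapes the credential placeholders as '{{}}' so they survive the first .format call; COPY_MONTHLY_TRIPS_SQL therefore still contains two positional credential slots plus the named {year}/{month} slots in its S3 path, exactly as the monthly DAGs expect. A short sketch of the second formatting pass as those DAGs perform it; the key values are fakes.

    import sql_statements  # the module shown above

    final_sql = sql_statements.COPY_MONTHLY_TRIPS_SQL.format(
        "FAKE_ACCESS_KEY_ID",
        "FAKE_SECRET_ACCESS_KEY",
        year=2018,
        month=2,
    )
    # final_sql is a complete COPY for s3://udacity-dend/data-pipelines/divvy/partitioned/2018/2/divvy_trips.csv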
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/plugins/__init__.py:
--------------------------------------------------------------------------------
1 | from airflow.plugins_manager import AirflowPlugin
2 |
3 | import operators
4 |
5 |
6 | # Defining the plugin class
7 | class UdacityPlugin(AirflowPlugin):
8 | name = "udacity_plugin"
9 | operators = [
10 | operators.FactsCalculatorOperator,
11 | operators.HasRowsOperator,
12 | operators.S3ToRedshiftOperator
13 | ]
14 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/plugins/operators/__init__.py:
--------------------------------------------------------------------------------
1 | from operators.facts_calculator import FactsCalculatorOperator
2 | from operators.has_rows import HasRowsOperator
3 | from operators.s3_to_redshift import S3ToRedshiftOperator
4 |
5 | __all__ = [
6 | 'FactsCalculatorOperator',
7 | 'HasRowsOperator',
8 | 'S3ToRedshiftOperator'
9 | ]
10 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/plugins/operators/facts_calculator.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.hooks.postgres_hook import PostgresHook
4 | from airflow.models import BaseOperator
5 | from airflow.utils.decorators import apply_defaults
6 |
7 |
8 | class FactsCalculatorOperator(BaseOperator):
9 | facts_sql_template = """
10 | DROP TABLE IF EXISTS {destination_table};
11 | CREATE TABLE {destination_table} AS
12 | SELECT
13 | {groupby_column},
14 | MAX({fact_column}) AS max_{fact_column},
15 | MIN({fact_column}) AS min_{fact_column},
16 | AVG({fact_column}) AS average_{fact_column}
17 | FROM {origin_table}
18 | GROUP BY {groupby_column};
19 | """
20 |
21 | @apply_defaults
22 | def __init__(self,
23 | redshift_conn_id="",
24 | origin_table="",
25 | destination_table="",
26 | fact_column="",
27 | groupby_column="",
28 | *args, **kwargs):
29 |
30 | super(FactsCalculatorOperator, self).__init__(*args, **kwargs)
31 | self.redshift_conn_id = redshift_conn_id
32 | self.origin_table = origin_table
33 | self.destination_table = destination_table
34 | self.fact_column = fact_column
35 | self.groupby_column = groupby_column
36 |
37 | def execute(self, context):
38 | redshift_hook = PostgresHook(self.redshift_conn_id)
39 | formatted_sql = FactsCalculatorOperator.facts_sql_template.format(
40 | origin_table=self.origin_table,
41 | destination_table=self.destination_table,
42 | groupby_column=self.groupby_column,
43 | fact_column=self.fact_column
44 | )
45 | redshift_hook.run(formatted_sql)
46 |
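With the parameters used in 3_ex4_full_dag.py (trips grouped by bikeid on tripduration), the template renders to a DROP/CREATE TABLE AS statement. A small sketch of that rendering, assuming it runs in the context of this module so the class is in scope.

    rendered = FactsCalculatorOperator.facts_sql_template.format(
        origin_table="trips",
        destination_table="trips_facts",
        groupby_column="bikeid",
        fact_column="tripduration",
    )
    # rendered recreates trips_facts with max/min/average tripduration per bikeid.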
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/plugins/operators/has_rows.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.hooks.postgres_hook import PostgresHook
4 | from airflow.models import BaseOperator
5 | from airflow.utils.decorators import apply_defaults
6 |
7 |
8 | class HasRowsOperator(BaseOperator):
9 |
10 | @apply_defaults
11 | def __init__(self,
12 | redshift_conn_id="",
13 | table="",
14 | *args, **kwargs):
15 |
16 | super(HasRowsOperator, self).__init__(*args, **kwargs)
17 | self.table = table
18 | self.redshift_conn_id = redshift_conn_id
19 |
20 | def execute(self, context):
21 | redshift_hook = PostgresHook(self.redshift_conn_id)
22 | records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
23 | if len(records) < 1 or len(records[0]) < 1:
24 | raise ValueError(f"Data quality check failed. {self.table} returned no results")
25 | num_records = records[0][0]
26 | if num_records < 1:
27 | raise ValueError(f"Data quality check failed. {self.table} contained 0 rows")
28 | logging.info(f"Data quality on table {self.table} check passed with {records[0][0]} records")
29 |
30 |
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/exercises/plugins/operators/s3_to_redshift.py:
--------------------------------------------------------------------------------
1 | from airflow.contrib.hooks.aws_hook import AwsHook
2 | from airflow.hooks.postgres_hook import PostgresHook
3 | from airflow.models import BaseOperator
4 | from airflow.utils.decorators import apply_defaults
5 |
6 |
7 | class S3ToRedshiftOperator(BaseOperator):
8 | # We are telling Airflow that we want this param to be templatable
9 | template_fields = ("s3_key",)
10 | copy_sql = """
11 | COPY {}
12 | FROM '{}'
13 | ACCESS_KEY_ID '{}'
14 | SECRET_ACCESS_KEY '{}'
15 | IGNOREHEADER {}
16 | DELIMITER '{}'
17 | """
18 |
19 |
20 | @apply_defaults
21 | def __init__(self,
22 | redshift_conn_id="",
23 | aws_credentials_id="",
24 | table="",
25 | s3_bucket="",
26 | s3_key="",  # rendered from context variables (see the template_fields note above)
27 | delimiter=",",
28 | ignore_headers=1,
29 | *args, **kwargs):
30 |
31 | super(S3ToRedshiftOperator, self).__init__(*args, **kwargs)
32 | self.table = table
33 | self.redshift_conn_id = redshift_conn_id
34 | self.s3_bucket = s3_bucket
35 | self.s3_key = s3_key
36 | self.delimiter = delimiter
37 | self.ignore_headers = ignore_headers
38 | self.aws_credentials_id = aws_credentials_id
39 |
40 | def execute(self, context):
41 | aws_hook = AwsHook(self.aws_credentials_id)
42 | credentials = aws_hook.get_credentials()
43 | redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
44 |
45 | self.log.info("Clearing data from destination Redshift table")
46 | redshift.run("DELETE FROM {}".format(self.table))
47 |
48 | self.log.info("Copying data from S3 to Redshift")
49 | rendered_key = self.s3_key.format(**context)
50 | s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
51 | formatted_sql = S3ToRedshiftOperator.copy_sql.format(
52 | self.table,
53 | s3_path,
54 | credentials.access_key,
55 | credentials.secret_key,
56 | self.ignore_headers,
57 | self.delimiter
58 | )
59 | redshift.run(formatted_sql)
60 |
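Because s3_key is declared in template_fields and then rendered with str.format(**context) in execute(), a partitioned key can drill into context objects such as execution_date. A small standalone illustration; the context dict is simulated here rather than supplied by Airflow.

    import datetime

    context = {"execution_date": datetime.datetime(2018, 3, 1)}
    s3_key = "divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
    rendered_key = s3_key.format(**context)
    # rendered_key == "divvy/partitioned/2018/3/divvy_trips.csv"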
--------------------------------------------------------------------------------
/4_dend_airflow_data_pipelines/glossary-data-pipelines-in-airflow.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/4_dend_airflow_data_pipelines/glossary-data-pipelines-in-airflow.pdf
--------------------------------------------------------------------------------
/DEND.code-workspace:
--------------------------------------------------------------------------------
1 | {
2 | "folders": [
3 | {
4 | "path": "."
5 | },
6 | {
7 | "path": "3_dend_spark_data_lakes/P4_Data_Lake"
8 | }
9 | ],
10 | "settings": {
11 | "jira-plugin.workingProject": "",
12 | "python.condaPath": "/home/f.silvestre/anaconda3/envs/dend/bin/python"
13 | }
14 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Florencia Silvestre
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data-engineering-nanodegree
2 | Projects done in the [Data Engineering Nanodegree by Udacity.com](https://www.udacity.com/course/data-engineer-nanodegree--nd027)
3 |
4 | 
5 |
6 | ## Course 1: Data Modeling
7 | ### Introduction to Data Modeling
8 | ➔ Understand the purpose of data modeling
9 |
10 | ➔ Identify the strengths and weaknesses of different types of databases and data storage techniques
11 |
12 | ➔ Create a table in Postgres and Apache Cassandra
13 |
14 | ### Relational Data Models
15 | ➔ Understand when to use a relational database
16 |
17 | ➔ Understand the difference between OLAP and OLTP databases
18 |
19 | ➔ Create normalized data tables
20 |
21 | ➔ Implement denormalized schemas (e.g. STAR, Snowflake)
22 |
23 | ### NoSQL Data Models
24 | ➔ Understand when to use NoSQL databases and how they differ from relational databases
25 |
26 | ➔ Select the appropriate primary key and clustering columns for a given use case
27 |
28 | ➔ Create a NoSQL database in Apache Cassandra
29 |
30 |
31 | #### Project: Data Modeling with Postgres and Apache Cassandra
32 |
33 | ## Course 2: Cloud Data Warehouses
34 | ### Introduction to the Data Warehouses
35 | ➔ Understand Data Warehousing architecture
36 |
37 | ➔ Run an ETL process to denormalize a database (3NF to Star)
38 |
39 | ➔ Create an OLAP cube from facts and dimensions
40 |
41 | ➔ Compare columnar vs. row oriented approaches
42 |
43 | ### Introduction to the Cloud with AWS
44 | ➔ Understand cloud computing
45 |
46 | ➔ Create an AWS account and understand their services
47 |
48 | ➔ Set up Amazon S3, IAM, VPC, EC2, RDS PostgreSQL
49 |
50 | ### Implementing Data Warehouses on AWS
51 | ➔ Identify components of the Redshift architecture
52 |
53 | ➔ Run ETL process to extract data from S3 into Redshift
54 |
55 | ➔ Set up AWS infrastructure using Infrastructure as Code (IaC)
56 |
57 | ➔ Design an optimized table by selecting the appropriate distribution style and sorting key
58 |
59 | #### Project 2: Data Infrastructure on the Cloud
60 |
61 | ## Course 3: Data Lakes with Spark
62 | ### The Power of Spark
63 | ➔ Understand the big data ecosystem
64 |
65 | ➔ Understand when to use Spark and when not to use it
66 |
67 | ### Data Wrangling with Spark
68 | ➔ Manipulate data with SparkSQL and Spark Dataframes
69 |
70 | ➔ Use Spark for ETL purposes
71 |
72 | ### Debugging and Optimization
73 | ➔ Troubleshoot common errors and optimize their code using the Spark WebUI
74 |
75 | ### Introduction to Data Lakes
76 | ➔ Understand the purpose and evolution of data lakes
77 |
78 | ➔ Implement data lakes on Amazon S3, EMR, Athena, and Amazon Glue
79 |
80 | ➔ Use Spark to run ELT processes and analytics on data of diverse sources, structures, and vintages
81 |
82 | ➔ Understand the components and issues of data lakes
83 |
84 | #### Project 3: Big Data with Spark
85 |
86 | ## Course 4: Automate Data Pipelines
87 | ### Data Pipelines
88 | ➔ Create data pipelines with Apache Airflow
89 |
90 | ➔ Set up task dependencies
91 |
92 | ➔ Create data connections using hooks
93 |
94 | ### Data Quality
95 | ➔ Track data lineage
96 |
97 | ➔ Set up data pipeline schedules
98 |
99 | ➔ Partition data to optimize pipelines
100 |
101 | ➔ Write tests to ensure data quality
102 |
103 | ➔ Backfill data
104 |
105 | ### Production Data Pipelines
106 | ➔ Build reusable and maintainable pipelines
107 |
108 | ➔ Build your own Apache Airflow plugins
109 |
110 | ➔ Implement subDAGs
111 |
112 | ➔ Set up task boundaries
113 |
114 | ➔ Monitor data pipelines
115 |
116 | #### Project: Data Pipelines with Airflow
117 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-dinky
--------------------------------------------------------------------------------
/cheatsheets/Data-Science-Books-for-2018.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/cheatsheets/Data-Science-Books-for-2018.pdf
--------------------------------------------------------------------------------
/cheatsheets/Pandas DataFrame Notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/cheatsheets/Pandas DataFrame Notes.pdf
--------------------------------------------------------------------------------
/cheatsheets/Pandas_Cheat_Sheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/cheatsheets/Pandas_Cheat_Sheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/linux cheatsheet.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/cheatsheets/linux cheatsheet.jpg
--------------------------------------------------------------------------------
/data-engineering.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Flor91/Data-engineering-nanodegree/139dfcc7226c50e24da2f1b4f9488c1d1d106c64/data-engineering.jpg
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: dend
2 | channels:
3 | - defaults
4 | prefix: /home/f.silvestre/anaconda3/envs/dend
5 |
6 |
--------------------------------------------------------------------------------