├── .gitignore ├── CyclingERD.sql ├── README.md ├── airflow ├── .env.example ├── Dockerfile ├── README.md ├── dags │ ├── init_0_ingestion_to_s3_dag.py │ ├── init_1_spark_emr_dag.py │ ├── init_2_s3_to_redshifht_dag.py │ ├── init_3_web_scraping_dag.py │ ├── proc_0_ingestion_to_s3_dag.py │ ├── proc_1_spark_emr_dag.py │ ├── proc_2_s3_to_redshifht_dag.py │ └── scripts │ │ ├── init-data-transformation.py │ │ └── journey-data-transformation.py ├── docker-compose.yaml ├── logs │ └── scheduler │ │ └── latest └── requirements.txt ├── images ├── CyclingERD.png ├── batch-on-aws.png ├── dags │ ├── init_0.png │ ├── init_1.png │ ├── init_2.png │ ├── init_3.png │ ├── inits.png │ ├── proc_0.png │ ├── proc_1.png │ └── proc_2.png ├── final-dashboard.png └── redshift-metabase.png ├── metabase └── README.md ├── notebook ├── data-exploration │ ├── Exploration.ipynb │ └── Scraping.ipynb └── data-transformation │ ├── experiment.ipynb │ ├── init-data-transformation.ipynb │ └── journey-data-transformation.ipynb ├── services.md └── terraform ├── main.tf ├── services.md └── variables.tf /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/.gitignore -------------------------------------------------------------------------------- /CyclingERD.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/CyclingERD.sql -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/README.md -------------------------------------------------------------------------------- /airflow/.env.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/.env.example -------------------------------------------------------------------------------- /airflow/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/Dockerfile -------------------------------------------------------------------------------- /airflow/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/README.md -------------------------------------------------------------------------------- /airflow/dags/init_0_ingestion_to_s3_dag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/dags/init_0_ingestion_to_s3_dag.py -------------------------------------------------------------------------------- /airflow/dags/init_1_spark_emr_dag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/dags/init_1_spark_emr_dag.py -------------------------------------------------------------------------------- /airflow/dags/init_2_s3_to_redshifht_dag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/dags/init_2_s3_to_redshifht_dag.py -------------------------------------------------------------------------------- /airflow/dags/init_3_web_scraping_dag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/dags/init_3_web_scraping_dag.py -------------------------------------------------------------------------------- /airflow/dags/proc_0_ingestion_to_s3_dag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/dags/proc_0_ingestion_to_s3_dag.py -------------------------------------------------------------------------------- /airflow/dags/proc_1_spark_emr_dag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/dags/proc_1_spark_emr_dag.py -------------------------------------------------------------------------------- /airflow/dags/proc_2_s3_to_redshifht_dag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/dags/proc_2_s3_to_redshifht_dag.py -------------------------------------------------------------------------------- /airflow/dags/scripts/init-data-transformation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/dags/scripts/init-data-transformation.py -------------------------------------------------------------------------------- /airflow/dags/scripts/journey-data-transformation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/dags/scripts/journey-data-transformation.py -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/airflow/docker-compose.yaml -------------------------------------------------------------------------------- /airflow/logs/scheduler/latest: -------------------------------------------------------------------------------- 1 | /opt/airflow/logs/scheduler/2022-03-11 -------------------------------------------------------------------------------- /airflow/requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-amazon 2 | bs4 3 | pandas -------------------------------------------------------------------------------- /images/CyclingERD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/CyclingERD.png -------------------------------------------------------------------------------- /images/batch-on-aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/batch-on-aws.png -------------------------------------------------------------------------------- /images/dags/init_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/dags/init_0.png -------------------------------------------------------------------------------- /images/dags/init_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/dags/init_1.png -------------------------------------------------------------------------------- /images/dags/init_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/dags/init_2.png -------------------------------------------------------------------------------- /images/dags/init_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/dags/init_3.png -------------------------------------------------------------------------------- /images/dags/inits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/dags/inits.png -------------------------------------------------------------------------------- /images/dags/proc_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/dags/proc_0.png -------------------------------------------------------------------------------- /images/dags/proc_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/dags/proc_1.png -------------------------------------------------------------------------------- /images/dags/proc_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/dags/proc_2.png -------------------------------------------------------------------------------- /images/final-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/final-dashboard.png -------------------------------------------------------------------------------- /images/redshift-metabase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/images/redshift-metabase.png -------------------------------------------------------------------------------- /metabase/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/metabase/README.md -------------------------------------------------------------------------------- /notebook/data-exploration/Exploration.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/notebook/data-exploration/Exploration.ipynb -------------------------------------------------------------------------------- /notebook/data-exploration/Scraping.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/notebook/data-exploration/Scraping.ipynb -------------------------------------------------------------------------------- /notebook/data-transformation/experiment.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/notebook/data-transformation/experiment.ipynb -------------------------------------------------------------------------------- /notebook/data-transformation/init-data-transformation.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/notebook/data-transformation/init-data-transformation.ipynb -------------------------------------------------------------------------------- /notebook/data-transformation/journey-data-transformation.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/notebook/data-transformation/journey-data-transformation.ipynb -------------------------------------------------------------------------------- /services.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/services.md -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/terraform/main.tf -------------------------------------------------------------------------------- /terraform/services.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/terraform/services.md -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HoracioSoldman/batch-processing-on-aws/HEAD/terraform/variables.tf --------------------------------------------------------------------------------