├── ETL_toll_data.py ├── README.md ├── configure_simulator.jpg ├── consolidate_data.jpg ├── create_toll_topic.jpg ├── dag_args.jpg ├── dag_definition.jpg ├── dag_runs.jpg ├── data_reader_output.jpg ├── download_simulator.jpg ├── exit ├── extract_data_from_csv.jpg ├── extract_data_from_fixed_width.jpg ├── extract_data_from_tsv.jpg ├── output_rows.jpg ├── simulator_output.jpg ├── start_kafka.jpg ├── start_zookeeper.jpg ├── streaming_reader_code.jpg ├── submit_dag.jpg ├── task_pipeline.jpg ├── transform.jpg ├── unpause_dag.jpg └── unzip_data.jpg /ETL_toll_data.py: -------------------------------------------------------------------------------- 1 | #Import the libraries 2 | from datetime import timedelta 3 | from airflow import DAG 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.utils.dates import days_ago 6 | 7 | # Task 1.1 - Define DAG arguments 8 | default_args = { 9 | 'owner':'Marco Boccenti', 10 | 'start_date':days_ago(0), 11 | 'email':['marco.boccenti@gmail.com'], 12 | 'email_on_failure':True, 13 | 'email_on_retry':True, 14 | 'retries':1, 15 | 'retry_delay':timedelta(minutes = 5) 16 | } 17 | 18 | # Task 1.2 - Define the DAG 19 | dag = DAG( 20 | 'ETL_toll_data', 21 | default_args=default_args, 22 | description='Apache Airflow Final Assignment', 23 | schedule_interval=timedelta(days=1), 24 | ) 25 | 26 | # Task 1.3 - Create a task to unzip data 27 | unzip_data = BashOperator( 28 | task_id = 'unzip_data', 29 | bash_command = 'tar -xzf /home/project/airflow/dags/finalassignment/tolldata.tgz', 30 | dag = dag 31 | ) 32 | 33 | # Task 1.4 - Create a task to extract data from csv file 34 | extract_data_from_csv = BashOperator( 35 | task_id = 'extract_data_from_csv', 36 | bash_command = 'cut -d"," -f1-4 vehicle-data.csv > csv_data.csv', 37 | dag = dag, 38 | ) 39 | 40 | # Task 1.5 - Create a task to extract data from tsv file 41 | extract_data_from_tsv= BashOperator( 42 | task_id = 'extract_data_from_tsv', 43 | bash_command = 'cut -f5-7 tollplaza-data.tsv > tsv_data.csv', 44 | dag = dag, 45 | ) 46 | 47 | # Task 1.6 - Create a task to extract data from fixed width file 48 | extract_data_from_fixed_width = BashOperator( 49 | task_id = 'extract_data_from_fixed_width', 50 | bash_command = 'awk "NF{print $(NF-1),$NF}" OFS="\t" payment-data.txt > fixed_width_data.csv', 51 | dag = dag, 52 | ) 53 | 54 | # Task 1.7 - Create a task to consolidate data extracted from previous tasks 55 | consolidate_data = BashOperator( 56 | task_id = 'consolidate_data', 57 | bash_command = 'paste csv_data.csv tsv_data.csv fixed_width_data.csv > extracted_data.csv', 58 | dag = dag, 59 | ) 60 | 61 | #Task 1.8 -Transform and load the data 62 | transform_data = BashOperator( 63 | task_id = 'transform_data', 64 | bash_command = 'awk "$5 = toupper($5)" < extracted_data.csv > transformed_data.csv', 65 | dag = dag, 66 | ) 67 | 68 | #Task 1.9- Define the task pipeline 69 | unzip_data >> extract_data_from_csv >> extract_data_from_tsv >> extract_data_from_fixed_width >> consolidate_data >> transform_data 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ETL and Data Pipelines with Shell, Airflow and Kafka 2 | 3 | -------------------------------------------------------------------------------- /configure_simulator.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/configure_simulator.jpg -------------------------------------------------------------------------------- /consolidate_data.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/consolidate_data.jpg -------------------------------------------------------------------------------- /create_toll_topic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/create_toll_topic.jpg -------------------------------------------------------------------------------- /dag_args.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/dag_args.jpg -------------------------------------------------------------------------------- /dag_definition.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/dag_definition.jpg -------------------------------------------------------------------------------- /dag_runs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/dag_runs.jpg -------------------------------------------------------------------------------- /data_reader_output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/data_reader_output.jpg -------------------------------------------------------------------------------- /download_simulator.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/download_simulator.jpg -------------------------------------------------------------------------------- /exit: -------------------------------------------------------------------------------- 1 | commit 9a45f375309a1f71a6d0f2954a8759c8bd2b6cdf (HEAD -> main, origin/main, origin/HEAD) 2 | Merge: 72d3428 5d036aa 3 | Author: Marco Boccenti 4 | Date: Mon Oct 24 15:46:58 2022 +0200 5 | 6 | Merge branch 'main' of https://github.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka 7 | 8 | commit 5d036aa451f1d7a3e02d4151b723731740bf7333 9 | Author: Marco Boccenti 10 | Date: Mon Oct 24 15:45:43 2022 +0200 11 | 12 | Update README.md 13 | 14 | commit 72d34288091d22ceec2e658446ca8ae158db601e 15 | Author: Marco Boccenti 16 | Date: Mon Oct 24 15:45:04 2022 +0200 17 | 18 | Added tasks screenshot 19 | 20 | commit 7ab94af2fc9cd769ed322df3f89bd8d189d98d3b 21 | Author: Marco Boccenti 22 | Date: Mon Oct 24 14:46:46 2022 +0200 23 | 24 | Added Task 1.1-1.10 25 | 26 | commit a7b1f2bd41c134e2793e346ad9e4109cca71d864 27 | Author: Marco Boccenti 28 | Date: Mon Oct 24 14:44:51 2022 +0200 29 | 30 | Update ETL_toll_data.py 31 | 32 | commit eeb36b14dfd99779dd2ddfa4355cfb38c9efb3eb 33 | Author: Marco Boccenti 34 | Date: Mon Oct 24 14:12:23 2022 +0200 35 | 36 | Create ETL_toll_data.py 37 | 38 | commit 55a25c4c29b30c58280ea8fcc47006d77b837af2 39 | Author: Marco Boccenti 40 | Date: Mon Oct 24 14:11:43 2022 +0200 41 | 42 | Delete ETL_toll_data.py 43 | 44 | commit e84867b24273dd188833421869eea4dc216031a6 45 | Author: Marco Boccenti 46 | Date: Mon Oct 24 14:11:23 2022 +0200 47 | 48 | Update ETL_toll_data.py 49 | 50 | commit 54c0397633ccfbb6cd91e54027eaf5350ddef40f 51 | Author: Marco Boccenti 52 | Date: Mon Oct 24 14:09:43 2022 +0200 53 | 54 | Create ETL_toll_data.py 55 | 56 | commit 89b9ce8218f856aa3538fe02c9ca3c1a17ffdb99 57 | Author: Marco Boccenti 58 | Date: Mon Oct 24 14:08:33 2022 +0200 59 | 60 | Initial commit 61 | -------------------------------------------------------------------------------- /extract_data_from_csv.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/extract_data_from_csv.jpg -------------------------------------------------------------------------------- /extract_data_from_fixed_width.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/extract_data_from_fixed_width.jpg -------------------------------------------------------------------------------- /extract_data_from_tsv.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/extract_data_from_tsv.jpg -------------------------------------------------------------------------------- /output_rows.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/output_rows.jpg -------------------------------------------------------------------------------- /simulator_output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/simulator_output.jpg -------------------------------------------------------------------------------- /start_kafka.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/start_kafka.jpg -------------------------------------------------------------------------------- /start_zookeeper.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/start_zookeeper.jpg -------------------------------------------------------------------------------- /streaming_reader_code.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/streaming_reader_code.jpg -------------------------------------------------------------------------------- /submit_dag.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/submit_dag.jpg -------------------------------------------------------------------------------- /task_pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/task_pipeline.jpg -------------------------------------------------------------------------------- /transform.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/transform.jpg -------------------------------------------------------------------------------- /unpause_dag.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/unpause_dag.jpg -------------------------------------------------------------------------------- /unzip_data.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mboccenti/ETL-and-Data-Pipelines-with-Shell-Airflow-and-Kafka/117458684aebb5c0288c87e1902365f8319d730f/unzip_data.jpg --------------------------------------------------------------------------------