dags
├── scripts
│   ├── properties
│   │   └── job.properties
│   └── sqoop_incremental.sh
└── wf_incremental_load.py

/dags/scripts/properties/job.properties:
--------------------------------------------------------------------------------
tableName=orders
target_dir=/user/udaysharma/incoming/orders
username="xyz"
password="pqrs"
last_val="2016-01-13"
--------------------------------------------------------------------------------

/dags/scripts/sqoop_incremental.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Load the job parameters (table name, target directory, credentials, last value)
source ./properties/job.properties

echo "[INFO]: Running as user: $(whoami)"

echo "[INFO]: Deleting directory :-----> $target_dir"
hadoop fs -rm -r "$target_dir"

echo "[INFO]: Importing new records from $tableName"
sqoop import --connect jdbc:mysql://localhost:3306/test \
  --username "$username" --password "$password" \
  --table "$tableName" -m 1 \
  --as-avrodatafile \
  --target-dir "$target_dir" \
  --incremental append \
  --check-column last_modified \
  --last-value "$last_val" \
  --map-column-java order_date=String
--------------------------------------------------------------------------------

/dags/wf_incremental_load.py:
--------------------------------------------------------------------------------
# @author: uday sharma

from airflow import DAG
from airflow.operators import BashOperator, HiveOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'udaysharma',
    'start_date': datetime(2016, 1, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('incremental_load', default_args=default_args)

sqoop_job = """
exec ./scripts/sqoop_incremental.sh
"""

# Import the incremental data from the MySQL table into HDFS
task1 = BashOperator(
    task_id='sqoop_import',
    bash_command=sqoop_job,
    dag=dag
)

# Insert the data from the Hive external (staging) table into the target table
task2 = HiveOperator(
    task_id='hive_insert',
    hql='INSERT INTO TABLE orders_trans SELECT order_id, first_name, last_name, item_code, order_date FROM orders_stg;',
    depends_on_past=True,
    dag=dag
)

# Define the job dependency: the Hive insert runs only after the Sqoop import succeeds
task2.set_upstream(task1)
--------------------------------------------------------------------------------
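
Note: a minimal sketch of how these tasks could be exercised from the command line, assuming a classic Airflow 1.x CLI and that this dags directory is on the AIRFLOW_HOME dags path; the dag_id and task_ids below come from wf_incremental_load.py above.

# Check that the DAG file parses and its two tasks are registered
airflow list_tasks incremental_load

# Run a single task for a given execution date without recording state in the metadata DB
airflow test incremental_load sqoop_import 2016-01-14
airflow test incremental_load hive_insert 2016-01-14

# Backfill the DAG over a date range; the hive_insert task waits for sqoop_import per the dependency
airflow backfill incremental_load -s 2016-01-14 -e 2016-01-15

Running `airflow test` first is a convenient way to verify the Sqoop shell script and the Hive insert independently before scheduling real runs.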