├── Airflow
│   ├── AirflowDemo
│   │   ├── Helloworld.py
│   │   └── datatest.py
│   ├── README.md
│   └── airflow_docs.docx
├── Celery
│   ├── Celerydemo
│   │   └── celery_blog.py
│   ├── README.md
│   └── celery_install_doc.docx
├── DASK
│   ├── DASKDemo
│   │   ├── AWSDASKDemo.ipynb
│   │   └── DaskDemo.ipynb
│   ├── InstructionsForDASK.docx
│   └── README.md
├── DataPipelining.docx
├── Luigi
│   ├── Instructions.docx
│   ├── LuigiDemo
│   │   ├── aggregateTask.py
│   │   ├── loadFromAPI.py
│   │   ├── loadFromMySQL.py
│   │   └── saveFinalOutput.py
│   ├── Readme.md
│   └── luigidemo.mp4
├── Make
│   ├── InstructionsForMakeDemo.docx
│   ├── MakeDemo.mp4
│   ├── MakeDemo
│   │   ├── Makefile
│   │   ├── aggregateTask.py
│   │   ├── loadFromAPI1.py
│   │   ├── loadFromAPI2.py
│   │   └── saveFinalOutput.py
│   └── Readme.md
├── README.md
└── Team1_DataPipelining.pptx

/Airflow/AirflowDemo/Helloworld.py:
--------------------------------------------------------------------------------
from airflow import DAG
from airflow.operators.bash_operator import BashOperator  # full module path (Airflow 1.8+)
from datetime import datetime, timedelta

# Following are defaults which can be overridden later on
default_args = {
    'owner': 'manasi',
    'depends_on_past': False,
    'start_date': datetime(2016, 4, 15),
    'email': ['manasidalvi14@gmail.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('Helloworld', default_args=default_args)

# t1, t2, t3 and t4 are examples of tasks created using operators

t1 = BashOperator(
    task_id='task_1',
    bash_command='echo "Hello World from Task 1"',
    dag=dag)

t2 = BashOperator(
    task_id='task_2',
    bash_command='echo "Hello World from Task 2"',
    dag=dag)

t3 = BashOperator(
    task_id='task_3',
    bash_command='echo "Hello World from Task 3"',
    dag=dag)

t4 = BashOperator(
    task_id='task_4',
    bash_command='echo "Hello World from Task 4"',
    dag=dag)

# Diamond-shaped dependencies: t1 fans out to t2 and t3, which both feed t4
t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
--------------------------------------------------------------------------------
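The four set_upstream calls above wire a fan-out/fan-in diamond. On Airflow 1.8 and later, the same dependencies can be declared with the bitshift operators; a minimal sketch, reusing the four tasks defined above:

```python
# Equivalent wiring in bitshift form (Airflow 1.8+):
# t1 runs first, t2 and t3 run in parallel, t4 waits for both.
t1 >> t2 >> t4
t1 >> t3 >> t4
```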
/Airflow/AirflowDemo/datatest.py:
--------------------------------------------------------------------------------
from airflow import DAG
from airflow.operators.python_operator import PythonOperator  # full module path (Airflow 1.8+)
from datetime import datetime, timedelta
import pandas as pd

# Following are defaults which can be overridden later on
default_args = {
    'owner': 'manasi',
    'depends_on_past': False,
    'start_date': datetime(2016, 4, 15),
    'email': ['manasidalvi14@gmail.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('datafile', default_args=default_args)


def task_read13():
    # Fetch the 2013 arrests dataset and cache it locally as CSV
    print("hello from task13 read data")
    data_2013 = pd.read_json("https://data.policefoundation.org/resource/jhvd-4583.json")
    data_2013.to_csv('/home/manasi/outputcsv/data_2013.csv')


def task_read12():
    # Fetch the 2012 arrests dataset and cache it locally as CSV
    print("hello from task12 read data")
    data_2012 = pd.read_json("https://data.policefoundation.org/resource/fgcx-vmf9.json")
    data_2012.to_csv('/home/manasi/outputcsv/data_2012.csv')


def task_merge():
    print("tasks 2012")
    data_2012 = pd.read_csv('/home/manasi/outputcsv/data_2012.csv')
    year2012 = data_2012['ward'].value_counts(sort=True, ascending=True).to_frame()  # 2012 arrests by ward
    print("tasks 2013")
    data_2013 = pd.read_csv('/home/manasi/outputcsv/data_2013.csv')  # was re-reading the 2012 file
    year2013 = data_2013['ward'].value_counts(sort=True, ascending=True).to_frame()  # 2013 arrests by ward
    m = pd.concat([year2012, year2013], axis=1)
    m.columns = ['year2012', 'year2013']
    m.to_csv('/home/manasi/outputcsv/merge/mdata.csv')


# PythonOperator expects a reference to the callable; writing task_read12()
# would execute the task at DAG-parse time instead of at run time.
t1 = PythonOperator(
    task_id='read_json_2012',
    python_callable=task_read12,
    dag=dag)

t2 = PythonOperator(
    task_id='read_json_2013',
    python_callable=task_read13,
    dag=dag)

t3 = PythonOperator(
    task_id='merge',
    python_callable=task_merge,
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)
--------------------------------------------------------------------------------
/Airflow/README.md:
--------------------------------------------------------------------------------
Refer to the demo videos:

airflow_part1: https://youtu.be/Qs02p3mh8m4

airflow_part2: https://youtu.be/83gPOMr6NOE
--------------------------------------------------------------------------------
/Airflow/airflow_docs.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalsatam/Data-Pipelining/57740e66bdfc7b80f1d9f725272e1c8ed05bf033/Airflow/airflow_docs.docx
--------------------------------------------------------------------------------
/Celery/Celerydemo/celery_blog.py:
--------------------------------------------------------------------------------
from celery import Celery
import requests

# Celery application backed by a local Redis broker
app = Celery('celery_blog', broker='redis://localhost:6379/0')

@app.task
def fetch_url(url):
    resp = requests.get(url)
    print(resp.status_code)

def func(urls):
    # .delay() only enqueues the call; a running worker executes it asynchronously
    for url in urls:
        fetch_url.delay(url)

if __name__ == "__main__":
    func(["http://google.com", "https://amazon.in", "https://facebook.com", "https://twitter.com", "https://alexa.com"])
--------------------------------------------------------------------------------
/Celery/README.md:
--------------------------------------------------------------------------------
Youtube Link: https://youtu.be/XelpLUcpKLM
--------------------------------------------------------------------------------
/Celery/celery_install_doc.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalsatam/Data-Pipelining/57740e66bdfc7b80f1d9f725272e1c8ed05bf033/Celery/celery_install_doc.docx
--------------------------------------------------------------------------------
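celery_blog.py fires tasks and discards their results. To collect return values, the app also needs a result backend; a minimal sketch, assuming the same local Redis instance doubles as the backend (the database number and the fetch_status task name are illustrative, not part of the demo):

```python
from celery import Celery
import requests

# Variant of the demo app with a result backend configured
app = Celery('celery_blog',
             broker='redis://localhost:6379/0',
             backend='redis://localhost:6379/1')

@app.task
def fetch_status(url):
    # Return the status code instead of printing it, so the caller can collect it
    return requests.get(url).status_code

if __name__ == "__main__":
    result = fetch_status.delay("http://google.com")   # enqueue the call
    print(result.get(timeout=30))                      # block until the worker replies
```

As with the original script, a worker has to be running first, e.g. `celery -A celery_blog worker --loglevel=info` from the module's directory.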
"metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from s3fs import S3FileSystem\n", 12 | "\n", 13 | "s3 = S3FileSystem(anon=True)\n", 14 | "s3.ls('dask-data/nyc-taxi/2015/')" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import pandas as pd\n", 26 | "import numpy as np\n", 27 | "def loadNYC1(xd):\n", 28 | " with s3.open(xd) as f:\n", 29 | " xd = pd.read_csv(f, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])\n", 30 | " return xd" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import pandas as pd\n", 42 | "import numpy as np\n", 43 | "def loadNYC2(xd):\n", 44 | " with s3.open(xd) as f:\n", 45 | " xd = pd.read_csv(f, parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])\n", 46 | " return xd" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "def loadAPIData(queryString):\n", 58 | " query = (queryString)\n", 59 | " return pd.read_json(query)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "import numpy as np\n", 71 | "def retMean(datfram,field):\n", 72 | " return np.mean(datfram[field])" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "def add(x,y):\n", 84 | " return x + y" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "dskTask = {\n", 96 | " 'query1':\"dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv\",\n", 97 | " 'field1':\"trip_distance\",\n", 98 | " 'query2':\"dask-data/nyc-taxi/2015/yellow_tripdata_2015-08.csv\",\n", 99 | " 'field2':\"trip_distance\",\n", 100 | " 'loadNYC1':(loadNYC1,'query1'),\n", 101 | " 'loadNYC2':(loadNYC2,'query2'),\n", 102 | " 'meanNYC':(retMean,'loadNYC1','field1'),\n", 103 | " 'meanFinalPermit':(retMean,'loadNYC2','field2'),\n", 104 | " 'sum':(add,'meanNYC','meanFinalPermit')\n", 105 | "}\n", 106 | "dskTask" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false, 114 | "scrolled": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "from dask.dot import dot_graph\n", 119 | "dot_graph(dskTask)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "from dask.distributed import Client\n", 131 | "c = Client('52.207.129.175:8786')\n", 132 | "c.get(dskTask,'sum')" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "c.shutdown()" 144 | ] 145 | } 146 | ], 147 | "metadata": { 148 | "anaconda-cloud": {}, 149 | "kernelspec": { 150 | "display_name": "Python 3", 151 | "language": "python", 152 | "name": "python3" 153 | }, 154 | "language_info": { 155 | "codemirror_mode": { 156 | "name": "ipython", 157 | "version": 3 158 | }, 159 | "file_extension": ".py", 
/DASK/InstructionsForDASK.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalsatam/Data-Pipelining/57740e66bdfc7b80f1d9f725272e1c8ed05bf033/DASK/InstructionsForDASK.docx
--------------------------------------------------------------------------------
/DASK/README.md:
--------------------------------------------------------------------------------
# Youtube Link for the Demo

https://youtu.be/x4HGbh8mJNQ

Check out this YouTube channel by Matthew Rocklin, one of the developers of DASK. Very informative.
```
https://www.youtube.com/channel/UCFYhuCL11p3oO9375_NbLQg
```
--------------------------------------------------------------------------------
/DataPipelining.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalsatam/Data-Pipelining/57740e66bdfc7b80f1d9f725272e1c8ed05bf033/DataPipelining.docx
--------------------------------------------------------------------------------
/Luigi/Instructions.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalsatam/Data-Pipelining/57740e66bdfc7b80f1d9f725272e1c8ed05bf033/Luigi/Instructions.docx
--------------------------------------------------------------------------------
/Luigi/LuigiDemo/aggregateTask.py:
--------------------------------------------------------------------------------
import luigi
from loadFromAPI import LoadFromAPI
from loadFromMySQL import LoadFromMySQL
import pandas as pd
import numpy as np

class AggregateTask(luigi.Task):

    def requires(self):
        # With two dependencies, yield each requirement instead of returning one
        yield LoadFromMySQL()
        yield LoadFromAPI()

    def run(self):
        finavg = [0, 0]
        index = 0
        for ip in self.input():
            # Each upstream target is a CSV; average its last column
            with ip.open("r") as in_file:
                users = pd.read_csv(in_file, delimiter=",")
                finavg[index] = np.mean(users.iloc[:, -1])
            index += 1
        with self.output().open("w") as out_file:
            out_file.write(str(finavg[0]))
            out_file.write("\n")
            out_file.write(str(finavg[1]))

    def output(self):
        return luigi.LocalTarget("data/aggregateOutput.txt")
--------------------------------------------------------------------------------
/Luigi/LuigiDemo/loadFromAPI.py:
--------------------------------------------------------------------------------
import luigi
import pandas as pd

class LoadFromAPI(luigi.Task):
    def run(self):
        # Pull the Colorado open-data resource and persist it as CSV
        query = "https://data.colorado.gov/resource/ncpu-fd8q.json"
        dd = pd.read_json(query)
        with self.output().open("w") as out_file:
            out_file.write(dd.to_csv())

    def output(self):
        return luigi.LocalTarget("data/loadFromAPI.csv")
--------------------------------------------------------------------------------
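The whole chain can be kicked off in one go by building the terminal SaveFinalOutput task (defined further below); Luigi follows requires() backwards and runs LoadFromMySQL, LoadFromAPI and AggregateTask first. A minimal sketch, assuming it is executed from the LuigiDemo directory with a data/ folder present:

```python
import luigi
from saveFinalOutput import SaveFinalOutput

if __name__ == "__main__":
    # local_scheduler=True avoids needing a running luigid central scheduler
    luigi.build([SaveFinalOutput()], local_scheduler=True)
```

The CLI equivalent would be along the lines of `python -m luigi --module saveFinalOutput SaveFinalOutput --local-scheduler`.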
/Luigi/LuigiDemo/loadFromMySQL.py:
--------------------------------------------------------------------------------
import mysql.connector
import luigi

class LoadFromMySQL(luigi.Task):
    def run(self):
        cnx = mysql.connector.connect(user='root', password='root', database='sampledb',
                                      host='localhost', port='3306')
        cursor = cnx.cursor()

        # Parameterized query; the id threshold is bound as a one-element tuple
        query = "SELECT id, score FROM core_stats WHERE id > %s"
        idip = 1

        cursor.execute(query, (idip,))
        with self.output().open("w") as out_file:
            out_file.write("id,score")
            out_file.write("\n")
            for (id, score) in cursor:
                s = str(id) + "," + str(score)
                out_file.write(s)
                out_file.write("\n")
        cursor.close()
        cnx.close()

    def output(self):
        return luigi.LocalTarget("data/loadFromMySQL.csv")
--------------------------------------------------------------------------------
/Luigi/LuigiDemo/saveFinalOutput.py:
--------------------------------------------------------------------------------
import luigi
import numpy as np
from aggregateTask import AggregateTask

class SaveFinalOutput(luigi.Task):

    def requires(self):
        return AggregateTask()

    def run(self):
        # Sum the two averages produced by AggregateTask
        with self.input().open("r") as in_file:
            arr = np.loadtxt(in_file)
            sm = np.sum(arr)
        with self.output().open("w") as out_file:
            out_file.write(str(sm))

    def output(self):
        return luigi.LocalTarget("data/saveFinalOutput.txt")
--------------------------------------------------------------------------------
/Luigi/Readme.md:
--------------------------------------------------------------------------------
# Youtube Link for Luigi Demo
https://youtu.be/NxsVEj0NeH4
--------------------------------------------------------------------------------
/Luigi/luigidemo.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalsatam/Data-Pipelining/57740e66bdfc7b80f1d9f725272e1c8ed05bf033/Luigi/luigidemo.mp4
--------------------------------------------------------------------------------
/Make/InstructionsForMakeDemo.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalsatam/Data-Pipelining/57740e66bdfc7b80f1d9f725272e1c8ed05bf033/Make/InstructionsForMakeDemo.docx
--------------------------------------------------------------------------------
/Make/MakeDemo.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalsatam/Data-Pipelining/57740e66bdfc7b80f1d9f725272e1c8ed05bf033/Make/MakeDemo.mp4
--------------------------------------------------------------------------------
/Make/MakeDemo/Makefile:
--------------------------------------------------------------------------------
saveFinalOutput: aggregateTask
	python saveFinalOutput.py aggregateTask.txt
	echo "Pipeline Completed"

aggregateTask: loadFromAPI1 loadFromAPI2 loadFromAPI1.csv loadFromAPI2.csv
	python aggregateTask.py loadFromAPI1.csv loadFromAPI2.csv

loadFromAPI1:
	python loadFromAPI1.py

loadFromAPI2:
	python loadFromAPI2.py

clean:
	rm loadFromAPI1.csv
	rm loadFromAPI2.csv
	rm aggregateTask.txt
	rm saveFinalOutput.txt
--------------------------------------------------------------------------------
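One quirk worth noting: loadFromAPI1.csv and loadFromAPI2.csv are listed as prerequisites of aggregateTask, but no rule builds them, so a serial `make` only succeeds because the phony loadFromAPI targets happen to run first; a parallel `make -j` can fail, and no step is ever skipped as up to date. A possible restructuring (a sketch, not the Makefile the demo video uses) keys every step on its output file:

```make
# Sketch: outputs as real file targets, so make can skip up-to-date steps.
# Recipe lines must be indented with a tab character.
saveFinalOutput.txt: aggregateTask.txt
	python saveFinalOutput.py aggregateTask.txt
	echo "Pipeline Completed"

aggregateTask.txt: loadFromAPI1.csv loadFromAPI2.csv
	python aggregateTask.py loadFromAPI1.csv loadFromAPI2.csv

loadFromAPI1.csv: loadFromAPI1.py
	python loadFromAPI1.py

loadFromAPI2.csv: loadFromAPI2.py
	python loadFromAPI2.py
```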
/Make/MakeDemo/aggregateTask.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import sys

# Average the last column of each CSV named on the command line
finavg = [0, 0]
index = 0
for ip in sys.argv[1:]:  # skip the script name itself
    with open(ip, "r") as in_file:
        users = pd.read_csv(in_file, delimiter=",")
        finavg[index] = np.mean(users.iloc[:, -1])
    index += 1

with open("aggregateTask.txt", "w") as out_file:
    out_file.write(str(finavg[0]))
    out_file.write("\n")
    out_file.write(str(finavg[1]))
--------------------------------------------------------------------------------
/Make/MakeDemo/loadFromAPI1.py:
--------------------------------------------------------------------------------
import pandas as pd

# Pull the Colorado open-data resource and persist it as CSV
query = "https://data.colorado.gov/resource/ncpu-fd8q.json"
dd = pd.read_json(query)
with open("loadFromAPI1.csv", "w") as out_file:
    out_file.write(dd.to_csv())
--------------------------------------------------------------------------------
/Make/MakeDemo/loadFromAPI2.py:
--------------------------------------------------------------------------------
import pandas as pd

# Pull a second Colorado open-data resource and persist it as CSV
query = "https://data.colorado.gov/resource/ujff-j2yj.json"
dd = pd.read_json(query)
with open("loadFromAPI2.csv", "w") as out_file:
    out_file.write(dd.to_csv())
--------------------------------------------------------------------------------
/Make/MakeDemo/saveFinalOutput.py:
--------------------------------------------------------------------------------
import sys
import numpy as np

# Sum the averages written by aggregateTask.py and save the result
with open(sys.argv[1], "r") as in_file:
    arr = np.loadtxt(in_file)
    sm = np.sum(arr)
with open("saveFinalOutput.txt", "w") as out_file:
    out_file.write(str(sm))
--------------------------------------------------------------------------------
/Make/Readme.md:
--------------------------------------------------------------------------------
# Youtube Link
https://youtu.be/Pb7lJMeXczo
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Data-Pipelining
This repository contains the artifacts for Team 1's presentation on Data Pipelining, covering
GNU Make, Luigi, DASK, Celery and Airflow.

The video recording of the presentation can be found on Blackboard under Tegrity classes.

## Videos:
Links to the videos we recorded to walk through each demo are given below.

### Airflow
Part 1
```
https://youtu.be/Qs02p3mh8m4
```
Part 2
```
https://youtu.be/83gPOMr6NOE
```

### Celery
```
https://youtu.be/XelpLUcpKLM
```

### DASK
```
https://www.youtube.com/watch?v=x4HGbh8mJNQ&t
```

### Luigi
```
https://www.youtube.com/watch?v=NxsVEj0NeH4&t
```

### Make
```
https://www.youtube.com/watch?v=Pb7lJMeXczo&t
```
--------------------------------------------------------------------------------
/Team1_DataPipelining.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vishalsatam/Data-Pipelining/57740e66bdfc7b80f1d9f725272e1c8ed05bf033/Team1_DataPipelining.pptx
--------------------------------------------------------------------------------