├── .gitignore
├── README.md
├── airflowRedditDag.py
├── airflowRedditPysparkDag.py
└── src
    ├── pyspark
    │   ├── averageUpvote.py
    │   └── numUniqueAuthors.py
    └── python
        ├── averageUpvote.py
        ├── numUniqueAuthors.py
        └── s3-reddit.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
data/*
notes
blog-notes
*.swp

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# airflow-pyspark-reddit

Example of using Airflow to schedule downloading data from S3 and launching Spark jobs

--------------------------------------------------------------------------------
/airflowRedditDag.py:
--------------------------------------------------------------------------------
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
import os

s3Bucket = 'reddit-comments'
s3Key = '2007/RC_2007-10'
redditFile = os.getcwd() + '/data/RC-s3-2007-10'
# can alternatively wrap methods in functions and use PythonOperator instead of BashOperator
srcDir = os.getcwd() + '/src/'

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 10, 14, 16, 12),
    'retries': 5,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG(
    's3Reddit', default_args=default_args, schedule_interval=timedelta(seconds=45))

# the plain-Python scripts live under src/python/
downloadData = BashOperator(
    task_id='download-data',
    bash_command='python ' + srcDir + 'python/s3-reddit.py ' + s3Bucket + ' ' + s3Key + ' ' + redditFile,
    dag=dag)

numUniqueAuthors = BashOperator(
    task_id='Unique-authors',
    bash_command='python ' + srcDir + 'python/numUniqueAuthors.py ' + redditFile,
    dag=dag)
numUniqueAuthors.set_upstream(downloadData)

averageUpvotes = BashOperator(
    task_id='average-upvotes',
    bash_command='python ' + srcDir + 'python/averageUpvote.py ' + redditFile,
    dag=dag)
averageUpvotes.set_upstream(downloadData)

--------------------------------------------------------------------------------
/airflowRedditPysparkDag.py:
--------------------------------------------------------------------------------
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
import os

s3Bucket = 'reddit-comments'
s3Key = '2007/RC_2007-10'
redditFile = os.getcwd() + '/data/RC-s3-2007-10'
srcDir = os.getcwd() + '/src/'

sparkSubmit = '/usr/local/spark/bin/spark-submit'

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    # 'start_date': datetime(2016, 10, 14, 16, 49),
    'start_date': datetime.now() - timedelta(seconds=45),
    'retries': 5,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('s3RedditPyspark', default_args=default_args, schedule_interval=timedelta(seconds=45))

downloadData = BashOperator(
    task_id='download-data',
    bash_command='python ' + srcDir + 'python/s3-reddit.py ' + s3Bucket + ' ' + s3Key + ' ' + redditFile,
    dag=dag)

numUniqueAuthors = BashOperator(
    task_id='Unique-authors',
    bash_command=sparkSubmit + ' ' + srcDir + 'pyspark/numUniqueAuthors.py ' + redditFile,
    dag=dag)
numUniqueAuthors.set_upstream(downloadData)

averageUpvotes = BashOperator(
    task_id='average-upvotes',
    bash_command=sparkSubmit + ' ' + srcDir + 'pyspark/averageUpvote.py ' + redditFile,
    dag=dag)
averageUpvotes.set_upstream(downloadData)

--------------------------------------------------------------------------------
/src/pyspark/averageUpvote.py:
--------------------------------------------------------------------------------
# PySpark job: compute the average upvote count across Reddit comments
import json
import sys

from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setMaster("local")
        .setAppName("My app"))
sc = SparkContext(conf=conf)

filename = sys.argv[1]
f = sc.textFile(filename)

avg = f.map(lambda line: json.loads(line)) \
       .filter(lambda record: 'ups' in record) \
       .map(lambda record: record['ups']) \
       .mean()

print("******************************* avg ************************* ")
print(avg)

--------------------------------------------------------------------------------
/src/pyspark/numUniqueAuthors.py:
--------------------------------------------------------------------------------
# PySpark job: count the number of unique comment authors in the Reddit data
import json
import sys

from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setMaster("local")
        .setAppName("My app"))
sc = SparkContext(conf=conf)

filename = sys.argv[1]
f = sc.textFile(filename)

# group the comments by author; counting the groups gives the number of distinct authors
authorCount = f.map(lambda line: json.loads(line)) \
               .filter(lambda record: 'author' in record) \
               .groupBy(lambda record: record['author']) \
               .count()

print('*************** authorCount ********************** ')
print(authorCount)

--------------------------------------------------------------------------------
/src/python/averageUpvote.py:
--------------------------------------------------------------------------------
# plain-Python baseline: compute the average upvote count across Reddit comments
import json
import sys

filename = sys.argv[1]

count = 0
total = 0
with open(filename) as f:
    for line in f:
        jsonLine = json.loads(line)
        if 'ups' in jsonLine:
            count += 1
            total += jsonLine['ups']
print(total / count)

--------------------------------------------------------------------------------
/src/python/numUniqueAuthors.py:
--------------------------------------------------------------------------------
# plain-Python baseline: count the number of unique comment authors in the Reddit data
import json
import sys

filename = sys.argv[1]
authorSet = set()

with open(filename) as f:
    for line in f:
        jsonLine = json.loads(line)
        if 'author' in jsonLine:
            authorSet.add(jsonLine['author'])
print(len(authorSet))

--------------------------------------------------------------------------------
/src/python/s3-reddit.py:
--------------------------------------------------------------------------------
# download a Reddit comments dump from S3 to a local file
import sys

import boto3

s3Bucket = sys.argv[1]
s3Key = sys.argv[2]
outfile = sys.argv[3]

s3 = boto3.resource('s3')
s3.meta.client.download_file(s3Bucket, s3Key, outfile)

--------------------------------------------------------------------------------
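
Note: both DAGs run everything through BashOperator, and the comment in airflowRedditDag.py points out that the same steps could instead be wrapped in functions and scheduled with PythonOperator. Below is a minimal sketch of that alternative for the download task only; download_reddit_file is an illustrative helper that is not part of the repo, and the snippet assumes the s3Bucket, s3Key, redditFile, and dag objects already defined at module level in airflowRedditDag.py, plus a working boto3/AWS credential setup.

# Sketch only: a PythonOperator variant of the download-data task.
import boto3
from airflow.operators.python_operator import PythonOperator


def download_reddit_file(s3_bucket, s3_key, outfile):
    # same logic as src/python/s3-reddit.py, inlined as a callable
    s3 = boto3.resource('s3')
    s3.meta.client.download_file(s3_bucket, s3_key, outfile)


downloadData = PythonOperator(
    task_id='download-data',
    python_callable=download_reddit_file,
    op_kwargs={'s3_bucket': s3Bucket, 's3_key': s3Key, 'outfile': redditFile},
    dag=dag)

The downstream tasks would keep the same set_upstream(downloadData) wiring as in the existing DAGs.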