├── .gitignore
├── README.md
├── airflow
│   └── dags
│       ├── generate_twitter.py
│       ├── subdags
│       │   └── twitter_subdag.py
│       └── twitter_airflow.py
├── celery_app
│   ├── __init__.py
│   ├── celeryapp.py
│   ├── more_tasks.py
│   ├── pytest_stock_tasks.py
│   ├── tasks.py
│   └── test_stock_tasks.py
├── data
│   ├── example_chatlogs.json
│   ├── mvt.csv
│   ├── mvt_cleaned.csv
│   └── tweets
│       └── latest_links.txt
├── deploy
│   ├── celery_service
│   ├── celerybeat_service
│   ├── example_variables.yml
│   ├── flower_service
│   ├── jupyter_service
│   ├── jupyterhub_service
│   ├── luigi_service
│   ├── pipelines_playbook.yml
│   ├── pipelines_variables.yml
│   └── templates
│       ├── jupyterhub_config.py
│       └── sshd_config
├── example_prod.cfg
├── luigi
│   ├── luigi.cfg
│   ├── taxi_data_import.py
│   └── wordcount_map_reduce.py
├── notebooks
│   ├── Chapter 3 - Basic Celery Tasks.ipynb
│   ├── Chapter 3 - Complex Task Chains.ipynb
│   ├── Chapter 3 - First Steps with Celery.ipynb
│   ├── Chapter 3 - Monitoring Tasks.ipynb
│   ├── Chapter 4 - Dask Distributed.ipynb
│   ├── Chapter 4 - First Steps with Dask.ipynb
│   ├── Chapter 4 - Learning Dask Bags.ipynb
│   ├── Chapter 6 - Introduction to PySpark.ipynb
│   ├── Chapter 6 - Introduction to Spark Streaming.ipynb
│   ├── Chapter 7 - Testing with Hypothesis.ipynb
│   └── Extras (Chapter 4) - Clean Vehicle Theft Data.ipynb
├── requirements.txt
└── streaming
    └── tweepy_stream.py

/.gitignore:
--------------------------------------------------------------------------------
1 | data/*.json.gz
2 | *.png
3 | config/*
4 | */config/*
5 | *.db
6 | *.log
7 | *~
8 | *.pyc
9 | venv/*
10 | *.*/
11 | *.pid
12 | *.db
13 | real_variables.yml
14 | data/tweets/*.csv
15 |
16 | # airflow configs / logs
17 | airflow.cfg
18 | unittests.cfg
19 | */logs/*
20 |
21 | django/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Data Pipelines with Python (video edition)
2 |
3 | Welcome to the code repository for [Data Pipelines with Python](http://shop.oreilly.com/product/0636920055334.do)! If you have any questions, reach out to @kjam on Twitter or GitHub.
4 |
5 | ### Code Structure
6 |
7 | Most of the code covered in the videos is here, but not all of it. I highly recommend you take the time to type out all the code along with the videos and simply use these scripts to "double check" or remind yourself of the work you've already completed.
8 |
9 | ### Installation
10 |
11 | Install the dependencies with the requirements.txt file:
12 |
13 | ```pip install -r requirements.txt```
14 |
15 |
16 | ### Yahoo Finance API
17 |
18 | There is a [good writeup in German for the Finance API](http://brusdeylins.info/tips_and_tricks/yahoo-finance-api/) which I used as a starting point for downloading more recent data.
19 |
20 | ### Python2 v. Python3
21 |
22 | This repository aims to be compatible with both versions. Please let me know if you run into any bugs!
23 |
24 |
25 | ### Ansible Playbook
26 |
27 | I've included a working playbook in the deploy folder; it is intended as a template rather than something to run directly. If you try to run it unmodified, you will likely receive some errors. Please read through the playbook, take a look at the directives, and determine which ones you need and which you don't. It also requires a .ssh/authorized_keys file as well as a config file located at `celery_app/config/prod.cfg`. If you run into other errors, I highly recommend reading through [the Ansible
28 | documentation](http://docs.ansible.com/ansible/) or searching on StackOverflow.
29 |
30 | ### Corrections?
31 | 32 | If you find any issues in these code examples, feel free to submit an Issue or Pull Request. I appreciate your input! 33 | 34 | ### Questions? 35 | 36 | Reach out to @kjam on Twitter or GitHub. @kjam is also often on freenode. :) 37 | -------------------------------------------------------------------------------- /airflow/dags/generate_twitter.py: -------------------------------------------------------------------------------- 1 | """ Simple example of creating subdags and generating work dynamically""" 2 | from airflow import DAG 3 | from airflow.hooks import SqliteHook 4 | from airflow.operators import BashOperator, EmailOperator, SubDagOperator, \ 5 | PythonOperator, BranchPythonOperator 6 | from twitter_airflow import search_twitter, RAW_TWEET_DIR 7 | from subdags.twitter_subdag import subdag 8 | from datetime import datetime, timedelta 9 | import pandas as pd 10 | import re 11 | import random 12 | 13 | 14 | SEARCH_TERMS = ['#python', '#pydata', '#airflow', 'data wrangling', 15 | 'data pipelines'] 16 | 17 | 18 | default_args = { 19 | 'owner': 'admin', 20 | 'depends_on_past': False, 21 | 'start_date': datetime.now() - timedelta(days=4), 22 | 'retries': 1, 23 | 'retry_delay': timedelta(minutes=5), 24 | } 25 | 26 | dag = DAG('generate_twitter_dags', default_args=default_args, 27 | schedule_interval='@daily') 28 | 29 | 30 | def fill_terms(my_terms=SEARCH_TERMS, **kwargs): 31 | """ Fill sqlite database with a few search terms. """ 32 | sqlite = SqliteHook('twitter_sqlite') 33 | conn = sqlite.get_conn() 34 | df = pd.DataFrame(my_terms, columns=['search_term']) 35 | try: 36 | df.to_sql('twitter_terms', conn) 37 | except ValueError: 38 | # table already exists 39 | pass 40 | 41 | 42 | def generate_search_terms(**kwargs): 43 | """ Generate subdag to search twitter for terms. 
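Returns the task_id of one randomly chosen search task; the
BranchPythonOperator uses this return value to decide which
downstream branch to run.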
""" 44 | sqlite = SqliteHook('twitter_sqlite') 45 | conn = sqlite.get_conn() 46 | query = "select * from twitter_terms" 47 | df = pd.read_sql_query(query, conn) 48 | return random.choice([ 49 | 'search_{}_twitter'.format(re.sub(r'\W+', '', t)) 50 | for t in df.search_term.values]) 51 | 52 | 53 | fill_search_terms = PythonOperator(task_id='fill_terms', 54 | provide_context=True, 55 | python_callable=fill_terms, 56 | dag=dag) 57 | 58 | 59 | gen_search_terms = BranchPythonOperator(task_id='generate_search_terms', 60 | provide_context=True, 61 | python_callable=generate_search_terms, 62 | dag=dag) 63 | 64 | 65 | email_links = EmailOperator(task_id='email_best_links', 66 | to='MYEMAIL@MYSITE.com', 67 | subject='Latest popular links', 68 | html_content='Check out the latest!!', 69 | files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)], 70 | dag=dag) 71 | 72 | 73 | sub = SubDagOperator(subdag=subdag, 74 | task_id='insert_and_id_pop', 75 | trigger_rule='one_success', 76 | dag=dag) 77 | 78 | 79 | clear_latest = BashOperator(bash_command='rm -rf {}/latest_links.txt'.format( 80 | RAW_TWEET_DIR), task_id='clear_latest', dag=dag) 81 | 82 | 83 | gen_search_terms.set_upstream(fill_search_terms) 84 | 85 | for term in SEARCH_TERMS: 86 | term_without_punctuation = re.sub(r'\W+', '', term) 87 | simple_search = PythonOperator( 88 | task_id='search_{}_twitter'.format(term_without_punctuation), 89 | provide_context=True, 90 | python_callable=search_twitter, 91 | dag=dag, 92 | params={'query': term}) 93 | simple_search.set_upstream(gen_search_terms) 94 | simple_search.set_downstream(sub) 95 | 96 | sub.set_downstream(email_links) 97 | email_links.set_downstream(clear_latest) 98 | -------------------------------------------------------------------------------- /airflow/dags/subdags/twitter_subdag.py: -------------------------------------------------------------------------------- 1 | """ Simple subdag example """ 2 | from airflow import DAG 3 | from airflow.operators import PythonOperator 4 | from twitter_airflow import csv_to_sqlite, identify_popular_links 5 | from datetime import datetime, timedelta 6 | 7 | 8 | default_args = { 9 | 'owner': 'admin', 10 | 'depends_on_past': False, 11 | 'start_date': datetime(2016, 1, 1), 12 | 'retries': 1, 13 | 'retry_delay': timedelta(minutes=5), 14 | } 15 | 16 | subdag = DAG('generate_twitter_dags.insert_and_id_pop', 17 | default_args=default_args) 18 | 19 | move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite', 20 | provide_context=True, 21 | python_callable=csv_to_sqlite, 22 | dag=subdag) 23 | 24 | id_popular = PythonOperator(task_id='identify_popular_links', 25 | provide_context=True, 26 | python_callable=identify_popular_links, 27 | dag=subdag, 28 | params={'write_mode': 'a'}) 29 | 30 | id_popular.set_upstream(move_tweets_to_sqlite) 31 | -------------------------------------------------------------------------------- /airflow/dags/twitter_airflow.py: -------------------------------------------------------------------------------- 1 | """ Simple Airflow data pipeline example using Twitter API """ 2 | from airflow import DAG 3 | from airflow.operators import EmailOperator, PythonOperator 4 | from airflow.hooks import SqliteHook 5 | from tweepy import API, Cursor, OAuthHandler 6 | from configparser import ConfigParser 7 | from csv import DictWriter, writer 8 | from collections import Counter 9 | from datetime import datetime, timedelta 10 | import ast 11 | import itertools 12 | import glob 13 | import shutil 14 | import pandas as pd 15 | import os.path 16 | 17 | 
RAW_TWEET_DIR = os.path.abspath(os.path.join(__file__, '../../../data/tweets/')) 18 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, 19 | '../../../config/prod.cfg')) 20 | MAX_TWEEPY_PAGE = 300 21 | 22 | default_args = { 23 | 'owner': 'admin', 24 | 'depends_on_past': False, 25 | 'start_date': datetime.now() - timedelta(days=4), 26 | 'retries': 1, 27 | 'retry_delay': timedelta(minutes=5), 28 | } 29 | 30 | dag = DAG('twitter_links', default_args=default_args, 31 | schedule_interval='@daily') 32 | 33 | 34 | def extract_tweet_data(tweepy_obj, query): 35 | """ Extract relevant and serializable data from a tweepy Tweet object 36 | params: 37 | tweepy_obj: Tweepy Tweet Object 38 | query: str 39 | returns dict 40 | """ 41 | return { 42 | 'user_id': tweepy_obj.user.id, 43 | 'user_name': tweepy_obj.user.name, 44 | 'user_screenname': tweepy_obj.user.screen_name, 45 | 'user_url': tweepy_obj.user.url, 46 | 'user_description': tweepy_obj.user.description, 47 | 'user_followers': tweepy_obj.user.followers_count, 48 | 'user_friends': tweepy_obj.user.friends_count, 49 | 'created': tweepy_obj.created_at.isoformat(), 50 | 'text': tweepy_obj.text, 51 | 'hashtags': [ht.get('text') for ht in 52 | tweepy_obj.entities.get('hashtags')], 53 | 'mentions': [(um.get('id'), um.get('screen_name')) for um in 54 | tweepy_obj.entities.get('user_mentions')], 55 | 'urls': [url.get('expanded_url') for url in 56 | tweepy_obj.entities.get('urls')], 57 | 'tweet_id': tweepy_obj.id, 58 | 'is_quote_status': tweepy_obj.is_quote_status, 59 | 'favorite_count': tweepy_obj.favorite_count, 60 | 'retweet_count': tweepy_obj.retweet_count, 61 | 'reply_status_id': tweepy_obj.in_reply_to_status_id, 62 | 'lang': tweepy_obj.lang, 63 | 'source': tweepy_obj.source, 64 | 'location': tweepy_obj.coordinates, 65 | 'query': query, 66 | } 67 | 68 | 69 | def search_twitter(**kwargs): 70 | """ simple search for a query in public tweets""" 71 | query = kwargs.get('params').get('query') 72 | config = ConfigParser() 73 | config.read(CONFIG_FILE) 74 | auth = OAuthHandler(config.get('twitter', 'consumer_key'), 75 | config.get('twitter', 'consumer_secret')) 76 | auth.set_access_token(config.get('twitter', 'access_token'), 77 | config.get('twitter', 'access_token_secret')) 78 | api = API(auth) 79 | 80 | all_tweets = [] 81 | page_num = 0 82 | since_date = datetime.strptime( 83 | kwargs.get('ds'), '%Y-%m-%d').date() - timedelta(days=1) 84 | query += ' since:{} until:{}'.format(since_date.strftime('%Y-%m-%d'), 85 | kwargs.get('ds')) 86 | print('searching twitter with: %s' % query) 87 | for page in Cursor(api.search, q=query, monitor_rate_limit=True, 88 | wait_on_rate_limit=True).pages(): 89 | all_tweets.extend([extract_tweet_data(t, query) for t in page]) 90 | page_num += 1 91 | if page_num > MAX_TWEEPY_PAGE: 92 | break 93 | 94 | # if it's an empty list, stop here 95 | if not len(all_tweets): 96 | return 97 | 98 | filename = '{}/{}_{}.csv'.format( 99 | RAW_TWEET_DIR, query, datetime.now().strftime('%m%d%Y%H%M%S')) 100 | 101 | with open(filename, 'w') as raw_file: 102 | raw_wrtr = DictWriter(raw_file, fieldnames=all_tweets[0].keys()) 103 | raw_wrtr.writeheader() 104 | raw_wrtr.writerows(all_tweets) 105 | 106 | 107 | def csv_to_sqlite(directory=RAW_TWEET_DIR, **kwargs): 108 | """ Very basic csv to sqlite pipeline using pandas 109 | params: 110 | directory: str (file path to csv files) 111 | """ 112 | sqlite = SqliteHook('twitter_sqlite') 113 | conn = sqlite.get_conn() 114 | for fname in glob.glob('{}/*.csv'.format(directory)): 115 | if '_read' not in fname: 116 
| try:
117 | df = pd.read_csv(fname)
118 | df.to_sql('tweets', conn, if_exists='append', index=False)
119 | shutil.move(fname, fname.replace('.csv', '_read.csv'))
120 | except pd.io.common.EmptyDataError:
121 | # probably an io error with another task / open file
122 | continue
123 |
124 |
125 | def identify_popular_links(directory=RAW_TWEET_DIR, write_mode='w', **kwargs):
126 | """ Identify the most popular links from the last day of tweets in the db
127 | Writes them to latest_links.txt in the RAW_TWEET_DIR
128 | (or directory kwarg)
129 | """
130 | sqlite = SqliteHook('twitter_sqlite')
131 | conn = sqlite.get_conn()
132 | query = """select * from tweets where
133 | created > date('now', '-1 days') and urls is not null
134 | order by favorite_count"""
135 | df = pd.read_sql_query(query, conn)
136 | df.urls = df.urls.map(ast.literal_eval)
137 | cntr = Counter(itertools.chain.from_iterable(df.urls.values))
138 | with open('{}/latest_links.txt'.format(directory), write_mode) as latest:
139 | wrtr = writer(latest)
140 | wrtr.writerow(['url', 'count'])
141 | wrtr.writerows(cntr.most_common(5))
142 |
143 |
144 | simple_search = PythonOperator(task_id='search_twitter',
145 | provide_context=True,
146 | python_callable=search_twitter,
147 | dag=dag,
148 | params={'query': '#python'})
149 |
150 |
151 | move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite',
152 | provide_context=True,
153 | python_callable=csv_to_sqlite,
154 | dag=dag)
155 |
156 |
157 | id_popular = PythonOperator(task_id='identify_popular_links',
158 | provide_context=True,
159 | python_callable=identify_popular_links,
160 | dag=dag)
161 |
162 |
163 | email_links = EmailOperator(task_id='email_best_links',
164 | to='katharine@kjamistan.com',
165 | subject='Latest popular links',
166 | html_content='Check out the latest!!',
167 | files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
168 | dag=dag)
169 |
170 |
171 | simple_search.set_downstream(move_tweets_to_sqlite)
172 | id_popular.set_upstream(move_tweets_to_sqlite)
173 | email_links.set_upstream(id_popular)
174 |
--------------------------------------------------------------------------------
/celery_app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/data-pipelines-course/2c8c8420d220df9168561b9157fb05cc28fb9bc0/celery_app/__init__.py
--------------------------------------------------------------------------------
/celery_app/celeryapp.py:
--------------------------------------------------------------------------------
1 | ''' Celery settings and app '''
2 | from celery import Celery
3 | from kombu import Queue
4 | from configparser import ConfigParser
5 | from datetime import datetime, timedelta
6 | import os
7 |
8 |
9 |
10 | config = ConfigParser()
11 | current_dir = os.path.dirname(os.path.realpath(__file__))
12 |
13 | if os.environ.get('DEPLOY') == 'PROD':
14 | config.read(os.path.join(current_dir, 'config/prod.cfg'))
15 | else:
16 | config.read(os.path.join(current_dir, 'config/dev.cfg'))
17 |
18 | app = Celery('tasks', broker=config.get('celery', 'broker_url'))
19 |
20 | CELERY_CONFIG = {
21 | 'CELERY_IMPORTS': ['tasks'],
22 | 'CELERY_TIMEZONE': 'Europe/Berlin',
23 | 'CELERY_IGNORE_RESULT': False,
24 | 'CELERY_TRACK_STARTED': True,
25 | 'CELERY_DEFAULT_QUEUE': 'default',
26 | 'CELERY_QUEUES': (Queue('default'), Queue('priority'),),
27 | 'CELERY_DEFAULT_RATE_LIMIT': '20/s',
28 | 'CELERY_RESULT_BACKEND': 'amqp://',
29 | 'CELERY_CHORD_PROPAGATES': True,
30 |
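# hard per-task time limit in seconds; a worker child running a task
# longer than this is killed and replaced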
'CELERYD_TASK_TIME_LIMIT': 7200,
31 | 'CELERYD_POOL_RESTARTS': True,
32 | 'CELERYD_TASK_LOG_FORMAT':
33 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
34 | 'CELERY_ANNOTATIONS': {
35 | 'celery.chord_unlock': {'hard_time_limit': 360},
36 | },
37 | 'CELERYBEAT_SCHEDULE': {
38 | 'get_stock_info_60s': {
39 | 'task': 'tasks.get_stock_info',
40 | 'schedule': timedelta(seconds=60),
41 | 'args': ('FB', datetime(2016, 1, 1), datetime.today())
42 | }
43 | }
44 | }
45 |
46 |
47 | app.conf.update(**CELERY_CONFIG)
48 |
--------------------------------------------------------------------------------
/celery_app/more_tasks.py:
--------------------------------------------------------------------------------
1 | ''' Here are a few options for chp3 homework '''
2 | from pandas_datareader import data
3 | from celeryapp import app
4 | import pandas as pd
5 | from datetime import datetime, timedelta
6 |
7 |
8 | @app.task
9 | def current_earnings(stock):
10 | ''' return json response of current year EPS from yahoo finance
11 | params:
12 | stock str
13 | returns:
14 | json
15 | '''
16 | url = 'http://finance.yahoo.com/d/quotes.csv?s={}&f=se7'.format(stock)
17 | cy = pd.read_csv(url, names=['Stock', 'Current Year EPS'])
18 | return cy.to_json()
19 |
20 |
21 | @app.task
22 | def yoy_change(stock, source='yahoo'):
23 | ''' return year over year change for a given stock from today.
24 | params:
25 | stock str
26 | kwargs:
27 | source str
28 | returns float
29 | '''
30 | start = datetime.today() - timedelta(days=365) # not accounting for leap yr
31 | df = data.DataReader(stock, source, start, datetime.today())
32 | return ((df.ix[-1]['Adj Close'] / df.ix[0]['Adj Close']) - 1) * 100
--------------------------------------------------------------------------------
/celery_app/pytest_stock_tasks.py:
--------------------------------------------------------------------------------
1 | import json
2 | from tasks import get_stock_info
3 | from datetime import datetime
4 |
5 |
6 | def test_get_stock_info():
7 | start_date = datetime(2013, 1, 1)
8 | end_date = datetime(2013, 2, 1)
9 | stock = 'FB'
10 |
11 | result = get_stock_info(stock, start_date, end_date)
12 | assert isinstance(result, str)
13 | result = json.loads(result)
14 | assert 'High min' in result.keys()
15 | result_stock = ' '.join(result['High min'].keys())
16 | assert result_stock == stock
17 | price = result['High min'][stock]
18 | assert isinstance(price, float)
19 | assert price > 0
--------------------------------------------------------------------------------
/celery_app/tasks.py:
--------------------------------------------------------------------------------
1 | ''' Task module for showing celery functionality. '''
2 | from pandas_datareader import data
3 | from celeryapp import app
4 | from urllib.error import HTTPError, URLError
5 | import pandas as pd
6 | import logging
7 |
8 |
9 | @app.task
10 | def get_stock_info(stock, start, end, source='yahoo'):
11 | ''' Collect aggregate info for a stock given a daterange.
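Aggregates min/max/mean/median of each price column and returns the
result as a JSON string, so it serializes cleanly as a task result.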
12 | params:
13 | stock: str
14 | start: datetime
15 | end: datetime
16 | kwargs:
17 | source (optional): str
18 | returns:
19 | json
20 | '''
21 | logging.debug('start and end types are: %s %s', type(start), type(end))
22 | df = data.DataReader(stock, source, start, end)
23 | df['Stock'] = stock
24 | agg = df.groupby('Stock').agg({
25 | 'Open': ['min', 'max', 'mean', 'median'],
26 | 'Adj Close': ['min', 'max', 'mean', 'median'],
27 | 'Close': ['min', 'max', 'mean', 'median'],
28 | 'High': ['min', 'max', 'mean', 'median'],
29 | 'Low': ['min', 'max', 'mean', 'median'],
30 | })
31 | agg.columns = [' '.join(col).strip() for col in agg.columns.values]
32 | return agg.to_json()
33 |
34 |
35 | def calc_ratio(price, compare):
36 | ''' Calculates ratio and converts it into percentage
37 | when given stock price and comparison price
38 | params:
39 | price: float
40 | compare: float
41 | returns float
42 | '''
43 | return round(((price / compare) - 1) * 100, 2)
44 |
45 |
46 | @app.task(bind=True)
47 | def price_range(self, stock, start, end, source='yahoo'):
48 | ''' Compare today's price to see if it is nearer the max or min of closing
49 | prices in a certain daterange.
50 | params:
51 | stock: str
52 | start: datetime
53 | end: datetime
54 | kwargs:
55 | source (optional): str
56 | returns:
57 | dictionary
58 | '''
59 | df = data.DataReader(stock, source, start, end)
60 | period_high = df['Adj Close'].max()
61 | period_mean = df['Adj Close'].mean()
62 | period_low = df['Adj Close'].min()
63 | resp = {
64 | 'stock': stock,
65 | 'period_high': period_high,
66 | 'period_low': period_low,
67 | 'period_mean': period_mean,
68 | 'period_start': start,
69 | 'period_end': end,
70 | }
71 | url = 'http://finance.yahoo.com/d/quotes.csv?s={}&f=sat1'.format(stock)
72 | try:
73 | td = pd.read_csv(url, names=['Stock', 'Price', 'Last Trade'])
74 | except (HTTPError, URLError) as exc:
75 | logging.exception('pandas read_csv error for yahoo finance URL: %s',
76 | url)
77 | raise self.retry(exc=exc)
78 | td_price = td['Price'].mean()
79 | resp['todays_price'] = td_price
80 | if abs(td_price - period_high) < abs(td_price - period_low):
81 | resp['result'] = 'higher'
82 | else:
83 | resp['result'] = 'lower'
84 | resp['percent_change'] = calc_ratio(td_price, period_mean)
85 | return resp
86 |
87 |
88 | @app.task
89 | def determine_buy(result):
90 | ''' Extremely naive buy logic (for example's sake)
91 | params:
92 | result: dict result from the price_range task
93 | return:
94 | boolean
95 | '''
96 | if result['result'] == 'lower':
97 | return True
98 | return False
99 |
100 |
101 | @app.task
102 | def sort_results(results, key='todays_price'):
103 | ''' Sort by given key, defaults to todays_price
104 | params:
105 | results: list of results from price_range task
106 | kwargs:
107 | key: str (must be in price_range return dictionary)
108 | return sorted list
109 | '''
110 | return sorted(results, key=lambda x: x[key])
111 |
--------------------------------------------------------------------------------
/celery_app/test_stock_tasks.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import json
3 | from tasks import get_stock_info
4 | from datetime import datetime
5 |
6 |
7 | class TestStockInfo(unittest.TestCase):
8 | def setUp(self):
9 | self.start_date = datetime(2013, 1, 1)
10 | self.end_date = datetime(2013, 2, 1)
11 | self.stock = 'FB'
12 |
13 | def test_get_stock_info(self):
14 | result = get_stock_info(self.stock, self.start_date, self.end_date)
15 |
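# Calling the task function directly (rather than with .delay()) runs it
# synchronously in-process, so no broker or worker is needed for this test.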
self.assertIsInstance(result, str)
16 | result = json.loads(result)
17 | self.assertIn('High min', result.keys())
18 | stock = ' '.join(result['High min'].keys())
19 | self.assertEqual(stock, self.stock)
20 | price = result['High min'][self.stock]
21 | self.assertIsInstance(price, float)
22 | self.assertTrue(price > 0)
23 |
--------------------------------------------------------------------------------
/data/tweets/latest_links.txt:
--------------------------------------------------------------------------------
1 | url,count
2 |
--------------------------------------------------------------------------------
/deploy/celery_service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=celery
3 | After=syslog.target network.target
4 |
5 | [Service]
6 | Environment=DEPLOY=PROD
7 | ExecStart=/home/deploy/venv/bin/celery multi start 4 -A tasks --loglevel=debug --logfile=/var/log/celery/%N.log
8 | ExecStop=/home/deploy/venv/bin/celery multi stopwait 4 -A tasks
9 | ExecReload=/home/deploy/venv/bin/celery multi restart 4 -A tasks --loglevel=debug --logfile=/var/log/celery/%N.log
10 | # Requires systemd version 211 or newer
11 | WorkingDirectory=/var/www/pipelines/celery_app
12 | Type=forking
13 | StandardError=syslog
14 | User=deploy
15 | Group=deploy
16 | TimeoutSec=3600
17 |
18 | [Install]
19 | WantedBy=multi-user.target
20 |
--------------------------------------------------------------------------------
/deploy/celerybeat_service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=celerybeat
3 | After=syslog.target
4 |
5 | [Service]
6 | ExecStart=/home/deploy/venv/bin/celery -A tasks beat --loglevel=debug --logfile=/var/log/celery/%n.log
7 | # Requires systemd version 211 or newer
8 | WorkingDirectory=/var/www/pipelines/celery_app
9 | Restart=always
10 | KillSignal=SIGTERM
11 | Type=simple
12 | StandardError=syslog
13 | NotifyAccess=all
14 | User=deploy
15 | Group=deploy
16 |
17 | [Install]
18 | WantedBy=multi-user.target
19 |
--------------------------------------------------------------------------------
/deploy/example_variables.yml:
--------------------------------------------------------------------------------
1 | deploy_url: pipelines.foo.com
2 | deploy_pass: $6$rw0zQQOmZqt1KsDFksakjio291fSzScf3qGxedkxt249FfFskwonDDlsso$32onLzXth3ZHK0
3 | deploy_email: youremail@you.com
4 | rabbitmq_pass: hereisapasswordasanexample
--------------------------------------------------------------------------------
/deploy/flower_service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=celery flower
3 | After=syslog.target
4 |
5 | [Service]
6 | Environment=DEPLOY=PROD
7 | ExecStart=/home/deploy/venv/bin/celery flower -A tasks --port=5566 --basic_auth=admin:getouttahere
8 | # Requires systemd version 211 or newer
9 | WorkingDirectory=/var/www/pipelines/celery_app
10 | Restart=always
11 | KillSignal=SIGTERM
12 | Type=simple
13 | StandardError=syslog
14 | NotifyAccess=all
15 | User=deploy
16 | Group=deploy
17 |
18 | [Install]
19 | WantedBy=multi-user.target
20 |
--------------------------------------------------------------------------------
/deploy/jupyter_service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=jupyter
3 |
4 | [Service]
5 | PIDFile=/var/run/jupyter.pid
6 | ExecStart=/home/deploy/venv/bin/jupyter notebook --no-browser
7 |
KillSignal=SIGTERM 8 | WorkingDirectory=/var/www/pipelines/ 9 | 10 | [Install] 11 | WantedBy=multi-user.target 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /deploy/jupyterhub_service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Jupyterhub 3 | After=syslog.target network.target 4 | 5 | 6 | [Service] 7 | User=root 8 | Environment=PYTHONPATH=/home/deploy/venv/bin/python 9 | Environment=VIRTUAL_ENV=/home/deploy/venv 10 | Environment=DEPLOY=PROD 11 | ExecStart=/home/deploy/venv/bin/jupyterhub -f /var/www/pipelines/notebooks/jupyterhub_config.py 12 | WorkingDirectory=/var/www/pipelines/notebooks 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /deploy/luigi_service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Luigi 3 | After=syslog.target network.target 4 | 5 | [Service] 6 | User=root 7 | Environment=PYTHONPATH=/home/deploy/venv/bin/python 8 | Environment=VIRTUAL_ENV=/home/deploy/venv 9 | Environment=DEPLOY=PROD 10 | ExecStart=/home/deploy/venv/bin/luigid --logdir /var/log/luigi --pidfile /var/run/luigi.pid 11 | WorkingDirectory=/var/www/pipelines 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /deploy/pipelines_playbook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: pipelines 3 | become: yes 4 | 5 | tasks: 6 | - include_vars: pipelines_variables.yml 7 | 8 | - name: get jessie backports 9 | apt_repository: repo='deb http://ftp.debian.org/debian jessie-backports main' state=present 10 | 11 | - name: update 12 | apt: update_cache=yes 13 | 14 | - name: install fail2ban 15 | apt: pkg=fail2ban state=installed 16 | 17 | - name: add deploy user 18 | user: name=deploy shell=/bin/bash password={{ deploy_pass }} 19 | 20 | - name: add ssh dir 21 | file: path=/home/deploy/.ssh state=directory owner=deploy group=deploy mode=0700 22 | 23 | - name: move key to deploy user folder 24 | become_user: deploy 25 | copy: src=~/.ssh/authorized_keys dest=/home/deploy/.ssh/authorized_keys 26 | 27 | - name: copy ssh deploy key files 28 | become_user: deploy 29 | copy: src=~/.ssh/deploy dest=/home/deploy/.ssh/ 30 | 31 | - name: Allow deploy to have sudo 32 | lineinfile: dest=/etc/sudoers state=present line='deploy ALL=(ALL:ALL) ALL' 33 | 34 | - name: change perms for ssh 35 | file: path=/home/deploy/.ssh/deploy owner=deploy group=deploy mode=0400 36 | 37 | - name: change sshd 38 | copy: src=templates/sshd_config dest=/etc/ssh/sshd_config 39 | notify: 40 | - restart ssh 41 | 42 | - name: install pip3 43 | apt: name=python3-pip state=installed 44 | 45 | - name: virtualenv 46 | shell: pip3 install virtualenv 47 | 48 | - name: install hdf5 49 | apt: pkg=libhdf5-dev state=installed install_recommends=yes 50 | 51 | - name: install hdf5 tools 52 | apt: pkg=hdf5-tools state=installed install_recommends=yes 53 | 54 | - name: install sqlite 55 | apt: pkg=sqlite3 state=installed 56 | 57 | - name: npm 58 | apt: name=npm state=installed 59 | 60 | - name: nodejs 61 | apt: name=nodejs-legacy state=installed 62 | 63 | - name: install npm http proxy 64 | npm: name=configurable-http-proxy global=yes state=present 65 | 66 | - name: install python crypto 67 | apt: name=python-cryptography state=installed install_recommends=yes 
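# NOTE: certbot's standalone authenticator below starts its own temporary
# server to answer the challenge, so the port it binds must be free when
# these tasks run.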
68 | 69 | - name: install certbot 70 | apt: name=certbot state=installed install_recommends=yes default_release=jessie-backports 71 | 72 | - name: install letsencrypt 73 | apt: name=letsencrypt state=installed install_recommends=yes 74 | 75 | - name: run certbot 76 | shell: certbot certonly --standalone -d {{ deploy_url }} --standalone-supported-challenges tls-sni-01 -n -m {{ deploy_email }} --keep-until-expiring --agree-tos 77 | 78 | - name: rabbitmq 79 | apt: name=rabbitmq-server state=installed 80 | 81 | - name: redis 82 | apt: name=redis-server state=installed 83 | 84 | - name: graphviz 85 | apt: name=graphviz state=installed 86 | 87 | - name: git 88 | apt: name=git state=installed 89 | 90 | - name: change app perms to fetch 91 | file: path=/var/www/pipelines mode=0777 recurse=yes state=directory 92 | ignore_errors: yes 93 | 94 | - name: fetch application 95 | become_user: deploy 96 | git: repo=git@github.com:kjam/data-pipelines-course.git dest=/var/www/pipelines key_file=~/.ssh/deploy accept_hostkey=yes force=yes 97 | 98 | - name: install requirements 99 | pip: requirements=/var/www/pipelines/requirements.txt virtualenv=/home/deploy/venv 100 | environment: 101 | HDF5_DIR: /usr/lib/x86_64-linux-gnu/hdf5/serial/ 102 | 103 | - name: make notebooks directory 104 | file: path=/var/www/pipelines/notebooks state=directory owner=deploy mode=0755 recurse=yes 105 | 106 | - name: make notebooks cookiefile 107 | file: path=/var/www/pipelines/notebooks/jupyterhub_cookie_secret mode=0600 state=touch 108 | 109 | - name: make celery config directory 110 | file: path=/var/www/pipelines/celery_app/config state=directory owner=deploy mode=0755 recurse=yes 111 | 112 | - name: copy config to celery dir 113 | copy: src=../celery_app/config/prod.cfg dest=/var/www/pipelines/celery_app/config 114 | 115 | - name: copy config to nb dir 116 | copy: src=templates/jupyterhub_config.py dest=/var/www/pipelines/notebooks 117 | 118 | - name: make log dir 119 | file: path=/var/log/celery state=directory owner=deploy mode=0755 recurse=yes 120 | 121 | - name: rabbitmq add vhost 122 | rabbitmq_vhost: name=celery_vhost state=present 123 | 124 | - name: add rabbitmq user 125 | rabbitmq_user: user=celery_user password={{ rabbitmq_pass }} vhost=celery_vhost configure_priv=.* read_priv=.* write_priv=.* tags=administrator state=present 126 | 127 | - name: enabling rabbitmq management 128 | rabbitmq_plugin: names=rabbitmq_management state=enabled 129 | notify: 130 | - restart rabbitmq 131 | 132 | - name: change sysd perms 133 | file: path=/var/www/pipelines/deploy mode=0644 recurse=yes 134 | 135 | - name: link flower systemd file 136 | file: src=/var/www/pipelines/deploy/flower_service dest=/etc/systemd/system/flower.service state=link 137 | 138 | - name: link celery systemd file 139 | file: src=/var/www/pipelines/deploy/celery_service dest=/etc/systemd/system/celery.service state=link 140 | 141 | - name: link celerybeat systemd file 142 | file: src=/var/www/pipelines/deploy/celerybeat_service dest=/etc/systemd/system/celerybeat.service state=link 143 | 144 | - name: link jupyterhub systemd file 145 | file: src=/var/www/pipelines/deploy/jupyterhub_service dest=/etc/systemd/system/jupyterhub.service state=link 146 | 147 | #- name: link jupyter systemd file 148 | # file: src=/var/www/pipelines/deploy/jupyter_service dest=/etc/systemd/system/jupyterhub.service state=link 149 | 150 | - name: reload systemd 151 | shell: systemctl daemon-reload && systemctl restart jupyterhub 152 | 153 | - name: start flower & celery & celerybeat 154 
| shell: systemctl restart celery && systemctl restart flower && systemctl restart celerybeat 155 | 156 | handlers: 157 | - name: restart rabbitmq 158 | service: name=rabbitmq-server state=restarted 159 | 160 | - name: restart ssh 161 | service: name=ssh state=restarted 162 | -------------------------------------------------------------------------------- /deploy/pipelines_variables.yml: -------------------------------------------------------------------------------- 1 | deploy_url: YOUR_DOMAIN_HERE 2 | deploy_pass: PUT_SHA1_PW_HERE 3 | deploy_email: PUT_EMAIL_HERE 4 | rabbitmq_pass: PUT_PLAINTEXT_PASS_HERE 5 | -------------------------------------------------------------------------------- /deploy/templates/jupyterhub_config.py: -------------------------------------------------------------------------------- 1 | # Configuration file for jupyterhub. 2 | 3 | #------------------------------------------------------------------------------ 4 | # Configurable configuration 5 | #------------------------------------------------------------------------------ 6 | 7 | #------------------------------------------------------------------------------ 8 | # LoggingConfigurable configuration 9 | #------------------------------------------------------------------------------ 10 | 11 | # A parent class for Configurables that log. 12 | # 13 | # Subclasses have a log trait, and the default behavior is to get the logger 14 | # from the currently running Application. 15 | 16 | #------------------------------------------------------------------------------ 17 | # SingletonConfigurable configuration 18 | #------------------------------------------------------------------------------ 19 | 20 | # A configurable that only allows one instance. 21 | # 22 | # This class is for classes that should only have one instance of itself or 23 | # *any* subclass. To create and retrieve such a class use the 24 | # :meth:`SingletonConfigurable.instance` method. 25 | 26 | #------------------------------------------------------------------------------ 27 | # Application configuration 28 | #------------------------------------------------------------------------------ 29 | 30 | # This is an application. 31 | 32 | # The date format used by logging formatters for %(asctime)s 33 | c.Application.log_datefmt = '%Y-%m-%d %H:%M:%S' 34 | 35 | # The Logging format template 36 | c.Application.log_format = '[%(name)s]%(highlevel)s %(message)s' 37 | 38 | # Set the log level by value or name. 39 | c.Application.log_level = 30 40 | 41 | #------------------------------------------------------------------------------ 42 | # JupyterHub configuration 43 | #------------------------------------------------------------------------------ 44 | 45 | # An Application for starting a Multi-User Jupyter Notebook server. 46 | 47 | # Grant admin users permission to access single-user servers. 48 | # 49 | # Users should be properly informed if this is enabled. 50 | # c.JupyterHub.admin_access = False 51 | 52 | # DEPRECATED, use Authenticator.admin_users instead. 53 | # c.JupyterHub.admin_users = set() 54 | 55 | # Answer yes to any questions (e.g. confirm overwrite) 56 | #c.JupyterHub.answer_yes = False 57 | 58 | # Dict of token:username to be loaded into the database. 59 | # 60 | # Allows ahead-of-time generation of API tokens for use by services. 61 | # c.JupyterHub.api_tokens = {} 62 | 63 | # Class for authenticating users. 
64 | # 65 | # This should be a class with the following form: 66 | # 67 | # - constructor takes one kwarg: `config`, the IPython config object. 68 | # 69 | # - is a tornado.gen.coroutine 70 | # - returns username on success, None on failure 71 | # - takes two arguments: (handler, data), 72 | # where `handler` is the calling web.RequestHandler, 73 | # and `data` is the POST form data from the login page. 74 | c.JupyterHub.authenticator_class = 'jupyterhub.auth.PAMAuthenticator' 75 | 76 | # The base URL of the entire application 77 | c.JupyterHub.base_url = '/' 78 | 79 | # Whether to shutdown the proxy when the Hub shuts down. 80 | # 81 | # Disable if you want to be able to teardown the Hub while leaving the proxy 82 | # running. 83 | # 84 | # Only valid if the proxy was starting by the Hub process. 85 | # 86 | # If both this and cleanup_servers are False, sending SIGINT to the Hub will 87 | # only shutdown the Hub, leaving everything else running. 88 | # 89 | # The Hub should be able to resume from database state. 90 | c.JupyterHub.cleanup_proxy = True 91 | 92 | # Whether to shutdown single-user servers when the Hub shuts down. 93 | # 94 | # Disable if you want to be able to teardown the Hub while leaving the single- 95 | # user servers running. 96 | # 97 | # If both this and cleanup_proxy are False, sending SIGINT to the Hub will only 98 | # shutdown the Hub, leaving everything else running. 99 | # 100 | # The Hub should be able to resume from database state. 101 | c.JupyterHub.cleanup_servers = True 102 | 103 | # The config file to load 104 | #c.JupyterHub.config_file = 'jupyterhub_config.py' 105 | 106 | # Confirm that JupyterHub should be run without SSL. This is **NOT RECOMMENDED** 107 | # unless SSL termination is being handled by another layer. 108 | # c.JupyterHub.confirm_no_ssl = False 109 | 110 | # Number of days for a login cookie to be valid. Default is two weeks. 111 | # c.JupyterHub.cookie_max_age_days = 14 112 | 113 | # The cookie secret to use to encrypt cookies. 114 | # 115 | # Loaded from the JPY_COOKIE_SECRET env variable by default. 116 | # c.JupyterHub.cookie_secret = b'' 117 | 118 | # File in which to store the cookie secret. 119 | # c.JupyterHub.cookie_secret_file = 'jupyterhub_cookie_secret' 120 | 121 | # The location of jupyterhub data files (e.g. /usr/local/share/jupyter/hub) 122 | # c.JupyterHub.data_files_path = '/share/jupyter/hub' 123 | 124 | # Include any kwargs to pass to the database connection. See 125 | # sqlalchemy.create_engine for details. 126 | # c.JupyterHub.db_kwargs = {} 127 | 128 | # url for the database. e.g. `sqlite:///jupyterhub.sqlite` 129 | c.JupyterHub.db_url = 'sqlite:///jupyterhub.sqlite' 130 | 131 | # log all database transactions. This has A LOT of output 132 | # c.JupyterHub.debug_db = False 133 | 134 | # show debug output in configurable-http-proxy 135 | c.JupyterHub.debug_proxy = True 136 | 137 | # Send JupyterHub's logs to this file. 138 | # 139 | # This will *only* include the logs of the Hub itself, not the logs of the proxy 140 | # or any single-user servers. 141 | # c.JupyterHub.extra_log_file = '' 142 | 143 | # Extra log handlers to set on JupyterHub logger 144 | # c.JupyterHub.extra_log_handlers = [] 145 | 146 | # Generate default config file 147 | # c.JupyterHub.generate_config = False 148 | 149 | # The ip for this process 150 | c.JupyterHub.hub_ip = '127.0.0.1' 151 | 152 | # The port for this process 153 | c.JupyterHub.hub_port = 8081 154 | 155 | # The prefix for the hub server. 
Must not be '/' 156 | # c.JupyterHub.hub_prefix = '/hub/' 157 | 158 | # The public facing ip of the whole application (the proxy) 159 | c.JupyterHub.ip = '144.76.180.19' 160 | 161 | # Supply extra arguments that will be passed to Jinja environment. 162 | # c.JupyterHub.jinja_environment_options = {} 163 | 164 | # Interval (in seconds) at which to update last-activity timestamps. 165 | # c.JupyterHub.last_activity_interval = 300 166 | 167 | # Specify path to a logo image to override the Jupyter logo in the banner. 168 | # c.JupyterHub.logo_file = '' 169 | 170 | # File to write PID Useful for daemonizing jupyterhub. 171 | c.JupyterHub.pid_file = '/var/www/pipelines/jupyter.pid' 172 | 173 | # The public facing port of the proxy 174 | c.JupyterHub.port = 443 175 | 176 | # The ip for the proxy API handlers 177 | #c.JupyterHub.proxy_api_ip = '127.0.0.1' 178 | 179 | # The port for the proxy API handlers 180 | #c.JupyterHub.proxy_api_port = 0 181 | 182 | # The Proxy Auth token. 183 | # 184 | # Loaded from the CONFIGPROXY_AUTH_TOKEN env variable by default. 185 | # c.JupyterHub.proxy_auth_token = '' 186 | 187 | # Interval (in seconds) at which to check if the proxy is running. 188 | # c.JupyterHub.proxy_check_interval = 30 189 | 190 | # The command to start the http proxy. 191 | # 192 | # Only override if configurable-http-proxy is not on your PATH 193 | # c.JupyterHub.proxy_cmd = ['configurable-http-proxy'] 194 | 195 | # Purge and reset the database. 196 | c.JupyterHub.reset_db = False 197 | 198 | # The class to use for spawning single-user servers. 199 | # 200 | # Should be a subclass of Spawner. 201 | #c.JupyterHub.spawner_class = 'jupyterhub.spawner.LocalProcessSpawner' 202 | 203 | # Path to SSL certificate file for the public facing interface of the proxy 204 | # 205 | # Use with ssl_key 206 | c.JupyterHub.ssl_cert = '/etc/letsencrypt/live/pipelines.kjamistan.com/fullchain.pem' 207 | 208 | # Path to SSL key file for the public facing interface of the proxy 209 | # 210 | # Use with ssl_cert 211 | c.JupyterHub.ssl_key = '/etc/letsencrypt/live/pipelines.kjamistan.com/privkey.pem' 212 | 213 | # Host to send statds metrics to 214 | # c.JupyterHub.statsd_host = '' 215 | 216 | # Port on which to send statsd metrics about the hub 217 | # c.JupyterHub.statsd_port = 8125 218 | 219 | # Prefix to use for all metrics sent by jupyterhub to statsd 220 | # c.JupyterHub.statsd_prefix = 'jupyterhub' 221 | 222 | # Run single-user servers on subdomains of this host. 223 | # 224 | # This should be the full https://hub.domain.tld[:port] 225 | # 226 | # Provides additional cross-site protections for javascript served by single- 227 | # user servers. 228 | # 229 | # Requires .hub.domain.tld to resolve to the same host as 230 | # hub.domain.tld. 231 | # 232 | # In general, this is most easily achieved with wildcard DNS. 233 | # 234 | # When using SSL (i.e. always) this also requires a wildcard SSL certificate. 235 | # c.JupyterHub.subdomain_host = '' 236 | 237 | # Paths to search for jinja templates. 238 | # c.JupyterHub.template_paths = [] 239 | 240 | # Extra settings overrides to pass to the tornado application. 241 | # c.JupyterHub.tornado_settings = {} 242 | 243 | #------------------------------------------------------------------------------ 244 | # Spawner configuration 245 | #------------------------------------------------------------------------------ 246 | 247 | # Base class for spawning single-user notebook servers. 
248 | # 249 | # Subclass this, and override the following methods: 250 | # 251 | # - load_state - get_state - start - stop - poll 252 | 253 | # Extra arguments to be passed to the single-user server 254 | # c.Spawner.args = [] 255 | 256 | # The command used for starting notebooks. 257 | c.Spawner.cmd = ['/home/deploy/venv/bin/jupyterhub-singleuser'] 258 | 259 | # Enable debug-logging of the single-user server 260 | c.Spawner.debug = True 261 | 262 | # The default URL for the single-user server. 263 | # 264 | # Can be used in conjunction with --notebook-dir=/ to enable full filesystem 265 | # traversal, while preserving user's homedir as landing page for notebook 266 | # 267 | # `%U` will be expanded to the user's username 268 | # c.Spawner.default_url = '' 269 | 270 | # Disable per-user configuration of single-user servers. 271 | # 272 | # This prevents any config in users' $HOME directories from having an effect on 273 | # their server. 274 | # c.Spawner.disable_user_config = False 275 | 276 | # Whitelist of environment variables for the subprocess to inherit 277 | c.Spawner.env_keep = ['PATH', 'PYTHONPATH', 'CONDA_ROOT', 278 | 'CONDA_DEFAULT_ENV', 'DEPLOY', 'VIRTUAL_ENV', 279 | 'LANG', 'LC_ALL', ] 280 | 281 | # Environment variables to load for the Spawner. 282 | # 283 | # Value could be a string or a callable. If it is a callable, it will be called 284 | # with one parameter, which will be the instance of the spawner in use. It 285 | # should quickly (without doing much blocking operations) return a string that 286 | # will be used as the value for the environment variable. 287 | #c.Spawner.environment = {'VIRTUAL_ENV': '/home/deploy/venv', 288 | # 'PYTHONPATH': '/home/deploy/venv/bin/python'} 289 | 290 | # Timeout (in seconds) before giving up on a spawned HTTP server 291 | # 292 | # Once a server has successfully been spawned, this is the amount of time we 293 | # wait before assuming that the server is unable to accept connections. 294 | # c.Spawner.http_timeout = 30 295 | 296 | # The IP address (or hostname) the single-user server should listen on 297 | c.Spawner.ip = '127.0.0.1' 298 | 299 | # The notebook directory for the single-user server 300 | # 301 | # `~` will be expanded to the user's home directory `%U` will be expanded to the 302 | # user's username 303 | #c.Spawner.notebook_dir = '~/notebooks' 304 | 305 | # An HTML form for options a user can specify on launching their server. The 306 | # surrounding `
<form>` element and the submit button are already provided.
307 | #
308 | # For example:
309 | #
310 | # Set your key:
311 | # <input name="keyphrase" val="%(keyphrase)s">
312 | #
313 | # Choose a letter: 314 | # 318 | # c.Spawner.options_form = '' 319 | 320 | # Interval (in seconds) on which to poll the spawner. 321 | # c.Spawner.poll_interval = 30 322 | 323 | # Timeout (in seconds) before giving up on the spawner. 324 | # 325 | # This is the timeout for start to return, not the timeout for the server to 326 | # respond. Callers of spawner.start will assume that startup has failed if it 327 | # takes longer than this. start should return when the server process is started 328 | # and its location is known. 329 | # c.Spawner.start_timeout = 60 330 | 331 | #------------------------------------------------------------------------------ 332 | # LocalProcessSpawner configuration 333 | #------------------------------------------------------------------------------ 334 | 335 | # A Spawner that just uses Popen to start local processes as users. 336 | # 337 | # Requires users to exist on the local system. 338 | # 339 | # This is the default spawner for JupyterHub. 340 | 341 | # Seconds to wait for process to halt after SIGINT before proceeding to SIGTERM 342 | # c.LocalProcessSpawner.INTERRUPT_TIMEOUT = 10 343 | 344 | # Seconds to wait for process to halt after SIGKILL before giving up 345 | # c.LocalProcessSpawner.KILL_TIMEOUT = 5 346 | 347 | # Seconds to wait for process to halt after SIGTERM before proceeding to SIGKILL 348 | # c.LocalProcessSpawner.TERM_TIMEOUT = 5 349 | 350 | #------------------------------------------------------------------------------ 351 | # Authenticator configuration 352 | #------------------------------------------------------------------------------ 353 | 354 | # A class for authentication. 355 | # 356 | # The primary API is one method, `authenticate`, a tornado coroutine for 357 | # authenticating users. 358 | 359 | # set of usernames of admin users 360 | # 361 | # If unspecified, only the user that launches the server will be admin. 362 | c.Authenticator.admin_users = set(['deploy']) 363 | 364 | # Dictionary mapping authenticator usernames to JupyterHub users. 365 | # 366 | # Can be used to map OAuth service names to local users, for instance. 367 | # 368 | # Used in normalize_username. 369 | # c.Authenticator.username_map = {} 370 | 371 | # Regular expression pattern for validating usernames. 372 | # 373 | # If not defined: allow any username. 374 | # c.Authenticator.username_pattern = '' 375 | 376 | # Username whitelist. 377 | # 378 | # Use this to restrict which users can login. If empty, allow any user to 379 | # attempt login. 380 | # c.Authenticator.whitelist = set() 381 | 382 | #------------------------------------------------------------------------------ 383 | # LocalAuthenticator configuration 384 | #------------------------------------------------------------------------------ 385 | 386 | # Base class for Authenticators that work with local Linux/UNIX users 387 | # 388 | # Checks for local users, and can attempt to create them if they exist. 389 | 390 | # The command to use for creating users as a list of strings. 391 | # 392 | # For each element in the list, the string USERNAME will be replaced with the 393 | # user's username. The username will also be appended as the final argument. 
394 | # 395 | # For Linux, the default value is: 396 | # 397 | # ['adduser', '-q', '--gecos', '""', '--disabled-password'] 398 | # 399 | # To specify a custom home directory, set this to: 400 | # 401 | # ['adduser', '-q', '--gecos', '""', '--home', '/customhome/USERNAME', 402 | # '--disabled-password'] 403 | # 404 | # This will run the command: 405 | # 406 | # adduser -q --gecos "" --home /customhome/river --disabled-password river 407 | # 408 | # when the user 'river' is created. 409 | # c.LocalAuthenticator.add_user_cmd = [] 410 | 411 | # If a user is added that doesn't exist on the system, should I try to create 412 | # the system user? 413 | # c.LocalAuthenticator.create_system_users = False 414 | 415 | # Automatically whitelist anyone in this group. 416 | # c.LocalAuthenticator.group_whitelist = set() 417 | 418 | #------------------------------------------------------------------------------ 419 | # PAMAuthenticator configuration 420 | #------------------------------------------------------------------------------ 421 | 422 | # Authenticate local Linux/UNIX users with PAM 423 | 424 | # The encoding to use for PAM 425 | # c.PAMAuthenticator.encoding = 'utf8' 426 | 427 | # Whether to open PAM sessions when spawners are started. 428 | # 429 | # This may trigger things like mounting shared filsystems, loading credentials, 430 | # etc. depending on system configuration, but it does not always work. 431 | # 432 | # It can be disabled with:: 433 | # 434 | # c.PAMAuthenticator.open_sessions = False 435 | # c.PAMAuthenticator.open_sessions = True 436 | 437 | # The PAM service to use for authentication. 438 | #c.PAMAuthenticator.service = 'login' 439 | -------------------------------------------------------------------------------- /deploy/templates/sshd_config: -------------------------------------------------------------------------------- 1 | # Package generated configuration file 2 | # See the sshd_config(5) manpage for details 3 | 4 | # What ports, IPs and protocols we listen for 5 | Port 22 6 | # Use these options to restrict which interfaces/protocols sshd will bind to 7 | #ListenAddress :: 8 | #ListenAddress 0.0.0.0 9 | Protocol 2 10 | # HostKeys for protocol version 2 11 | HostKey /etc/ssh/ssh_host_rsa_key 12 | HostKey /etc/ssh/ssh_host_dsa_key 13 | HostKey /etc/ssh/ssh_host_ecdsa_key 14 | HostKey /etc/ssh/ssh_host_ed25519_key 15 | #Privilege Separation is turned on for security 16 | UsePrivilegeSeparation yes 17 | 18 | # Lifetime and size of ephemeral version 1 server key 19 | KeyRegenerationInterval 3600 20 | ServerKeyBits 1024 21 | 22 | # Logging 23 | SyslogFacility AUTH 24 | LogLevel INFO 25 | 26 | # Authentication: 27 | LoginGraceTime 120 28 | PermitRootLogin no 29 | StrictModes yes 30 | 31 | RSAAuthentication yes 32 | PubkeyAuthentication yes 33 | #AuthorizedKeysFile %h/.ssh/authorized_keys 34 | 35 | # Don't read the user's ~/.rhosts and ~/.shosts files 36 | IgnoreRhosts yes 37 | # For this to work you will also need host keys in /etc/ssh_known_hosts 38 | RhostsRSAAuthentication no 39 | # similar for protocol version 2 40 | HostbasedAuthentication no 41 | # Uncomment if you don't trust ~/.ssh/known_hosts for RhostsRSAAuthentication 42 | #IgnoreUserKnownHosts yes 43 | 44 | # To enable empty passwords, change to yes (NOT RECOMMENDED) 45 | PermitEmptyPasswords no 46 | 47 | # Change to yes to enable challenge-response passwords (beware issues with 48 | # some PAM modules and threads) 49 | ChallengeResponseAuthentication no 50 | 51 | # Change to no to disable tunnelled clear text 
passwords 52 | PasswordAuthentication no 53 | 54 | # Kerberos options 55 | #KerberosAuthentication no 56 | #KerberosGetAFSToken no 57 | #KerberosOrLocalPasswd yes 58 | #KerberosTicketCleanup yes 59 | 60 | # GSSAPI options 61 | #GSSAPIAuthentication no 62 | #GSSAPICleanupCredentials yes 63 | 64 | X11Forwarding yes 65 | X11DisplayOffset 10 66 | PrintMotd no 67 | PrintLastLog yes 68 | TCPKeepAlive yes 69 | #UseLogin no 70 | 71 | #MaxStartups 10:30:60 72 | #Banner /etc/issue.net 73 | 74 | # Allow client to pass locale environment variables 75 | AcceptEnv LANG LC_* 76 | 77 | Subsystem sftp /usr/lib/openssh/sftp-server 78 | 79 | # Set this to 'yes' to enable PAM authentication, account processing, 80 | # and session processing. If this is enabled, PAM authentication will 81 | # be allowed through the ChallengeResponseAuthentication and 82 | # PasswordAuthentication. Depending on your PAM configuration, 83 | # PAM authentication via ChallengeResponseAuthentication may bypass 84 | # the setting of "PermitRootLogin without-password". 85 | # If you just want the PAM account and session checks to run without 86 | # PAM authentication, then enable this but set PasswordAuthentication 87 | # and ChallengeResponseAuthentication to 'no'. 88 | UsePAM yes 89 | -------------------------------------------------------------------------------- /example_prod.cfg: -------------------------------------------------------------------------------- 1 | [openweather] 2 | api_key=425b9b9e2416cjfr47329434jk2lX4u32 3 | 4 | [twitter] 5 | consumer_key = CIuYfkdFw8392kdfHuioj 6 | consumer_secret = 4QiJw1wkd902eklfjs920skcSwikFpkl3289 7 | access_token = 15632343-qaMfjk1ri8eklclfiFisoTwjneio48930 8 | access_token_secret = FAifw894jk3l24h543ljfs89hC9fhjFhkjrel3784 9 | 10 | [google] 11 | api_key=AI16cjfr47329434jk2lX4u32 12 | -------------------------------------------------------------------------------- /luigi/luigi.cfg: -------------------------------------------------------------------------------- 1 | [worker] 2 | keep_alive=True 3 | task_limit=10 4 | 5 | [scheduler] 6 | retry_count=4 7 | record_task_history=True 8 | 9 | [task_history] 10 | db_connection=sqlite:///tasks.db 11 | 12 | [hadoop] 13 | client=hadoopcli 14 | streaming-jar=/usr/local/lib/hadoop-2.7.2/share/hadoop/tools/lib/hadoop-streaming-2.7.2.jar 15 | python-executable=/usr/bin/python3 16 | jar=/usr/local/lib/hadoop-2.7.2/share/hadoop/tools/lib/hadoop-streaming-2.7.2.jar 17 | tmp_dir=/tmp 18 | 19 | [hdfs] 20 | tmp_dir=/tmp 21 | 22 | -------------------------------------------------------------------------------- /luigi/taxi_data_import.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | from luigi.contrib import sqla 3 | from luigi.mock import MockFile 4 | from googleplaces import GooglePlaces 5 | from sqlalchemy import Float, DateTime, Integer, String 6 | import csv 7 | import logging 8 | import luigi 9 | import os 10 | import requests 11 | import shutil 12 | 13 | 14 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, '../../config/prod.cfg')) 15 | 16 | class DownloadTaxiUrls(luigi.Task): 17 | """ Download NYC Taxi Data for our use. 
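Filters the published list of raw data URLs down to the requested
cab type, year, and months, then writes the matching URLs to a
local target file.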
""" 18 | year = luigi.IntParameter(default=2016) 19 | months = luigi.ListParameter(default=[6,7,8]) 20 | url_list = luigi.Parameter(default='https://raw.githubusercontent.com/toddwschneider/nyc-taxi-data/master/raw_data_urls.txt') 21 | cab_type = luigi.Parameter(default='yellow') 22 | 23 | def run(self): 24 | resp = requests.get(self.url_list) 25 | urls = [] 26 | possible_strs = ['{}-{:02d}'.format(self.year, m) for m in self.months] 27 | for line in resp.iter_lines(): 28 | if self.cab_type in str(line): 29 | for datestr in possible_strs: 30 | if datestr in str(line): 31 | urls.append(line) 32 | break 33 | with self.output().open('w') as url_file: 34 | for url in urls: 35 | print(url.decode(), file=url_file) 36 | 37 | def output(self): 38 | return luigi.LocalTarget('/tmp/taxi_data/urls.txt') 39 | 40 | 41 | class DownloadTaxiData(luigi.Task): 42 | """ Downloading each file of taxi data for each url from the repo. """ 43 | def requires(self): 44 | return DownloadTaxiUrls() 45 | 46 | def input(self): 47 | return luigi.LocalTarget('/tmp/taxi_data/urls.txt') 48 | 49 | def run(self): 50 | for url in self.input().open('r'): 51 | yield DownloadTaxiFile(url.rstrip('\n')) 52 | 53 | def output(self): 54 | files = [url.rstrip('\n').split('/')[-1] 55 | for url in self.input().open('r')] 56 | return [luigi.LocalTarget('/tmp/taxi_data/{}'.format(file_name)) 57 | for file_name in files] 58 | 59 | 60 | class DownloadTaxiFile(luigi.Task): 61 | """ Download each file, and save it locally to /tmp/taxi_data """ 62 | url = luigi.Parameter() 63 | 64 | def requires(self): 65 | return DownloadTaxiUrls() 66 | 67 | def run(self): 68 | file_name = self.url.split('/')[-1] 69 | resp = requests.get(str(self.url), stream=True) 70 | with open(self.output().path, 'wb') as taxi_file: 71 | shutil.copyfileobj(resp.raw, taxi_file) 72 | 73 | def output(self): 74 | file_name = self.url.split('/')[-1] 75 | return luigi.LocalTarget('/tmp/taxi_data/{}'.format(file_name)) 76 | 77 | 78 | class AddTaxiLocations(luigi.Task): 79 | """ Import the files and add the locations using Google Reverse Search. """ 80 | dir_name = luigi.Parameter(default='/tmp/taxi_data/') 81 | 82 | def requires(self): 83 | return DownloadTaxiData() 84 | 85 | def input(self): 86 | return [luigi.LocalTarget('/tmp/taxi_data/{}'.format(fn)) for fn 87 | in os.listdir(self.dir_name) if fn.endswith('csv')] 88 | 89 | def run(self): 90 | for fn in self.input(): 91 | rdr = csv.DictReader(fn.open('r')) 92 | for line in rdr: 93 | yield AddTaxiLocation(line) 94 | 95 | 96 | class AddTaxiLocation(luigi.Task): 97 | """ Search for pickup and dropoff location and add them via Google API. 98 | NOTE: it appears the names and mappings change over time, you will 99 | need to adapt the code for different years. I've included columns for 100 | 2009 and 2016 here. Feel free to expand this and send PR if you'd 101 | like to share! 
102 | """ 103 | line = luigi.DictParameter() 104 | 105 | columns_2009 = ['vendor_name', 106 | 'Rate_Code', 'surcharge', 'store_and_forward', 107 | 'mta_tax', 'Total_Amt', 'Fare_Amt', 'Tolls_Amt', 'Tip_Amt', 108 | 'Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime', 109 | 'Passenger_Count', 'Payment_Type', 'Trip_Distance', 110 | 'Start_Lon', 'Start_Lat', 'End_Lon', 'End_Lat', 111 | 'pickup_location_name', 'pickup_location_phone', 112 | 'pickup_location_addy', 'pickup_location_web', 113 | 'dropoff_location_name', 'dropoff_location_phone', 114 | 'dropoff_location_addy', 'dropoff_location_web'] 115 | 116 | columns = ['VendorID', 'RatecodeID', 'improvement_surcharge', 117 | 'store_and_fwd_flag', 'mta_tax', 'total_amount', 118 | 'fare_amount', 'extra', 'tip_amount', 119 | 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 120 | 'passenger_count', 'payment_type', 'trip_distance', 121 | 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 122 | 'dropoff_latitude', 'pickup_location_name', 123 | 'pickup_location_phone', 'pickup_location_addy', 124 | 'pickup_location_web', 'dropoff_location_name', 125 | 'dropoff_location_phone', 'dropoff_location_addy', 126 | 'dropoff_location_web'] 127 | 128 | 129 | def add_addy_info(self, res, loc_type): 130 | if len(res.places): 131 | place = res.places[0] 132 | place.get_details() 133 | self.line['{}_location_name'.format(loc_type)] = place.name 134 | self.line['{}_location_phone'.format(loc_type)] = place.local_phone_number 135 | self.line['{}_location_addy'.format(loc_type)] = place.vicinity 136 | self.line['{}_location_web'.format(loc_type)] = place.website 137 | 138 | def run(self): 139 | self.line = dict((k, v) for k,v in self.line.items()) 140 | config = ConfigParser() 141 | config.read(CONFIG_FILE) 142 | client = GooglePlaces(config.get('google', 'api_key')) 143 | if len(set(self.line.keys()) - set(self.columns)) > 2: 144 | self.columns = self.columns_2009 145 | res = client.nearby_search(lat_lng={'lat': self.line[self.columns[15]], 146 | 'lng': self.line[self.columns[14]]}) 147 | self.add_addy_info(res, 'pickup') 148 | res = client.nearby_search(lat_lng={'lat': self.line[self.columns[17]], 149 | 'lng': self.line[self.columns[16]]}) 150 | self.add_addy_info(res, 'dropoff') 151 | with self.output().open('w') as line_output: 152 | line_with_tabs = '\t'.join([self.line.get(key) if self.line.get(key) 153 | else '' for key in self.columns]) 154 | line_output.write(line_with_tabs) 155 | 156 | def output(self): 157 | return MockFile("AddTaxiLocation") 158 | 159 | 160 | class SaveTaxiRow(sqla.CopyToTable): 161 | """ Save each taxi line with the location information to the database """ 162 | connection_string = 'sqlite:///taxi_db.db' 163 | table = 'taxi_rides' 164 | columns = [ 165 | (['vendor_name', String(10)], {}), 166 | (['rate_code', String(4)], {}), 167 | (['surcharge', Float()], {}), 168 | (['store_and_forward', Integer()], {}), 169 | (['mta_tax', Float()], {}), 170 | (['total_amt', Float()], {}), 171 | (['fare_amt', Float()], {}), 172 | (['tolls_amt', Float()], {}), 173 | (['tip_amt', Float()], {}), 174 | (['pickup_datetime', DateTime()], {}), 175 | (['dropoff_datetime', DateTime()], {}), 176 | (['passenger_count', Integer()], {}), 177 | (['payment_type', String(100)], {}), 178 | (['trip_distance', Float()], {}), 179 | (['pickup_longitude', Float()], {}), 180 | (['pickup_latitude', Float()], {}), 181 | (['dropoff_longitude', Float()], {}), 182 | (['dropoff_latitude', Float()], {}), 183 | (['pickup_location_name', String(128)], {}), 184 | 
(['pickup_location_phone', String(64)], {}), 185 | (['pickup_location_addy', String(256)], {}), 186 | (['pickup_location_web', String(64)], {}), 187 | (['dropoff_location_name', String(128)], {}), 188 | (['dropoff_location_phone', String(64)], {}), 189 | (['dropoff_location_addy', String(256)], {}), 190 | (['dropoff_location_web', String(64)], {}), 191 | ] 192 | 193 | def requires(self): 194 | return AddTaxiLocation() 195 | -------------------------------------------------------------------------------- /luigi/wordcount_map_reduce.py: -------------------------------------------------------------------------------- 1 | """ Simple wordcount using map reduce """ 2 | import luigi 3 | import luigi.contrib.hadoop 4 | import luigi.contrib.hdfs 5 | import json 6 | import os 7 | 8 | class ProcessChatLogs(luigi.Task): 9 | file_name = luigi.Parameter() 10 | 11 | def input(self): 12 | return luigi.contrib.hdfs.HdfsTarget(self.file_name) 13 | 14 | def run(self): 15 | with self.output().open('w') as output_file: 16 | for msg_dict in json.load(self.input().open('r')): 17 | output_file.write(msg_dict.get('message') + '\n') 18 | 19 | def output(self): 20 | return luigi.contrib.hdfs.HdfsTarget( 21 | self.file_name.replace('.json', '_messages_only.txt')) 22 | 23 | 24 | class ChatWordCount(luigi.contrib.hadoop.JobTask): 25 | file_name = luigi.Parameter() 26 | 27 | def output(self): 28 | file_dir = os.path.dirname(self.file_name) 29 | new_file_name = 'wordcount_{}'.format( 30 | os.path.basename(self.file_name).replace('.json', '.txt')) 31 | return luigi.contrib.hdfs.HdfsTarget( 32 | os.path.join(file_dir, new_file_name)) 33 | 34 | def mapper(self, line): 35 | for word in line.strip().split(): 36 | yield word, 1 37 | 38 | def reducer(self, key, values): 39 | yield key, sum(values) 40 | 41 | def requires(self): 42 | return ProcessChatLogs(self.file_name) 43 | -------------------------------------------------------------------------------- /notebooks/Chapter 3 - Basic Celery Tasks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "sys.path.append('/var/www/pipelines/celery_app')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 4, 28 | "metadata": { 29 | "collapsed": false 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from tasks import get_stock_info" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 5, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from datetime import datetime" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 9, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "res = get_stock_info.delay('FB', datetime(2016,1,1), datetime.today(), source='yahoo')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 10, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "" 69 | ] 70 | }, 71 | "execution_count": 10, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "res" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 11, 83 
| "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "'SUCCESS'" 91 | ] 92 | }, 93 | "execution_count": 11, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "res.status" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 12, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "'{\"Adj Close min\":{\"FB\":94.160004},\"Adj Close max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995},\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Close min\":{\"FB\":94.160004},\"Close max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005}}'" 113 | ] 114 | }, 115 | "execution_count": 12, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "res.get()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "res = get_stock_info.delay('XFJKLSFD', datetime(2016,1,1), datetime.today(), source='yahoo')\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "'FAILURE'" 146 | ] 147 | }, 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "res.status" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "ename": "RemoteDataError", 166 | "evalue": "Unable to read URL: http://ichart.finance.yahoo.com/table.csv", 167 | "output_type": "error", 168 | "traceback": [ 169 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 170 | "\u001b[0;31mRemoteDataError\u001b[0m Traceback (most recent call last)", 171 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 172 | "\u001b[0;32m/home/deploy/venv/lib/python3.4/site-packages/celery/result.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, timeout, propagate, interval, no_ack, follow_parents, EXCEPTION_STATES, PROPAGATE_STATES)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cache\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpropagate\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaybe_reraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 173 | "\u001b[0;32m/home/deploy/venv/lib/python3.4/site-packages/celery/result.py\u001b[0m in \u001b[0;36mmaybe_reraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 269\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmaybe_reraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 270\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstates\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPROPAGATE_STATES\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 271\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 272\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mbuild_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mintermediate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformatter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 174 | "\u001b[0;31mRemoteDataError\u001b[0m: Unable to read URL: http://ichart.finance.yahoo.com/table.csv" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "res.get()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 17, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "res = get_stock_info.apply_async(('GOOG', datetime(2016, 1, 1), datetime.today()), queue='priority') " 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 18, 196 | "metadata": { 197 | "collapsed": false 198 | }, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "" 204 | ] 205 | }, 206 | "execution_count": 18, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "res" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 19, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "'SUCCESS'" 226 | ] 227 | }, 228 | "execution_count": 19, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "res.status" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 20, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "'{\"Adj Close min\":{\"GOOG\":668.26001},\"Adj Close max\":{\"GOOG\":784.849976},\"Adj Close mean\":{\"GOOG\":724.18143845},\"Adj Close median\":{\"GOOG\":719.6299745},\"Close min\":{\"GOOG\":668.26001},\"Close max\":{\"GOOG\":784.849976},\"Close mean\":{\"GOOG\":724.18143845},\"Close median\":{\"GOOG\":719.6299745},\"High min\":{\"GOOG\":672.299988},\"High max\":{\"GOOG\":789.869995},\"High mean\":{\"GOOG\":730.2307738688},\"High median\":{\"GOOG\":725.828003},\"Low min\":{\"GOOG\":663.059998},\"Low max\":{\"GOOG\":782.969971},\"Low mean\":{\"GOOG\":717.8580696687},\"Low median\":{\"GOOG\":716.169983},\"Open min\":{\"GOOG\":667.849976},\"Open max\":{\"GOOG\":785.0},\"Open mean\":{\"GOOG\":724.2199360063},\"Open 
median\":{\"GOOG\":722.4649965}}'" 248 | ] 249 | }, 250 | "execution_count": 20, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "res.get()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.4.2" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 1 290 | } 291 | -------------------------------------------------------------------------------- /notebooks/Chapter 3 - Complex Task Chains.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "sys.path.append('/var/www/pipelines/celery_app')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from tasks import get_stock_info, price_range, determine_buy, sort_results" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from celery import chain, group, chord" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "from datetime import datetime" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "my_stocks = ['FB', 'GOOG', 'IBM']" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 7, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "start = datetime(2016,1,1)\n", 78 | "end = datetime.today()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Working with Chains" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "my_chain = chain(price_range.s('FB', start, end), determine_buy.s())" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 9, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "tasks.price_range('FB', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 9, 47, 10, 633098)) | tasks.determine_buy()" 110 | ] 111 | }, 112 | "execution_count": 9, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "my_chain" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 10, 124 | "metadata": { 125 | 
"collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "res = my_chain()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 11, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "" 143 | ] 144 | }, 145 | "execution_count": 11, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "res" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 12, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "'SUCCESS'" 165 | ] 166 | }, 167 | "execution_count": 12, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "res.state" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 13, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "False" 187 | ] 188 | }, 189 | "execution_count": 13, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "res.get()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Working with Groups" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 14, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "my_grp = group(get_stock_info.s(stk, start, end) for stk in my_stocks)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 15, 219 | "metadata": { 220 | "collapsed": false 221 | }, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "[tasks.get_stock_info('FB', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 9, 47, 10, 633098)), tasks.get_stock_info('GOOG', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 9, 47, 10, 633098)), tasks.get_stock_info('IBM', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 9, 47, 10, 633098))]" 227 | ] 228 | }, 229 | "execution_count": 15, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "my_grp" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 16, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "res = my_grp()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 17, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "" 260 | ] 261 | }, 262 | "execution_count": 17, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "res" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 18, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "['{\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995},\"Adj Close min\":{\"FB\":94.160004},\"Adj Close 
max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005},\"Close min\":{\"FB\":94.160004},\"Close max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005}}',\n", 282 | " '{\"Open min\":{\"GOOG\":667.849976},\"Open max\":{\"GOOG\":785.0},\"Open mean\":{\"GOOG\":724.2199360063},\"Open median\":{\"GOOG\":722.4649965},\"Low min\":{\"GOOG\":663.059998},\"Low max\":{\"GOOG\":782.969971},\"Low mean\":{\"GOOG\":717.8580696687},\"Low median\":{\"GOOG\":716.169983},\"Close min\":{\"GOOG\":668.26001},\"Close max\":{\"GOOG\":784.849976},\"Close mean\":{\"GOOG\":724.18143845},\"Close median\":{\"GOOG\":719.6299745},\"High min\":{\"GOOG\":672.299988},\"High max\":{\"GOOG\":789.869995},\"High mean\":{\"GOOG\":730.2307738688},\"High median\":{\"GOOG\":725.828003},\"Adj Close min\":{\"GOOG\":668.26001},\"Adj Close max\":{\"GOOG\":784.849976},\"Adj Close mean\":{\"GOOG\":724.18143845},\"Adj Close median\":{\"GOOG\":719.6299745}}',\n", 283 | " '{\"Open min\":{\"IBM\":118.459999},\"Open max\":{\"IBM\":163.190002},\"Open mean\":{\"IBM\":145.5231246875},\"Open median\":{\"IBM\":148.4449995},\"Low min\":{\"IBM\":116.900002},\"Low max\":{\"IBM\":162.179993},\"Low mean\":{\"IBM\":144.4778746625},\"Low median\":{\"IBM\":147.3549955},\"High min\":{\"IBM\":119.660004},\"High max\":{\"IBM\":164.949997},\"High mean\":{\"IBM\":146.7556872938},\"High median\":{\"IBM\":149.774994},\"Adj Close min\":{\"IBM\":115.72409},\"Adj Close max\":{\"IBM\":163.529999},\"Adj Close mean\":{\"IBM\":143.7099821812},\"Adj Close median\":{\"IBM\":146.422927},\"Close min\":{\"IBM\":117.849998},\"Close max\":{\"IBM\":163.529999},\"Close mean\":{\"IBM\":145.7649376375},\"Close median\":{\"IBM\":148.4800035}}']" 284 | ] 285 | }, 286 | "execution_count": 18, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "res.get()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### Working with Chords (redis backend only!)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 19, 305 | "metadata": { 306 | "collapsed": false 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "header = [price_range.subtask((stk, start, end)) for stk in my_stocks]" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 23, 316 | "metadata": { 317 | "collapsed": false 318 | }, 319 | "outputs": [ 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | "1 loop, best of 3: 849 ms per loop\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "%timeit sort_results(group(header)().get())" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 24, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "[{'percent_change': 9.2799999999999994,\n", 343 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 344 | " 'period_high': 125.260002,\n", 345 | " 'period_low': 94.160004000000001,\n", 346 | " 'period_mean': 112.86093739375001,\n", 347 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 348 | " 'result': 'higher',\n", 349 | " 'stock': 'FB',\n", 350 | " 'todays_price': 123.34},\n", 351 | " {'percent_change': 11.06,\n", 352 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 353 | " 'period_high': 163.529999,\n", 354 | " 'period_low': 115.72408999999999,\n", 355 | " 
'period_mean': 143.70998218125001,\n", 356 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 357 | " 'result': 'higher',\n", 358 | " 'stock': 'IBM',\n", 359 | " 'todays_price': 159.61000000000001},\n", 360 | " {'percent_change': 6.54,\n", 361 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 362 | " 'period_high': 784.84997599999997,\n", 363 | " 'period_low': 668.26000999999997,\n", 364 | " 'period_mean': 724.18143844999997,\n", 365 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 366 | " 'result': 'higher',\n", 367 | " 'stock': 'GOOG',\n", 368 | " 'todays_price': 771.55999999999995}]" 369 | ] 370 | }, 371 | "execution_count": 24, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "sort_results(group(header)().get())" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 25, 383 | "metadata": { 384 | "collapsed": false 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "callback = sort_results.subtask()" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 26, 394 | "metadata": { 395 | "collapsed": false 396 | }, 397 | "outputs": [ 398 | { 399 | "name": "stdout", 400 | "output_type": "stream", 401 | "text": [ 402 | "1 loop, best of 3: 1.16 s per loop\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "%timeit chord(header)(callback).get()" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 27, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "[{'percent_change': 9.3100000000000005,\n", 421 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 422 | " 'period_high': 125.260002,\n", 423 | " 'period_low': 94.160004000000001,\n", 424 | " 'period_mean': 112.86093739375001,\n", 425 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 426 | " 'result': 'higher',\n", 427 | " 'stock': 'FB',\n", 428 | " 'todays_price': 123.37},\n", 429 | " {'percent_change': 11.06,\n", 430 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 431 | " 'period_high': 163.529999,\n", 432 | " 'period_low': 115.72408999999999,\n", 433 | " 'period_mean': 143.70998218125001,\n", 434 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 435 | " 'result': 'higher',\n", 436 | " 'stock': 'IBM',\n", 437 | " 'todays_price': 159.61000000000001},\n", 438 | " {'percent_change': 6.5099999999999998,\n", 439 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 440 | " 'period_high': 784.84997599999997,\n", 441 | " 'period_low': 668.26000999999997,\n", 442 | " 'period_mean': 724.18143844999997,\n", 443 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 444 | " 'result': 'higher',\n", 445 | " 'stock': 'GOOG',\n", 446 | " 'todays_price': 771.30999999999995}]" 447 | ] 448 | }, 449 | "execution_count": 27, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "chord(header)(callback).get()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": { 462 | "collapsed": true 463 | }, 464 | "outputs": [], 465 | "source": [] 466 | } 467 | ], 468 | "metadata": { 469 | "kernelspec": { 470 | "display_name": "Python 3", 471 | "language": "python", 472 | "name": "python3" 473 | }, 474 | "language_info": { 475 | "codemirror_mode": { 476 | "name": "ipython", 477 | "version": 3 478 | }, 479 | "file_extension": ".py", 480 | "mimetype": 
"text/x-python", 481 | "name": "python", 482 | "nbconvert_exporter": "python", 483 | "pygments_lexer": "ipython3", 484 | "version": "3.4.2" 485 | } 486 | }, 487 | "nbformat": 4, 488 | "nbformat_minor": 1 489 | } 490 | -------------------------------------------------------------------------------- /notebooks/Chapter 3 - First Steps with Celery.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "sys.path.append('/var/www/pipelines/celery_app')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from datetime import datetime" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from tasks import get_stock_info" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "res = get_stock_info.delay('FB', datetime(2016, 1, 1), datetime.today())" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "'STARTED'" 69 | ] 70 | }, 71 | "execution_count": 6, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "res.status" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 7, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "'{\"Close min\":{\"FB\":94.160004},\"Close max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005},\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Adj Close min\":{\"FB\":94.160004},\"Adj Close max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005},\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995}}'" 91 | ] 92 | }, 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "res.get()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 8, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "res = get_stock_info.apply_async(('FB', datetime(2016, 1, 1), datetime.today()), queue='priority')" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 9, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "'{\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Close min\":{\"FB\":94.160004},\"Close 
max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005},\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"Adj Close min\":{\"FB\":94.160004},\"Adj Close max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995}}'" 124 | ] 125 | }, 126 | "execution_count": 9, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "res.get()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from tasks import get_stock_info, price_range, determine_buy, sort_results" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 11, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "from celery import chain, group, chord" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 12, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "start = datetime(2016, 1, 1)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 13, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "end = datetime.today()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 14, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "my_chain = chain(price_range.s('FB', start, end), determine_buy.s())" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 15, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "tasks.price_range('FB', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 13, 26, 38, 292856)) | tasks.determine_buy()" 201 | ] 202 | }, 203 | "execution_count": 15, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "my_chain" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 16, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "res = my_chain()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 17, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "" 234 | ] 235 | }, 236 | "execution_count": 17, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "res" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 18, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "'SUCCESS'" 256 | ] 257 | }, 258 | "execution_count": 18, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "res.status" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 19, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "False" 278 | ] 279 | }, 280 | "execution_count": 19, 281 | "metadata": {}, 
282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "res.get()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 20, 292 | "metadata": { 293 | "collapsed": true 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "my_stocks = ['FB', 'GOOG', 'IBM']" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 21, 303 | "metadata": { 304 | "collapsed": true 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "my_group = group(get_stock_info.s(stk, start, end) for stk in my_stocks)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 22, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "[tasks.get_stock_info('FB', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 13, 26, 38, 292856)), tasks.get_stock_info('GOOG', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 13, 26, 38, 292856)), tasks.get_stock_info('IBM', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 13, 26, 38, 292856))]" 322 | ] 323 | }, 324 | "execution_count": 22, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "my_group" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 23, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "res = my_group()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 24, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "" 355 | ] 356 | }, 357 | "execution_count": 24, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "res" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 25, 369 | "metadata": { 370 | "collapsed": false 371 | }, 372 | "outputs": [ 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "['{\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995},\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Close min\":{\"FB\":94.160004},\"Close max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005},\"Adj Close min\":{\"FB\":94.160004},\"Adj Close max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005}}',\n", 377 | " '{\"Open min\":{\"GOOG\":667.849976},\"Open max\":{\"GOOG\":785.0},\"Open mean\":{\"GOOG\":724.2199360063},\"Open median\":{\"GOOG\":722.4649965},\"Close min\":{\"GOOG\":668.26001},\"Close max\":{\"GOOG\":784.849976},\"Close mean\":{\"GOOG\":724.18143845},\"Close median\":{\"GOOG\":719.6299745},\"Low min\":{\"GOOG\":663.059998},\"Low max\":{\"GOOG\":782.969971},\"Low mean\":{\"GOOG\":717.8580696687},\"Low median\":{\"GOOG\":716.169983},\"Adj Close min\":{\"GOOG\":668.26001},\"Adj Close max\":{\"GOOG\":784.849976},\"Adj Close mean\":{\"GOOG\":724.18143845},\"Adj Close median\":{\"GOOG\":719.6299745},\"High min\":{\"GOOG\":672.299988},\"High max\":{\"GOOG\":789.869995},\"High mean\":{\"GOOG\":730.2307738688},\"High median\":{\"GOOG\":725.828003}}',\n", 378 | " '{\"Low 
min\":{\"IBM\":116.900002},\"Low max\":{\"IBM\":162.179993},\"Low mean\":{\"IBM\":144.4778746625},\"Low median\":{\"IBM\":147.3549955},\"High min\":{\"IBM\":119.660004},\"High max\":{\"IBM\":164.949997},\"High mean\":{\"IBM\":146.7556872938},\"High median\":{\"IBM\":149.774994},\"Open min\":{\"IBM\":118.459999},\"Open max\":{\"IBM\":163.190002},\"Open mean\":{\"IBM\":145.5231246875},\"Open median\":{\"IBM\":148.4449995},\"Close min\":{\"IBM\":117.849998},\"Close max\":{\"IBM\":163.529999},\"Close mean\":{\"IBM\":145.7649376375},\"Close median\":{\"IBM\":148.4800035},\"Adj Close min\":{\"IBM\":115.72409},\"Adj Close max\":{\"IBM\":163.529999},\"Adj Close mean\":{\"IBM\":143.7099821812},\"Adj Close median\":{\"IBM\":146.422927}}']" 379 | ] 380 | }, 381 | "execution_count": 25, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "res.get()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 26, 393 | "metadata": { 394 | "collapsed": true 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "header = (price_range.subtask((stk, start, end)) for stk in my_stocks)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 27, 404 | "metadata": { 405 | "collapsed": true 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "callback = sort_results.subtask()" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 28, 415 | "metadata": { 416 | "collapsed": false 417 | }, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "1 loop, best of 3: 1.08 s per loop\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "%timeit chord(header)(callback).get()" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 29, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "The slowest run took 34.82 times longer than the fastest. 
This could mean that an intermediate result is being cached.\n", 443 | "10000 loops, best of 3: 67.9 µs per loop\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "%timeit sort_results(group(header)().get())" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "collapsed": true 456 | }, 457 | "outputs": [], 458 | "source": [] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.4.2" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 1 482 | } 483 | -------------------------------------------------------------------------------- /notebooks/Chapter 3 - Monitoring Tasks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "sys.path.append('/var/www/pipelines/celery_app')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from datetime import datetime" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from tasks import get_stock_info" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "start = datetime(2016, 1, 1)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "end = datetime.today()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 7, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "res = get_stock_info.delay('FB', start, end)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 8, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "'SUCCESS'" 91 | ] 92 | }, 93 | "execution_count": 8, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "res.status" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 9, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "res = get_stock_info.delay('MYCOOLSTOCK', start, end)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 10, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "'FAILURE'" 124 | ] 125 | }, 126 | "execution_count": 10, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "res.status" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | 
"execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.4.2" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 1 166 | } 167 | -------------------------------------------------------------------------------- /notebooks/Chapter 4 - Dask Distributed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from dask import do\n", 12 | "from distributed import LocalCluster, Executor\n", 13 | "from configparser import ConfigParser\n", 14 | "import requests\n", 15 | "import numpy as np" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "You must have a folder `config` in the parent directory or current directory or simply modify the `get_config` method. You will also need to [acquire an API Key for the OpenWeatherMap API](http://openweathermap.org/appid). Your `prod.cfg` file in the aforementioned `config` folder should have a section like so:\n", 23 | "\n", 24 | "```\n", 25 | "[openweather]\n", 26 | "api_key=425b9b9e2416cjfr47329434jk2lX4u32\n", 27 | "```\n", 28 | "with your assigned key from OpenWeatherMap." 
29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "def get_current(location_str, config):\n", 38 | " '''Get latest temperature data from openweather\n", 39 | " params:\n", 40 | " location_str: string with city,country_code\n", 41 | " config: ConfigParser object with openweather section and api_key key\n", 42 | " returns:\n", 43 | " tuple: (location_str, parsed json response) \n", 44 | " '''\n", 45 | " weather_key = config.get('openweather', 'api_key')\n", 46 | " resp = requests.get('http://api.openweathermap.org/data/2.5/weather', \n", 47 | " params={'q': location_str, \n", 48 | " 'appid': weather_key, \n", 49 | " 'units': 'metric'}) \n", 50 | " return location_str, resp.json()\n", 51 | " " 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "def get_forecast(location_str, config):\n", 63 | " '''Get forecast temperature data from openweather\n", 64 | " params:\n", 65 | " location_str: string with city,country_code\n", 66 | " config: ConfigParser object with openweather section and api_key key\n", 67 | " returns:\n", 68 | " tuple: (location_str, parsed json response)\n", 69 | " '''\n", 70 | " weather_key = config.get('openweather', 'api_key')\n", 71 | " resp = requests.get('http://api.openweathermap.org/data/2.5/forecast', \n", 72 | " params={'q': location_str, \n", 73 | " 'appid': weather_key, \n", 74 | " 'units': 'metric'})\n", 75 | " return location_str, resp.json()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "def filter_temp(location_str, weather_json):\n", 87 | " '''Filter out just the city, temperature, and humidity in forecast or current weather data.\n", 88 | " params:\n", 89 | " location_str: string with city,country_code\n", 90 | " weather_json: json returned from get_forecast or get_current \n", 91 | " returns:\n", 92 | " dict: containing city names and either list of forecast temps and humidity or current temp and humidity\n", 93 | " '''\n", 94 | " if 'cod' in weather_json.keys() and int(weather_json['cod']) != 200:\n", 95 | " raise ValueError('Bad Data Returned from API: {} - {}'.format(\n", 96 | " location_str, weather_json))\n", 97 | " try:\n", 98 | " api_city_str = '{},{}'.format(weather_json['name'], weather_json['sys']['country'])\n", 99 | " except KeyError:\n", 100 | " api_city_str = '{},{}'.format(weather_json['city']['name'], weather_json['city']['country'])\n", 101 | " resp = {\n", 102 | " 'search_city': location_str,\n", 103 | " 'api_city': api_city_str,\n", 104 | " }\n", 105 | " if 'main' in weather_json.keys():\n", 106 | " resp['current_temp'] = weather_json['main']['temp']\n", 107 | " resp['current_humidity'] = weather_json['main']['humidity']\n", 108 | " else:\n", 109 | " resp['forecast_temps'] = [fr['main']['temp'] for fr in weather_json['list']]\n", 110 | " resp['forecast_humidity'] = [fr['main']['humidity'] for fr in weather_json['list']]\n", 111 | " return resp" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "def merge_data(latest, forecast):\n", 123 | " ''' Merge data from current and forecast dictionaries and avg forecasts\n", 124 | " params:\n", 125 | " latest: filtered dictionary from 
get_latest\n", 126 | " forecast: filtered dictionary from get_forecast\n", 127 | " returns:\n", 128 | " dict: merged dict with additional mean for forecasts\n", 129 | " '''\n", 130 | " final = latest.copy()\n", 131 | " final.update(forecast)\n", 132 | " mean_tmp, mean_hum = np.mean(forecast['forecast_temps']), np.mean(forecast['forecast_humidity'])\n", 133 | " final['mean_temp'] = np.round(mean_tmp, 2)\n", 134 | " final['mean_hum'] = np.round(mean_hum, 2)\n", 135 | " return final" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": true 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "def main(city):\n", 147 | " ''' Main function which will take city names and return a final dataset for each city\n", 148 | " params:\n", 149 | " city: string (ex: 'Berlin,DE')\n", 150 | " returns:\n", 151 | " dict: current and forecast temps and humidities for given city\n", 152 | " '''\n", 153 | " config = get_config()\n", 154 | " city_str, weather_data = get_current(city, config)\n", 155 | " latest = filter_temp(city_str, weather_data)\n", 156 | " city_str, weather_data = get_forecast(city, config)\n", 157 | " forecast = filter_temp(city_str, weather_data)\n", 158 | " final = merge_data(latest, forecast)\n", 159 | " return final" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "def get_config():\n", 171 | " ''' returns config '''\n", 172 | " config = ConfigParser()\n", 173 | " config.read(['../config/prod.cfg', 'config/prod.cfg'])\n", 174 | " return config" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "city_list = ['London,UK', 'Berlin,DE', 'NewYork,NY', \n", 186 | " 'LosAngeles,CA', 'Madrid,ES', 'Bangkok,TH', \n", 187 | " 'Baghdad,IQ', 'Auckland,NZ', 'Istanbul,TR',\n", 188 | " 'MexicoCity,MX', 'Primavera,CL', 'KualaLumpur,MY',\n", 189 | " 'Shanghai,CN', 'Chicago,IL', 'Rome,IT', 'Nairobi,KE',\n", 190 | " 'MachuPicchu,PE', 'Cardiff,UK', 'Somewhere,WL']" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "%%time\n", 200 | "res = []\n", 201 | "for city in city_list:\n", 202 | " try:\n", 203 | " final = main(city)\n", 204 | " res.append(final)\n", 205 | " except Exception as e:\n", 206 | " print(city, e)\n", 207 | "\n", 208 | "print('sorted by current temp: ', sorted(res, key=lambda x: x.get('current_temp'), reverse=True))\n", 209 | "print('sorted by upcoming forecast temp: ', sorted(res, key=lambda x: x.get('mean_temp'), reverse=True))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "Depending on your setup the `start_diagnostics_server` which starts the web UI for analyzing your Dask scheduler and work via the Executor may or may not work. If it doesn't work out of the box, you'll need to start the dask-scheduler a different way. Easiest is using:\n", 217 | "\n", 218 | "`/path/to/your/virtualenv/bin/dask-scheduler`\n", 219 | "\n", 220 | "which will start the scheduler process in your terminal as well as the Bokeh server for the web UI. The output should have the links for both the web UI (usually [localhost:8787](http://127.0.0.1:8787)) as well as the local scheduler. 
\n", 221 | "\n", 222 | "In a new shell or screen session, run the worker nodes with however many workers you'd like (here I chose 8):\n", 223 | "\n", 224 | "`/path/to/your/virtualenv/bin/dask-worker --nprocs 8 127.0.0.1:8786`\n", 225 | "\n", 226 | "I recommend using [`screen`](https://www.gnu.org/software/screen/) so you can easily switch between shells and keep track of logs. Once installed, you can create a new named screen like so: `screen -S scheduler`, use ctl + a followed by d to detach back to your main shell and ctl + a followed by k to kill the screen session when you are done. To reattach to a running named screen you can use `screen -r scheduler`. Read through the docs for more info. \n", 227 | "\n", 228 | "Then you can utilize the code directly below this cell instead of the `LocalCluster` code two cells below." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "exc = Executor('127.0.0.1:8786') # You may want to change this to the exact IP shown when you ran dask-scheduler" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "lc = LocalCluster()\n", 247 | "lc.start_diagnostics_server() \n", 248 | "exc = Executor(lc)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "%%time\n", 258 | "\n", 259 | "futures = [e.submit(main, i) for i in city_list]\n", 260 | "print(futures)\n", 261 | "print('sorted by current temp', \n", 262 | " sorted([f.result() for f in futures if f.status != 'error'], \n", 263 | " key=lambda x: x['current_temp'], reverse=True))\n", 264 | "print('sorted by forecast temp', \n", 265 | " sorted([f.result() for f in futures if f.status != 'error'], \n", 266 | " key=lambda x: x['mean_temp'], reverse=True))" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "example_error = futures[-1]" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "example_error.status" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "example_error.result()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "Python 3", 309 | "language": "python", 310 | "name": "python3" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 3 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython3", 322 | "version": "3.6.1" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 1 327 | } 328 | -------------------------------------------------------------------------------- /notebooks/Chapter 4 - Learning Dask Bags.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from dask import 
bag\n", 10 | "import json\n", 11 | "from bokeh.plotting import output_notebook\n", 12 | "output_notebook()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "Some of this notebook is taken from [the Dask Examples repository](https://github.com/dask/dask-examples/blob/master/github-on-ec2.ipynb)\n", 20 | "\n", 21 | "To gather the data, I ran this in my terminal from the `data` directory:\n", 22 | "\n", 23 | "`wget http://data.githubarchive.org/2016-01-01-{0..23}.json.gz\n", 24 | "wget http://data.githubarchive.org/2015-12-31-{0..23}.json.gz`\n", 25 | "\n", 26 | "This is not (by any means) big data, but is used for example" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "db = bag.read_text(['../data/2016*.json.gz', '../data/2015*.json.gz']).map(json.loads)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "db.count().compute()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "first = db.take(1)[0]\n", 54 | "first" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "tenth = db.take(10)[-1]\n", 64 | "tenth" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "%time db.pluck('type').frequencies().compute()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "import re\n", 83 | "time_pattern = re.compile('[\\d\\-]+T(?P[\\d]+)')\n", 84 | "\n", 85 | "pushes = db.filter(lambda x: x['type'] == 'PushEvent')\n", 86 | "hours = pushes.pluck('created_at').map(lambda x: re.search(time_pattern, x).group('hour'))\n", 87 | "top_10_hours = hours.frequencies().topk(10, key=lambda time, count: count)\n", 88 | "%time top_10_hours.compute()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "def get_hours(x):\n", 98 | " \"\"\"The key for foldby, like a groupby key. 
Get the hour from a PushEvent\"\"\"\n", 99 | " return re.search(time_pattern, x['created_at']).group('hour')\n", 100 | "\n", 101 | "def binop(total, x):\n", 102 | " \"\"\"Count the number of commits in a PushEvent\"\"\"\n", 103 | " return total + len(x['payload']['commits'])\n", 104 | "\n", 105 | "def combine(total1, total2):\n", 106 | " \"\"\"This combines commit counts from PushEvents\"\"\"\n", 107 | " return total1 + total2\n", 108 | "\n", 109 | "commits = pushes.foldby(get_hours, binop, initial=0, combine=combine)\n", 110 | "top_commits = commits.topk(10, key=lambda time, count: count)\n", 111 | "%time top_commits.compute()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "messages = pushes.pluck('payload').map(lambda x: ' '.join([c['message'].lower() for c in x['commits']]))\n", 121 | "top_10_words = messages.str.split().concat().frequencies().topk(10, lambda word, count: count)\n", 122 | "%time top_10_words.compute()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "If you haven't run `nltk` yet, you'll need to download your corpora. To do so, use this:\n", 130 | "\n", 131 | "`import nltk; nltk.download()`\n", 132 | "\n", 133 | "Follow the prompt and select (d) for Download and then type: `stopwords`\n", 134 | "\n", 135 | "Then you can use (q) to quit once the download is completed." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from nltk.corpus import stopwords" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "def get_combined_messages(x):\n", 156 | " long_str = ' '.join([c['message'].lower() for c in x['commits']])\n", 157 | " return ' '.join([w for w in long_str.split() if w not in stopwords.words('english')])" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "long_strs = pushes.pluck('payload').map(get_combined_messages)\n", 167 | "long_strs.take(5)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "top_20_words = long_strs.str.split().concat().frequencies().topk(20, lambda word, count: count)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "from dask.diagnostics import Profiler\n", 186 | "prof = Profiler()\n", 187 | "\n", 188 | "with prof:\n", 189 | " res = top_20_words.compute()\n", 190 | "\n", 191 | "prof.visualize()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "res" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | 
"nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.6.1" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 1 234 | } 235 | -------------------------------------------------------------------------------- /notebooks/Chapter 6 - Introduction to PySpark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Introduction to PySpark" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "" 21 | ] 22 | }, 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "sc" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "sqlCtx" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "20" 65 | ] 66 | }, 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "rdd = sc.parallelize(range(1000), 20) \n", 74 | "rdd.getNumPartitions()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "[0, 1, 2, 3, 4]" 88 | ] 89 | }, 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "rdd.take(5)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "text_rdd = sc.textFile(\n", 108 | " 'file:///Users/kjam/data-pipelines-course/data/europarl_speech_text.txt')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "PythonRDD[4] at RDD at PythonRDD.scala:48" 122 | ] 123 | }, 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "text_rdd.filter(lambda x: 'Deutschland' in x)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 7, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "'\"Ich begrüße die Gewährung eines finanziellen Beitrags aus dem Europäischen Fonds für die Anpassung an die Globalisierung, den die deutschen Behörden im Zuge der Entlassungen bei der Aleo Solar AG und zwei ihrer Tochtergesellschaften beantragt hatten.,Dieser Fonds wurde eingerichtet, um Arbeitnehmer, die unter den Folgen weitreichender Strukturveränderungen im Welthandelsgefüge zu leiden haben, zusätzlich zu unterstützen. 
Aus dem Fonds werden individuell angepasste Maßnahmen zur beruflichen Wiedereingliederung von Arbeitssuchenden finanziert, Schritte in die Selbständigkeit und Unternehmensgründungen gefördert, Mobilitätsbeihilfen sowie Beihilfen für benachteiligte oder ältere Arbeitnehmer gewährt.,China hat enorme Überkapazitäten bei Solarmodulen aufgebaut, die weder von den eigenen Verbrauchern noch vom Weltmarkt aufgenommen werden können. Zusammen mit dem weltweiten Rückgang der Nachfrage hat dies zu einem Preisverfall geführt. Da China ca. 80% seiner Produktion zu billigen Preisen nach Europa exportiert, war die Nachfrage nach China-Produkten größer als nach denen der deutschen Firma Aleo Solar. Dadurch lassen sich die 615 Entlassungen der Arbeitsgemeinschaft sowie zwei ihrer Tochtergesellschaften erklären.,Es ist deshalb erfreulich, dass die EU dem Antrag Deutschlands stattgegeben hat und Mittel zur Unterstützung der entlassenen Arbeitskräfte zur Verfügung stellt, in der Hoffnung, diesen durch die ergriffenen Maßnahmen zu dauerhaften, langfristigen und damit stabilen Beschäftigungsverhältnissen zu verhelfen.\"'" 144 | ] 145 | }, 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "text_rdd.filter(lambda x: 'Deutschland' in x).first()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 8, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "germany = text_rdd.filter(lambda x: 'Deutschland' in x)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 9, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "merkel = text_rdd.filter(lambda x: 'Merkel' in x)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 10, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "de_merkel = germany.union(merkel)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 11, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "UnionRDD[8] at union at NativeMethodAccessorImpl.java:-2" 199 | ] 200 | }, 201 | "execution_count": 11, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "de_merkel" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 12, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "890" 221 | ] 222 | }, 223 | "execution_count": 12, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "de_merkel.count()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 13, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "de_merkel = de_merkel.persist()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 14, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "pairs = de_merkel.map(lambda s: (s, 1))\n", 252 | "counts = pairs.reduceByKey(lambda a, b: a + b)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 15, 258 | "metadata": { 259 | "collapsed": false 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "[('\"Frau von Storch! 
Ich habe gesagt, dass es jetzt vor allem einmal um Kriegsflüchtlinge geht. Da war ich wirklich froh. Frau Merkel ist nicht von meiner Partei, und ich habe oft viel Kritik an ihr. Aber in dem Fall, als sie gesagt hat, dass in Deutschland Flüchtlinge aus Syrien nicht mehr nach Ungarn oder sonst wohin zurückgeschickt werden – das war richtig und gut. Und ich bin froh, dass auch mein Heimatland das endlich gemacht hat.,Um Kriegsflüchtlinge geht es hier, und für die brauchen wir Platz, für die brauchen wir tatsächlich auch Unterstützung. Aber Armutsflüchtlinge – wie viele sind denn aus Europa weggegangen, vor schrecklicher Armut aus Irland, auch aus meinem Land, auch aus Deutschland anderswohin geflüchtet, als die Zeiten schlecht waren? Dazu müssen wir auch beitragen, dass vor Ort Welthandelsstrukturen und ähnliches geändert werden, dass Menschen nicht mehr fliehen müssen. Aber wenn jemand flieht vor Klimakatastrophen, die wir mit verursachen, vor Armut, die wir mit verursachen, weil wir nicht genügend zahlen für Rohstoffe, dann muss man auch hier genau hinschauen und sehen, ob diese Leute daheim überhaupt überleben können oder nicht, oder ob sie nicht auch bei uns ein Recht haben zu leben.\"',\n", 266 | " 2),\n", 267 | " ('\"Ich habe heute für den Bericht über bildungs- und ausbildungspolitische Maßnahmen zum Abbau der Jugendarbeitslosigkeit gestimmt. Ein besonderes Augenmerk möchte ich als Europaabgeordneter der Familien-Partei Deutschlands dabei auf eine Zielgruppe lenken: Junge Mütter. Ihre Anliegen, wenn es um einen Start oder eine Rückkehr in den Arbeitsmarkt geht, müssen durch eine spezielle „skill policy“ gefördert werden, die auch die „skills“ sieht, anerkennt und grenzübergreifend vergleichbar macht, die Mütter und Väter in Erziehungszeit erwerben.\"',\n", 268 | " 1),\n", 269 | " ('\"– MrPresident, the EU is pandering to a country which is sliding ever closer to barbarism. The fact that this project is bribing Turkey with EUR2.2billion of EU taxpayers’ money, in a desperate bid to stem the migrant crisis of its own making, is a total political, economic and human disaster.,To stem this migration, the EU was willing to bend over backwards and ignore the war on the Kurds, the smuggling of oil from ISIS and the continual destruction of human rights in Turkey. Instead of dealing with the real issues of the Schengen zone and MsMerkel’s insane open doors policy, the EU is choosing to believe that a coup was attempted, when in reality it was set up by the Turkish Government to crack down on internal dissent.,I am only glad that we in the UK are getting out of the EU while we can, because this policy is madness. The cowardly, self-delusionary posturing of the EU towards Turkey is an utter disaster, and the accession process must end soon. It is an offence against both human rights and democracy. You are simply enabling a totalitarian regime, MsMogherini. Halt all talks now and stop giving bribes to them.\"',\n", 270 | " 1),\n", 271 | " ('\"Die Bewältigung des Flüchtlingsansturms auf Europa ist eine Aufgabe, die weder Deutschland, noch Ungarn, Dänemark oder Griechenland auf sich allein gestellt lösen kann. Die letzten Wochen haben gezeigt, dass nationale Alleingänge einzelner Mitgliedstaaten nur ins Chaos führen, gegenseitige Schuldzuweisungen bringen uns keinen Schritt weiter. Nur mit gemeinsamen europäischen Ansätzen können Lösungen erreicht werden. 
Kommissionspräsident Juncker hat richtigerweise an unsere gemeinsame humanitäre Verpflichtung zur Aufnahme von Menschen, die aus Angst um ihr Leben auf der Flucht sind, erinnert. Es ist dauerhaft nicht hinnehmbar, dass einige Mitgliedstaaten überhaupt keine Hilfe leisten und die Lasten einfach anderen überlassen. Deshalb führt kein Weg an einem Quotenmodell mit einem europäischen Verteilungsschlüssel vorbei. Außerdem müssen die bereits bestehenden Mindeststandards des Gemeinsamen Europäischen Asylsystems, zum Beispiel im Hinblick auf eine ordnungsgemäße Unterbringung und Behandlung der Flüchtlinge, in der gesamten EU wirksam angewendet werden. Um wirklich den Menschen helfen zu können, die unsere Hilfe am nötigsten haben, brauchen wir auch eine einheitliche Definition von sicheren Herkunftsländern in allen Mitgliedstaaten. Es muss allen klar sein, dass Asylanträge aus Ländern mit gefestigten Demokratien, wie etwa den Westbalkanländern, nicht akzeptiert werden können.\"',\n", 272 | " 1),\n", 273 | " ('\"Herr Präsident, liebe Kolleginnen und Kollegen, Frau Kommissarin! Normalerweise bin ich sehr stolz auf unsere deutsche Gründlichkeit. Aber in diesem Fall ist die deutsche Regierung weit über das Ziel hinausgeschossen mit ihrer Detailversessenheit. Es ist tatsächlich so, dass ich überhaupt nicht gegen Mindestlöhne als solche bin. Das hat die Frau Kommissarin gesagt: Darüber kann jedes Land selber entscheiden, das ist nationale Souveränität – vollkommen okay. In diesem Fall ist es aber das erste Mal – wir sind ja nicht das erste Land, das ein Mindestlohngesetz hat, ganz viele andere hatten es auch schon –, dass der Transportsektor, auch der Transitsektor, so detailversessen beschrieben worden ist, bis dahin – meine Kollegin hat es gesagt –, dass Faxe vorweg auf Deutsch geschickt werden sollen an die deutsche Regierung, die deutschen Behörden, wann genau welches Unternehmen sich wie lange auf deutschem Boden aufhält, und dass man sich versichern soll, dass man den deutschen Mindestlohn bezahlt. Meines Erachtens geht das deutlich übers Ziel hinaus, und es ist auch gegen europäisches Recht.,Die Prüfung ist noch nicht abgeschlossen, das haben Sie gesagt. Aber wie kann man denn für den Binnenmarkt sein, wo wir in den einzelnen Ländern unterschiedliche Lebensbedingungen haben, natürlich auch unterschiedliche Lohnniveaus. Das ist einfach so! Wir wollen auf keinen Fall, dass die LKW-Fahrer schlecht behandelt werden. Da liegt auch vieles im Argen. Das gebe ich genau so zu. Aber meines Erachtens – das hat auch Herr Ujhelyi gesagt, das betrifft auch die Petition und die Demonstration – ist dies der falsche Weg, um dem abzuhelfen. Da müssen wir wirklich anders vorgehen, um bessere Bedingungen für die LKW-Fahrer zu schaffen. Es ist tatsächlich so, dass meines Erachtens Deutschland damit ein Fass aufgemacht hat. Denn es ist das erste Mal, dass eben Logistik und Transport drin ist – eine europäische Politik.,Jetzt hat Frankreich nachgezogen. Frankreich will das jetzt genauso machen. Wenn andere Länder dazukommen, was soll denn dann bitte ein Spediteur machen, der durch fünf verschiedene Länder fährt? In Polen ist der Lohn relativ niedrig, bei 1,95 Euro oder auch ein bisschen mehr, in Luxemburg 11,10 Euro, in Frankreich 9,61 Euro, in Deutschland 8,50 Euro. Wie soll man das auseinanderdröseln, und wer soll das auch kontrollieren? Meines Erachtens geht das in diesem Fall ganz klar am Ziel, die Leute besserzustellen, vorbei und absolut gegen Europa und gegen den Binnenmarkt. 
Das können wir meines Erachtens überhaupt nicht dulden. Als Letztes: Ich befürchte auch, dass kleine mittelständische Unternehmen, die wir immer schützen wollen, nun gerade aufgeben müssen und Arbeitsplätzen verlustig gehen. Das ist auch nicht Europas Wille!\"',\n", 274 | " 1),\n", 275 | " ('\"In einer Situation, in der reiche Staaten wie Deutschland ihre Rentensysteme mittel- und langfristig nicht mehr finanzieren werden können, sind deutliche Reformschritte nötig. Statt wie Wolfgang Schäuble über die Erhöhung des Rentenalters nachzudenken, fordert ALFA eine detaillierte Überprüfung der Haushaltsausgaben, einschließlich des deutschen Beitrags in den EU-Haushalt.,ALFA setzt sich für einen effizienteren EU-Haushalt ein und wendet sich entschieden gegen die Verschwendung von Steuergeldern, die auf nationalstaatlicher ähnlich wie auf der EU-Ebene ein großes Problem ist. Eine solche Einstellung erwartet ALFA von dem ganzen Europaparlament, das die Entlastung zum EU-Budget erteilt.,Da die Entwürfe des Ausschusses für Haushaltskontrolle generell eine Reform des EU-Budgets nicht deutlich genug gefordert hatten, habe ich gegen die Entlastung bei den meisten EU-Haushaltslinien gestimmt. Dies betraf auch die Entlastung zum Gemeinsamen Unternehmen SESAR.\"',\n", 276 | " 2),\n", 277 | " ('\"In einer Situation, in der reiche Staaten wie Deutschland ihre Rentensysteme mittel- und langfristig nicht mehr finanzieren werden können, sind rasante Reformschritte nötig. Statt wie Wolfgang Schäuble über die Erhöhung des Rentenalters nachzudenken, fordert ALFA eine detaillierte Überprüfung der Haushaltsausgaben, einschließlich des deutschen Beitrags in den EU-Haushalt.,ALFA setzt sich für einen effizienteren EU-Haushalt ein und wendet sich entschieden gegen die Verschwendung von Steuergeldern, die auf nationalstaatlicher ähnlich wie auf der EU-Ebene ein großes Problem ist. Eine solche Einstellung erwartet ALFA vom ganzen Europäischen Parlament, das die Entlastung zum EU-Budget erteilt.,Da die Entwürfe des Ausschusses für Haushaltskontrolle generell eine Reform des EU-Budgets nicht stark genug gefordert hatten, habe ich gegen die Entlastung bei den meisten EU-Haushaltslinien gestimmt. Dies betraf auch die Entlastung zum Gemeinsamen Unternehmen Clean Sky.\"',\n", 278 | " 1),\n", 279 | " ('Ich habe heute für den Einwand gemäß Artikel 106: Genehmigung genetisch veränderter Sojabohnen der Sorte MON 87708 × MON 89788 gestimmt. Die Familien-Partei Deutschlands ist gegen den Einsatz gentechnisch manipulierten Saatgutes in der Landwirtschaft.',\n", 280 | " 2),\n", 281 | " ('\"Frau Präsidentin! Herr Pittella sprach eben davon, dass er sich wie in einem Film vorkäme. Ja, das gilt auch für mich. Nur habe ich das Gefühl, wir wären mitten in einem Filmriss und dieselben Szenen würden immer wieder vorgeführt. Genau diesen Eindruck habe ich. Das habe ich schon so oft in diesem Parlament gehört, diese Beschwörung dessen, was man tun muss. Ich sage ihnen: Wenn Herr Moscovici sagt, was Griechenland alles tun muss, das ist ein Entmündigungsprogramm! Und dann sagt er: Jetzt kommt Wachstum, jetzt kommen Investitionen, jetzt kommt Wettbewerbsfähigkeit. Nein, nichts davon kommt! Der Economist hat festgestellt: \"\",\"\". Schauen Sie doch mal die Statistiken nach! Warum ist in Deutschland die Zahl der Arbeitslosen von zehn Prozent auf fünf Prozent zurückgegangen, warum ist im Süden der Eurozone die Arbeitslosigkeit von zehn Prozent auf fünfzehn und mehr Prozent angestiegen? Weil der Euro ein falscher relativer Preis ist. Das ist alles! 
Man muss da ansetzen! Aber dieses Parlament geht nicht die Probleme an, sondern tanzt um das Goldene Kalb des Euro, ohne die Probleme in den Griff bekommen zu wollen.\"',\n", 282 | " 1),\n", 283 | " ('\"Frau Präsidentin! 1997 habe ich mit der damaligen Umweltministerin Angela Merkel die Umweltkonferenz in Kyoto besucht, und wir haben uns damals beide für spezifische und nicht für absolute CO2-Ziele eingesetzt. Inzwischen will Frau Merkel das Weltklima retten. Sie ist überhaupt beim Retten: Sie will den Euro retten, sie will Griechenland retten – gleich dreimal, sie will die Banken retten durch die Europäische Bankenunion zu Lasten der deutschen Sparkassen, und jetzt will sie die Weltflüchtlinge retten. Meine Damen und Herren! Immer mehr ausländische Kollegen auch in diesem Gremium zeigen und sagen mir, dass Frau Merkel und die Bundesregierung anscheinend an einem Helfersyndrom leiden. Das muss beendet werden!,Darf ich Ihnen bei dieser Gelegenheit auch nochmal sagen: Am deutschen Wesen soll Europa nicht weiter genesen. Im Übrigen – und das ist mein ceterum censeo – möchte ich darauf aufmerksam machen, dass der Einheitseuro Europa spaltet und abgeschafft werden muss.\"',\n", 284 | " 1)]" 285 | ] 286 | }, 287 | "execution_count": 15, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "counts.take(10)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "pairs = de_merkel.flatMap(lambda s: s.split(' ')).map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "pairs.take(4)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "pairs.sortBy(lambda k: k[1]).top(10)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "import re" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "pairs = de_merkel.flatMap(lambda s: re.findall('\\w+', s)).map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "my_df = pairs.sortBy(lambda p: p[1]).collect()" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": { 366 | "collapsed": false 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "my_df" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "type(my_df)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": false 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "my_df[-40:]" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "collapsed": true 400 | }, 401 | "outputs": [], 402 | "source": 
[] 403 | } 404 | ], 405 | "metadata": { 406 | "kernelspec": { 407 | "display_name": "Python 3", 408 | "language": "python", 409 | "name": "python3" 410 | }, 411 | "language_info": { 412 | "codemirror_mode": { 413 | "name": "ipython", 414 | "version": 3 415 | }, 416 | "file_extension": ".py", 417 | "mimetype": "text/x-python", 418 | "name": "python", 419 | "nbconvert_exporter": "python", 420 | "pygments_lexer": "ipython3", 421 | "version": "3.4.4" 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 0 426 | } 427 | -------------------------------------------------------------------------------- /notebooks/Chapter 6 - Introduction to Spark Streaming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyspark.streaming import StreamingContext" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false, 19 | "scrolled": true 20 | }, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "" 26 | ] 27 | }, 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "sc" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "ssc = StreamingContext(sc, 60)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "lines = ssc.socketTextStream(\"0.0.0.0\", 9999)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "" 70 | ] 71 | }, 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "lines" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "words = lines.flatMap(lambda line: line.split(\" \"))" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "pairs = words.map(lambda word: (word, 1))\n", 101 | "wordCounts = pairs.reduceByKey(lambda x, y: x + y)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 8, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "wordCounts.pprint()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "-------------------------------------------\n", 127 | "Time: 2016-10-16 15:18:00\n", 128 | "-------------------------------------------\n", 129 | "\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "ssc.start() # Start the computation\n", 135 | "ssc.awaitTermination() " 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": true 143 | }, 144 | "outputs": [], 145 | "source": [] 146 | } 147 | ], 148 | "metadata": { 149 | "kernelspec": { 150 | "display_name": 
"Python 3", 151 | "language": "python", 152 | "name": "python3" 153 | }, 154 | "language_info": { 155 | "codemirror_mode": { 156 | "name": "ipython", 157 | "version": 3 158 | }, 159 | "file_extension": ".py", 160 | "mimetype": "text/x-python", 161 | "name": "python", 162 | "nbconvert_exporter": "python", 163 | "pygments_lexer": "ipython3", 164 | "version": "3.4.4" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 0 169 | } 170 | -------------------------------------------------------------------------------- /notebooks/Chapter 7 - Testing with Hypothesis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys\n", 12 | "sys.path.append('../celery_app')" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 4, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "from hypothesis import given, note, strategies as st\n", 24 | "from tasks import calc_ratio" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 12, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "@given(st.floats(), st.floats())\n", 36 | "def test_calc_ratio(p, c):\n", 37 | " ratio = calc_ratio(p, c)\n", 38 | " assert isinstance(ratio, float)\n", 39 | " assert -100 <= ratio <= 100\n", 40 | " assert len(str(ratio).split('.')) == 2\n", 41 | " assert len(str(ratio).split('.')[1]) == 2" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 13, 47 | "metadata": { 48 | "collapsed": false, 49 | "scrolled": true 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Falsifying example: test_calc_ratio(p=0.0, c=0.0)\n" 57 | ] 58 | }, 59 | { 60 | "ename": "ZeroDivisionError", 61 | "evalue": "float division by zero", 62 | "output_type": "error", 63 | "traceback": [ 64 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 65 | "\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)", 66 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 67 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m 
\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 68 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mwrapped_test\u001b[0;34m(*arguments, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m reify_and_execute(\n\u001b[1;32m 523\u001b[0m \u001b[0msearch_strategy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mprint_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_final\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m ))\n\u001b[1;32m 526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mUnsatisfiedAssumption\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStopTest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 69 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/executors.py\u001b[0m in \u001b[0;36mdefault_new_style_executor\u001b[0;34m(data, function)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_new_style_executor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 70 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 109\u001b[0m lambda: 'Trying example: %s(%s)' % (\n\u001b[1;32m 110\u001b[0m test.__name__, arg_string(test, args, kwargs)))\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 71 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m(p, c)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 72 | "\u001b[0;32m/home/katharine/wrrk/my_classes/data_pipelines_course/celery_app/tasks.py\u001b[0m in \u001b[0;36mcalc_ratio\u001b[0;34m(price, compare)\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0mreturns\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m '''\n\u001b[0;32m---> 42\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mround\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprice\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mcompare\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 43\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 73 | "\u001b[0;31mZeroDivisionError\u001b[0m: float division by zero" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "test_calc_ratio()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 14, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "@given(st.floats(min_value=4), st.floats(min_value=4))\n", 90 | "def test_calc_ratio(p, c):\n", 91 | " ratio = calc_ratio(p, c)\n", 92 | " assert isinstance(ratio, float)\n", 93 | " assert -100 <= ratio <= 100\n", 94 | " assert len(str(ratio).split('.')) == 2\n", 95 | " assert len(str(ratio).split('.')[1]) == 2" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 15, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "Falsifying example: test_calc_ratio(p=4.0, c=4.0)\n" 110 | ] 111 | }, 112 | { 113 | "ename": "AssertionError", 114 | "evalue": "", 115 | "output_type": "error", 116 | "traceback": [ 117 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 118 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 119 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 120 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 121 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mwrapped_test\u001b[0;34m(*arguments, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m reify_and_execute(\n\u001b[1;32m 523\u001b[0m \u001b[0msearch_strategy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mprint_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_final\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m ))\n\u001b[1;32m 526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mUnsatisfiedAssumption\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStopTest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 122 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/executors.py\u001b[0m in \u001b[0;36mdefault_new_style_executor\u001b[0;34m(data, function)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_new_style_executor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 123 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 109\u001b[0m lambda: 'Trying example: %s(%s)' % (\n\u001b[1;32m 110\u001b[0m test.__name__, arg_string(test, args, kwargs)))\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m 
\u001b[0;32mreturn\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 124 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m(p, c)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 125 | "\u001b[0;31mAssertionError\u001b[0m: " 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "test_calc_ratio()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 16, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "'0'" 144 | ] 145 | }, 146 | "execution_count": 16, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "str(calc_ratio(4, 4)).split('.')[1]" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 19, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "@given(st.floats(min_value=4), st.floats(min_value=4))\n", 164 | "def test_calc_ratio(p, c):\n", 165 | " ratio = calc_ratio(p, c)\n", 166 | " assert isinstance(ratio, float)\n", 167 | " assert -100 <= ratio <= 100\n", 168 | " assert len(str(ratio).split('.')) == 2\n", 169 | " assert len(str(ratio).split('.')[1]) <= 2" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 20, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "Falsifying example: test_calc_ratio(p=8.000200000000001, c=4.0)\n" 184 | ] 185 | }, 186 | { 187 | "ename": "AssertionError", 188 | "evalue": "", 189 | "output_type": "error", 190 | "traceback": [ 191 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 192 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 193 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 194 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m 
\u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 195 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mwrapped_test\u001b[0;34m(*arguments, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m reify_and_execute(\n\u001b[1;32m 523\u001b[0m \u001b[0msearch_strategy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mprint_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_final\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m ))\n\u001b[1;32m 526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mUnsatisfiedAssumption\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStopTest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 196 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/executors.py\u001b[0m in \u001b[0;36mdefault_new_style_executor\u001b[0;34m(data, function)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_new_style_executor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 197 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 109\u001b[0m lambda: 'Trying example: %s(%s)' % (\n\u001b[1;32m 110\u001b[0m test.__name__, arg_string(test, args, kwargs)))\n\u001b[0;32m--> 111\u001b[0;31m 
\u001b[0;32mreturn\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 198 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m(p, c)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 199 | "\u001b[0;31mAssertionError\u001b[0m: " 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "test_calc_ratio()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 21, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "100.01" 218 | ] 219 | }, 220 | "execution_count": 21, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "calc_ratio(8.000200000000001, 4)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 22, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "@given(st.floats(min_value=4), st.floats(min_value=4))\n", 238 | "def test_calc_ratio(p, c):\n", 239 | " ratio = calc_ratio(p, c)\n", 240 | " assert isinstance(ratio, float)\n", 241 | " assert len(str(ratio).split('.')) == 2\n", 242 | " assert len(str(ratio).split('.')[1]) <= 2" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 23, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "Falsifying example: test_calc_ratio(p=400000000000004.0, c=4.0)\n" 257 | ] 258 | }, 259 | { 260 | "ename": "AssertionError", 261 | "evalue": "", 262 | "output_type": "error", 263 | "traceback": [ 264 | 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 265 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 266 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 267 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 268 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mwrapped_test\u001b[0;34m(*arguments, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m reify_and_execute(\n\u001b[1;32m 523\u001b[0m \u001b[0msearch_strategy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mprint_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_final\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m ))\n\u001b[1;32m 526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mUnsatisfiedAssumption\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStopTest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 269 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/executors.py\u001b[0m in \u001b[0;36mdefault_new_style_executor\u001b[0;34m(data, function)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_new_style_executor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfunction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 270 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 109\u001b[0m lambda: 'Trying example: %s(%s)' % (\n\u001b[1;32m 110\u001b[0m test.__name__, arg_string(test, args, kwargs)))\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 271 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m(p, c)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 272 | "\u001b[0;31mAssertionError\u001b[0m: " 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "test_calc_ratio()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 25, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "'1e+16'" 291 | ] 292 | }, 293 | "execution_count": 25, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "calc_ratio(400000000000004.0, 4.0)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 26, 305 | "metadata": { 306 | "collapsed": true 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "@given(st.floats(min_value=4, max_value=10000), st.floats(min_value=4, max_value=10000))\n", 311 | "def test_calc_ratio(p, c):\n", 312 | " ratio = calc_ratio(p, c)\n", 313 | " assert isinstance(ratio, float)\n", 314 | " assert len(str(ratio).split('.')) == 2\n", 
315 | " assert len(str(ratio).split('.')[1]) <= 2" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 27, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "test_calc_ratio()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": true 334 | }, 335 | "outputs": [], 336 | "source": [] 337 | } 338 | ], 339 | "metadata": { 340 | "kernelspec": { 341 | "display_name": "Python 3", 342 | "language": "python", 343 | "name": "python3" 344 | }, 345 | "language_info": { 346 | "codemirror_mode": { 347 | "name": "ipython", 348 | "version": 3 349 | }, 350 | "file_extension": ".py", 351 | "mimetype": "text/x-python", 352 | "name": "python", 353 | "nbconvert_exporter": "python", 354 | "pygments_lexer": "ipython3", 355 | "version": "3.4.3" 356 | } 357 | }, 358 | "nbformat": 4, 359 | "nbformat_minor": 1 360 | } 361 | -------------------------------------------------------------------------------- /notebooks/Extras (Chapter 4) - Clean Vehicle Theft Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "from datetime import datetime" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 12, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "df = pd.read_csv('/home/katharine/Downloads/datasets/mvt.csv')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 13, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/html": [ 36 | "
\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | "
DateLatitudeLongitude
012/31/12 23:1541.756284-87.621645
112/31/12 22:0041.898788-87.661303
212/31/12 22:0041.969186-87.767670
312/31/12 22:0041.769329-87.657726
412/31/12 21:3041.837568-87.621761
\n", 79 | "
" 80 | ], 81 | "text/plain": [ 82 | " Date Latitude Longitude\n", 83 | "0 12/31/12 23:15 41.756284 -87.621645\n", 84 | "1 12/31/12 22:00 41.898788 -87.661303\n", 85 | "2 12/31/12 22:00 41.969186 -87.767670\n", 86 | "3 12/31/12 22:00 41.769329 -87.657726\n", 87 | "4 12/31/12 21:30 41.837568 -87.621761" 88 | ] 89 | }, 90 | "execution_count": 13, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "df.head()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 14, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "df['DateTime'] = df['Date'].map(lambda d: datetime.strptime(d, '%m/%d/%y %H:%M'))" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 17, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "df['Hour'] = df['DateTime'].map(lambda d: d.hour)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 23, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "df['DayOfWeek'] = df['DateTime'].map(lambda d: d.weekday())" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 25, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "df['Date'] = df['DateTime'].map(lambda d: d.date())" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 26, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "Date object\n", 154 | "Latitude float64\n", 155 | "Longitude float64\n", 156 | "DateTime datetime64[ns]\n", 157 | "Hour int64\n", 158 | "DayOfWeek int64\n", 159 | "dtype: object" 160 | ] 161 | }, 162 | "execution_count": 26, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "df.dtypes" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 27, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/html": [ 181 | "
\n", 182 | "\n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | "
DateLatitudeLongitudeDateTimeHourDayOfWeek
02012-12-3141.756284-87.6216452012-12-31 23:15:00230
12012-12-3141.898788-87.6613032012-12-31 22:00:00220
22012-12-3141.969186-87.7676702012-12-31 22:00:00220
32012-12-3141.769329-87.6577262012-12-31 22:00:00220
42012-12-3141.837568-87.6217612012-12-31 21:30:00210
\n", 242 | "
" 243 | ], 244 | "text/plain": [ 245 | " Date Latitude Longitude DateTime Hour DayOfWeek\n", 246 | "0 2012-12-31 41.756284 -87.621645 2012-12-31 23:15:00 23 0\n", 247 | "1 2012-12-31 41.898788 -87.661303 2012-12-31 22:00:00 22 0\n", 248 | "2 2012-12-31 41.969186 -87.767670 2012-12-31 22:00:00 22 0\n", 249 | "3 2012-12-31 41.769329 -87.657726 2012-12-31 22:00:00 22 0\n", 250 | "4 2012-12-31 41.837568 -87.621761 2012-12-31 21:30:00 21 0" 251 | ] 252 | }, 253 | "execution_count": 27, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "df.head()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 29, 265 | "metadata": { 266 | "collapsed": true 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "df.to_csv('/home/katharine/Downloads/datasets/mvt_cleaned.csv', index=False)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.4.3" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 1 304 | } 305 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | airflow==1.7.1.3 2 | alembic==0.8.8 3 | amqp==1.4.9 4 | anyjson==0.3.3 5 | appnope==0.1.0 6 | asgi-redis==0.14.1 7 | asgiref==0.14.0 8 | autobahn==0.16.0 9 | Babel==1.3 10 | backports-abc==0.4 11 | billiard==3.3.0.23 12 | bitarray==0.8.1 13 | bokeh==0.12.1 14 | boto3==1.4.0 15 | botocore==1.4.49 16 | celery==3.1.23 17 | cffi==1.8.3 18 | channels==0.17.3 19 | chartkick==0.4.2 20 | click==6.6 21 | cloudpickle==0.2.1 22 | croniter==0.3.12 23 | cryptography==1.5.2 24 | daphne==0.15.0 25 | dask==0.10.2 26 | decorator==4.0.10 27 | dill==0.2.5 28 | distributed==1.11.3 29 | Django==1.10.2 30 | docutils==0.12 31 | entrypoints==0.2.2 32 | filechunkio==1.8 33 | Flask==0.10.1 34 | Flask-Admin==1.4.0 35 | Flask-Cache==0.13.1 36 | Flask-Login==0.2.11 37 | Flask-WTF==0.12 38 | flower==0.9.1 39 | funcsigs==0.4 40 | future==0.15.2 41 | futures==3.0.5 42 | gevent==1.1.2 43 | graphviz==0.4.10 44 | greenlet==0.4.10 45 | gunicorn==19.3.0 46 | h5py==2.6.0 47 | hive-thrift-py==0.0.1 48 | httplib2==0.9.2 49 | hypothesis==3.6.0 50 | idna==2.1 51 | impyla==0.13.8 52 | ipykernel==4.4.1 53 | ipython==5.1.0 54 | ipython-genutils==0.1.0 55 | ipywidgets==5.2.2 56 | itsdangerous==0.24 57 | Jinja2==2.8 58 | jmespath==0.9.0 59 | jsonschema==2.5.1 60 | jupyter==1.0.0 61 | jupyter-client==4.3.0 62 | jupyter-console==5.0.0 63 | jupyter-core==4.1.1 64 | kombu==3.0.35 65 | locket==0.2.0 66 | lockfile==0.12.2 67 | luigi==2.3.2 68 | Mako==1.0.4 69 | Markdown==2.6.7 70 | MarkupSafe==0.23 71 | mistune==0.7.3 72 | msgpack-python==0.4.8 73 | mysqlclient==1.3.9 74 | nbconvert==4.2.0 75 | nbformat==4.1.0 76 | nltk==3.2.1 77 | notebook==4.2.2 78 | numexpr==2.6.1 79 | numpy==1.11.1 80 | oauthlib==2.0.0 81 | pamela==0.2.1 82 | pandas==0.18.1 83 | pandas-datareader==0.2.1 84 | partd==0.3.6 85 | pexpect==4.2.0 86 | 
pickleshare==0.7.4
87 | ply==3.9
88 | prompt-toolkit==1.0.6
89 | psutil==4.3.0
90 | ptyprocess==0.5.1
91 | py==1.4.31
92 | pyasn1==0.1.9
93 | pyasn1-modules==0.0.8
94 | pycparser==2.14
95 | Pygments==2.1.3
96 | PyHive==0.2.1
97 | pytest==3.0.3
98 | python-daemon==2.1.1
99 | python-dateutil==2.5.3
100 | python-editor==1.0.1
101 | python-google-places==1.2.0
102 | pytz==2016.6.1
103 | PyYAML==3.11
104 | pyzmq==15.4.0
105 | qtconsole==4.2.1
106 | redis==2.10.5
107 | requests==2.11.1
108 | requests-file==1.4
109 | requests-oauthlib==0.7.0
110 | s3fs==0.0.7
111 | s3transfer==0.1.2
112 | scipy==0.18.0
113 | setproctitle==1.1.10
114 | simplegeneric==0.8.1
115 | simplejson==3.8.2
116 | six==1.10.0
117 | SQLAlchemy==1.0.14
118 | tables==3.2.3.1
119 | tblib==1.3.0
120 | terminado==0.6
121 | thrift==0.9.3
122 | thriftpy==0.3.9
123 | toolz==0.8.0
124 | tornado==4.2
125 | traitlets==4.2.2
126 | tweepy==3.5.0
127 | Twisted==16.4.1
128 | txaio==2.5.1
129 | unicodecsv==0.14.1
130 | uritemplate==0.6
131 | wcwidth==0.1.7
132 | Werkzeug==0.11.10
133 | widgetsnbextension==1.2.6
134 | WTForms==2.1
135 | zope.interface==4.3.2
136 | 
--------------------------------------------------------------------------------
/streaming/tweepy_stream.py:
--------------------------------------------------------------------------------
1 | """ Module to load tweets for spark streaming access. Modified only slightly
2 | from this SO answer: http://stackoverflow.com/questions/27882631/consuming-twitter-stream-with-tweepy-and-serving-content-via-websocket-with-geve"""
3 | from __future__ import absolute_import, print_function
4 | import gevent
5 | import gevent.monkey
6 | gevent.monkey.patch_all()
7 | from gevent.server import StreamServer
8 | 
9 | from tweepy.streaming import StreamListener
10 | from tweepy import OAuthHandler, Stream
11 | from configparser import ConfigParser
12 | from random import choice
13 | import json
14 | import os
15 | 
16 | CONFIG_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
17 |                                           '..', 'config/'))
18 | 
19 | 
20 | class SparkStreamListener(StreamListener):
21 |     """ Use twitter streaming API to stream to PySpark. """
22 |     def __init__(self):
23 |         config = ConfigParser()
24 |         config.read(os.path.join(CONFIG_DIR, 'prod.cfg'))
25 |         self.sockets = []
26 |         auth = OAuthHandler(config.get('twitter', 'consumer_key'),
27 |                             config.get('twitter', 'consumer_secret'))
28 |         auth.set_access_token(config.get('twitter', 'access_token'),
29 |                               config.get('twitter', 'access_token_secret'))
30 |         self.stream = Stream(auth, self)
31 | 
32 |     def add_socket(self, ws):
33 |         self.sockets.append(ws)
34 |         print(self.sockets)
35 | 
36 |     def run(self):
37 |         try:
38 |             self.stream.filter(track=['python'])
39 |         except Exception as e:
40 |             print(e)
41 |             self.stream.disconnect()
42 | 
43 |     def start(self):
44 |         """ Start the stream in a gevent greenlet. """
45 |         gevent.spawn(self.run)
46 | 
47 |     def send(self, status):
48 |         """ Send status to a connected socket. """
49 |         print(self.sockets)
50 |         if len(self.sockets) > 1:
51 |             ws = choice(self.sockets)
52 |         else:
53 |             ws = self.sockets[0]  # assumes at least one socket was added
54 |         try:
55 |             ws.send(status.encode('utf-8'))
56 |         except ValueError as e:
57 |             print(e)
58 |             # the web socket died; stop sending to it
59 |             self.sockets.remove(ws)
60 | 
61 |     def on_data(self, data):
62 |         decoded = json.loads(data)
63 |         gevent.spawn(self.send, decoded.get('text') + '\n')
64 |         return True
65 | 
66 |     def on_error(self, status):
67 |         print("Error: %s" % status)
68 | 
69 |     def on_timeout(self):
70 |         print("tweepy timeout; waiting 30 seconds")
71 |         gevent.sleep(30)
72 | 
73 | 
74 | def app(socket, address):
75 |     stream_listener = SparkStreamListener()
76 |     stream_listener.start()
77 |     stream_listener.add_socket(socket)
78 |     while not socket.closed:
79 |         gevent.sleep(0.1)
80 | 
81 | if __name__ == '__main__':
82 |     server = StreamServer(('0.0.0.0', 9999), app)
83 |     server.serve_forever()
84 | 
--------------------------------------------------------------------------------