├── .gitignore
├── README.md
├── airflow
│   └── dags
│       ├── generate_twitter.py
│       ├── subdags
│       │   └── twitter_subdag.py
│       └── twitter_airflow.py
├── celery_app
│   ├── __init__.py
│   ├── celeryapp.py
│   ├── more_tasks.py
│   ├── pytest_stock_tasks.py
│   ├── tasks.py
│   └── test_stock_tasks.py
├── data
│   ├── example_chatlogs.json
│   ├── mvt.csv
│   ├── mvt_cleaned.csv
│   └── tweets
│       └── latest_links.txt
├── deploy
│   ├── celery_service
│   ├── celerybeat_service
│   ├── example_variables.yml
│   ├── flower_service
│   ├── jupyter_service
│   ├── jupyterhub_service
│   ├── luigi_service
│   ├── pipelines_playbook.yml
│   ├── pipelines_variables.yml
│   └── templates
│       ├── jupyterhub_config.py
│       └── sshd_config
├── example_prod.cfg
├── luigi
│   ├── luigi.cfg
│   ├── taxi_data_import.py
│   └── wordcount_map_reduce.py
├── notebooks
│   ├── Chapter 3 - Basic Celery Tasks.ipynb
│   ├── Chapter 3 - Complex Task Chains.ipynb
│   ├── Chapter 3 - First Steps with Celery.ipynb
│   ├── Chapter 3 - Monitoring Tasks.ipynb
│   ├── Chapter 4 - Dask Distributed.ipynb
│   ├── Chapter 4 - First Steps with Dask.ipynb
│   ├── Chapter 4 - Learning Dask Bags.ipynb
│   ├── Chapter 6 - Introduction to PySpark.ipynb
│   ├── Chapter 6 - Introduction to Spark Streaming.ipynb
│   ├── Chapter 7 - Testing with Hypothesis.ipynb
│   └── Extras (Chapter 4) - Clean Vehicle Theft Data.ipynb
├── requirements.txt
└── streaming
    └── tweepy_stream.py

/.gitignore:
--------------------------------------------------------------------------------
1 | data/*.json.gz
2 | *.png
3 | config/*
4 | */config/*
5 | *.db
6 | *.log
7 | *~
8 | *.pyc
9 | venv/*
10 | *.*/
11 | *.pid
12 | *.db
13 | real_variables.yml
14 | data/tweets/*.csv
15 |
16 | # airflow configs / logs
17 | airflow.cfg
18 | unittests.cfg
19 | */logs/*
20 |
21 | django/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Data Pipelines with Python (video edition)
2 |
3 | Welcome to the code repository for [Data Pipelines with Python](http://shop.oreilly.com/product/0636920055334.do)! If you have any questions, reach out to @kjam on Twitter or GitHub.
4 |
5 | ### Code Structure
6 |
7 | Most of the code covered in the videos is here, but not all of it. I highly recommend you take the time to type out all the code along with the videos and simply use these scripts to "double check" or remind yourself of the work you've already completed.
8 |
9 | ### Installation
10 |
11 | Install the dependencies with the requirements.txt file:
12 |
13 | ```pip install -r requirements.txt```
14 |
15 |
16 | ### Yahoo Finance API
17 |
18 | There is a [good writeup in German for the Finance API](http://brusdeylins.info/tips_and_tricks/yahoo-finance-api/) which I used as a starting point for downloading more recent data.
19 |
20 | ### Python2 v. Python3
21 |
22 | This repository aims to be compatible with both versions. Please let me know if you run into any bugs!
23 |
24 |
25 | ### Ansible Playbook
26 |
27 | I've included a working playbook in the deploy folder; it is intended as a template rather than something to run directly. If you try to run it unmodified, you will likely receive some errors. Please read through the playbook, take a look at the directives, and determine which ones you need and which you don't. It also requires a .ssh/authorized_keys file as well as a config file located at `celery_app/config/prod.cfg`. If you run into other errors, I highly recommend reading through [the Ansible
28 | documentation](http://docs.ansible.com/ansible/) or searching on StackOverflow.
29 |
30 | ### Corrections?
31 | 32 | If you find any issues in these code examples, feel free to submit an Issue or Pull Request. I appreciate your input! 33 | 34 | ### Questions? 35 | 36 | Reach out to @kjam on Twitter or GitHub. @kjam is also often on freenode. :) 37 | -------------------------------------------------------------------------------- /airflow/dags/generate_twitter.py: -------------------------------------------------------------------------------- 1 | """ Simple example of creating subdags and generating work dynamically""" 2 | from airflow import DAG 3 | from airflow.hooks import SqliteHook 4 | from airflow.operators import BashOperator, EmailOperator, SubDagOperator, \ 5 | PythonOperator, BranchPythonOperator 6 | from twitter_airflow import search_twitter, RAW_TWEET_DIR 7 | from subdags.twitter_subdag import subdag 8 | from datetime import datetime, timedelta 9 | import pandas as pd 10 | import re 11 | import random 12 | 13 | 14 | SEARCH_TERMS = ['#python', '#pydata', '#airflow', 'data wrangling', 15 | 'data pipelines'] 16 | 17 | 18 | default_args = { 19 | 'owner': 'admin', 20 | 'depends_on_past': False, 21 | 'start_date': datetime.now() - timedelta(days=4), 22 | 'retries': 1, 23 | 'retry_delay': timedelta(minutes=5), 24 | } 25 | 26 | dag = DAG('generate_twitter_dags', default_args=default_args, 27 | schedule_interval='@daily') 28 | 29 | 30 | def fill_terms(my_terms=SEARCH_TERMS, **kwargs): 31 | """ Fill sqlite database with a few search terms. """ 32 | sqlite = SqliteHook('twitter_sqlite') 33 | conn = sqlite.get_conn() 34 | df = pd.DataFrame(my_terms, columns=['search_term']) 35 | try: 36 | df.to_sql('twitter_terms', conn) 37 | except ValueError: 38 | # table already exists 39 | pass 40 | 41 | 42 | def generate_search_terms(**kwargs): 43 | """ Generate subdag to search twitter for terms. 
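Returns the task_id of one randomly chosen search task; the
BranchPythonOperator uses this return value to decide which
downstream branch to run.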
""" 44 | sqlite = SqliteHook('twitter_sqlite') 45 | conn = sqlite.get_conn() 46 | query = "select * from twitter_terms" 47 | df = pd.read_sql_query(query, conn) 48 | return random.choice([ 49 | 'search_{}_twitter'.format(re.sub(r'\W+', '', t)) 50 | for t in df.search_term.values]) 51 | 52 | 53 | fill_search_terms = PythonOperator(task_id='fill_terms', 54 | provide_context=True, 55 | python_callable=fill_terms, 56 | dag=dag) 57 | 58 | 59 | gen_search_terms = BranchPythonOperator(task_id='generate_search_terms', 60 | provide_context=True, 61 | python_callable=generate_search_terms, 62 | dag=dag) 63 | 64 | 65 | email_links = EmailOperator(task_id='email_best_links', 66 | to='MYEMAIL@MYSITE.com', 67 | subject='Latest popular links', 68 | html_content='Check out the latest!!', 69 | files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)], 70 | dag=dag) 71 | 72 | 73 | sub = SubDagOperator(subdag=subdag, 74 | task_id='insert_and_id_pop', 75 | trigger_rule='one_success', 76 | dag=dag) 77 | 78 | 79 | clear_latest = BashOperator(bash_command='rm -rf {}/latest_links.txt'.format( 80 | RAW_TWEET_DIR), task_id='clear_latest', dag=dag) 81 | 82 | 83 | gen_search_terms.set_upstream(fill_search_terms) 84 | 85 | for term in SEARCH_TERMS: 86 | term_without_punctuation = re.sub(r'\W+', '', term) 87 | simple_search = PythonOperator( 88 | task_id='search_{}_twitter'.format(term_without_punctuation), 89 | provide_context=True, 90 | python_callable=search_twitter, 91 | dag=dag, 92 | params={'query': term}) 93 | simple_search.set_upstream(gen_search_terms) 94 | simple_search.set_downstream(sub) 95 | 96 | sub.set_downstream(email_links) 97 | email_links.set_downstream(clear_latest) 98 | -------------------------------------------------------------------------------- /airflow/dags/subdags/twitter_subdag.py: -------------------------------------------------------------------------------- 1 | """ Simple subdag example """ 2 | from airflow import DAG 3 | from airflow.operators import PythonOperator 4 | from twitter_airflow import csv_to_sqlite, identify_popular_links 5 | from datetime import datetime, timedelta 6 | 7 | 8 | default_args = { 9 | 'owner': 'admin', 10 | 'depends_on_past': False, 11 | 'start_date': datetime(2016, 1, 1), 12 | 'retries': 1, 13 | 'retry_delay': timedelta(minutes=5), 14 | } 15 | 16 | subdag = DAG('generate_twitter_dags.insert_and_id_pop', 17 | default_args=default_args) 18 | 19 | move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite', 20 | provide_context=True, 21 | python_callable=csv_to_sqlite, 22 | dag=subdag) 23 | 24 | id_popular = PythonOperator(task_id='identify_popular_links', 25 | provide_context=True, 26 | python_callable=identify_popular_links, 27 | dag=subdag, 28 | params={'write_mode': 'a'}) 29 | 30 | id_popular.set_upstream(move_tweets_to_sqlite) 31 | -------------------------------------------------------------------------------- /airflow/dags/twitter_airflow.py: -------------------------------------------------------------------------------- 1 | """ Simple Airflow data pipeline example using Twitter API """ 2 | from airflow import DAG 3 | from airflow.operators import EmailOperator, PythonOperator 4 | from airflow.hooks import SqliteHook 5 | from tweepy import API, Cursor, OAuthHandler 6 | from configparser import ConfigParser 7 | from csv import DictWriter, writer 8 | from collections import Counter 9 | from datetime import datetime, timedelta 10 | import ast 11 | import itertools 12 | import glob 13 | import shutil 14 | import pandas as pd 15 | import os.path 16 | 17 | 
RAW_TWEET_DIR = os.path.abspath(os.path.join(__file__, '../../../data/tweets/')) 18 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, 19 | '../../../config/prod.cfg')) 20 | MAX_TWEEPY_PAGE = 300 21 | 22 | default_args = { 23 | 'owner': 'admin', 24 | 'depends_on_past': False, 25 | 'start_date': datetime.now() - timedelta(days=4), 26 | 'retries': 1, 27 | 'retry_delay': timedelta(minutes=5), 28 | } 29 | 30 | dag = DAG('twitter_links', default_args=default_args, 31 | schedule_interval='@daily') 32 | 33 | 34 | def extract_tweet_data(tweepy_obj, query): 35 | """ Extract relevant and serializable data from a tweepy Tweet object 36 | params: 37 | tweepy_obj: Tweepy Tweet Object 38 | query: str 39 | returns dict 40 | """ 41 | return { 42 | 'user_id': tweepy_obj.user.id, 43 | 'user_name': tweepy_obj.user.name, 44 | 'user_screenname': tweepy_obj.user.screen_name, 45 | 'user_url': tweepy_obj.user.url, 46 | 'user_description': tweepy_obj.user.description, 47 | 'user_followers': tweepy_obj.user.followers_count, 48 | 'user_friends': tweepy_obj.user.friends_count, 49 | 'created': tweepy_obj.created_at.isoformat(), 50 | 'text': tweepy_obj.text, 51 | 'hashtags': [ht.get('text') for ht in 52 | tweepy_obj.entities.get('hashtags')], 53 | 'mentions': [(um.get('id'), um.get('screen_name')) for um in 54 | tweepy_obj.entities.get('user_mentions')], 55 | 'urls': [url.get('expanded_url') for url in 56 | tweepy_obj.entities.get('urls')], 57 | 'tweet_id': tweepy_obj.id, 58 | 'is_quote_status': tweepy_obj.is_quote_status, 59 | 'favorite_count': tweepy_obj.favorite_count, 60 | 'retweet_count': tweepy_obj.retweet_count, 61 | 'reply_status_id': tweepy_obj.in_reply_to_status_id, 62 | 'lang': tweepy_obj.lang, 63 | 'source': tweepy_obj.source, 64 | 'location': tweepy_obj.coordinates, 65 | 'query': query, 66 | } 67 | 68 | 69 | def search_twitter(**kwargs): 70 | """ simple search for a query in public tweets""" 71 | query = kwargs.get('params').get('query') 72 | config = ConfigParser() 73 | config.read(CONFIG_FILE) 74 | auth = OAuthHandler(config.get('twitter', 'consumer_key'), 75 | config.get('twitter', 'consumer_secret')) 76 | auth.set_access_token(config.get('twitter', 'access_token'), 77 | config.get('twitter', 'access_token_secret')) 78 | api = API(auth) 79 | 80 | all_tweets = [] 81 | page_num = 0 82 | since_date = datetime.strptime( 83 | kwargs.get('ds'), '%Y-%m-%d').date() - timedelta(days=1) 84 | query += ' since:{} until:{}'.format(since_date.strftime('%Y-%m-%d'), 85 | kwargs.get('ds')) 86 | print('searching twitter with: %s' % query) 87 | for page in Cursor(api.search, q=query, monitor_rate_limit=True, 88 | wait_on_rate_limit=True).pages(): 89 | all_tweets.extend([extract_tweet_data(t, query) for t in page]) 90 | page_num += 1 91 | if page_num > MAX_TWEEPY_PAGE: 92 | break 93 | 94 | # if it's an empty list, stop here 95 | if not len(all_tweets): 96 | return 97 | 98 | filename = '{}/{}_{}.csv'.format( 99 | RAW_TWEET_DIR, query, datetime.now().strftime('%m%d%Y%H%M%S')) 100 | 101 | with open(filename, 'w') as raw_file: 102 | raw_wrtr = DictWriter(raw_file, fieldnames=all_tweets[0].keys()) 103 | raw_wrtr.writeheader() 104 | raw_wrtr.writerows(all_tweets) 105 | 106 | 107 | def csv_to_sqlite(directory=RAW_TWEET_DIR, **kwargs): 108 | """ Very basic csv to sqlite pipeline using pandas 109 | params: 110 | directory: str (file path to csv files) 111 | """ 112 | sqlite = SqliteHook('twitter_sqlite') 113 | conn = sqlite.get_conn() 114 | for fname in glob.glob('{}/*.csv'.format(directory)): 115 | if '_read' not in fname: 116 
| try:
117 | df = pd.read_csv(fname)
118 | df.to_sql('tweets', conn, if_exists='append', index=False)
119 | shutil.move(fname, fname.replace('.csv', '_read.csv'))
120 | except pd.io.common.EmptyDataError:
121 | # probably an io error with another task / open file
122 | continue
123 |
124 |
125 | def identify_popular_links(directory=RAW_TWEET_DIR, write_mode='w', **kwargs):
126 | """ Identify the most popular links from the last day of tweets in the db
127 | Writes them to latest_links.txt in the RAW_TWEET_DIR
128 | (or directory kwarg)
129 | """
130 | sqlite = SqliteHook('twitter_sqlite')
131 | conn = sqlite.get_conn()
132 | query = """select * from tweets where
133 | created > date('now', '-1 days') and urls is not null
134 | order by favorite_count"""
135 | df = pd.read_sql_query(query, conn)
136 | df.urls = df.urls.map(ast.literal_eval)
137 | cntr = Counter(itertools.chain.from_iterable(df.urls.values))
138 | with open('{}/latest_links.txt'.format(directory), write_mode) as latest:
139 | wrtr = writer(latest)
140 | wrtr.writerow(['url', 'count'])
141 | wrtr.writerows(cntr.most_common(5))
142 |
143 |
144 | simple_search = PythonOperator(task_id='search_twitter',
145 | provide_context=True,
146 | python_callable=search_twitter,
147 | dag=dag,
148 | params={'query': '#python'})
149 |
150 |
151 | move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite',
152 | provide_context=True,
153 | python_callable=csv_to_sqlite,
154 | dag=dag)
155 |
156 |
157 | id_popular = PythonOperator(task_id='identify_popular_links',
158 | provide_context=True,
159 | python_callable=identify_popular_links,
160 | dag=dag)
161 |
162 |
163 | email_links = EmailOperator(task_id='email_best_links',
164 | to='katharine@kjamistan.com',
165 | subject='Latest popular links',
166 | html_content='Check out the latest!!',
167 | files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
168 | dag=dag)
169 |
170 |
171 | simple_search.set_downstream(move_tweets_to_sqlite)
172 | id_popular.set_upstream(move_tweets_to_sqlite)
173 | email_links.set_upstream(id_popular)
174 |
--------------------------------------------------------------------------------
/celery_app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/data-pipelines-course/2c8c8420d220df9168561b9157fb05cc28fb9bc0/celery_app/__init__.py
--------------------------------------------------------------------------------
/celery_app/celeryapp.py:
--------------------------------------------------------------------------------
1 | ''' Celery settings and app '''
2 | from celery import Celery
3 | from kombu import Queue
4 | from configparser import ConfigParser
5 | from datetime import datetime, timedelta
6 | import os
7 |
8 |
9 |
10 | config = ConfigParser()
11 | current_dir = os.path.dirname(os.path.realpath(__file__))
12 |
13 | if os.environ.get('DEPLOY') == 'PROD':
14 | config.read(os.path.join(current_dir, 'config/prod.cfg'))
15 | else:
16 | config.read(os.path.join(current_dir, 'config/dev.cfg'))
17 |
18 | app = Celery('tasks', broker=config.get('celery', 'broker_url'))
19 |
20 | CELERY_CONFIG = {
21 | 'CELERY_IMPORTS': ['tasks'],
22 | 'CELERY_TIMEZONE': 'Europe/Berlin',
23 | 'CELERY_IGNORE_RESULT': False,
24 | 'CELERY_TRACK_STARTED': True,
25 | 'CELERY_DEFAULT_QUEUE': 'default',
26 | 'CELERY_QUEUES': (Queue('default'), Queue('priority'),),
27 | 'CELERY_DEFAULT_RATE_LIMIT': '20/s',
28 | 'CELERY_RESULT_BACKEND': 'amqp://',
29 | 'CELERY_CHORD_PROPAGATES': True,
30 |
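# hard per-task time limit in seconds; a worker child running a task
# longer than this is killed and replaced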
'CELERYD_TASK_TIME_LIMIT': 7200,
31 | 'CELERYD_POOL_RESTARTS': True,
32 | 'CELERYD_TASK_LOG_FORMAT':
33 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
34 | 'CELERY_ANNOTATIONS': {
35 | 'celery.chord_unlock': {'hard_time_limit': 360},
36 | },
37 | 'CELERYBEAT_SCHEDULE': {
38 | 'get_stock_info_60s': {
39 | 'task': 'tasks.get_stock_info',
40 | 'schedule': timedelta(seconds=60),
41 | 'args': ('FB', datetime(2016, 1, 1), datetime.today())
42 | }
43 | }
44 | }
45 |
46 |
47 | app.conf.update(**CELERY_CONFIG)
48 |
--------------------------------------------------------------------------------
/celery_app/more_tasks.py:
--------------------------------------------------------------------------------
1 | ''' Here are a few options for chp3 homework '''
2 | from pandas_datareader import data
3 | from celeryapp import app
4 | import pandas as pd
5 | from datetime import datetime, timedelta
6 |
7 |
8 | @app.task
9 | def current_earnings(stock):
10 | ''' return json response of current year EPS from yahoo finance
11 | params:
12 | stock str
13 | returns:
14 | json
15 | '''
16 | url = 'http://finance.yahoo.com/d/quotes.csv?s={}&f=se7'.format(stock)
17 | cy = pd.read_csv(url, names=['Stock', 'Current Year EPS'])
18 | return cy.to_json()
19 |
20 |
21 | @app.task
22 | def yoy_change(stock, source='yahoo'):
23 | ''' return year over year change for a given stock from today.
24 | params:
25 | stock str
26 | kwargs:
27 | source str
28 | returns float
29 | '''
30 | start = datetime.today() - timedelta(days=365) # not accounting for leap yr
31 | df = data.DataReader(stock, source, start, datetime.today())
32 | return ((df.ix[-1]['Adj Close'] / df.ix[0]['Adj Close']) - 1) * 100
--------------------------------------------------------------------------------
/celery_app/pytest_stock_tasks.py:
--------------------------------------------------------------------------------
1 | import json
2 | from tasks import get_stock_info
3 | from datetime import datetime
4 |
5 |
6 | def test_get_stock_info():
7 | start_date = datetime(2013, 1, 1)
8 | end_date = datetime(2013, 2, 1)
9 | stock = 'FB'
10 |
11 | result = get_stock_info(stock, start_date, end_date)
12 | assert isinstance(result, str)
13 | result = json.loads(result)
14 | assert 'High min' in result.keys()
15 | result_stock = ' '.join(result['High min'].keys())
16 | assert result_stock == stock
17 | price = result['High min'][stock]
18 | assert isinstance(price, float)
19 | assert price > 0
--------------------------------------------------------------------------------
/celery_app/tasks.py:
--------------------------------------------------------------------------------
1 | ''' Task module for showing celery functionality. '''
2 | from pandas_datareader import data
3 | from celeryapp import app
4 | from urllib.error import HTTPError, URLError
5 | import pandas as pd
6 | import logging
7 |
8 |
9 | @app.task
10 | def get_stock_info(stock, start, end, source='yahoo'):
11 | ''' Collect aggregate info for a stock given a daterange.
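Aggregates min/max/mean/median of each price column and returns the
result as a JSON string, so it serializes cleanly as a task result.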
12 | params:
13 | stock: str
14 | start: datetime
15 | end: datetime
16 | kwargs:
17 | source (optional): str
18 | returns:
19 | json
20 | '''
21 | logging.debug('start and end types are: %s %s', type(start), type(end))
22 | df = data.DataReader(stock, source, start, end)
23 | df['Stock'] = stock
24 | agg = df.groupby('Stock').agg({
25 | 'Open': ['min', 'max', 'mean', 'median'],
26 | 'Adj Close': ['min', 'max', 'mean', 'median'],
27 | 'Close': ['min', 'max', 'mean', 'median'],
28 | 'High': ['min', 'max', 'mean', 'median'],
29 | 'Low': ['min', 'max', 'mean', 'median'],
30 | })
31 | agg.columns = [' '.join(col).strip() for col in agg.columns.values]
32 | return agg.to_json()
33 |
34 |
35 | def calc_ratio(price, compare):
36 | ''' Calculates ratio and converts it into percentage
37 | when given stock price and comparison price
38 | params:
39 | price: float
40 | compare: float
41 | returns float
42 | '''
43 | return round(((price / compare) - 1) * 100, 2)
44 |
45 |
46 | @app.task(bind=True)
47 | def price_range(self, stock, start, end, source='yahoo'):
48 | ''' Compare today's price to see if it is nearer the max or min of closing
49 | prices in a certain daterange.
50 | params:
51 | stock: str
52 | start: datetime
53 | end: datetime
54 | kwargs:
55 | source (optional): str
56 | returns:
57 | dictionary
58 | '''
59 | df = data.DataReader(stock, source, start, end)
60 | period_high = df['Adj Close'].max()
61 | period_mean = df['Adj Close'].mean()
62 | period_low = df['Adj Close'].min()
63 | resp = {
64 | 'stock': stock,
65 | 'period_high': period_high,
66 | 'period_low': period_low,
67 | 'period_mean': period_mean,
68 | 'period_start': start,
69 | 'period_end': end,
70 | }
71 | url = 'http://finance.yahoo.com/d/quotes.csv?s={}&f=sat1'.format(stock)
72 | try:
73 | td = pd.read_csv(url, names=['Stock', 'Price', 'Last Trade'])
74 | except (HTTPError, URLError) as exc:
75 | logging.exception('pandas read_csv error for yahoo finance URL: %s',
76 | url)
77 | raise self.retry(exc=exc)
78 | td_price = td['Price'].mean()
79 | resp['todays_price'] = td_price
80 | if abs(td_price - period_high) < abs(td_price - period_low):
81 | resp['result'] = 'higher'
82 | else:
83 | resp['result'] = 'lower'
84 | resp['percent_change'] = calc_ratio(td_price, period_mean)
85 | return resp
86 |
87 |
88 | @app.task
89 | def determine_buy(result):
90 | ''' Extremely naive buy logic (for example's sake)
91 | params:
92 | result: dict result from the price_range task
93 | return:
94 | boolean
95 | '''
96 | if result['result'] == 'lower':
97 | return True
98 | return False
99 |
100 |
101 | @app.task
102 | def sort_results(results, key='todays_price'):
103 | ''' Sort by given key, defaults to todays_price
104 | params:
105 | results: list of results from price_range task
106 | kwargs:
107 | key: str (must be in price_range return dictionary)
108 | return sorted list
109 | '''
110 | return sorted(results, key=lambda x: x[key])
111 |
--------------------------------------------------------------------------------
/celery_app/test_stock_tasks.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import json
3 | from tasks import get_stock_info
4 | from datetime import datetime
5 |
6 |
7 | class TestStockInfo(unittest.TestCase):
8 | def setUp(self):
9 | self.start_date = datetime(2013, 1, 1)
10 | self.end_date = datetime(2013, 2, 1)
11 | self.stock = 'FB'
12 |
13 | def test_get_stock_info(self):
14 | result = get_stock_info(self.stock, self.start_date, self.end_date)
15 |
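# Calling the task function directly (rather than with .delay()) runs it
# synchronously in-process, so no broker or worker is needed for this test.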
self.assertIsInstance(result, str)
16 | result = json.loads(result)
17 | self.assertIn('High min', result.keys())
18 | stock = ' '.join(result['High min'].keys())
19 | self.assertEqual(stock, self.stock)
20 | price = result['High min'][self.stock]
21 | self.assertIsInstance(price, float)
22 | self.assertTrue(price > 0)
23 |
--------------------------------------------------------------------------------
/data/tweets/latest_links.txt:
--------------------------------------------------------------------------------
1 | url,count
2 |
--------------------------------------------------------------------------------
/deploy/celery_service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=celery
3 | After=syslog.target network.target
4 |
5 | [Service]
6 | Environment=DEPLOY=PROD
7 | ExecStart=/home/deploy/venv/bin/celery multi start 4 -A tasks --loglevel=debug --logfile=/var/log/celery/%N.log
8 | ExecStop=/home/deploy/venv/bin/celery multi stopwait 4 -A tasks
9 | ExecReload=/home/deploy/venv/bin/celery multi restart 4 -A tasks --loglevel=debug --logfile=/var/log/celery/%N.log
10 | # Requires systemd version 211 or newer
11 | WorkingDirectory=/var/www/pipelines/celery_app
12 | Type=forking
13 | StandardError=syslog
14 | User=deploy
15 | Group=deploy
16 | TimeoutSec=3600
17 |
18 | [Install]
19 | WantedBy=multi-user.target
20 |
--------------------------------------------------------------------------------
/deploy/celerybeat_service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=celerybeat
3 | After=syslog.target
4 |
5 | [Service]
6 | ExecStart=/home/deploy/venv/bin/celery -A tasks beat --loglevel=debug --logfile=/var/log/celery/%n.log
7 | # Requires systemd version 211 or newer
8 | WorkingDirectory=/var/www/pipelines/celery_app
9 | Restart=always
10 | KillSignal=SIGTERM
11 | Type=simple
12 | StandardError=syslog
13 | NotifyAccess=all
14 | User=deploy
15 | Group=deploy
16 |
17 | [Install]
18 | WantedBy=multi-user.target
19 |
--------------------------------------------------------------------------------
/deploy/example_variables.yml:
--------------------------------------------------------------------------------
1 | deploy_url: pipelines.foo.com
2 | deploy_pass: $6$rw0zQQOmZqt1KsDFksakjio291fSzScf3qGxedkxt249FfFskwonDDlsso$32onLzXth3ZHK0
3 | deploy_email: youremail@you.com
4 | rabbitmq_pass: hereisapasswordasanexample
--------------------------------------------------------------------------------
/deploy/flower_service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=celery flower
3 | After=syslog.target
4 |
5 | [Service]
6 | Environment=DEPLOY=PROD
7 | ExecStart=/home/deploy/venv/bin/celery flower -A tasks --port=5566 --basic_auth=admin:getouttahere
8 | # Requires systemd version 211 or newer
9 | WorkingDirectory=/var/www/pipelines/celery_app
10 | Restart=always
11 | KillSignal=SIGTERM
12 | Type=simple
13 | StandardError=syslog
14 | NotifyAccess=all
15 | User=deploy
16 | Group=deploy
17 |
18 | [Install]
19 | WantedBy=multi-user.target
20 |
--------------------------------------------------------------------------------
/deploy/jupyter_service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=jupyter
3 |
4 | [Service]
5 | PIDFile=/var/run/jupyter.pid
6 | ExecStart=/home/deploy/venv/bin/jupyter notebook --no-browser
7 |
KillSignal=SIGTERM 8 | WorkingDirectory=/var/www/pipelines/ 9 | 10 | [Install] 11 | WantedBy=multi-user.target 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /deploy/jupyterhub_service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Jupyterhub 3 | After=syslog.target network.target 4 | 5 | 6 | [Service] 7 | User=root 8 | Environment=PYTHONPATH=/home/deploy/venv/bin/python 9 | Environment=VIRTUAL_ENV=/home/deploy/venv 10 | Environment=DEPLOY=PROD 11 | ExecStart=/home/deploy/venv/bin/jupyterhub -f /var/www/pipelines/notebooks/jupyterhub_config.py 12 | WorkingDirectory=/var/www/pipelines/notebooks 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | -------------------------------------------------------------------------------- /deploy/luigi_service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Luigi 3 | After=syslog.target network.target 4 | 5 | [Service] 6 | User=root 7 | Environment=PYTHONPATH=/home/deploy/venv/bin/python 8 | Environment=VIRTUAL_ENV=/home/deploy/venv 9 | Environment=DEPLOY=PROD 10 | ExecStart=/home/deploy/venv/bin/luigid --logdir /var/log/luigi --pidfile /var/run/luigi.pid 11 | WorkingDirectory=/var/www/pipelines 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /deploy/pipelines_playbook.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: pipelines 3 | become: yes 4 | 5 | tasks: 6 | - include_vars: pipelines_variables.yml 7 | 8 | - name: get jessie backports 9 | apt_repository: repo='deb http://ftp.debian.org/debian jessie-backports main' state=present 10 | 11 | - name: update 12 | apt: update_cache=yes 13 | 14 | - name: install fail2ban 15 | apt: pkg=fail2ban state=installed 16 | 17 | - name: add deploy user 18 | user: name=deploy shell=/bin/bash password={{ deploy_pass }} 19 | 20 | - name: add ssh dir 21 | file: path=/home/deploy/.ssh state=directory owner=deploy group=deploy mode=0700 22 | 23 | - name: move key to deploy user folder 24 | become_user: deploy 25 | copy: src=~/.ssh/authorized_keys dest=/home/deploy/.ssh/authorized_keys 26 | 27 | - name: copy ssh deploy key files 28 | become_user: deploy 29 | copy: src=~/.ssh/deploy dest=/home/deploy/.ssh/ 30 | 31 | - name: Allow deploy to have sudo 32 | lineinfile: dest=/etc/sudoers state=present line='deploy ALL=(ALL:ALL) ALL' 33 | 34 | - name: change perms for ssh 35 | file: path=/home/deploy/.ssh/deploy owner=deploy group=deploy mode=0400 36 | 37 | - name: change sshd 38 | copy: src=templates/sshd_config dest=/etc/ssh/sshd_config 39 | notify: 40 | - restart ssh 41 | 42 | - name: install pip3 43 | apt: name=python3-pip state=installed 44 | 45 | - name: virtualenv 46 | shell: pip3 install virtualenv 47 | 48 | - name: install hdf5 49 | apt: pkg=libhdf5-dev state=installed install_recommends=yes 50 | 51 | - name: install hdf5 tools 52 | apt: pkg=hdf5-tools state=installed install_recommends=yes 53 | 54 | - name: install sqlite 55 | apt: pkg=sqlite3 state=installed 56 | 57 | - name: npm 58 | apt: name=npm state=installed 59 | 60 | - name: nodejs 61 | apt: name=nodejs-legacy state=installed 62 | 63 | - name: install npm http proxy 64 | npm: name=configurable-http-proxy global=yes state=present 65 | 66 | - name: install python crypto 67 | apt: name=python-cryptography state=installed install_recommends=yes 
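# NOTE: certbot's standalone authenticator below starts its own temporary
# server to answer the challenge, so the port it binds must be free when
# these tasks run.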
68 | 69 | - name: install certbot 70 | apt: name=certbot state=installed install_recommends=yes default_release=jessie-backports 71 | 72 | - name: install letsencrypt 73 | apt: name=letsencrypt state=installed install_recommends=yes 74 | 75 | - name: run certbot 76 | shell: certbot certonly --standalone -d {{ deploy_url }} --standalone-supported-challenges tls-sni-01 -n -m {{ deploy_email }} --keep-until-expiring --agree-tos 77 | 78 | - name: rabbitmq 79 | apt: name=rabbitmq-server state=installed 80 | 81 | - name: redis 82 | apt: name=redis-server state=installed 83 | 84 | - name: graphviz 85 | apt: name=graphviz state=installed 86 | 87 | - name: git 88 | apt: name=git state=installed 89 | 90 | - name: change app perms to fetch 91 | file: path=/var/www/pipelines mode=0777 recurse=yes state=directory 92 | ignore_errors: yes 93 | 94 | - name: fetch application 95 | become_user: deploy 96 | git: repo=git@github.com:kjam/data-pipelines-course.git dest=/var/www/pipelines key_file=~/.ssh/deploy accept_hostkey=yes force=yes 97 | 98 | - name: install requirements 99 | pip: requirements=/var/www/pipelines/requirements.txt virtualenv=/home/deploy/venv 100 | environment: 101 | HDF5_DIR: /usr/lib/x86_64-linux-gnu/hdf5/serial/ 102 | 103 | - name: make notebooks directory 104 | file: path=/var/www/pipelines/notebooks state=directory owner=deploy mode=0755 recurse=yes 105 | 106 | - name: make notebooks cookiefile 107 | file: path=/var/www/pipelines/notebooks/jupyterhub_cookie_secret mode=0600 state=touch 108 | 109 | - name: make celery config directory 110 | file: path=/var/www/pipelines/celery_app/config state=directory owner=deploy mode=0755 recurse=yes 111 | 112 | - name: copy config to celery dir 113 | copy: src=../celery_app/config/prod.cfg dest=/var/www/pipelines/celery_app/config 114 | 115 | - name: copy config to nb dir 116 | copy: src=templates/jupyterhub_config.py dest=/var/www/pipelines/notebooks 117 | 118 | - name: make log dir 119 | file: path=/var/log/celery state=directory owner=deploy mode=0755 recurse=yes 120 | 121 | - name: rabbitmq add vhost 122 | rabbitmq_vhost: name=celery_vhost state=present 123 | 124 | - name: add rabbitmq user 125 | rabbitmq_user: user=celery_user password={{ rabbitmq_pass }} vhost=celery_vhost configure_priv=.* read_priv=.* write_priv=.* tags=administrator state=present 126 | 127 | - name: enabling rabbitmq management 128 | rabbitmq_plugin: names=rabbitmq_management state=enabled 129 | notify: 130 | - restart rabbitmq 131 | 132 | - name: change sysd perms 133 | file: path=/var/www/pipelines/deploy mode=0644 recurse=yes 134 | 135 | - name: link flower systemd file 136 | file: src=/var/www/pipelines/deploy/flower_service dest=/etc/systemd/system/flower.service state=link 137 | 138 | - name: link celery systemd file 139 | file: src=/var/www/pipelines/deploy/celery_service dest=/etc/systemd/system/celery.service state=link 140 | 141 | - name: link celerybeat systemd file 142 | file: src=/var/www/pipelines/deploy/celerybeat_service dest=/etc/systemd/system/celerybeat.service state=link 143 | 144 | - name: link jupyterhub systemd file 145 | file: src=/var/www/pipelines/deploy/jupyterhub_service dest=/etc/systemd/system/jupyterhub.service state=link 146 | 147 | #- name: link jupyter systemd file 148 | # file: src=/var/www/pipelines/deploy/jupyter_service dest=/etc/systemd/system/jupyterhub.service state=link 149 | 150 | - name: reload systemd 151 | shell: systemctl daemon-reload && systemctl restart jupyterhub 152 | 153 | - name: start flower & celery & celerybeat 154 
| shell: systemctl restart celery && systemctl restart flower && systemctl restart celerybeat 155 | 156 | handlers: 157 | - name: restart rabbitmq 158 | service: name=rabbitmq-server state=restarted 159 | 160 | - name: restart ssh 161 | service: name=ssh state=restarted 162 | -------------------------------------------------------------------------------- /deploy/pipelines_variables.yml: -------------------------------------------------------------------------------- 1 | deploy_url: YOUR_DOMAIN_HERE 2 | deploy_pass: PUT_SHA1_PW_HERE 3 | deploy_email: PUT_EMAIL_HERE 4 | rabbitmq_pass: PUT_PLAINTEXT_PASS_HERE 5 | -------------------------------------------------------------------------------- /deploy/templates/jupyterhub_config.py: -------------------------------------------------------------------------------- 1 | # Configuration file for jupyterhub. 2 | 3 | #------------------------------------------------------------------------------ 4 | # Configurable configuration 5 | #------------------------------------------------------------------------------ 6 | 7 | #------------------------------------------------------------------------------ 8 | # LoggingConfigurable configuration 9 | #------------------------------------------------------------------------------ 10 | 11 | # A parent class for Configurables that log. 12 | # 13 | # Subclasses have a log trait, and the default behavior is to get the logger 14 | # from the currently running Application. 15 | 16 | #------------------------------------------------------------------------------ 17 | # SingletonConfigurable configuration 18 | #------------------------------------------------------------------------------ 19 | 20 | # A configurable that only allows one instance. 21 | # 22 | # This class is for classes that should only have one instance of itself or 23 | # *any* subclass. To create and retrieve such a class use the 24 | # :meth:`SingletonConfigurable.instance` method. 25 | 26 | #------------------------------------------------------------------------------ 27 | # Application configuration 28 | #------------------------------------------------------------------------------ 29 | 30 | # This is an application. 31 | 32 | # The date format used by logging formatters for %(asctime)s 33 | c.Application.log_datefmt = '%Y-%m-%d %H:%M:%S' 34 | 35 | # The Logging format template 36 | c.Application.log_format = '[%(name)s]%(highlevel)s %(message)s' 37 | 38 | # Set the log level by value or name. 39 | c.Application.log_level = 30 40 | 41 | #------------------------------------------------------------------------------ 42 | # JupyterHub configuration 43 | #------------------------------------------------------------------------------ 44 | 45 | # An Application for starting a Multi-User Jupyter Notebook server. 46 | 47 | # Grant admin users permission to access single-user servers. 48 | # 49 | # Users should be properly informed if this is enabled. 50 | # c.JupyterHub.admin_access = False 51 | 52 | # DEPRECATED, use Authenticator.admin_users instead. 53 | # c.JupyterHub.admin_users = set() 54 | 55 | # Answer yes to any questions (e.g. confirm overwrite) 56 | #c.JupyterHub.answer_yes = False 57 | 58 | # Dict of token:username to be loaded into the database. 59 | # 60 | # Allows ahead-of-time generation of API tokens for use by services. 61 | # c.JupyterHub.api_tokens = {} 62 | 63 | # Class for authenticating users. 
64 | # 65 | # This should be a class with the following form: 66 | # 67 | # - constructor takes one kwarg: `config`, the IPython config object. 68 | # 69 | # - is a tornado.gen.coroutine 70 | # - returns username on success, None on failure 71 | # - takes two arguments: (handler, data), 72 | # where `handler` is the calling web.RequestHandler, 73 | # and `data` is the POST form data from the login page. 74 | c.JupyterHub.authenticator_class = 'jupyterhub.auth.PAMAuthenticator' 75 | 76 | # The base URL of the entire application 77 | c.JupyterHub.base_url = '/' 78 | 79 | # Whether to shutdown the proxy when the Hub shuts down. 80 | # 81 | # Disable if you want to be able to teardown the Hub while leaving the proxy 82 | # running. 83 | # 84 | # Only valid if the proxy was starting by the Hub process. 85 | # 86 | # If both this and cleanup_servers are False, sending SIGINT to the Hub will 87 | # only shutdown the Hub, leaving everything else running. 88 | # 89 | # The Hub should be able to resume from database state. 90 | c.JupyterHub.cleanup_proxy = True 91 | 92 | # Whether to shutdown single-user servers when the Hub shuts down. 93 | # 94 | # Disable if you want to be able to teardown the Hub while leaving the single- 95 | # user servers running. 96 | # 97 | # If both this and cleanup_proxy are False, sending SIGINT to the Hub will only 98 | # shutdown the Hub, leaving everything else running. 99 | # 100 | # The Hub should be able to resume from database state. 101 | c.JupyterHub.cleanup_servers = True 102 | 103 | # The config file to load 104 | #c.JupyterHub.config_file = 'jupyterhub_config.py' 105 | 106 | # Confirm that JupyterHub should be run without SSL. This is **NOT RECOMMENDED** 107 | # unless SSL termination is being handled by another layer. 108 | # c.JupyterHub.confirm_no_ssl = False 109 | 110 | # Number of days for a login cookie to be valid. Default is two weeks. 111 | # c.JupyterHub.cookie_max_age_days = 14 112 | 113 | # The cookie secret to use to encrypt cookies. 114 | # 115 | # Loaded from the JPY_COOKIE_SECRET env variable by default. 116 | # c.JupyterHub.cookie_secret = b'' 117 | 118 | # File in which to store the cookie secret. 119 | # c.JupyterHub.cookie_secret_file = 'jupyterhub_cookie_secret' 120 | 121 | # The location of jupyterhub data files (e.g. /usr/local/share/jupyter/hub) 122 | # c.JupyterHub.data_files_path = '/share/jupyter/hub' 123 | 124 | # Include any kwargs to pass to the database connection. See 125 | # sqlalchemy.create_engine for details. 126 | # c.JupyterHub.db_kwargs = {} 127 | 128 | # url for the database. e.g. `sqlite:///jupyterhub.sqlite` 129 | c.JupyterHub.db_url = 'sqlite:///jupyterhub.sqlite' 130 | 131 | # log all database transactions. This has A LOT of output 132 | # c.JupyterHub.debug_db = False 133 | 134 | # show debug output in configurable-http-proxy 135 | c.JupyterHub.debug_proxy = True 136 | 137 | # Send JupyterHub's logs to this file. 138 | # 139 | # This will *only* include the logs of the Hub itself, not the logs of the proxy 140 | # or any single-user servers. 141 | # c.JupyterHub.extra_log_file = '' 142 | 143 | # Extra log handlers to set on JupyterHub logger 144 | # c.JupyterHub.extra_log_handlers = [] 145 | 146 | # Generate default config file 147 | # c.JupyterHub.generate_config = False 148 | 149 | # The ip for this process 150 | c.JupyterHub.hub_ip = '127.0.0.1' 151 | 152 | # The port for this process 153 | c.JupyterHub.hub_port = 8081 154 | 155 | # The prefix for the hub server. 
Must not be '/' 156 | # c.JupyterHub.hub_prefix = '/hub/' 157 | 158 | # The public facing ip of the whole application (the proxy) 159 | c.JupyterHub.ip = '144.76.180.19' 160 | 161 | # Supply extra arguments that will be passed to Jinja environment. 162 | # c.JupyterHub.jinja_environment_options = {} 163 | 164 | # Interval (in seconds) at which to update last-activity timestamps. 165 | # c.JupyterHub.last_activity_interval = 300 166 | 167 | # Specify path to a logo image to override the Jupyter logo in the banner. 168 | # c.JupyterHub.logo_file = '' 169 | 170 | # File to write PID Useful for daemonizing jupyterhub. 171 | c.JupyterHub.pid_file = '/var/www/pipelines/jupyter.pid' 172 | 173 | # The public facing port of the proxy 174 | c.JupyterHub.port = 443 175 | 176 | # The ip for the proxy API handlers 177 | #c.JupyterHub.proxy_api_ip = '127.0.0.1' 178 | 179 | # The port for the proxy API handlers 180 | #c.JupyterHub.proxy_api_port = 0 181 | 182 | # The Proxy Auth token. 183 | # 184 | # Loaded from the CONFIGPROXY_AUTH_TOKEN env variable by default. 185 | # c.JupyterHub.proxy_auth_token = '' 186 | 187 | # Interval (in seconds) at which to check if the proxy is running. 188 | # c.JupyterHub.proxy_check_interval = 30 189 | 190 | # The command to start the http proxy. 191 | # 192 | # Only override if configurable-http-proxy is not on your PATH 193 | # c.JupyterHub.proxy_cmd = ['configurable-http-proxy'] 194 | 195 | # Purge and reset the database. 196 | c.JupyterHub.reset_db = False 197 | 198 | # The class to use for spawning single-user servers. 199 | # 200 | # Should be a subclass of Spawner. 201 | #c.JupyterHub.spawner_class = 'jupyterhub.spawner.LocalProcessSpawner' 202 | 203 | # Path to SSL certificate file for the public facing interface of the proxy 204 | # 205 | # Use with ssl_key 206 | c.JupyterHub.ssl_cert = '/etc/letsencrypt/live/pipelines.kjamistan.com/fullchain.pem' 207 | 208 | # Path to SSL key file for the public facing interface of the proxy 209 | # 210 | # Use with ssl_cert 211 | c.JupyterHub.ssl_key = '/etc/letsencrypt/live/pipelines.kjamistan.com/privkey.pem' 212 | 213 | # Host to send statds metrics to 214 | # c.JupyterHub.statsd_host = '' 215 | 216 | # Port on which to send statsd metrics about the hub 217 | # c.JupyterHub.statsd_port = 8125 218 | 219 | # Prefix to use for all metrics sent by jupyterhub to statsd 220 | # c.JupyterHub.statsd_prefix = 'jupyterhub' 221 | 222 | # Run single-user servers on subdomains of this host. 223 | # 224 | # This should be the full https://hub.domain.tld[:port] 225 | # 226 | # Provides additional cross-site protections for javascript served by single- 227 | # user servers. 228 | # 229 | # Requires .hub.domain.tld to resolve to the same host as 230 | # hub.domain.tld. 231 | # 232 | # In general, this is most easily achieved with wildcard DNS. 233 | # 234 | # When using SSL (i.e. always) this also requires a wildcard SSL certificate. 235 | # c.JupyterHub.subdomain_host = '' 236 | 237 | # Paths to search for jinja templates. 238 | # c.JupyterHub.template_paths = [] 239 | 240 | # Extra settings overrides to pass to the tornado application. 241 | # c.JupyterHub.tornado_settings = {} 242 | 243 | #------------------------------------------------------------------------------ 244 | # Spawner configuration 245 | #------------------------------------------------------------------------------ 246 | 247 | # Base class for spawning single-user notebook servers. 
248 | # 249 | # Subclass this, and override the following methods: 250 | # 251 | # - load_state - get_state - start - stop - poll 252 | 253 | # Extra arguments to be passed to the single-user server 254 | # c.Spawner.args = [] 255 | 256 | # The command used for starting notebooks. 257 | c.Spawner.cmd = ['/home/deploy/venv/bin/jupyterhub-singleuser'] 258 | 259 | # Enable debug-logging of the single-user server 260 | c.Spawner.debug = True 261 | 262 | # The default URL for the single-user server. 263 | # 264 | # Can be used in conjunction with --notebook-dir=/ to enable full filesystem 265 | # traversal, while preserving user's homedir as landing page for notebook 266 | # 267 | # `%U` will be expanded to the user's username 268 | # c.Spawner.default_url = '' 269 | 270 | # Disable per-user configuration of single-user servers. 271 | # 272 | # This prevents any config in users' $HOME directories from having an effect on 273 | # their server. 274 | # c.Spawner.disable_user_config = False 275 | 276 | # Whitelist of environment variables for the subprocess to inherit 277 | c.Spawner.env_keep = ['PATH', 'PYTHONPATH', 'CONDA_ROOT', 278 | 'CONDA_DEFAULT_ENV', 'DEPLOY', 'VIRTUAL_ENV', 279 | 'LANG', 'LC_ALL', ] 280 | 281 | # Environment variables to load for the Spawner. 282 | # 283 | # Value could be a string or a callable. If it is a callable, it will be called 284 | # with one parameter, which will be the instance of the spawner in use. It 285 | # should quickly (without doing much blocking operations) return a string that 286 | # will be used as the value for the environment variable. 287 | #c.Spawner.environment = {'VIRTUAL_ENV': '/home/deploy/venv', 288 | # 'PYTHONPATH': '/home/deploy/venv/bin/python'} 289 | 290 | # Timeout (in seconds) before giving up on a spawned HTTP server 291 | # 292 | # Once a server has successfully been spawned, this is the amount of time we 293 | # wait before assuming that the server is unable to accept connections. 294 | # c.Spawner.http_timeout = 30 295 | 296 | # The IP address (or hostname) the single-user server should listen on 297 | c.Spawner.ip = '127.0.0.1' 298 | 299 | # The notebook directory for the single-user server 300 | # 301 | # `~` will be expanded to the user's home directory `%U` will be expanded to the 302 | # user's username 303 | #c.Spawner.notebook_dir = '~/notebooks' 304 | 305 | # An HTML form for options a user can specify on launching their server. The 306 | # surrounding `
<form>` element and the submit button are already provided.
307 | #
308 | # For example:
309 | #
310 | # Set your key:
311 | # <input name="keyphrase" val="%(keyphrase)s">
312 | #
313 | # Choose a letter: 314 | # 318 | # c.Spawner.options_form = '' 319 | 320 | # Interval (in seconds) on which to poll the spawner. 321 | # c.Spawner.poll_interval = 30 322 | 323 | # Timeout (in seconds) before giving up on the spawner. 324 | # 325 | # This is the timeout for start to return, not the timeout for the server to 326 | # respond. Callers of spawner.start will assume that startup has failed if it 327 | # takes longer than this. start should return when the server process is started 328 | # and its location is known. 329 | # c.Spawner.start_timeout = 60 330 | 331 | #------------------------------------------------------------------------------ 332 | # LocalProcessSpawner configuration 333 | #------------------------------------------------------------------------------ 334 | 335 | # A Spawner that just uses Popen to start local processes as users. 336 | # 337 | # Requires users to exist on the local system. 338 | # 339 | # This is the default spawner for JupyterHub. 340 | 341 | # Seconds to wait for process to halt after SIGINT before proceeding to SIGTERM 342 | # c.LocalProcessSpawner.INTERRUPT_TIMEOUT = 10 343 | 344 | # Seconds to wait for process to halt after SIGKILL before giving up 345 | # c.LocalProcessSpawner.KILL_TIMEOUT = 5 346 | 347 | # Seconds to wait for process to halt after SIGTERM before proceeding to SIGKILL 348 | # c.LocalProcessSpawner.TERM_TIMEOUT = 5 349 | 350 | #------------------------------------------------------------------------------ 351 | # Authenticator configuration 352 | #------------------------------------------------------------------------------ 353 | 354 | # A class for authentication. 355 | # 356 | # The primary API is one method, `authenticate`, a tornado coroutine for 357 | # authenticating users. 358 | 359 | # set of usernames of admin users 360 | # 361 | # If unspecified, only the user that launches the server will be admin. 362 | c.Authenticator.admin_users = set(['deploy']) 363 | 364 | # Dictionary mapping authenticator usernames to JupyterHub users. 365 | # 366 | # Can be used to map OAuth service names to local users, for instance. 367 | # 368 | # Used in normalize_username. 369 | # c.Authenticator.username_map = {} 370 | 371 | # Regular expression pattern for validating usernames. 372 | # 373 | # If not defined: allow any username. 374 | # c.Authenticator.username_pattern = '' 375 | 376 | # Username whitelist. 377 | # 378 | # Use this to restrict which users can login. If empty, allow any user to 379 | # attempt login. 380 | # c.Authenticator.whitelist = set() 381 | 382 | #------------------------------------------------------------------------------ 383 | # LocalAuthenticator configuration 384 | #------------------------------------------------------------------------------ 385 | 386 | # Base class for Authenticators that work with local Linux/UNIX users 387 | # 388 | # Checks for local users, and can attempt to create them if they exist. 389 | 390 | # The command to use for creating users as a list of strings. 391 | # 392 | # For each element in the list, the string USERNAME will be replaced with the 393 | # user's username. The username will also be appended as the final argument. 
394 | # 395 | # For Linux, the default value is: 396 | # 397 | # ['adduser', '-q', '--gecos', '""', '--disabled-password'] 398 | # 399 | # To specify a custom home directory, set this to: 400 | # 401 | # ['adduser', '-q', '--gecos', '""', '--home', '/customhome/USERNAME', 402 | # '--disabled-password'] 403 | # 404 | # This will run the command: 405 | # 406 | # adduser -q --gecos "" --home /customhome/river --disabled-password river 407 | # 408 | # when the user 'river' is created. 409 | # c.LocalAuthenticator.add_user_cmd = [] 410 | 411 | # If a user is added that doesn't exist on the system, should I try to create 412 | # the system user? 413 | # c.LocalAuthenticator.create_system_users = False 414 | 415 | # Automatically whitelist anyone in this group. 416 | # c.LocalAuthenticator.group_whitelist = set() 417 | 418 | #------------------------------------------------------------------------------ 419 | # PAMAuthenticator configuration 420 | #------------------------------------------------------------------------------ 421 | 422 | # Authenticate local Linux/UNIX users with PAM 423 | 424 | # The encoding to use for PAM 425 | # c.PAMAuthenticator.encoding = 'utf8' 426 | 427 | # Whether to open PAM sessions when spawners are started. 428 | # 429 | # This may trigger things like mounting shared filsystems, loading credentials, 430 | # etc. depending on system configuration, but it does not always work. 431 | # 432 | # It can be disabled with:: 433 | # 434 | # c.PAMAuthenticator.open_sessions = False 435 | # c.PAMAuthenticator.open_sessions = True 436 | 437 | # The PAM service to use for authentication. 438 | #c.PAMAuthenticator.service = 'login' 439 | -------------------------------------------------------------------------------- /deploy/templates/sshd_config: -------------------------------------------------------------------------------- 1 | # Package generated configuration file 2 | # See the sshd_config(5) manpage for details 3 | 4 | # What ports, IPs and protocols we listen for 5 | Port 22 6 | # Use these options to restrict which interfaces/protocols sshd will bind to 7 | #ListenAddress :: 8 | #ListenAddress 0.0.0.0 9 | Protocol 2 10 | # HostKeys for protocol version 2 11 | HostKey /etc/ssh/ssh_host_rsa_key 12 | HostKey /etc/ssh/ssh_host_dsa_key 13 | HostKey /etc/ssh/ssh_host_ecdsa_key 14 | HostKey /etc/ssh/ssh_host_ed25519_key 15 | #Privilege Separation is turned on for security 16 | UsePrivilegeSeparation yes 17 | 18 | # Lifetime and size of ephemeral version 1 server key 19 | KeyRegenerationInterval 3600 20 | ServerKeyBits 1024 21 | 22 | # Logging 23 | SyslogFacility AUTH 24 | LogLevel INFO 25 | 26 | # Authentication: 27 | LoginGraceTime 120 28 | PermitRootLogin no 29 | StrictModes yes 30 | 31 | RSAAuthentication yes 32 | PubkeyAuthentication yes 33 | #AuthorizedKeysFile %h/.ssh/authorized_keys 34 | 35 | # Don't read the user's ~/.rhosts and ~/.shosts files 36 | IgnoreRhosts yes 37 | # For this to work you will also need host keys in /etc/ssh_known_hosts 38 | RhostsRSAAuthentication no 39 | # similar for protocol version 2 40 | HostbasedAuthentication no 41 | # Uncomment if you don't trust ~/.ssh/known_hosts for RhostsRSAAuthentication 42 | #IgnoreUserKnownHosts yes 43 | 44 | # To enable empty passwords, change to yes (NOT RECOMMENDED) 45 | PermitEmptyPasswords no 46 | 47 | # Change to yes to enable challenge-response passwords (beware issues with 48 | # some PAM modules and threads) 49 | ChallengeResponseAuthentication no 50 | 51 | # Change to no to disable tunnelled clear text 
passwords 52 | PasswordAuthentication no 53 | 54 | # Kerberos options 55 | #KerberosAuthentication no 56 | #KerberosGetAFSToken no 57 | #KerberosOrLocalPasswd yes 58 | #KerberosTicketCleanup yes 59 | 60 | # GSSAPI options 61 | #GSSAPIAuthentication no 62 | #GSSAPICleanupCredentials yes 63 | 64 | X11Forwarding yes 65 | X11DisplayOffset 10 66 | PrintMotd no 67 | PrintLastLog yes 68 | TCPKeepAlive yes 69 | #UseLogin no 70 | 71 | #MaxStartups 10:30:60 72 | #Banner /etc/issue.net 73 | 74 | # Allow client to pass locale environment variables 75 | AcceptEnv LANG LC_* 76 | 77 | Subsystem sftp /usr/lib/openssh/sftp-server 78 | 79 | # Set this to 'yes' to enable PAM authentication, account processing, 80 | # and session processing. If this is enabled, PAM authentication will 81 | # be allowed through the ChallengeResponseAuthentication and 82 | # PasswordAuthentication. Depending on your PAM configuration, 83 | # PAM authentication via ChallengeResponseAuthentication may bypass 84 | # the setting of "PermitRootLogin without-password". 85 | # If you just want the PAM account and session checks to run without 86 | # PAM authentication, then enable this but set PasswordAuthentication 87 | # and ChallengeResponseAuthentication to 'no'. 88 | UsePAM yes 89 | -------------------------------------------------------------------------------- /example_prod.cfg: -------------------------------------------------------------------------------- 1 | [openweather] 2 | api_key=425b9b9e2416cjfr47329434jk2lX4u32 3 | 4 | [twitter] 5 | consumer_key = CIuYfkdFw8392kdfHuioj 6 | consumer_secret = 4QiJw1wkd902eklfjs920skcSwikFpkl3289 7 | access_token = 15632343-qaMfjk1ri8eklclfiFisoTwjneio48930 8 | access_token_secret = FAifw894jk3l24h543ljfs89hC9fhjFhkjrel3784 9 | 10 | [google] 11 | api_key=AI16cjfr47329434jk2lX4u32 12 | -------------------------------------------------------------------------------- /luigi/luigi.cfg: -------------------------------------------------------------------------------- 1 | [worker] 2 | keep_alive=True 3 | task_limit=10 4 | 5 | [scheduler] 6 | retry_count=4 7 | record_task_history=True 8 | 9 | [task_history] 10 | db_connection=sqlite:///tasks.db 11 | 12 | [hadoop] 13 | client=hadoopcli 14 | streaming-jar=/usr/local/lib/hadoop-2.7.2/share/hadoop/tools/lib/hadoop-streaming-2.7.2.jar 15 | python-executable=/usr/bin/python3 16 | jar=/usr/local/lib/hadoop-2.7.2/share/hadoop/tools/lib/hadoop-streaming-2.7.2.jar 17 | tmp_dir=/tmp 18 | 19 | [hdfs] 20 | tmp_dir=/tmp 21 | 22 | -------------------------------------------------------------------------------- /luigi/taxi_data_import.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | from luigi.contrib import sqla 3 | from luigi.mock import MockFile 4 | from googleplaces import GooglePlaces 5 | from sqlalchemy import Float, DateTime, Integer, String 6 | import csv 7 | import logging 8 | import luigi 9 | import os 10 | import requests 11 | import shutil 12 | 13 | 14 | CONFIG_FILE = os.path.abspath(os.path.join(__file__, '../../config/prod.cfg')) 15 | 16 | class DownloadTaxiUrls(luigi.Task): 17 | """ Download NYC Taxi Data for our use. 
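Filters the published list of raw data URLs down to the requested
cab type, year, and months, then writes the matching URLs to a
local target file.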
""" 18 | year = luigi.IntParameter(default=2016) 19 | months = luigi.ListParameter(default=[6,7,8]) 20 | url_list = luigi.Parameter(default='https://raw.githubusercontent.com/toddwschneider/nyc-taxi-data/master/raw_data_urls.txt') 21 | cab_type = luigi.Parameter(default='yellow') 22 | 23 | def run(self): 24 | resp = requests.get(self.url_list) 25 | urls = [] 26 | possible_strs = ['{}-{:02d}'.format(self.year, m) for m in self.months] 27 | for line in resp.iter_lines(): 28 | if self.cab_type in str(line): 29 | for datestr in possible_strs: 30 | if datestr in str(line): 31 | urls.append(line) 32 | break 33 | with self.output().open('w') as url_file: 34 | for url in urls: 35 | print(url.decode(), file=url_file) 36 | 37 | def output(self): 38 | return luigi.LocalTarget('/tmp/taxi_data/urls.txt') 39 | 40 | 41 | class DownloadTaxiData(luigi.Task): 42 | """ Downloading each file of taxi data for each url from the repo. """ 43 | def requires(self): 44 | return DownloadTaxiUrls() 45 | 46 | def input(self): 47 | return luigi.LocalTarget('/tmp/taxi_data/urls.txt') 48 | 49 | def run(self): 50 | for url in self.input().open('r'): 51 | yield DownloadTaxiFile(url.rstrip('\n')) 52 | 53 | def output(self): 54 | files = [url.rstrip('\n').split('/')[-1] 55 | for url in self.input().open('r')] 56 | return [luigi.LocalTarget('/tmp/taxi_data/{}'.format(file_name)) 57 | for file_name in files] 58 | 59 | 60 | class DownloadTaxiFile(luigi.Task): 61 | """ Download each file, and save it locally to /tmp/taxi_data """ 62 | url = luigi.Parameter() 63 | 64 | def requires(self): 65 | return DownloadTaxiUrls() 66 | 67 | def run(self): 68 | file_name = self.url.split('/')[-1] 69 | resp = requests.get(str(self.url), stream=True) 70 | with open(self.output().path, 'wb') as taxi_file: 71 | shutil.copyfileobj(resp.raw, taxi_file) 72 | 73 | def output(self): 74 | file_name = self.url.split('/')[-1] 75 | return luigi.LocalTarget('/tmp/taxi_data/{}'.format(file_name)) 76 | 77 | 78 | class AddTaxiLocations(luigi.Task): 79 | """ Import the files and add the locations using Google Reverse Search. """ 80 | dir_name = luigi.Parameter(default='/tmp/taxi_data/') 81 | 82 | def requires(self): 83 | return DownloadTaxiData() 84 | 85 | def input(self): 86 | return [luigi.LocalTarget('/tmp/taxi_data/{}'.format(fn)) for fn 87 | in os.listdir(self.dir_name) if fn.endswith('csv')] 88 | 89 | def run(self): 90 | for fn in self.input(): 91 | rdr = csv.DictReader(fn.open('r')) 92 | for line in rdr: 93 | yield AddTaxiLocation(line) 94 | 95 | 96 | class AddTaxiLocation(luigi.Task): 97 | """ Search for pickup and dropoff location and add them via Google API. 98 | NOTE: it appears the names and mappings change over time, you will 99 | need to adapt the code for different years. I've included columns for 100 | 2009 and 2016 here. Feel free to expand this and send PR if you'd 101 | like to share! 
102 | """ 103 | line = luigi.DictParameter() 104 | 105 | columns_2009 = ['vendor_name', 106 | 'Rate_Code', 'surcharge', 'store_and_forward', 107 | 'mta_tax', 'Total_Amt', 'Fare_Amt', 'Tolls_Amt', 'Tip_Amt', 108 | 'Trip_Pickup_DateTime', 'Trip_Dropoff_DateTime', 109 | 'Passenger_Count', 'Payment_Type', 'Trip_Distance', 110 | 'Start_Lon', 'Start_Lat', 'End_Lon', 'End_Lat', 111 | 'pickup_location_name', 'pickup_location_phone', 112 | 'pickup_location_addy', 'pickup_location_web', 113 | 'dropoff_location_name', 'dropoff_location_phone', 114 | 'dropoff_location_addy', 'dropoff_location_web'] 115 | 116 | columns = ['VendorID', 'RatecodeID', 'improvement_surcharge', 117 | 'store_and_fwd_flag', 'mta_tax', 'total_amount', 118 | 'fare_amount', 'extra', 'tip_amount', 119 | 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 120 | 'passenger_count', 'payment_type', 'trip_distance', 121 | 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 122 | 'dropoff_latitude', 'pickup_location_name', 123 | 'pickup_location_phone', 'pickup_location_addy', 124 | 'pickup_location_web', 'dropoff_location_name', 125 | 'dropoff_location_phone', 'dropoff_location_addy', 126 | 'dropoff_location_web'] 127 | 128 | 129 | def add_addy_info(self, res, loc_type): 130 | if len(res.places): 131 | place = res.places[0] 132 | place.get_details() 133 | self.line['{}_location_name'.format(loc_type)] = place.name 134 | self.line['{}_location_phone'.format(loc_type)] = place.local_phone_number 135 | self.line['{}_location_addy'.format(loc_type)] = place.vicinity 136 | self.line['{}_location_web'.format(loc_type)] = place.website 137 | 138 | def run(self): 139 | self.line = dict((k, v) for k,v in self.line.items()) 140 | config = ConfigParser() 141 | config.read(CONFIG_FILE) 142 | client = GooglePlaces(config.get('google', 'api_key')) 143 | if len(set(self.line.keys()) - set(self.columns)) > 2: 144 | self.columns = self.columns_2009 145 | res = client.nearby_search(lat_lng={'lat': self.line[self.columns[15]], 146 | 'lng': self.line[self.columns[14]]}) 147 | self.add_addy_info(res, 'pickup') 148 | res = client.nearby_search(lat_lng={'lat': self.line[self.columns[17]], 149 | 'lng': self.line[self.columns[16]]}) 150 | self.add_addy_info(res, 'dropoff') 151 | with self.output().open('w') as line_output: 152 | line_with_tabs = '\t'.join([self.line.get(key) if self.line.get(key) 153 | else '' for key in self.columns]) 154 | line_output.write(line_with_tabs) 155 | 156 | def output(self): 157 | return MockFile("AddTaxiLocation") 158 | 159 | 160 | class SaveTaxiRow(sqla.CopyToTable): 161 | """ Save each taxi line with the location information to the database """ 162 | connection_string = 'sqlite:///taxi_db.db' 163 | table = 'taxi_rides' 164 | columns = [ 165 | (['vendor_name', String(10)], {}), 166 | (['rate_code', String(4)], {}), 167 | (['surcharge', Float()], {}), 168 | (['store_and_forward', Integer()], {}), 169 | (['mta_tax', Float()], {}), 170 | (['total_amt', Float()], {}), 171 | (['fare_amt', Float()], {}), 172 | (['tolls_amt', Float()], {}), 173 | (['tip_amt', Float()], {}), 174 | (['pickup_datetime', DateTime()], {}), 175 | (['dropoff_datetime', DateTime()], {}), 176 | (['passenger_count', Integer()], {}), 177 | (['payment_type', String(100)], {}), 178 | (['trip_distance', Float()], {}), 179 | (['pickup_longitude', Float()], {}), 180 | (['pickup_latitude', Float()], {}), 181 | (['dropoff_longitude', Float()], {}), 182 | (['dropoff_latitude', Float()], {}), 183 | (['pickup_location_name', String(128)], {}), 184 | 
(['pickup_location_phone', String(64)], {}), 185 | (['pickup_location_addy', String(256)], {}), 186 | (['pickup_location_web', String(64)], {}), 187 | (['dropoff_location_name', String(128)], {}), 188 | (['dropoff_location_phone', String(64)], {}), 189 | (['dropoff_location_addy', String(256)], {}), 190 | (['dropoff_location_web', String(64)], {}), 191 | ] 192 | 193 | def requires(self): 194 | return AddTaxiLocation() 195 | -------------------------------------------------------------------------------- /luigi/wordcount_map_reduce.py: -------------------------------------------------------------------------------- 1 | """ Simple wordcount using map reduce """ 2 | import luigi 3 | import luigi.contrib.hadoop 4 | import luigi.contrib.hdfs 5 | import json 6 | import os 7 | 8 | class ProcessChatLogs(luigi.Task): 9 | file_name = luigi.Parameter() 10 | 11 | def input(self): 12 | return luigi.contrib.hdfs.HdfsTarget(self.file_name) 13 | 14 | def run(self): 15 | with self.output().open('w') as output_file: 16 | for msg_dict in json.load(self.input().open('r')): 17 | output_file.write(msg_dict.get('message') + '\n') 18 | 19 | def output(self): 20 | return luigi.contrib.hdfs.HdfsTarget( 21 | self.file_name.replace('.json', '_messages_only.txt')) 22 | 23 | 24 | class ChatWordCount(luigi.contrib.hadoop.JobTask): 25 | file_name = luigi.Parameter() 26 | 27 | def output(self): 28 | file_dir = os.path.dirname(self.file_name) 29 | new_file_name = 'wordcount_{}'.format( 30 | os.path.basename(self.file_name).replace('.json', '.txt')) 31 | return luigi.contrib.hdfs.HdfsTarget( 32 | os.path.join(file_dir, new_file_name)) 33 | 34 | def mapper(self, line): 35 | for word in line.strip().split(): 36 | yield word, 1 37 | 38 | def reducer(self, key, values): 39 | yield key, sum(values) 40 | 41 | def requires(self): 42 | return ProcessChatLogs(self.file_name) 43 | -------------------------------------------------------------------------------- /notebooks/Chapter 3 - Basic Celery Tasks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "sys.path.append('/var/www/pipelines/celery_app')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 4, 28 | "metadata": { 29 | "collapsed": false 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from tasks import get_stock_info" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 5, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from datetime import datetime" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 9, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "res = get_stock_info.delay('FB', datetime(2016,1,1), datetime.today(), source='yahoo')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 10, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "" 69 | ] 70 | }, 71 | "execution_count": 10, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "res" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 11, 83 
| "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "'SUCCESS'" 91 | ] 92 | }, 93 | "execution_count": 11, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "res.status" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 12, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "'{\"Adj Close min\":{\"FB\":94.160004},\"Adj Close max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995},\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Close min\":{\"FB\":94.160004},\"Close max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005}}'" 113 | ] 114 | }, 115 | "execution_count": 12, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "res.get()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "res = get_stock_info.delay('XFJKLSFD', datetime(2016,1,1), datetime.today(), source='yahoo')\n" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "'FAILURE'" 146 | ] 147 | }, 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "res.status" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "ename": "RemoteDataError", 166 | "evalue": "Unable to read URL: http://ichart.finance.yahoo.com/table.csv", 167 | "output_type": "error", 168 | "traceback": [ 169 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 170 | "\u001b[0;31mRemoteDataError\u001b[0m Traceback (most recent call last)", 171 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 172 | "\u001b[0;32m/home/deploy/venv/lib/python3.4/site-packages/celery/result.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, timeout, propagate, interval, no_ack, follow_parents, EXCEPTION_STATES, PROPAGATE_STATES)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cache\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpropagate\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaybe_reraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 173 | "\u001b[0;32m/home/deploy/venv/lib/python3.4/site-packages/celery/result.py\u001b[0m in \u001b[0;36mmaybe_reraise\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 269\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmaybe_reraise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 270\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstates\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPROPAGATE_STATES\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 271\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 272\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mbuild_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mintermediate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformatter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 174 | "\u001b[0;31mRemoteDataError\u001b[0m: Unable to read URL: http://ichart.finance.yahoo.com/table.csv" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "res.get()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 17, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "res = get_stock_info.apply_async(('GOOG', datetime(2016, 1, 1), datetime.today()), queue='priority') " 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 18, 196 | "metadata": { 197 | "collapsed": false 198 | }, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "" 204 | ] 205 | }, 206 | "execution_count": 18, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "res" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 19, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "'SUCCESS'" 226 | ] 227 | }, 228 | "execution_count": 19, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "res.status" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 20, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "'{\"Adj Close min\":{\"GOOG\":668.26001},\"Adj Close max\":{\"GOOG\":784.849976},\"Adj Close mean\":{\"GOOG\":724.18143845},\"Adj Close median\":{\"GOOG\":719.6299745},\"Close min\":{\"GOOG\":668.26001},\"Close max\":{\"GOOG\":784.849976},\"Close mean\":{\"GOOG\":724.18143845},\"Close median\":{\"GOOG\":719.6299745},\"High min\":{\"GOOG\":672.299988},\"High max\":{\"GOOG\":789.869995},\"High mean\":{\"GOOG\":730.2307738688},\"High median\":{\"GOOG\":725.828003},\"Low min\":{\"GOOG\":663.059998},\"Low max\":{\"GOOG\":782.969971},\"Low mean\":{\"GOOG\":717.8580696687},\"Low median\":{\"GOOG\":716.169983},\"Open min\":{\"GOOG\":667.849976},\"Open max\":{\"GOOG\":785.0},\"Open mean\":{\"GOOG\":724.2199360063},\"Open 
median\":{\"GOOG\":722.4649965}}'" 248 | ] 249 | }, 250 | "execution_count": 20, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "res.get()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.4.2" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 1 290 | } 291 | -------------------------------------------------------------------------------- /notebooks/Chapter 3 - Complex Task Chains.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "sys.path.append('/var/www/pipelines/celery_app')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from tasks import get_stock_info, price_range, determine_buy, sort_results" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from celery import chain, group, chord" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "from datetime import datetime" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "my_stocks = ['FB', 'GOOG', 'IBM']" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 7, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "start = datetime(2016,1,1)\n", 78 | "end = datetime.today()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Working with Chains" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "my_chain = chain(price_range.s('FB', start, end), determine_buy.s())" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 9, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "tasks.price_range('FB', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 9, 47, 10, 633098)) | tasks.determine_buy()" 110 | ] 111 | }, 112 | "execution_count": 9, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "my_chain" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 10, 124 | "metadata": { 125 | 
"collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "res = my_chain()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 11, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "" 143 | ] 144 | }, 145 | "execution_count": 11, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "res" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 12, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "'SUCCESS'" 165 | ] 166 | }, 167 | "execution_count": 12, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "res.state" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 13, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "False" 187 | ] 188 | }, 189 | "execution_count": 13, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "res.get()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Working with Groups" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 14, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "my_grp = group(get_stock_info.s(stk, start, end) for stk in my_stocks)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 15, 219 | "metadata": { 220 | "collapsed": false 221 | }, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "[tasks.get_stock_info('FB', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 9, 47, 10, 633098)), tasks.get_stock_info('GOOG', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 9, 47, 10, 633098)), tasks.get_stock_info('IBM', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 9, 47, 10, 633098))]" 227 | ] 228 | }, 229 | "execution_count": 15, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "my_grp" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 16, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "res = my_grp()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 17, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "" 260 | ] 261 | }, 262 | "execution_count": 17, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "res" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 18, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "['{\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995},\"Adj Close min\":{\"FB\":94.160004},\"Adj Close 
max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005},\"Close min\":{\"FB\":94.160004},\"Close max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005}}',\n", 282 | " '{\"Open min\":{\"GOOG\":667.849976},\"Open max\":{\"GOOG\":785.0},\"Open mean\":{\"GOOG\":724.2199360063},\"Open median\":{\"GOOG\":722.4649965},\"Low min\":{\"GOOG\":663.059998},\"Low max\":{\"GOOG\":782.969971},\"Low mean\":{\"GOOG\":717.8580696687},\"Low median\":{\"GOOG\":716.169983},\"Close min\":{\"GOOG\":668.26001},\"Close max\":{\"GOOG\":784.849976},\"Close mean\":{\"GOOG\":724.18143845},\"Close median\":{\"GOOG\":719.6299745},\"High min\":{\"GOOG\":672.299988},\"High max\":{\"GOOG\":789.869995},\"High mean\":{\"GOOG\":730.2307738688},\"High median\":{\"GOOG\":725.828003},\"Adj Close min\":{\"GOOG\":668.26001},\"Adj Close max\":{\"GOOG\":784.849976},\"Adj Close mean\":{\"GOOG\":724.18143845},\"Adj Close median\":{\"GOOG\":719.6299745}}',\n", 283 | " '{\"Open min\":{\"IBM\":118.459999},\"Open max\":{\"IBM\":163.190002},\"Open mean\":{\"IBM\":145.5231246875},\"Open median\":{\"IBM\":148.4449995},\"Low min\":{\"IBM\":116.900002},\"Low max\":{\"IBM\":162.179993},\"Low mean\":{\"IBM\":144.4778746625},\"Low median\":{\"IBM\":147.3549955},\"High min\":{\"IBM\":119.660004},\"High max\":{\"IBM\":164.949997},\"High mean\":{\"IBM\":146.7556872938},\"High median\":{\"IBM\":149.774994},\"Adj Close min\":{\"IBM\":115.72409},\"Adj Close max\":{\"IBM\":163.529999},\"Adj Close mean\":{\"IBM\":143.7099821812},\"Adj Close median\":{\"IBM\":146.422927},\"Close min\":{\"IBM\":117.849998},\"Close max\":{\"IBM\":163.529999},\"Close mean\":{\"IBM\":145.7649376375},\"Close median\":{\"IBM\":148.4800035}}']" 284 | ] 285 | }, 286 | "execution_count": 18, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "res.get()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### Working with Chords (redis backend only!)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 19, 305 | "metadata": { 306 | "collapsed": false 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "header = [price_range.subtask((stk, start, end)) for stk in my_stocks]" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 23, 316 | "metadata": { 317 | "collapsed": false 318 | }, 319 | "outputs": [ 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | "1 loop, best of 3: 849 ms per loop\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "%timeit sort_results(group(header)().get())" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 24, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": [ 342 | "[{'percent_change': 9.2799999999999994,\n", 343 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 344 | " 'period_high': 125.260002,\n", 345 | " 'period_low': 94.160004000000001,\n", 346 | " 'period_mean': 112.86093739375001,\n", 347 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 348 | " 'result': 'higher',\n", 349 | " 'stock': 'FB',\n", 350 | " 'todays_price': 123.34},\n", 351 | " {'percent_change': 11.06,\n", 352 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 353 | " 'period_high': 163.529999,\n", 354 | " 'period_low': 115.72408999999999,\n", 355 | " 
'period_mean': 143.70998218125001,\n", 356 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 357 | " 'result': 'higher',\n", 358 | " 'stock': 'IBM',\n", 359 | " 'todays_price': 159.61000000000001},\n", 360 | " {'percent_change': 6.54,\n", 361 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 362 | " 'period_high': 784.84997599999997,\n", 363 | " 'period_low': 668.26000999999997,\n", 364 | " 'period_mean': 724.18143844999997,\n", 365 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 366 | " 'result': 'higher',\n", 367 | " 'stock': 'GOOG',\n", 368 | " 'todays_price': 771.55999999999995}]" 369 | ] 370 | }, 371 | "execution_count": 24, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "sort_results(group(header)().get())" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 25, 383 | "metadata": { 384 | "collapsed": false 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "callback = sort_results.subtask()" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 26, 394 | "metadata": { 395 | "collapsed": false 396 | }, 397 | "outputs": [ 398 | { 399 | "name": "stdout", 400 | "output_type": "stream", 401 | "text": [ 402 | "1 loop, best of 3: 1.16 s per loop\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "%timeit chord(header)(callback).get()" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 27, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "[{'percent_change': 9.3100000000000005,\n", 421 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 422 | " 'period_high': 125.260002,\n", 423 | " 'period_low': 94.160004000000001,\n", 424 | " 'period_mean': 112.86093739375001,\n", 425 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 426 | " 'result': 'higher',\n", 427 | " 'stock': 'FB',\n", 428 | " 'todays_price': 123.37},\n", 429 | " {'percent_change': 11.06,\n", 430 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 431 | " 'period_high': 163.529999,\n", 432 | " 'period_low': 115.72408999999999,\n", 433 | " 'period_mean': 143.70998218125001,\n", 434 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 435 | " 'result': 'higher',\n", 436 | " 'stock': 'IBM',\n", 437 | " 'todays_price': 159.61000000000001},\n", 438 | " {'percent_change': 6.5099999999999998,\n", 439 | " 'period_end': datetime.datetime(2016, 8, 22, 9, 47, 10, 633098),\n", 440 | " 'period_high': 784.84997599999997,\n", 441 | " 'period_low': 668.26000999999997,\n", 442 | " 'period_mean': 724.18143844999997,\n", 443 | " 'period_start': datetime.datetime(2016, 1, 1, 0, 0),\n", 444 | " 'result': 'higher',\n", 445 | " 'stock': 'GOOG',\n", 446 | " 'todays_price': 771.30999999999995}]" 447 | ] 448 | }, 449 | "execution_count": 27, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "chord(header)(callback).get()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": { 462 | "collapsed": true 463 | }, 464 | "outputs": [], 465 | "source": [] 466 | } 467 | ], 468 | "metadata": { 469 | "kernelspec": { 470 | "display_name": "Python 3", 471 | "language": "python", 472 | "name": "python3" 473 | }, 474 | "language_info": { 475 | "codemirror_mode": { 476 | "name": "ipython", 477 | "version": 3 478 | }, 479 | "file_extension": ".py", 480 | "mimetype": 
"text/x-python", 481 | "name": "python", 482 | "nbconvert_exporter": "python", 483 | "pygments_lexer": "ipython3", 484 | "version": "3.4.2" 485 | } 486 | }, 487 | "nbformat": 4, 488 | "nbformat_minor": 1 489 | } 490 | -------------------------------------------------------------------------------- /notebooks/Chapter 3 - First Steps with Celery.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "sys.path.append('/var/www/pipelines/celery_app')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from datetime import datetime" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from tasks import get_stock_info" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "res = get_stock_info.delay('FB', datetime(2016, 1, 1), datetime.today())" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "'STARTED'" 69 | ] 70 | }, 71 | "execution_count": 6, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "res.status" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 7, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "'{\"Close min\":{\"FB\":94.160004},\"Close max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005},\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Adj Close min\":{\"FB\":94.160004},\"Adj Close max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005},\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995}}'" 91 | ] 92 | }, 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "res.get()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 8, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "res = get_stock_info.apply_async(('FB', datetime(2016, 1, 1), datetime.today()), queue='priority')" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 9, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "'{\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Close min\":{\"FB\":94.160004},\"Close 
max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005},\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"Adj Close min\":{\"FB\":94.160004},\"Adj Close max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995}}'" 124 | ] 125 | }, 126 | "execution_count": 9, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "res.get()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from tasks import get_stock_info, price_range, determine_buy, sort_results" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 11, 149 | "metadata": { 150 | "collapsed": true 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "from celery import chain, group, chord" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 12, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "start = datetime(2016, 1, 1)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 13, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "end = datetime.today()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 14, 182 | "metadata": { 183 | "collapsed": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "my_chain = chain(price_range.s('FB', start, end), determine_buy.s())" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 15, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "tasks.price_range('FB', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 13, 26, 38, 292856)) | tasks.determine_buy()" 201 | ] 202 | }, 203 | "execution_count": 15, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "my_chain" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 16, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "res = my_chain()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 17, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "" 234 | ] 235 | }, 236 | "execution_count": 17, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "res" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 18, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "'SUCCESS'" 256 | ] 257 | }, 258 | "execution_count": 18, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "res.status" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 19, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "False" 278 | ] 279 | }, 280 | "execution_count": 19, 281 | "metadata": {}, 
282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "res.get()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 20, 292 | "metadata": { 293 | "collapsed": true 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "my_stocks = ['FB', 'GOOG', 'IBM']" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 21, 303 | "metadata": { 304 | "collapsed": true 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "my_group = group(get_stock_info.s(stk, start, end) for stk in my_stocks)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 22, 314 | "metadata": { 315 | "collapsed": false 316 | }, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "[tasks.get_stock_info('FB', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 13, 26, 38, 292856)), tasks.get_stock_info('GOOG', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 13, 26, 38, 292856)), tasks.get_stock_info('IBM', datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 8, 22, 13, 26, 38, 292856))]" 322 | ] 323 | }, 324 | "execution_count": 22, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "my_group" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 23, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "res = my_group()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 24, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "" 355 | ] 356 | }, 357 | "execution_count": 24, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "res" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 25, 369 | "metadata": { 370 | "collapsed": false 371 | }, 372 | "outputs": [ 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "['{\"Low min\":{\"FB\":89.370003},\"Low max\":{\"FB\":124.75},\"Low mean\":{\"FB\":111.5968122063},\"Low median\":{\"FB\":112.959999},\"High min\":{\"FB\":95.0},\"High max\":{\"FB\":128.330002},\"High mean\":{\"FB\":113.9396873375},\"High median\":{\"FB\":114.7299995},\"Open min\":{\"FB\":92.830002},\"Open max\":{\"FB\":127.519997},\"Open mean\":{\"FB\":112.8902501312},\"Open median\":{\"FB\":113.8500025},\"Close min\":{\"FB\":94.160004},\"Close max\":{\"FB\":125.260002},\"Close mean\":{\"FB\":112.8609373937},\"Close median\":{\"FB\":113.9300005},\"Adj Close min\":{\"FB\":94.160004},\"Adj Close max\":{\"FB\":125.260002},\"Adj Close mean\":{\"FB\":112.8609373937},\"Adj Close median\":{\"FB\":113.9300005}}',\n", 377 | " '{\"Open min\":{\"GOOG\":667.849976},\"Open max\":{\"GOOG\":785.0},\"Open mean\":{\"GOOG\":724.2199360063},\"Open median\":{\"GOOG\":722.4649965},\"Close min\":{\"GOOG\":668.26001},\"Close max\":{\"GOOG\":784.849976},\"Close mean\":{\"GOOG\":724.18143845},\"Close median\":{\"GOOG\":719.6299745},\"Low min\":{\"GOOG\":663.059998},\"Low max\":{\"GOOG\":782.969971},\"Low mean\":{\"GOOG\":717.8580696687},\"Low median\":{\"GOOG\":716.169983},\"Adj Close min\":{\"GOOG\":668.26001},\"Adj Close max\":{\"GOOG\":784.849976},\"Adj Close mean\":{\"GOOG\":724.18143845},\"Adj Close median\":{\"GOOG\":719.6299745},\"High min\":{\"GOOG\":672.299988},\"High max\":{\"GOOG\":789.869995},\"High mean\":{\"GOOG\":730.2307738688},\"High median\":{\"GOOG\":725.828003}}',\n", 378 | " '{\"Low 
min\":{\"IBM\":116.900002},\"Low max\":{\"IBM\":162.179993},\"Low mean\":{\"IBM\":144.4778746625},\"Low median\":{\"IBM\":147.3549955},\"High min\":{\"IBM\":119.660004},\"High max\":{\"IBM\":164.949997},\"High mean\":{\"IBM\":146.7556872938},\"High median\":{\"IBM\":149.774994},\"Open min\":{\"IBM\":118.459999},\"Open max\":{\"IBM\":163.190002},\"Open mean\":{\"IBM\":145.5231246875},\"Open median\":{\"IBM\":148.4449995},\"Close min\":{\"IBM\":117.849998},\"Close max\":{\"IBM\":163.529999},\"Close mean\":{\"IBM\":145.7649376375},\"Close median\":{\"IBM\":148.4800035},\"Adj Close min\":{\"IBM\":115.72409},\"Adj Close max\":{\"IBM\":163.529999},\"Adj Close mean\":{\"IBM\":143.7099821812},\"Adj Close median\":{\"IBM\":146.422927}}']" 379 | ] 380 | }, 381 | "execution_count": 25, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "res.get()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 26, 393 | "metadata": { 394 | "collapsed": true 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "header = (price_range.subtask((stk, start, end)) for stk in my_stocks)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 27, 404 | "metadata": { 405 | "collapsed": true 406 | }, 407 | "outputs": [], 408 | "source": [ 409 | "callback = sort_results.subtask()" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 28, 415 | "metadata": { 416 | "collapsed": false 417 | }, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "1 loop, best of 3: 1.08 s per loop\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "%timeit chord(header)(callback).get()" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 29, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "The slowest run took 34.82 times longer than the fastest. 
This could mean that an intermediate result is being cached.\n", 443 | "10000 loops, best of 3: 67.9 µs per loop\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "%timeit sort_results(group(header)().get())" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "collapsed": true 456 | }, 457 | "outputs": [], 458 | "source": [] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.4.2" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 1 482 | } 483 | -------------------------------------------------------------------------------- /notebooks/Chapter 3 - Monitoring Tasks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "sys.path.append('/var/www/pipelines/celery_app')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "from datetime import datetime" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "from tasks import get_stock_info" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "start = datetime(2016, 1, 1)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "end = datetime.today()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 7, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "res = get_stock_info.delay('FB', start, end)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 8, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "'SUCCESS'" 91 | ] 92 | }, 93 | "execution_count": 8, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "res.status" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 9, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "res = get_stock_info.delay('MYCOOLSTOCK', start, end)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 10, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "'FAILURE'" 124 | ] 125 | }, 126 | "execution_count": 10, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "res.status" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | 
"execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.4.2" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 1 166 | } 167 | -------------------------------------------------------------------------------- /notebooks/Chapter 4 - Dask Distributed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from dask import do\n", 12 | "from distributed import LocalCluster, Executor\n", 13 | "from configparser import ConfigParser\n", 14 | "import requests\n", 15 | "import numpy as np" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "You must have a folder `config` in the parent directory or current directory or simply modify the `get_config` method. You will also need to [acquire an API Key for the OpenWeatherMap API](http://openweathermap.org/appid). Your `prod.cfg` file in the aforementioned `config` folder should have a section like so:\n", 23 | "\n", 24 | "```\n", 25 | "[openweather]\n", 26 | "api_key=425b9b9e2416cjfr47329434jk2lX4u32\n", 27 | "```\n", 28 | "with your assigned key from OpenWeatherMap." 
29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "def get_current(location_str, config):\n", 38 | " '''Get latest temperature data from openweather\n", 39 | " params:\n", 40 | " location_str: string with city,country_code\n", 41 | " config: ConfigParser object with openweather section and api_key key\n", 42 | " returns:\n", 43 | " tuple: (location_str, parsed json response) \n", 44 | " '''\n", 45 | " weather_key = config.get('openweather', 'api_key')\n", 46 | " resp = requests.get('http://api.openweathermap.org/data/2.5/weather', \n", 47 | " params={'q': location_str, \n", 48 | " 'appid': weather_key, \n", 49 | " 'units': 'metric'}) \n", 50 | " return location_str, resp.json()\n", 51 | " " 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "def get_forecast(location_str, config):\n", 63 | " '''Get forecast temperature data from openweather\n", 64 | " params:\n", 65 | " location_str: string with city,country_code\n", 66 | " config: ConfigParser object with openweather section and api_key key\n", 67 | " returns:\n", 68 | " tuple: (location_str, parsed json response)\n", 69 | " '''\n", 70 | " weather_key = config.get('openweather', 'api_key')\n", 71 | " resp = requests.get('http://api.openweathermap.org/data/2.5/forecast', \n", 72 | " params={'q': location_str, \n", 73 | " 'appid': weather_key, \n", 74 | " 'units': 'metric'})\n", 75 | " return location_str, resp.json()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "def filter_temp(location_str, weather_json):\n", 87 | " '''Filter out just the city, temperature, and humidity in forecast or current weather data.\n", 88 | " params:\n", 89 | " location_str: string with city,country_code\n", 90 | " weather_json: json returned from get_forecast or get_current \n", 91 | " returns:\n", 92 | " dict: containing city names and either list of forecast temps and humidity or current temp and humidity\n", 93 | " '''\n", 94 | " if 'cod' in weather_json.keys() and int(weather_json['cod']) != 200:\n", 95 | " raise ValueError('Bad Data Returned from API: {} - {}'.format(\n", 96 | " location_str, weather_json))\n", 97 | " try:\n", 98 | " api_city_str = '{},{}'.format(weather_json['name'], weather_json['sys']['country'])\n", 99 | " except KeyError:\n", 100 | " api_city_str = '{},{}'.format(weather_json['city']['name'], weather_json['city']['country'])\n", 101 | " resp = {\n", 102 | " 'search_city': location_str,\n", 103 | " 'api_city': api_city_str,\n", 104 | " }\n", 105 | " if 'main' in weather_json.keys():\n", 106 | " resp['current_temp'] = weather_json['main']['temp']\n", 107 | " resp['current_humidity'] = weather_json['main']['humidity']\n", 108 | " else:\n", 109 | " resp['forecast_temps'] = [fr['main']['temp'] for fr in weather_json['list']]\n", 110 | " resp['forecast_humidity'] = [fr['main']['humidity'] for fr in weather_json['list']]\n", 111 | " return resp" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "def merge_data(latest, forecast):\n", 123 | " ''' Merge data from current and forecast dictionaries and avg forecasts\n", 124 | " params:\n", 125 | " latest: filtered dictionary from 
get_latest\n", 126 | " forecast: filtered dictionary from get_forecast\n", 127 | " returns:\n", 128 | " dict: merged dict with additional mean for forecasts\n", 129 | " '''\n", 130 | " final = latest.copy()\n", 131 | " final.update(forecast)\n", 132 | " mean_tmp, mean_hum = np.mean(forecast['forecast_temps']), np.mean(forecast['forecast_humidity'])\n", 133 | " final['mean_temp'] = np.round(mean_tmp, 2)\n", 134 | " final['mean_hum'] = np.round(mean_hum, 2)\n", 135 | " return final" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": true 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "def main(city):\n", 147 | " ''' Main function which will take city names and return a final dataset for each city\n", 148 | " params:\n", 149 | " city: string (ex: 'Berlin,DE')\n", 150 | " returns:\n", 151 | " dict: current and forecast temps and humidities for given city\n", 152 | " '''\n", 153 | " config = get_config()\n", 154 | " city_str, weather_data = get_current(city, config)\n", 155 | " latest = filter_temp(city_str, weather_data)\n", 156 | " city_str, weather_data = get_forecast(city, config)\n", 157 | " forecast = filter_temp(city_str, weather_data)\n", 158 | " final = merge_data(latest, forecast)\n", 159 | " return final" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "def get_config():\n", 171 | " ''' returns config '''\n", 172 | " config = ConfigParser()\n", 173 | " config.read(['../config/prod.cfg', 'config/prod.cfg'])\n", 174 | " return config" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "city_list = ['London,UK', 'Berlin,DE', 'NewYork,NY', \n", 186 | " 'LosAngeles,CA', 'Madrid,ES', 'Bangkok,TH', \n", 187 | " 'Baghdad,IQ', 'Auckland,NZ', 'Istanbul,TR',\n", 188 | " 'MexicoCity,MX', 'Primavera,CL', 'KualaLumpur,MY',\n", 189 | " 'Shanghai,CN', 'Chicago,IL', 'Rome,IT', 'Nairobi,KE',\n", 190 | " 'MachuPicchu,PE', 'Cardiff,UK', 'Somewhere,WL']" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "%%time\n", 200 | "res = []\n", 201 | "for city in city_list:\n", 202 | " try:\n", 203 | " final = main(city)\n", 204 | " res.append(final)\n", 205 | " except Exception as e:\n", 206 | " print(city, e)\n", 207 | "\n", 208 | "print('sorted by current temp: ', sorted(res, key=lambda x: x.get('current_temp'), reverse=True))\n", 209 | "print('sorted by upcoming forecast temp: ', sorted(res, key=lambda x: x.get('mean_temp'), reverse=True))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "Depending on your setup the `start_diagnostics_server` which starts the web UI for analyzing your Dask scheduler and work via the Executor may or may not work. If it doesn't work out of the box, you'll need to start the dask-scheduler a different way. Easiest is using:\n", 217 | "\n", 218 | "`/path/to/your/virtualenv/bin/dask-scheduler`\n", 219 | "\n", 220 | "which will start the scheduler process in your terminal as well as the Bokeh server for the web UI. The output should have the links for both the web UI (usually [localhost:8787](http://127.0.0.1:8787)) as well as the local scheduler. 
\n", 221 | "\n", 222 | "In a new shell or screen session, run the worker nodes with however many workers you'd like (here I chose 8):\n", 223 | "\n", 224 | "`/path/to/your/virtualenv/bin/dask-worker --nprocs 8 127.0.0.1:8786`\n", 225 | "\n", 226 | "I recommend using [`screen`](https://www.gnu.org/software/screen/) so you can easily switch between shells and keep track of logs. Once installed, you can create a new named screen like so: `screen -S scheduler`, use ctl + a followed by d to detach back to your main shell and ctl + a followed by k to kill the screen session when you are done. To reattach to a running named screen you can use `screen -r scheduler`. Read through the docs for more info. \n", 227 | "\n", 228 | "Then you can utilize the code directly below this cell instead of the `LocalCluster` code two cells below." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "exc = Executor('127.0.0.1:8786') # You may want to change this to the exact IP shown when you ran dask-scheduler" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "lc = LocalCluster()\n", 247 | "lc.start_diagnostics_server() \n", 248 | "exc = Executor(lc)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "%%time\n", 258 | "\n", 259 | "futures = [e.submit(main, i) for i in city_list]\n", 260 | "print(futures)\n", 261 | "print('sorted by current temp', \n", 262 | " sorted([f.result() for f in futures if f.status != 'error'], \n", 263 | " key=lambda x: x['current_temp'], reverse=True))\n", 264 | "print('sorted by forecast temp', \n", 265 | " sorted([f.result() for f in futures if f.status != 'error'], \n", 266 | " key=lambda x: x['mean_temp'], reverse=True))" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "example_error = futures[-1]" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "example_error.status" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "example_error.result()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "Python 3", 309 | "language": "python", 310 | "name": "python3" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 3 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython3", 322 | "version": "3.6.1" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 1 327 | } 328 | -------------------------------------------------------------------------------- /notebooks/Chapter 4 - Learning Dask Bags.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from dask import 
bag\n", 10 | "import json\n", 11 | "from bokeh.plotting import output_notebook\n", 12 | "output_notebook()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "Some of this notebook is taken from [the Dask Examples repository](https://github.com/dask/dask-examples/blob/master/github-on-ec2.ipynb)\n", 20 | "\n", 21 | "To gather the data, I ran this in my terminal from the `data` directory:\n", 22 | "\n", 23 | "`wget http://data.githubarchive.org/2016-01-01-{0..23}.json.gz\n", 24 | "wget http://data.githubarchive.org/2015-12-31-{0..23}.json.gz`\n", 25 | "\n", 26 | "This is not (by any means) big data, but is used for example" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "db = bag.read_text(['../data/2016*.json.gz', '../data/2015*.json.gz']).map(json.loads)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "db.count().compute()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "first = db.take(1)[0]\n", 54 | "first" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "tenth = db.take(10)[-1]\n", 64 | "tenth" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "%time db.pluck('type').frequencies().compute()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "import re\n", 83 | "time_pattern = re.compile('[\\d\\-]+T(?P[\\d]+)')\n", 84 | "\n", 85 | "pushes = db.filter(lambda x: x['type'] == 'PushEvent')\n", 86 | "hours = pushes.pluck('created_at').map(lambda x: re.search(time_pattern, x).group('hour'))\n", 87 | "top_10_hours = hours.frequencies().topk(10, key=lambda time, count: count)\n", 88 | "%time top_10_hours.compute()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "def get_hours(x):\n", 98 | " \"\"\"The key for foldby, like a groupby key. 
Get the hour from a PushEvent\"\"\"\n", 99 | " return re.search(time_pattern, x['created_at']).group('hour')\n", 100 | "\n", 101 | "def binop(total, x):\n", 102 | " \"\"\"Count the number of commits in a PushEvent\"\"\"\n", 103 | " return total + len(x['payload']['commits'])\n", 104 | "\n", 105 | "def combine(total1, total2):\n", 106 | " \"\"\"This combines commit counts from PushEvents\"\"\"\n", 107 | " return total1 + total2\n", 108 | "\n", 109 | "commits = pushes.foldby(get_hours, binop, initial=0, combine=combine)\n", 110 | "top_commits = commits.topk(10, key=lambda time, count: count)\n", 111 | "%time top_commits.compute()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "messages = pushes.pluck('payload').map(lambda x: ' '.join([c['message'].lower() for c in x['commits']]))\n", 121 | "top_10_words = messages.str.split().concat().frequencies().topk(10, lambda word, count: count)\n", 122 | "%time top_10_words.compute()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "If you haven't run `nltk` yet, you'll need to download your corpora. To do so, use this:\n", 130 | "\n", 131 | "`import nltk; nltk.download()`\n", 132 | "\n", 133 | "Follow the prompt and select (d) for Download and then type: `stopwords`\n", 134 | "\n", 135 | "Then you can use (q) to quit once the download is completed." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from nltk.corpus import stopwords" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "def get_combined_messages(x):\n", 156 | " long_str = ' '.join([c['message'].lower() for c in x['commits']])\n", 157 | " return ' '.join([w for w in long_str.split() if w not in stopwords.words('english')])" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "long_strs = pushes.pluck('payload').map(get_combined_messages)\n", 167 | "long_strs.take(5)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "top_20_words = long_strs.str.split().concat().frequencies().topk(20, lambda word, count: count)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "from dask.diagnostics import Profiler\n", 186 | "prof = Profiler()\n", 187 | "\n", 188 | "with prof:\n", 189 | " res = top_20_words.compute()\n", 190 | "\n", 191 | "prof.visualize()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "res" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | 
"nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.6.1" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 1 234 | } 235 | -------------------------------------------------------------------------------- /notebooks/Chapter 6 - Introduction to PySpark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Introduction to PySpark" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "" 21 | ] 22 | }, 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "sc" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "sqlCtx" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "20" 65 | ] 66 | }, 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "rdd = sc.parallelize(range(1000), 20) \n", 74 | "rdd.getNumPartitions()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "[0, 1, 2, 3, 4]" 88 | ] 89 | }, 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "rdd.take(5)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "text_rdd = sc.textFile(\n", 108 | " 'file:///Users/kjam/data-pipelines-course/data/europarl_speech_text.txt')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "PythonRDD[4] at RDD at PythonRDD.scala:48" 122 | ] 123 | }, 124 | "execution_count": 6, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "text_rdd.filter(lambda x: 'Deutschland' in x)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 7, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "'\"Ich begrüße die Gewährung eines finanziellen Beitrags aus dem Europäischen Fonds für die Anpassung an die Globalisierung, den die deutschen Behörden im Zuge der Entlassungen bei der Aleo Solar AG und zwei ihrer Tochtergesellschaften beantragt hatten.,Dieser Fonds wurde eingerichtet, um Arbeitnehmer, die unter den Folgen weitreichender Strukturveränderungen im Welthandelsgefüge zu leiden haben, zusätzlich zu unterstützen. 
Aus dem Fonds werden individuell angepasste Maßnahmen zur beruflichen Wiedereingliederung von Arbeitssuchenden finanziert, Schritte in die Selbständigkeit und Unternehmensgründungen gefördert, Mobilitätsbeihilfen sowie Beihilfen für benachteiligte oder ältere Arbeitnehmer gewährt.,China hat enorme Überkapazitäten bei Solarmodulen aufgebaut, die weder von den eigenen Verbrauchern noch vom Weltmarkt aufgenommen werden können. Zusammen mit dem weltweiten Rückgang der Nachfrage hat dies zu einem Preisverfall geführt. Da China ca. 80% seiner Produktion zu billigen Preisen nach Europa exportiert, war die Nachfrage nach China-Produkten größer als nach denen der deutschen Firma Aleo Solar. Dadurch lassen sich die 615 Entlassungen der Arbeitsgemeinschaft sowie zwei ihrer Tochtergesellschaften erklären.,Es ist deshalb erfreulich, dass die EU dem Antrag Deutschlands stattgegeben hat und Mittel zur Unterstützung der entlassenen Arbeitskräfte zur Verfügung stellt, in der Hoffnung, diesen durch die ergriffenen Maßnahmen zu dauerhaften, langfristigen und damit stabilen Beschäftigungsverhältnissen zu verhelfen.\"'" 144 | ] 145 | }, 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "text_rdd.filter(lambda x: 'Deutschland' in x).first()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 8, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "germany = text_rdd.filter(lambda x: 'Deutschland' in x)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 9, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "merkel = text_rdd.filter(lambda x: 'Merkel' in x)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 10, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "de_merkel = germany.union(merkel)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 11, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "UnionRDD[8] at union at NativeMethodAccessorImpl.java:-2" 199 | ] 200 | }, 201 | "execution_count": 11, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "de_merkel" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 12, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "890" 221 | ] 222 | }, 223 | "execution_count": 12, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "de_merkel.count()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 13, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "de_merkel = de_merkel.persist()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 14, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "pairs = de_merkel.map(lambda s: (s, 1))\n", 252 | "counts = pairs.reduceByKey(lambda a, b: a + b)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 15, 258 | "metadata": { 259 | "collapsed": false 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "[('\"Frau von Storch! 
Ich habe gesagt, dass es jetzt vor allem einmal um Kriegsflüchtlinge geht. Da war ich wirklich froh. Frau Merkel ist nicht von meiner Partei, und ich habe oft viel Kritik an ihr. Aber in dem Fall, als sie gesagt hat, dass in Deutschland Flüchtlinge aus Syrien nicht mehr nach Ungarn oder sonst wohin zurückgeschickt werden – das war richtig und gut. Und ich bin froh, dass auch mein Heimatland das endlich gemacht hat.,Um Kriegsflüchtlinge geht es hier, und für die brauchen wir Platz, für die brauchen wir tatsächlich auch Unterstützung. Aber Armutsflüchtlinge – wie viele sind denn aus Europa weggegangen, vor schrecklicher Armut aus Irland, auch aus meinem Land, auch aus Deutschland anderswohin geflüchtet, als die Zeiten schlecht waren? Dazu müssen wir auch beitragen, dass vor Ort Welthandelsstrukturen und ähnliches geändert werden, dass Menschen nicht mehr fliehen müssen. Aber wenn jemand flieht vor Klimakatastrophen, die wir mit verursachen, vor Armut, die wir mit verursachen, weil wir nicht genügend zahlen für Rohstoffe, dann muss man auch hier genau hinschauen und sehen, ob diese Leute daheim überhaupt überleben können oder nicht, oder ob sie nicht auch bei uns ein Recht haben zu leben.\"',\n", 266 | " 2),\n", 267 | " ('\"Ich habe heute für den Bericht über bildungs- und ausbildungspolitische Maßnahmen zum Abbau der Jugendarbeitslosigkeit gestimmt. Ein besonderes Augenmerk möchte ich als Europaabgeordneter der Familien-Partei Deutschlands dabei auf eine Zielgruppe lenken: Junge Mütter. Ihre Anliegen, wenn es um einen Start oder eine Rückkehr in den Arbeitsmarkt geht, müssen durch eine spezielle „skill policy“ gefördert werden, die auch die „skills“ sieht, anerkennt und grenzübergreifend vergleichbar macht, die Mütter und Väter in Erziehungszeit erwerben.\"',\n", 268 | " 1),\n", 269 | " ('\"– MrPresident, the EU is pandering to a country which is sliding ever closer to barbarism. The fact that this project is bribing Turkey with EUR2.2billion of EU taxpayers’ money, in a desperate bid to stem the migrant crisis of its own making, is a total political, economic and human disaster.,To stem this migration, the EU was willing to bend over backwards and ignore the war on the Kurds, the smuggling of oil from ISIS and the continual destruction of human rights in Turkey. Instead of dealing with the real issues of the Schengen zone and MsMerkel’s insane open doors policy, the EU is choosing to believe that a coup was attempted, when in reality it was set up by the Turkish Government to crack down on internal dissent.,I am only glad that we in the UK are getting out of the EU while we can, because this policy is madness. The cowardly, self-delusionary posturing of the EU towards Turkey is an utter disaster, and the accession process must end soon. It is an offence against both human rights and democracy. You are simply enabling a totalitarian regime, MsMogherini. Halt all talks now and stop giving bribes to them.\"',\n", 270 | " 1),\n", 271 | " ('\"Die Bewältigung des Flüchtlingsansturms auf Europa ist eine Aufgabe, die weder Deutschland, noch Ungarn, Dänemark oder Griechenland auf sich allein gestellt lösen kann. Die letzten Wochen haben gezeigt, dass nationale Alleingänge einzelner Mitgliedstaaten nur ins Chaos führen, gegenseitige Schuldzuweisungen bringen uns keinen Schritt weiter. Nur mit gemeinsamen europäischen Ansätzen können Lösungen erreicht werden. 
Kommissionspräsident Juncker hat richtigerweise an unsere gemeinsame humanitäre Verpflichtung zur Aufnahme von Menschen, die aus Angst um ihr Leben auf der Flucht sind, erinnert. Es ist dauerhaft nicht hinnehmbar, dass einige Mitgliedstaaten überhaupt keine Hilfe leisten und die Lasten einfach anderen überlassen. Deshalb führt kein Weg an einem Quotenmodell mit einem europäischen Verteilungsschlüssel vorbei. Außerdem müssen die bereits bestehenden Mindeststandards des Gemeinsamen Europäischen Asylsystems, zum Beispiel im Hinblick auf eine ordnungsgemäße Unterbringung und Behandlung der Flüchtlinge, in der gesamten EU wirksam angewendet werden. Um wirklich den Menschen helfen zu können, die unsere Hilfe am nötigsten haben, brauchen wir auch eine einheitliche Definition von sicheren Herkunftsländern in allen Mitgliedstaaten. Es muss allen klar sein, dass Asylanträge aus Ländern mit gefestigten Demokratien, wie etwa den Westbalkanländern, nicht akzeptiert werden können.\"',\n", 272 | " 1),\n", 273 | " ('\"Herr Präsident, liebe Kolleginnen und Kollegen, Frau Kommissarin! Normalerweise bin ich sehr stolz auf unsere deutsche Gründlichkeit. Aber in diesem Fall ist die deutsche Regierung weit über das Ziel hinausgeschossen mit ihrer Detailversessenheit. Es ist tatsächlich so, dass ich überhaupt nicht gegen Mindestlöhne als solche bin. Das hat die Frau Kommissarin gesagt: Darüber kann jedes Land selber entscheiden, das ist nationale Souveränität – vollkommen okay. In diesem Fall ist es aber das erste Mal – wir sind ja nicht das erste Land, das ein Mindestlohngesetz hat, ganz viele andere hatten es auch schon –, dass der Transportsektor, auch der Transitsektor, so detailversessen beschrieben worden ist, bis dahin – meine Kollegin hat es gesagt –, dass Faxe vorweg auf Deutsch geschickt werden sollen an die deutsche Regierung, die deutschen Behörden, wann genau welches Unternehmen sich wie lange auf deutschem Boden aufhält, und dass man sich versichern soll, dass man den deutschen Mindestlohn bezahlt. Meines Erachtens geht das deutlich übers Ziel hinaus, und es ist auch gegen europäisches Recht.,Die Prüfung ist noch nicht abgeschlossen, das haben Sie gesagt. Aber wie kann man denn für den Binnenmarkt sein, wo wir in den einzelnen Ländern unterschiedliche Lebensbedingungen haben, natürlich auch unterschiedliche Lohnniveaus. Das ist einfach so! Wir wollen auf keinen Fall, dass die LKW-Fahrer schlecht behandelt werden. Da liegt auch vieles im Argen. Das gebe ich genau so zu. Aber meines Erachtens – das hat auch Herr Ujhelyi gesagt, das betrifft auch die Petition und die Demonstration – ist dies der falsche Weg, um dem abzuhelfen. Da müssen wir wirklich anders vorgehen, um bessere Bedingungen für die LKW-Fahrer zu schaffen. Es ist tatsächlich so, dass meines Erachtens Deutschland damit ein Fass aufgemacht hat. Denn es ist das erste Mal, dass eben Logistik und Transport drin ist – eine europäische Politik.,Jetzt hat Frankreich nachgezogen. Frankreich will das jetzt genauso machen. Wenn andere Länder dazukommen, was soll denn dann bitte ein Spediteur machen, der durch fünf verschiedene Länder fährt? In Polen ist der Lohn relativ niedrig, bei 1,95 Euro oder auch ein bisschen mehr, in Luxemburg 11,10 Euro, in Frankreich 9,61 Euro, in Deutschland 8,50 Euro. Wie soll man das auseinanderdröseln, und wer soll das auch kontrollieren? Meines Erachtens geht das in diesem Fall ganz klar am Ziel, die Leute besserzustellen, vorbei und absolut gegen Europa und gegen den Binnenmarkt. 
Das können wir meines Erachtens überhaupt nicht dulden. Als Letztes: Ich befürchte auch, dass kleine mittelständische Unternehmen, die wir immer schützen wollen, nun gerade aufgeben müssen und Arbeitsplätzen verlustig gehen. Das ist auch nicht Europas Wille!\"',\n", 274 | " 1),\n", 275 | " ('\"In einer Situation, in der reiche Staaten wie Deutschland ihre Rentensysteme mittel- und langfristig nicht mehr finanzieren werden können, sind deutliche Reformschritte nötig. Statt wie Wolfgang Schäuble über die Erhöhung des Rentenalters nachzudenken, fordert ALFA eine detaillierte Überprüfung der Haushaltsausgaben, einschließlich des deutschen Beitrags in den EU-Haushalt.,ALFA setzt sich für einen effizienteren EU-Haushalt ein und wendet sich entschieden gegen die Verschwendung von Steuergeldern, die auf nationalstaatlicher ähnlich wie auf der EU-Ebene ein großes Problem ist. Eine solche Einstellung erwartet ALFA von dem ganzen Europaparlament, das die Entlastung zum EU-Budget erteilt.,Da die Entwürfe des Ausschusses für Haushaltskontrolle generell eine Reform des EU-Budgets nicht deutlich genug gefordert hatten, habe ich gegen die Entlastung bei den meisten EU-Haushaltslinien gestimmt. Dies betraf auch die Entlastung zum Gemeinsamen Unternehmen SESAR.\"',\n", 276 | " 2),\n", 277 | " ('\"In einer Situation, in der reiche Staaten wie Deutschland ihre Rentensysteme mittel- und langfristig nicht mehr finanzieren werden können, sind rasante Reformschritte nötig. Statt wie Wolfgang Schäuble über die Erhöhung des Rentenalters nachzudenken, fordert ALFA eine detaillierte Überprüfung der Haushaltsausgaben, einschließlich des deutschen Beitrags in den EU-Haushalt.,ALFA setzt sich für einen effizienteren EU-Haushalt ein und wendet sich entschieden gegen die Verschwendung von Steuergeldern, die auf nationalstaatlicher ähnlich wie auf der EU-Ebene ein großes Problem ist. Eine solche Einstellung erwartet ALFA vom ganzen Europäischen Parlament, das die Entlastung zum EU-Budget erteilt.,Da die Entwürfe des Ausschusses für Haushaltskontrolle generell eine Reform des EU-Budgets nicht stark genug gefordert hatten, habe ich gegen die Entlastung bei den meisten EU-Haushaltslinien gestimmt. Dies betraf auch die Entlastung zum Gemeinsamen Unternehmen Clean Sky.\"',\n", 278 | " 1),\n", 279 | " ('Ich habe heute für den Einwand gemäß Artikel 106: Genehmigung genetisch veränderter Sojabohnen der Sorte MON 87708 × MON 89788 gestimmt. Die Familien-Partei Deutschlands ist gegen den Einsatz gentechnisch manipulierten Saatgutes in der Landwirtschaft.',\n", 280 | " 2),\n", 281 | " ('\"Frau Präsidentin! Herr Pittella sprach eben davon, dass er sich wie in einem Film vorkäme. Ja, das gilt auch für mich. Nur habe ich das Gefühl, wir wären mitten in einem Filmriss und dieselben Szenen würden immer wieder vorgeführt. Genau diesen Eindruck habe ich. Das habe ich schon so oft in diesem Parlament gehört, diese Beschwörung dessen, was man tun muss. Ich sage ihnen: Wenn Herr Moscovici sagt, was Griechenland alles tun muss, das ist ein Entmündigungsprogramm! Und dann sagt er: Jetzt kommt Wachstum, jetzt kommen Investitionen, jetzt kommt Wettbewerbsfähigkeit. Nein, nichts davon kommt! Der Economist hat festgestellt: \"\",\"\". Schauen Sie doch mal die Statistiken nach! Warum ist in Deutschland die Zahl der Arbeitslosen von zehn Prozent auf fünf Prozent zurückgegangen, warum ist im Süden der Eurozone die Arbeitslosigkeit von zehn Prozent auf fünfzehn und mehr Prozent angestiegen? Weil der Euro ein falscher relativer Preis ist. Das ist alles! 
Man muss da ansetzen! Aber dieses Parlament geht nicht die Probleme an, sondern tanzt um das Goldene Kalb des Euro, ohne die Probleme in den Griff bekommen zu wollen.\"',\n", 282 | " 1),\n", 283 | " ('\"Frau Präsidentin! 1997 habe ich mit der damaligen Umweltministerin Angela Merkel die Umweltkonferenz in Kyoto besucht, und wir haben uns damals beide für spezifische und nicht für absolute CO2-Ziele eingesetzt. Inzwischen will Frau Merkel das Weltklima retten. Sie ist überhaupt beim Retten: Sie will den Euro retten, sie will Griechenland retten – gleich dreimal, sie will die Banken retten durch die Europäische Bankenunion zu Lasten der deutschen Sparkassen, und jetzt will sie die Weltflüchtlinge retten. Meine Damen und Herren! Immer mehr ausländische Kollegen auch in diesem Gremium zeigen und sagen mir, dass Frau Merkel und die Bundesregierung anscheinend an einem Helfersyndrom leiden. Das muss beendet werden!,Darf ich Ihnen bei dieser Gelegenheit auch nochmal sagen: Am deutschen Wesen soll Europa nicht weiter genesen. Im Übrigen – und das ist mein ceterum censeo – möchte ich darauf aufmerksam machen, dass der Einheitseuro Europa spaltet und abgeschafft werden muss.\"',\n", 284 | " 1)]" 285 | ] 286 | }, 287 | "execution_count": 15, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "counts.take(10)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "pairs = de_merkel.flatMap(lambda s: s.split(' ')).map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "pairs.take(4)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "pairs.sortBy(lambda k: k[1]).top(10)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "import re" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "pairs = de_merkel.flatMap(lambda s: re.findall('\\w+', s)).map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "my_df = pairs.sortBy(lambda p: p[1]).collect()" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": { 366 | "collapsed": false 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "my_df" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "type(my_df)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "collapsed": false 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "my_df[-40:]" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "collapsed": true 400 | }, 401 | "outputs": [], 402 | "source": 
[] 403 | } 404 | ], 405 | "metadata": { 406 | "kernelspec": { 407 | "display_name": "Python 3", 408 | "language": "python", 409 | "name": "python3" 410 | }, 411 | "language_info": { 412 | "codemirror_mode": { 413 | "name": "ipython", 414 | "version": 3 415 | }, 416 | "file_extension": ".py", 417 | "mimetype": "text/x-python", 418 | "name": "python", 419 | "nbconvert_exporter": "python", 420 | "pygments_lexer": "ipython3", 421 | "version": "3.4.4" 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 0 426 | } 427 | -------------------------------------------------------------------------------- /notebooks/Chapter 6 - Introduction to Spark Streaming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyspark.streaming import StreamingContext" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false, 19 | "scrolled": true 20 | }, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "" 26 | ] 27 | }, 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "sc" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "ssc = StreamingContext(sc, 60)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "lines = ssc.socketTextStream(\"0.0.0.0\", 9999)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "" 70 | ] 71 | }, 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "lines" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "words = lines.flatMap(lambda line: line.split(\" \"))" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "pairs = words.map(lambda word: (word, 1))\n", 101 | "wordCounts = pairs.reduceByKey(lambda x, y: x + y)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 8, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "wordCounts.pprint()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "-------------------------------------------\n", 127 | "Time: 2016-10-16 15:18:00\n", 128 | "-------------------------------------------\n", 129 | "\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "ssc.start() # Start the computation\n", 135 | "ssc.awaitTermination() " 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": true 143 | }, 144 | "outputs": [], 145 | "source": [] 146 | } 147 | ], 148 | "metadata": { 149 | "kernelspec": { 150 | "display_name": 
"Python 3", 151 | "language": "python", 152 | "name": "python3" 153 | }, 154 | "language_info": { 155 | "codemirror_mode": { 156 | "name": "ipython", 157 | "version": 3 158 | }, 159 | "file_extension": ".py", 160 | "mimetype": "text/x-python", 161 | "name": "python", 162 | "nbconvert_exporter": "python", 163 | "pygments_lexer": "ipython3", 164 | "version": "3.4.4" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 0 169 | } 170 | -------------------------------------------------------------------------------- /notebooks/Chapter 7 - Testing with Hypothesis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import sys\n", 12 | "sys.path.append('../celery_app')" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 4, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "from hypothesis import given, note, strategies as st\n", 24 | "from tasks import calc_ratio" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 12, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "@given(st.floats(), st.floats())\n", 36 | "def test_calc_ratio(p, c):\n", 37 | " ratio = calc_ratio(p, c)\n", 38 | " assert isinstance(ratio, float)\n", 39 | " assert -100 <= ratio <= 100\n", 40 | " assert len(str(ratio).split('.')) == 2\n", 41 | " assert len(str(ratio).split('.')[1]) == 2" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 13, 47 | "metadata": { 48 | "collapsed": false, 49 | "scrolled": true 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "Falsifying example: test_calc_ratio(p=0.0, c=0.0)\n" 57 | ] 58 | }, 59 | { 60 | "ename": "ZeroDivisionError", 61 | "evalue": "float division by zero", 62 | "output_type": "error", 63 | "traceback": [ 64 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 65 | "\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)", 66 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 67 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m 
\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 68 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mwrapped_test\u001b[0;34m(*arguments, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m reify_and_execute(\n\u001b[1;32m 523\u001b[0m \u001b[0msearch_strategy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mprint_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_final\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m ))\n\u001b[1;32m 526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mUnsatisfiedAssumption\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStopTest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 69 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/executors.py\u001b[0m in \u001b[0;36mdefault_new_style_executor\u001b[0;34m(data, function)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_new_style_executor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 70 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 109\u001b[0m lambda: 'Trying example: %s(%s)' % (\n\u001b[1;32m 110\u001b[0m test.__name__, arg_string(test, args, kwargs)))\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 71 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m(p, c)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 72 | "\u001b[0;32m/home/katharine/wrrk/my_classes/data_pipelines_course/celery_app/tasks.py\u001b[0m in \u001b[0;36mcalc_ratio\u001b[0;34m(price, compare)\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0mreturns\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m '''\n\u001b[0;32m---> 42\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mround\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprice\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mcompare\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 43\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 73 | "\u001b[0;31mZeroDivisionError\u001b[0m: float division by zero" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "test_calc_ratio()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 14, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "@given(st.floats(min_value=4), st.floats(min_value=4))\n", 90 | "def test_calc_ratio(p, c):\n", 91 | " ratio = calc_ratio(p, c)\n", 92 | " assert isinstance(ratio, float)\n", 93 | " assert -100 <= ratio <= 100\n", 94 | " assert len(str(ratio).split('.')) == 2\n", 95 | " assert len(str(ratio).split('.')[1]) == 2" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 15, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "Falsifying example: test_calc_ratio(p=4.0, c=4.0)\n" 110 | ] 111 | }, 112 | { 113 | "ename": "AssertionError", 114 | "evalue": "", 115 | "output_type": "error", 116 | "traceback": [ 117 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 118 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 119 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 120 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 121 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mwrapped_test\u001b[0;34m(*arguments, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m reify_and_execute(\n\u001b[1;32m 523\u001b[0m \u001b[0msearch_strategy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mprint_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_final\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m ))\n\u001b[1;32m 526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mUnsatisfiedAssumption\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStopTest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 122 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/executors.py\u001b[0m in \u001b[0;36mdefault_new_style_executor\u001b[0;34m(data, function)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_new_style_executor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 123 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 109\u001b[0m lambda: 'Trying example: %s(%s)' % (\n\u001b[1;32m 110\u001b[0m test.__name__, arg_string(test, args, kwargs)))\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m 
\u001b[0;32mreturn\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 124 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m(p, c)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 125 | "\u001b[0;31mAssertionError\u001b[0m: " 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "test_calc_ratio()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 16, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "'0'" 144 | ] 145 | }, 146 | "execution_count": 16, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "str(calc_ratio(4, 4)).split('.')[1]" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 19, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "@given(st.floats(min_value=4), st.floats(min_value=4))\n", 164 | "def test_calc_ratio(p, c):\n", 165 | " ratio = calc_ratio(p, c)\n", 166 | " assert isinstance(ratio, float)\n", 167 | " assert -100 <= ratio <= 100\n", 168 | " assert len(str(ratio).split('.')) == 2\n", 169 | " assert len(str(ratio).split('.')[1]) <= 2" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 20, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "Falsifying example: test_calc_ratio(p=8.000200000000001, c=4.0)\n" 184 | ] 185 | }, 186 | { 187 | "ename": "AssertionError", 188 | "evalue": "", 189 | "output_type": "error", 190 | "traceback": [ 191 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 192 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 193 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 194 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m 
\u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 195 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mwrapped_test\u001b[0;34m(*arguments, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m reify_and_execute(\n\u001b[1;32m 523\u001b[0m \u001b[0msearch_strategy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mprint_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_final\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m ))\n\u001b[1;32m 526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mUnsatisfiedAssumption\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStopTest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 196 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/executors.py\u001b[0m in \u001b[0;36mdefault_new_style_executor\u001b[0;34m(data, function)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_new_style_executor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 197 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 109\u001b[0m lambda: 'Trying example: %s(%s)' % (\n\u001b[1;32m 110\u001b[0m test.__name__, arg_string(test, args, kwargs)))\n\u001b[0;32m--> 111\u001b[0;31m 
\u001b[0;32mreturn\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 198 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m(p, c)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m100\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 199 | "\u001b[0;31mAssertionError\u001b[0m: " 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "test_calc_ratio()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 21, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "100.01" 218 | ] 219 | }, 220 | "execution_count": 21, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "calc_ratio(8.000200000000001, 4)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 22, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "@given(st.floats(min_value=4), st.floats(min_value=4))\n", 238 | "def test_calc_ratio(p, c):\n", 239 | " ratio = calc_ratio(p, c)\n", 240 | " assert isinstance(ratio, float)\n", 241 | " assert len(str(ratio).split('.')) == 2\n", 242 | " assert len(str(ratio).split('.')[1]) <= 2" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 23, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "Falsifying example: test_calc_ratio(p=400000000000004.0, c=4.0)\n" 257 | ] 258 | }, 259 | { 260 | "ename": "AssertionError", 261 | "evalue": "", 262 | "output_type": "error", 263 | "traceback": [ 264 | 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 265 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 266 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 267 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mgiven\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloats\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmin_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mtest_calc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 268 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mwrapped_test\u001b[0;34m(*arguments, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m reify_and_execute(\n\u001b[1;32m 523\u001b[0m \u001b[0msearch_strategy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mprint_example\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_final\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m ))\n\u001b[1;32m 526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mUnsatisfiedAssumption\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStopTest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 269 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/executors.py\u001b[0m in \u001b[0;36mdefault_new_style_executor\u001b[0;34m(data, function)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdefault_new_style_executor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfunction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 270 | "\u001b[0;32m/home/katharine/.virtualenv/data_pipelines/lib/python3.4/site-packages/hypothesis/core.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 109\u001b[0m lambda: 'Trying example: %s(%s)' % (\n\u001b[1;32m 110\u001b[0m test.__name__, arg_string(test, args, kwargs)))\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrun\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 271 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtest_calc_ratio\u001b[0;34m(p, c)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mratio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcalc_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mratio\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 272 | "\u001b[0;31mAssertionError\u001b[0m: " 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "test_calc_ratio()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 25, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "'1e+16'" 291 | ] 292 | }, 293 | "execution_count": 25, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "calc_ratio(400000000000004.0, 4.0)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 26, 305 | "metadata": { 306 | "collapsed": true 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "@given(st.floats(min_value=4, max_value=10000), st.floats(min_value=4, max_value=10000))\n", 311 | "def test_calc_ratio(p, c):\n", 312 | " ratio = calc_ratio(p, c)\n", 313 | " assert isinstance(ratio, float)\n", 314 | " assert len(str(ratio).split('.')) == 2\n", 
315 | " assert len(str(ratio).split('.')[1]) <= 2" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 27, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "test_calc_ratio()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": true 334 | }, 335 | "outputs": [], 336 | "source": [] 337 | } 338 | ], 339 | "metadata": { 340 | "kernelspec": { 341 | "display_name": "Python 3", 342 | "language": "python", 343 | "name": "python3" 344 | }, 345 | "language_info": { 346 | "codemirror_mode": { 347 | "name": "ipython", 348 | "version": 3 349 | }, 350 | "file_extension": ".py", 351 | "mimetype": "text/x-python", 352 | "name": "python", 353 | "nbconvert_exporter": "python", 354 | "pygments_lexer": "ipython3", 355 | "version": "3.4.3" 356 | } 357 | }, 358 | "nbformat": 4, 359 | "nbformat_minor": 1 360 | } 361 | -------------------------------------------------------------------------------- /notebooks/Extras (Chapter 4) - Clean Vehicle Theft Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "from datetime import datetime" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 12, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "df = pd.read_csv('/home/katharine/Downloads/datasets/mvt.csv')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 13, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/html": [ 36 | "
\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | "
DateLatitudeLongitude
012/31/12 23:1541.756284-87.621645
112/31/12 22:0041.898788-87.661303
212/31/12 22:0041.969186-87.767670
312/31/12 22:0041.769329-87.657726
412/31/12 21:3041.837568-87.621761
\n", 79 | "
" 80 | ], 81 | "text/plain": [ 82 | " Date Latitude Longitude\n", 83 | "0 12/31/12 23:15 41.756284 -87.621645\n", 84 | "1 12/31/12 22:00 41.898788 -87.661303\n", 85 | "2 12/31/12 22:00 41.969186 -87.767670\n", 86 | "3 12/31/12 22:00 41.769329 -87.657726\n", 87 | "4 12/31/12 21:30 41.837568 -87.621761" 88 | ] 89 | }, 90 | "execution_count": 13, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "df.head()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 14, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "df['DateTime'] = df['Date'].map(lambda d: datetime.strptime(d, '%m/%d/%y %H:%M'))" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 17, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "df['Hour'] = df['DateTime'].map(lambda d: d.hour)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 23, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "df['DayOfWeek'] = df['DateTime'].map(lambda d: d.weekday())" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 25, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "df['Date'] = df['DateTime'].map(lambda d: d.date())" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 26, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "Date object\n", 154 | "Latitude float64\n", 155 | "Longitude float64\n", 156 | "DateTime datetime64[ns]\n", 157 | "Hour int64\n", 158 | "DayOfWeek int64\n", 159 | "dtype: object" 160 | ] 161 | }, 162 | "execution_count": 26, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "df.dtypes" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 27, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/html": [ 181 | "
\n", 182 | "\n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | "
DateLatitudeLongitudeDateTimeHourDayOfWeek
02012-12-3141.756284-87.6216452012-12-31 23:15:00230
12012-12-3141.898788-87.6613032012-12-31 22:00:00220
22012-12-3141.969186-87.7676702012-12-31 22:00:00220
32012-12-3141.769329-87.6577262012-12-31 22:00:00220
42012-12-3141.837568-87.6217612012-12-31 21:30:00210
\n", 242 | "
" 243 | ], 244 | "text/plain": [ 245 | " Date Latitude Longitude DateTime Hour DayOfWeek\n", 246 | "0 2012-12-31 41.756284 -87.621645 2012-12-31 23:15:00 23 0\n", 247 | "1 2012-12-31 41.898788 -87.661303 2012-12-31 22:00:00 22 0\n", 248 | "2 2012-12-31 41.969186 -87.767670 2012-12-31 22:00:00 22 0\n", 249 | "3 2012-12-31 41.769329 -87.657726 2012-12-31 22:00:00 22 0\n", 250 | "4 2012-12-31 41.837568 -87.621761 2012-12-31 21:30:00 21 0" 251 | ] 252 | }, 253 | "execution_count": 27, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "df.head()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 29, 265 | "metadata": { 266 | "collapsed": true 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "df.to_csv('/home/katharine/Downloads/datasets/mvt_cleaned.csv', index=False)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.4.3" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 1 304 | } 305 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | airflow==1.7.1.3 2 | alembic==0.8.8 3 | amqp==1.4.9 4 | anyjson==0.3.3 5 | appnope==0.1.0 6 | asgi-redis==0.14.1 7 | asgiref==0.14.0 8 | autobahn==0.16.0 9 | Babel==1.3 10 | backports-abc==0.4 11 | billiard==3.3.0.23 12 | bitarray==0.8.1 13 | bokeh==0.12.1 14 | boto3==1.4.0 15 | botocore==1.4.49 16 | celery==3.1.23 17 | cffi==1.8.3 18 | channels==0.17.3 19 | chartkick==0.4.2 20 | click==6.6 21 | cloudpickle==0.2.1 22 | croniter==0.3.12 23 | cryptography==1.5.2 24 | daphne==0.15.0 25 | dask==0.10.2 26 | decorator==4.0.10 27 | dill==0.2.5 28 | distributed==1.11.3 29 | Django==1.10.2 30 | docutils==0.12 31 | entrypoints==0.2.2 32 | filechunkio==1.8 33 | Flask==0.10.1 34 | Flask-Admin==1.4.0 35 | Flask-Cache==0.13.1 36 | Flask-Login==0.2.11 37 | Flask-WTF==0.12 38 | flower==0.9.1 39 | funcsigs==0.4 40 | future==0.15.2 41 | futures==3.0.5 42 | gevent==1.1.2 43 | graphviz==0.4.10 44 | greenlet==0.4.10 45 | gunicorn==19.3.0 46 | h5py==2.6.0 47 | hive-thrift-py==0.0.1 48 | httplib2==0.9.2 49 | hypothesis==3.6.0 50 | idna==2.1 51 | impyla==0.13.8 52 | ipykernel==4.4.1 53 | ipython==5.1.0 54 | ipython-genutils==0.1.0 55 | ipywidgets==5.2.2 56 | itsdangerous==0.24 57 | Jinja2==2.8 58 | jmespath==0.9.0 59 | jsonschema==2.5.1 60 | jupyter==1.0.0 61 | jupyter-client==4.3.0 62 | jupyter-console==5.0.0 63 | jupyter-core==4.1.1 64 | kombu==3.0.35 65 | locket==0.2.0 66 | lockfile==0.12.2 67 | luigi==2.3.2 68 | Mako==1.0.4 69 | Markdown==2.6.7 70 | MarkupSafe==0.23 71 | mistune==0.7.3 72 | msgpack-python==0.4.8 73 | mysqlclient==1.3.9 74 | nbconvert==4.2.0 75 | nbformat==4.1.0 76 | nltk==3.2.1 77 | notebook==4.2.2 78 | numexpr==2.6.1 79 | numpy==1.11.1 80 | oauthlib==2.0.0 81 | pamela==0.2.1 82 | pandas==0.18.1 83 | pandas-datareader==0.2.1 84 | partd==0.3.6 85 | pexpect==4.2.0 86 | 
pickleshare==0.7.4
87 | ply==3.9
88 | prompt-toolkit==1.0.6
89 | psutil==4.3.0
90 | ptyprocess==0.5.1
91 | py==1.4.31
92 | pyasn1==0.1.9
93 | pyasn1-modules==0.0.8
94 | pycparser==2.14
95 | Pygments==2.1.3
96 | PyHive==0.2.1
97 | pytest==3.0.3
98 | python-daemon==2.1.1
99 | python-dateutil==2.5.3
100 | python-editor==1.0.1
101 | python-google-places==1.2.0
102 | pytz==2016.6.1
103 | PyYAML==3.11
104 | pyzmq==15.4.0
105 | qtconsole==4.2.1
106 | redis==2.10.5
107 | requests==2.11.1
108 | requests-file==1.4
109 | requests-oauthlib==0.7.0
110 | s3fs==0.0.7
111 | s3transfer==0.1.2
112 | scipy==0.18.0
113 | setproctitle==1.1.10
114 | simplegeneric==0.8.1
115 | simplejson==3.8.2
116 | six==1.10.0
117 | SQLAlchemy==1.0.14
118 | tables==3.2.3.1
119 | tblib==1.3.0
120 | terminado==0.6
121 | thrift==0.9.3
122 | thriftpy==0.3.9
123 | toolz==0.8.0
124 | tornado==4.2
125 | traitlets==4.2.2
126 | tweepy==3.5.0
127 | Twisted==16.4.1
128 | txaio==2.5.1
129 | unicodecsv==0.14.1
130 | uritemplate==0.6
131 | wcwidth==0.1.7
132 | Werkzeug==0.11.10
133 | widgetsnbextension==1.2.6
134 | WTForms==2.1
135 | zope.interface==4.3.2
136 | 
--------------------------------------------------------------------------------
/streaming/tweepy_stream.py:
--------------------------------------------------------------------------------
1 | """ Module to load tweets for spark streaming access. Modified only slightly
2 | from this SO answer: http://stackoverflow.com/questions/27882631/consuming-twitter-stream-with-tweepy-and-serving-content-via-websocket-with-geve"""
3 | from __future__ import absolute_import, print_function
4 | import gevent
5 | import gevent.monkey
6 | gevent.monkey.patch_all()
7 | from gevent.server import StreamServer
8 | 
9 | from tweepy.streaming import StreamListener
10 | from tweepy import OAuthHandler, Stream
11 | from configparser import ConfigParser
12 | from random import choice
13 | import json
14 | import os
15 | 
16 | CONFIG_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
17 |                                           '..', 'config/'))
18 | 
19 | 
20 | class SparkStreamListener(StreamListener):
21 |     """ Use twitter streaming API to stream to PySpark. """
22 |     def __init__(self):
23 |         config = ConfigParser()
24 |         config.read(os.path.join(CONFIG_DIR, 'prod.cfg'))
25 |         self.sockets = []
26 |         auth = OAuthHandler(config.get('twitter', 'consumer_key'),
27 |                             config.get('twitter', 'consumer_secret'))
28 |         auth.set_access_token(config.get('twitter', 'access_token'),
29 |                               config.get('twitter', 'access_token_secret'))
30 |         self.stream = Stream(auth, self)
31 | 
32 |     def add_socket(self, ws):
33 |         self.sockets.append(ws)
34 |         print(self.sockets)
35 | 
36 |     def run(self):
37 |         try:
38 |             self.stream.filter(track=['python'])
39 |         except Exception as e:
40 |             print(e)
41 |             self.stream.disconnect()
42 | 
43 |     def start(self):
44 |         """ Start the stream in a gevent greenlet. """
45 |         gevent.spawn(self.run)
46 | 
47 |     def send(self, status):
48 |         """ Send status to a connected socket. """
49 |         print(self.sockets)
50 |         if len(self.sockets) > 1:
51 |             ws = choice(self.sockets)
52 |         else:
53 |             ws = self.sockets[0]  # assumes at least one socket was added
54 |         try:
55 |             ws.send(status.encode('utf-8'))
56 |         except ValueError as e:
57 |             print(e)
58 |             # the web socket died; stop sending to it
59 |             self.sockets.remove(ws)
60 | 
61 |     def on_data(self, data):
62 |         decoded = json.loads(data)
63 |         gevent.spawn(self.send, decoded.get('text') + '\n')
64 |         return True
65 | 
66 |     def on_error(self, status):
67 |         print("Error: %s" % status)
68 | 
69 |     def on_timeout(self):
70 |         print("tweepy timeout; waiting 30 seconds")
71 |         gevent.sleep(30)
72 | 
73 | 
74 | def app(socket, address):
75 |     stream_listener = SparkStreamListener()
76 |     stream_listener.start()
77 |     stream_listener.add_socket(socket)
78 |     while not socket.closed:
79 |         gevent.sleep(0.1)
80 | 
81 | if __name__ == '__main__':
82 |     server = StreamServer(('0.0.0.0', 9999), app)
83 |     server.serve_forever()
84 | 
--------------------------------------------------------------------------------