├── .gitignore
├── Ch02
    ├── competitor_blog.py
    ├── fizz_buzz.py
    ├── network_wikipedia.py
    ├── parallel_blog_processing.py
    └── phonenumber_cleaning.py
├── Ch03
    ├── hacker_translate.py
    └── twitter_demographics.py
├── Ch04
    ├── A.txt
    ├── B.txt
    ├── even_numbers.py
    ├── generate_poems.py
    ├── lake_simulation.py
    ├── m_words.py
    ├── more_filters.py
    └── poem_puzzle.py
├── Ch05
    ├── car_profit.py
    ├── cars.json
    ├── evenfilter.py
    ├── frequencies.py
    ├── products.py
    ├── scrabble_scores.py
    └── summation.py
├── Ch06
    ├── imap.py
    ├── naivebayes.py
    ├── par-filter.py
    ├── par-freqs.py
    ├── par-sum.py
    ├── parallel-fold.py
    ├── starmap.py
    ├── timing-chunks.py
    └── timing.py
├── Ch07
    ├── FlorenceMachineCounts
    │   ├── ._SUCCESS.crc
    │   ├── .part-00000.crc
    │   ├── _SUCCESS
    │   └── part-00000
    ├── Florence_Machine.txt
    ├── Florence_Nightingale.txt
    ├── highest_scoring.py
    ├── large_words
    ├── score_words.py
    ├── spark_scores.py
    ├── wc_mapper.py
    └── wc_reducer.py
├── Ch08
    ├── .most-active-times.py.swp
    ├── 10klog.csv
    ├── command_elo
    ├── common-errors.py
    ├── elo-mapper.py
    ├── elo-reducer.py
    ├── serena_counter.py
    ├── williams-counter.py
    └── wta.tar.bz2.tar.bz2
├── Ch09
    ├── spark_losses.py
    ├── spark_scores.py
    ├── wikipedia_edges.txt
    └── wta_matches_2001.csv
├── Ch10
    ├── decision_trees.py
    ├── iris.csv
    ├── leads.txt
    ├── mushrooms.data
    └── random_forest.py
├── Ch11
    ├── .gitkeep
    └── s3_upload.py
├── Ch12
    ├── crashes_nb.py
    ├── emr-script-example.sh
    ├── emr_crash_counts.sh
    ├── mrjob_crash_counts.py
    ├── mrjob_emr_nb.sh
    ├── mrspark_bayes.py
    ├── nb_on_emr.sh
    ├── spark_bayes.py
    └── spark_mrjob.conf
├── README.md
├── notebooks
    ├── Ch02_notebook.ipynb
    ├── Ch03_notebook.ipynb
    ├── Ch04_notebook.ipynb
    ├── Ch05_notebook.ipynb
    ├── Ch06_notebook.ipynb
    ├── Ch07_notebook.ipynb
    ├── Ch09_notebook.ipynb
    ├── Ch10_notebook.ipynb
    └── Ch11_notebook.ipynb
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | .pycache
2 | bookenv/*
3 | */.ipynb_checkpoints/*
4 | 


--------------------------------------------------------------------------------
/Ch02/competitor_blog.py:
--------------------------------------------------------------------------------
 1 | from datetime import date
 2 | from urllib import request
 3 | from toolz import take
 4 | 
 5 | 
 6 | def days_between(start, stop):
 7 |     today = date(*start)
 8 |     stop = date(*stop)
 9 |     while today < stop:
10 |       yield "http://jtwolohan.com/evilblog/"+today.strftime("%m-%d-%Y")
11 |       today = date.fromordinal(today.toordinal()+1)
12 | 
13 | 
14 | def get_url(path):
15 |   return request.urlopen(path).read()
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     start = (2000, 1, 1)
20 |     stop = (2001, 1, 1)
21 |     xs = map(get_url, days_between(start,stop))
22 |     print(take(5,xs))
23 | 


--------------------------------------------------------------------------------
/Ch02/fizz_buzz.py:
--------------------------------------------------------------------------------
 1 | class FizzBuzzer:
 2 |   def __init__(self):
 3 |     self.n = 0
 4 |   def foo(self,_):
 5 |     self.n += 1
 6 |     if (self.n % 3)  == 0:
 7 |       x = "buzz"
 8 |     else: x = "fizz"
 9 |     print(x)
10 |     return x
11 | 
12 | FB = FizzBuzzer()
13 | for i in range(21):
14 |   FB.foo(i)
15 | 


--------------------------------------------------------------------------------
/Ch02/network_wikipedia.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from urllib import request, parse
 3 | from multiprocessing import Pool
 4 | from itertools import chain
 5 | import networkx as nx
 6 | 
 7 | def link_to_title(link):
 8 |   return link["title"]
 9 | 
10 | def clean_if_key(page,key):
11 |     if key in page.keys():
12 |         return map(link_to_title,page[key])
13 |     else: return []
14 | 
15 | def get_Wiki_links(pageTitle):
16 |     safe_title = parse.quote(pageTitle)
17 |     url = "https://en.wikipedia.org/w/api.php?action=query&\
18 | prop=links|linkshere&pllimit=500&lhlimit=500&titles={}&\
19 | format=json&formatversion=2".format(safe_title)
20 |     page = request.urlopen(url).read()
21 |     j = json.loads(page)
22 |     jpage = j['query']['pages'][0]
23 |     inbound = clean_if_key(jpage,"links")
24 |     outbound = clean_if_key(jpage,"linkshere")
25 |     return {"title": pageTitle,
26 |             "in-links":list(inbound),
27 |             "out-links":list(outbound)}
28 | 
29 | def flatten_network(page):
30 |     return page["in-links"]+page["out-links"]
31 | 
32 | def page_to_edges(page):
33 |     a = [(page['title'],p) for p in page['out-links']]
34 |     b = [(p,page['title']) for p in page['in-links']]
35 |     return a+b
36 | 
37 | if __name__ == "__main__":
38 |     root = get_Wiki_links("Parallel_computing")
39 |     initial_network = flatten_network(root)
40 |     with Pool() as P:
41 |         all_pages = P.map(get_Wiki_links, initial_network)
42 |         edges = P.map(page_to_edges, all_pages)
43 |     edges = chain.from_iterable(edges)
44 | 
45 |     G = nx.DiGraph()
46 |     for e in edges:
47 |         G.add_edge(*e)
48 |     nx.readwrite.gexf.write_gexf(G,"./MyGraph.gexf")
49 | 


--------------------------------------------------------------------------------
/Ch02/parallel_blog_processing.py:
--------------------------------------------------------------------------------
 1 | from datetime import date
 2 | from urllib import request
 3 | 
 4 | from multiprocessing import Pool
 5 | 
 6 | def days_between(start,stop):
 7 |   today = date(*start)
 8 |   stop = date(*stop)
 9 |   while today < stop:
10 |     datestr = today.strftime("%m-%d-%Y")
11 |     yield "http://jtwolohan.com/arch-rival-blog/"+datestr
12 |     today = date.fromordinal(today.toordinal()+1)
13 | 
14 | def get_url(path):
15 |   return request.urlopen(path).read()
16 | 
17 | 
18 | with Pool() as P:
19 |   blog_posts = P.map(get_url,days_between((2000,1,1),(2011,1,1)))
20 | 


--------------------------------------------------------------------------------
/Ch02/phonenumber_cleaning.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | class PhoneFormatter:
 4 |     def __init__(self):
 5 |         self.r = re.compile(r"\d")
 6 |         
 7 |     def pretty_format(self, phone_number):
 8 |         numbers = self.r.findall(phone_number)
 9 |         area_code = "".join(numbers[-10:-7])
10 |         first_3 = "".join(numbers[-7:-4])
11 |         last_4 = "".join(numbers[-4:len(numbers)])
12 |         return "({}) {}-{}".format(area_code, first_3, last_4)
13 | 
14 | if __name__ == "__main__":
15 |     phone_numbers = [
16 |         "(123) 456-7890",
17 |         "1234567890",
18 |         "123.456.7890",
19 |         "+1 123 456-7890"
20 |     ]
21 | 
22 |     P = PhoneFormatter()
23 | 
24 |     print(list(map(P.pretty_format, phone_numbers)))


--------------------------------------------------------------------------------
/Ch03/hacker_translate.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from toolz.functoolz import pipe, compose
 3 | 
 4 | sample_messages = [
 5 | "7his所is家4没s4mpl3动m3ss463",
 6 | "don7家73ll经4nyon3法7his现m3ss463",
 7 | "w3现4r3当b3in6进so好s3cr3t",
 8 | "733小h33成h33去nobody看is天on分7o理us",
 9 | "w3么will面n3v3r分637理c4u6ht",
10 | "w3事4r3经such没sn34ky天h4ckers"]
11 | 
12 | 
13 | def replace_7t(s):
14 |     return s.replace('7', 't')
15 | 
16 | 
17 | def replace_3e(s):
18 |     return s.replace('3', 'e')
19 | 
20 | 
21 | def replace_6g(s):
22 |     return s.replace('6', 'g')
23 | 
24 | 
25 | def replace_4a(s):
26 |     return s.replace('4', 'a')
27 | 
28 | 
29 | class chinese_matcher:
30 |     def __init__(self):
31 |         self.r = re.compile(r'[\u4e00-\u9fff]+')
32 |         
33 |     def sub_chinese(self,s):
34 |         return self.r.sub(" ",s)
35 | 
36 | 
37 | if __name__ == "__main__":
38 |     C = chinese_matcher()
39 | 
40 |     # Not chained
41 |     print(list(
42 |     map( C.sub_chinese,
43 |         map(replace_4a,
44 |             map(replace_6g,
45 |                 map(replace_3e,
46 |                     map(replace_7t, sample_messages)))))),end="\n\n")
47 | 
48 |     # Option 1
49 |     hacker_translate = compose(C.sub_chinese, replace_4a, replace_6g,
50 |                                replace_3e, replace_7t)
51 | 
52 |     print(list(map(hacker_translate, sample_messages)),end="\n\n")
53 | 
54 |     # Option 2
55 |     def hacker_translate(s):
56 |         return pipe(s, replace_7t, replace_3e, replace_6g,
57 |                        replace_4a, C.sub_chinese)
58 | 
59 |     print(list(map(hacker_translate,sample_messages)),end="\n\n")
60 | 


--------------------------------------------------------------------------------
/Ch03/twitter_demographics.py:
--------------------------------------------------------------------------------
 1 | from multiprocessing import Pool
 2 | from toolz import compose, pipe
 3 | import twitter
 4 | 
 5 | Twitter = twitter.Api(consumer_key="",
 6 |                       consumer_secret="",
 7 |                       access_token_key="",
 8 |                       access_token_secret="")
 9 | 
10 | 
11 | def get_tweet_from_id(tweet_id, api=Twitter):
12 |     return api.GetStatus(tweet_id, trim_user=True)
13 | 
14 | 
15 | def tweet_to_text(tweet):
16 |     return tweet.text
17 | 
18 | 
19 | def tokenize_text(text):
20 |     return text.split()
21 | 
22 | 
23 | def score_text(tokens):
24 |     words = {"the":1, "to":1, "and":1, #Words with 1 indicate men
25 |              "in":1, "have":1, "it":1,
26 |              "be":-1, "of":-1, "a":-1, # Words with -1 indicate women
27 |              "that":-1, "i":-1, "for":-1}
28 |     return sum(map(lambda x: words.get(x, 0), tokens))
29 | 
30 | 
31 | def score_tweet(tweet_id):
32 |     return pipe(tweet_id, get_tweet_from_id, tweet_to_text,
33 |                           tokenize_text, score_text)
34 | 
35 | 
36 | def score_user(tweets):
37 |     N = len(tweets)
38 |     total = sum(map(score_tweet, tweets))
39 |     return total/N
40 | 
41 | 
42 | def categorize_user(user_score):
43 |     if user_score > 0:
44 |         return {"score":user_score,
45 |                 "gender": "Male"}
46 |     return {"score":user_score,
47 |             "gender":"Female"}
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     users_tweets = [
52 |     [1056365937547534341, 1056310126255034368, 1055985345341251584,
53 |     1056585873989394432, 1056585871623966720],
54 |     [1055986452612419584, 1056318330037002240, 1055957256162942977,
55 |      1056585921154420736, 1056585896898805766],
56 |     [1056240773572771841, 1056184836900175874, 1056367465477951490,
57 |      1056585972765224960, 1056585968155684864],
58 |     [1056452187897786368, 1056314736546115584, 1055172336062816258,
59 |      1056585983175602176, 1056585980881207297]]
60 |     gender_prediction_pipeline = compose(categorize_user, score_user)
61 |     with Pool() as P:
62 |         print(P.map(gender_prediction_pipeline, users_tweets))
63 | 


--------------------------------------------------------------------------------
/Ch04/even_numbers.py:
--------------------------------------------------------------------------------
1 | def even_numbers(n):
2 |     i = 1
3 |     while i <= n:
4 |         yield i*2
5 |         i += 1
6 | 
7 | first_100_even = (i*2 for i in range(1,101))
8 | 


--------------------------------------------------------------------------------
/Ch04/generate_poems.py:
--------------------------------------------------------------------------------
 1 | import re, os, glob
 2 | from functools import reduce
 3 | from random import randint, choice
 4 | from multiprocessing import Pool
 5 | from math import floor
 6 | 
 7 | class ContentMatcher:
 8 |     def __init__(self):
 9 |         self.r = re.compile(r'[A-Z\W-]+')
10 |     def is_content(self,l):
11 |         if self.r.fullmatch(l):
12 |             return False
13 |         else: return True
14 | 
15 | def line_to_thirds(l):
16 |   words = l.split()
17 |   n = len(words)
18 |   breakpoint = floor(n / 3)
19 |   return {"first": " ".join(words[:breakpoint]),
20 |          "second": " ".join(words[breakpoint:breakpoint*2]),
21 |          "third": " ".join(words[breakpoint*2:])}
22 | 
23 | def join_breaks(acc,nxt):
24 |     return {k:v+[nxt[k]] for k,v in acc.items()}
25 | 
26 | def consolidate_content(fp,R):
27 |     with open(fp) as f:
28 |         with Pool() as P:
29 |             content = P.map(line_to_thirds, filter(R.is_content, f.readlines()))
30 |     return reduce(join_breaks,
31 |                   content,
32 |                   {"first":[],"second":[],"third":[]})
33 | 
34 | def make_line(parts):
35 |     return " ".join([choice(parts['first']),
36 |                     choice(parts["second"]),
37 |                     choice(parts["third"])])
38 | 
39 | def write_poem(parts,name,i):
40 |     fp = "{}/poem_{}.txt".format(name,i)
41 |     num_lines = randint(7,40)
42 |     lines = (make_line(parts) for _ in range(num_lines))
43 |     with open(fp,"w") as f:
44 |         f.write("\n".join(lines))
45 | 
46 | def calc_total_size():
47 |     paths = glob.iglob("./author*/*")
48 |     return sum(map(os.path.getsize,paths))
49 | 
50 | def generate_poems(a, b, max_size=10000000):
51 |     try:
52 |         os.mkdir("author_a")
53 |         os.mkdir("author_b")
54 |     except FileExistsError:
55 |         pass
56 |     i = 1
57 |     #while calc_total_size() < max_size:
58 |     for _ in range(floor(max_size/1000)):
59 |         write_poem(a,"author_a",i)
60 |         write_poem(b,"author_b",i)
61 |         i+=1
62 | 
63 | if __name__ == "__main__":
64 |     CM = ContentMatcher()
65 |     author_a = consolidate_content("A.txt",CM)
66 |     author_b = consolidate_content("B.txt",CM)
67 |     generate_poems(author_a, author_b)
68 | 


--------------------------------------------------------------------------------
/Ch04/lake_simulation.py:
--------------------------------------------------------------------------------
 1 | import random, itertools
 2 | from operator import methodcaller
 3 | 
 4 | 
 5 | class Village:
 6 |   def __init__(self):
 7 |     self.population = random.uniform(1000,5000)
 8 |     self.cheat_rate = random.uniform(.05,.15)
 9 | 
10 |   def update(self, sim):
11 |     if sim.cheaters >= 2:
12 |       self.cheat_rate += .05
13 |     self.population = int(self.population*1.025)
14 | 
15 |   def go_fishing(self):
16 |     if random.uniform(0,1) < self.cheat_rate:
17 |       cheat = 1
18 |       fish_taken = self.population * 2
19 |     else:
20 |       cheat = 0
21 |       fish_taken = self.population * 1
22 |     return fish_taken, cheat
23 | 
24 | 
25 | class LakeSimulation:
26 |   def __init__(self):
27 |     self.villages = [Village() for _ in range(4)]
28 |     self.fish = 80000
29 |     self.year = 1
30 |     self.cheaters = 0
31 | 
32 |   def simulate(self):
33 |     for _ in itertools.count():
34 |         yearly_results = map(methodcaller("go_fishing"), self.villages)
35 |         fishs, cheats = zip(*yearly_results)
36 |         total_fished = sum(fishs)
37 |         self.cheaters = sum(cheats)
38 |         if self.year > 1000:
39 |             print("Wow! Your villages lasted 1000 years!")
40 |             break
41 |         if self.fish < total_fished:
42 |             print("The lake was overfished in {} years.".format(self.year))
43 |             break
44 |         else:
45 |             self.fish = (self.fish-total_fished)* 1.15
46 |             map(methodcaller("update"), self.villages)
47 |             print("Year {:<5}   Fish: {}".format(self.year,
48 |                                                  int(self.fish)))
49 |             self.year += 1
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     random.seed("Wolohan")
54 |     Lake = LakeSimulation()
55 |     Lake.simulate()
56 | 


--------------------------------------------------------------------------------
/Ch04/m_words.py:
--------------------------------------------------------------------------------
 1 | words = ["apple","mongoose","walk","mouse","good",
 2 |          "pineapple","yeti","minnesota","mars",
 3 |          "phone","cream","cucumber","coffee","elementary",
 4 |          "sinister","science","empire"]
 5 | 
 6 | def contains_m(s):
 7 |     if "m" in s.lower(): return True
 8 |     else: return False
 9 | 
10 | m_words = filter(contains_m, words)
11 | 
12 | next(m_words)
13 | next(m_words)
14 | next(m_words)
15 | 
16 | print(list(m_words))
17 | # [“mars”,”cream”,”cucumber”,”elementary”, ... ]
18 | 


--------------------------------------------------------------------------------
/Ch04/more_filters.py:
--------------------------------------------------------------------------------
 1 | from itertools import filterfalse
 2 | from toolz.dicttoolz import keyfilter, valfilter, itemfilter
 3 | 
 4 | def is_even(x):
 5 |     if x % 2 == 0: return True
 6 |     else: return False
 7 | 
 8 | def both_are_even(x):
 9 |     k,v = x
10 |     if is_even(k) and is_even(v): return True
11 |     else: return False
12 | 
13 | print(list(filterfalse(is_even, range(10))))
14 | # [1, 3, 5, 7, 9]
15 | 
16 | print(list(keyfilter(is_even, {1:2, 2:3, 3:4, 4:5, 5:6})))
17 | # [2, 4]
18 | 
19 | print(list(valfilter(is_even, {1:2, 2:3, 3:4, 4:5, 5:6})))
20 | # [1, 3, 5]
21 | 
22 | print(list(itemfilter(both_are_even, {1:5, 2:4, 3:3, 4:2, 5:1})))
23 | # [2, 4]
24 | 


--------------------------------------------------------------------------------
/Ch04/poem_puzzle.py:
--------------------------------------------------------------------------------
 1 | import toolz
 2 | import re, itertools
 3 | from glob import iglob
 4 | 
 5 | 
 6 | def word_ratio(d):
 7 |     """This helper function returns the ratio of a's to the's"""
 8 |     return float(d.get("a",0))/float(d.get("the",0.0001))
 9 | 
10 | 
11 | class PoemCleaner:
12 |     def __init__(self):
13 |         self.r = re.compile(r'[.,;:!-]')
14 | 
15 |     def clean_poem(self, fp):
16 |         """This helper function opens a poem at a filepath and returns a clean poem.
17 | 
18 |         A clean poem will be a punctuation-less sequence of lowercase words, in
19 |         the order that the author of the poem placed them.
20 |         """
21 |         with open(fp) as poem:
22 |             no_punc = self.r.sub("",poem.read())
23 |             return no_punc.lower().split()
24 | 
25 | 
26 | def word_is_desired(w):
27 |     """This helper function detects whether a word is "a" or "the".
28 | 
29 |     It is designed to be used in conjunction with filter to filter a sequence
30 |     of words down to just definite and indefinite articles.
31 |     """
32 |     if w in ["a","the"]:
33 |         return True
34 |     else:
35 |         return False
36 | 
37 | 
38 | def analyze_poems(poems, cleaner):
39 |     return word_ratio(
40 |         toolz.frequencies(
41 |             filter(word_is_desired,
42 |                 itertools.chain(*map(cleaner.clean_poem, poems)))))
43 | 
44 | 
45 | if __name__ == "__main__":
46 | 
47 |     Cleaner = PoemCleaner()
48 |     author_a_poems = iglob("author_a/*.txt")
49 |     author_b_poems = iglob("author_b/*.txt")
50 | 
51 |     author_a_ratio = analyze_poems(author_a_poems, Cleaner)
52 |     author_b_ratio = analyze_poems(author_b_poems, Cleaner)
53 | 
54 |     print("""
55 |     Original_Poem:  0.3
56 |     Author A:     {:.2f}
57 |     Author B:     {:.2f}
58 |     """.format(author_a_ratio, author_b_ratio))
59 | 


--------------------------------------------------------------------------------
/Ch05/car_profit.py:
--------------------------------------------------------------------------------
 1 | from functools import reduce
 2 | 
 3 | 
 4 | def low_med_hi(d, k, breaks):
 5 |     if float(d[k]) < breaks[0]:
 6 |         return "low"
 7 |     elif float(d[k]) < breaks[1]:
 8 |         return "medium"
 9 |     else:
10 |         return "high"
11 | 
12 | 
13 | def clean_entry(d):
14 |     r = {'profit':None, 'mpg':None, 'odo':None}
15 |     r['profit'] = float(d.get("price-sell", 0)) - float(d.get("price-buy", 0))
16 |     r['mpg'] = low_med_hi(d, 'mpg', (18, 35))
17 |     r['odo'] = low_med_hi(d, 'odo', (60000, 105000))
18 |     return r
19 | 
20 | 
21 | def acc_average(acc, profit):
22 |     acc['total'] = acc.get('total', 0) + profit
23 |     acc['count'] = acc.get('count', 0) + 1
24 |     acc['average'] = acc['total']/acc['count']
25 |     return acc
26 | 
27 | 
28 | def sort_and_add(acc, nxt):
29 |     p = nxt['profit']
30 |     acc['mpg'][nxt['mpg']] = acc_average(acc['mpg'].get(nxt['mpg'], {}), p)
31 |     acc['odo'][nxt['odo']] = acc_average(acc['odo'].get(nxt['odo'], {}), p)
32 |     return acc
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     import json
37 |     with open("cars.json") as f:
38 |         xs = json.load(f)
39 |     results = reduce(sort_and_add, map(clean_entry, xs), {"mpg": {}, "odo": {}})
40 |     print(json.dumps(results, indent=4))
41 | 


--------------------------------------------------------------------------------
/Ch05/evenfilter.py:
--------------------------------------------------------------------------------
 1 | from functools import reduce
 2 | 
 3 | xs = [1, 2, 3, 4, 5, 6, 7, 8, 9]
 4 | 
 5 | 
 6 | def keep_if_even(acc, nxt):
 7 |     if nxt % 2 == 0:
 8 |         return acc + [nxt]
 9 |     else:
10 |         return acc
11 | 
12 | 
13 | reduce(keep_if_even, xs, [])
14 | 


--------------------------------------------------------------------------------
/Ch05/frequencies.py:
--------------------------------------------------------------------------------
 1 | from functools import reduce
 2 | 
 3 | xs = ["A", "B", "C", "A", "A", "C", "A"]
 4 | ys = [1, 3, 6, 1, 2, 9, 3, 12]
 5 | 
 6 | 
 7 | def make_counts(acc, nxt):
 8 |     acc[nxt] = acc.get(nxt, 0) + 1
 9 |     return acc
10 | 
11 | 
12 | def my_frequencies(xs):
13 |     return reduce(make_counts, xs, {})
14 | 
15 | 
16 | print(my_frequencies(xs))
17 | print(my_frequencies(ys))
18 | print(my_frequencies("mississippi"))
19 | 


--------------------------------------------------------------------------------
/Ch05/products.py:
--------------------------------------------------------------------------------
 1 | from functools import reduce
 2 | 
 3 | my_products = [
 4 |     {"price": 9.99,
 5 |      "sn": '00231'},
 6 |     {"price": 59.99,
 7 |      "sn": '11010'},
 8 |     {"price": 74.99,
 9 |      "sn": '00013'},
10 |     {"price": 19.99,
11 |      "sn": '00831'},
12 | ]
13 | 
14 | reduce(lambda acc, nxt: acc+nxt.get("price", 0), my_products, 0)
15 | 


--------------------------------------------------------------------------------
/Ch05/scrabble_scores.py:
--------------------------------------------------------------------------------
 1 | from functools import reduce
 2 | 
 3 | 
 4 | def score_word(word):
 5 |     points = 0
 6 |     for char in word:
 7 |         if char == "z": points += 10
 8 |         elif char in ["f", "h", "v", "w"]: points += 5
 9 |         elif char in ["b", "c", "m", "p"]: points += 3
10 |         else: points += 1
11 |     return points
12 | 
13 | 
14 | words = ["these", "are", "my", "words"]
15 | 
16 | total_score = reduce(lambda acc,nxt: acc+nxt, map(score_word, words))
17 | print(total_score)
18 | 


--------------------------------------------------------------------------------
/Ch05/summation.py:
--------------------------------------------------------------------------------
 1 | from functools import reduce
 2 | 
 3 | xs = [10, 5, 1, 19, 11, 203]
 4 | 
 5 | 
 6 | def my_add(acc, nxt):
 7 |     return acc + nxt
 8 | 
 9 | 
10 | print(reduce(my_add, xs, 0))
11 | 
12 | # With a lambda instead:
13 | print(reduce(lambda acc, nxt: acc+nxt, xs, 0))
14 | 


--------------------------------------------------------------------------------
/Ch06/imap.py:
--------------------------------------------------------------------------------
 1 | from multiprocessing import Pool
 2 | 
 3 | 
 4 | def increase(x):
 5 |   return x+1
 6 | 
 7 | 
 8 | with Pool() as P:
 9 |   a = P.map(increase, range(100))
10 | 
11 | 
12 | with Pool() as P:
13 |   b = P.imap(increase, range(100))
14 | 
15 | 
16 | with Pool() as P:
17 |   c = P.imap_unordered(increase, range(100))
18 | 
19 | print(a)
20 | print(b)
21 | print(c)
22 | 


--------------------------------------------------------------------------------
/Ch06/naivebayes.py:
--------------------------------------------------------------------------------
 1 | from itertools import starmap, repeat
 2 | from functools import reduce, partial
 3 | import dill as pickle
 4 | from toolz.sandbox.parallel import fold
 5 | from pathos.multiprocessing import ProcessingPool as PathosPool
 6 | from multiprocessing import Pool
 7 | from csv import DictReader
 8 | 
 9 | def unique_keys(left, right):
10 |     return set(left.keys()).union(set(right.keys()))
11 | 
12 | def prod(xs):
13 |     return reduce(lambda acc,nxt: acc*nxt, xs)
14 | 
15 | def compute_prob(model, k, v, label, N):
16 |     Cn = model['LABELS'][label]
17 |     prior = Cn / N
18 |     evidence = model[k][v].get(label,.001) / Cn
19 |     return prior * evidence
20 | 
21 | def _nb_suggest(ob, model, target):
22 |     ob.pop(target)
23 |     N = sum(model['LABELS'].values())
24 |     results = {}
25 |     for label in model['LABELS'].keys():
26 |         p = prod(compute_prob(model, k, v, label, N) for k, v in ob.items())
27 |         results[label] = p
28 |     return results
29 | 
30 | def naive_bayes_suggest(obs, model, target):
31 |     with Pool() as P:
32 |         f = partial(_nb_suggest, target=target)
33 |         return P.starmap(f, zip(obs, repeat(model)))
34 | 
35 | def nb_acc(acc, nxt, target):
36 |     label = nxt.pop(target)
37 |     if not acc.get('LABELS', False):
38 |         acc['LABELS'] = {}
39 |     acc['LABELS'][label] = acc['LABELS'].get(label,0) + 1
40 |     for k,v in nxt.items():
41 |         if not acc.get(k,False):
42 |             acc[k] = {}
43 |         if not acc[k].get(v, False):
44 |             acc[k][v] = {}
45 |         acc[k][v][label] = acc.get(k,{}).get(v,{}).get(label,0) + 1
46 |     return acc
47 | 
48 | def _nb_comb(left, right):
49 |     acc = {}
50 |     acc['LABELS'] = {}
51 |     for k in unique_keys(left['LABELS'], right['LABELS']):
52 |         acc['LABELS'][k] = left['LABELS'].get(k,0) + right['LABELS'].get(k,0)
53 |     for k in unique_keys(left, right):
54 |         if k == 'LABELS': continue
55 |         acc[k] = {}
56 |         for v in unique_keys(left.get(k,{}), right.get(k,{})):
57 |             acc[k][v] = {}
58 |             for label in acc['LABELS']:
59 |                 count_left = left.get(k,{}).get(v,{}).get(label,0)
60 |                 count_right = right.get(k,{}).get(v,{}).get(label,0)
61 |                 acc[k][v][label] = count_left + count_right
62 |     return acc
63 | 
64 | def naive_bayes(xs, target):
65 |     acc = partial(nb_acc, target=target)
66 |     with PathosPool() as P:
67 |         model = fold(acc, xs, {}, map=P.map, combine=_nb_comb)
68 |     return partial(naive_bayes_suggest, model=model, target=target)
69 | 
70 | def max_prob(probs):
71 |     return max(((k,v) for k,v in probs.items()), key=lambda x:x[1])[0]
72 | 
73 | if __name__ == "__main__":
74 |     # Download the nursery data and assign its path to fp
75 |     # https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data
76 |     fp = "/home/jt-w/Downloads/nursery.data"
77 |     with open(fp) as f:
78 |         reader = DictReader(f, fieldnames=["parents", "has_nurs", "form",
79 |                                      "children", "housing", "finance",
80 |                                      "social", "health", "recc"])
81 |         data = [row for row in reader]
82 | 
83 |     model = naive_bayes(data, "recc")
84 |     probs = model(data)
85 |     print("{}\t\t{}\t{}".format("Match", "Suggestion", "Actual"))
86 |     print("{}".format("-"*45))
87 |     for i,p in enumerate(probs):
88 |         suggestion = max_prob(p)
89 |         actual = data[i]['recc']
90 |         match = suggestion == actual
91 |         print("{}\t\t{}\t{}".format(match, suggestion, actual))
92 |         if i > 25: break
93 | 


--------------------------------------------------------------------------------
/Ch06/par-filter.py:
--------------------------------------------------------------------------------
 1 | from pathos.multiprocessing import ProcessingPool as Pool
 2 | from toolz.sandbox.parallel import fold
 3 | from functools import reduce
 4 | 
 5 | 
 6 | def map_combination(left, right):
 7 |   return left + right
 8 | 
 9 | 
10 | def keep_if_even(acc, nxt):
11 |     if nxt % 2 == 0:
12 |         return acc + [nxt]
13 |     else: return acc
14 | 
15 | 
16 | with Pool() as P:
17 |     fold(keep_if_even, range(500000), [],
18 |          map=P.imap, combine=map_combination)
19 | 
20 | print(reduce(keep_if_even, range(500), []))
21 | 
22 | 


--------------------------------------------------------------------------------
/Ch06/par-freqs.py:
--------------------------------------------------------------------------------
 1 | from pathos.multiprocessing import ProcessingPool as Pool
 2 | from toolz.sandbox.parallel import fold
 3 | from random import choice
 4 | from functools import reduce
 5 | 
 6 | 
 7 | def combine_counts(left, right):
 8 |   unique_keys = set(left.keys()).union(set(right.keys()))
 9 |   return {k:left.get(k, 0)+right.get(k, 0) for k in unique_keys}
10 | 
11 | 
12 | def make_counts(acc, nxt):
13 |     acc[nxt] = acc.get(nxt,0) + 1
14 |     return acc
15 | 
16 | 
17 | xs = (choice([1, 2, 3, 4, 5, 6]) for _ in range(500000))
18 | 
19 | with Pool() as P:
20 |     fold(make_counts, xs, {},
21 |          map=P.imap, combine=combine_counts)
22 | 
23 | print(reduce(make_counts, (choice([1, 2, 3, 4, 5, 6]) for _ in range(500)), {}))
24 | 


--------------------------------------------------------------------------------
/Ch06/par-sum.py:
--------------------------------------------------------------------------------
 1 | from pathos.multiprocessing import ProcessingPool as Pool
 2 | from toolz.sandbox.parallel import fold
 3 | from functools import reduce
 4 | 
 5 | 
 6 | def my_add(left, right):
 7 |   return left+right
 8 | 
 9 | 
10 | with Pool() as P: 
11 |     fold(my_add, range(500000), map=P.imap)
12 | 
13 | print(reduce(my_add, range(500)))


--------------------------------------------------------------------------------
/Ch06/parallel-fold.py:
--------------------------------------------------------------------------------
 1 | import dill as pickle
 2 | from toolz.sandbox.parallel import fold
 3 | from pathos.multiprocessing import ProcessingPool as Pool
 4 | from random import choice
 5 | 
 6 | N = 100000
 7 | P = Pool()
 8 | 
 9 | # Parallel summation
10 | def my_add(left, right):
11 |   return left+right
12 | 
13 | xs = range(N)
14 | 
15 | print(fold(my_add, xs, map=P.imap))
16 | 
17 | # Parallel filter
18 | def map_combination(left, right):
19 |   return left + right
20 | 
21 | def keep_if_even(acc, nxt):
22 |     if nxt % 2 == 0:
23 |         return acc + [nxt]
24 |     else: return acc
25 | 
26 | print(fold(keep_if_even, xs, [], map=P.imap, combine=map_combination))
27 | 
28 | #Parallel frequencies
29 | def combine_counts(left, right):
30 |   unique_keys = set(left.keys()).union(set(right.keys()))
31 |   return {k:left.get(k,0)+right.get(k,0) for k in unique_keys}
32 | 
33 | def make_counts(acc, nxt):
34 |     acc[nxt] = acc.get(nxt,0) + 1
35 |     return acc
36 | 
37 | xs = (choice([1,2,3,4,5,6]) for _ in range(N))
38 | 
39 | print(fold(make_counts, xs, {}, map=P.imap, combine=combine_counts))
40 | 


--------------------------------------------------------------------------------
/Ch06/starmap.py:
--------------------------------------------------------------------------------
 1 | from itertools import starmap
 2 | xs = [7, 3, 1, 19, 11]
 3 | ys = [8, 1, -3, 14, 22]
 4 | 
 5 | loop_maxes = [max(ys[i], x) for i, x in enumerate(xs)]
 6 | map_maxes = list(starmap(max, zip(xs, ys)))
 7 | 
 8 | print(loop_maxes)
 9 | # [8, 3, 1, 19, 22]
10 | print(map_maxes)
11 | # [8, 3, 1, 19, 22]
12 | 


--------------------------------------------------------------------------------
/Ch06/timing-chunks.py:
--------------------------------------------------------------------------------
 1 | from time import clock
 2 | from multiprocessing import Pool
 3 | 
 4 | 
 5 | def times_two(x):
 6 |   return x*2+7
 7 | 
 8 | 
 9 | def parallel_map(xs, chunk_size=8500):
10 |   with Pool(2) as P:
11 |     x = P.map(times_two, xs, chunk_size)
12 |   return x
13 | 
14 | 
15 | print("""
16 | {:<10}  |  {}
17 | -------------------------""".format("chunksize", "runtime"))
18 | 
19 | for i in range(0, 9):
20 |   N = 1000000
21 |   chunk_size = 5 * (10**i)
22 | 
23 |   t1 = clock()
24 |   parallel_map(range(N), chunk_size)
25 |   parallel_time = clock() - t1
26 | 
27 |   print("{:<10}  {:>0.3f}".format(chunk_size, parallel_time))
28 | 


--------------------------------------------------------------------------------
/Ch06/timing.py:
--------------------------------------------------------------------------------
 1 | from time import clock, sleep
 2 | from multiprocessing import Pool
 3 | 
 4 | 
 5 | def times_two(x):
 6 |   return x*2+7
 7 | 
 8 | 
 9 | def lazy_map(xs):
10 |   return list(map(times_two, xs))
11 | 
12 | 
13 | def parallel_map(xs, chunck=8500):
14 |   with Pool(2) as P:
15 |     x =  P.map(times_two, xs, chunck)
16 |   return x
17 | 
18 | 
19 | for i in range(0, 7):
20 |   N = 10**i
21 |   t1 = clock()
22 |   lazy_map(range(N))
23 |   lm_time = clock() - t1
24 | 
25 |   t1 = clock()
26 |   parallel_map(range(N))
27 |   par_time = clock() - t1
28 |   print("""
29 | -- N = {} --
30 | Lazy map time:      {}
31 | Parallel map time:  {}
32 | """.format(N, lm_time, par_time))
33 | 


--------------------------------------------------------------------------------
/Ch07/FlorenceMachineCounts/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/Ch07/FlorenceMachineCounts/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch07/FlorenceMachineCounts/.part-00000.crc


--------------------------------------------------------------------------------
/Ch07/FlorenceMachineCounts/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch07/FlorenceMachineCounts/_SUCCESS


--------------------------------------------------------------------------------
/Ch07/FlorenceMachineCounts/part-00000:
--------------------------------------------------------------------------------
  1 | crashes	
  2 | 	
  3 | Nothing	
  4 | 	
  5 | recession	
  6 | 	
  7 | messages,	
  8 | 	
  9 | freedom	
 10 | 	
 11 | phrases	
 12 | 	
 13 | Reflections	
 14 | 	
 15 | reflection	
 16 | 	
 17 | through	
 18 | 	
 19 | réponds	
 20 | 	
 21 | escaping,	
 22 | 	
 23 | Everybody	
 24 | 	
 25 | describe	
 26 | 	
 27 | forgive	
 28 | 	
 29 | themselves.	
 30 | 	
 31 | shelter	
 32 | 	
 33 | pourquoi	
 34 | 	
 35 | exchange	
 36 | 	
 37 | defiance	
 38 | 	
 39 | scrawling	
 40 | 	
 41 | loosely	
 42 | 	
 43 | holding	
 44 | 	
 45 | Drinking	
 46 | 	
 47 | doesn't	
 48 | 	
 49 | they're	
 50 | 	
 51 | Skyscrapers	
 52 | 	
 53 | entertainers,	
 54 | 	
 55 | Chicago	
 56 | 	
 57 | tabletop.	
 58 | 	
 59 | mistakes	
 60 | 	
 61 | mountain	
 62 | 	
 63 | absolution	
 64 | 	
 65 | anything	
 66 | 	
 67 | Darling,	
 68 | 	
 69 | Cathedral	
 70 | 	
 71 | inbetween	
 72 | 	
 73 | thousand	
 74 | 	
 75 | daylight	
 76 | 	
 77 | something	
 78 | 	
 79 | attitude's	
 80 | 	
 81 | convinced	
 82 | 	
 83 | youngsters	
 84 | 	
 85 | returned	
 86 | 	
 87 | Delilah	
 88 | 	
 89 | tempting	
 90 | 	
 91 | [Florence]	
 92 | 	
 93 | answer,	
 94 | 	
 95 | sailin'	
 96 | 	
 97 | everywhere's	
 98 | 	
 99 | willingly	
100 | 	
101 | Rascal]	
102 | 	
103 | [Pre-Chorus:]	
104 | 	
105 | dancing	
106 | 	
107 | stumble,	
108 | 	
109 | feeling	
110 | 	
111 | hurricane	
112 | 	
113 | Fractured	
114 | 	
115 | forgotten,	
116 | 	
117 | thinking	
118 | 	
119 | violent	
120 | 	
121 | together	
122 | 	
123 | there's	
124 | 	
125 | birthday,	
126 | 	
127 | another	
128 | 	
129 | beginning	
130 | 	
131 | rushing	
132 | 	
133 | moonlight	
134 | 	
135 | submission	
136 | 	
137 | staring	
138 | 	
139 | Climbing	
140 | 	
141 | electric	
142 | 	
143 | Looking	
144 | 	
145 | beating	
146 | 	
147 | language	
148 | 	
149 | crumble	
150 | 	
151 | weather	
152 | 	
153 | hollyoaks,	
154 | 	
155 | yourself	
156 | 	
157 | kindest	
158 | 	
159 | pillars	
160 | 	
161 | quicker	
162 | 	
163 | illusion,	
164 | 	
165 | mountain,	
166 | 	
167 | Tenderest	
168 | 	
169 | world’s	
170 | 	
171 | hearing	
172 | 	
173 | solution	
174 | 	
175 | Somewhere	
176 | 	
177 | smallest	
178 | 	
179 | calling	
180 | 	
181 | various	
182 | 	
183 | getting	
184 | 	
185 | broken-hearted	
186 | 	
187 | paycheck	
188 | 	
189 | Because	
190 | 	
191 | bedroom	
192 | 	
193 | longing	
194 | 	
195 | singing	
196 | 	
197 | dreamed	
198 | 	
199 | confessions	
200 | 	
201 | episode	
202 | 	
203 | whatever	
204 | 	
205 | meaning	
206 | 	
207 | themselves	
208 | 	
209 | confession	
210 | 	
211 | Checkin'	
212 | 	
213 | mourner	
214 | 	
215 | hurting	
216 | 	
217 | Spilled	
218 | 	
219 | bitterest	
220 | 	
221 | mistress	
222 | 	
223 | dragging	
224 | 	
225 | television	
226 | 	
227 | nameless,	
228 | 	
229 | famous,	
230 | 	
231 | throwing	
232 | 	
233 | [Pre-Chorus	
234 | 	
235 | different	
236 | 	
237 | friends	
238 | 	
239 | instead	
240 | 	
241 | tongues	
242 | 	
243 | warmer,	
244 | 	
245 | accurate	
246 | 	
247 | spinning	
248 | 	
249 | Outside	
250 | 	
251 | morning	
252 | 	
253 | tiptoes	
254 | 	
255 | darling,	
256 | 	
257 | though,	
258 | 	
259 | process?	
260 | 	
261 | minutes	
262 | 	
263 | [Pre-Chorus]	
264 | 	
265 | Sometimes	
266 | 	
267 | southside	
268 | 	
269 | couldn't	
270 | 	
271 | depression	
272 | 	
273 | favorite	
274 | 	
275 | heaven,	
276 | 	
277 | overflow	
278 | 	
279 | shameless	
280 | 	
281 | Saviour's	
282 | 	
283 | (Holding	
284 | 	
285 | unblinking	
286 | 	
287 | dites-moi	
288 | 	
289 | Starring	
290 | 	
291 | pressure's	
292 | 	
293 | Darling	
294 | 	
295 | darkest	
296 | 	
297 | remember	
298 | 	
299 | Spilling	
300 | 	
301 | [Refrain:]	
302 | 	
303 | devotion	
304 | 	
305 | selfish	
306 | 	
307 | forefront	
308 | 	
309 | sweetest	
310 | 	
311 | glasses	
312 | 	
313 | flashing	
314 | 	
315 | breathe	
316 | 	
317 | slipping	
318 | 	
319 | translate	
320 | 	
321 | looking	
322 | 	
323 | protected	
324 | 	
325 | nothing	
326 | 	
327 | sometimes	
328 | 	
329 | Between	
330 | 	
331 | gettin'	
332 | 	
333 | tonight)	
334 | 	
335 | hardest	
336 | 	
337 | shallow	
338 | 	
339 | thought	
340 | 	
341 | woah-oh-o	
342 | 	
343 | damaged	
344 | 	
345 | breaking	
346 | 	
347 | someone	
348 | 	
349 | Pushing	
350 | 	
351 | released	
352 | 	
353 | [Refrain]	
354 | 	
355 | saying,	
356 | 	
357 | profession	
358 | 	
359 | control	
360 | 	
361 | shocked	
362 | 	
363 | carrying	
364 | 	
365 | affection,	
366 | 	
367 | education	
368 | 	
369 | amounted	
370 | 	
371 | bleeding,	
372 | 	
373 | ticking	
374 | 	
375 | following	
376 | 	
377 | because	
378 | 	
379 | already	
380 | 	
381 | teaching	
382 | 	
383 | treatment	
384 | 	
385 | Praying	
386 | 	
387 | Uncurling	
388 | 	
389 | lifelines	
390 | 	
391 | hallway	
392 | 	
393 | bargain	
394 | 	
395 | underneath	
396 | 	
397 | [Chorus:]	
398 | 	
399 | ringing	
400 | 	
401 | stumbling	
402 | 	
403 | deserve	
404 | 	
405 | weekend	
406 | 	
407 | sleeping	
408 | 	
409 | [Chorus]	
410 | 	
411 | Christ,	
412 | 	
413 | prayers	
414 | 	
415 | aimless,	
416 | 	
417 | parties	
418 | 	
419 | Truthfully,	
420 | 	
421 | Everybody's	
422 | 	
423 | forsake	
424 | 	
425 | Politician	
426 | 	
427 | enough.	
428 | 	
429 | Another	
430 | 	
431 | peaceful	
432 | 	
433 | trouble	
434 | 	
435 | started	
436 | 	
437 | forever	
438 | 	
439 | prayers,	
440 | 	
441 | o-o-o-o-o-o-out	
442 | 	
443 | monument	
444 | 	
445 | (That's	
446 | 	
447 | [Dizzee	
448 | 	
449 | sunrise	
450 | 	
451 | Something's	
452 | 	
453 | Deliver	
454 | 	
455 | command	
456 | 	
457 | Huggin'	
458 | 	
459 | drifting	
460 | 	
461 | two-faced,	
462 | 	
463 | screaming	
464 | 	
465 | gestures	
466 | 	
467 | conclusion	
468 | 	
469 | ground.	
470 | 	
471 | proclamations	
472 | 	
473 | existed	
474 | 	
475 | baggage	
476 | 	
477 | weekend.	
478 | 	
479 | buildings	
480 | 	
481 | Whenever	
482 | 	
483 | Dealing	
484 | 	
485 | rooftop	
486 | 	
487 | Sweating	
488 | 	
489 | delivered	
490 | 	
491 | Everywhere	
492 | 	
493 | struggling	
494 | 	
495 | watched	
496 | 	
497 | understand	
498 | 	
499 | outside	
500 | 	
501 | ashamed	
502 | 	
503 | Pockets	
504 | 	
505 | 


--------------------------------------------------------------------------------
/Ch07/Florence_Machine.txt:
--------------------------------------------------------------------------------
  1 | High in the halls of the kings who are gone
  2 | Jenny would dance with her ghosts
  3 | The ones she had lost and the ones she had found
  4 | And the ones who had loved her the most
  5 | 
  6 | The ones who'd been gone for so very long
  7 | She couldn't remember their names
  8 | They spun her around on the damp old stones
  9 | Spun away all her sorrow and pain
 10 | 
 11 | And she never wanted to leave, never wanted to leave
 12 | Never wanted to leave, never wanted to leave
 13 | 
 14 | They danced through the day
 15 | And into the night through the snow that swept through the hall
 16 | From winter to summer then winter again
 17 | Til the walls did crumble and fall
 18 | 
 19 | And she never wanted to leave, never wanted to leave
 20 | Never wanted to leave, never wanted to leave
 21 | And she never wanted to leave, never wanted to leave
 22 | Never wanted to leave, never wanted to leave
 23 | 
 24 | High in the halls of the kings who are gone
 25 | Jenny would dance with her ghosts
 26 | The ones she had lost and the ones she had found
 27 | And the ones
 28 | Who had loved her the most
 29 | 
 30 | You need a big god
 31 | Big enough to hold your love
 32 | You need a big god
 33 | Big enough to fill you up
 34 | 
 35 | You keep me up at night
 36 | To my messages, you do not reply
 37 | You know I still like you the most
 38 | The best of the best and the worst of the worst
 39 | Well, you can never know
 40 | The places that I go
 41 | I still like you the most
 42 | You'll always be my favorite ghost
 43 | 
 44 | You need a big god
 45 | Big enough to hold your love
 46 | You need a big god
 47 | Big enough to fill you up
 48 | 
 49 | Sometimes I think it's gettin' better
 50 | And then it gets much worse
 51 | Is it just part of the process?
 52 | Well, Jesus Christ, it hurts
 53 | Though I know I should know better
 54 | Well, I can make this work
 55 | Is it just part of the process?
 56 | Well, Jesus Christ, Jesus Christ, it hurts
 57 | Jesus Christ, Jesus Christ, it hurts
 58 | 
 59 | You need a big god
 60 | Big enough to hold your love
 61 | You need a big god
 62 | Big enough to fill you up
 63 | 
 64 | Shower your affection, let it rain on me
 65 | And pull down the mountain, drag your cities to the sea
 66 | Shower your affection, let it rain on me
 67 | Don't leave me on this white cliff
 68 | Let it slide down to the, slide down to the sea
 69 | Slide down to the, slide down to the sea
 70 | 
 71 | 
 72 | Looking up from underneath
 73 | Fractured moonlight on the sea
 74 | Reflections still look the same to me
 75 | As before I went under
 76 | 
 77 | And it's peaceful in the deep
 78 | Cathedral where you cannot breathe
 79 | No need to pray, no need to speak
 80 | Now I am under all
 81 | 
 82 | And it's breaking over me
 83 | A thousand miles down to the sea bed
 84 | Found the place to rest my head
 85 | Never let me go
 86 | Never let me go
 87 | Never let me go
 88 | Never let me go
 89 | 
 90 | And the arms of the ocean are carrying me
 91 | And all this devotion was rushing out of me
 92 | And the crashes are heaven for a sinner like me
 93 | But the arms of the ocean delivered me
 94 | 
 95 | Though the pressure's hard to take
 96 | It's the only way I can escape
 97 | It seems a heavy choice to make
 98 | And now I am under all
 99 | 
100 | And it's breaking over me
101 | A thousand miles down to the sea bed
102 | Found the place to rest my head
103 | Never let me go
104 | Never let me go
105 | Never let me go
106 | Never let me go
107 | 
108 | And the arms of the ocean are carrying me
109 | And all this devotion was rushing out of me
110 | And the crashes are heaven for a sinner like me
111 | But the arms of the ocean delivered me
112 | 
113 | And it's over
114 | And I'm going under
115 | But I'm not giving up
116 | I'm just giving in
117 | 
118 | I'm slipping underneath
119 | So cold and so sweet
120 | 
121 | And the arms of the ocean so sweet and so cold
122 | And all this devotion I never knew at all
123 | And the crashes are heaven for a sinner released
124 | And the arms of the ocean delivered me
125 | Never let me go
126 | Never let me go
127 | Never let me go
128 | Never let me go
129 | Deliver me
130 | Never let me go
131 | Never let me go
132 | Never let me go
133 | Never let me go
134 | Deliver me
135 | Never let me go
136 | Never let me go
137 | Never let me go
138 | Never let me go
139 | Deliver me
140 | Never let me go
141 | Never let me go
142 | Never let me go
143 | Never let me go
144 | 
145 | And it's over
146 | (Never let me go, Never let me go)
147 | And I'm going under
148 | (Never let me go, Never let me go)
149 | But I'm not giving up
150 | (Never let me go, Never let me go)
151 | I'm just giving in
152 | (Never let me go, Never let me go)
153 | 
154 | I'm slipping underneath
155 | (Never let me go, Never let me go)
156 | So cold and so sweet
157 | (Never let me go, Never let me go)
158 | 
159 | The show was ending and I had started to crack
160 | Woke up in Chicago and the sky turned black
161 | And you're so high, you're so high, you had to be an angel
162 | And I'm so high, I'm so high, I can see an angel
163 | 
164 | I hear your heart beating in your chest
165 | The world slows 'till there's nothing left
166 | Skyscrapers look on like great, unblinking giants (oh)
167 | 
168 | In those heavy days in June
169 | When love became an act of defiance
170 | 
171 | Hold onto each other
172 | Hold onto each other
173 | Hold onto each other
174 | Hold onto each other
175 | 
176 | You were broken-hearted and the world was, too
177 | And I was beginning to lose my grip
178 | And I always held it loosely
179 | But this time I admit
180 | I felt it really start to slip
181 | 
182 | And choir singing in the street
183 | And I will come to you
184 | To watch the television screen
185 | In your hotel room
186 | 
187 | Hold onto each other
188 | Hold onto each other
189 | Hold onto each other
190 | Hold onto each other
191 | 
192 | You're so high, you're so high
193 | You're so high, you're so high
194 | You're so high, you're so high
195 | You had to be an angel
196 | I'm so high, I'm so high
197 | I'm so high, I'm so high
198 | I'm so high, I'm so high
199 | I can see an angel
200 | 
201 | No walls
202 | Can keep me protected
203 | No sleep
204 | Nothing inbetween me and the rain
205 | And you can't save me now,
206 | I'm in the grip of a hurricane
207 | I'm gonna blow myself away
208 | 
209 | I'm going out
210 | I'm gonna drink myself to death
211 | And in the crowd
212 | I see you with someone else,
213 | I brace myself
214 | Cause I know it's going to hurt
215 | But I like to think at least things can't get any worse
216 | 
217 | No home,
218 | I don't want shelter
219 | No calm,
220 | Nothing to keep me from the storm
221 | And you can't hold me down
222 | 'Cause I belong to the hurricane
223 | It's gonna blow this all away
224 | 
225 | I'm going out
226 | I'm gonna drink myself to death
227 | And in the crowd
228 | I see you with someone else
229 | I brace myself
230 | Cause I know it's going to hurt
231 | But I like to think at least things can't get any worse
232 | 
233 | I hope that you see me
234 | Cause I'm staring at you
235 | But when you look over
236 | You look right through
237 | Then you lean and kiss her on the head
238 | And I never felt so alive, and so... dead.
239 | 
240 | I'm going out
241 | I'm gonna drink myself to death
242 | And in the crowd
243 | I see you with someone else
244 | I brace myself
245 | Cause I know it's going to hurt
246 | I'm going out, woah-oh-o
247 | 
248 | I'm going out
249 | I'm gonna drink myself to death
250 | And in the crowd
251 | I see you with someone else
252 | I brace myself
253 | Cause I know it's going to hurt
254 | I'm going out, woah-oh-o
255 | I'm going out, woah-oh-o
256 | I'm going o-o-o-o-o-o-out
257 | I'm going out, woah-oh-o
258 | I'm going out
259 | 
260 | This is as good a place to fall as any
261 | We'll build our altar here
262 | Make me your Maria
263 | I'm already on my knees
264 | 
265 | You had Jesus on your breath
266 | And I caught him in mine
267 | Sweating our confessions
268 | The undone and the divine
269 | 
270 | 'Cause this is his body
271 | This is his love
272 | Such selfish prayers
273 | And I can't get enough
274 | 
275 | Oh, woah, woah, oh
276 | Oh, whoa, whoa, yeah
277 | 
278 | Spilled milk tears,
279 | I did this for you
280 | Spilling over the idol
281 | The black and the blue
282 | 
283 | The sweetest submission
284 | Drinking it in
285 | The wine, the women, the bedroom hymns
286 | 
287 | 'Cause this is his body
288 | This is his love
289 | Such selfish prayers and I can't get enough
290 | 
291 | Oh, woah, woah, oh
292 | Whoa, whoa, yeah
293 | I can't get enough
294 | 
295 | I'm not here looking for absolution
296 | Because I found myself an old solution
297 | I'm not here looking for absolution
298 | Because I found myself an old solution
299 | 
300 | This is his body
301 | This is his love
302 | Such selfish prayers, I can't get enough
303 | 
304 | This is his body
305 | This is his love
306 | Such selfish prayers, I can't get enough
307 | Whoa, whoa, yeah
308 | I can't get enough
309 | Whoa, whoa, yeah
310 | I can't get enough
311 | Whoa, whoa, yeah
312 | 
313 | Know you’ve been hurt by someone else
314 | I can tell by the way you carry yourself
315 | If you let me, here’s what I’ll do
316 | I’ll take care of you
317 | Cause I’ve loved and I’ve lost
318 | 
319 | I’ve asked about you and they told me things
320 | But my mind didn’t change and I still feel the same
321 | What's a life with no fun, please don’t be so ashamed
322 | I’ve had mine, you’ve had yours, we both know
323 | We know, they don’t get you like I will
324 | My only wish is I die real
325 | Cause that truth hurts, and those lies heal
326 | And you can’t sleep thinking that he lies still
327 | So you cry still, tears all in the pillow case
328 | Big girls all get a little taste
329 | Pushing me away so I give her space
330 | Dealing with a heart that I didn’t break
331 | I’ll be there for you, I will care for you
332 | I keep thinking you just don’t know
333 | Trying to run from that, say you’re done with that
334 | On your face girl, it just don’t show
335 | When you’re ready, just say you’re ready
336 | When all the baggage just ain’t as heavy
337 | And the parties over, just don’t forget me
338 | We’ll change the pace and we'll just go slow
339 | You won’t ever have to worry,
340 | You won’t ever have to hide
341 | You've seen all my mistakes
342 | So look me in my eyes
343 | 
344 | Cause if you let me, here’s what I’ll do
345 | I’ll take care of you
346 | Cause I’ve loved and I’ve lost
347 | 
348 | It’s my birthday, I'll get high if I want to
349 | Can’t deny that I want you, but I'll lie if I have to
350 | Cause you don’t say you love me
351 | To your friends when they ask you
352 | Even though we both know that you do (you do)
353 | One time, been in love one time
354 | You and all your girls in the club one time
355 | All so convinced that you’re following your heart
356 | Cause your mind don’t control what it does sometimes
357 | We all have our nights though, don’t be so ashamed
358 | I’ve had mine, you’ve had yours, we both know
359 | We know
360 | 
361 | Know you’ve been hurt by someone else
362 | I can tell by the way you carry yourself
363 | If you let me, here’s what I’ll do
364 | I’ll take care of you
365 | Cause I’ve loved and I’ve lost
366 | 
367 | I've loved and I've lost [3x]
368 | 
369 | And the air was full
370 | Of various storms and saints
371 | Praying in the street
372 | As the banks began to break
373 | And I'm in the throes of it
374 | Somewhere in the belly of the beast
375 | But you took your toll on me
376 | So I gave myself over willingly
377 | You got a hold on me
378 | And I don't know how I don't just stand outside and scream
379 | I am teaching myself how to be free
380 | 
381 | The monument of a memory
382 | You tear it down in your head
383 | Don't make the mountain your enemy
384 | Get out, get up there instead
385 | You saw the stars out in front of you
386 | Too tempting not to touch
387 | But even though it shocked you
388 | Something's electric in your blood
389 | 
390 | And people just untie themselves
391 | Uncurling lifelines
392 | If you could just forgive yourself
393 | 
394 | But still you stumble, feet give way
395 | Outside the world seems a violent place
396 | But you had to have him, and so you did
397 | Some things you let go in order to live
398 | While all around you, the buildings sway
399 | You sing it out loud, "who made us this way?"
400 | I know you're bleeding, but you'll be okay
401 | Hold on to your heart, you'll keep it safe
402 | Hold on to your heart, don't give it away
403 | 
404 | You'll find a rooftop to sing from
405 | Or find a hallway to dance
406 | You don't need no edge to cling from
407 | Your heart is there, it's in your hands
408 | I know it seems like forever
409 | I know it seems like an age
410 | But one day this will be over
411 | I swear it's not so far away
412 | 
413 | And people just untie themselves
414 | Uncurling lifelines
415 | If you could just forgive yourself
416 | 
417 | But still you stumble, feet give way
418 | Outside the world seems a violent place
419 | But you had to have him, and so you did
420 | Some things you let go in order to live
421 | While all around you, the buildings sway
422 | You sing it out loud, "who made us this way?"
423 | I know you're bleeding, but you'll be okay
424 | Hold on to your heart, you'll keep it safe
425 | Hold on to your heart
426 | 
427 | m drifting through the halls with the sunrise
428 | (Holding on for your call)
429 | Climbing up the walls for that flashing light
430 | (I can never let go)
431 | 
432 | [Refrain:]
433 | Cause I'm gonna be free and I'm gonna be fine
434 | (Holding on for your call)
435 | Cause I'm gonna be free and I'm gonna be fine
436 | (Maybe not tonight)
437 | 
438 | Now the sun is up and I'm going blind
439 | (Holding on for your call)
440 | Another drink just to pass the time
441 | (I can never say no)
442 | 
443 | [Refrain]
444 | 
445 | [Pre-Chorus:]
446 | It's a different kind of danger
447 | And the bells are ringing out
448 | And I'm calling for my mother
449 | As I pull the pillars down
450 | It's a different kind of danger
451 | And my feet are spinning around
452 | Never knew I was a dancer
453 | 'Till Delilah showed me how
454 | 
455 | Too fast for freedom
456 | Sometimes it all falls down
457 | These chains never leave me
458 | I keep dragging them around
459 | 
460 | [Chorus:]
461 | Now I'm dancing with Delilah and her vision is mine
462 | (Holding on for your call)
463 | A different kind of danger in the daylight
464 | (I can never let go)
465 | Took anything to cut you, I can find
466 | (Holding on for your call)
467 | A different kind of a danger in the daylight
468 | (Can't you let me know?)
469 | 
470 | Now it's one more boy and it's one more lie
471 | (Holding on for your call)
472 | Taking the pills just to pass the time
473 | (I can never say no)
474 | 
475 | [Refrain]
476 | 
477 | [Pre-Chorus]
478 | 
479 | [Chorus]
480 | 
481 | Strung up, strung out for your love
482 | Hang in, hung up, it's so rough
483 | I'm wrung and ringing out
484 | Why can't you let me know?
485 | [x2]
486 | 
487 | [Pre-Chorus x2]
488 | 
489 | Too fast for freedom
490 | Sometimes it all falls down
491 | These chains never leave me
492 | I keep dragging them around
493 | [x2]
494 | 
495 | When the night has come
496 | And the land is dark
497 | And the moon is the only light we see
498 | No, I won't be afraid
499 | Oh, I won't be afraid
500 | Just as long as you stand, stand by me
501 | 
502 | So, darling, darling, stand by me
503 | Oh, stand by me
504 | Oh, stand now, stand by me, stand by me
505 | 
506 | If the sky that we look upon
507 | Should tumble and fall
508 | Or the mountain should crumble to the sea
509 | I won't cry, I won't cry
510 | No, I won't shed a tear
511 | Just as long as you stand, stand by me
512 | 
513 | And darling, darling, stand by me
514 | Oh, stand by me
515 | Oh, stand now, stand by me, stand by me
516 | 
517 | And, darling, darling, stand by me
518 | Oh, stand by me
519 | Oh, stand, stand by me, stand by me
520 | 
521 | Whenever you're in trouble won't you stand by me?
522 | Oh, stand by me
523 | Oh, stand now, stand by me
524 | 
525 | Darling, darling, stand by me
526 | Oh, stand by me
527 | Oh, stand now, stand by me, stand by me
528 | 
529 | Whenever you're in trouble won't you stand by me?
530 | Oh, stand by me
531 | Oh, stand now, stand by me, stand by me
532 | 
533 | There is love in your body but you can't hold it in
534 | It pours from your eyes and spills from your skin
535 | Tenderest touch leaves the darkest of marks
536 | And the kindest of kisses break the hardest of hearts
537 | 
538 | The hardest of hearts
539 | The hardest of hearts
540 | The hardest of hearts
541 | 
542 | There is love in your body but you can't get it out
543 | It gets stuck in your head, won't come out of your mouth
544 | Sticks to your tongue and shows on your face
545 | That the sweetest of words have the bitterest taste
546 | 
547 | Darling heart, I loved you from the start
548 | But you'll never know what a fool I've been
549 | Darling heart, I loved you from the start
550 | But that's no excuse for the state I'm in
551 | 
552 | The hardest of hearts
553 | The hardest of hearts
554 | The hardest of hearts
555 | 
556 | There is love in our bodies and it holds us together
557 | But pulls us apart when we're holding each other
558 | We all want something to hold in the night
559 | We don't care if it hurts or we're holding too tight
560 | 
561 | There is love in your body but you can't get it out
562 | It gets stuck in your head, won't come out of your mouth
563 | Sticks to your tongue and it shows on your face
564 | That the sweetest of words have the bitterest taste
565 | 
566 | Darling heart, I loved you from the start
567 | But you'll never know what a fool I've been
568 | Darling heart, I loved you from the start
569 | But that's no excuse for the state I'm in
570 | 
571 | The hardest of hearts
572 | The hardest of hearts
573 | The hardest of hearts
574 | 
575 | My heart swells like a water at weight
576 | Can't stop myself before it's too late
577 | Hold on to your heart
578 | 'Cause I'm coming to take you
579 | Hold on to your heart
580 | 'Cause I'm coming to break you
581 | 
582 | Hold on hold on hold on hold on hold on
583 | Hold on hold on hold on hold on hold on
584 | The hardest of hearts (hold on, hold on)
585 | The hardest of hearts (hold on, hold on)
586 | The hardest of hearts (hold on)
587 | 
588 | 
589 | Time it took us
590 | To where the water was
591 | That’s what the water gave me
592 | And time goes quicker
593 | Between the two of us
594 | Oh, my love, don’t forsake me
595 | Take what the water gave me
596 | 
597 | Lay me down
598 | Let the only sound
599 | Be the overflow
600 | Pockets full of stones
601 | 
602 | Lay me down
603 | Let the only sound
604 | Be the overflow
605 | 
606 | And oh, poor Atlas
607 | The world’s a beast of a burden
608 | You’ve been holding up a long time
609 | And all this longing
610 | And the ships are left to rust
611 | That’s what the water gave us
612 | 
613 | So lay me down
614 | Let the only sound
615 | Be the overflow
616 | Pockets full of stones
617 | Lay me down
618 | Let the only sound
619 | Be the overflow
620 | 
621 | ‘Cause they took your loved ones
622 | But returned them in exchange for you
623 | But would you have it any other way?
624 | Would you have it any other way?
625 | You couldn't have it any other way
626 | 
627 | ‘Cause she’s a cruel mistress
628 | And a bargain must be made
629 | But oh, my love, don’t forget me
630 | When I let the water take me
631 | 
632 | So lay me down
633 | Let the only sound
634 | Be the over flow
635 | Pockets full of stones
636 | 
637 | Lay me down
638 | Let the only sound
639 | Be the overflow
640 | 
641 | So lay me down
642 | Let the only sound
643 | Be the overflow
644 | Pockets full of stones
645 | 
646 | Lay me down
647 | Let the only sound
648 | Be the overflow
649 | 
650 | 
651 | Dizzee Rascal]
652 | Everybody wants to be famous,
653 | Nobody wants to be nameless, aimless,
654 | People act shameless
655 | Tryna live like entertainers,
656 | Want a fat crib with the acres,
657 | So they spend money that they ain't made yet,
658 | Got a Benz on tick that they ain't paid yet,
659 | Spend their paycheck
660 | In the west out on a weekend
661 | Got no money by the end of the weekend.
662 | But they don't care cause their life is a movie,
663 | Starring Louis V, paid for by yours truly,
664 | Truthfully, it's a joke, like a bad episode of hollyoaks,
665 | Can't keep up with the cover notes,
666 | So they got bad credit livin' on direct debit in debt
667 | They still don't get it
668 | Cause they too busy livin' the high life, the night life
669 | Huggin' the high when livin' it large
670 | And they all say
671 | 
672 | [Florence]
673 | Sometimes it seems that the going is just too rough
674 | And things go wrong no matter what I do
675 | (That's right)
676 | Now and then it seems like life is just too much
677 | But you've got the love I need to see me through
678 | 
679 | [Dizzee Rascal]
680 | Let me take you down to London city
681 | Where the attitude's bad and the weather is shitty
682 | Everybody's on a paper chase
683 | It's one big rat race
684 | Everybody's got a screw face
685 | So many two-faced,
686 | Checkin' their high sayin' they're ready to ride
687 | I'm on the inside looking at the
688 | So it's an accurate reflection
689 | City wide, north, east, west and the southside
690 | Everywhere I go there's a goon on the corner
691 | Guns and drugs cause the city's like a sauna
692 | And it's getting warmer, and out of order
693 | Tryna put a struggling mother to a mourner
694 | Mr. Politician can you tell me the solution
695 | What's the answer, what's the conclusion
696 | Is it an illusion, is it a mirage
697 | I see youngsters die because they tryna live large
698 | And they all say
699 | 
700 | [Florence]
701 | Sometimes I feel like throwing my hands up in the air
702 | I know I can count on all of you
703 | Sometimes I feel like saying “Lord, I just don't care”
704 | (That's right, that's right)
705 | But you've got the love I need to see me through
706 | (Check it, check it, come on, come on)
707 | 
708 | You got the love
709 | (Who's got the love)
710 | You got the love
711 | (Who's got the love)
712 | You got the love
713 | (That's right, that's right, that's right)
714 | You got the love
715 | (Who's got the love)
716 | You got the love
717 | (Who's got the love)
718 | You got the love
719 | (Check it)
720 | 
721 | [Dizzee Rascal]
722 | We are living in the days of the credit crunch
723 | Give me the dough
724 | I'm tryna have a bunch
725 | But I can't have rice for lunch
726 | It's not there ain't enough to share
727 | It ain't fair never dreamed that he could be rare
728 | Who cares who dares to make a change
729 | Everybody's in the club trying to make it rain
730 | But not for famine just for the sake of having
731 | 15 minutes of fame and everywhere's the same
732 | Again and again I see the same thing
733 | Everybody acting like they play sailin'
734 | But I see rough seas ahead maybe a recession
735 | And then a depression in whatever profession
736 | This is my confession I can't front I’m in the forefront
737 | Living for money ready to start like a bungee jump
738 | With no rope but I ain't trying to see the bottom
739 | Because that's where I came from, I ain't forgotten,
740 | 
741 | [Florence]
742 | You got the love
743 | (Who's got the love)
744 | You got the love
745 | You got the love
746 | (Who's got the love)
747 | You got the love
748 | You got the love
749 | (That's right, that's right, that's right, that's right)
750 | You got the love
751 | You got the love
752 | You got the love
753 | (Who's got the love, who's got the love, who's got the love)
754 | 
755 | Sometimes I feel like throwing my hands up in the air
756 | I know I can count on all of you
757 | Sometimes I feel like saying “Lord, I just don't care”
758 | But you've got the love I need to see me through
759 | 
760 | And the heart is hard to translate
761 | It has a language of its own
762 | It talks in tongues and quiet sighs
763 | And prayers and proclamations
764 | In the grand deeds of great men and the smallest of gestures
765 | And short shallow gasps
766 | 
767 | But with all my education I can't seem to command it
768 | And the words are all escaping, and coming back all damaged
769 | And I would put them back in poetry if I only knew how
770 | I can't seem to understand it
771 | 
772 | And I would give all this and heaven too
773 | I would give it all if only for a moment
774 | That I could just understand the meaning of the word you see
775 | 'Cause I've been scrawling it forever but it never makes sense to me at all
776 | 
777 | And it talks to me in tiptoes
778 | And it sings to me inside
779 | It cries out in the darkest night and breaks in the morning light
780 | 
781 | But with all my education I can't seem to command it
782 | And the words are all escaping, and coming back all damaged
783 | And I would put them back in poetry if I only knew how
784 | I can't seem to understand it
785 | 
786 | And I would give all this and heaven too
787 | I would give it all if only for a moment
788 | That I could just understand the meaning of the word you see
789 | 'Cause I've been scrawling it forever but it never makes sense to me at all
790 | 
791 | And I would give all this and heaven too
792 | I would give it all if only for a moment
793 | That I could just understand the meaning of the word you see
794 | 'Cause I've been scrawling it forever but it never makes sense to me at all
795 | 
796 | No, words are a language
797 | It doesn't deserve such treatment
798 | And all of my stumbling phrases never amounted to anything worth this feeling
799 | 
800 | All this heaven never could describe such a feeling as I'm hearing
801 | 
802 | Words were never so useful
803 | So I was screaming out a language that I never knew existed before
804 | 
805 | Are you hurting the one you love?
806 | You say you've found Heaven but you can't find God.
807 | Are you hurting the one you love?
808 | Bite your tongue till it tastes like blood.
809 | 
810 | Are you hurting the one you love?
811 | So many glasses on the tabletop.
812 | Are you hurting the one you love?
813 | You'd like to stay in heaven but the rules are too tough.
814 | 
815 | Tough,
816 | It's just too tough.
817 | Tough,
818 | It's just too tough.
819 | 
820 | Are you hurting the one you love?
821 | When they watched the walls, and the ticking clock.
822 | Are you hurting the one you love?
823 | And was it something you could not stop.
824 | 
825 | Could not stop.
826 | Stop,
827 | Could not stop.
828 | Stop,
829 | Could not stop.
830 | Stop,
831 | Could not stop.
832 | Stop,
833 | Could not stop.
834 | 
835 | Are you hurting the one you love?
836 | When you leave them sleeping on the hollow ground.
837 | Are you hurting the one you love?
838 | And lost for themselves.
839 | 
840 | Are you hurting the one you love?
841 | And if heaven knows then who will stop.
842 | Are you hurting the one you love?
843 | You said you got to heaven, but it wasn't enough.
844 | 
845 | I love you all the time
846 | Oh oh, oh oh, oh oh oh oh
847 | Oh oh, oh oh, ah
848 | 
849 | I'm never alone, I look at my phone
850 | If I call you up, you're never at home
851 | I love you all the time
852 | 
853 | I'm fueled up and high, I'm out with the guys
854 | A smile on my face, no reason to cry
855 | I love you all the time
856 | 
857 | I can tell by that look in your eye
858 | You're looking and all you see's another guy
859 | I can tell you're going to take your love away
860 | 
861 | I can tell by that look in your eye
862 | You're looking and all you see's another guy
863 | I would beg you if I thought it would make you stay
864 | 
865 | Ce soir c’est le soir
866 | Et toi avec moi
867 | Et tu viens me voir
868 | Tu viens, oh la la
869 | I love you all the time
870 | 
871 | Tu ne réponds pas
872 | Ah dites-moi pourquoi
873 | Just say au revoir
874 | Again me voilà
875 | I love you all the time
876 | 
877 | And I can tell by that look in your eye
878 | You're looking and all you see's another guy
879 | I can tell you're going to take your love away
880 | 
881 | I can tell by that look in your eye
882 | You're looking and all you see's another guy
883 | I would beg if I thought it would make you stay
884 | I would beg if I thought it would make you stay
885 | I would beg if I thought it would make you stay
886 | 
887 | Ah dites-moi pourquoi
888 | Ah dites-moi pourquoi
889 | Ah dites-moi pourquoi
890 | 
891 | Sometimes I feel like throwing my hands up in the air
892 | I know I can count on you
893 | Sometimes I feel like saying, "Lord, I just don't care."
894 | But you've got the love I need To see me through
895 | 
896 | Sometimes it seems that the going is just too rough
897 | And things go wrong no matter what I do
898 | Now and then it seems that life is just too much
899 | But you've got the love I need to see me through
900 | 
901 | When food is gone you are my daily meal
902 | When friends are gone I know my Saviour's love is real
903 | You know it's real
904 | 
905 | You got the love
906 | You got the love
907 | You got the love
908 | You got the love
909 | You got the love
910 | You got the love
911 | 
912 | Time after time I think, "Oh, Lord, what's the use?"
913 | Time after time I think it's just no good
914 | 'Cause sooner or later in life, the things you love you lose
915 | But you got the love I need to see me through
916 | 
917 | [2x]
918 | You got the love
919 | You got the love
920 | You got the love
921 | You got the love
922 | You got the love
923 | You got the love
924 | 
925 | Sometimes I feel like throwing my hands up in the air
926 | 'Cause I know I can count on you
927 | Sometimes I feel like saying, "Lord, I just don't care."
928 | But you've got the love I need to see me through 
929 | 


--------------------------------------------------------------------------------
/Ch07/highest_scoring.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | from functools import reduce
 5 | 
 6 | def keep_highest(acc, nxt):
 7 |   word, score = nxt.split('\t')
 8 |   s = int(score)
 9 |   if len(acc) < 5:
10 |     acc.append((word,s))
11 |     acc = sorted(acc, key=lambda x:x[1])
12 |   elif s > acc[0][1]:
13 |     acc.append((word, s))
14 |     acc = sorted(acc, key=lambda x:x[1])[1:]
15 |   return acc
16 | 
17 | print(reduce(keep_highest, sys.stdin, []))
18 | 


--------------------------------------------------------------------------------
/Ch07/large_words:
--------------------------------------------------------------------------------
1 | $HADOOP/bin/hadoop jar /home/jt-w/bin/hadoop/hadoop-streaming-3.2.0.jar \
2 |   -file ./wc_mapper.py -mapper ./wc_mapper.py \
3 |   -file ./wc_reducer.py -reducer ./wc_reducer.py \
4 |   -input 'Florence_Machine.txt' \
5 |   -output ./FlorenceMachineCounts
6 | 


--------------------------------------------------------------------------------
/Ch07/score_words.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | 
 5 | def score(word):
 6 |   total = 0
 7 |   for i,char in enumerate(word):
 8 |     if char.lower() in "dlcu":
 9 |       total +=1
10 |     elif char.lower() in "mwfbygpvk":
11 |       total += 2
12 |     elif char.lower() in "jxqz":
13 |       total += 4
14 |     if i >= 4:
15 |       total +=2
16 |   return total
17 | 
18 | for line in sys.stdin:
19 |   for word in line.split():
20 |     print("{}\t{}".format(word, score(word)))
21 | 


--------------------------------------------------------------------------------
/Ch07/spark_scores.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | import re
 3 | from pyspark import SparkContext
 4 | 
 5 | if __name__ == "__main__":
 6 |   sc = SparkContext(appName="WordScores")
 7 |   PAT = re.compile(r'[-./:\s\xa0]+')
 8 |   text_files = sc.textFile("/home/jt-w/Code/MR-test/data/*")
 9 |   xs = text_files.flatMap(lambda x:PAT.split(x))\
10 |                  .filter(lambda x:len(x)>6)\
11 |                  .countByValue()\
12 | 
13 |   for k,v in xs.items():
14 |     print("{:<30}{}".format(k.encode("ascii","ignore"),v))
15 | 


--------------------------------------------------------------------------------
/Ch07/wc_mapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import sys
4 | 
5 | for line in sys.stdin:
6 |   for word in line.split():
7 |     if len(word) > 6:
8 |       print(word)
9 | 


--------------------------------------------------------------------------------
/Ch07/wc_reducer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | from functools import reduce
 5 | 
 6 | def make_counts(acc, nxt):
 7 |     acc[nxt] = acc.get(nxt,0) + 1
 8 |     return acc
 9 | 
10 | for w in reduce(make_counts, sys.stdin, {}):
11 |     print(w)
12 | 


--------------------------------------------------------------------------------
/Ch08/.most-active-times.py.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch08/.most-active-times.py.swp


--------------------------------------------------------------------------------
/Ch08/command_elo:
--------------------------------------------------------------------------------
1 | $HADOOP/bin/hadoop jar /home/<user>/bin/hadoop/hadoop-streaming-3.2.0.jar \
2 |   -file ./elo-mapper.py -mapper ./elo-mapper.py \
3 |   -file ./elo-reducer.py -reducer ./elo-reducer.py \
4 |   -input '/path/to/wta/files/wta_matches_200*.csv' \
5 |   -output ./tennis_ratings
6 | 
7 | 


--------------------------------------------------------------------------------
/Ch08/common-errors.py:
--------------------------------------------------------------------------------
 1 | from mrjob.job import MRJob
 2 | 
 3 | class ErrorCounter(MRJob):
 4 |   def mapper(self, _, line):
 5 |     fields = line.split(',')
 6 |     if fields[7] == '404.0':
 7 |       yield fields[6], 1
 8 | 
 9 |   def reducer(self, key, vals):
10 |     num_404s = sum(vals)
11 |     if num_404s>0:
12 |       yield key, num_404s
13 | 
14 | if __name__ == "__main__":
15 |   ErrorCounter.run()
16 | 
17 | 


--------------------------------------------------------------------------------
/Ch08/elo-mapper.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/python3 
 2 | import json
 3 | from sys import stdin
 4 | 
 5 | 
 6 | def clean_match(match):
 7 |   ms = match.split(',')
 8 |   match_data = {'winner': ms[10],
 9 |                 'loser': ms[20],
10 |                 'surface': ms[2]}
11 |   return match_data
12 | 
13 | 
14 | if __name__ == "__main__":
15 |   for line in stdin:
16 |     print(json.dumps(clean_match(line)))
17 | 


--------------------------------------------------------------------------------
/Ch08/elo-reducer.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/python3
 2 | import json
 3 | from sys import stdin
 4 | from functools import reduce
 5 | 
 6 | def round5(x):
 7 |   return 5*int(x/5)
 8 | 
 9 | def elo_acc(acc,nxt):
10 |   match_info = json.loads(nxt)
11 |   w_elo = acc.get(match_info['winner'],1400)
12 |   l_elo = acc.get(match_info['loser'],1400)
13 |   Qw = 10**(w_elo/400)
14 |   Ql = 10**(l_elo/400)
15 |   Qt = Qw+Ql
16 |   acc[match_info['winner']] = round5(w_elo + 100*(1-(Qw/Qt)))
17 |   acc[match_info['loser']] = round5(l_elo - 100*(Ql/Qt))
18 |   return acc
19 | 
20 | if __name__ == "__main__":
21 |   xs = reduce(elo_acc, stdin, {})
22 |   for player, rtg in xs.items():
23 |       print(rtg, player)
24 | 


--------------------------------------------------------------------------------
/Ch08/serena_counter.py:
--------------------------------------------------------------------------------
 1 | from mrjob.job import MRJob
 2 | from functools import reduce
 3 | 
 4 | def make_counts(acc, nxt):
 5 |     acc[nxt] = acc.get(nxt,0) + 1
 6 |     return acc
 7 | 
 8 | def my_frequencies(xs):
 9 |     return reduce(make_counts, xs, {})
10 | 
11 | class SerenaCounter(MRJob):
12 | 
13 |   def mapper(self, _, line):
14 |     fields = line.split(',')
15 |     if fields[10] == 'Serena Williams':
16 |         yield fields[2], 'W'
17 |     elif fields[20] == 'Serena Williams':
18 |         yield fields[2], 'L'
19 | 
20 |   def reducer(self, surface, results):
21 |     counts = my_frequencies(results)
22 |     yield surface, counts
23 | 
24 | if __name__ == "__main__":
25 |   SerenaCounter.run()
26 | 


--------------------------------------------------------------------------------
/Ch08/williams-counter.py:
--------------------------------------------------------------------------------
 1 | from mrjob.job import MRJob
 2 | from functools import reduce
 3 | 
 4 | def make_counts(acc, nxt):
 5 |     acc[nxt] = acc.get(nxt,0) + 1
 6 |     return acc
 7 | 
 8 | def my_frequencies(xs):
 9 |     return reduce(make_counts, xs, {})
10 | 
11 | class WilliamsRivalry(MRJob):
12 | 
13 |   def mapper(self, _, line):
14 |     fields = line.split(',')
15 |     players = ' '.join([fields[10], fields[20]])
16 |     if 'Serena Williams' in players and 'Venus Williams' in players:
17 |       yield fields[2], fields[10]
18 | 
19 |   def reducer(self, surface, results):
20 |     counts = my_frequencies(results)
21 |     yield surface, counts
22 | 
23 | if __name__ == "__main__":
24 |   WilliamsRivalry.run()
25 | 


--------------------------------------------------------------------------------
/Ch08/wta.tar.bz2.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch08/wta.tar.bz2.tar.bz2


--------------------------------------------------------------------------------
/Ch09/spark_losses.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | from math import log2, ceil
 3 | from functools import partial
 4 | from pyspark import SparkContext
 5 | 
 6 | def ceil5(x):
 7 |     return ceil(x/5)*5
 8 | 
 9 | def get_winner_loser(match):
10 |   ms = match.split(',')
11 |   # Put the loser in first position, winner in second
12 |   return (ms[20], ms[10])
13 | 
14 | def initialize_for_voting(losses):
15 |     return {'losses': losses,
16 |             'n_losses': len(losses),
17 |             'rating': 100}
18 | 
19 | def empty_ratings(d):
20 |   d['rating'] = 0
21 |   return d
22 | 
23 | def allocate_points(acc, nxt):
24 |   k,v = nxt
25 |   boost = v['rating'] / (v['n_losses'] + .01)
26 |   for loss in v['losses']:
27 |     if loss not in acc.keys():
28 |       acc[loss] = {'losses':[], 'n_losses': 0}
29 |     opp_rating = acc.get(loss,{}).get('rating',0)
30 |     acc[loss]['rating'] = opp_rating + boost
31 |   return acc
32 | 
33 | def combine_scores(a, b):
34 |   for k,v in b.items():
35 |     try:
36 |       a[k]['rating'] = a[k]['rating'] + b[k]['rating']
37 |     except KeyError:
38 |       a[k] = v
39 |   return a
40 | 
41 | if __name__ == "__main__":
42 |   sc = SparkContext(appName="TennisRatings")
43 |   match_data = sc.textFile("/media/jt-w/Seagate500G/wta_matches*")
44 | 
45 |   xs = match_data.map(get_winner_loser)\
46 |                  .groupByKey()\
47 |                  .mapValues(initialize_for_voting)
48 | 
49 |   for i in range(8):
50 |     if i > 0:
51 |       xs = sc.parallelize(zs.items())
52 |     acc = dict(xs.mapValues(empty_ratings).collect())
53 |     zs = xs.aggregate(acc, allocate_points, combine_scores)
54 | 
55 |   ratings = [(k,v['rating']) for k,v in zs.items()]
56 |   for player, rating in sorted(ratings, key=lambda x: x[1], reverse=True)[:20]:
57 |     print('{:<30}{}\t{}'.format(player,
58 |                                 round(log2(rating+1), 1),
59 |                                 ceil5(rating)))
60 | 


--------------------------------------------------------------------------------
/Ch09/spark_scores.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | import re, json
 3 | from pyspark import SparkContext
 4 | 
 5 | def round5(x):
 6 |   return 5*int(x/5)
 7 | 
 8 | def clean_match(match):
 9 |   ms = match.split(',')
10 |   match_data = {'winner': ms[10],
11 |                 'loser': ms[20],
12 |                 'surface': ms[2]}
13 |   return match_data
14 | 
15 | def elo_acc(acc,nxt):
16 |     w_elo = acc.get(nxt['winner'],1600)
17 |     l_elo = acc.get(nxt['loser'],1600)
18 |     Qw = 10**(w_elo/400)
19 |     Ql = 10**(l_elo/400)
20 |     Qt = Qw+Ql
21 |     acc[nxt['winner']] = round5(w_elo + 25*(1-(Qw/Qt)))
22 |     acc[nxt['loser']] = round5(l_elo - 25*(Ql/Qt))
23 |     return acc
24 | 
25 | def elo_comb(a,b):
26 |     a.update(b)
27 |     return a
28 | 
29 | if __name__ == "__main__":
30 |   sc = SparkContext(appName="TennisRatings")
31 |   text_files = sc.textFile("/path/to/my/data/wta_matches*")
32 |   xs = text_files.map(clean_match)\
33 |                  .aggregate({},elo_acc, elo_comb)
34 | 
35 |   for x in sorted(xs.items(), key=lambda x:x[1], reverse=True)[:20]:
36 |       print("{:<30}{}".format(*x))
37 | 


--------------------------------------------------------------------------------
/Ch10/decision_trees.py:
--------------------------------------------------------------------------------
 1 | from pyspark import SparkContext
 2 | from pyspark.sql import SparkSession
 3 | from functools import reduce
 4 | from pyspark.ml.feature import StringIndexer, VectorAssembler
 5 | from pyspark.ml.classification import DecisionTreeClassifier
 6 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
 7 | 
 8 | def string_to_index(df, label):
 9 |      return StringIndexer(inputCol=label,
10 |                           outputCol="i-"+label).fit(df) \
11 |                                                .transform(df)
12 | 
13 | if __name__ == "__main__":
14 | 
15 |     spark = SparkSession.builder \
16 |                .master("local") \
17 |                .appName("Decision Trees") \
18 |                .getOrCreate()
19 | 
20 |     df = spark.read.csv("mushrooms.data", header=True, inferSchema=True)
21 | 
22 |     categories = ['cap-shape', 'cap-surface', 'cap-color']
23 |     df = reduce(string_to_index, categories, df)
24 | 
25 |     df = VectorAssembler(inputCols=["i-cap-shape","i-cap-surface", "i-cap-color"],
26 |                          outputCol="features").transform(df)
27 | 
28 |     df = StringIndexer(inputCol='edible?', outputCol='label').fit(df).transform(df)
29 | 
30 |     tree = DecisionTreeClassifier()
31 |     model = tree.fit(df)
32 |     #print(model.toDebugString)
33 | 
34 |     bce = BinaryClassificationEvaluator()
35 | 
36 |     auc = bce.evaluate(model.transform(df))
37 |     print("Decision Tree AUC: {:0.4f}".format(auc))
38 | 


--------------------------------------------------------------------------------
/Ch10/iris.csv:
--------------------------------------------------------------------------------
  1 | SepalLength,SepalWidth,PetalLength,PetalWidth,Name
  2 | 5.1,3.5,1.4,0.2,Iris-setosa
  3 | 4.9,3.0,1.4,0.2,Iris-setosa
  4 | 4.7,3.2,1.3,0.2,Iris-setosa
  5 | 4.6,3.1,1.5,0.2,Iris-setosa
  6 | 5.0,3.6,1.4,0.2,Iris-setosa
  7 | 5.4,3.9,1.7,0.4,Iris-setosa
  8 | 4.6,3.4,1.4,0.3,Iris-setosa
  9 | 5.0,3.4,1.5,0.2,Iris-setosa
 10 | 4.4,2.9,1.4,0.2,Iris-setosa
 11 | 4.9,3.1,1.5,0.1,Iris-setosa
 12 | 5.4,3.7,1.5,0.2,Iris-setosa
 13 | 4.8,3.4,1.6,0.2,Iris-setosa
 14 | 4.8,3.0,1.4,0.1,Iris-setosa
 15 | 4.3,3.0,1.1,0.1,Iris-setosa
 16 | 5.8,4.0,1.2,0.2,Iris-setosa
 17 | 5.7,4.4,1.5,0.4,Iris-setosa
 18 | 5.4,3.9,1.3,0.4,Iris-setosa
 19 | 5.1,3.5,1.4,0.3,Iris-setosa
 20 | 5.7,3.8,1.7,0.3,Iris-setosa
 21 | 5.1,3.8,1.5,0.3,Iris-setosa
 22 | 5.4,3.4,1.7,0.2,Iris-setosa
 23 | 5.1,3.7,1.5,0.4,Iris-setosa
 24 | 4.6,3.6,1.0,0.2,Iris-setosa
 25 | 5.1,3.3,1.7,0.5,Iris-setosa
 26 | 4.8,3.4,1.9,0.2,Iris-setosa
 27 | 5.0,3.0,1.6,0.2,Iris-setosa
 28 | 5.0,3.4,1.6,0.4,Iris-setosa
 29 | 5.2,3.5,1.5,0.2,Iris-setosa
 30 | 5.2,3.4,1.4,0.2,Iris-setosa
 31 | 4.7,3.2,1.6,0.2,Iris-setosa
 32 | 4.8,3.1,1.6,0.2,Iris-setosa
 33 | 5.4,3.4,1.5,0.4,Iris-setosa
 34 | 5.2,4.1,1.5,0.1,Iris-setosa
 35 | 5.5,4.2,1.4,0.2,Iris-setosa
 36 | 4.9,3.1,1.5,0.1,Iris-setosa
 37 | 5.0,3.2,1.2,0.2,Iris-setosa
 38 | 5.5,3.5,1.3,0.2,Iris-setosa
 39 | 4.9,3.1,1.5,0.1,Iris-setosa
 40 | 4.4,3.0,1.3,0.2,Iris-setosa
 41 | 5.1,3.4,1.5,0.2,Iris-setosa
 42 | 5.0,3.5,1.3,0.3,Iris-setosa
 43 | 4.5,2.3,1.3,0.3,Iris-setosa
 44 | 4.4,3.2,1.3,0.2,Iris-setosa
 45 | 5.0,3.5,1.6,0.6,Iris-setosa
 46 | 5.1,3.8,1.9,0.4,Iris-setosa
 47 | 4.8,3.0,1.4,0.3,Iris-setosa
 48 | 5.1,3.8,1.6,0.2,Iris-setosa
 49 | 4.6,3.2,1.4,0.2,Iris-setosa
 50 | 5.3,3.7,1.5,0.2,Iris-setosa
 51 | 5.0,3.3,1.4,0.2,Iris-setosa
 52 | 7.0,3.2,4.7,1.4,Iris-versicolor
 53 | 6.4,3.2,4.5,1.5,Iris-versicolor
 54 | 6.9,3.1,4.9,1.5,Iris-versicolor
 55 | 5.5,2.3,4.0,1.3,Iris-versicolor
 56 | 6.5,2.8,4.6,1.5,Iris-versicolor
 57 | 5.7,2.8,4.5,1.3,Iris-versicolor
 58 | 6.3,3.3,4.7,1.6,Iris-versicolor
 59 | 4.9,2.4,3.3,1.0,Iris-versicolor
 60 | 6.6,2.9,4.6,1.3,Iris-versicolor
 61 | 5.2,2.7,3.9,1.4,Iris-versicolor
 62 | 5.0,2.0,3.5,1.0,Iris-versicolor
 63 | 5.9,3.0,4.2,1.5,Iris-versicolor
 64 | 6.0,2.2,4.0,1.0,Iris-versicolor
 65 | 6.1,2.9,4.7,1.4,Iris-versicolor
 66 | 5.6,2.9,3.6,1.3,Iris-versicolor
 67 | 6.7,3.1,4.4,1.4,Iris-versicolor
 68 | 5.6,3.0,4.5,1.5,Iris-versicolor
 69 | 5.8,2.7,4.1,1.0,Iris-versicolor
 70 | 6.2,2.2,4.5,1.5,Iris-versicolor
 71 | 5.6,2.5,3.9,1.1,Iris-versicolor
 72 | 5.9,3.2,4.8,1.8,Iris-versicolor
 73 | 6.1,2.8,4.0,1.3,Iris-versicolor
 74 | 6.3,2.5,4.9,1.5,Iris-versicolor
 75 | 6.1,2.8,4.7,1.2,Iris-versicolor
 76 | 6.4,2.9,4.3,1.3,Iris-versicolor
 77 | 6.6,3.0,4.4,1.4,Iris-versicolor
 78 | 6.8,2.8,4.8,1.4,Iris-versicolor
 79 | 6.7,3.0,5.0,1.7,Iris-versicolor
 80 | 6.0,2.9,4.5,1.5,Iris-versicolor
 81 | 5.7,2.6,3.5,1.0,Iris-versicolor
 82 | 5.5,2.4,3.8,1.1,Iris-versicolor
 83 | 5.5,2.4,3.7,1.0,Iris-versicolor
 84 | 5.8,2.7,3.9,1.2,Iris-versicolor
 85 | 6.0,2.7,5.1,1.6,Iris-versicolor
 86 | 5.4,3.0,4.5,1.5,Iris-versicolor
 87 | 6.0,3.4,4.5,1.6,Iris-versicolor
 88 | 6.7,3.1,4.7,1.5,Iris-versicolor
 89 | 6.3,2.3,4.4,1.3,Iris-versicolor
 90 | 5.6,3.0,4.1,1.3,Iris-versicolor
 91 | 5.5,2.5,4.0,1.3,Iris-versicolor
 92 | 5.5,2.6,4.4,1.2,Iris-versicolor
 93 | 6.1,3.0,4.6,1.4,Iris-versicolor
 94 | 5.8,2.6,4.0,1.2,Iris-versicolor
 95 | 5.0,2.3,3.3,1.0,Iris-versicolor
 96 | 5.6,2.7,4.2,1.3,Iris-versicolor
 97 | 5.7,3.0,4.2,1.2,Iris-versicolor
 98 | 5.7,2.9,4.2,1.3,Iris-versicolor
 99 | 6.2,2.9,4.3,1.3,Iris-versicolor
100 | 5.1,2.5,3.0,1.1,Iris-versicolor
101 | 5.7,2.8,4.1,1.3,Iris-versicolor
102 | 6.3,3.3,6.0,2.5,Iris-virginica
103 | 5.8,2.7,5.1,1.9,Iris-virginica
104 | 7.1,3.0,5.9,2.1,Iris-virginica
105 | 6.3,2.9,5.6,1.8,Iris-virginica
106 | 6.5,3.0,5.8,2.2,Iris-virginica
107 | 7.6,3.0,6.6,2.1,Iris-virginica
108 | 4.9,2.5,4.5,1.7,Iris-virginica
109 | 7.3,2.9,6.3,1.8,Iris-virginica
110 | 6.7,2.5,5.8,1.8,Iris-virginica
111 | 7.2,3.6,6.1,2.5,Iris-virginica
112 | 6.5,3.2,5.1,2.0,Iris-virginica
113 | 6.4,2.7,5.3,1.9,Iris-virginica
114 | 6.8,3.0,5.5,2.1,Iris-virginica
115 | 5.7,2.5,5.0,2.0,Iris-virginica
116 | 5.8,2.8,5.1,2.4,Iris-virginica
117 | 6.4,3.2,5.3,2.3,Iris-virginica
118 | 6.5,3.0,5.5,1.8,Iris-virginica
119 | 7.7,3.8,6.7,2.2,Iris-virginica
120 | 7.7,2.6,6.9,2.3,Iris-virginica
121 | 6.0,2.2,5.0,1.5,Iris-virginica
122 | 6.9,3.2,5.7,2.3,Iris-virginica
123 | 5.6,2.8,4.9,2.0,Iris-virginica
124 | 7.7,2.8,6.7,2.0,Iris-virginica
125 | 6.3,2.7,4.9,1.8,Iris-virginica
126 | 6.7,3.3,5.7,2.1,Iris-virginica
127 | 7.2,3.2,6.0,1.8,Iris-virginica
128 | 6.2,2.8,4.8,1.8,Iris-virginica
129 | 6.1,3.0,4.9,1.8,Iris-virginica
130 | 6.4,2.8,5.6,2.1,Iris-virginica
131 | 7.2,3.0,5.8,1.6,Iris-virginica
132 | 7.4,2.8,6.1,1.9,Iris-virginica
133 | 7.9,3.8,6.4,2.0,Iris-virginica
134 | 6.4,2.8,5.6,2.2,Iris-virginica
135 | 6.3,2.8,5.1,1.5,Iris-virginica
136 | 6.1,2.6,5.6,1.4,Iris-virginica
137 | 7.7,3.0,6.1,2.3,Iris-virginica
138 | 6.3,3.4,5.6,2.4,Iris-virginica
139 | 6.4,3.1,5.5,1.8,Iris-virginica
140 | 6.0,3.0,4.8,1.8,Iris-virginica
141 | 6.9,3.1,5.4,2.1,Iris-virginica
142 | 6.7,3.1,5.6,2.4,Iris-virginica
143 | 6.9,3.1,5.1,2.3,Iris-virginica
144 | 5.8,2.7,5.1,1.9,Iris-virginica
145 | 6.8,3.2,5.9,2.3,Iris-virginica
146 | 6.7,3.3,5.7,2.5,Iris-virginica
147 | 6.7,3.0,5.2,2.3,Iris-virginica
148 | 6.3,2.5,5.0,1.9,Iris-virginica
149 | 6.5,3.0,5.2,2.0,Iris-virginica
150 | 6.2,3.4,5.4,2.3,Iris-virginica
151 | 5.9,3.0,5.1,1.8,Iris-virginica
152 | 


--------------------------------------------------------------------------------
/Ch10/random_forest.py:
--------------------------------------------------------------------------------
 1 | from pyspark import SparkContext
 2 | from pyspark.sql import SparkSession
 3 | from functools import reduce
 4 | from pyspark.ml.feature import StringIndexer, VectorAssembler
 5 | from pyspark.ml.classification import RandomForestClassifier
 6 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
 7 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 8 | 
 9 | def string_to_index(df, label):
10 |      return StringIndexer(inputCol=label,
11 |                           outputCol="i-"+label).fit(df) \
12 |                                                .transform(df)
13 | 
14 | if __name__ == "__main__":
15 | 
16 |     spark = SparkSession.builder \
17 |                .master("local") \
18 |                .appName("Random Forests") \
19 |                .getOrCreate()
20 | 
21 |     bce = BinaryClassificationEvaluator()
22 | 
23 |     forest = RandomForestClassifier()
24 |     df = spark.read.csv("mushrooms.data", header=True, inferSchema=True)
25 | 
26 |     categories = df.columns
27 |     categories.pop(categories.index('edible?'))
28 |     df = reduce(string_to_index, categories, df)
29 |     indexes = ["i-"+c for c in categories]
30 |     df = VectorAssembler(inputCols=indexes,
31 |                          outputCol="features").transform(df)
32 |     df = StringIndexer(inputCol='edible?',
33 |                        outputCol='label').fit(df).transform(df)
34 | 
35 |     grid = ParamGridBuilder().addGrid(forest.maxDepth, [0, 2]).build()
36 |     cv = CrossValidator(estimator=forest, estimatorParamMaps=grid,
37 |                             evaluator=bce,numFolds=10,
38 |                             parallelism=4)
39 |     cv_model = cv.fit(df)
40 |     area_under_curve = bce.evaluate(cv_model.transform(df))
41 |     print("Random Forest AUC: {:0.4f}".format(area_under_curve))
42 |     print(cv_model.bestModel.toDebugString)
43 | 


--------------------------------------------------------------------------------
/Ch11/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch11/.gitkeep


--------------------------------------------------------------------------------
/Ch11/s3_upload.py:
--------------------------------------------------------------------------------
 1 | import boto3 as aws
 2 | import os.path
 3 | from functools import partial
 4 | from glob import iglob
 5 | 
 6 | def upload_file(fp, bucket):
 7 |     _, file_name = os.path.split(fp)
 8 |     s3 = aws.client("s3",
 9 |         aws_access_key_id = "YOURACCESSKEYID",
10 |         aws_secret_access_key = "YOURSECRETACCESSKEY"
11 |     )
12 |     response = s3.upload_file(fp, bucket, file_name)
13 |     return file_name, response
14 | 
15 | if __name__ == "__main__":
16 |     fs = iglob("/path/to/data/files/*")
17 |     uploads = map(partial(upload_file, bucket="your-backet-name"), fs)
18 |     for file_name, _ in uploads :
19 |         print(file_name)
20 | 


--------------------------------------------------------------------------------
/Ch12/crashes_nb.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import sys
 3 | from pyspark import SparkContext
 4 | from pyspark.sql import SparkSession
 5 | from functools import reduce
 6 | from pyspark.ml.feature import StringIndexer, VectorAssembler
 7 | from pyspark.ml.classification import NaiveBayes
 8 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator
 9 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
10 | 
11 | def string_to_index(df, label):
12 |      return StringIndexer(inputCol=label,
13 |                           outputCol="i-"+label).fit(df) \
14 |                                                .transform(df)
15 | 
16 | def group_crashes(x):
17 |     if int(x['Number of Vehicles Involved']) > 3:
18 |         x['Number of Vehicles Involved'] = "3"
19 |     return x
20 | 
21 | def improve_times(x):
22 |     time = x['Time']
23 |     if time < "5:00":
24 |         x['Time'] = "Early morning"
25 |     elif time < "7:00":
26 |         x['Time'] = "Morning"
27 |     elif time < "9:00":
28 |         x['Time'] = "Morning commute"
29 |     elif time < "12:00":
30 |         x['Time'] = "Late morning"
31 |     elif time < "16:00":
32 |         x['Time'] = "Afternoon"
33 |     elif time < "18:30":
34 |         x['Time'] = "Evening commute"
35 |     elif time < "22:00":
36 |         x['Time'] = "Evening"
37 |     else:
38 |         x['Time'] = "Late night"
39 |     return x
40 | 
41 | if __name__ == "__main__":
42 | 
43 |     sc = SparkContext(appName="Crash counts")
44 |     spark = SparkSession.builder \
45 |                .master("local") \
46 |                .getOrCreate()
47 | 
48 |     mce = MulticlassClassificationEvaluator()
49 | 
50 |     nb = NaiveBayes()
51 |     # read in lines to RDD
52 |     crashes = sc.textFile(sys.argv[1])
53 |     xs = crashes.flatMap(lambda x:x.split('\n')) \
54 |                 .map(json.loads) \
55 |                 .map(group_crashes) \
56 |                 .map(improve_times)
57 | 
58 |     # conver to DF
59 |     df = spark.createDataFrame(xs)
60 | 
61 |     feature_labels = df.columns
62 |     feature_labels.pop(feature_labels.index('Number of Vehicles Involved'))
63 |     df = reduce(string_to_index, feature_labels, df)
64 |     indexes = ["i-"+f for f in feature_labels]
65 | 
66 |     df = VectorAssembler(inputCols=indexes,
67 |                          outputCol="features").transform(df)
68 | 
69 |     df = StringIndexer(inputCol='Number of Vehicles Involved',
70 |                        outputCol='label').fit(df).transform(df)
71 | 
72 |     grid = ParamGridBuilder().addGrid(nb.smoothing, [1.0, 1.5]) \
73 |                              .build()
74 | 
75 |     cv = CrossValidator(estimator=nb, estimatorParamMaps=grid,
76 |                             evaluator=mce,numFolds=5,
77 |                             parallelism=4)
78 |     cv_model = cv.fit(df)
79 |     transformed = cv_model.transform(df)
80 |     f1 = mce.evaluate(transformed)
81 |     print("NB F1: {:0.4f}".format(f1))
82 |     cv_model.bestModel.save(sys.argv[2])
83 | 


--------------------------------------------------------------------------------
/Ch12/emr-script-example.sh:
--------------------------------------------------------------------------------
1 | aws emr add-steps --cluster-id j-1EN18B2OUXEN5 --bootstrap-actions Args=['pip install boto3'] --steps Type=spark,Name=CrashNB,Args=[--deploy-mode,cluster,--master,yarn,s3://scorpion-elastic-jobs/spark_bayes.py],ActionOnFailure=CONTINUE
2 | 


--------------------------------------------------------------------------------
/Ch12/emr_crash_counts.sh:
--------------------------------------------------------------------------------
1 | python mrjob_crash_counts.py \
2 |        -r emr s3://your-bucket-name-here/ \
3 |        --output-dir=s3://your-bucket-name-here/crash-counts 
4 |        --conf-path=</path/to/your/config/file.conf>
5 | 


--------------------------------------------------------------------------------
/Ch12/mrjob_crash_counts.py:
--------------------------------------------------------------------------------
 1 | from mrjob.job import MRJob
 2 | import json
 3 | 
 4 | class MRWordFrequencyCount(MRJob):
 5 | 
 6 |     def mapper(self, _, line):
 7 |         j = json.loads(line)
 8 |         vehicles = j['Number of Vehicles Involved']
 9 |         yield vehicles, 1
10 | 
11 |     def reducer(self, key, values):
12 |         yield key, sum(values)
13 | 
14 | 
15 | if __name__ == '__main__':
16 |     MRWordFrequencyCount.run()
17 | 


--------------------------------------------------------------------------------
/Ch12/mrjob_emr_nb.sh:
--------------------------------------------------------------------------------
1 | python3 mrspark_bayes.py \
2 |         -r emr \
3 |         s3://scorpion-nys-crashes/ \
4 |         > s3://scorpion-spark-outputs/nb-model
5 | 


--------------------------------------------------------------------------------
/Ch12/mrspark_bayes.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from functools import reduce
 3 | import boto3 as aws
 4 | from mrjob.job import MRJob
 5 | 
 6 | def string_to_index(df, label):
 7 |      return StringIndexer(inputCol=label,
 8 |                           outputCol="i-"+label).fit(df) \
 9 |                                                .transform(df)
10 | 
11 | def group_crashes(x):
12 |     if int(x['Number of Vehicles Involved']) > 3:
13 |         x['Number of Vehicles Involved'] = "3"
14 |     return x
15 | 
16 | def improve_times(x):
17 |     time = x['Time']
18 |     if time < "5:00":
19 |         x['Time'] = "Early morning"
20 |     elif time < "7:00":
21 |         x['Time'] = "Morning"
22 |     elif time < "9:00":
23 |         x['Time'] = "Morning commute"
24 |     elif time < "12:00":
25 |         x['Time'] = "Late morning"
26 |     elif time < "16:00":
27 |         x['Time'] = "Afternoon"
28 |     elif time < "18:30":
29 |         x['Time'] = "Evening commute"
30 |     elif time < "22:00":
31 |         x['Time'] = "Evening"
32 |     else:
33 |         x['Time'] = "Late night"
34 |     return x
35 | 
36 | class MRSparkBayes(MRJob):
37 | 
38 |     def spark(self, _, output_path):
39 |         from pyspark import SparkContext
40 |         from pyspark.sql import SparkSession
41 |         from pyspark.ml.feature import StringIndexer, VectorAssembler
42 |         from pyspark.ml.classification import NaiveBayes
43 |         from pyspark.ml.evaluation import MulticlassClassificationEvaluator
44 |         from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
45 | 
46 |         sc = SparkContext(appName="Crash model")
47 |         spark = SparkSession.builder \
48 |                           .master("local") \
49 |                           .getOrCreate()
50 | 
51 |         s3 = aws.resource("s3")
52 | 
53 |         bucket = s3.Bucket("scorpion-nys-crashes")
54 |         objects = [(obj.bucket_name, obj.key) for obj
55 |                                               in bucket.objects.all()]
56 |         xs = sc.parallelize(objects) \
57 |                .map(read_s3_object) \
58 |                .flatMap(lambda x:x.split("\n")) \
59 |                .filter(lambda x:x) \
60 |                .map(json.loads) \
61 |                .map(group_crashes) \
62 |                .map(improve_times)
63 | 
64 |         df = spark.createDataFrame(xs)
65 | 
66 |         feature_labels = df.columns
67 |         feature_labels.pop(feature_labels.index('Number of Vehicles Involved'))
68 |         df = reduce(string_to_index, feature_labels, df)
69 |         indexes = ["i-"+f for f in feature_labels]
70 | 
71 |         df = VectorAssembler(inputCols=indexes,
72 |                            outputCol="features").transform(df)
73 | 
74 |         df = StringIndexer(inputCol='Number of Vehicles Involved',
75 |                          outputCol='label').fit(df).transform(df)
76 | 
77 |         grid = ParamGridBuilder().addGrid(nb.smoothing, [1.0, 1.5]) \
78 |                                .build()
79 | 
80 | 
81 |         mce = MulticlassClassificationEvaluator()
82 |         nb = NaiveBayes()
83 |         cv = CrossValidator(estimator=nb, estimatorParamMaps=grid,
84 |                           evaluator=mce,numFolds=5,
85 |                           parallelism=4)
86 |         cv_model = cv.fit(df)
87 |         transformed = cv_model.transform(df)
88 |         f1 = mce.evaluate(transformed)
89 |         print("NB F1: {:0.4f}".format(f1))
90 |         #cv_model.bestModel.save("./nb-model")
91 |         cv_model.bestModel.save("./my-nb-model-s3")
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     MRSparkBayes.run()
96 | 


--------------------------------------------------------------------------------
/Ch12/nb_on_emr.sh:
--------------------------------------------------------------------------------
1 | aws emr add-steps --cluster-id j-2434JDJSLG768 --steps Type=spark,Name=SparkCrashesNB,Args=[--deploy-mode,cluster,--master,yarn,--conf,spark.yarn.submit.waitAppCompletion=false,--num-executors,2,--executor-cores,1,--executor-memory,10g,s3://scorpion-elastic-jobs/crashes_nb.py,s3://scorpion-nys-crashes/,s3://scorpion-spark-outputs/],ActionOnFailure=CONTINUE
2 | 


--------------------------------------------------------------------------------
/Ch12/spark_bayes.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import boto3 as aws
 3 | from functools import reduce
 4 | from pyspark import SparkContext
 5 | from pyspark.sql import SparkSession
 6 | from pyspark.ml.feature import StringIndexer, VectorAssembler
 7 | from pyspark.ml.classification import NaiveBayes
 8 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator
 9 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
10 | 
11 | def string_to_index(df, label):
12 |      return StringIndexer(inputCol=label,
13 |                           outputCol="i-"+label).fit(df) \
14 |                                                .transform(df)
15 | 
16 | def group_crashes(x):
17 |     if int(x['Number of Vehicles Involved']) > 3:
18 |         x['Number of Vehicles Involved'] = "3"
19 |     return x
20 | 
21 | def improve_times(x):
22 |     time = x['Time']
23 |     if time < "5:00":
24 |         x['Time'] = "Early morning"
25 |     elif time < "7:00":
26 |         x['Time'] = "Morning"
27 |     elif time < "9:00":
28 |         x['Time'] = "Morning commute"
29 |     elif time < "12:00":
30 |         x['Time'] = "Late morning"
31 |     elif time < "16:00":
32 |         x['Time'] = "Afternoon"
33 |     elif time < "18:30":
34 |         x['Time'] = "Evening commute"
35 |     elif time < "22:00":
36 |         x['Time'] = "Evening"
37 |     else:
38 |         x['Time'] = "Late night"
39 |     return x
40 | 
41 | def read_s3_object(x):
42 |     s3 = aws.resource("s3")
43 |     obj = s3.Object(x[0], x[1])
44 |     return obj.get()['Body'].read().decode('ascii')
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     sc = SparkContext(appName="Crash model")
49 |     spark = SparkSession.builder \
50 |                       .master("local") \
51 |                       .getOrCreate()
52 | 
53 |     s3 = aws.resource("s3")
54 | 
55 |     bucket = s3.Bucket("s3://path/to/your/bucket")
56 |     objects = [(obj.bucket_name, obj.key) for obj
57 |                                           in bucket.objects.all()]
58 |     xs = sc.parallelize(objects) \
59 |            .map(read_s3_object) \
60 |            .flatMap(lambda x:x.split("\n")) \
61 |            .filter(lambda x:x) \
62 |            .map(json.loads) \
63 |            .map(group_crashes) \
64 |            .map(improve_times)
65 | 
66 |     df = spark.createDataFrame(xs)
67 | 
68 |     feature_labels = df.columns
69 |     feature_labels.pop(feature_labels.index('Number of Vehicles Involved'))
70 |     df = reduce(string_to_index, feature_labels, df)
71 |     indexes = ["i-"+f for f in feature_labels]
72 | 
73 |     df = VectorAssembler(inputCols=indexes,
74 |                        outputCol="features").transform(df)
75 | 
76 |     df = StringIndexer(inputCol='Number of Vehicles Involved',
77 |                      outputCol='label').fit(df).transform(df)
78 | 
79 |     grid = ParamGridBuilder().addGrid(nb.smoothing, [1.0, 1.5]) \
80 |                            .build()
81 | 
82 | 
83 |     mce = MulticlassClassificationEvaluator()
84 |     nb = NaiveBayes()
85 |     cv = CrossValidator(estimator=nb, estimatorParamMaps=grid,
86 |                       evaluator=mce,numFolds=5,
87 |                       parallelism=4)
88 |     cv_model = cv.fit(df)
89 |     transformed = cv_model.transform(df)
90 |     f1 = mce.evaluate(transformed)
91 |     print("NB F1: {:0.4f}".format(f1))
92 |     cv_model.bestModel.save("s3://path/to/your/bucket")
93 | 


--------------------------------------------------------------------------------
/Ch12/spark_mrjob.conf:
--------------------------------------------------------------------------------
1 | runners:
2 |   emr:
3 |     num_core_instances: 2
4 |     image_version: 5.24.0
5 |     instance_type: m1.large
6 |     region: us-east-1
7 |     tags:
8 |       project: Mastering Large Datasets
9 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Mastering Large Datasets with Python
 2 | =======
 3 | JT Wolohan, 2019
 4 | 
 5 | This repo is a companion to the book [Mastering Large Datasets with Python](https://www.manning.com/books/mastering-large-datasets-with-python).
 6 | 
 7 | In addition to the code found in the book, most chapters have accompanying Jupyter notebook examples.
 8 | 
 9 | <table>
10 | <tr>
11 | <td>
12 | <img width="300" src="https://images.manning.com/720/960/resize/book/3/b52c543-f569-4ea4-a6b0-2ab140b6a24c/Wolohan-MLD-MEAP-HI.png">
13 | </td>
14 | <td>
15 | <strong>Notebooks</strong>
16 | <ul>
17 |   <li>No notebook for Chapter 1</li>
18 |   <li><a href="notebooks/Ch02_notebook.ipynb">Chapter 2</a></li>
19 |   <li><a href="notebooks/Ch03_notebook.ipynb">Chapter 3</a></li>
20 |   <li><a href="notebooks/Ch04_notebook.ipynb">Chapter 4</a></li>
21 |   <li><a href="notebooks/Ch05_notebook.ipynb">Chapter 5</a></li>
22 |   <li><a href="notebooks/Ch06_notebook.ipynb">Chapter 6</a></li>
23 |   <li><a href="notebooks/Ch07_notebook.ipynb">Chapter 7</a></li>
24 |   <li>No notebook for Chapter 8</li>
25 |   <li><a href="notebooks/Ch09_notebook.ipynb">Chapter 9</a></li>
26 |   <li><a href="notebooks/Ch10_notebook.ipynb">Chapter 10</a></li>
27 |   <li><a href="notebooks/Ch11_notebook.ipynb">Chapter 11</a></li>
28 |   <li>No notebook for Chapter 12</li>
29 | </ul>
30 | </td>
31 | </tr>
32 | </table>
33 | 


--------------------------------------------------------------------------------
/notebooks/Ch02_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Chapter 2. Working with large datasets faster: parallelization and the map function\n",
  8 |     "====\n",
  9 |     "### Mastering Large Datasets with Python by JT Wolohan \n",
 10 |     "\n"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Listing 2.1 and 2.2 :: Formatting phone numbers with loops and maps"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": null,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "import re\n",
 27 |     "\n",
 28 |     "\n",
 29 |     "class PhoneFormatter:\n",
 30 |     "    def __init__(self):\n",
 31 |     "        self.r = re.compile(r\"\\d\")\n",
 32 |     "        \n",
 33 |     "    def pretty_format(self, phone_number):\n",
 34 |     "        numbers = self.r.findall(phone_number)\n",
 35 |     "        area_code = \"\".join(numbers[-10:-7])\n",
 36 |     "        first_3 = \"\".join(numbers[-7:-4])\n",
 37 |     "        last_4 = \"\".join(numbers[-4:len(numbers)])\n",
 38 |     "        return \"({}) {}-{}\".format(area_code, first_3, last_4)"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "metadata": {},
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "phone_numbers = [\n",
 48 |     "    \"(123) 456-7890\",\n",
 49 |     "    \"1234567890\",\n",
 50 |     "    \"123.456.7890\",\n",
 51 |     "    \"+1 123 456-7890\"\n",
 52 |     "]\n",
 53 |     "\n",
 54 |     "P = PhoneFormatter()"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "clean_numbers = []\n",
 64 |     "for phone_number in phone_numbers:\n",
 65 |     "    pretty = P.pretty_format(phone_number)\n",
 66 |     "    clean_numbers.append(pretty)\n",
 67 |     "print(clean_numbers)"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "print(list(map(P.pretty_format, phone_numbers)))"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "### Parallel blog processing"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "from datetime import date\n",
 93 |     "from urllib import request\n",
 94 |     "\n",
 95 |     "from multiprocessing import Pool\n",
 96 |     "\n",
 97 |     "def days_between(start,stop):\n",
 98 |     "  today = date(*start)\n",
 99 |     "  stop = date(*stop)\n",
100 |     "  while today < stop:\n",
101 |     "    datestr = today.strftime(\"%m-%d-%Y\")\n",
102 |     "    yield \"http://jtwolohan.com/arch-rival-blog/\"+datestr\n",
103 |     "    today = date.fromordinal(today.toordinal()+1)"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": [
112 |     "def get_url(path):\n",
113 |     "  return request.urlopen(path).read()\n",
114 |     "\n",
115 |     "with Pool() as P:\n",
116 |     "  blog_posts = P.map(get_url,days_between((2000,1,1),(2011,1,1)))"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "### Fizz Buzz - state and parallelization"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 51,
129 |    "metadata": {},
130 |    "outputs": [],
131 |    "source": [
132 |     "class FizzBuzzer:\n",
133 |     "  def __init__(self):\n",
134 |     "    self.n = 0\n",
135 |     "  def foo(self,_):\n",
136 |     "    self.n += 1\n",
137 |     "    if (self.n % 3)  == 0:\n",
138 |     "      x = \"buzz\"\n",
139 |     "    else: x = \"fizz\"\n",
140 |     "    print(x)\n",
141 |     "    return x"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "FB = FizzBuzzer()\n",
151 |     "for i in range(21):\n",
152 |     "  FB.foo(i)"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": null,
158 |    "metadata": {},
159 |    "outputs": [],
160 |    "source": [
161 |     "with Pool() as P:\n",
162 |     "    P.map(FB.foo, range(1,22))"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "### Wikipedia scraping"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 3,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "import json\n",
179 |     "from urllib import request, parse\n",
180 |     "from multiprocessing import Pool\n",
181 |     "from itertools import chain\n",
182 |     "import networkx as nx"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 8,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "def link_to_title(link):\n",
192 |     "  return link[\"title\"]"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": 9,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "def clean_if_key(page,key):\n",
202 |     "    if key in page.keys():\n",
203 |     "        return map(link_to_title,page[key])\n",
204 |     "    else: return []"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 20,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "def get_Wiki_links(pageTitle):\n",
214 |     "    safe_title = parse.quote(pageTitle)\n",
215 |     "    url = \"https://en.wikipedia.org/w/api.php?action=query&\\\n",
216 |     "prop=links|linkshere&pllimit=500&lhlimit=500&titles={}&\\\n",
217 |     "format=json&formatversion=2\".format(safe_title)\n",
218 |     "    page = request.urlopen(url).read().decode('utf-8')\n",
219 |     "    j = json.loads(page)\n",
220 |     "    jpage = j['query']['pages'][0]\n",
221 |     "    inbound = clean_if_key(jpage,\"links\")\n",
222 |     "    outbound = clean_if_key(jpage,\"linkshere\")\n",
223 |     "    return {\"title\": pageTitle,\n",
224 |     "            \"in-links\":list(inbound),\n",
225 |     "            \"out-links\":list(outbound)}"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 21,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "def flatten_network(page):\n",
235 |     "    return page[\"in-links\"]+page[\"out-links\"]"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 22,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "def page_to_edges(page):\n",
245 |     "    a = [(page['title'],p) for p in page['out-links']]\n",
246 |     "    b = [(p,page['title']) for p in page['in-links']]\n",
247 |     "    return a+b"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": 23,
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "root = get_Wiki_links(\"Parallel_computing\")\n",
257 |     "initial_network = flatten_network(root)\n",
258 |     "with Pool() as P:\n",
259 |     "    all_pages = P.map(get_Wiki_links, initial_network)\n",
260 |     "    edges = P.map(page_to_edges, all_pages)\n",
261 |     "edges = chain.from_iterable(edges)"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": null,
267 |    "metadata": {},
268 |    "outputs": [],
269 |    "source": [
270 |     "G = nx.DiGraph()\n",
271 |     "for e in edges:\n",
272 |     "    G.add_edge(*e)\n",
273 |     "nx.readwrite.gexf.write_gexf(G,\"./MyGraph.gexf\")"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "markdown",
278 |    "metadata": {},
279 |    "source": [
280 |     "[Read for more? Go to chapter 3!](./Ch03_notebook.ipynb)"
281 |    ]
282 |   }
283 |  ],
284 |  "metadata": {
285 |   "kernelspec": {
286 |    "display_name": "mldbook",
287 |    "language": "python",
288 |    "name": "mldbook"
289 |   },
290 |   "language_info": {
291 |    "codemirror_mode": {
292 |     "name": "ipython",
293 |     "version": 3
294 |    },
295 |    "file_extension": ".py",
296 |    "mimetype": "text/x-python",
297 |    "name": "python",
298 |    "nbconvert_exporter": "python",
299 |    "pygments_lexer": "ipython3",
300 |    "version": "3.5.3"
301 |   }
302 |  },
303 |  "nbformat": 4,
304 |  "nbformat_minor": 2
305 | }
306 | 


--------------------------------------------------------------------------------
/notebooks/Ch03_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Chapter 3. Function pipelines for mapping complex transformations\n",
  8 |     "====\n",
  9 |     "### Mastering Large Datasets with Python by JT Wolohan \n",
 10 |     "\n"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Hacker translation"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": null,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "import re\n",
 27 |     "from toolz.functoolz import pipe, compose\n",
 28 |     "\n",
 29 |     "sample_messages = [\n",
 30 |     "\"7his所is家4没s4mpl3动m3ss463\",\n",
 31 |     "\"don7家73ll经4nyon3法7his现m3ss463\",\n",
 32 |     "\"w3现4r3当b3in6进so好s3cr3t\",\n",
 33 |     "\"733小h33成h33去nobody看is天on分7o理us\",\n",
 34 |     "\"w3么will面n3v3r分637理c4u6ht\",\n",
 35 |     "\"w3事4r3经such没sn34ky天h4ckers\"]"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "def replace_7t(s):\n",
 45 |     "    return s.replace('7', 't')"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "def replace_3e(s):\n",
 55 |     "    return s.replace('3', 'e')"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "def replace_6g(s):\n",
 65 |     "    return s.replace('6', 'g')"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "def replace_4a(s):\n",
 75 |     "    return s.replace('4', 'a')"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "# Alternative approach\n",
 85 |     "# This function makes functions!\n",
 86 |     "def make_letter_replacer(letter_1, letter_2):\n",
 87 |     "    def replacer(s):\n",
 88 |     "        return s.replace(letter_1, letter_2)\n",
 89 |     "    return replacer\n",
 90 |     "\n",
 91 |     "alt_replace_7t = make_letter_replacer('7','t')\n",
 92 |     "alt_replace_7t = make_letter_replacer('3','e')\n",
 93 |     "alt_replace_7t = make_letter_replacer('6','g')\n",
 94 |     "alt_replace_7t = make_letter_replacer('4','a')"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "class chinese_matcher:\n",
104 |     "    def __init__(self):\n",
105 |     "        self.r = re.compile(r'[\\u4e00-\\u9fff]+')\n",
106 |     "        \n",
107 |     "    def sub_chinese(self,s):\n",
108 |     "        return self.r.sub(\" \",s)"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "C = chinese_matcher()\n",
118 |     "\n",
119 |     "# Not chained\n",
120 |     "print(list(\n",
121 |     "map( C.sub_chinese,\n",
122 |     "    map(replace_4a,\n",
123 |     "        map(replace_6g,\n",
124 |     "            map(replace_3e,\n",
125 |     "                map(replace_7t, sample_messages)))))),end=\"\\n\\n\")"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "# Option 1\n",
135 |     "hacker_translate = compose(C.sub_chinese, replace_4a, replace_6g,\n",
136 |     "                           replace_3e, replace_7t)\n",
137 |     "\n",
138 |     "print(list(map(hacker_translate, sample_messages)),end=\"\\n\\n\")"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "# Option 2\n",
148 |     "def hacker_translate(s):\n",
149 |     "    return pipe(s, replace_7t, replace_3e, replace_6g,\n",
150 |     "                   replace_4a, C.sub_chinese)\n",
151 |     "\n",
152 |     "print(list(map(hacker_translate,sample_messages)),end=\"\\n\\n\")"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "markdown",
157 |    "metadata": {},
158 |    "source": [
159 |     "### Twitter scraping and gender prediction"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "metadata": {},
166 |    "outputs": [],
167 |    "source": [
168 |     "from multiprocessing import Pool\n",
169 |     "from toolz import compose, pipe\n",
170 |     "import twitter\n",
171 |     "\n",
172 |     "# Remember to fill in the values below with your own account details\n",
173 |     "Twitter = twitter.Api(consumer_key=\"\",\n",
174 |     "                      consumer_secret=\"\",\n",
175 |     "                      access_token_key=\"\",\n",
176 |     "                      access_token_secret=\"\")"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": null,
182 |    "metadata": {},
183 |    "outputs": [],
184 |    "source": [
185 |     "def get_tweet_from_id(tweet_id, api=Twitter):\n",
186 |     "    return api.GetStatus(tweet_id, trim_user=True)\n",
187 |     "\n",
188 |     "\n",
189 |     "def tweet_to_text(tweet):\n",
190 |     "    return tweet.text\n",
191 |     "\n",
192 |     "\n",
193 |     "def tokenize_text(text):\n",
194 |     "    return text.split()"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "def score_text(tokens):\n",
204 |     "    words = {\"the\":1, \"to\":1, \"and\":1, #Words with 1 indicate men\n",
205 |     "             \"in\":1, \"have\":1, \"it\":1,\n",
206 |     "             \"be\":-1, \"of\":-1, \"a\":-1, # Words with -1 indicate women\n",
207 |     "             \"that\":-1, \"i\":-1, \"for\":-1}\n",
208 |     "    return sum(map(lambda x: words.get(x, 0), tokens))\n",
209 |     "\n",
210 |     "\n",
211 |     "def score_tweet(tweet_id):\n",
212 |     "    return pipe(tweet_id, get_tweet_from_id, tweet_to_text,\n",
213 |     "                          tokenize_text, score_text)\n",
214 |     "\n",
215 |     "\n",
216 |     "def score_user(tweets):\n",
217 |     "    N = len(tweets)\n",
218 |     "    total = sum(map(score_tweet, tweets))\n",
219 |     "    return total/N\n"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": [
228 |     "def categorize_user(user_score):\n",
229 |     "    if user_score > 0:\n",
230 |     "        return {\"score\":user_score,\n",
231 |     "                \"gender\": \"Male\"}\n",
232 |     "    return {\"score\":user_score,\n",
233 |     "            \"gender\":\"Female\"}"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "users_tweets = [\n",
243 |     "[1056365937547534341, 1056310126255034368, 1055985345341251584,\n",
244 |     "1056585873989394432, 1056585871623966720],\n",
245 |     "[1055986452612419584, 1056318330037002240, 1055957256162942977,\n",
246 |     " 1056585921154420736, 1056585896898805766],\n",
247 |     "[1056240773572771841, 1056184836900175874, 1056367465477951490,\n",
248 |     " 1056585972765224960, 1056585968155684864],\n",
249 |     "[1056452187897786368, 1056314736546115584, 1055172336062816258,\n",
250 |     " 1056585983175602176, 1056585980881207297]]\n",
251 |     "gender_prediction_pipeline = compose(categorize_user, score_user)\n",
252 |     "with Pool() as P:\n",
253 |     "    print(P.map(gender_prediction_pipeline, users_tweets))\n"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "markdown",
258 |    "metadata": {},
259 |    "source": [
260 |     "[Read for more? Go to chapter 4!](./Ch04_notebook.ipynb)"
261 |    ]
262 |   }
263 |  ],
264 |  "metadata": {
265 |   "kernelspec": {
266 |    "display_name": "mldbook",
267 |    "language": "python",
268 |    "name": "mldbook"
269 |   },
270 |   "language_info": {
271 |    "codemirror_mode": {
272 |     "name": "ipython",
273 |     "version": 3
274 |    },
275 |    "file_extension": ".py",
276 |    "mimetype": "text/x-python",
277 |    "name": "python",
278 |    "nbconvert_exporter": "python",
279 |    "pygments_lexer": "ipython3",
280 |    "version": "3.5.3"
281 |   }
282 |  },
283 |  "nbformat": 4,
284 |  "nbformat_minor": 2
285 | }
286 | 


--------------------------------------------------------------------------------
/notebooks/Ch04_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Chapter 4. Processing large datasets with lazy workflows\n",
  8 |     "====\n",
  9 |     "### Mastering Large Datasets with Python by JT Wolohan \n",
 10 |     "\n"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Poem Puzzle\n",
 18 |     "Remember to run the poem generation script before you run this code!"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": null,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "import toolz\n",
 28 |     "import re, itertools\n",
 29 |     "from glob import iglob"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "def word_ratio(d):\n",
 39 |     "    \"\"\"This helper function returns the ratio of a's to the's\"\"\"\n",
 40 |     "    return float(d.get(\"a\",0))/float(d.get(\"the\",0.0001))"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "Class PoemCleaner:\n",
 50 |     "    def __init__(self):\n",
 51 |     "        self.r = re.compile(r'[.,;:!-]')\n",
 52 |     "\n",
 53 |     "    def clean_poem(self, fp):\n",
 54 |     "        \"\"\"This helper function opens a poem at a filepath and returns a clean poem.\n",
 55 |     "\n",
 56 |     "        A clean poem will be a punctuation-less sequence of lowercase words, in\n",
 57 |     "        the order that the author of the poem placed them.\n",
 58 |     "        \"\"\"\n",
 59 |     "        with open(fp) as poem:\n",
 60 |     "            no_punc = self.r.sub(\"\",poem.read())\n",
 61 |     "            return no_punc.lower().split()"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "def word_is_desired(w):\n",
 71 |     "    \"\"\"This helper function detects whether a word is \"a\" or \"the\".\n",
 72 |     "\n",
 73 |     "    It is designed to be used in conjunction with filter to filter a sequence\n",
 74 |     "    of words down to just definite and indefinite articles.\n",
 75 |     "    \"\"\"\n",
 76 |     "    if w in [\"a\",\"the\"]:\n",
 77 |     "        return True\n",
 78 |     "    else:\n",
 79 |     "        return False"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "def analyze_poems(poems, cleaner):\n",
 89 |     "    return word_ratio(\n",
 90 |     "        toolz.frequencies(\n",
 91 |     "            filter(word_is_desired,\n",
 92 |     "                itertools.chain(*map(cleaner.clean_poem, poems)))))"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "Cleaner = PoemCleaner()\n",
102 |     "author_a_poems = iglob(\"author_a/*.txt\")\n",
103 |     "author_b_poems = iglob(\"author_b/*.txt\")\n",
104 |     "\n",
105 |     "author_a_ratio = analyze_poems(author_a_poems, Cleaner)\n",
106 |     "author_b_ratio = analyze_poems(author_b_poems, Cleaner)\n",
107 |     "\n",
108 |     "print(\"\"\"\n",
109 |     "Original_Poem:  0.3\n",
110 |     "Author A:     {:.2f}\n",
111 |     "Author B:     {:.2f}\n",
112 |     "\"\"\".format(author_a_ratio, author_b_ratio))"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "markdown",
117 |    "metadata": {},
118 |    "source": [
119 |     "### Fishing village simulation"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "import random, itertools\n",
129 |     "from operator import methodcaller"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "class Village:\n",
139 |     "  def __init__(self):\n",
140 |     "    self.population = random.uniform(1000,5000)\n",
141 |     "    self.cheat_rate = random.uniform(.05,.15)\n",
142 |     "\n",
143 |     "  def update(self, sim):\n",
144 |     "    if sim.cheaters >= 2:\n",
145 |     "      self.cheat_rate += .05\n",
146 |     "    self.population = int(self.population*1.025)\n",
147 |     "\n",
148 |     "  def go_fishing(self):\n",
149 |     "    if random.uniform(0,1) < self.cheat_rate:\n",
150 |     "      cheat = 1\n",
151 |     "      fish_taken = self.population * 2\n",
152 |     "    else:\n",
153 |     "      cheat = 0\n",
154 |     "      fish_taken = self.population * 1\n",
155 |     "    return fish_taken, cheat"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "class LakeSimulation:\n",
165 |     "  def __init__(self):\n",
166 |     "    self.villages = [Village() for _ in range(4)]\n",
167 |     "    self.fish = 80000\n",
168 |     "    self.year = 1\n",
169 |     "    self.cheaters = 0\n",
170 |     "\n",
171 |     "  def simulate(self):\n",
172 |     "    for _ in itertools.count():\n",
173 |     "        yearly_results = map(methodcaller(\"go_fishing\"), self.villages)\n",
174 |     "        fishs, cheats = zip(*yearly_results)\n",
175 |     "        total_fished = sum(fishs)\n",
176 |     "        self.cheaters = sum(cheats)\n",
177 |     "        if self.year > 1000:\n",
178 |     "            print(\"Wow! Your villages lasted 1000 years!\")\n",
179 |     "            break\n",
180 |     "        if self.fish < total_fished:\n",
181 |     "            print(\"The lake was overfished in {} years.\".format(self.year))\n",
182 |     "            break\n",
183 |     "        else:\n",
184 |     "            self.fish = (self.fish-total_fished)* 1.15\n",
185 |     "            map(methodcaller(\"update\"), self.villages)\n",
186 |     "            print(\"Year {:<5}   Fish: {}\".format(self.year,\n",
187 |     "                                                 int(self.fish)))\n",
188 |     "            self.year += 1"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "metadata": {},
195 |    "outputs": [],
196 |    "source": [
197 |     "random.seed(\"Wolohan\")\n",
198 |     "Lake = LakeSimulation()\n",
199 |     "Lake.simulate()"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "[Read for more? Go to chapter 5!](./Ch05_notebook.ipynb)"
207 |    ]
208 |   }
209 |  ],
210 |  "metadata": {
211 |   "kernelspec": {
212 |    "display_name": "mldbook",
213 |    "language": "python",
214 |    "name": "mldbook"
215 |   },
216 |   "language_info": {
217 |    "codemirror_mode": {
218 |     "name": "ipython",
219 |     "version": 3
220 |    },
221 |    "file_extension": ".py",
222 |    "mimetype": "text/x-python",
223 |    "name": "python",
224 |    "nbconvert_exporter": "python",
225 |    "pygments_lexer": "ipython3",
226 |    "version": "3.5.3"
227 |   }
228 |  },
229 |  "nbformat": 4,
230 |  "nbformat_minor": 2
231 | }
232 | 


--------------------------------------------------------------------------------
/notebooks/Ch05_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Chapter 5. Accumulation operations with Reduce\n",
  8 |     "====\n",
  9 |     "### Mastering Large Datasets with Python by JT Wolohan \n",
 10 |     "\n"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Early chapter functions: Frequnecy and filter"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": null,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "from functools import reduce\n",
 27 |     "\n",
 28 |     "xs = [1, 2, 3, 4, 5, 6, 7, 8, 9]\n",
 29 |     "\n",
 30 |     "def keep_if_even(acc, nxt):\n",
 31 |     "    if nxt % 2 == 0:\n",
 32 |     "        return acc + [nxt]\n",
 33 |     "    else:\n",
 34 |     "        return acc\n",
 35 |     "\n",
 36 |     "\n",
 37 |     "reduce(keep_if_even, xs, [])\n"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "from functools import reduce\n",
 47 |     "\n",
 48 |     "xs = [\"A\", \"B\", \"C\", \"A\", \"A\", \"C\", \"A\"]\n",
 49 |     "ys = [1, 3, 6, 1, 2, 9, 3, 12]\n",
 50 |     "\n",
 51 |     "\n",
 52 |     "def make_counts(acc, nxt):\n",
 53 |     "    acc[nxt] = acc.get(nxt, 0) + 1\n",
 54 |     "    return acc\n",
 55 |     "\n",
 56 |     "\n",
 57 |     "def my_frequencies(xs):\n",
 58 |     "    return reduce(make_counts, xs, {})\n",
 59 |     "\n",
 60 |     "\n",
 61 |     "print(my_frequencies(xs))\n",
 62 |     "print(my_frequencies(ys))\n",
 63 |     "print(my_frequencies(\"mississippi\"))"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "### Analyzing car trends with reduce"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "**SCENARIO:  CHANGING  CAR  TRENDS** *Your  customer  is  a used  car  dealer.  They  have data on cars that they’ve bought and sold in the last 6 months and are hoping you can help them find what type of used cars they make the most profit on. One salesman believes that its high fuel-efficiency cars (those that get more than 35 miles per gallon) that make the most  money,  while  another believe  that  medium-mileage  cars  (between  60,000  and 100,000 miles) result in the highest average profit on resale. Given a CSV file with a variety of attributes about some used cars, write a script to find the average profit on cars of low (<18 mpg), medium (18-35 mpg) and high (>35) fuel-efficiency as well as low (<60,000), medium (60,000-100,000), and high mileage (>100,000) and settle the debate.*"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": null,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "from functools import reduce\n",
 87 |     "\n",
 88 |     "def low_med_hi(d, k, breaks):\n",
 89 |     "    if float(d[k]) < breaks[0]:\n",
 90 |     "        return \"low\"\n",
 91 |     "    elif float(d[k]) < breaks[1]:\n",
 92 |     "        return \"medium\"\n",
 93 |     "    else:\n",
 94 |     "        return \"high\""
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "def clean_entry(d):\n",
104 |     "    r = {'profit':None, 'mpg':None, 'odo':None}\n",
105 |     "    r['profit'] = float(d.get(\"price-sell\", 0)) - float(d.get(\"price-buy\", 0))\n",
106 |     "    r['mpg'] = low_med_hi(d, 'mpg', (18, 35))\n",
107 |     "    r['odo'] = low_med_hi(d, 'odo', (60000, 105000))\n",
108 |     "    return r"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "def acc_average(acc, profit):\n",
118 |     "    acc['total'] = acc.get('total', 0) + profit\n",
119 |     "    acc['count'] = acc.get('count', 0) + 1\n",
120 |     "    acc['average'] = acc['total']/acc['count']\n",
121 |     "    return acc"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "def sort_and_add(acc, nxt):\n",
131 |     "    p = nxt['profit']\n",
132 |     "    acc['mpg'][nxt['mpg']] = acc_average(acc['mpg'].get(nxt['mpg'], {}), p)\n",
133 |     "    acc['odo'][nxt['odo']] = acc_average(acc['odo'].get(nxt['odo'], {}), p)\n",
134 |     "    return acc"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "import json\n",
144 |     "with open(\"../Ch05/cars.json\") as f:\n",
145 |     "    xs = json.load(f)\n",
146 |     "results = reduce(sort_and_add, map(clean_entry, xs), {\"mpg\": {}, \"odo\": {}})\n",
147 |     "print(json.dumps(results, indent=4))"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "[Read for more? Go to chapter 6!](./Ch06_notebook.ipynb)"
155 |    ]
156 |   }
157 |  ],
158 |  "metadata": {
159 |   "kernelspec": {
160 |    "display_name": "mldbook",
161 |    "language": "python",
162 |    "name": "mldbook"
163 |   },
164 |   "language_info": {
165 |    "codemirror_mode": {
166 |     "name": "ipython",
167 |     "version": 3
168 |    },
169 |    "file_extension": ".py",
170 |    "mimetype": "text/x-python",
171 |    "name": "python",
172 |    "nbconvert_exporter": "python",
173 |    "pygments_lexer": "ipython3",
174 |    "version": "3.5.3"
175 |   }
176 |  },
177 |  "nbformat": 4,
178 |  "nbformat_minor": 2
179 | }
180 | 


--------------------------------------------------------------------------------
/notebooks/Ch06_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Chapter 6. Speeding up map and reduce with advanced parallelization\n",
  8 |     "====\n",
  9 |     "### Mastering Large Datasets with Python by JT Wolohan \n",
 10 |     "\n"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Timing"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 1,
 23 |    "metadata": {},
 24 |    "outputs": [
 25 |     {
 26 |      "name": "stdout",
 27 |      "output_type": "stream",
 28 |      "text": [
 29 |       "\n",
 30 |       "-- N = 1 --\n",
 31 |       "Lazy map time:      8.000000000008e-06\n",
 32 |       "Parallel map time:  0.01324599999999998\n",
 33 |       "\n",
 34 |       "\n",
 35 |       "-- N = 10 --\n",
 36 |       "Lazy map time:      9.599999999998499e-05\n",
 37 |       "Parallel map time:  0.014952000000000076\n",
 38 |       "\n",
 39 |       "\n",
 40 |       "-- N = 100 --\n",
 41 |       "Lazy map time:      5.900000000003125e-05\n",
 42 |       "Parallel map time:  0.01502199999999998\n",
 43 |       "\n",
 44 |       "\n",
 45 |       "-- N = 1000 --\n",
 46 |       "Lazy map time:      0.0003989999999999272\n",
 47 |       "Parallel map time:  0.014475000000000016\n",
 48 |       "\n",
 49 |       "\n",
 50 |       "-- N = 10000 --\n",
 51 |       "Lazy map time:      0.0038730000000000153\n",
 52 |       "Parallel map time:  0.01732200000000006\n",
 53 |       "\n",
 54 |       "\n",
 55 |       "-- N = 100000 --\n",
 56 |       "Lazy map time:      0.03707399999999994\n",
 57 |       "Parallel map time:  0.02400800000000003\n",
 58 |       "\n",
 59 |       "\n",
 60 |       "-- N = 1000000 --\n",
 61 |       "Lazy map time:      0.199009\n",
 62 |       "Parallel map time:  0.13838499999999998\n",
 63 |       "\n"
 64 |      ]
 65 |     }
 66 |    ],
 67 |    "source": [
 68 |     "from time import clock, sleep\n",
 69 |     "from multiprocessing import Pool\n",
 70 |     "\n",
 71 |     "\n",
 72 |     "def times_two(x):\n",
 73 |     "  return x*2+7\n",
 74 |     "\n",
 75 |     "\n",
 76 |     "def lazy_map(xs):\n",
 77 |     "  return list(map(times_two, xs))\n",
 78 |     "\n",
 79 |     "\n",
 80 |     "def parallel_map(xs, chunck=8500):\n",
 81 |     "  with Pool(2) as P:\n",
 82 |     "    x =  P.map(times_two, xs, chunck)\n",
 83 |     "  return x\n",
 84 |     "\n",
 85 |     "\n",
 86 |     "for i in range(0, 7):\n",
 87 |     "  N = 10**i\n",
 88 |     "  t1 = clock()\n",
 89 |     "  lazy_map(range(N))\n",
 90 |     "  lm_time = clock() - t1\n",
 91 |     "\n",
 92 |     "  t1 = clock()\n",
 93 |     "  parallel_map(range(N))\n",
 94 |     "  par_time = clock() - t1\n",
 95 |     "  print(\"\"\"\n",
 96 |     "-- N = {} --\n",
 97 |     "Lazy map time:      {}\n",
 98 |     "Parallel map time:  {}\n",
 99 |     "\"\"\".format(N, lm_time, par_time))"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 4,
105 |    "metadata": {},
106 |    "outputs": [
107 |     {
108 |      "name": "stdout",
109 |      "output_type": "stream",
110 |      "text": [
111 |       "\n",
112 |       "chunksize   |  runtime\n",
113 |       "-------------------------\n",
114 |       "5           |   5.083\n",
115 |       "50          |   1.431\n",
116 |       "500         |   0.291\n",
117 |       "5000        |   0.199\n",
118 |       "50000       |   0.159\n",
119 |       "500000      |   0.203\n",
120 |       "5000000     |   0.182\n",
121 |       "50000000    |   0.164\n",
122 |       "500000000   |   0.157\n"
123 |      ]
124 |     }
125 |    ],
126 |    "source": [
127 |     "from time import clock\n",
128 |     "from multiprocessing import Pool\n",
129 |     "\n",
130 |     "\n",
131 |     "def times_two(x):\n",
132 |     "  return x*2+7\n",
133 |     "\n",
134 |     "\n",
135 |     "def parallel_map(xs, chunk_size=8500):\n",
136 |     "  with Pool(2) as P:\n",
137 |     "    x = P.map(times_two, xs, chunk_size)\n",
138 |     "  return x\n",
139 |     "\n",
140 |     "\n",
141 |     "print(\"\"\"\n",
142 |     "{:<10}  |  {}\n",
143 |     "-------------------------\"\"\".format(\"chunksize\", \"runtime\"))\n",
144 |     "\n",
145 |     "for i in range(0, 9):\n",
146 |     "  N = 1000000\n",
147 |     "  chunk_size = 5 * (10**i)\n",
148 |     "\n",
149 |     "  t1 = clock()\n",
150 |     "  parallel_map(range(N), chunk_size)\n",
151 |     "  parallel_time = clock() - t1\n",
152 |     "\n",
153 |     "  print(\"{:<10}  |   {:>0.3f}\".format(chunk_size, parallel_time))"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "markdown",
158 |    "metadata": {},
159 |    "source": [
160 |     "### Parallel sum"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {},
167 |    "outputs": [],
168 |    "source": [
169 |     "from pathos.multiprocessing import ProcessingPool as Pool\n",
170 |     "from toolz.sandbox.parallel import fold\n",
171 |     "from functools import reduce\n",
172 |     "\n",
173 |     "\n",
174 |     "def my_add(left, right):\n",
175 |     "  return left+right\n",
176 |     "\n",
177 |     "\n",
178 |     "with Pool() as P: \n",
179 |     "    fold(my_add, range(500000), map=P.imap)\n",
180 |     "\n",
181 |     "print(reduce(my_add, range(500)))"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "### Parallel filter"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 5,
194 |    "metadata": {},
195 |    "outputs": [
196 |     {
197 |      "name": "stdout",
198 |      "output_type": "stream",
199 |      "text": [
200 |       "[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 312, 314, 316, 318, 320, 322, 324, 326, 328, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376, 378, 380, 382, 384, 386, 388, 390, 392, 394, 396, 398, 400, 402, 404, 406, 408, 410, 412, 414, 416, 418, 420, 422, 424, 426, 428, 430, 432, 434, 436, 438, 440, 442, 444, 446, 448, 450, 452, 454, 456, 458, 460, 462, 464, 466, 468, 470, 472, 474, 476, 478, 480, 482, 484, 486, 488, 490, 492, 494, 496, 498]\n"
201 |      ]
202 |     }
203 |    ],
204 |    "source": [
205 |     "from pathos.multiprocessing import ProcessingPool as Pool\n",
206 |     "from toolz.sandbox.parallel import fold\n",
207 |     "from functools import reduce\n",
208 |     "\n",
209 |     "\n",
210 |     "def map_combination(left, right):\n",
211 |     "  return left + right\n",
212 |     "\n",
213 |     "\n",
214 |     "def keep_if_even(acc, nxt):\n",
215 |     "    if nxt % 2 == 0:\n",
216 |     "        return acc + [nxt]\n",
217 |     "    else: return acc\n",
218 |     "\n",
219 |     "\n",
220 |     "with Pool() as P:\n",
221 |     "    fold(keep_if_even, range(500000), [],\n",
222 |     "         map=P.imap, combine=map_combination)\n",
223 |     "\n",
224 |     "print(reduce(keep_if_even, range(500), []))"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "markdown",
229 |    "metadata": {},
230 |    "source": [
231 |     "### Parallel frequencies"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 6,
237 |    "metadata": {},
238 |    "outputs": [
239 |     {
240 |      "name": "stdout",
241 |      "output_type": "stream",
242 |      "text": [
243 |       "{1: 76, 2: 94, 3: 74, 4: 78, 5: 88, 6: 90}\n"
244 |      ]
245 |     }
246 |    ],
247 |    "source": [
248 |     "from pathos.multiprocessing import ProcessingPool as Pool\n",
249 |     "from toolz.sandbox.parallel import fold\n",
250 |     "from random import choice\n",
251 |     "from functools import reduce\n",
252 |     "\n",
253 |     "\n",
254 |     "def combine_counts(left, right):\n",
255 |     "  unique_keys = set(left.keys()).union(set(right.keys()))\n",
256 |     "  return {k:left.get(k, 0)+right.get(k, 0) for k in unique_keys}\n",
257 |     "\n",
258 |     "\n",
259 |     "def make_counts(acc, nxt):\n",
260 |     "    acc[nxt] = acc.get(nxt,0) + 1\n",
261 |     "    return acc\n",
262 |     "\n",
263 |     "\n",
264 |     "xs = (choice([1, 2, 3, 4, 5, 6]) for _ in range(500000))\n",
265 |     "\n",
266 |     "with Pool() as P:\n",
267 |     "    fold(make_counts, xs, {},\n",
268 |     "         map=P.imap, combine=combine_counts)\n",
269 |     "\n",
270 |     "print(reduce(make_counts, (choice([1, 2, 3, 4, 5, 6]) for _ in range(500)), {}))"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "markdown",
275 |    "metadata": {},
276 |    "source": [
277 |     "### Parallel Naive Bayes\n",
278 |     "**NB:** *This code ended up getting cut from the book. It implements the naive Bayes algorithm in parallel using map and reduce patterns. Feel free to read through it as a bonus.*"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": null,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "from itertools import starmap, repeat\n",
288 |     "from functools import reduce, partial\n",
289 |     "import dill as pickle\n",
290 |     "from toolz.sandbox.parallel import fold\n",
291 |     "from pathos.multiprocessing import ProcessingPool as PathosPool\n",
292 |     "from multiprocessing import Pool\n",
293 |     "from csv import DictReader"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "def unique_keys(left, right):\n",
303 |     "    return set(left.keys()).union(set(right.keys()))\n",
304 |     "\n",
305 |     "def prod(xs):\n",
306 |     "    return reduce(lambda acc,nxt: acc*nxt, xs)"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": null,
312 |    "metadata": {},
313 |    "outputs": [],
314 |    "source": [
315 |     "def compute_prob(model, k, v, label, N):\n",
316 |     "    \"\"\"Compute probabilities for event.\"\"\"\n",
317 |     "    Cn = model['LABELS'][label]\n",
318 |     "    prior = Cn / N\n",
319 |     "    evidence = model[k][v].get(label,.001) / Cn\n",
320 |     "    return prior * evidence\n",
321 |     "\n",
322 |     "def _nb_suggest(ob, model, target):\n",
323 |     "    \"\"\"maknaive Bayes prediction\"\"\"\n",
324 |     "    ob.pop(target)\n",
325 |     "    N = sum(model['LABELS'].values())\n",
326 |     "    results = {}\n",
327 |     "    for label in model['LABELS'].keys():\n",
328 |     "        p = prod(compute_prob(model, k, v, label, N) for k, v in ob.items())\n",
329 |     "        results[label] = p\n",
330 |     "    return results\n",
331 |     "\n",
332 |     "def naive_bayes_suggest(obs, model, target):\n",
333 |     "    \"\"\"Parallel naive Bayes prediction function\"\"\"\n",
334 |     "    with Pool() as P:\n",
335 |     "        f = partial(_nb_suggest, target=target)\n",
336 |     "        return P.starmap(f, zip(obs, repeat(model)))"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": null,
342 |    "metadata": {},
343 |    "outputs": [],
344 |    "source": [
345 |     "def nb_acc(acc, nxt, target):\n",
346 |     "    label = nxt.pop(target)\n",
347 |     "    if not acc.get('LABELS', False):\n",
348 |     "        acc['LABELS'] = {}\n",
349 |     "    acc['LABELS'][label] = acc['LABELS'].get(label,0) + 1\n",
350 |     "    for k,v in nxt.items():\n",
351 |     "        if not acc.get(k,False):\n",
352 |     "            acc[k] = {}\n",
353 |     "        if not acc[k].get(v, False):\n",
354 |     "            acc[k][v] = {}\n",
355 |     "        acc[k][v][label] = acc.get(k,{}).get(v,{}).get(label,0) + 1\n",
356 |     "    return acc"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": null,
362 |    "metadata": {},
363 |    "outputs": [],
364 |    "source": [
365 |     "def _nb_comb(left, right):\n",
366 |     "    acc = {}\n",
367 |     "    acc['LABELS'] = {}\n",
368 |     "    for k in unique_keys(left['LABELS'], right['LABELS']):\n",
369 |     "        acc['LABELS'][k] = left['LABELS'].get(k,0) + right['LABELS'].get(k,0)\n",
370 |     "    for k in unique_keys(left, right):\n",
371 |     "        if k == 'LABELS': continue\n",
372 |     "        acc[k] = {}\n",
373 |     "        for v in unique_keys(left.get(k,{}), right.get(k,{})):\n",
374 |     "            acc[k][v] = {}\n",
375 |     "            for label in acc['LABELS']:\n",
376 |     "                count_left = left.get(k,{}).get(v,{}).get(label,0)\n",
377 |     "                count_right = right.get(k,{}).get(v,{}).get(label,0)\n",
378 |     "                acc[k][v][label] = count_left + count_right\n",
379 |     "    return acc"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": 7,
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "def naive_bayes(xs, target):\n",
389 |     "    \"\"\"Create a naive Bayes model.\n",
390 |     "\n",
391 |     "\n",
392 |     "    Inputs\n",
393 |     "    xs: input data\n",
394 |     "    target: target variable\n",
395 |     "    \n",
396 |     "    Output\n",
397 |     "    prediction function\n",
398 |     "\"\"\"\n",
399 |     "    acc = partial(nb_acc, target=target)\n",
400 |     "    with PathosPool() as P:\n",
401 |     "        model = fold(acc, xs, {}, map=P.map, combine=_nb_comb)\n",
402 |     "    return partial(naive_bayes_suggest, model=model, target=target)\n",
403 |     "\n",
404 |     "def max_prob(probs):\n",
405 |     "    return max(((k,v) for k,v in probs.items()), key=lambda x:x[1])[0]"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "markdown",
410 |    "metadata": {},
411 |    "source": [
412 |     "Download [the nursery data](https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data) and assign its path to `fp` in the next block"
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "code",
417 |    "execution_count": null,
418 |    "metadata": {},
419 |    "outputs": [],
420 |    "source": [
421 |     "fp = \"\"\n",
422 |     "with open(fp) as f:\n",
423 |     "    reader = DictReader(f, fieldnames=[\"parents\", \"has_nurs\", \"form\",\n",
424 |     "                                 \"children\", \"housing\", \"finance\",\n",
425 |     "                                 \"social\", \"health\", \"recc\"])\n",
426 |     "    data = [row for row in reader]\n",
427 |     "\n",
428 |     "model = naive_bayes(data, \"recc\")\n",
429 |     "probs = model(data)\n",
430 |     "print(\"{}\\t\\t{}\\t{}\".format(\"Match\", \"Suggestion\", \"Actual\"))\n",
431 |     "print(\"{}\".format(\"-\"*45))\n",
432 |     "for i,p in enumerate(probs):\n",
433 |     "    suggestion = max_prob(p)\n",
434 |     "    actual = data[i]['recc']\n",
435 |     "    match = suggestion == actual\n",
436 |     "    print(\"{}\\t\\t{}\\t{}\".format(match, suggestion, actual))\n",
437 |     "    if i > 25: break"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "markdown",
442 |    "metadata": {},
443 |    "source": [
444 |     "[Read for more? Go to chapter 7!](./Ch07_notebook.ipynb)"
445 |    ]
446 |   }
447 |  ],
448 |  "metadata": {
449 |   "kernelspec": {
450 |    "display_name": "mldbook",
451 |    "language": "python",
452 |    "name": "mldbook"
453 |   },
454 |   "language_info": {
455 |    "codemirror_mode": {
456 |     "name": "ipython",
457 |     "version": 3
458 |    },
459 |    "file_extension": ".py",
460 |    "mimetype": "text/x-python",
461 |    "name": "python",
462 |    "nbconvert_exporter": "python",
463 |    "pygments_lexer": "ipython3",
464 |    "version": "3.5.3"
465 |   }
466 |  },
467 |  "nbformat": 4,
468 |  "nbformat_minor": 2
469 | }
470 | 


--------------------------------------------------------------------------------
/notebooks/Ch07_notebook.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "Chapter 7. Processing truly big datasets with Hadoop and Spark\n",
 8 |     "====\n",
 9 |     "### Mastering Large Datasets with Python by JT Wolohan \n",
10 |     "\n"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "markdown",
15 |    "metadata": {},
16 |    "source": [
17 |     "### Spark\n",
18 |     "One of the great benefits of Spark is that you can run Spark jobs in a Jupyter notebook, just like this one."
19 |    ]
20 |   },
21 |   {
22 |    "cell_type": "code",
23 |    "execution_count": null,
24 |    "metadata": {},
25 |    "outputs": [],
26 |    "source": [
27 |     "import re\n",
28 |     "from pyspark import SparkContext\n",
29 |     "\n",
30 |     "sc = SparkContext(appName=\"WordScores\")\n",
31 |     "PAT = re.compile(r'[-./:\\s\\xa0]+')\n",
32 |     "text_files = sc.textFile(\"../Ch07/*.txt\")\n",
33 |     "xs = text_files.flatMap(lambda x:PAT.split(x))\\\n",
34 |     "               .filter(lambda x:len(x)>6)\\\n",
35 |     "               .countByValue()\n",
36 |     "\n",
37 |     "for k,v in xs.items():\n",
38 |     "    print(\"{:<30}{}\".format(k,v))"
39 |    ]
40 |   },
41 |   {
42 |    "cell_type": "markdown",
43 |    "metadata": {},
44 |    "source": [
45 |     "[Read for more? Go to chapter 9!](./Ch09_notebook.ipynb)\n",
46 |     "\n",
47 |     "(There's no notebook for Chapter 8. Chapter 8 focuses on Hadoop.)"
48 |    ]
49 |   }
50 |  ],
51 |  "metadata": {
52 |   "kernelspec": {
53 |    "display_name": "mldbook",
54 |    "language": "python",
55 |    "name": "mldbook"
56 |   },
57 |   "language_info": {
58 |    "codemirror_mode": {
59 |     "name": "ipython",
60 |     "version": 3
61 |    },
62 |    "file_extension": ".py",
63 |    "mimetype": "text/x-python",
64 |    "name": "python",
65 |    "nbconvert_exporter": "python",
66 |    "pygments_lexer": "ipython3",
67 |    "version": "3.5.3"
68 |   }
69 |  },
70 |  "nbformat": 4,
71 |  "nbformat_minor": 2
72 | }
73 | 


--------------------------------------------------------------------------------
/notebooks/Ch09_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Chapter 9. PageRank with Map and Reduce in PySpark\n",
  8 |     "====\n",
  9 |     "### Mastering Large Datasets with Python by JT Wolohan \n",
 10 |     "\n"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Elo ratings in Spark"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 3,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "import re, json\n",
 27 |     "from pyspark import SparkContext"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 4,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "def round5(x):\n",
 37 |     "  return 5*int(x/5)"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 5,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "def clean_match(match):\n",
 47 |     "  ms = match.split(',')\n",
 48 |     "  match_data = {'winner': ms[10],\n",
 49 |     "                'loser': ms[20],\n",
 50 |     "                'surface': ms[2]}\n",
 51 |     "  return match_data"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 6,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "def elo_acc(acc,nxt):\n",
 61 |     "    w_elo = acc.get(nxt['winner'],1600)\n",
 62 |     "    l_elo = acc.get(nxt['loser'],1600)\n",
 63 |     "    Qw = 10**(w_elo/400)\n",
 64 |     "    Ql = 10**(l_elo/400)\n",
 65 |     "    Qt = Qw+Ql\n",
 66 |     "    acc[nxt['winner']] = round5(w_elo + 25*(1-(Qw/Qt)))\n",
 67 |     "    acc[nxt['loser']] = round5(l_elo - 25*(Ql/Qt))\n",
 68 |     "    return acc"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 7,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "def elo_comb(a,b):\n",
 78 |     "    a.update(b)\n",
 79 |     "    return a"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 12,
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "name": "stdout",
 89 |      "output_type": "stream",
 90 |      "text": [
 91 |       "Martina Hingis                1865\n",
 92 |       "Venus Williams                1830\n",
 93 |       "Monica Seles                  1765\n",
 94 |       "Serena Williams               1755\n",
 95 |       "Lindsay Davenport             1745\n",
 96 |       "Maria Sharapova               1720\n",
 97 |       "Petra Russegger               1710\n",
 98 |       "Akiko Morigami                1690\n",
 99 |       "Garbine Muguruza              1685\n",
100 |       "Victoria Azarenka             1665\n",
101 |       "Nour Abbes                    1660\n",
102 |       "Timea Bacsinszky              1660\n",
103 |       "Belinda Bencic                1655\n",
104 |       "Amelie Mauresmo               1655\n",
105 |       "Mary Pierce                   1655\n",
106 |       "Jennifer Saret                1650\n",
107 |       "Angelique Kerber              1650\n",
108 |       "Bermet Duvanaeva              1650\n",
109 |       "Svetlana Komleva              1650\n",
110 |       "Cecilia Costa Melgar          1650\n"
111 |      ]
112 |     }
113 |    ],
114 |    "source": [
115 |     "sc = SparkContext(appName=\"TennisRatings\")\n",
116 |     "text_files = sc.textFile(\"/path/to/my/data/wta_matches*\")\n",
117 |     "xs = text_files.map(clean_match)\\\n",
118 |     "             .aggregate({},elo_acc, elo_comb)\n",
119 |     "\n",
120 |     "for x in sorted(xs.items(), key=lambda x:x[1], reverse=True)[:20]:\n",
121 |     "  print(\"{:<30}{}\".format(*x))"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "### Page rank in Spark"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 25,
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "from math import log2, ceil\n",
138 |     "from functools import partial\n",
139 |     "from pyspark import SparkContext"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 16,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "def ceil5(x):\n",
149 |     "    return ceil(x/5)*5"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 17,
155 |    "metadata": {},
156 |    "outputs": [],
157 |    "source": [
158 |     "def get_winner_loser(match):\n",
159 |     "  ms = match.split(',')\n",
160 |     "  # Put the loser in first position, winner in second\n",
161 |     "  return (ms[20], ms[10])"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": 18,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "def initialize_for_voting(losses):\n",
171 |     "    return {'losses': losses,\n",
172 |     "            'n_losses': len(losses),\n",
173 |     "            'rating': 100}"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 19,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "def empty_ratings(d):\n",
183 |     "  d['rating'] = 0\n",
184 |     "  return d"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": 20,
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": [
193 |     "def allocate_points(acc, nxt):\n",
194 |     "  k,v = nxt\n",
195 |     "  boost = v['rating'] / (v['n_losses'] + .01)\n",
196 |     "  for loss in v['losses']:\n",
197 |     "    if loss not in acc.keys():\n",
198 |     "      acc[loss] = {'losses':[], 'n_losses': 0}\n",
199 |     "    opp_rating = acc.get(loss,{}).get('rating',0)\n",
200 |     "    acc[loss]['rating'] = opp_rating + boost\n",
201 |     "  return acc"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": 21,
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "def combine_scores(a, b):\n",
211 |     "  for k,v in b.items():\n",
212 |     "    try:\n",
213 |     "      a[k]['rating'] = a[k]['rating'] + b[k]['rating']\n",
214 |     "    except KeyError:\n",
215 |     "      a[k] = v\n",
216 |     "  return a"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "metadata": {},
222 |    "source": [
223 |     "To run the cell below, you may need to un-comment the Spark context. If you ran the Elo rating example above, leave it commented."
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 26,
229 |    "metadata": {},
230 |    "outputs": [
231 |     {
232 |      "name": "stdout",
233 |      "output_type": "stream",
234 |      "text": [
235 |       "Serena Williams               12.4\t5475\n",
236 |       "Venus Williams                12.0\t4230\n",
237 |       "Kim Clijsters                 11.9\t3870\n",
238 |       "Maria Sharapova               11.9\t3785\n",
239 |       "Justine Henin                 11.8\t3660\n",
240 |       "Elena Dementieva              11.6\t3130\n",
241 |       "Amelie Mauresmo               11.6\t3115\n",
242 |       "Svetlana Kuznetsova           11.6\t3060\n",
243 |       "Jelena Jankovic               11.6\t3055\n",
244 |       "Lindsay Davenport             11.6\t3055\n",
245 |       "Victoria Azarenka             11.3\t2485\n",
246 |       "Ana Ivanovic                  11.2\t2405\n",
247 |       "Daniela Hantuchova            11.2\t2385\n",
248 |       "Nadia Petrova                 11.2\t2360\n",
249 |       "Caroline Wozniacki            11.2\t2350\n",
250 |       "Agnieszka Radwanska           11.2\t2335\n",
251 |       "Vera Zvonareva                11.2\t2320\n",
252 |       "Patty Schnyder                11.1\t2220\n",
253 |       "Samantha Stosur               11.1\t2215\n",
254 |       "Francesca Schiavone           11.0\t2100\n"
255 |      ]
256 |     }
257 |    ],
258 |    "source": [
259 |     "#sc = SparkContext(appName=\"TennisRatings\")\n",
260 |     "match_data = sc.textFile(\"path/to/tennis/files\")\n",
261 |     "xs = match_data.map(get_winner_loser)\\\n",
262 |     "             .groupByKey()\\\n",
263 |     "             .mapValues(initialize_for_voting)\n",
264 |     "\n",
265 |     "for i in range(8):\n",
266 |     "    if i > 0:\n",
267 |     "        xs = sc.parallelize(zs.items())\n",
268 |     "    acc = dict(xs.mapValues(empty_ratings).collect())\n",
269 |     "    zs = xs.aggregate(acc, allocate_points, combine_scores)\n",
270 |     "\n",
271 |     "ratings = [(k,v['rating']) for k,v in zs.items()]\n",
272 |     "for player, rating in sorted(ratings, key=lambda x: x[1], reverse=True)[:20]:\n",
273 |     "    print('{:<30}{}\\t{}'.format(player,\n",
274 |     "                                round(log2(rating+1), 1),\n",
275 |     "                                ceil5(rating)))\n"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "markdown",
280 |    "metadata": {},
281 |    "source": [
282 |     "[Read for more? Go to chapter 10!](./Ch03_notebook.ipynb)"
283 |    ]
284 |   }
285 |  ],
286 |  "metadata": {
287 |   "kernelspec": {
288 |    "display_name": "mldbook",
289 |    "language": "python",
290 |    "name": "mldbook"
291 |   },
292 |   "language_info": {
293 |    "codemirror_mode": {
294 |     "name": "ipython",
295 |     "version": 3
296 |    },
297 |    "file_extension": ".py",
298 |    "mimetype": "text/x-python",
299 |    "name": "python",
300 |    "nbconvert_exporter": "python",
301 |    "pygments_lexer": "ipython3",
302 |    "version": "3.5.3"
303 |   }
304 |  },
305 |  "nbformat": 4,
306 |  "nbformat_minor": 2
307 | }
308 | 


--------------------------------------------------------------------------------
/notebooks/Ch10_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Chapter 10. Faster decision making with machine learning andPySpark\n",
  8 |     "====\n",
  9 |     "### Mastering Large Datasets with Python by JT Wolohan \n",
 10 |     "\n"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Decision Trees"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 21,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "from pyspark import SparkContext\n",
 27 |     "from pyspark.sql import SparkSession\n",
 28 |     "from functools import reduce\n",
 29 |     "from pyspark.ml.feature import StringIndexer, VectorAssembler\n",
 30 |     "from pyspark.ml.classification import DecisionTreeClassifier\n",
 31 |     "from pyspark.ml.evaluation import BinaryClassificationEvaluator"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 22,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "def string_to_index(df, label):\n",
 41 |     "     return StringIndexer(inputCol=label,\n",
 42 |     "                          outputCol=\"i-\"+label).fit(df) \\\n",
 43 |     "                                               .transform(df)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 23,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "spark = SparkSession.builder \\\n",
 53 |     "           .master(\"local\") \\\n",
 54 |     "           .appName(\"Decision Trees\") \\\n",
 55 |     "           .getOrCreate()"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 24,
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "df = spark.read.csv(\"../Ch10/mushrooms.data\", header=True, inferSchema=True)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 25,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "categories = ['cap-shape', 'cap-surface', 'cap-color']\n",
 74 |     "df = reduce(string_to_index, categories, df)"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 26,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "df = VectorAssembler(inputCols=[\"i-cap-shape\",\"i-cap-surface\", \"i-cap-color\"],\n",
 84 |     "                     outputCol=\"features\").transform(df)\n",
 85 |     "\n",
 86 |     "df = StringIndexer(inputCol='edible?', outputCol='label').fit(df).transform(df)"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 27,
 92 |    "metadata": {},
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "tree = DecisionTreeClassifier()\n",
 96 |     "model = tree.fit(df)"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 28,
102 |    "metadata": {},
103 |    "outputs": [
104 |     {
105 |      "name": "stdout",
106 |      "output_type": "stream",
107 |      "text": [
108 |       "DecisionTreeClassificationModel (uid=DecisionTreeClassifier_06513451d79b) of depth 5 with 29 nodes\n",
109 |       "  If (feature 1 in {2.0,3.0})\n",
110 |       "   If (feature 2 in {0.0,2.0,4.0,6.0,7.0})\n",
111 |       "    If (feature 2 in {0.0,2.0,7.0})\n",
112 |       "     If (feature 0 in {0.0,1.0,2.0,4.0})\n",
113 |       "      Predict: 0.0\n",
114 |       "     Else (feature 0 not in {0.0,1.0,2.0,4.0})\n",
115 |       "      Predict: 1.0\n",
116 |       "    Else (feature 2 not in {0.0,2.0,7.0})\n",
117 |       "     If (feature 2 in {6.0})\n",
118 |       "      Predict: 1.0\n",
119 |       "     Else (feature 2 not in {6.0})\n",
120 |       "      Predict: 0.0\n",
121 |       "   Else (feature 2 not in {0.0,2.0,4.0,6.0,7.0})\n",
122 |       "    If (feature 2 in {3.0})\n",
123 |       "     Predict: 1.0\n",
124 |       "    Else (feature 2 not in {3.0})\n",
125 |       "     Predict: 0.0\n",
126 |       "  Else (feature 1 not in {2.0,3.0})\n",
127 |       "   If (feature 0 in {3.0,5.0})\n",
128 |       "    If (feature 2 in {0.0,1.0,3.0})\n",
129 |       "     If (feature 0 in {5.0})\n",
130 |       "      Predict: 1.0\n",
131 |       "     Else (feature 0 not in {5.0})\n",
132 |       "      Predict: 0.0\n",
133 |       "    Else (feature 2 not in {0.0,1.0,3.0})\n",
134 |       "     If (feature 2 in {5.0,6.0})\n",
135 |       "      Predict: 1.0\n",
136 |       "     Else (feature 2 not in {5.0,6.0})\n",
137 |       "      If (feature 0 in {5.0})\n",
138 |       "       Predict: 1.0\n",
139 |       "      Else (feature 0 not in {5.0})\n",
140 |       "       Predict: 0.0\n",
141 |       "   Else (feature 0 not in {3.0,5.0})\n",
142 |       "    If (feature 0 in {2.0})\n",
143 |       "     If (feature 2 in {1.0,4.0,5.0,6.0})\n",
144 |       "      Predict: 0.0\n",
145 |       "     Else (feature 2 not in {1.0,4.0,5.0,6.0})\n",
146 |       "      Predict: 1.0\n",
147 |       "    Else (feature 0 not in {2.0})\n",
148 |       "     If (feature 2 in {8.0,9.0})\n",
149 |       "      Predict: 0.0\n",
150 |       "     Else (feature 2 not in {8.0,9.0})\n",
151 |       "      Predict: 1.0\n",
152 |       "\n"
153 |      ]
154 |     }
155 |    ],
156 |    "source": [
157 |     "print(model.toDebugString)"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 29,
163 |    "metadata": {},
164 |    "outputs": [
165 |     {
166 |      "name": "stdout",
167 |      "output_type": "stream",
168 |      "text": [
169 |       "Decision Tree AUC: 0.6333\n"
170 |      ]
171 |     }
172 |    ],
173 |    "source": [
174 |     "bce = BinaryClassificationEvaluator()\n",
175 |     "\n",
176 |     "auc = bce.evaluate(model.transform(df))\n",
177 |     "print(\"Decision Tree AUC: {:0.4f}\".format(auc))"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "markdown",
182 |    "metadata": {},
183 |    "source": [
184 |     "### Random Forests"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": 30,
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": [
193 |     "from pyspark import SparkContext\n",
194 |     "from pyspark.sql import SparkSession\n",
195 |     "from functools import reduce\n",
196 |     "from pyspark.ml.feature import StringIndexer, VectorAssembler\n",
197 |     "from pyspark.ml.classification import RandomForestClassifier\n",
198 |     "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
199 |     "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 31,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "spark = SparkSession.builder \\\n",
209 |     "           .master(\"local\") \\\n",
210 |     "           .appName(\"Random Forests\") \\\n",
211 |     "           .getOrCreate()"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 32,
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "bce = BinaryClassificationEvaluator()\n",
221 |     "forest = RandomForestClassifier()\n",
222 |     "df = spark.read.csv(\"../Ch10/mushrooms.data\", header=True, inferSchema=True)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 33,
228 |    "metadata": {},
229 |    "outputs": [],
230 |    "source": [
231 |     "categories = df.columns\n",
232 |     "categories.pop(categories.index('edible?'))\n",
233 |     "df = reduce(string_to_index, categories, df)\n",
234 |     "indexes = [\"i-\"+c for c in categories]"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 34,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "df = VectorAssembler(inputCols=indexes,\n",
244 |     "                     outputCol=\"features\").transform(df)\n",
245 |     "df = StringIndexer(inputCol='edible?',\n",
246 |     "                   outputCol='label').fit(df).transform(df)"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 35,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": [
255 |     "grid = ParamGridBuilder().addGrid(forest.maxDepth, [0, 2]).build()\n",
256 |     "cv = CrossValidator(estimator=forest, estimatorParamMaps=grid,\n",
257 |     "                        evaluator=bce,numFolds=10,\n",
258 |     "                        parallelism=4)\n",
259 |     "cv_model = cv.fit(df)"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 36,
265 |    "metadata": {},
266 |    "outputs": [
267 |     {
268 |      "name": "stdout",
269 |      "output_type": "stream",
270 |      "text": [
271 |       "Random Forest AUC: 0.9950\n"
272 |      ]
273 |     }
274 |    ],
275 |    "source": [
276 |     "area_under_curve = bce.evaluate(cv_model.transform(df))\n",
277 |     "print(\"Random Forest AUC: {:0.4f}\".format(area_under_curve))"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": 37,
283 |    "metadata": {},
284 |    "outputs": [
285 |     {
286 |      "name": "stdout",
287 |      "output_type": "stream",
288 |      "text": [
289 |       "RandomForestClassificationModel (uid=RandomForestClassifier_3715b1717fde) with 20 trees\n",
290 |       "  Tree 0 (weight 1.0):\n",
291 |       "    If (feature 7 in {0.0})\n",
292 |       "     If (feature 11 in {0.0,2.0,3.0})\n",
293 |       "      Predict: 0.0\n",
294 |       "     Else (feature 11 not in {0.0,2.0,3.0})\n",
295 |       "      Predict: 1.0\n",
296 |       "    Else (feature 7 not in {0.0})\n",
297 |       "     If (feature 1 in {2.0,3.0})\n",
298 |       "      Predict: 0.0\n",
299 |       "     Else (feature 1 not in {2.0,3.0})\n",
300 |       "      Predict: 1.0\n",
301 |       "  Tree 1 (weight 1.0):\n",
302 |       "    If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
303 |       "     If (feature 4 in {0.0,4.0,5.0})\n",
304 |       "      Predict: 0.0\n",
305 |       "     Else (feature 4 not in {0.0,4.0,5.0})\n",
306 |       "      Predict: 1.0\n",
307 |       "    Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
308 |       "     If (feature 21 in {6.0})\n",
309 |       "      Predict: 0.0\n",
310 |       "     Else (feature 21 not in {6.0})\n",
311 |       "      Predict: 1.0\n",
312 |       "  Tree 2 (weight 1.0):\n",
313 |       "    If (feature 11 in {0.0,2.0,3.0})\n",
314 |       "     Predict: 0.0\n",
315 |       "    Else (feature 11 not in {0.0,2.0,3.0})\n",
316 |       "     If (feature 20 in {2.0,3.0})\n",
317 |       "      Predict: 0.0\n",
318 |       "     Else (feature 20 not in {2.0,3.0})\n",
319 |       "      Predict: 1.0\n",
320 |       "  Tree 3 (weight 1.0):\n",
321 |       "    If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
322 |       "     If (feature 7 in {0.0})\n",
323 |       "      Predict: 0.0\n",
324 |       "     Else (feature 7 not in {0.0})\n",
325 |       "      Predict: 1.0\n",
326 |       "    Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
327 |       "     If (feature 20 in {2.0,3.0,5.0})\n",
328 |       "      Predict: 0.0\n",
329 |       "     Else (feature 20 not in {2.0,3.0,5.0})\n",
330 |       "      Predict: 1.0\n",
331 |       "  Tree 4 (weight 1.0):\n",
332 |       "    If (feature 8 in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n",
333 |       "     If (feature 4 in {0.0,4.0,5.0,8.0})\n",
334 |       "      Predict: 0.0\n",
335 |       "     Else (feature 4 not in {0.0,4.0,5.0,8.0})\n",
336 |       "      Predict: 1.0\n",
337 |       "    Else (feature 8 not in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n",
338 |       "     If (feature 19 in {1.0,2.0,4.0,6.0})\n",
339 |       "      Predict: 0.0\n",
340 |       "     Else (feature 19 not in {1.0,2.0,4.0,6.0})\n",
341 |       "      Predict: 1.0\n",
342 |       "  Tree 5 (weight 1.0):\n",
343 |       "    If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
344 |       "     Predict: 0.0\n",
345 |       "    Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
346 |       "     If (feature 20 in {2.0,3.0,5.0})\n",
347 |       "      Predict: 0.0\n",
348 |       "     Else (feature 20 not in {2.0,3.0,5.0})\n",
349 |       "      Predict: 1.0\n",
350 |       "  Tree 6 (weight 1.0):\n",
351 |       "    If (feature 8 in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n",
352 |       "     Predict: 0.0\n",
353 |       "    Else (feature 8 not in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n",
354 |       "     If (feature 18 in {0.0,3.0})\n",
355 |       "      Predict: 0.0\n",
356 |       "     Else (feature 18 not in {0.0,3.0})\n",
357 |       "      Predict: 1.0\n",
358 |       "  Tree 7 (weight 1.0):\n",
359 |       "    If (feature 20 in {1.0,2.0,3.0,4.0,5.0})\n",
360 |       "     If (feature 18 in {0.0,1.0,3.0,4.0})\n",
361 |       "      Predict: 0.0\n",
362 |       "     Else (feature 18 not in {0.0,1.0,3.0,4.0})\n",
363 |       "      Predict: 1.0\n",
364 |       "    Else (feature 20 not in {1.0,2.0,3.0,4.0,5.0})\n",
365 |       "     If (feature 3 in {1.0})\n",
366 |       "      Predict: 0.0\n",
367 |       "     Else (feature 3 not in {1.0})\n",
368 |       "      Predict: 1.0\n",
369 |       "  Tree 8 (weight 1.0):\n",
370 |       "    If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
371 |       "     Predict: 0.0\n",
372 |       "    Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
373 |       "     If (feature 17 in {1.0})\n",
374 |       "      Predict: 0.0\n",
375 |       "     Else (feature 17 not in {1.0})\n",
376 |       "      Predict: 1.0\n",
377 |       "  Tree 9 (weight 1.0):\n",
378 |       "    If (feature 7 in {0.0})\n",
379 |       "     If (feature 19 in {0.0,1.0,2.0,5.0,7.0,8.0})\n",
380 |       "      Predict: 0.0\n",
381 |       "     Else (feature 19 not in {0.0,1.0,2.0,5.0,7.0,8.0})\n",
382 |       "      Predict: 1.0\n",
383 |       "    Else (feature 7 not in {0.0})\n",
384 |       "     If (feature 1 in {2.0,3.0})\n",
385 |       "      Predict: 0.0\n",
386 |       "     Else (feature 1 not in {2.0,3.0})\n",
387 |       "      Predict: 1.0\n",
388 |       "  Tree 10 (weight 1.0):\n",
389 |       "    If (feature 4 in {0.0,4.0,5.0,8.0})\n",
390 |       "     Predict: 0.0\n",
391 |       "    Else (feature 4 not in {0.0,4.0,5.0,8.0})\n",
392 |       "     Predict: 1.0\n",
393 |       "  Tree 11 (weight 1.0):\n",
394 |       "    If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
395 |       "     Predict: 0.0\n",
396 |       "    Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
397 |       "     If (feature 6 in {1.0})\n",
398 |       "      Predict: 0.0\n",
399 |       "     Else (feature 6 not in {1.0})\n",
400 |       "      Predict: 1.0\n",
401 |       "  Tree 12 (weight 1.0):\n",
402 |       "    If (feature 12 in {0.0,2.0,3.0})\n",
403 |       "     If (feature 4 in {0.0,4.0,5.0,8.0})\n",
404 |       "      Predict: 0.0\n",
405 |       "     Else (feature 4 not in {0.0,4.0,5.0,8.0})\n",
406 |       "      Predict: 1.0\n",
407 |       "    Else (feature 12 not in {0.0,2.0,3.0})\n",
408 |       "     If (feature 18 in {0.0})\n",
409 |       "      Predict: 0.0\n",
410 |       "     Else (feature 18 not in {0.0})\n",
411 |       "      Predict: 1.0\n",
412 |       "  Tree 13 (weight 1.0):\n",
413 |       "    If (feature 4 in {0.0,4.0,5.0,8.0})\n",
414 |       "     If (feature 17 in {2.0})\n",
415 |       "      Predict: 1.0\n",
416 |       "     Else (feature 17 not in {2.0})\n",
417 |       "      Predict: 0.0\n",
418 |       "    Else (feature 4 not in {0.0,4.0,5.0,8.0})\n",
419 |       "     Predict: 1.0\n",
420 |       "  Tree 14 (weight 1.0):\n",
421 |       "    If (feature 18 in {0.0,3.0,4.0})\n",
422 |       "     If (feature 4 in {0.0,4.0,5.0,8.0})\n",
423 |       "      Predict: 0.0\n",
424 |       "     Else (feature 4 not in {0.0,4.0,5.0,8.0})\n",
425 |       "      Predict: 1.0\n",
426 |       "    Else (feature 18 not in {0.0,3.0,4.0})\n",
427 |       "     If (feature 10 in {2.0})\n",
428 |       "      Predict: 0.0\n",
429 |       "     Else (feature 10 not in {2.0})\n",
430 |       "      Predict: 1.0\n",
431 |       "  Tree 15 (weight 1.0):\n",
432 |       "    If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
433 |       "     Predict: 0.0\n",
434 |       "    Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n",
435 |       "     If (feature 4 in {0.0,8.0})\n",
436 |       "      Predict: 0.0\n",
437 |       "     Else (feature 4 not in {0.0,8.0})\n",
438 |       "      Predict: 1.0\n",
439 |       "  Tree 16 (weight 1.0):\n",
440 |       "    If (feature 4 in {0.0,4.0,5.0,8.0})\n",
441 |       "     Predict: 0.0\n",
442 |       "    Else (feature 4 not in {0.0,4.0,5.0,8.0})\n",
443 |       "     Predict: 1.0\n",
444 |       "  Tree 17 (weight 1.0):\n",
445 |       "    If (feature 11 in {0.0,2.0,3.0})\n",
446 |       "     If (feature 8 in {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0})\n",
447 |       "      Predict: 0.0\n",
448 |       "     Else (feature 8 not in {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0})\n",
449 |       "      Predict: 1.0\n",
450 |       "    Else (feature 11 not in {0.0,2.0,3.0})\n",
451 |       "     If (feature 4 in {0.0,8.0})\n",
452 |       "      Predict: 0.0\n",
453 |       "     Else (feature 4 not in {0.0,8.0})\n",
454 |       "      Predict: 1.0\n",
455 |       "  Tree 18 (weight 1.0):\n",
456 |       "    If (feature 8 in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n",
457 |       "     If (feature 7 in {0.0})\n",
458 |       "      Predict: 0.0\n",
459 |       "     Else (feature 7 not in {0.0})\n",
460 |       "      Predict: 1.0\n",
461 |       "    Else (feature 8 not in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n",
462 |       "     If (feature 19 in {1.0,2.0,4.0,6.0})\n",
463 |       "      Predict: 0.0\n",
464 |       "     Else (feature 19 not in {1.0,2.0,4.0,6.0})\n",
465 |       "      Predict: 1.0\n",
466 |       "  Tree 19 (weight 1.0):\n",
467 |       "    If (feature 18 in {0.0,3.0,4.0})\n",
468 |       "     Predict: 0.0\n",
469 |       "    Else (feature 18 not in {0.0,3.0,4.0})\n",
470 |       "     Predict: 1.0\n",
471 |       "\n"
472 |      ]
473 |     }
474 |    ],
475 |    "source": [
476 |     "print(cv_model.bestModel.toDebugString)"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "markdown",
481 |    "metadata": {},
482 |    "source": [
483 |     "[Read for more? Go to chapter 11!](./Ch11_notebook.ipynb)"
484 |    ]
485 |   }
486 |  ],
487 |  "metadata": {
488 |   "kernelspec": {
489 |    "display_name": "mldbook",
490 |    "language": "python",
491 |    "name": "mldbook"
492 |   },
493 |   "language_info": {
494 |    "codemirror_mode": {
495 |     "name": "ipython",
496 |     "version": 3
497 |    },
498 |    "file_extension": ".py",
499 |    "mimetype": "text/x-python",
500 |    "name": "python",
501 |    "nbconvert_exporter": "python",
502 |    "pygments_lexer": "ipython3",
503 |    "version": "3.5.3"
504 |   }
505 |  },
506 |  "nbformat": 4,
507 |  "nbformat_minor": 2
508 | }
509 | 


--------------------------------------------------------------------------------
/notebooks/Ch11_notebook.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "Chapter 11. Large datasets in the cloud with Amazon Web Services and S3\n",
 8 |     "====\n",
 9 |     "### Mastering Large Datasets with Python by JT Wolohan "
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "markdown",
14 |    "metadata": {},
15 |    "source": [
16 |     "### Uploading to S3 with Boto"
17 |    ]
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": null,
22 |    "metadata": {},
23 |    "outputs": [],
24 |    "source": [
25 |     "import boto3 as aws\n",
26 |     "import os.path\n",
27 |     "from functools import partial\n",
28 |     "from glob import iglob"
29 |    ]
30 |   },
31 |   {
32 |    "cell_type": "code",
33 |    "execution_count": null,
34 |    "metadata": {},
35 |    "outputs": [],
36 |    "source": [
37 |     "def upload_file(fp, bucket):\n",
38 |     "    _, file_name = os.path.split(fp)\n",
39 |     "    s3 = aws.client(\"s3\",\n",
40 |     "        aws_access_key_id = \"YOURACCESSKEYID\",\n",
41 |     "        aws_secret_access_key = \"YOURSECRETACCESSKEY\"\n",
42 |     "    )\n",
43 |     "    response = s3.upload_file(fp, bucket, file_name)\n",
44 |     "    return file_name, response"
45 |    ]
46 |   },
47 |   {
48 |    "cell_type": "code",
49 |    "execution_count": null,
50 |    "metadata": {},
51 |    "outputs": [],
52 |    "source": [
53 |     "fs = iglob(\"/path/to/data/files/*\")\n",
54 |     "uploads = map(partial(upload_file, bucket=\"your-backet-name\"), fs)\n",
55 |     "for file_name, _ in uploads :\n",
56 |     "    print(file_name)"
57 |    ]
58 |   }
59 |  ],
60 |  "metadata": {
61 |   "kernelspec": {
62 |    "display_name": "mldbook",
63 |    "language": "python",
64 |    "name": "mldbook"
65 |   },
66 |   "language_info": {
67 |    "codemirror_mode": {
68 |     "name": "ipython",
69 |     "version": 3
70 |    },
71 |    "file_extension": ".py",
72 |    "mimetype": "text/x-python",
73 |    "name": "python",
74 |    "nbconvert_exporter": "python",
75 |    "pygments_lexer": "ipython3",
76 |    "version": "3.5.3"
77 |   }
78 |  },
79 |  "nbformat": 4,
80 |  "nbformat_minor": 2
81 | }
82 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | boto3==1.9.159
 2 | botocore==1.12.159
 3 | cachetools==3.1.1
 4 | certifi==2019.3.9
 5 | chardet==3.0.4
 6 | dill==0.2.9
 7 | docutils==0.14
 8 | google-api-core==1.11.1
 9 | google-auth==1.6.3
10 | google-cloud-core==1.0.2
11 | google-cloud-dataproc==0.4.0
12 | google-cloud-logging==1.11.0
13 | google-cloud-storage==1.16.1
14 | google-resumable-media==0.3.2
15 | googleapis-common-protos==1.6.0
16 | grpcio==1.21.1
17 | idna==2.8
18 | jmespath==0.9.4
19 | mrjob==0.6.9
20 | multiprocess==0.70.7
21 | numpy==1.16.4
22 | pathos==0.2.3
23 | pkg-resources==0.0.0
24 | pox==0.2.5
25 | ppft==1.6.4.9
26 | protobuf==3.8.0
27 | py4j==0.10.7
28 | pyasn1==0.4.5
29 | pyasn1-modules==0.2.5
30 | pyspark==2.4.3
31 | python-dateutil==2.8.0
32 | pytz==2019.1
33 | PyYAML==5.1.1
34 | requests==2.22.0
35 | rsa==4.0
36 | s3transfer==0.2.0
37 | six==1.12.0
38 | toolz==0.9.0
39 | urllib3==1.25.3
40 | networkx
41 | 


--------------------------------------------------------------------------------