├── .gitignore ├── Ch02 ├── competitor_blog.py ├── fizz_buzz.py ├── network_wikipedia.py ├── parallel_blog_processing.py └── phonenumber_cleaning.py ├── Ch03 ├── hacker_translate.py └── twitter_demographics.py ├── Ch04 ├── A.txt ├── B.txt ├── even_numbers.py ├── generate_poems.py ├── lake_simulation.py ├── m_words.py ├── more_filters.py └── poem_puzzle.py ├── Ch05 ├── car_profit.py ├── cars.json ├── evenfilter.py ├── frequencies.py ├── products.py ├── scrabble_scores.py └── summation.py ├── Ch06 ├── imap.py ├── naivebayes.py ├── par-filter.py ├── par-freqs.py ├── par-sum.py ├── parallel-fold.py ├── starmap.py ├── timing-chunks.py └── timing.py ├── Ch07 ├── FlorenceMachineCounts │ ├── ._SUCCESS.crc │ ├── .part-00000.crc │ ├── _SUCCESS │ └── part-00000 ├── Florence_Machine.txt ├── Florence_Nightingale.txt ├── highest_scoring.py ├── large_words ├── score_words.py ├── spark_scores.py ├── wc_mapper.py └── wc_reducer.py ├── Ch08 ├── .most-active-times.py.swp ├── 10klog.csv ├── command_elo ├── common-errors.py ├── elo-mapper.py ├── elo-reducer.py ├── serena_counter.py ├── williams-counter.py └── wta.tar.bz2.tar.bz2 ├── Ch09 ├── spark_losses.py ├── spark_scores.py ├── wikipedia_edges.txt └── wta_matches_2001.csv ├── Ch10 ├── decision_trees.py ├── iris.csv ├── leads.txt ├── mushrooms.data └── random_forest.py ├── Ch11 ├── .gitkeep └── s3_upload.py ├── Ch12 ├── crashes_nb.py ├── emr-script-example.sh ├── emr_crash_counts.sh ├── mrjob_crash_counts.py ├── mrjob_emr_nb.sh ├── mrspark_bayes.py ├── nb_on_emr.sh ├── spark_bayes.py └── spark_mrjob.conf ├── README.md ├── notebooks ├── Ch02_notebook.ipynb ├── Ch03_notebook.ipynb ├── Ch04_notebook.ipynb ├── Ch05_notebook.ipynb ├── Ch06_notebook.ipynb ├── Ch07_notebook.ipynb ├── Ch09_notebook.ipynb ├── Ch10_notebook.ipynb └── Ch11_notebook.ipynb └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .pycache 2 | bookenv/* 3 | */.ipynb_checkpoints/* 4 | -------------------------------------------------------------------------------- /Ch02/competitor_blog.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from urllib import request 3 | from toolz import take 4 | 5 | 6 | def days_between(start, stop): 7 | today = date(*start) 8 | stop = date(*stop) 9 | while today < stop: 10 | yield "http://jtwolohan.com/evilblog/"+today.strftime("%m-%d-%Y") 11 | today = date.fromordinal(today.toordinal()+1) 12 | 13 | 14 | def get_url(path): 15 | return request.urlopen(path).read() 16 | 17 | 18 | if __name__ == "__main__": 19 | start = (2000, 1, 1) 20 | stop = (2001, 1, 1) 21 | xs = map(get_url, days_between(start,stop)) 22 | print(take(5,xs)) 23 | -------------------------------------------------------------------------------- /Ch02/fizz_buzz.py: -------------------------------------------------------------------------------- 1 | class FizzBuzzer: 2 | def __init__(self): 3 | self.n = 0 4 | def foo(self,_): 5 | self.n += 1 6 | if (self.n % 3) == 0: 7 | x = "buzz" 8 | else: x = "fizz" 9 | print(x) 10 | return x 11 | 12 | FB = FizzBuzzer() 13 | for i in range(21): 14 | FB.foo(i) 15 | -------------------------------------------------------------------------------- /Ch02/network_wikipedia.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib import request, parse 3 | from multiprocessing import Pool 4 | from itertools import chain 5 | import networkx as nx 6 | 7 | def link_to_title(link): 8 | 
return link["title"] 9 | 10 | def clean_if_key(page,key): 11 | if key in page.keys(): 12 | return map(link_to_title,page[key]) 13 | else: return [] 14 | 15 | def get_Wiki_links(pageTitle): 16 | safe_title = parse.quote(pageTitle) 17 | url = "https://en.wikipedia.org/w/api.php?action=query&\ 18 | prop=links|linkshere&pllimit=500&lhlimit=500&titles={}&\ 19 | format=json&formatversion=2".format(safe_title) 20 | page = request.urlopen(url).read() 21 | j = json.loads(page) 22 | jpage = j['query']['pages'][0] 23 | inbound = clean_if_key(jpage,"links") 24 | outbound = clean_if_key(jpage,"linkshere") 25 | return {"title": pageTitle, 26 | "in-links":list(inbound), 27 | "out-links":list(outbound)} 28 | 29 | def flatten_network(page): 30 | return page["in-links"]+page["out-links"] 31 | 32 | def page_to_edges(page): 33 | a = [(page['title'],p) for p in page['out-links']] 34 | b = [(p,page['title']) for p in page['in-links']] 35 | return a+b 36 | 37 | if __name__ == "__main__": 38 | root = get_Wiki_links("Parallel_computing") 39 | initial_network = flatten_network(root) 40 | with Pool() as P: 41 | all_pages = P.map(get_Wiki_links, initial_network) 42 | edges = P.map(page_to_edges, all_pages) 43 | edges = chain.from_iterable(edges) 44 | 45 | G = nx.DiGraph() 46 | for e in edges: 47 | G.add_edge(*e) 48 | nx.readwrite.gexf.write_gexf(G,"./MyGraph.gexf") 49 | -------------------------------------------------------------------------------- /Ch02/parallel_blog_processing.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | from urllib import request 3 | 4 | from multiprocessing import Pool 5 | 6 | def days_between(start,stop): 7 | today = date(*start) 8 | stop = date(*stop) 9 | while today < stop: 10 | datestr = today.strftime("%m-%d-%Y") 11 | yield "http://jtwolohan.com/arch-rival-blog/"+datestr 12 | today = date.fromordinal(today.toordinal()+1) 13 | 14 | def get_url(path): 15 | return request.urlopen(path).read() 16 | 17 | 18 | with Pool() as P: 19 | blog_posts = P.map(get_url,days_between((2000,1,1),(2011,1,1))) 20 | -------------------------------------------------------------------------------- /Ch02/phonenumber_cleaning.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | class PhoneFormatter: 4 | def __init__(self): 5 | self.r = re.compile(r"\d") 6 | 7 | def pretty_format(self, phone_number): 8 | numbers = self.r.findall(phone_number) 9 | area_code = "".join(numbers[-10:-7]) 10 | first_3 = "".join(numbers[-7:-4]) 11 | last_4 = "".join(numbers[-4:len(numbers)]) 12 | return "({}) {}-{}".format(area_code, first_3, last_4) 13 | 14 | if __name__ == "__main__": 15 | phone_numbers = [ 16 | "(123) 456-7890", 17 | "1234567890", 18 | "123.456.7890", 19 | "+1 123 456-7890" 20 | ] 21 | 22 | P = PhoneFormatter() 23 | 24 | print(list(map(P.pretty_format, phone_numbers))) -------------------------------------------------------------------------------- /Ch03/hacker_translate.py: -------------------------------------------------------------------------------- 1 | import re 2 | from toolz.functoolz import pipe, compose 3 | 4 | sample_messages = [ 5 | "7his所is家4没s4mpl3动m3ss463", 6 | "don7家73ll经4nyon3法7his现m3ss463", 7 | "w3现4r3当b3in6进so好s3cr3t", 8 | "733小h33成h33去nobody看is天on分7o理us", 9 | "w3么will面n3v3r分637理c4u6ht", 10 | "w3事4r3经such没sn34ky天h4ckers"] 11 | 12 | 13 | def replace_7t(s): 14 | return s.replace('7', 't') 15 | 16 | 17 | def replace_3e(s): 18 | return s.replace('3', 'e') 19 | 20 | 21 | def 
replace_6g(s): 22 | return s.replace('6', 'g') 23 | 24 | 25 | def replace_4a(s): 26 | return s.replace('4', 'a') 27 | 28 | 29 | class chinese_matcher: 30 | def __init__(self): 31 | self.r = re.compile(r'[\u4e00-\u9fff]+') 32 | 33 | def sub_chinese(self,s): 34 | return self.r.sub(" ",s) 35 | 36 | 37 | if __name__ == "__main__": 38 | C = chinese_matcher() 39 | 40 | # Not chained 41 | print(list( 42 | map( C.sub_chinese, 43 | map(replace_4a, 44 | map(replace_6g, 45 | map(replace_3e, 46 | map(replace_7t, sample_messages)))))),end="\n\n") 47 | 48 | # Option 1 49 | hacker_translate = compose(C.sub_chinese, replace_4a, replace_6g, 50 | replace_3e, replace_7t) 51 | 52 | print(list(map(hacker_translate, sample_messages)),end="\n\n") 53 | 54 | # Option 2 55 | def hacker_translate(s): 56 | return pipe(s, replace_7t, replace_3e, replace_6g, 57 | replace_4a, C.sub_chinese) 58 | 59 | print(list(map(hacker_translate,sample_messages)),end="\n\n") 60 | -------------------------------------------------------------------------------- /Ch03/twitter_demographics.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | from toolz import compose, pipe 3 | import twitter 4 | 5 | Twitter = twitter.Api(consumer_key="", 6 | consumer_secret="", 7 | access_token_key="", 8 | access_token_secret="") 9 | 10 | 11 | def get_tweet_from_id(tweet_id, api=Twitter): 12 | return api.GetStatus(tweet_id, trim_user=True) 13 | 14 | 15 | def tweet_to_text(tweet): 16 | return tweet.text 17 | 18 | 19 | def tokenize_text(text): 20 | return text.split() 21 | 22 | 23 | def score_text(tokens): 24 | words = {"the":1, "to":1, "and":1, #Words with 1 indicate men 25 | "in":1, "have":1, "it":1, 26 | "be":-1, "of":-1, "a":-1, # Words with -1 indicate women 27 | "that":-1, "i":-1, "for":-1} 28 | return sum(map(lambda x: words.get(x, 0), tokens)) 29 | 30 | 31 | def score_tweet(tweet_id): 32 | return pipe(tweet_id, get_tweet_from_id, tweet_to_text, 33 | tokenize_text, score_text) 34 | 35 | 36 | def score_user(tweets): 37 | N = len(tweets) 38 | total = sum(map(score_tweet, tweets)) 39 | return total/N 40 | 41 | 42 | def categorize_user(user_score): 43 | if user_score > 0: 44 | return {"score":user_score, 45 | "gender": "Male"} 46 | return {"score":user_score, 47 | "gender":"Female"} 48 | 49 | 50 | if __name__ == "__main__": 51 | users_tweets = [ 52 | [1056365937547534341, 1056310126255034368, 1055985345341251584, 53 | 1056585873989394432, 1056585871623966720], 54 | [1055986452612419584, 1056318330037002240, 1055957256162942977, 55 | 1056585921154420736, 1056585896898805766], 56 | [1056240773572771841, 1056184836900175874, 1056367465477951490, 57 | 1056585972765224960, 1056585968155684864], 58 | [1056452187897786368, 1056314736546115584, 1055172336062816258, 59 | 1056585983175602176, 1056585980881207297]] 60 | gender_prediction_pipeline = compose(categorize_user, score_user) 61 | with Pool() as P: 62 | print(P.map(gender_prediction_pipeline, users_tweets)) 63 | -------------------------------------------------------------------------------- /Ch04/even_numbers.py: -------------------------------------------------------------------------------- 1 | def even_numbers(n): 2 | i = 1 3 | while i <= n: 4 | yield i*2 5 | i += 1 6 | 7 | first_100_even = (i*2 for i in range(1,101)) 8 | -------------------------------------------------------------------------------- /Ch04/generate_poems.py: -------------------------------------------------------------------------------- 1 | import re, os, glob 
2 | from functools import reduce 3 | from random import randint, choice 4 | from multiprocessing import Pool 5 | from math import floor 6 | 7 | class ContentMatcher: 8 | def __init__(self): 9 | self.r = re.compile(r'[A-Z\W-]+') 10 | def is_content(self,l): 11 | if self.r.fullmatch(l): 12 | return False 13 | else: return True 14 | 15 | def line_to_thirds(l): 16 | words = l.split() 17 | n = len(words) 18 | breakpoint = floor(n / 3) 19 | return {"first": " ".join(words[:breakpoint]), 20 | "second": " ".join(words[breakpoint:breakpoint*2]), 21 | "third": " ".join(words[breakpoint*2:])} 22 | 23 | def join_breaks(acc,nxt): 24 | return {k:v+[nxt[k]] for k,v in acc.items()} 25 | 26 | def consolidate_content(fp,R): 27 | with open(fp) as f: 28 | with Pool() as P: 29 | content = P.map(line_to_thirds, filter(R.is_content, f.readlines())) 30 | return reduce(join_breaks, 31 | content, 32 | {"first":[],"second":[],"third":[]}) 33 | 34 | def make_line(parts): 35 | return " ".join([choice(parts['first']), 36 | choice(parts["second"]), 37 | choice(parts["third"])]) 38 | 39 | def write_poem(parts,name,i): 40 | fp = "{}/poem_{}.txt".format(name,i) 41 | num_lines = randint(7,40) 42 | lines = (make_line(parts) for _ in range(num_lines)) 43 | with open(fp,"w") as f: 44 | f.write("\n".join(lines)) 45 | 46 | def calc_total_size(): 47 | paths = glob.iglob("./author*/*") 48 | return sum(map(os.path.getsize,paths)) 49 | 50 | def generate_poems(a, b, max_size=10000000): 51 | try: 52 | os.mkdir("author_a") 53 | os.mkdir("author_b") 54 | except FileExistsError: 55 | pass 56 | i = 1 57 | #while calc_total_size() < max_size: 58 | for _ in range(floor(max_size/1000)): 59 | write_poem(a,"author_a",i) 60 | write_poem(b,"author_b",i) 61 | i+=1 62 | 63 | if __name__ == "__main__": 64 | CM = ContentMatcher() 65 | author_a = consolidate_content("A.txt",CM) 66 | author_b = consolidate_content("B.txt",CM) 67 | generate_poems(author_a, author_b) 68 | -------------------------------------------------------------------------------- /Ch04/lake_simulation.py: -------------------------------------------------------------------------------- 1 | import random, itertools 2 | from operator import methodcaller 3 | 4 | 5 | class Village: 6 | def __init__(self): 7 | self.population = random.uniform(1000,5000) 8 | self.cheat_rate = random.uniform(.05,.15) 9 | 10 | def update(self, sim): 11 | if sim.cheaters >= 2: 12 | self.cheat_rate += .05 13 | self.population = int(self.population*1.025) 14 | 15 | def go_fishing(self): 16 | if random.uniform(0,1) < self.cheat_rate: 17 | cheat = 1 18 | fish_taken = self.population * 2 19 | else: 20 | cheat = 0 21 | fish_taken = self.population * 1 22 | return fish_taken, cheat 23 | 24 | 25 | class LakeSimulation: 26 | def __init__(self): 27 | self.villages = [Village() for _ in range(4)] 28 | self.fish = 80000 29 | self.year = 1 30 | self.cheaters = 0 31 | 32 | def simulate(self): 33 | for _ in itertools.count(): 34 | yearly_results = map(methodcaller("go_fishing"), self.villages) 35 | fishs, cheats = zip(*yearly_results) 36 | total_fished = sum(fishs) 37 | self.cheaters = sum(cheats) 38 | if self.year > 1000: 39 | print("Wow! 
Your villages lasted 1000 years!") 40 | break 41 | if self.fish < total_fished: 42 | print("The lake was overfished in {} years.".format(self.year)) 43 | break 44 | else: 45 | self.fish = (self.fish-total_fished)* 1.15 46 | map(methodcaller("update"), self.villages) 47 | print("Year {:<5} Fish: {}".format(self.year, 48 | int(self.fish))) 49 | self.year += 1 50 | 51 | 52 | if __name__ == "__main__": 53 | random.seed("Wolohan") 54 | Lake = LakeSimulation() 55 | Lake.simulate() 56 | -------------------------------------------------------------------------------- /Ch04/m_words.py: -------------------------------------------------------------------------------- 1 | words = ["apple","mongoose","walk","mouse","good", 2 | "pineapple","yeti","minnesota","mars", 3 | "phone","cream","cucumber","coffee","elementary", 4 | "sinister","science","empire"] 5 | 6 | def contains_m(s): 7 | if "m" in s.lower(): return True 8 | else: return False 9 | 10 | m_words = filter(contains_m, words) 11 | 12 | next(m_words) 13 | next(m_words) 14 | next(m_words) 15 | 16 | print(list(m_words)) 17 | # [“mars”,”cream”,”cucumber”,”elementary”, ... ] 18 | -------------------------------------------------------------------------------- /Ch04/more_filters.py: -------------------------------------------------------------------------------- 1 | from itertools import filterfalse 2 | from toolz.dicttoolz import keyfilter, valfilter, itemfilter 3 | 4 | def is_even(x): 5 | if x % 2 == 0: return True 6 | else: return False 7 | 8 | def both_are_even(x): 9 | k,v = x 10 | if is_even(k) and is_even(v): return True 11 | else: return False 12 | 13 | print(list(filterfalse(is_even, range(10)))) 14 | # [1, 3, 5, 7, 9] 15 | 16 | print(list(keyfilter(is_even, {1:2, 2:3, 3:4, 4:5, 5:6}))) 17 | # [2, 4] 18 | 19 | print(list(valfilter(is_even, {1:2, 2:3, 3:4, 4:5, 5:6}))) 20 | # [1, 3, 5] 21 | 22 | print(list(itemfilter(both_are_even, {1:5, 2:4, 3:3, 4:2, 5:1}))) 23 | # [2, 4] 24 | -------------------------------------------------------------------------------- /Ch04/poem_puzzle.py: -------------------------------------------------------------------------------- 1 | import toolz 2 | import re, itertools 3 | from glob import iglob 4 | 5 | 6 | def word_ratio(d): 7 | """This helper function returns the ratio of a's to the's""" 8 | return float(d.get("a",0))/float(d.get("the",0.0001)) 9 | 10 | 11 | class PoemCleaner: 12 | def __init__(self): 13 | self.r = re.compile(r'[.,;:!-]') 14 | 15 | def clean_poem(self, fp): 16 | """This helper function opens a poem at a filepath and returns a clean poem. 17 | 18 | A clean poem will be a punctuation-less sequence of lowercase words, in 19 | the order that the author of the poem placed them. 20 | """ 21 | with open(fp) as poem: 22 | no_punc = self.r.sub("",poem.read()) 23 | return no_punc.lower().split() 24 | 25 | 26 | def word_is_desired(w): 27 | """This helper function detects whether a word is "a" or "the". 28 | 29 | It is designed to be used in conjunction with filter to filter a sequence 30 | of words down to just definite and indefinite articles. 
31 | """ 32 | if w in ["a","the"]: 33 | return True 34 | else: 35 | return False 36 | 37 | 38 | def analyze_poems(poems, cleaner): 39 | return word_ratio( 40 | toolz.frequencies( 41 | filter(word_is_desired, 42 | itertools.chain(*map(cleaner.clean_poem, poems))))) 43 | 44 | 45 | if __name__ == "__main__": 46 | 47 | Cleaner = PoemCleaner() 48 | author_a_poems = iglob("author_a/*.txt") 49 | author_b_poems = iglob("author_b/*.txt") 50 | 51 | author_a_ratio = analyze_poems(author_a_poems, Cleaner) 52 | author_b_ratio = analyze_poems(author_b_poems, Cleaner) 53 | 54 | print(""" 55 | Original_Poem: 0.3 56 | Author A: {:.2f} 57 | Author B: {:.2f} 58 | """.format(author_a_ratio, author_b_ratio)) 59 | -------------------------------------------------------------------------------- /Ch05/car_profit.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | 4 | def low_med_hi(d, k, breaks): 5 | if float(d[k]) < breaks[0]: 6 | return "low" 7 | elif float(d[k]) < breaks[1]: 8 | return "medium" 9 | else: 10 | return "high" 11 | 12 | 13 | def clean_entry(d): 14 | r = {'profit':None, 'mpg':None, 'odo':None} 15 | r['profit'] = float(d.get("price-sell", 0)) - float(d.get("price-buy", 0)) 16 | r['mpg'] = low_med_hi(d, 'mpg', (18, 35)) 17 | r['odo'] = low_med_hi(d, 'odo', (60000, 105000)) 18 | return r 19 | 20 | 21 | def acc_average(acc, profit): 22 | acc['total'] = acc.get('total', 0) + profit 23 | acc['count'] = acc.get('count', 0) + 1 24 | acc['average'] = acc['total']/acc['count'] 25 | return acc 26 | 27 | 28 | def sort_and_add(acc, nxt): 29 | p = nxt['profit'] 30 | acc['mpg'][nxt['mpg']] = acc_average(acc['mpg'].get(nxt['mpg'], {}), p) 31 | acc['odo'][nxt['odo']] = acc_average(acc['odo'].get(nxt['odo'], {}), p) 32 | return acc 33 | 34 | 35 | if __name__ == "__main__": 36 | import json 37 | with open("cars.json") as f: 38 | xs = json.load(f) 39 | results = reduce(sort_and_add, map(clean_entry, xs), {"mpg": {}, "odo": {}}) 40 | print(json.dumps(results, indent=4)) 41 | -------------------------------------------------------------------------------- /Ch05/evenfilter.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | xs = [1, 2, 3, 4, 5, 6, 7, 8, 9] 4 | 5 | 6 | def keep_if_even(acc, nxt): 7 | if nxt % 2 == 0: 8 | return acc + [nxt] 9 | else: 10 | return acc 11 | 12 | 13 | reduce(keep_if_even, xs, []) 14 | -------------------------------------------------------------------------------- /Ch05/frequencies.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | xs = ["A", "B", "C", "A", "A", "C", "A"] 4 | ys = [1, 3, 6, 1, 2, 9, 3, 12] 5 | 6 | 7 | def make_counts(acc, nxt): 8 | acc[nxt] = acc.get(nxt, 0) + 1 9 | return acc 10 | 11 | 12 | def my_frequencies(xs): 13 | return reduce(make_counts, xs, {}) 14 | 15 | 16 | print(my_frequencies(xs)) 17 | print(my_frequencies(ys)) 18 | print(my_frequencies("mississippi")) 19 | -------------------------------------------------------------------------------- /Ch05/products.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | my_products = [ 4 | {"price": 9.99, 5 | "sn": '00231'}, 6 | {"price": 59.99, 7 | "sn": '11010'}, 8 | {"price": 74.99, 9 | "sn": '00013'}, 10 | {"price": 19.99, 11 | "sn": '00831'}, 12 | ] 13 | 14 | reduce(lambda acc, nxt: acc+nxt.get("price", 0), my_products, 0) 15 | 
-------------------------------------------------------------------------------- /Ch05/scrabble_scores.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | 4 | def score_word(word): 5 | points = 0 6 | for char in word: 7 | if char == "z": points += 10 8 | elif char in ["f", "h", "v", "w"]: points += 5 9 | elif char in ["b", "c", "m", "p"]: points += 3 10 | else: points += 1 11 | return points 12 | 13 | 14 | words = ["these", "are", "my", "words"] 15 | 16 | total_score = reduce(lambda acc,nxt: acc+nxt, map(score_word, words)) 17 | print(total_score) 18 | -------------------------------------------------------------------------------- /Ch05/summation.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | xs = [10, 5, 1, 19, 11, 203] 4 | 5 | 6 | def my_add(acc, nxt): 7 | return acc + nxt 8 | 9 | 10 | print(reduce(my_add, xs, 0)) 11 | 12 | # With a lambda instead: 13 | print(reduce(lambda acc, nxt: acc+nxt, xs, 0)) 14 | -------------------------------------------------------------------------------- /Ch06/imap.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | 3 | 4 | def increase(x): 5 | return x+1 6 | 7 | 8 | with Pool() as P: 9 | a = P.map(increase, range(100)) 10 | 11 | 12 | with Pool() as P: 13 | b = P.imap(increase, range(100)) 14 | 15 | 16 | with Pool() as P: 17 | c = P.imap_unordered(increase, range(100)) 18 | 19 | print(a) 20 | print(b) 21 | print(c) 22 | -------------------------------------------------------------------------------- /Ch06/naivebayes.py: -------------------------------------------------------------------------------- 1 | from itertools import starmap, repeat 2 | from functools import reduce, partial 3 | import dill as pickle 4 | from toolz.sandbox.parallel import fold 5 | from pathos.multiprocessing import ProcessingPool as PathosPool 6 | from multiprocessing import Pool 7 | from csv import DictReader 8 | 9 | def unique_keys(left, right): 10 | return set(left.keys()).union(set(right.keys())) 11 | 12 | def prod(xs): 13 | return reduce(lambda acc,nxt: acc*nxt, xs) 14 | 15 | def compute_prob(model, k, v, label, N): 16 | Cn = model['LABELS'][label] 17 | prior = Cn / N 18 | evidence = model[k][v].get(label,.001) / Cn 19 | return prior * evidence 20 | 21 | def _nb_suggest(ob, model, target): 22 | ob.pop(target) 23 | N = sum(model['LABELS'].values()) 24 | results = {} 25 | for label in model['LABELS'].keys(): 26 | p = prod(compute_prob(model, k, v, label, N) for k, v in ob.items()) 27 | results[label] = p 28 | return results 29 | 30 | def naive_bayes_suggest(obs, model, target): 31 | with Pool() as P: 32 | f = partial(_nb_suggest, target=target) 33 | return P.starmap(f, zip(obs, repeat(model))) 34 | 35 | def nb_acc(acc, nxt, target): 36 | label = nxt.pop(target) 37 | if not acc.get('LABELS', False): 38 | acc['LABELS'] = {} 39 | acc['LABELS'][label] = acc['LABELS'].get(label,0) + 1 40 | for k,v in nxt.items(): 41 | if not acc.get(k,False): 42 | acc[k] = {} 43 | if not acc[k].get(v, False): 44 | acc[k][v] = {} 45 | acc[k][v][label] = acc.get(k,{}).get(v,{}).get(label,0) + 1 46 | return acc 47 | 48 | def _nb_comb(left, right): 49 | acc = {} 50 | acc['LABELS'] = {} 51 | for k in unique_keys(left['LABELS'], right['LABELS']): 52 | acc['LABELS'][k] = left['LABELS'].get(k,0) + right['LABELS'].get(k,0) 53 | for k in unique_keys(left, right): 54 | if k == 'LABELS': 
continue 55 | acc[k] = {} 56 | for v in unique_keys(left.get(k,{}), right.get(k,{})): 57 | acc[k][v] = {} 58 | for label in acc['LABELS']: 59 | count_left = left.get(k,{}).get(v,{}).get(label,0) 60 | count_right = right.get(k,{}).get(v,{}).get(label,0) 61 | acc[k][v][label] = count_left + count_right 62 | return acc 63 | 64 | def naive_bayes(xs, target): 65 | acc = partial(nb_acc, target=target) 66 | with PathosPool() as P: 67 | model = fold(acc, xs, {}, map=P.map, combine=_nb_comb) 68 | return partial(naive_bayes_suggest, model=model, target=target) 69 | 70 | def max_prob(probs): 71 | return max(((k,v) for k,v in probs.items()), key=lambda x:x[1])[0] 72 | 73 | if __name__ == "__main__": 74 | # Download the nursery data and assign its path to fp 75 | # https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data 76 | fp = "/home/jt-w/Downloads/nursery.data" 77 | with open(fp) as f: 78 | reader = DictReader(f, fieldnames=["parents", "has_nurs", "form", 79 | "children", "housing", "finance", 80 | "social", "health", "recc"]) 81 | data = [row for row in reader] 82 | 83 | model = naive_bayes(data, "recc") 84 | probs = model(data) 85 | print("{}\t\t{}\t{}".format("Match", "Suggestion", "Actual")) 86 | print("{}".format("-"*45)) 87 | for i,p in enumerate(probs): 88 | suggestion = max_prob(p) 89 | actual = data[i]['recc'] 90 | match = suggestion == actual 91 | print("{}\t\t{}\t{}".format(match, suggestion, actual)) 92 | if i > 25: break 93 | -------------------------------------------------------------------------------- /Ch06/par-filter.py: -------------------------------------------------------------------------------- 1 | from pathos.multiprocessing import ProcessingPool as Pool 2 | from toolz.sandbox.parallel import fold 3 | from functools import reduce 4 | 5 | 6 | def map_combination(left, right): 7 | return left + right 8 | 9 | 10 | def keep_if_even(acc, nxt): 11 | if nxt % 2 == 0: 12 | return acc + [nxt] 13 | else: return acc 14 | 15 | 16 | with Pool() as P: 17 | fold(keep_if_even, range(500000), [], 18 | map=P.imap, combine=map_combination) 19 | 20 | print(reduce(keep_if_even, range(500), [])) 21 | 22 | -------------------------------------------------------------------------------- /Ch06/par-freqs.py: -------------------------------------------------------------------------------- 1 | from pathos.multiprocessing import ProcessingPool as Pool 2 | from toolz.sandbox.parallel import fold 3 | from random import choice 4 | from functools import reduce 5 | 6 | 7 | def combine_counts(left, right): 8 | unique_keys = set(left.keys()).union(set(right.keys())) 9 | return {k:left.get(k, 0)+right.get(k, 0) for k in unique_keys} 10 | 11 | 12 | def make_counts(acc, nxt): 13 | acc[nxt] = acc.get(nxt,0) + 1 14 | return acc 15 | 16 | 17 | xs = (choice([1, 2, 3, 4, 5, 6]) for _ in range(500000)) 18 | 19 | with Pool() as P: 20 | fold(make_counts, xs, {}, 21 | map=P.imap, combine=combine_counts) 22 | 23 | print(reduce(make_counts, (choice([1, 2, 3, 4, 5, 6]) for _ in range(500)), {})) 24 | -------------------------------------------------------------------------------- /Ch06/par-sum.py: -------------------------------------------------------------------------------- 1 | from pathos.multiprocessing import ProcessingPool as Pool 2 | from toolz.sandbox.parallel import fold 3 | from functools import reduce 4 | 5 | 6 | def my_add(left, right): 7 | return left+right 8 | 9 | 10 | with Pool() as P: 11 | fold(my_add, range(500000), map=P.imap) 12 | 13 | print(reduce(my_add, range(500))) 
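par-sum.py passes no separate combine function to fold; toolz then falls back to the step function itself (my_add here) to merge the per-chunk results, which is safe because addition is associative. A minimal sketch of that equivalence, assuming the same my_add and input range as above:

from functools import reduce
from toolz.sandbox.parallel import fold
from pathos.multiprocessing import ProcessingPool as Pool

def my_add(left, right):
    return left + right

# Chunked parallel fold should agree with a serial reduce when the
# step function is associative; my_add is reused to combine chunks.
with Pool() as P:
    parallel_total = fold(my_add, range(500000), map=P.imap)
serial_total = reduce(my_add, range(500000))
assert parallel_total == serial_total  # both equal 124999750000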
-------------------------------------------------------------------------------- /Ch06/parallel-fold.py: -------------------------------------------------------------------------------- 1 | import dill as pickle 2 | from toolz.sandbox.parallel import fold 3 | from pathos.multiprocessing import ProcessingPool as Pool 4 | from random import choice 5 | 6 | N = 100000 7 | P = Pool() 8 | 9 | # Parallel summation 10 | def my_add(left, right): 11 | return left+right 12 | 13 | xs = range(N) 14 | 15 | print(fold(my_add, xs, map=P.imap)) 16 | 17 | # Parallel filter 18 | def map_combination(left, right): 19 | return left + right 20 | 21 | def keep_if_even(acc, nxt): 22 | if nxt % 2 == 0: 23 | return acc + [nxt] 24 | else: return acc 25 | 26 | print(fold(keep_if_even, xs, [], map=P.imap, combine=map_combination)) 27 | 28 | #Parallel frequencies 29 | def combine_counts(left, right): 30 | unique_keys = set(left.keys()).union(set(right.keys())) 31 | return {k:left.get(k,0)+right.get(k,0) for k in unique_keys} 32 | 33 | def make_counts(acc, nxt): 34 | acc[nxt] = acc.get(nxt,0) + 1 35 | return acc 36 | 37 | xs = (choice([1,2,3,4,5,6]) for _ in range(N)) 38 | 39 | print(fold(make_counts, xs, {}, map=P.imap, combine=combine_counts)) 40 | -------------------------------------------------------------------------------- /Ch06/starmap.py: -------------------------------------------------------------------------------- 1 | from itertools import starmap 2 | xs = [7, 3, 1, 19, 11] 3 | ys = [8, 1, -3, 14, 22] 4 | 5 | loop_maxes = [max(ys[i], x) for i, x in enumerate(xs)] 6 | map_maxes = list(starmap(max, zip(xs, ys))) 7 | 8 | print(loop_maxes) 9 | # [8, 3, 1, 19, 22] 10 | print(map_maxes) 11 | # [8, 3, 1, 19, 22] 12 | -------------------------------------------------------------------------------- /Ch06/timing-chunks.py: -------------------------------------------------------------------------------- 1 | from time import clock 2 | from multiprocessing import Pool 3 | 4 | 5 | def times_two(x): 6 | return x*2+7 7 | 8 | 9 | def parallel_map(xs, chunk_size=8500): 10 | with Pool(2) as P: 11 | x = P.map(times_two, xs, chunk_size) 12 | return x 13 | 14 | 15 | print(""" 16 | {:<10} | {} 17 | -------------------------""".format("chunksize", "runtime")) 18 | 19 | for i in range(0, 9): 20 | N = 1000000 21 | chunk_size = 5 * (10**i) 22 | 23 | t1 = clock() 24 | parallel_map(range(N), chunk_size) 25 | parallel_time = clock() - t1 26 | 27 | print("{:<10} {:>0.3f}".format(chunk_size, parallel_time)) 28 | -------------------------------------------------------------------------------- /Ch06/timing.py: -------------------------------------------------------------------------------- 1 | from time import clock, sleep 2 | from multiprocessing import Pool 3 | 4 | 5 | def times_two(x): 6 | return x*2+7 7 | 8 | 9 | def lazy_map(xs): 10 | return list(map(times_two, xs)) 11 | 12 | 13 | def parallel_map(xs, chunck=8500): 14 | with Pool(2) as P: 15 | x = P.map(times_two, xs, chunck) 16 | return x 17 | 18 | 19 | for i in range(0, 7): 20 | N = 10**i 21 | t1 = clock() 22 | lazy_map(range(N)) 23 | lm_time = clock() - t1 24 | 25 | t1 = clock() 26 | parallel_map(range(N)) 27 | par_time = clock() - t1 28 | print(""" 29 | -- N = {} -- 30 | Lazy map time: {} 31 | Parallel map time: {} 32 | """.format(N, lm_time, par_time)) 33 | -------------------------------------------------------------------------------- /Ch07/FlorenceMachineCounts/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc 
-------------------------------------------------------------------------------- /Ch07/FlorenceMachineCounts/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch07/FlorenceMachineCounts/.part-00000.crc -------------------------------------------------------------------------------- /Ch07/FlorenceMachineCounts/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch07/FlorenceMachineCounts/_SUCCESS -------------------------------------------------------------------------------- /Ch07/FlorenceMachineCounts/part-00000: -------------------------------------------------------------------------------- 1 | crashes 2 | 3 | Nothing 4 | 5 | recession 6 | 7 | messages, 8 | 9 | freedom 10 | 11 | phrases 12 | 13 | Reflections 14 | 15 | reflection 16 | 17 | through 18 | 19 | réponds 20 | 21 | escaping, 22 | 23 | Everybody 24 | 25 | describe 26 | 27 | forgive 28 | 29 | themselves. 30 | 31 | shelter 32 | 33 | pourquoi 34 | 35 | exchange 36 | 37 | defiance 38 | 39 | scrawling 40 | 41 | loosely 42 | 43 | holding 44 | 45 | Drinking 46 | 47 | doesn't 48 | 49 | they're 50 | 51 | Skyscrapers 52 | 53 | entertainers, 54 | 55 | Chicago 56 | 57 | tabletop. 58 | 59 | mistakes 60 | 61 | mountain 62 | 63 | absolution 64 | 65 | anything 66 | 67 | Darling, 68 | 69 | Cathedral 70 | 71 | inbetween 72 | 73 | thousand 74 | 75 | daylight 76 | 77 | something 78 | 79 | attitude's 80 | 81 | convinced 82 | 83 | youngsters 84 | 85 | returned 86 | 87 | Delilah 88 | 89 | tempting 90 | 91 | [Florence] 92 | 93 | answer, 94 | 95 | sailin' 96 | 97 | everywhere's 98 | 99 | willingly 100 | 101 | Rascal] 102 | 103 | [Pre-Chorus:] 104 | 105 | dancing 106 | 107 | stumble, 108 | 109 | feeling 110 | 111 | hurricane 112 | 113 | Fractured 114 | 115 | forgotten, 116 | 117 | thinking 118 | 119 | violent 120 | 121 | together 122 | 123 | there's 124 | 125 | birthday, 126 | 127 | another 128 | 129 | beginning 130 | 131 | rushing 132 | 133 | moonlight 134 | 135 | submission 136 | 137 | staring 138 | 139 | Climbing 140 | 141 | electric 142 | 143 | Looking 144 | 145 | beating 146 | 147 | language 148 | 149 | crumble 150 | 151 | weather 152 | 153 | hollyoaks, 154 | 155 | yourself 156 | 157 | kindest 158 | 159 | pillars 160 | 161 | quicker 162 | 163 | illusion, 164 | 165 | mountain, 166 | 167 | Tenderest 168 | 169 | world’s 170 | 171 | hearing 172 | 173 | solution 174 | 175 | Somewhere 176 | 177 | smallest 178 | 179 | calling 180 | 181 | various 182 | 183 | getting 184 | 185 | broken-hearted 186 | 187 | paycheck 188 | 189 | Because 190 | 191 | bedroom 192 | 193 | longing 194 | 195 | singing 196 | 197 | dreamed 198 | 199 | confessions 200 | 201 | episode 202 | 203 | whatever 204 | 205 | meaning 206 | 207 | themselves 208 | 209 | confession 210 | 211 | Checkin' 212 | 213 | mourner 214 | 215 | hurting 216 | 217 | Spilled 218 | 219 | bitterest 220 | 221 | mistress 222 | 223 | dragging 224 | 225 | television 226 | 227 | nameless, 228 | 229 | famous, 230 | 231 | throwing 232 | 233 | [Pre-Chorus 234 | 235 | different 236 | 237 | friends 238 | 239 | instead 240 | 241 | tongues 242 | 243 | warmer, 244 | 245 | accurate 246 | 247 | spinning 248 | 249 | Outside 250 | 251 | morning 252 | 253 | tiptoes 254 | 255 | darling, 256 | 257 | though, 258 | 259 | process? 
260 | 261 | minutes 262 | 263 | [Pre-Chorus] 264 | 265 | Sometimes 266 | 267 | southside 268 | 269 | couldn't 270 | 271 | depression 272 | 273 | favorite 274 | 275 | heaven, 276 | 277 | overflow 278 | 279 | shameless 280 | 281 | Saviour's 282 | 283 | (Holding 284 | 285 | unblinking 286 | 287 | dites-moi 288 | 289 | Starring 290 | 291 | pressure's 292 | 293 | Darling 294 | 295 | darkest 296 | 297 | remember 298 | 299 | Spilling 300 | 301 | [Refrain:] 302 | 303 | devotion 304 | 305 | selfish 306 | 307 | forefront 308 | 309 | sweetest 310 | 311 | glasses 312 | 313 | flashing 314 | 315 | breathe 316 | 317 | slipping 318 | 319 | translate 320 | 321 | looking 322 | 323 | protected 324 | 325 | nothing 326 | 327 | sometimes 328 | 329 | Between 330 | 331 | gettin' 332 | 333 | tonight) 334 | 335 | hardest 336 | 337 | shallow 338 | 339 | thought 340 | 341 | woah-oh-o 342 | 343 | damaged 344 | 345 | breaking 346 | 347 | someone 348 | 349 | Pushing 350 | 351 | released 352 | 353 | [Refrain] 354 | 355 | saying, 356 | 357 | profession 358 | 359 | control 360 | 361 | shocked 362 | 363 | carrying 364 | 365 | affection, 366 | 367 | education 368 | 369 | amounted 370 | 371 | bleeding, 372 | 373 | ticking 374 | 375 | following 376 | 377 | because 378 | 379 | already 380 | 381 | teaching 382 | 383 | treatment 384 | 385 | Praying 386 | 387 | Uncurling 388 | 389 | lifelines 390 | 391 | hallway 392 | 393 | bargain 394 | 395 | underneath 396 | 397 | [Chorus:] 398 | 399 | ringing 400 | 401 | stumbling 402 | 403 | deserve 404 | 405 | weekend 406 | 407 | sleeping 408 | 409 | [Chorus] 410 | 411 | Christ, 412 | 413 | prayers 414 | 415 | aimless, 416 | 417 | parties 418 | 419 | Truthfully, 420 | 421 | Everybody's 422 | 423 | forsake 424 | 425 | Politician 426 | 427 | enough. 428 | 429 | Another 430 | 431 | peaceful 432 | 433 | trouble 434 | 435 | started 436 | 437 | forever 438 | 439 | prayers, 440 | 441 | o-o-o-o-o-o-out 442 | 443 | monument 444 | 445 | (That's 446 | 447 | [Dizzee 448 | 449 | sunrise 450 | 451 | Something's 452 | 453 | Deliver 454 | 455 | command 456 | 457 | Huggin' 458 | 459 | drifting 460 | 461 | two-faced, 462 | 463 | screaming 464 | 465 | gestures 466 | 467 | conclusion 468 | 469 | ground. 470 | 471 | proclamations 472 | 473 | existed 474 | 475 | baggage 476 | 477 | weekend. 
478 | 479 | buildings 480 | 481 | Whenever 482 | 483 | Dealing 484 | 485 | rooftop 486 | 487 | Sweating 488 | 489 | delivered 490 | 491 | Everywhere 492 | 493 | struggling 494 | 495 | watched 496 | 497 | understand 498 | 499 | outside 500 | 501 | ashamed 502 | 503 | Pockets 504 | 505 | -------------------------------------------------------------------------------- /Ch07/Florence_Machine.txt: -------------------------------------------------------------------------------- 1 | High in the halls of the kings who are gone 2 | Jenny would dance with her ghosts 3 | The ones she had lost and the ones she had found 4 | And the ones who had loved her the most 5 | 6 | The ones who'd been gone for so very long 7 | She couldn't remember their names 8 | They spun her around on the damp old stones 9 | Spun away all her sorrow and pain 10 | 11 | And she never wanted to leave, never wanted to leave 12 | Never wanted to leave, never wanted to leave 13 | 14 | They danced through the day 15 | And into the night through the snow that swept through the hall 16 | From winter to summer then winter again 17 | Til the walls did crumble and fall 18 | 19 | And she never wanted to leave, never wanted to leave 20 | Never wanted to leave, never wanted to leave 21 | And she never wanted to leave, never wanted to leave 22 | Never wanted to leave, never wanted to leave 23 | 24 | High in the halls of the kings who are gone 25 | Jenny would dance with her ghosts 26 | The ones she had lost and the ones she had found 27 | And the ones 28 | Who had loved her the most 29 | 30 | You need a big god 31 | Big enough to hold your love 32 | You need a big god 33 | Big enough to fill you up 34 | 35 | You keep me up at night 36 | To my messages, you do not reply 37 | You know I still like you the most 38 | The best of the best and the worst of the worst 39 | Well, you can never know 40 | The places that I go 41 | I still like you the most 42 | You'll always be my favorite ghost 43 | 44 | You need a big god 45 | Big enough to hold your love 46 | You need a big god 47 | Big enough to fill you up 48 | 49 | Sometimes I think it's gettin' better 50 | And then it gets much worse 51 | Is it just part of the process? 52 | Well, Jesus Christ, it hurts 53 | Though I know I should know better 54 | Well, I can make this work 55 | Is it just part of the process? 
56 | Well, Jesus Christ, Jesus Christ, it hurts 57 | Jesus Christ, Jesus Christ, it hurts 58 | 59 | You need a big god 60 | Big enough to hold your love 61 | You need a big god 62 | Big enough to fill you up 63 | 64 | Shower your affection, let it rain on me 65 | And pull down the mountain, drag your cities to the sea 66 | Shower your affection, let it rain on me 67 | Don't leave me on this white cliff 68 | Let it slide down to the, slide down to the sea 69 | Slide down to the, slide down to the sea 70 | 71 | 72 | Looking up from underneath 73 | Fractured moonlight on the sea 74 | Reflections still look the same to me 75 | As before I went under 76 | 77 | And it's peaceful in the deep 78 | Cathedral where you cannot breathe 79 | No need to pray, no need to speak 80 | Now I am under all 81 | 82 | And it's breaking over me 83 | A thousand miles down to the sea bed 84 | Found the place to rest my head 85 | Never let me go 86 | Never let me go 87 | Never let me go 88 | Never let me go 89 | 90 | And the arms of the ocean are carrying me 91 | And all this devotion was rushing out of me 92 | And the crashes are heaven for a sinner like me 93 | But the arms of the ocean delivered me 94 | 95 | Though the pressure's hard to take 96 | It's the only way I can escape 97 | It seems a heavy choice to make 98 | And now I am under all 99 | 100 | And it's breaking over me 101 | A thousand miles down to the sea bed 102 | Found the place to rest my head 103 | Never let me go 104 | Never let me go 105 | Never let me go 106 | Never let me go 107 | 108 | And the arms of the ocean are carrying me 109 | And all this devotion was rushing out of me 110 | And the crashes are heaven for a sinner like me 111 | But the arms of the ocean delivered me 112 | 113 | And it's over 114 | And I'm going under 115 | But I'm not giving up 116 | I'm just giving in 117 | 118 | I'm slipping underneath 119 | So cold and so sweet 120 | 121 | And the arms of the ocean so sweet and so cold 122 | And all this devotion I never knew at all 123 | And the crashes are heaven for a sinner released 124 | And the arms of the ocean delivered me 125 | Never let me go 126 | Never let me go 127 | Never let me go 128 | Never let me go 129 | Deliver me 130 | Never let me go 131 | Never let me go 132 | Never let me go 133 | Never let me go 134 | Deliver me 135 | Never let me go 136 | Never let me go 137 | Never let me go 138 | Never let me go 139 | Deliver me 140 | Never let me go 141 | Never let me go 142 | Never let me go 143 | Never let me go 144 | 145 | And it's over 146 | (Never let me go, Never let me go) 147 | And I'm going under 148 | (Never let me go, Never let me go) 149 | But I'm not giving up 150 | (Never let me go, Never let me go) 151 | I'm just giving in 152 | (Never let me go, Never let me go) 153 | 154 | I'm slipping underneath 155 | (Never let me go, Never let me go) 156 | So cold and so sweet 157 | (Never let me go, Never let me go) 158 | 159 | The show was ending and I had started to crack 160 | Woke up in Chicago and the sky turned black 161 | And you're so high, you're so high, you had to be an angel 162 | And I'm so high, I'm so high, I can see an angel 163 | 164 | I hear your heart beating in your chest 165 | The world slows 'till there's nothing left 166 | Skyscrapers look on like great, unblinking giants (oh) 167 | 168 | In those heavy days in June 169 | When love became an act of defiance 170 | 171 | Hold onto each other 172 | Hold onto each other 173 | Hold onto each other 174 | Hold onto each other 175 | 176 | You were 
broken-hearted and the world was, too 177 | And I was beginning to lose my grip 178 | And I always held it loosely 179 | But this time I admit 180 | I felt it really start to slip 181 | 182 | And choir singing in the street 183 | And I will come to you 184 | To watch the television screen 185 | In your hotel room 186 | 187 | Hold onto each other 188 | Hold onto each other 189 | Hold onto each other 190 | Hold onto each other 191 | 192 | You're so high, you're so high 193 | You're so high, you're so high 194 | You're so high, you're so high 195 | You had to be an angel 196 | I'm so high, I'm so high 197 | I'm so high, I'm so high 198 | I'm so high, I'm so high 199 | I can see an angel 200 | 201 | No walls 202 | Can keep me protected 203 | No sleep 204 | Nothing inbetween me and the rain 205 | And you can't save me now, 206 | I'm in the grip of a hurricane 207 | I'm gonna blow myself away 208 | 209 | I'm going out 210 | I'm gonna drink myself to death 211 | And in the crowd 212 | I see you with someone else, 213 | I brace myself 214 | Cause I know it's going to hurt 215 | But I like to think at least things can't get any worse 216 | 217 | No home, 218 | I don't want shelter 219 | No calm, 220 | Nothing to keep me from the storm 221 | And you can't hold me down 222 | 'Cause I belong to the hurricane 223 | It's gonna blow this all away 224 | 225 | I'm going out 226 | I'm gonna drink myself to death 227 | And in the crowd 228 | I see you with someone else 229 | I brace myself 230 | Cause I know it's going to hurt 231 | But I like to think at least things can't get any worse 232 | 233 | I hope that you see me 234 | Cause I'm staring at you 235 | But when you look over 236 | You look right through 237 | Then you lean and kiss her on the head 238 | And I never felt so alive, and so... dead. 
239 | 240 | I'm going out 241 | I'm gonna drink myself to death 242 | And in the crowd 243 | I see you with someone else 244 | I brace myself 245 | Cause I know it's going to hurt 246 | I'm going out, woah-oh-o 247 | 248 | I'm going out 249 | I'm gonna drink myself to death 250 | And in the crowd 251 | I see you with someone else 252 | I brace myself 253 | Cause I know it's going to hurt 254 | I'm going out, woah-oh-o 255 | I'm going out, woah-oh-o 256 | I'm going o-o-o-o-o-o-out 257 | I'm going out, woah-oh-o 258 | I'm going out 259 | 260 | This is as good a place to fall as any 261 | We'll build our altar here 262 | Make me your Maria 263 | I'm already on my knees 264 | 265 | You had Jesus on your breath 266 | And I caught him in mine 267 | Sweating our confessions 268 | The undone and the divine 269 | 270 | 'Cause this is his body 271 | This is his love 272 | Such selfish prayers 273 | And I can't get enough 274 | 275 | Oh, woah, woah, oh 276 | Oh, whoa, whoa, yeah 277 | 278 | Spilled milk tears, 279 | I did this for you 280 | Spilling over the idol 281 | The black and the blue 282 | 283 | The sweetest submission 284 | Drinking it in 285 | The wine, the women, the bedroom hymns 286 | 287 | 'Cause this is his body 288 | This is his love 289 | Such selfish prayers and I can't get enough 290 | 291 | Oh, woah, woah, oh 292 | Whoa, whoa, yeah 293 | I can't get enough 294 | 295 | I'm not here looking for absolution 296 | Because I found myself an old solution 297 | I'm not here looking for absolution 298 | Because I found myself an old solution 299 | 300 | This is his body 301 | This is his love 302 | Such selfish prayers, I can't get enough 303 | 304 | This is his body 305 | This is his love 306 | Such selfish prayers, I can't get enough 307 | Whoa, whoa, yeah 308 | I can't get enough 309 | Whoa, whoa, yeah 310 | I can't get enough 311 | Whoa, whoa, yeah 312 | 313 | Know you’ve been hurt by someone else 314 | I can tell by the way you carry yourself 315 | If you let me, here’s what I’ll do 316 | I’ll take care of you 317 | Cause I’ve loved and I’ve lost 318 | 319 | I’ve asked about you and they told me things 320 | But my mind didn’t change and I still feel the same 321 | What's a life with no fun, please don’t be so ashamed 322 | I’ve had mine, you’ve had yours, we both know 323 | We know, they don’t get you like I will 324 | My only wish is I die real 325 | Cause that truth hurts, and those lies heal 326 | And you can’t sleep thinking that he lies still 327 | So you cry still, tears all in the pillow case 328 | Big girls all get a little taste 329 | Pushing me away so I give her space 330 | Dealing with a heart that I didn’t break 331 | I’ll be there for you, I will care for you 332 | I keep thinking you just don’t know 333 | Trying to run from that, say you’re done with that 334 | On your face girl, it just don’t show 335 | When you’re ready, just say you’re ready 336 | When all the baggage just ain’t as heavy 337 | And the parties over, just don’t forget me 338 | We’ll change the pace and we'll just go slow 339 | You won’t ever have to worry, 340 | You won’t ever have to hide 341 | You've seen all my mistakes 342 | So look me in my eyes 343 | 344 | Cause if you let me, here’s what I’ll do 345 | I’ll take care of you 346 | Cause I’ve loved and I’ve lost 347 | 348 | It’s my birthday, I'll get high if I want to 349 | Can’t deny that I want you, but I'll lie if I have to 350 | Cause you don’t say you love me 351 | To your friends when they ask you 352 | Even though we both know that you do 
(you do) 353 | One time, been in love one time 354 | You and all your girls in the club one time 355 | All so convinced that you’re following your heart 356 | Cause your mind don’t control what it does sometimes 357 | We all have our nights though, don’t be so ashamed 358 | I’ve had mine, you’ve had yours, we both know 359 | We know 360 | 361 | Know you’ve been hurt by someone else 362 | I can tell by the way you carry yourself 363 | If you let me, here’s what I’ll do 364 | I’ll take care of you 365 | Cause I’ve loved and I’ve lost 366 | 367 | I've loved and I've lost [3x] 368 | 369 | And the air was full 370 | Of various storms and saints 371 | Praying in the street 372 | As the banks began to break 373 | And I'm in the throes of it 374 | Somewhere in the belly of the beast 375 | But you took your toll on me 376 | So I gave myself over willingly 377 | You got a hold on me 378 | And I don't know how I don't just stand outside and scream 379 | I am teaching myself how to be free 380 | 381 | The monument of a memory 382 | You tear it down in your head 383 | Don't make the mountain your enemy 384 | Get out, get up there instead 385 | You saw the stars out in front of you 386 | Too tempting not to touch 387 | But even though it shocked you 388 | Something's electric in your blood 389 | 390 | And people just untie themselves 391 | Uncurling lifelines 392 | If you could just forgive yourself 393 | 394 | But still you stumble, feet give way 395 | Outside the world seems a violent place 396 | But you had to have him, and so you did 397 | Some things you let go in order to live 398 | While all around you, the buildings sway 399 | You sing it out loud, "who made us this way?" 400 | I know you're bleeding, but you'll be okay 401 | Hold on to your heart, you'll keep it safe 402 | Hold on to your heart, don't give it away 403 | 404 | You'll find a rooftop to sing from 405 | Or find a hallway to dance 406 | You don't need no edge to cling from 407 | Your heart is there, it's in your hands 408 | I know it seems like forever 409 | I know it seems like an age 410 | But one day this will be over 411 | I swear it's not so far away 412 | 413 | And people just untie themselves 414 | Uncurling lifelines 415 | If you could just forgive yourself 416 | 417 | But still you stumble, feet give way 418 | Outside the world seems a violent place 419 | But you had to have him, and so you did 420 | Some things you let go in order to live 421 | While all around you, the buildings sway 422 | You sing it out loud, "who made us this way?" 
423 | I know you're bleeding, but you'll be okay 424 | Hold on to your heart, you'll keep it safe 425 | Hold on to your heart 426 | 427 | m drifting through the halls with the sunrise 428 | (Holding on for your call) 429 | Climbing up the walls for that flashing light 430 | (I can never let go) 431 | 432 | [Refrain:] 433 | Cause I'm gonna be free and I'm gonna be fine 434 | (Holding on for your call) 435 | Cause I'm gonna be free and I'm gonna be fine 436 | (Maybe not tonight) 437 | 438 | Now the sun is up and I'm going blind 439 | (Holding on for your call) 440 | Another drink just to pass the time 441 | (I can never say no) 442 | 443 | [Refrain] 444 | 445 | [Pre-Chorus:] 446 | It's a different kind of danger 447 | And the bells are ringing out 448 | And I'm calling for my mother 449 | As I pull the pillars down 450 | It's a different kind of danger 451 | And my feet are spinning around 452 | Never knew I was a dancer 453 | 'Till Delilah showed me how 454 | 455 | Too fast for freedom 456 | Sometimes it all falls down 457 | These chains never leave me 458 | I keep dragging them around 459 | 460 | [Chorus:] 461 | Now I'm dancing with Delilah and her vision is mine 462 | (Holding on for your call) 463 | A different kind of danger in the daylight 464 | (I can never let go) 465 | Took anything to cut you, I can find 466 | (Holding on for your call) 467 | A different kind of a danger in the daylight 468 | (Can't you let me know?) 469 | 470 | Now it's one more boy and it's one more lie 471 | (Holding on for your call) 472 | Taking the pills just to pass the time 473 | (I can never say no) 474 | 475 | [Refrain] 476 | 477 | [Pre-Chorus] 478 | 479 | [Chorus] 480 | 481 | Strung up, strung out for your love 482 | Hang in, hung up, it's so rough 483 | I'm wrung and ringing out 484 | Why can't you let me know? 485 | [x2] 486 | 487 | [Pre-Chorus x2] 488 | 489 | Too fast for freedom 490 | Sometimes it all falls down 491 | These chains never leave me 492 | I keep dragging them around 493 | [x2] 494 | 495 | When the night has come 496 | And the land is dark 497 | And the moon is the only light we see 498 | No, I won't be afraid 499 | Oh, I won't be afraid 500 | Just as long as you stand, stand by me 501 | 502 | So, darling, darling, stand by me 503 | Oh, stand by me 504 | Oh, stand now, stand by me, stand by me 505 | 506 | If the sky that we look upon 507 | Should tumble and fall 508 | Or the mountain should crumble to the sea 509 | I won't cry, I won't cry 510 | No, I won't shed a tear 511 | Just as long as you stand, stand by me 512 | 513 | And darling, darling, stand by me 514 | Oh, stand by me 515 | Oh, stand now, stand by me, stand by me 516 | 517 | And, darling, darling, stand by me 518 | Oh, stand by me 519 | Oh, stand, stand by me, stand by me 520 | 521 | Whenever you're in trouble won't you stand by me? 522 | Oh, stand by me 523 | Oh, stand now, stand by me 524 | 525 | Darling, darling, stand by me 526 | Oh, stand by me 527 | Oh, stand now, stand by me, stand by me 528 | 529 | Whenever you're in trouble won't you stand by me? 
530 | Oh, stand by me 531 | Oh, stand now, stand by me, stand by me 532 | 533 | There is love in your body but you can't hold it in 534 | It pours from your eyes and spills from your skin 535 | Tenderest touch leaves the darkest of marks 536 | And the kindest of kisses break the hardest of hearts 537 | 538 | The hardest of hearts 539 | The hardest of hearts 540 | The hardest of hearts 541 | 542 | There is love in your body but you can't get it out 543 | It gets stuck in your head, won't come out of your mouth 544 | Sticks to your tongue and shows on your face 545 | That the sweetest of words have the bitterest taste 546 | 547 | Darling heart, I loved you from the start 548 | But you'll never know what a fool I've been 549 | Darling heart, I loved you from the start 550 | But that's no excuse for the state I'm in 551 | 552 | The hardest of hearts 553 | The hardest of hearts 554 | The hardest of hearts 555 | 556 | There is love in our bodies and it holds us together 557 | But pulls us apart when we're holding each other 558 | We all want something to hold in the night 559 | We don't care if it hurts or we're holding too tight 560 | 561 | There is love in your body but you can't get it out 562 | It gets stuck in your head, won't come out of your mouth 563 | Sticks to your tongue and it shows on your face 564 | That the sweetest of words have the bitterest taste 565 | 566 | Darling heart, I loved you from the start 567 | But you'll never know what a fool I've been 568 | Darling heart, I loved you from the start 569 | But that's no excuse for the state I'm in 570 | 571 | The hardest of hearts 572 | The hardest of hearts 573 | The hardest of hearts 574 | 575 | My heart swells like a water at weight 576 | Can't stop myself before it's too late 577 | Hold on to your heart 578 | 'Cause I'm coming to take you 579 | Hold on to your heart 580 | 'Cause I'm coming to break you 581 | 582 | Hold on hold on hold on hold on hold on 583 | Hold on hold on hold on hold on hold on 584 | The hardest of hearts (hold on, hold on) 585 | The hardest of hearts (hold on, hold on) 586 | The hardest of hearts (hold on) 587 | 588 | 589 | Time it took us 590 | To where the water was 591 | That’s what the water gave me 592 | And time goes quicker 593 | Between the two of us 594 | Oh, my love, don’t forsake me 595 | Take what the water gave me 596 | 597 | Lay me down 598 | Let the only sound 599 | Be the overflow 600 | Pockets full of stones 601 | 602 | Lay me down 603 | Let the only sound 604 | Be the overflow 605 | 606 | And oh, poor Atlas 607 | The world’s a beast of a burden 608 | You’ve been holding up a long time 609 | And all this longing 610 | And the ships are left to rust 611 | That’s what the water gave us 612 | 613 | So lay me down 614 | Let the only sound 615 | Be the overflow 616 | Pockets full of stones 617 | Lay me down 618 | Let the only sound 619 | Be the overflow 620 | 621 | ‘Cause they took your loved ones 622 | But returned them in exchange for you 623 | But would you have it any other way? 624 | Would you have it any other way? 
625 | You couldn't have it any other way 626 | 627 | ‘Cause she’s a cruel mistress 628 | And a bargain must be made 629 | But oh, my love, don’t forget me 630 | When I let the water take me 631 | 632 | So lay me down 633 | Let the only sound 634 | Be the over flow 635 | Pockets full of stones 636 | 637 | Lay me down 638 | Let the only sound 639 | Be the overflow 640 | 641 | So lay me down 642 | Let the only sound 643 | Be the overflow 644 | Pockets full of stones 645 | 646 | Lay me down 647 | Let the only sound 648 | Be the overflow 649 | 650 | 651 | Dizzee Rascal] 652 | Everybody wants to be famous, 653 | Nobody wants to be nameless, aimless, 654 | People act shameless 655 | Tryna live like entertainers, 656 | Want a fat crib with the acres, 657 | So they spend money that they ain't made yet, 658 | Got a Benz on tick that they ain't paid yet, 659 | Spend their paycheck 660 | In the west out on a weekend 661 | Got no money by the end of the weekend. 662 | But they don't care cause their life is a movie, 663 | Starring Louis V, paid for by yours truly, 664 | Truthfully, it's a joke, like a bad episode of hollyoaks, 665 | Can't keep up with the cover notes, 666 | So they got bad credit livin' on direct debit in debt 667 | They still don't get it 668 | Cause they too busy livin' the high life, the night life 669 | Huggin' the high when livin' it large 670 | And they all say 671 | 672 | [Florence] 673 | Sometimes it seems that the going is just too rough 674 | And things go wrong no matter what I do 675 | (That's right) 676 | Now and then it seems like life is just too much 677 | But you've got the love I need to see me through 678 | 679 | [Dizzee Rascal] 680 | Let me take you down to London city 681 | Where the attitude's bad and the weather is shitty 682 | Everybody's on a paper chase 683 | It's one big rat race 684 | Everybody's got a screw face 685 | So many two-faced, 686 | Checkin' their high sayin' they're ready to ride 687 | I'm on the inside looking at the 688 | So it's an accurate reflection 689 | City wide, north, east, west and the southside 690 | Everywhere I go there's a goon on the corner 691 | Guns and drugs cause the city's like a sauna 692 | And it's getting warmer, and out of order 693 | Tryna put a struggling mother to a mourner 694 | Mr. 
Politician can you tell me the solution 695 | What's the answer, what's the conclusion 696 | Is it an illusion, is it a mirage 697 | I see youngsters die because they tryna live large 698 | And they all say 699 | 700 | [Florence] 701 | Sometimes I feel like throwing my hands up in the air 702 | I know I can count on all of you 703 | Sometimes I feel like saying “Lord, I just don't care” 704 | (That's right, that's right) 705 | But you've got the love I need to see me through 706 | (Check it, check it, come on, come on) 707 | 708 | You got the love 709 | (Who's got the love) 710 | You got the love 711 | (Who's got the love) 712 | You got the love 713 | (That's right, that's right, that's right) 714 | You got the love 715 | (Who's got the love) 716 | You got the love 717 | (Who's got the love) 718 | You got the love 719 | (Check it) 720 | 721 | [Dizzee Rascal] 722 | We are living in the days of the credit crunch 723 | Give me the dough 724 | I'm tryna have a bunch 725 | But I can't have rice for lunch 726 | It's not there ain't enough to share 727 | It ain't fair never dreamed that he could be rare 728 | Who cares who dares to make a change 729 | Everybody's in the club trying to make it rain 730 | But not for famine just for the sake of having 731 | 15 minutes of fame and everywhere's the same 732 | Again and again I see the same thing 733 | Everybody acting like they play sailin' 734 | But I see rough seas ahead maybe a recession 735 | And then a depression in whatever profession 736 | This is my confession I can't front I’m in the forefront 737 | Living for money ready to start like a bungee jump 738 | With no rope but I ain't trying to see the bottom 739 | Because that's where I came from, I ain't forgotten, 740 | 741 | [Florence] 742 | You got the love 743 | (Who's got the love) 744 | You got the love 745 | You got the love 746 | (Who's got the love) 747 | You got the love 748 | You got the love 749 | (That's right, that's right, that's right, that's right) 750 | You got the love 751 | You got the love 752 | You got the love 753 | (Who's got the love, who's got the love, who's got the love) 754 | 755 | Sometimes I feel like throwing my hands up in the air 756 | I know I can count on all of you 757 | Sometimes I feel like saying “Lord, I just don't care” 758 | But you've got the love I need to see me through 759 | 760 | And the heart is hard to translate 761 | It has a language of its own 762 | It talks in tongues and quiet sighs 763 | And prayers and proclamations 764 | In the grand deeds of great men and the smallest of gestures 765 | And short shallow gasps 766 | 767 | But with all my education I can't seem to command it 768 | And the words are all escaping, and coming back all damaged 769 | And I would put them back in poetry if I only knew how 770 | I can't seem to understand it 771 | 772 | And I would give all this and heaven too 773 | I would give it all if only for a moment 774 | That I could just understand the meaning of the word you see 775 | 'Cause I've been scrawling it forever but it never makes sense to me at all 776 | 777 | And it talks to me in tiptoes 778 | And it sings to me inside 779 | It cries out in the darkest night and breaks in the morning light 780 | 781 | But with all my education I can't seem to command it 782 | And the words are all escaping, and coming back all damaged 783 | And I would put them back in poetry if I only knew how 784 | I can't seem to understand it 785 | 786 | And I would give all this and heaven too 787 | I would give it all if only for a 
moment 788 | That I could just understand the meaning of the word you see 789 | 'Cause I've been scrawling it forever but it never makes sense to me at all 790 | 791 | And I would give all this and heaven too 792 | I would give it all if only for a moment 793 | That I could just understand the meaning of the word you see 794 | 'Cause I've been scrawling it forever but it never makes sense to me at all 795 | 796 | No, words are a language 797 | It doesn't deserve such treatment 798 | And all of my stumbling phrases never amounted to anything worth this feeling 799 | 800 | All this heaven never could describe such a feeling as I'm hearing 801 | 802 | Words were never so useful 803 | So I was screaming out a language that I never knew existed before 804 | 805 | Are you hurting the one you love? 806 | You say you've found Heaven but you can't find God. 807 | Are you hurting the one you love? 808 | Bite your tongue till it tastes like blood. 809 | 810 | Are you hurting the one you love? 811 | So many glasses on the tabletop. 812 | Are you hurting the one you love? 813 | You'd like to stay in heaven but the rules are too tough. 814 | 815 | Tough, 816 | It's just too tough. 817 | Tough, 818 | It's just too tough. 819 | 820 | Are you hurting the one you love? 821 | When they watched the walls, and the ticking clock. 822 | Are you hurting the one you love? 823 | And was it something you could not stop. 824 | 825 | Could not stop. 826 | Stop, 827 | Could not stop. 828 | Stop, 829 | Could not stop. 830 | Stop, 831 | Could not stop. 832 | Stop, 833 | Could not stop. 834 | 835 | Are you hurting the one you love? 836 | When you leave them sleeping on the hollow ground. 837 | Are you hurting the one you love? 838 | And lost for themselves. 839 | 840 | Are you hurting the one you love? 841 | And if heaven knows then who will stop. 842 | Are you hurting the one you love? 843 | You said you got to heaven, but it wasn't enough. 844 | 845 | I love you all the time 846 | Oh oh, oh oh, oh oh oh oh 847 | Oh oh, oh oh, ah 848 | 849 | I'm never alone, I look at my phone 850 | If I call you up, you're never at home 851 | I love you all the time 852 | 853 | I'm fueled up and high, I'm out with the guys 854 | A smile on my face, no reason to cry 855 | I love you all the time 856 | 857 | I can tell by that look in your eye 858 | You're looking and all you see's another guy 859 | I can tell you're going to take your love away 860 | 861 | I can tell by that look in your eye 862 | You're looking and all you see's another guy 863 | I would beg you if I thought it would make you stay 864 | 865 | Ce soir c’est le soir 866 | Et toi avec moi 867 | Et tu viens me voir 868 | Tu viens, oh la la 869 | I love you all the time 870 | 871 | Tu ne réponds pas 872 | Ah dites-moi pourquoi 873 | Just say au revoir 874 | Again me voilà 875 | I love you all the time 876 | 877 | And I can tell by that look in your eye 878 | You're looking and all you see's another guy 879 | I can tell you're going to take your love away 880 | 881 | I can tell by that look in your eye 882 | You're looking and all you see's another guy 883 | I would beg if I thought it would make you stay 884 | I would beg if I thought it would make you stay 885 | I would beg if I thought it would make you stay 886 | 887 | Ah dites-moi pourquoi 888 | Ah dites-moi pourquoi 889 | Ah dites-moi pourquoi 890 | 891 | Sometimes I feel like throwing my hands up in the air 892 | I know I can count on you 893 | Sometimes I feel like saying, "Lord, I just don't care." 
894 | But you've got the love I need To see me through 895 | 896 | Sometimes it seems that the going is just too rough 897 | And things go wrong no matter what I do 898 | Now and then it seems that life is just too much 899 | But you've got the love I need to see me through 900 | 901 | When food is gone you are my daily meal 902 | When friends are gone I know my Saviour's love is real 903 | You know it's real 904 | 905 | You got the love 906 | You got the love 907 | You got the love 908 | You got the love 909 | You got the love 910 | You got the love 911 | 912 | Time after time I think, "Oh, Lord, what's the use?" 913 | Time after time I think it's just no good 914 | 'Cause sooner or later in life, the things you love you lose 915 | But you got the love I need to see me through 916 | 917 | [2x] 918 | You got the love 919 | You got the love 920 | You got the love 921 | You got the love 922 | You got the love 923 | You got the love 924 | 925 | Sometimes I feel like throwing my hands up in the air 926 | 'Cause I know I can count on you 927 | Sometimes I feel like saying, "Lord, I just don't care." 928 | But you've got the love I need to see me through 929 | -------------------------------------------------------------------------------- /Ch07/highest_scoring.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | from functools import reduce 5 | 6 | def keep_highest(acc, nxt): 7 | word, score = nxt.split('\t') 8 | s = int(score) 9 | if len(acc) < 5: 10 | acc.append((word,s)) 11 | acc = sorted(acc, key=lambda x:x[1]) 12 | elif s > acc[0][1]: 13 | acc.append((word, s)) 14 | acc = sorted(acc, key=lambda x:x[1])[1:] 15 | return acc 16 | 17 | print(reduce(keep_highest, sys.stdin, [])) 18 | -------------------------------------------------------------------------------- /Ch07/large_words: -------------------------------------------------------------------------------- 1 | $HADOOP/bin/hadoop jar /home/jt-w/bin/hadoop/hadoop-streaming-3.2.0.jar \ 2 | -file ./wc_mapper.py -mapper ./wc_mapper.py \ 3 | -file ./wc_reducer.py -reducer ./wc_reducer.py \ 4 | -input 'Florence_Machine.txt' \ 5 | -output ./FlorenceMachineCounts 6 | -------------------------------------------------------------------------------- /Ch07/score_words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | 5 | def score(word): 6 | total = 0 7 | for i,char in enumerate(word): 8 | if char.lower() in "dlcu": 9 | total +=1 10 | elif char.lower() in "mwfbygpvk": 11 | total += 2 12 | elif char.lower() in "jxqz": 13 | total += 4 14 | if i >= 4: 15 | total +=2 16 | return total 17 | 18 | for line in sys.stdin: 19 | for word in line.split(): 20 | print("{}\t{}".format(word, score(word))) 21 | -------------------------------------------------------------------------------- /Ch07/spark_scores.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | import re 3 | from pyspark import SparkContext 4 | 5 | if __name__ == "__main__": 6 | sc = SparkContext(appName="WordScores") 7 | PAT = re.compile(r'[-./:\s\xa0]+') 8 | text_files = sc.textFile("/home/jt-w/Code/MR-test/data/*") 9 | xs = text_files.flatMap(lambda x:PAT.split(x))\ 10 | .filter(lambda x:len(x)>6)\ 11 | .countByValue()\ 12 | 13 | for k,v in xs.items(): 14 | print("{:<30}{}".format(k.encode("ascii","ignore"),v)) 15 | -------------------------------------------------------------------------------- /Ch07/wc_mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | 5 | for line in sys.stdin: 6 | for word in line.split(): 7 | if len(word) > 6: 8 | print(word) 9 | -------------------------------------------------------------------------------- /Ch07/wc_reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | from functools import reduce 5 | 6 | def make_counts(acc, nxt): 7 | acc[nxt] = acc.get(nxt,0) + 1 8 | return acc 9 | 10 | for w in reduce(make_counts, sys.stdin, {}): 11 | print(w) 12 | -------------------------------------------------------------------------------- /Ch08/.most-active-times.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch08/.most-active-times.py.swp -------------------------------------------------------------------------------- /Ch08/command_elo: -------------------------------------------------------------------------------- 1 | $HADOOP/bin/hadoop jar /home//bin/hadoop/hadoop-streaming-3.2.0.jar \ 2 | -file ./elo-mapper.py -mapper ./elo-mapper.py \ 3 | -file ./elo-reducer.py -reducer ./elo-reducer.py \ 4 | -input '/path/to/wta/files/wta_matches_200*.csv' \ 5 | -output ./tennis_ratings 6 | 7 | -------------------------------------------------------------------------------- /Ch08/common-errors.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | 3 | class ErrorCounter(MRJob): 4 | def mapper(self, _, line): 5 | fields = line.split(',') 6 | if fields[7] == '404.0': 7 | yield fields[6], 1 8 | 9 | def reducer(self, key, vals): 10 | num_404s = sum(vals) 11 | if num_404s>0: 12 | yield key, num_404s 13 | 14 | if __name__ == "__main__": 15 | ErrorCounter.run() 16 | 17 | -------------------------------------------------------------------------------- /Ch08/elo-mapper.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python3 2 | import json 3 | from sys import stdin 4 | 5 | 6 | def clean_match(match): 7 | ms = match.split(',') 8 | match_data = {'winner': ms[10], 9 | 'loser': ms[20], 10 | 'surface': ms[2]} 11 | return match_data 12 | 13 | 14 | if __name__ == "__main__": 15 | for line in stdin: 16 | print(json.dumps(clean_match(line))) 17 | -------------------------------------------------------------------------------- /Ch08/elo-reducer.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python3 2 | import json 3 | from sys import stdin 4 | from functools import reduce 5 | 6 | def round5(x): 7 | return 5*int(x/5) 8 | 9 | def elo_acc(acc,nxt): 10 | match_info = json.loads(nxt) 11 | w_elo = acc.get(match_info['winner'],1400) 12 | l_elo = acc.get(match_info['loser'],1400) 13 | Qw = 10**(w_elo/400) 14 | Ql = 10**(l_elo/400) 15 | Qt = Qw+Ql 16 | acc[match_info['winner']] = round5(w_elo + 100*(1-(Qw/Qt))) 17 | acc[match_info['loser']] = round5(l_elo - 100*(Ql/Qt)) 18 | return acc 19 | 20 | if __name__ == "__main__": 21 | xs = reduce(elo_acc, stdin, {}) 22 | for player, rtg in xs.items(): 23 | print(rtg, player) 24 | -------------------------------------------------------------------------------- /Ch08/serena_counter.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | from functools import reduce 3 | 4 | def make_counts(acc, nxt): 5 | acc[nxt] = acc.get(nxt,0) + 1 6 | return acc 7 | 8 | def my_frequencies(xs): 9 | return reduce(make_counts, xs, {}) 10 | 11 | class SerenaCounter(MRJob): 12 | 13 | def mapper(self, _, line): 14 | fields = line.split(',') 15 | if fields[10] == 'Serena Williams': 16 | yield fields[2], 'W' 17 | elif fields[20] == 'Serena Williams': 18 | yield fields[2], 'L' 19 | 20 | def reducer(self, surface, results): 21 | counts = my_frequencies(results) 22 | yield surface, counts 23 | 24 | if __name__ == "__main__": 25 | SerenaCounter.run() 26 | -------------------------------------------------------------------------------- /Ch08/williams-counter.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | from functools import reduce 3 | 4 | def make_counts(acc, nxt): 5 | acc[nxt] = acc.get(nxt,0) + 1 6 | return acc 7 | 8 | def my_frequencies(xs): 9 | return reduce(make_counts, xs, {}) 10 | 11 | class WilliamsRivalry(MRJob): 12 | 13 | def mapper(self, _, line): 14 | fields = line.split(',') 15 | players = ' '.join([fields[10], fields[20]]) 16 | if 'Serena Williams' in players and 'Venus Williams' in players: 17 | yield fields[2], fields[10] 18 | 19 | def reducer(self, surface, results): 20 | counts = my_frequencies(results) 21 | yield surface, counts 22 | 23 | if __name__ == "__main__": 24 | WilliamsRivalry.run() 25 | -------------------------------------------------------------------------------- /Ch08/wta.tar.bz2.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch08/wta.tar.bz2.tar.bz2 -------------------------------------------------------------------------------- /Ch09/spark_losses.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | from math import log2, ceil 3 | from functools import partial 4 | from pyspark import SparkContext 5 | 6 | def ceil5(x): 7 | return ceil(x/5)*5 8 | 9 | def get_winner_loser(match): 10 | ms = match.split(',') 11 | # Put the loser in first position, winner in second 12 | return (ms[20], ms[10]) 13 | 14 | def initialize_for_voting(losses): 15 | return {'losses': losses, 16 | 'n_losses': len(losses), 17 | 'rating': 100} 18 | 19 | def empty_ratings(d): 20 | d['rating'] = 0 21 | return d 22 | 23 | def allocate_points(acc, nxt): 24 | k,v = nxt 25 | boost = v['rating'] / (v['n_losses'] + .01) 26 | for loss in v['losses']: 27 | if loss not in acc.keys(): 28 | acc[loss] = {'losses':[], 'n_losses': 0} 29 | opp_rating = acc.get(loss,{}).get('rating',0) 30 | acc[loss]['rating'] = opp_rating + boost 31 | return acc 32 | 33 | def combine_scores(a, b): 34 | for k,v in b.items(): 35 | try: 36 | a[k]['rating'] = a[k]['rating'] + b[k]['rating'] 37 | except KeyError: 38 | a[k] = v 39 | return a 40 | 41 | if __name__ == "__main__": 42 | sc = SparkContext(appName="TennisRatings") 43 | match_data = sc.textFile("/media/jt-w/Seagate500G/wta_matches*") 44 | 45 | xs = match_data.map(get_winner_loser)\ 46 | .groupByKey()\ 47 | .mapValues(initialize_for_voting) 48 | 49 | for i in range(8): 50 | if i > 0: 51 | xs = sc.parallelize(zs.items()) 52 | acc = dict(xs.mapValues(empty_ratings).collect()) 53 | zs = xs.aggregate(acc, allocate_points, combine_scores) 54 | 55 | ratings = [(k,v['rating']) for k,v in zs.items()] 56 | for player, rating in sorted(ratings, key=lambda x: x[1], reverse=True)[:20]: 57 | print('{:<30}{}\t{}'.format(player, 58 | round(log2(rating+1), 1), 59 | ceil5(rating))) 60 | -------------------------------------------------------------------------------- /Ch09/spark_scores.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | import re, json 3 | from pyspark import SparkContext 4 | 5 | def round5(x): 6 | return 5*int(x/5) 7 | 8 | def clean_match(match): 9 | ms = match.split(',') 10 | match_data = {'winner': ms[10], 11 | 'loser': ms[20], 12 | 'surface': ms[2]} 13 | return match_data 14 | 15 | def elo_acc(acc,nxt): 16 | w_elo = acc.get(nxt['winner'],1600) 17 | l_elo = acc.get(nxt['loser'],1600) 18 | Qw = 10**(w_elo/400) 19 | Ql = 10**(l_elo/400) 20 | Qt = Qw+Ql 21 | acc[nxt['winner']] = round5(w_elo + 25*(1-(Qw/Qt))) 22 | acc[nxt['loser']] = round5(l_elo - 25*(Ql/Qt)) 23 | return acc 24 | 25 | def elo_comb(a,b): 26 | a.update(b) 27 | return a 28 | 29 | if __name__ == "__main__": 30 | sc = SparkContext(appName="TennisRatings") 31 | text_files = sc.textFile("/path/to/my/data/wta_matches*") 32 | xs = text_files.map(clean_match)\ 33 | .aggregate({},elo_acc, elo_comb) 34 | 35 | for x in sorted(xs.items(), key=lambda x:x[1], reverse=True)[:20]: 36 | print("{:<30}{}".format(*x)) 37 | -------------------------------------------------------------------------------- /Ch10/decision_trees.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import SparkSession 3 | from functools import reduce 4 | from pyspark.ml.feature import StringIndexer, VectorAssembler 5 | from pyspark.ml.classification import DecisionTreeClassifier 6 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 7 | 8 | def string_to_index(df, label): 9 | return StringIndexer(inputCol=label, 10 | outputCol="i-"+label).fit(df) \ 11 | .transform(df) 12 | 13 | if __name__ == "__main__": 14 | 15 | spark = SparkSession.builder \ 16 | .master("local") \ 17 | .appName("Decision Trees") \ 18 | .getOrCreate() 19 | 20 | df = spark.read.csv("mushrooms.data", header=True, inferSchema=True) 21 | 22 | categories = ['cap-shape', 'cap-surface', 'cap-color'] 23 | df = reduce(string_to_index, categories, df) 24 | 25 | df = VectorAssembler(inputCols=["i-cap-shape","i-cap-surface", "i-cap-color"], 26 | outputCol="features").transform(df) 27 | 28 | df = StringIndexer(inputCol='edible?', outputCol='label').fit(df).transform(df) 29 | 30 | tree = DecisionTreeClassifier() 31 | model = tree.fit(df) 32 | #print(model.toDebugString) 33 | 34 | bce = BinaryClassificationEvaluator() 35 | 36 | auc = bce.evaluate(model.transform(df)) 37 | print("Decision Tree AUC: {:0.4f}".format(auc)) 38 | -------------------------------------------------------------------------------- /Ch10/iris.csv: -------------------------------------------------------------------------------- 1 | SepalLength,SepalWidth,PetalLength,PetalWidth,Name 2 | 5.1,3.5,1.4,0.2,Iris-setosa 3 | 4.9,3.0,1.4,0.2,Iris-setosa 4 | 4.7,3.2,1.3,0.2,Iris-setosa 5 | 4.6,3.1,1.5,0.2,Iris-setosa 6 | 5.0,3.6,1.4,0.2,Iris-setosa 7 | 5.4,3.9,1.7,0.4,Iris-setosa 8 | 4.6,3.4,1.4,0.3,Iris-setosa 9 | 5.0,3.4,1.5,0.2,Iris-setosa 10 | 4.4,2.9,1.4,0.2,Iris-setosa 11 | 4.9,3.1,1.5,0.1,Iris-setosa 12 | 5.4,3.7,1.5,0.2,Iris-setosa 13 | 4.8,3.4,1.6,0.2,Iris-setosa 14 | 4.8,3.0,1.4,0.1,Iris-setosa 15 | 4.3,3.0,1.1,0.1,Iris-setosa 16 | 5.8,4.0,1.2,0.2,Iris-setosa 17 | 5.7,4.4,1.5,0.4,Iris-setosa 18 | 5.4,3.9,1.3,0.4,Iris-setosa 19 | 5.1,3.5,1.4,0.3,Iris-setosa 20 | 5.7,3.8,1.7,0.3,Iris-setosa 21 | 5.1,3.8,1.5,0.3,Iris-setosa 22 | 5.4,3.4,1.7,0.2,Iris-setosa 23 | 5.1,3.7,1.5,0.4,Iris-setosa 24 | 4.6,3.6,1.0,0.2,Iris-setosa 25 | 5.1,3.3,1.7,0.5,Iris-setosa 26 | 4.8,3.4,1.9,0.2,Iris-setosa 27 | 5.0,3.0,1.6,0.2,Iris-setosa 28 | 
5.0,3.4,1.6,0.4,Iris-setosa 29 | 5.2,3.5,1.5,0.2,Iris-setosa 30 | 5.2,3.4,1.4,0.2,Iris-setosa 31 | 4.7,3.2,1.6,0.2,Iris-setosa 32 | 4.8,3.1,1.6,0.2,Iris-setosa 33 | 5.4,3.4,1.5,0.4,Iris-setosa 34 | 5.2,4.1,1.5,0.1,Iris-setosa 35 | 5.5,4.2,1.4,0.2,Iris-setosa 36 | 4.9,3.1,1.5,0.1,Iris-setosa 37 | 5.0,3.2,1.2,0.2,Iris-setosa 38 | 5.5,3.5,1.3,0.2,Iris-setosa 39 | 4.9,3.1,1.5,0.1,Iris-setosa 40 | 4.4,3.0,1.3,0.2,Iris-setosa 41 | 5.1,3.4,1.5,0.2,Iris-setosa 42 | 5.0,3.5,1.3,0.3,Iris-setosa 43 | 4.5,2.3,1.3,0.3,Iris-setosa 44 | 4.4,3.2,1.3,0.2,Iris-setosa 45 | 5.0,3.5,1.6,0.6,Iris-setosa 46 | 5.1,3.8,1.9,0.4,Iris-setosa 47 | 4.8,3.0,1.4,0.3,Iris-setosa 48 | 5.1,3.8,1.6,0.2,Iris-setosa 49 | 4.6,3.2,1.4,0.2,Iris-setosa 50 | 5.3,3.7,1.5,0.2,Iris-setosa 51 | 5.0,3.3,1.4,0.2,Iris-setosa 52 | 7.0,3.2,4.7,1.4,Iris-versicolor 53 | 6.4,3.2,4.5,1.5,Iris-versicolor 54 | 6.9,3.1,4.9,1.5,Iris-versicolor 55 | 5.5,2.3,4.0,1.3,Iris-versicolor 56 | 6.5,2.8,4.6,1.5,Iris-versicolor 57 | 5.7,2.8,4.5,1.3,Iris-versicolor 58 | 6.3,3.3,4.7,1.6,Iris-versicolor 59 | 4.9,2.4,3.3,1.0,Iris-versicolor 60 | 6.6,2.9,4.6,1.3,Iris-versicolor 61 | 5.2,2.7,3.9,1.4,Iris-versicolor 62 | 5.0,2.0,3.5,1.0,Iris-versicolor 63 | 5.9,3.0,4.2,1.5,Iris-versicolor 64 | 6.0,2.2,4.0,1.0,Iris-versicolor 65 | 6.1,2.9,4.7,1.4,Iris-versicolor 66 | 5.6,2.9,3.6,1.3,Iris-versicolor 67 | 6.7,3.1,4.4,1.4,Iris-versicolor 68 | 5.6,3.0,4.5,1.5,Iris-versicolor 69 | 5.8,2.7,4.1,1.0,Iris-versicolor 70 | 6.2,2.2,4.5,1.5,Iris-versicolor 71 | 5.6,2.5,3.9,1.1,Iris-versicolor 72 | 5.9,3.2,4.8,1.8,Iris-versicolor 73 | 6.1,2.8,4.0,1.3,Iris-versicolor 74 | 6.3,2.5,4.9,1.5,Iris-versicolor 75 | 6.1,2.8,4.7,1.2,Iris-versicolor 76 | 6.4,2.9,4.3,1.3,Iris-versicolor 77 | 6.6,3.0,4.4,1.4,Iris-versicolor 78 | 6.8,2.8,4.8,1.4,Iris-versicolor 79 | 6.7,3.0,5.0,1.7,Iris-versicolor 80 | 6.0,2.9,4.5,1.5,Iris-versicolor 81 | 5.7,2.6,3.5,1.0,Iris-versicolor 82 | 5.5,2.4,3.8,1.1,Iris-versicolor 83 | 5.5,2.4,3.7,1.0,Iris-versicolor 84 | 5.8,2.7,3.9,1.2,Iris-versicolor 85 | 6.0,2.7,5.1,1.6,Iris-versicolor 86 | 5.4,3.0,4.5,1.5,Iris-versicolor 87 | 6.0,3.4,4.5,1.6,Iris-versicolor 88 | 6.7,3.1,4.7,1.5,Iris-versicolor 89 | 6.3,2.3,4.4,1.3,Iris-versicolor 90 | 5.6,3.0,4.1,1.3,Iris-versicolor 91 | 5.5,2.5,4.0,1.3,Iris-versicolor 92 | 5.5,2.6,4.4,1.2,Iris-versicolor 93 | 6.1,3.0,4.6,1.4,Iris-versicolor 94 | 5.8,2.6,4.0,1.2,Iris-versicolor 95 | 5.0,2.3,3.3,1.0,Iris-versicolor 96 | 5.6,2.7,4.2,1.3,Iris-versicolor 97 | 5.7,3.0,4.2,1.2,Iris-versicolor 98 | 5.7,2.9,4.2,1.3,Iris-versicolor 99 | 6.2,2.9,4.3,1.3,Iris-versicolor 100 | 5.1,2.5,3.0,1.1,Iris-versicolor 101 | 5.7,2.8,4.1,1.3,Iris-versicolor 102 | 6.3,3.3,6.0,2.5,Iris-virginica 103 | 5.8,2.7,5.1,1.9,Iris-virginica 104 | 7.1,3.0,5.9,2.1,Iris-virginica 105 | 6.3,2.9,5.6,1.8,Iris-virginica 106 | 6.5,3.0,5.8,2.2,Iris-virginica 107 | 7.6,3.0,6.6,2.1,Iris-virginica 108 | 4.9,2.5,4.5,1.7,Iris-virginica 109 | 7.3,2.9,6.3,1.8,Iris-virginica 110 | 6.7,2.5,5.8,1.8,Iris-virginica 111 | 7.2,3.6,6.1,2.5,Iris-virginica 112 | 6.5,3.2,5.1,2.0,Iris-virginica 113 | 6.4,2.7,5.3,1.9,Iris-virginica 114 | 6.8,3.0,5.5,2.1,Iris-virginica 115 | 5.7,2.5,5.0,2.0,Iris-virginica 116 | 5.8,2.8,5.1,2.4,Iris-virginica 117 | 6.4,3.2,5.3,2.3,Iris-virginica 118 | 6.5,3.0,5.5,1.8,Iris-virginica 119 | 7.7,3.8,6.7,2.2,Iris-virginica 120 | 7.7,2.6,6.9,2.3,Iris-virginica 121 | 6.0,2.2,5.0,1.5,Iris-virginica 122 | 6.9,3.2,5.7,2.3,Iris-virginica 123 | 5.6,2.8,4.9,2.0,Iris-virginica 124 | 7.7,2.8,6.7,2.0,Iris-virginica 125 | 6.3,2.7,4.9,1.8,Iris-virginica 126 | 
6.7,3.3,5.7,2.1,Iris-virginica 127 | 7.2,3.2,6.0,1.8,Iris-virginica 128 | 6.2,2.8,4.8,1.8,Iris-virginica 129 | 6.1,3.0,4.9,1.8,Iris-virginica 130 | 6.4,2.8,5.6,2.1,Iris-virginica 131 | 7.2,3.0,5.8,1.6,Iris-virginica 132 | 7.4,2.8,6.1,1.9,Iris-virginica 133 | 7.9,3.8,6.4,2.0,Iris-virginica 134 | 6.4,2.8,5.6,2.2,Iris-virginica 135 | 6.3,2.8,5.1,1.5,Iris-virginica 136 | 6.1,2.6,5.6,1.4,Iris-virginica 137 | 7.7,3.0,6.1,2.3,Iris-virginica 138 | 6.3,3.4,5.6,2.4,Iris-virginica 139 | 6.4,3.1,5.5,1.8,Iris-virginica 140 | 6.0,3.0,4.8,1.8,Iris-virginica 141 | 6.9,3.1,5.4,2.1,Iris-virginica 142 | 6.7,3.1,5.6,2.4,Iris-virginica 143 | 6.9,3.1,5.1,2.3,Iris-virginica 144 | 5.8,2.7,5.1,1.9,Iris-virginica 145 | 6.8,3.2,5.9,2.3,Iris-virginica 146 | 6.7,3.3,5.7,2.5,Iris-virginica 147 | 6.7,3.0,5.2,2.3,Iris-virginica 148 | 6.3,2.5,5.0,1.9,Iris-virginica 149 | 6.5,3.0,5.2,2.0,Iris-virginica 150 | 6.2,3.4,5.4,2.3,Iris-virginica 151 | 5.9,3.0,5.1,1.8,Iris-virginica 152 | -------------------------------------------------------------------------------- /Ch10/random_forest.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkContext 2 | from pyspark.sql import SparkSession 3 | from functools import reduce 4 | from pyspark.ml.feature import StringIndexer, VectorAssembler 5 | from pyspark.ml.classification import RandomForestClassifier 6 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 7 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 8 | 9 | def string_to_index(df, label): 10 | return StringIndexer(inputCol=label, 11 | outputCol="i-"+label).fit(df) \ 12 | .transform(df) 13 | 14 | if __name__ == "__main__": 15 | 16 | spark = SparkSession.builder \ 17 | .master("local") \ 18 | .appName("Random Forests") \ 19 | .getOrCreate() 20 | 21 | bce = BinaryClassificationEvaluator() 22 | 23 | forest = RandomForestClassifier() 24 | df = spark.read.csv("mushrooms.data", header=True, inferSchema=True) 25 | 26 | categories = df.columns 27 | categories.pop(categories.index('edible?')) 28 | df = reduce(string_to_index, categories, df) 29 | indexes = ["i-"+c for c in categories] 30 | df = VectorAssembler(inputCols=indexes, 31 | outputCol="features").transform(df) 32 | df = StringIndexer(inputCol='edible?', 33 | outputCol='label').fit(df).transform(df) 34 | 35 | grid = ParamGridBuilder().addGrid(forest.maxDepth, [0, 2]).build() 36 | cv = CrossValidator(estimator=forest, estimatorParamMaps=grid, 37 | evaluator=bce,numFolds=10, 38 | parallelism=4) 39 | cv_model = cv.fit(df) 40 | area_under_curve = bce.evaluate(cv_model.transform(df)) 41 | print("Random Forest AUC: {:0.4f}".format(area_under_curve)) 42 | print(cv_model.bestModel.toDebugString) 43 | -------------------------------------------------------------------------------- /Ch11/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jtwool/mastering-large-datasets/dfe78716cbd4150c2facc95035e00c2f6c15a16d/Ch11/.gitkeep -------------------------------------------------------------------------------- /Ch11/s3_upload.py: -------------------------------------------------------------------------------- 1 | import boto3 as aws 2 | import os.path 3 | from functools import partial 4 | from glob import iglob 5 | 6 | def upload_file(fp, bucket): 7 | _, file_name = os.path.split(fp) 8 | s3 = aws.client("s3", 9 | aws_access_key_id = "YOURACCESSKEYID", 10 | aws_secret_access_key = "YOURSECRETACCESSKEY" 11 | ) 12 | response = 
s3.upload_file(fp, bucket, file_name) 13 | return file_name, response 14 | 15 | if __name__ == "__main__": 16 | fs = iglob("/path/to/data/files/*") 17 | uploads = map(partial(upload_file, bucket="your-backet-name"), fs) 18 | for file_name, _ in uploads : 19 | print(file_name) 20 | -------------------------------------------------------------------------------- /Ch12/crashes_nb.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from pyspark import SparkContext 4 | from pyspark.sql import SparkSession 5 | from functools import reduce 6 | from pyspark.ml.feature import StringIndexer, VectorAssembler 7 | from pyspark.ml.classification import NaiveBayes 8 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 9 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 10 | 11 | def string_to_index(df, label): 12 | return StringIndexer(inputCol=label, 13 | outputCol="i-"+label).fit(df) \ 14 | .transform(df) 15 | 16 | def group_crashes(x): 17 | if int(x['Number of Vehicles Involved']) > 3: 18 | x['Number of Vehicles Involved'] = "3" 19 | return x 20 | 21 | def improve_times(x): 22 | time = x['Time'] 23 | if time < "5:00": 24 | x['Time'] = "Early morning" 25 | elif time < "7:00": 26 | x['Time'] = "Morning" 27 | elif time < "9:00": 28 | x['Time'] = "Morning commute" 29 | elif time < "12:00": 30 | x['Time'] = "Late morning" 31 | elif time < "16:00": 32 | x['Time'] = "Afternoon" 33 | elif time < "18:30": 34 | x['Time'] = "Evening commute" 35 | elif time < "22:00": 36 | x['Time'] = "Evening" 37 | else: 38 | x['Time'] = "Late night" 39 | return x 40 | 41 | if __name__ == "__main__": 42 | 43 | sc = SparkContext(appName="Crash counts") 44 | spark = SparkSession.builder \ 45 | .master("local") \ 46 | .getOrCreate() 47 | 48 | mce = MulticlassClassificationEvaluator() 49 | 50 | nb = NaiveBayes() 51 | # read in lines to RDD 52 | crashes = sc.textFile(sys.argv[1]) 53 | xs = crashes.flatMap(lambda x:x.split('\n')) \ 54 | .map(json.loads) \ 55 | .map(group_crashes) \ 56 | .map(improve_times) 57 | 58 | # conver to DF 59 | df = spark.createDataFrame(xs) 60 | 61 | feature_labels = df.columns 62 | feature_labels.pop(feature_labels.index('Number of Vehicles Involved')) 63 | df = reduce(string_to_index, feature_labels, df) 64 | indexes = ["i-"+f for f in feature_labels] 65 | 66 | df = VectorAssembler(inputCols=indexes, 67 | outputCol="features").transform(df) 68 | 69 | df = StringIndexer(inputCol='Number of Vehicles Involved', 70 | outputCol='label').fit(df).transform(df) 71 | 72 | grid = ParamGridBuilder().addGrid(nb.smoothing, [1.0, 1.5]) \ 73 | .build() 74 | 75 | cv = CrossValidator(estimator=nb, estimatorParamMaps=grid, 76 | evaluator=mce,numFolds=5, 77 | parallelism=4) 78 | cv_model = cv.fit(df) 79 | transformed = cv_model.transform(df) 80 | f1 = mce.evaluate(transformed) 81 | print("NB F1: {:0.4f}".format(f1)) 82 | cv_model.bestModel.save(sys.argv[2]) 83 | -------------------------------------------------------------------------------- /Ch12/emr-script-example.sh: -------------------------------------------------------------------------------- 1 | aws emr add-steps --cluster-id j-1EN18B2OUXEN5 --bootstrap-actions Args=['pip install boto3'] --steps Type=spark,Name=CrashNB,Args=[--deploy-mode,cluster,--master,yarn,s3://scorpion-elastic-jobs/spark_bayes.py],ActionOnFailure=CONTINUE 2 | -------------------------------------------------------------------------------- /Ch12/emr_crash_counts.sh: 
-------------------------------------------------------------------------------- 1 | python mrjob_crash_counts.py \ 2 | -r emr s3://your-bucket-name-here/ \ 3 | --output-dir=s3://your-bucket-name-here/crash-counts 4 | --conf-path= 5 | -------------------------------------------------------------------------------- /Ch12/mrjob_crash_counts.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | import json 3 | 4 | class MRWordFrequencyCount(MRJob): 5 | 6 | def mapper(self, _, line): 7 | j = json.loads(line) 8 | vehicles = j['Number of Vehicles Involved'] 9 | yield vehicles, 1 10 | 11 | def reducer(self, key, values): 12 | yield key, sum(values) 13 | 14 | 15 | if __name__ == '__main__': 16 | MRWordFrequencyCount.run() 17 | -------------------------------------------------------------------------------- /Ch12/mrjob_emr_nb.sh: -------------------------------------------------------------------------------- 1 | python3 mrspark_bayes.py \ 2 | -r emr \ 3 | s3://scorpion-nys-crashes/ \ 4 | > s3://scorpion-spark-outputs/nb-model 5 | -------------------------------------------------------------------------------- /Ch12/mrspark_bayes.py: -------------------------------------------------------------------------------- 1 | import json 2 | from functools import reduce 3 | import boto3 as aws 4 | from mrjob.job import MRJob 5 | 6 | def string_to_index(df, label): 7 | return StringIndexer(inputCol=label, 8 | outputCol="i-"+label).fit(df) \ 9 | .transform(df) 10 | 11 | def group_crashes(x): 12 | if int(x['Number of Vehicles Involved']) > 3: 13 | x['Number of Vehicles Involved'] = "3" 14 | return x 15 | 16 | def improve_times(x): 17 | time = x['Time'] 18 | if time < "5:00": 19 | x['Time'] = "Early morning" 20 | elif time < "7:00": 21 | x['Time'] = "Morning" 22 | elif time < "9:00": 23 | x['Time'] = "Morning commute" 24 | elif time < "12:00": 25 | x['Time'] = "Late morning" 26 | elif time < "16:00": 27 | x['Time'] = "Afternoon" 28 | elif time < "18:30": 29 | x['Time'] = "Evening commute" 30 | elif time < "22:00": 31 | x['Time'] = "Evening" 32 | else: 33 | x['Time'] = "Late night" 34 | return x 35 | 36 | class MRSparkBayes(MRJob): 37 | 38 | def spark(self, _, output_path): 39 | from pyspark import SparkContext 40 | from pyspark.sql import SparkSession 41 | from pyspark.ml.feature import StringIndexer, VectorAssembler 42 | from pyspark.ml.classification import NaiveBayes 43 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 44 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 45 | 46 | sc = SparkContext(appName="Crash model") 47 | spark = SparkSession.builder \ 48 | .master("local") \ 49 | .getOrCreate() 50 | 51 | s3 = aws.resource("s3") 52 | 53 | bucket = s3.Bucket("scorpion-nys-crashes") 54 | objects = [(obj.bucket_name, obj.key) for obj 55 | in bucket.objects.all()] 56 | xs = sc.parallelize(objects) \ 57 | .map(read_s3_object) \ 58 | .flatMap(lambda x:x.split("\n")) \ 59 | .filter(lambda x:x) \ 60 | .map(json.loads) \ 61 | .map(group_crashes) \ 62 | .map(improve_times) 63 | 64 | df = spark.createDataFrame(xs) 65 | 66 | feature_labels = df.columns 67 | feature_labels.pop(feature_labels.index('Number of Vehicles Involved')) 68 | df = reduce(string_to_index, feature_labels, df) 69 | indexes = ["i-"+f for f in feature_labels] 70 | 71 | df = VectorAssembler(inputCols=indexes, 72 | outputCol="features").transform(df) 73 | 74 | df = StringIndexer(inputCol='Number of Vehicles Involved', 75 | 
outputCol='label').fit(df).transform(df) 76 | 77 | grid = ParamGridBuilder().addGrid(nb.smoothing, [1.0, 1.5]) \ 78 | .build() 79 | 80 | 81 | mce = MulticlassClassificationEvaluator() 82 | nb = NaiveBayes() 83 | cv = CrossValidator(estimator=nb, estimatorParamMaps=grid, 84 | evaluator=mce,numFolds=5, 85 | parallelism=4) 86 | cv_model = cv.fit(df) 87 | transformed = cv_model.transform(df) 88 | f1 = mce.evaluate(transformed) 89 | print("NB F1: {:0.4f}".format(f1)) 90 | #cv_model.bestModel.save("./nb-model") 91 | cv_model.bestModel.save("./my-nb-model-s3") 92 | 93 | 94 | if __name__ == "__main__": 95 | MRSparkBayes.run() 96 | -------------------------------------------------------------------------------- /Ch12/nb_on_emr.sh: -------------------------------------------------------------------------------- 1 | aws emr add-steps --cluster-id j-2434JDJSLG768 --steps Type=spark,Name=SparkCrashesNB,Args=[--deploy-mode,cluster,--master,yarn,--conf,spark.yarn.submit.waitAppCompletion=false,--num-executors,2,--executor-cores,1,--executor-memory,10g,s3://scorpion-elastic-jobs/crashes_nb.py,s3://scorpion-nys-crashes/,s3://scorpion-spark-outputs/],ActionOnFailure=CONTINUE 2 | -------------------------------------------------------------------------------- /Ch12/spark_bayes.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 as aws 3 | from functools import reduce 4 | from pyspark import SparkContext 5 | from pyspark.sql import SparkSession 6 | from pyspark.ml.feature import StringIndexer, VectorAssembler 7 | from pyspark.ml.classification import NaiveBayes 8 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 9 | from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 10 | 11 | def string_to_index(df, label): 12 | return StringIndexer(inputCol=label, 13 | outputCol="i-"+label).fit(df) \ 14 | .transform(df) 15 | 16 | def group_crashes(x): 17 | if int(x['Number of Vehicles Involved']) > 3: 18 | x['Number of Vehicles Involved'] = "3" 19 | return x 20 | 21 | def improve_times(x): 22 | time = x['Time'] 23 | if time < "5:00": 24 | x['Time'] = "Early morning" 25 | elif time < "7:00": 26 | x['Time'] = "Morning" 27 | elif time < "9:00": 28 | x['Time'] = "Morning commute" 29 | elif time < "12:00": 30 | x['Time'] = "Late morning" 31 | elif time < "16:00": 32 | x['Time'] = "Afternoon" 33 | elif time < "18:30": 34 | x['Time'] = "Evening commute" 35 | elif time < "22:00": 36 | x['Time'] = "Evening" 37 | else: 38 | x['Time'] = "Late night" 39 | return x 40 | 41 | def read_s3_object(x): 42 | s3 = aws.resource("s3") 43 | obj = s3.Object(x[0], x[1]) 44 | return obj.get()['Body'].read().decode('ascii') 45 | 46 | 47 | if __name__ == "__main__": 48 | sc = SparkContext(appName="Crash model") 49 | spark = SparkSession.builder \ 50 | .master("local") \ 51 | .getOrCreate() 52 | 53 | s3 = aws.resource("s3") 54 | 55 | bucket = s3.Bucket("s3://path/to/your/bucket") 56 | objects = [(obj.bucket_name, obj.key) for obj 57 | in bucket.objects.all()] 58 | xs = sc.parallelize(objects) \ 59 | .map(read_s3_object) \ 60 | .flatMap(lambda x:x.split("\n")) \ 61 | .filter(lambda x:x) \ 62 | .map(json.loads) \ 63 | .map(group_crashes) \ 64 | .map(improve_times) 65 | 66 | df = spark.createDataFrame(xs) 67 | 68 | feature_labels = df.columns 69 | feature_labels.pop(feature_labels.index('Number of Vehicles Involved')) 70 | df = reduce(string_to_index, feature_labels, df) 71 | indexes = ["i-"+f for f in feature_labels] 72 | 73 | df = 
VectorAssembler(inputCols=indexes, 74 | outputCol="features").transform(df) 75 | 76 | df = StringIndexer(inputCol='Number of Vehicles Involved', 77 | outputCol='label').fit(df).transform(df) 78 | 79 | grid = ParamGridBuilder().addGrid(nb.smoothing, [1.0, 1.5]) \ 80 | .build() 81 | 82 | 83 | mce = MulticlassClassificationEvaluator() 84 | nb = NaiveBayes() 85 | cv = CrossValidator(estimator=nb, estimatorParamMaps=grid, 86 | evaluator=mce,numFolds=5, 87 | parallelism=4) 88 | cv_model = cv.fit(df) 89 | transformed = cv_model.transform(df) 90 | f1 = mce.evaluate(transformed) 91 | print("NB F1: {:0.4f}".format(f1)) 92 | cv_model.bestModel.save("s3://path/to/your/bucket") 93 | -------------------------------------------------------------------------------- /Ch12/spark_mrjob.conf: -------------------------------------------------------------------------------- 1 | runners: 2 | emr: 3 | num_core_instances: 2 4 | image_version: 5.24.0 5 | instance_type: m1.large 6 | region: us-east-1 7 | tags: 8 | project: Mastering Large Datasets 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Mastering Large Datasets with Python 2 | ======= 3 | JT Wolohan, 2019 4 | 5 | This repo is a companion to the book [Mastering Large Datasets with Python](https://www.manning.com/books/mastering-large-datasets-with-python). 6 | 7 | In addition to the code found in the book, most chapters have accompanying Jupyter notebook examples. 8 | 9 | 10 | 11 | 14 | 31 | 32 |
12 | 13 | 15 | Notebooks 16 | 30 |
33 | -------------------------------------------------------------------------------- /notebooks/Ch02_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Chapter 2. Working with large datasets faster: parallelization and the map function\n", 8 | "====\n", 9 | "### Mastering Large Datasets with Python by JT Wolohan \n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Listing 2.1 and 2.2 :: Formatting phone numbers with loops and maps" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import re\n", 27 | "\n", 28 | "\n", 29 | "class PhoneFormatter:\n", 30 | " def __init__(self):\n", 31 | " self.r = re.compile(r\"\\d\")\n", 32 | " \n", 33 | " def pretty_format(self, phone_number):\n", 34 | " numbers = self.r.findall(phone_number)\n", 35 | " area_code = \"\".join(numbers[-10:-7])\n", 36 | " first_3 = \"\".join(numbers[-7:-4])\n", 37 | " last_4 = \"\".join(numbers[-4:len(numbers)])\n", 38 | " return \"({}) {}-{}\".format(area_code, first_3, last_4)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "phone_numbers = [\n", 48 | " \"(123) 456-7890\",\n", 49 | " \"1234567890\",\n", 50 | " \"123.456.7890\",\n", 51 | " \"+1 123 456-7890\"\n", 52 | "]\n", 53 | "\n", 54 | "P = PhoneFormatter()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "clean_numbers = []\n", 64 | "for phone_number in phone_numbers:\n", 65 | " pretty = P.pretty_format(phone_number)\n", 66 | " clean_numbers.append(pretty)\n", 67 | "print(clean_numbers)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "print(list(map(P.pretty_format, phone_numbers)))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### Parallel blog processing" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from datetime import date\n", 93 | "from urllib import request\n", 94 | "\n", 95 | "from multiprocessing import Pool\n", 96 | "\n", 97 | "def days_between(start,stop):\n", 98 | " today = date(*start)\n", 99 | " stop = date(*stop)\n", 100 | " while today < stop:\n", 101 | " datestr = today.strftime(\"%m-%d-%Y\")\n", 102 | " yield \"http://jtwolohan.com/arch-rival-blog/\"+datestr\n", 103 | " today = date.fromordinal(today.toordinal()+1)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "def get_url(path):\n", 113 | " return request.urlopen(path).read()\n", 114 | "\n", 115 | "with Pool() as P:\n", 116 | " blog_posts = P.map(get_url,days_between((2000,1,1),(2011,1,1)))" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### Fizz Buzz - state and parallelization" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 51, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "class FizzBuzzer:\n", 133 | " def __init__(self):\n", 134 | " self.n = 0\n", 135 | " def foo(self,_):\n", 
136 | " self.n += 1\n", 137 | " if (self.n % 3) == 0:\n", 138 | " x = \"buzz\"\n", 139 | " else: x = \"fizz\"\n", 140 | " print(x)\n", 141 | " return x" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "FB = FizzBuzzer()\n", 151 | "for i in range(21):\n", 152 | " FB.foo(i)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "with Pool() as P:\n", 162 | " P.map(FB.foo, range(1,22))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Wikipedia scraping" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 3, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "import json\n", 179 | "from urllib import request, parse\n", 180 | "from multiprocessing import Pool\n", 181 | "from itertools import chain\n", 182 | "import networkx as nx" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 8, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "def link_to_title(link):\n", 192 | " return link[\"title\"]" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 9, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "def clean_if_key(page,key):\n", 202 | " if key in page.keys():\n", 203 | " return map(link_to_title,page[key])\n", 204 | " else: return []" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 20, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "def get_Wiki_links(pageTitle):\n", 214 | " safe_title = parse.quote(pageTitle)\n", 215 | " url = \"https://en.wikipedia.org/w/api.php?action=query&\\\n", 216 | "prop=links|linkshere&pllimit=500&lhlimit=500&titles={}&\\\n", 217 | "format=json&formatversion=2\".format(safe_title)\n", 218 | " page = request.urlopen(url).read().decode('utf-8')\n", 219 | " j = json.loads(page)\n", 220 | " jpage = j['query']['pages'][0]\n", 221 | " inbound = clean_if_key(jpage,\"links\")\n", 222 | " outbound = clean_if_key(jpage,\"linkshere\")\n", 223 | " return {\"title\": pageTitle,\n", 224 | " \"in-links\":list(inbound),\n", 225 | " \"out-links\":list(outbound)}" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 21, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "def flatten_network(page):\n", 235 | " return page[\"in-links\"]+page[\"out-links\"]" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 22, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "def page_to_edges(page):\n", 245 | " a = [(page['title'],p) for p in page['out-links']]\n", 246 | " b = [(p,page['title']) for p in page['in-links']]\n", 247 | " return a+b" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 23, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "root = get_Wiki_links(\"Parallel_computing\")\n", 257 | "initial_network = flatten_network(root)\n", 258 | "with Pool() as P:\n", 259 | " all_pages = P.map(get_Wiki_links, initial_network)\n", 260 | " edges = P.map(page_to_edges, all_pages)\n", 261 | "edges = chain.from_iterable(edges)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "G = nx.DiGraph()\n", 271 | "for e in edges:\n", 272 | " G.add_edge(*e)\n", 
273 | "nx.readwrite.gexf.write_gexf(G,\"./MyGraph.gexf\")" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "[Read for more? Go to chapter 3!](./Ch03_notebook.ipynb)" 281 | ] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "mldbook", 287 | "language": "python", 288 | "name": "mldbook" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.5.3" 301 | } 302 | }, 303 | "nbformat": 4, 304 | "nbformat_minor": 2 305 | } 306 | -------------------------------------------------------------------------------- /notebooks/Ch03_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Chapter 3. Function pipelines for mapping complex transformations\n", 8 | "====\n", 9 | "### Mastering Large Datasets with Python by JT Wolohan \n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Hacker translation" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import re\n", 27 | "from toolz.functoolz import pipe, compose\n", 28 | "\n", 29 | "sample_messages = [\n", 30 | "\"7his所is家4没s4mpl3动m3ss463\",\n", 31 | "\"don7家73ll经4nyon3法7his现m3ss463\",\n", 32 | "\"w3现4r3当b3in6进so好s3cr3t\",\n", 33 | "\"733小h33成h33去nobody看is天on分7o理us\",\n", 34 | "\"w3么will面n3v3r分637理c4u6ht\",\n", 35 | "\"w3事4r3经such没sn34ky天h4ckers\"]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "def replace_7t(s):\n", 45 | " return s.replace('7', 't')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "def replace_3e(s):\n", 55 | " return s.replace('3', 'e')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "def replace_6g(s):\n", 65 | " return s.replace('6', 'g')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "def replace_4a(s):\n", 75 | " return s.replace('4', 'a')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# Alternative approach\n", 85 | "# This function makes functions!\n", 86 | "def make_letter_replacer(letter_1, letter_2):\n", 87 | " def replacer(s):\n", 88 | " return s.replace(letter_1, letter_2)\n", 89 | " return replacer\n", 90 | "\n", 91 | "alt_replace_7t = make_letter_replacer('7','t')\n", 92 | "alt_replace_7t = make_letter_replacer('3','e')\n", 93 | "alt_replace_7t = make_letter_replacer('6','g')\n", 94 | "alt_replace_7t = make_letter_replacer('4','a')" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "class chinese_matcher:\n", 104 | " def __init__(self):\n", 105 | " self.r = re.compile(r'[\\u4e00-\\u9fff]+')\n", 106 | " \n", 107 | " def sub_chinese(self,s):\n", 108 | " return 
self.r.sub(\" \",s)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "C = chinese_matcher()\n", 118 | "\n", 119 | "# Not chained\n", 120 | "print(list(\n", 121 | "map( C.sub_chinese,\n", 122 | " map(replace_4a,\n", 123 | " map(replace_6g,\n", 124 | " map(replace_3e,\n", 125 | " map(replace_7t, sample_messages)))))),end=\"\\n\\n\")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# Option 1\n", 135 | "hacker_translate = compose(C.sub_chinese, replace_4a, replace_6g,\n", 136 | " replace_3e, replace_7t)\n", 137 | "\n", 138 | "print(list(map(hacker_translate, sample_messages)),end=\"\\n\\n\")" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# Option 2\n", 148 | "def hacker_translate(s):\n", 149 | " return pipe(s, replace_7t, replace_3e, replace_6g,\n", 150 | " replace_4a, C.sub_chinese)\n", 151 | "\n", 152 | "print(list(map(hacker_translate,sample_messages)),end=\"\\n\\n\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### Twitter scraping and gender prediction" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "from multiprocessing import Pool\n", 169 | "from toolz import compose, pipe\n", 170 | "import twitter\n", 171 | "\n", 172 | "# Remember to fill in the values below with your own account details\n", 173 | "Twitter = twitter.Api(consumer_key=\"\",\n", 174 | " consumer_secret=\"\",\n", 175 | " access_token_key=\"\",\n", 176 | " access_token_secret=\"\")" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "def get_tweet_from_id(tweet_id, api=Twitter):\n", 186 | " return api.GetStatus(tweet_id, trim_user=True)\n", 187 | "\n", 188 | "\n", 189 | "def tweet_to_text(tweet):\n", 190 | " return tweet.text\n", 191 | "\n", 192 | "\n", 193 | "def tokenize_text(text):\n", 194 | " return text.split()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "def score_text(tokens):\n", 204 | " words = {\"the\":1, \"to\":1, \"and\":1, #Words with 1 indicate men\n", 205 | " \"in\":1, \"have\":1, \"it\":1,\n", 206 | " \"be\":-1, \"of\":-1, \"a\":-1, # Words with -1 indicate women\n", 207 | " \"that\":-1, \"i\":-1, \"for\":-1}\n", 208 | " return sum(map(lambda x: words.get(x, 0), tokens))\n", 209 | "\n", 210 | "\n", 211 | "def score_tweet(tweet_id):\n", 212 | " return pipe(tweet_id, get_tweet_from_id, tweet_to_text,\n", 213 | " tokenize_text, score_text)\n", 214 | "\n", 215 | "\n", 216 | "def score_user(tweets):\n", 217 | " N = len(tweets)\n", 218 | " total = sum(map(score_tweet, tweets))\n", 219 | " return total/N\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "def categorize_user(user_score):\n", 229 | " if user_score > 0:\n", 230 | " return {\"score\":user_score,\n", 231 | " \"gender\": \"Male\"}\n", 232 | " return {\"score\":user_score,\n", 233 | " \"gender\":\"Female\"}" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 
null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "users_tweets = [\n", 243 | "[1056365937547534341, 1056310126255034368, 1055985345341251584,\n", 244 | "1056585873989394432, 1056585871623966720],\n", 245 | "[1055986452612419584, 1056318330037002240, 1055957256162942977,\n", 246 | " 1056585921154420736, 1056585896898805766],\n", 247 | "[1056240773572771841, 1056184836900175874, 1056367465477951490,\n", 248 | " 1056585972765224960, 1056585968155684864],\n", 249 | "[1056452187897786368, 1056314736546115584, 1055172336062816258,\n", 250 | " 1056585983175602176, 1056585980881207297]]\n", 251 | "gender_prediction_pipeline = compose(categorize_user, score_user)\n", 252 | "with Pool() as P:\n", 253 | " print(P.map(gender_prediction_pipeline, users_tweets))\n" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "[Read for more? Go to chapter 4!](./Ch04_notebook.ipynb)" 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "mldbook", 267 | "language": "python", 268 | "name": "mldbook" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.5.3" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 2 285 | } 286 | -------------------------------------------------------------------------------- /notebooks/Ch04_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Chapter 4. Processing large datasets with lazy workflows\n", 8 | "====\n", 9 | "### Mastering Large Datasets with Python by JT Wolohan \n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Poem Puzzle\n", 18 | "Remember to run the poem generation script before you run this code!" 
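A minimal sketch of that setup step (an assumption, not part of the notebook: it presumes generate_poems.py lives in ../Ch04/, takes no arguments, and writes author_a/ and author_b/ directories of .txt files into the working directory; adjust the path for your own checkout):

import subprocess, sys
from glob import glob

# Run the chapter 4 poem generator, then confirm the poem files exist.
# The script path and its output directories are assumptions for this sketch.
subprocess.run([sys.executable, "../Ch04/generate_poems.py"], check=True)
print(len(glob("author_a/*.txt")), "poems for author A,",
      len(glob("author_b/*.txt")), "poems for author B")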
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import toolz\n", 28 | "import re, itertools\n", 29 | "from glob import iglob" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "def word_ratio(d):\n", 39 | "    \"\"\"This helper function returns the ratio of a's to the's\"\"\"\n", 40 | "    return float(d.get(\"a\",0))/float(d.get(\"the\",0.0001))" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "class PoemCleaner:\n", 50 | "    def __init__(self):\n", 51 | "        self.r = re.compile(r'[.,;:!-]')\n", 52 | "\n", 53 | "    def clean_poem(self, fp):\n", 54 | "        \"\"\"This helper function opens a poem at a filepath and returns a clean poem.\n", 55 | "\n", 56 | "        A clean poem will be a punctuation-less sequence of lowercase words, in\n", 57 | "        the order that the author of the poem placed them.\n", 58 | "        \"\"\"\n", 59 | "        with open(fp) as poem:\n", 60 | "            no_punc = self.r.sub(\"\",poem.read())\n", 61 | "            return no_punc.lower().split()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "def word_is_desired(w):\n", 71 | "    \"\"\"This helper function detects whether a word is \"a\" or \"the\".\n", 72 | "\n", 73 | "    It is designed to be used in conjunction with filter to filter a sequence\n", 74 | "    of words down to just definite and indefinite articles.\n", 75 | "    \"\"\"\n", 76 | "    if w in [\"a\",\"the\"]:\n", 77 | "        return True\n", 78 | "    else:\n", 79 | "        return False" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "def analyze_poems(poems, cleaner):\n", 89 | "    return word_ratio(\n", 90 | "        toolz.frequencies(\n", 91 | "            filter(word_is_desired,\n", 92 | "                   itertools.chain(*map(cleaner.clean_poem, poems)))))" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "Cleaner = PoemCleaner()\n", 102 | "author_a_poems = iglob(\"author_a/*.txt\")\n", 103 | "author_b_poems = iglob(\"author_b/*.txt\")\n", 104 | "\n", 105 | "author_a_ratio = analyze_poems(author_a_poems, Cleaner)\n", 106 | "author_b_ratio = analyze_poems(author_b_poems, Cleaner)\n", 107 | "\n", 108 | "print(\"\"\"\n", 109 | "Original_Poem: 0.3\n", 110 | "Author A: {:.2f}\n", 111 | "Author B: {:.2f}\n", 112 | "\"\"\".format(author_a_ratio, author_b_ratio))" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "### Fishing village simulation" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "import random, itertools\n", 129 | "from operator import methodcaller" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "class Village:\n", 139 | "    def __init__(self):\n", 140 | "        self.population = random.uniform(1000,5000)\n", 141 | "        self.cheat_rate = random.uniform(.05,.15)\n", 142 | "\n", 143 | "    def update(self, sim):\n", 144 | "        if sim.cheaters >= 2:\n", 145 | "            self.cheat_rate += .05\n", 146 | "        self.population = int(self.population*1.025)\n", 147 | "\n", 148 | "    def 
go_fishing(self):\n", 149 | " if random.uniform(0,1) < self.cheat_rate:\n", 150 | " cheat = 1\n", 151 | " fish_taken = self.population * 2\n", 152 | " else:\n", 153 | " cheat = 0\n", 154 | " fish_taken = self.population * 1\n", 155 | " return fish_taken, cheat" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "class LakeSimulation:\n", 165 | " def __init__(self):\n", 166 | " self.villages = [Village() for _ in range(4)]\n", 167 | " self.fish = 80000\n", 168 | " self.year = 1\n", 169 | " self.cheaters = 0\n", 170 | "\n", 171 | " def simulate(self):\n", 172 | " for _ in itertools.count():\n", 173 | " yearly_results = map(methodcaller(\"go_fishing\"), self.villages)\n", 174 | " fishs, cheats = zip(*yearly_results)\n", 175 | " total_fished = sum(fishs)\n", 176 | " self.cheaters = sum(cheats)\n", 177 | " if self.year > 1000:\n", 178 | " print(\"Wow! Your villages lasted 1000 years!\")\n", 179 | " break\n", 180 | " if self.fish < total_fished:\n", 181 | " print(\"The lake was overfished in {} years.\".format(self.year))\n", 182 | " break\n", 183 | " else:\n", 184 | " self.fish = (self.fish-total_fished)* 1.15\n", 185 | " map(methodcaller(\"update\"), self.villages)\n", 186 | " print(\"Year {:<5} Fish: {}\".format(self.year,\n", 187 | " int(self.fish)))\n", 188 | " self.year += 1" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "random.seed(\"Wolohan\")\n", 198 | "Lake = LakeSimulation()\n", 199 | "Lake.simulate()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "[Read for more? Go to chapter 5!](./Ch05_notebook.ipynb)" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "mldbook", 213 | "language": "python", 214 | "name": "mldbook" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.5.3" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 2 231 | } 232 | -------------------------------------------------------------------------------- /notebooks/Ch05_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Chapter 5. 
Accumulation operations with Reduce\n", 8 | "====\n", 9 | "### Mastering Large Datasets with Python by JT Wolohan \n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Early chapter functions: Frequency and filter" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "from functools import reduce\n", 27 | "\n", 28 | "xs = [1, 2, 3, 4, 5, 6, 7, 8, 9]\n", 29 | "\n", 30 | "def keep_if_even(acc, nxt):\n", 31 | "    if nxt % 2 == 0:\n", 32 | "        return acc + [nxt]\n", 33 | "    else:\n", 34 | "        return acc\n", 35 | "\n", 36 | "\n", 37 | "reduce(keep_if_even, xs, [])\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from functools import reduce\n", 47 | "\n", 48 | "xs = [\"A\", \"B\", \"C\", \"A\", \"A\", \"C\", \"A\"]\n", 49 | "ys = [1, 3, 6, 1, 2, 9, 3, 12]\n", 50 | "\n", 51 | "\n", 52 | "def make_counts(acc, nxt):\n", 53 | "    acc[nxt] = acc.get(nxt, 0) + 1\n", 54 | "    return acc\n", 55 | "\n", 56 | "\n", 57 | "def my_frequencies(xs):\n", 58 | "    return reduce(make_counts, xs, {})\n", 59 | "\n", 60 | "\n", 61 | "print(my_frequencies(xs))\n", 62 | "print(my_frequencies(ys))\n", 63 | "print(my_frequencies(\"mississippi\"))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Analyzing car trends with reduce" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "**SCENARIO: CHANGING CAR TRENDS** *Your customer is a used car dealer. They have data on cars that they’ve bought and sold in the last 6 months and are hoping you can help them find what type of used cars they make the most profit on. One salesman believes that it’s high fuel-efficiency cars (those that get more than 35 miles per gallon) that make the most money, while another believes that medium-mileage cars (between 60,000 and 100,000 miles) result in the highest average profit on resale. 
Given a CSV file with a variety of attributes about some used cars, write a script to find the average profit on cars of low (<18 mpg), medium (18-35 mpg) and high (>35) fuel-efficiency as well as low (<60,000), medium (60,000-100,000), and high mileage (>100,000) and settle the debate.*" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from functools import reduce\n", 87 | "\n", 88 | "def low_med_hi(d, k, breaks):\n", 89 | " if float(d[k]) < breaks[0]:\n", 90 | " return \"low\"\n", 91 | " elif float(d[k]) < breaks[1]:\n", 92 | " return \"medium\"\n", 93 | " else:\n", 94 | " return \"high\"" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "def clean_entry(d):\n", 104 | " r = {'profit':None, 'mpg':None, 'odo':None}\n", 105 | " r['profit'] = float(d.get(\"price-sell\", 0)) - float(d.get(\"price-buy\", 0))\n", 106 | " r['mpg'] = low_med_hi(d, 'mpg', (18, 35))\n", 107 | " r['odo'] = low_med_hi(d, 'odo', (60000, 105000))\n", 108 | " return r" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "def acc_average(acc, profit):\n", 118 | " acc['total'] = acc.get('total', 0) + profit\n", 119 | " acc['count'] = acc.get('count', 0) + 1\n", 120 | " acc['average'] = acc['total']/acc['count']\n", 121 | " return acc" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "def sort_and_add(acc, nxt):\n", 131 | " p = nxt['profit']\n", 132 | " acc['mpg'][nxt['mpg']] = acc_average(acc['mpg'].get(nxt['mpg'], {}), p)\n", 133 | " acc['odo'][nxt['odo']] = acc_average(acc['odo'].get(nxt['odo'], {}), p)\n", 134 | " return acc" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "import json\n", 144 | "with open(\"../Ch05/cars.json\") as f:\n", 145 | " xs = json.load(f)\n", 146 | "results = reduce(sort_and_add, map(clean_entry, xs), {\"mpg\": {}, \"odo\": {}})\n", 147 | "print(json.dumps(results, indent=4))" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "[Read for more? Go to chapter 6!](./Ch06_notebook.ipynb)" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "mldbook", 161 | "language": "python", 162 | "name": "mldbook" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.5.3" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | -------------------------------------------------------------------------------- /notebooks/Ch06_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Chapter 6. 
Speeding up map and reduce with advanced parallelization\n", 8 | "====\n", 9 | "### Mastering Large Datasets with Python by JT Wolohan \n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Timing" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "\n", 30 | "-- N = 1 --\n", 31 | "Lazy map time: 8.000000000008e-06\n", 32 | "Parallel map time: 0.01324599999999998\n", 33 | "\n", 34 | "\n", 35 | "-- N = 10 --\n", 36 | "Lazy map time: 9.599999999998499e-05\n", 37 | "Parallel map time: 0.014952000000000076\n", 38 | "\n", 39 | "\n", 40 | "-- N = 100 --\n", 41 | "Lazy map time: 5.900000000003125e-05\n", 42 | "Parallel map time: 0.01502199999999998\n", 43 | "\n", 44 | "\n", 45 | "-- N = 1000 --\n", 46 | "Lazy map time: 0.0003989999999999272\n", 47 | "Parallel map time: 0.014475000000000016\n", 48 | "\n", 49 | "\n", 50 | "-- N = 10000 --\n", 51 | "Lazy map time: 0.0038730000000000153\n", 52 | "Parallel map time: 0.01732200000000006\n", 53 | "\n", 54 | "\n", 55 | "-- N = 100000 --\n", 56 | "Lazy map time: 0.03707399999999994\n", 57 | "Parallel map time: 0.02400800000000003\n", 58 | "\n", 59 | "\n", 60 | "-- N = 1000000 --\n", 61 | "Lazy map time: 0.199009\n", 62 | "Parallel map time: 0.13838499999999998\n", 63 | "\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "from time import clock, sleep\n", 69 | "from multiprocessing import Pool\n", 70 | "\n", 71 | "\n", 72 | "def times_two(x):\n", 73 | " return x*2+7\n", 74 | "\n", 75 | "\n", 76 | "def lazy_map(xs):\n", 77 | " return list(map(times_two, xs))\n", 78 | "\n", 79 | "\n", 80 | "def parallel_map(xs, chunck=8500):\n", 81 | " with Pool(2) as P:\n", 82 | " x = P.map(times_two, xs, chunck)\n", 83 | " return x\n", 84 | "\n", 85 | "\n", 86 | "for i in range(0, 7):\n", 87 | " N = 10**i\n", 88 | " t1 = clock()\n", 89 | " lazy_map(range(N))\n", 90 | " lm_time = clock() - t1\n", 91 | "\n", 92 | " t1 = clock()\n", 93 | " parallel_map(range(N))\n", 94 | " par_time = clock() - t1\n", 95 | " print(\"\"\"\n", 96 | "-- N = {} --\n", 97 | "Lazy map time: {}\n", 98 | "Parallel map time: {}\n", 99 | "\"\"\".format(N, lm_time, par_time))" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "\n", 112 | "chunksize | runtime\n", 113 | "-------------------------\n", 114 | "5 | 5.083\n", 115 | "50 | 1.431\n", 116 | "500 | 0.291\n", 117 | "5000 | 0.199\n", 118 | "50000 | 0.159\n", 119 | "500000 | 0.203\n", 120 | "5000000 | 0.182\n", 121 | "50000000 | 0.164\n", 122 | "500000000 | 0.157\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "from time import clock\n", 128 | "from multiprocessing import Pool\n", 129 | "\n", 130 | "\n", 131 | "def times_two(x):\n", 132 | " return x*2+7\n", 133 | "\n", 134 | "\n", 135 | "def parallel_map(xs, chunk_size=8500):\n", 136 | " with Pool(2) as P:\n", 137 | " x = P.map(times_two, xs, chunk_size)\n", 138 | " return x\n", 139 | "\n", 140 | "\n", 141 | "print(\"\"\"\n", 142 | "{:<10} | {}\n", 143 | "-------------------------\"\"\".format(\"chunksize\", \"runtime\"))\n", 144 | "\n", 145 | "for i in range(0, 9):\n", 146 | " N = 1000000\n", 147 | " chunk_size = 5 * (10**i)\n", 148 | "\n", 149 | " t1 = clock()\n", 150 | " parallel_map(range(N), chunk_size)\n", 151 | " parallel_time = clock() - 
t1\n", 152 | "\n", 153 | " print(\"{:<10} | {:>0.3f}\".format(chunk_size, parallel_time))" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "### Parallel sum" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "from pathos.multiprocessing import ProcessingPool as Pool\n", 170 | "from toolz.sandbox.parallel import fold\n", 171 | "from functools import reduce\n", 172 | "\n", 173 | "\n", 174 | "def my_add(left, right):\n", 175 | " return left+right\n", 176 | "\n", 177 | "\n", 178 | "with Pool() as P: \n", 179 | " fold(my_add, range(500000), map=P.imap)\n", 180 | "\n", 181 | "print(reduce(my_add, range(500)))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Parallel filter" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 5, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 312, 314, 316, 318, 320, 322, 324, 326, 328, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376, 378, 380, 382, 384, 386, 388, 390, 392, 394, 396, 398, 400, 402, 404, 406, 408, 410, 412, 414, 416, 418, 420, 422, 424, 426, 428, 430, 432, 434, 436, 438, 440, 442, 444, 446, 448, 450, 452, 454, 456, 458, 460, 462, 464, 466, 468, 470, 472, 474, 476, 478, 480, 482, 484, 486, 488, 490, 492, 494, 496, 498]\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "from pathos.multiprocessing import ProcessingPool as Pool\n", 206 | "from toolz.sandbox.parallel import fold\n", 207 | "from functools import reduce\n", 208 | "\n", 209 | "\n", 210 | "def map_combination(left, right):\n", 211 | " return left + right\n", 212 | "\n", 213 | "\n", 214 | "def keep_if_even(acc, nxt):\n", 215 | " if nxt % 2 == 0:\n", 216 | " return acc + [nxt]\n", 217 | " else: return acc\n", 218 | "\n", 219 | "\n", 220 | "with Pool() as P:\n", 221 | " fold(keep_if_even, range(500000), [],\n", 222 | " map=P.imap, combine=map_combination)\n", 223 | "\n", 224 | "print(reduce(keep_if_even, range(500), []))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### Parallel frequencies" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 6, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "{1: 76, 2: 94, 3: 74, 4: 78, 5: 88, 6: 90}\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "from pathos.multiprocessing import ProcessingPool as Pool\n", 249 | "from toolz.sandbox.parallel import fold\n", 250 | "from random import choice\n", 251 
| "from functools import reduce\n", 252 | "\n", 253 | "\n", 254 | "def combine_counts(left, right):\n", 255 | " unique_keys = set(left.keys()).union(set(right.keys()))\n", 256 | " return {k:left.get(k, 0)+right.get(k, 0) for k in unique_keys}\n", 257 | "\n", 258 | "\n", 259 | "def make_counts(acc, nxt):\n", 260 | " acc[nxt] = acc.get(nxt,0) + 1\n", 261 | " return acc\n", 262 | "\n", 263 | "\n", 264 | "xs = (choice([1, 2, 3, 4, 5, 6]) for _ in range(500000))\n", 265 | "\n", 266 | "with Pool() as P:\n", 267 | " fold(make_counts, xs, {},\n", 268 | " map=P.imap, combine=combine_counts)\n", 269 | "\n", 270 | "print(reduce(make_counts, (choice([1, 2, 3, 4, 5, 6]) for _ in range(500)), {}))" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### Parallel Naive Bayes\n", 278 | "**NB:** *This code ended up getting cut from the book. It implements the naive Bayes algorithm in parallel using map and reduce patterns. Feel free to read through it as a bonus.*" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "from itertools import starmap, repeat\n", 288 | "from functools import reduce, partial\n", 289 | "import dill as pickle\n", 290 | "from toolz.sandbox.parallel import fold\n", 291 | "from pathos.multiprocessing import ProcessingPool as PathosPool\n", 292 | "from multiprocessing import Pool\n", 293 | "from csv import DictReader" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "def unique_keys(left, right):\n", 303 | " return set(left.keys()).union(set(right.keys()))\n", 304 | "\n", 305 | "def prod(xs):\n", 306 | " return reduce(lambda acc,nxt: acc*nxt, xs)" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "def compute_prob(model, k, v, label, N):\n", 316 | " \"\"\"Compute probabilities for event.\"\"\"\n", 317 | " Cn = model['LABELS'][label]\n", 318 | " prior = Cn / N\n", 319 | " evidence = model[k][v].get(label,.001) / Cn\n", 320 | " return prior * evidence\n", 321 | "\n", 322 | "def _nb_suggest(ob, model, target):\n", 323 | " \"\"\"maknaive Bayes prediction\"\"\"\n", 324 | " ob.pop(target)\n", 325 | " N = sum(model['LABELS'].values())\n", 326 | " results = {}\n", 327 | " for label in model['LABELS'].keys():\n", 328 | " p = prod(compute_prob(model, k, v, label, N) for k, v in ob.items())\n", 329 | " results[label] = p\n", 330 | " return results\n", 331 | "\n", 332 | "def naive_bayes_suggest(obs, model, target):\n", 333 | " \"\"\"Parallel naive Bayes prediction function\"\"\"\n", 334 | " with Pool() as P:\n", 335 | " f = partial(_nb_suggest, target=target)\n", 336 | " return P.starmap(f, zip(obs, repeat(model)))" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "def nb_acc(acc, nxt, target):\n", 346 | " label = nxt.pop(target)\n", 347 | " if not acc.get('LABELS', False):\n", 348 | " acc['LABELS'] = {}\n", 349 | " acc['LABELS'][label] = acc['LABELS'].get(label,0) + 1\n", 350 | " for k,v in nxt.items():\n", 351 | " if not acc.get(k,False):\n", 352 | " acc[k] = {}\n", 353 | " if not acc[k].get(v, False):\n", 354 | " acc[k][v] = {}\n", 355 | " acc[k][v][label] = acc.get(k,{}).get(v,{}).get(label,0) + 1\n", 356 | " return acc" 357 | ] 358 | }, 359 | { 360 
| "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "def _nb_comb(left, right):\n", 366 | " acc = {}\n", 367 | " acc['LABELS'] = {}\n", 368 | " for k in unique_keys(left['LABELS'], right['LABELS']):\n", 369 | " acc['LABELS'][k] = left['LABELS'].get(k,0) + right['LABELS'].get(k,0)\n", 370 | " for k in unique_keys(left, right):\n", 371 | " if k == 'LABELS': continue\n", 372 | " acc[k] = {}\n", 373 | " for v in unique_keys(left.get(k,{}), right.get(k,{})):\n", 374 | " acc[k][v] = {}\n", 375 | " for label in acc['LABELS']:\n", 376 | " count_left = left.get(k,{}).get(v,{}).get(label,0)\n", 377 | " count_right = right.get(k,{}).get(v,{}).get(label,0)\n", 378 | " acc[k][v][label] = count_left + count_right\n", 379 | " return acc" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 7, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "def naive_bayes(xs, target):\n", 389 | " \"\"\"Create a naive Bayes model.\n", 390 | "\n", 391 | "\n", 392 | " Inputs\n", 393 | " xs: input data\n", 394 | " target: target variable\n", 395 | " \n", 396 | " Output\n", 397 | " prediction function\n", 398 | "\"\"\"\n", 399 | " acc = partial(nb_acc, target=target)\n", 400 | " with PathosPool() as P:\n", 401 | " model = fold(acc, xs, {}, map=P.map, combine=_nb_comb)\n", 402 | " return partial(naive_bayes_suggest, model=model, target=target)\n", 403 | "\n", 404 | "def max_prob(probs):\n", 405 | " return max(((k,v) for k,v in probs.items()), key=lambda x:x[1])[0]" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "Download [the nursery data](https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data) and assign its path to `fp` in the next block" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "fp = \"\"\n", 422 | "with open(fp) as f:\n", 423 | " reader = DictReader(f, fieldnames=[\"parents\", \"has_nurs\", \"form\",\n", 424 | " \"children\", \"housing\", \"finance\",\n", 425 | " \"social\", \"health\", \"recc\"])\n", 426 | " data = [row for row in reader]\n", 427 | "\n", 428 | "model = naive_bayes(data, \"recc\")\n", 429 | "probs = model(data)\n", 430 | "print(\"{}\\t\\t{}\\t{}\".format(\"Match\", \"Suggestion\", \"Actual\"))\n", 431 | "print(\"{}\".format(\"-\"*45))\n", 432 | "for i,p in enumerate(probs):\n", 433 | " suggestion = max_prob(p)\n", 434 | " actual = data[i]['recc']\n", 435 | " match = suggestion == actual\n", 436 | " print(\"{}\\t\\t{}\\t{}\".format(match, suggestion, actual))\n", 437 | " if i > 25: break" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "[Read for more? 
Go to chapter 7!](./Ch07_notebook.ipynb)" 445 | ] 446 | } 447 | ], 448 | "metadata": { 449 | "kernelspec": { 450 | "display_name": "mldbook", 451 | "language": "python", 452 | "name": "mldbook" 453 | }, 454 | "language_info": { 455 | "codemirror_mode": { 456 | "name": "ipython", 457 | "version": 3 458 | }, 459 | "file_extension": ".py", 460 | "mimetype": "text/x-python", 461 | "name": "python", 462 | "nbconvert_exporter": "python", 463 | "pygments_lexer": "ipython3", 464 | "version": "3.5.3" 465 | } 466 | }, 467 | "nbformat": 4, 468 | "nbformat_minor": 2 469 | } 470 | -------------------------------------------------------------------------------- /notebooks/Ch07_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Chapter 7. Processing truly big datasets with Hadoop and Spark\n", 8 | "====\n", 9 | "### Mastering Large Datasets with Python by JT Wolohan \n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Spark\n", 18 | "One of the great benefits of Spark is that you can run Spark jobs in a Jupyter notebook, just like this one." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import re\n", 28 | "from pyspark import SparkContext\n", 29 | "\n", 30 | "sc = SparkContext(appName=\"WordScores\")\n", 31 | "PAT = re.compile(r'[-./:\\s\\xa0]+')\n", 32 | "text_files = sc.textFile(\"../Ch07/*.txt\")\n", 33 | "xs = text_files.flatMap(lambda x:PAT.split(x))\\\n", 34 | " .filter(lambda x:len(x)>6)\\\n", 35 | " .countByValue()\n", 36 | "\n", 37 | "for k,v in xs.items():\n", 38 | " print(\"{:<30}{}\".format(k,v))" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "[Read for more? Go to chapter 9!](./Ch09_notebook.ipynb)\n", 46 | "\n", 47 | "(There's no notebook for Chapter 8. Chapter 8 focuses on Hadoop.)" 48 | ] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "mldbook", 54 | "language": "python", 55 | "name": "mldbook" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.5.3" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 2 72 | } 73 | -------------------------------------------------------------------------------- /notebooks/Ch09_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Chapter 9. 
PageRank with Map and Reduce in PySpark\n", 8 | "====\n", 9 | "### Mastering Large Datasets with Python by JT Wolohan \n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Elo ratings in Spark" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 3, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import re, json\n", 27 | "from pyspark import SparkContext" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "def round5(x):\n", 37 | " return 5*int(x/5)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def clean_match(match):\n", 47 | " ms = match.split(',')\n", 48 | " match_data = {'winner': ms[10],\n", 49 | " 'loser': ms[20],\n", 50 | " 'surface': ms[2]}\n", 51 | " return match_data" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 6, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "def elo_acc(acc,nxt):\n", 61 | " w_elo = acc.get(nxt['winner'],1600)\n", 62 | " l_elo = acc.get(nxt['loser'],1600)\n", 63 | " Qw = 10**(w_elo/400)\n", 64 | " Ql = 10**(l_elo/400)\n", 65 | " Qt = Qw+Ql\n", 66 | " acc[nxt['winner']] = round5(w_elo + 25*(1-(Qw/Qt)))\n", 67 | " acc[nxt['loser']] = round5(l_elo - 25*(Ql/Qt))\n", 68 | " return acc" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 7, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "def elo_comb(a,b):\n", 78 | " a.update(b)\n", 79 | " return a" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 12, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "Martina Hingis 1865\n", 92 | "Venus Williams 1830\n", 93 | "Monica Seles 1765\n", 94 | "Serena Williams 1755\n", 95 | "Lindsay Davenport 1745\n", 96 | "Maria Sharapova 1720\n", 97 | "Petra Russegger 1710\n", 98 | "Akiko Morigami 1690\n", 99 | "Garbine Muguruza 1685\n", 100 | "Victoria Azarenka 1665\n", 101 | "Nour Abbes 1660\n", 102 | "Timea Bacsinszky 1660\n", 103 | "Belinda Bencic 1655\n", 104 | "Amelie Mauresmo 1655\n", 105 | "Mary Pierce 1655\n", 106 | "Jennifer Saret 1650\n", 107 | "Angelique Kerber 1650\n", 108 | "Bermet Duvanaeva 1650\n", 109 | "Svetlana Komleva 1650\n", 110 | "Cecilia Costa Melgar 1650\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "sc = SparkContext(appName=\"TennisRatings\")\n", 116 | "text_files = sc.textFile(\"/path/to/my/data/wta_matches*\")\n", 117 | "xs = text_files.map(clean_match)\\\n", 118 | " .aggregate({},elo_acc, elo_comb)\n", 119 | "\n", 120 | "for x in sorted(xs.items(), key=lambda x:x[1], reverse=True)[:20]:\n", 121 | " print(\"{:<30}{}\".format(*x))" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "### Page rank in Spark" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 25, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "from math import log2, ceil\n", 138 | "from functools import partial\n", 139 | "from pyspark import SparkContext" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 16, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "def ceil5(x):\n", 149 | " return ceil(x/5)*5" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 17, 155 | "metadata": {}, 156 | 
"outputs": [], 157 | "source": [ 158 | "def get_winner_loser(match):\n", 159 | " ms = match.split(',')\n", 160 | " # Put the loser in first position, winner in second\n", 161 | " return (ms[20], ms[10])" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 18, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "def initialize_for_voting(losses):\n", 171 | " return {'losses': losses,\n", 172 | " 'n_losses': len(losses),\n", 173 | " 'rating': 100}" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 19, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "def empty_ratings(d):\n", 183 | " d['rating'] = 0\n", 184 | " return d" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 20, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "def allocate_points(acc, nxt):\n", 194 | " k,v = nxt\n", 195 | " boost = v['rating'] / (v['n_losses'] + .01)\n", 196 | " for loss in v['losses']:\n", 197 | " if loss not in acc.keys():\n", 198 | " acc[loss] = {'losses':[], 'n_losses': 0}\n", 199 | " opp_rating = acc.get(loss,{}).get('rating',0)\n", 200 | " acc[loss]['rating'] = opp_rating + boost\n", 201 | " return acc" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 21, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "def combine_scores(a, b):\n", 211 | " for k,v in b.items():\n", 212 | " try:\n", 213 | " a[k]['rating'] = a[k]['rating'] + b[k]['rating']\n", 214 | " except KeyError:\n", 215 | " a[k] = v\n", 216 | " return a" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "To run the cell below, you may need to un-comment the Spark context. If you ran the Elo rating example above, leave it commented." 
224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 26, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "Serena Williams 12.4\t5475\n", 236 | "Venus Williams 12.0\t4230\n", 237 | "Kim Clijsters 11.9\t3870\n", 238 | "Maria Sharapova 11.9\t3785\n", 239 | "Justine Henin 11.8\t3660\n", 240 | "Elena Dementieva 11.6\t3130\n", 241 | "Amelie Mauresmo 11.6\t3115\n", 242 | "Svetlana Kuznetsova 11.6\t3060\n", 243 | "Jelena Jankovic 11.6\t3055\n", 244 | "Lindsay Davenport 11.6\t3055\n", 245 | "Victoria Azarenka 11.3\t2485\n", 246 | "Ana Ivanovic 11.2\t2405\n", 247 | "Daniela Hantuchova 11.2\t2385\n", 248 | "Nadia Petrova 11.2\t2360\n", 249 | "Caroline Wozniacki 11.2\t2350\n", 250 | "Agnieszka Radwanska 11.2\t2335\n", 251 | "Vera Zvonareva 11.2\t2320\n", 252 | "Patty Schnyder 11.1\t2220\n", 253 | "Samantha Stosur 11.1\t2215\n", 254 | "Francesca Schiavone 11.0\t2100\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "#sc = SparkContext(appName=\"TennisRatings\")\n", 260 | "match_data = sc.textFile(\"path/to/tennis/files\")\n", 261 | "xs = match_data.map(get_winner_loser)\\\n", 262 | " .groupByKey()\\\n", 263 | " .mapValues(initialize_for_voting)\n", 264 | "\n", 265 | "for i in range(8):\n", 266 | " if i > 0:\n", 267 | " xs = sc.parallelize(zs.items())\n", 268 | " acc = dict(xs.mapValues(empty_ratings).collect())\n", 269 | " zs = xs.aggregate(acc, allocate_points, combine_scores)\n", 270 | "\n", 271 | "ratings = [(k,v['rating']) for k,v in zs.items()]\n", 272 | "for player, rating in sorted(ratings, key=lambda x: x[1], reverse=True)[:20]:\n", 273 | " print('{:<30}{}\\t{}'.format(player,\n", 274 | " round(log2(rating+1), 1),\n", 275 | " ceil5(rating)))\n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "[Read for more? Go to chapter 10!](./Ch03_notebook.ipynb)" 283 | ] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "mldbook", 289 | "language": "python", 290 | "name": "mldbook" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.5.3" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 2 307 | } 308 | -------------------------------------------------------------------------------- /notebooks/Ch10_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Chapter 10. 
Faster decision making with machine learning andPySpark\n", 8 | "====\n", 9 | "### Mastering Large Datasets with Python by JT Wolohan \n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Decision Trees" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 21, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "from pyspark import SparkContext\n", 27 | "from pyspark.sql import SparkSession\n", 28 | "from functools import reduce\n", 29 | "from pyspark.ml.feature import StringIndexer, VectorAssembler\n", 30 | "from pyspark.ml.classification import DecisionTreeClassifier\n", 31 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 22, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "def string_to_index(df, label):\n", 41 | " return StringIndexer(inputCol=label,\n", 42 | " outputCol=\"i-\"+label).fit(df) \\\n", 43 | " .transform(df)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 23, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "spark = SparkSession.builder \\\n", 53 | " .master(\"local\") \\\n", 54 | " .appName(\"Decision Trees\") \\\n", 55 | " .getOrCreate()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 24, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df = spark.read.csv(\"../Ch10/mushrooms.data\", header=True, inferSchema=True)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 25, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "categories = ['cap-shape', 'cap-surface', 'cap-color']\n", 74 | "df = reduce(string_to_index, categories, df)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 26, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "df = VectorAssembler(inputCols=[\"i-cap-shape\",\"i-cap-surface\", \"i-cap-color\"],\n", 84 | " outputCol=\"features\").transform(df)\n", 85 | "\n", 86 | "df = StringIndexer(inputCol='edible?', outputCol='label').fit(df).transform(df)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 27, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "tree = DecisionTreeClassifier()\n", 96 | "model = tree.fit(df)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 28, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "DecisionTreeClassificationModel (uid=DecisionTreeClassifier_06513451d79b) of depth 5 with 29 nodes\n", 109 | " If (feature 1 in {2.0,3.0})\n", 110 | " If (feature 2 in {0.0,2.0,4.0,6.0,7.0})\n", 111 | " If (feature 2 in {0.0,2.0,7.0})\n", 112 | " If (feature 0 in {0.0,1.0,2.0,4.0})\n", 113 | " Predict: 0.0\n", 114 | " Else (feature 0 not in {0.0,1.0,2.0,4.0})\n", 115 | " Predict: 1.0\n", 116 | " Else (feature 2 not in {0.0,2.0,7.0})\n", 117 | " If (feature 2 in {6.0})\n", 118 | " Predict: 1.0\n", 119 | " Else (feature 2 not in {6.0})\n", 120 | " Predict: 0.0\n", 121 | " Else (feature 2 not in {0.0,2.0,4.0,6.0,7.0})\n", 122 | " If (feature 2 in {3.0})\n", 123 | " Predict: 1.0\n", 124 | " Else (feature 2 not in {3.0})\n", 125 | " Predict: 0.0\n", 126 | " Else (feature 1 not in {2.0,3.0})\n", 127 | " If (feature 0 in {3.0,5.0})\n", 128 | " If (feature 2 in {0.0,1.0,3.0})\n", 129 | " If (feature 0 in {5.0})\n", 130 | " Predict: 1.0\n", 131 | " Else (feature 0 not in {5.0})\n", 
132 | " Predict: 0.0\n", 133 | " Else (feature 2 not in {0.0,1.0,3.0})\n", 134 | " If (feature 2 in {5.0,6.0})\n", 135 | " Predict: 1.0\n", 136 | " Else (feature 2 not in {5.0,6.0})\n", 137 | " If (feature 0 in {5.0})\n", 138 | " Predict: 1.0\n", 139 | " Else (feature 0 not in {5.0})\n", 140 | " Predict: 0.0\n", 141 | " Else (feature 0 not in {3.0,5.0})\n", 142 | " If (feature 0 in {2.0})\n", 143 | " If (feature 2 in {1.0,4.0,5.0,6.0})\n", 144 | " Predict: 0.0\n", 145 | " Else (feature 2 not in {1.0,4.0,5.0,6.0})\n", 146 | " Predict: 1.0\n", 147 | " Else (feature 0 not in {2.0})\n", 148 | " If (feature 2 in {8.0,9.0})\n", 149 | " Predict: 0.0\n", 150 | " Else (feature 2 not in {8.0,9.0})\n", 151 | " Predict: 1.0\n", 152 | "\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "print(model.toDebugString)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 29, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "Decision Tree AUC: 0.6333\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "bce = BinaryClassificationEvaluator()\n", 175 | "\n", 176 | "auc = bce.evaluate(model.transform(df))\n", 177 | "print(\"Decision Tree AUC: {:0.4f}\".format(auc))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "### Random Forests" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 30, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "from pyspark import SparkContext\n", 194 | "from pyspark.sql import SparkSession\n", 195 | "from functools import reduce\n", 196 | "from pyspark.ml.feature import StringIndexer, VectorAssembler\n", 197 | "from pyspark.ml.classification import RandomForestClassifier\n", 198 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 199 | "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 31, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "spark = SparkSession.builder \\\n", 209 | " .master(\"local\") \\\n", 210 | " .appName(\"Random Forests\") \\\n", 211 | " .getOrCreate()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 32, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "bce = BinaryClassificationEvaluator()\n", 221 | "forest = RandomForestClassifier()\n", 222 | "df = spark.read.csv(\"../Ch10/mushrooms.data\", header=True, inferSchema=True)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 33, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "categories = df.columns\n", 232 | "categories.pop(categories.index('edible?'))\n", 233 | "df = reduce(string_to_index, categories, df)\n", 234 | "indexes = [\"i-\"+c for c in categories]" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 34, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "df = VectorAssembler(inputCols=indexes,\n", 244 | " outputCol=\"features\").transform(df)\n", 245 | "df = StringIndexer(inputCol='edible?',\n", 246 | " outputCol='label').fit(df).transform(df)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 35, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "grid = ParamGridBuilder().addGrid(forest.maxDepth, [0, 2]).build()\n", 256 | "cv = CrossValidator(estimator=forest, estimatorParamMaps=grid,\n", 257 | " 
evaluator=bce,numFolds=10,\n", 258 | " parallelism=4)\n", 259 | "cv_model = cv.fit(df)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 36, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 | "output_type": "stream", 270 | "text": [ 271 | "Random Forest AUC: 0.9950\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "area_under_curve = bce.evaluate(cv_model.transform(df))\n", 277 | "print(\"Random Forest AUC: {:0.4f}\".format(area_under_curve))" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 37, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "RandomForestClassificationModel (uid=RandomForestClassifier_3715b1717fde) with 20 trees\n", 290 | " Tree 0 (weight 1.0):\n", 291 | " If (feature 7 in {0.0})\n", 292 | " If (feature 11 in {0.0,2.0,3.0})\n", 293 | " Predict: 0.0\n", 294 | " Else (feature 11 not in {0.0,2.0,3.0})\n", 295 | " Predict: 1.0\n", 296 | " Else (feature 7 not in {0.0})\n", 297 | " If (feature 1 in {2.0,3.0})\n", 298 | " Predict: 0.0\n", 299 | " Else (feature 1 not in {2.0,3.0})\n", 300 | " Predict: 1.0\n", 301 | " Tree 1 (weight 1.0):\n", 302 | " If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 303 | " If (feature 4 in {0.0,4.0,5.0})\n", 304 | " Predict: 0.0\n", 305 | " Else (feature 4 not in {0.0,4.0,5.0})\n", 306 | " Predict: 1.0\n", 307 | " Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 308 | " If (feature 21 in {6.0})\n", 309 | " Predict: 0.0\n", 310 | " Else (feature 21 not in {6.0})\n", 311 | " Predict: 1.0\n", 312 | " Tree 2 (weight 1.0):\n", 313 | " If (feature 11 in {0.0,2.0,3.0})\n", 314 | " Predict: 0.0\n", 315 | " Else (feature 11 not in {0.0,2.0,3.0})\n", 316 | " If (feature 20 in {2.0,3.0})\n", 317 | " Predict: 0.0\n", 318 | " Else (feature 20 not in {2.0,3.0})\n", 319 | " Predict: 1.0\n", 320 | " Tree 3 (weight 1.0):\n", 321 | " If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 322 | " If (feature 7 in {0.0})\n", 323 | " Predict: 0.0\n", 324 | " Else (feature 7 not in {0.0})\n", 325 | " Predict: 1.0\n", 326 | " Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 327 | " If (feature 20 in {2.0,3.0,5.0})\n", 328 | " Predict: 0.0\n", 329 | " Else (feature 20 not in {2.0,3.0,5.0})\n", 330 | " Predict: 1.0\n", 331 | " Tree 4 (weight 1.0):\n", 332 | " If (feature 8 in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n", 333 | " If (feature 4 in {0.0,4.0,5.0,8.0})\n", 334 | " Predict: 0.0\n", 335 | " Else (feature 4 not in {0.0,4.0,5.0,8.0})\n", 336 | " Predict: 1.0\n", 337 | " Else (feature 8 not in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n", 338 | " If (feature 19 in {1.0,2.0,4.0,6.0})\n", 339 | " Predict: 0.0\n", 340 | " Else (feature 19 not in {1.0,2.0,4.0,6.0})\n", 341 | " Predict: 1.0\n", 342 | " Tree 5 (weight 1.0):\n", 343 | " If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 344 | " Predict: 0.0\n", 345 | " Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 346 | " If (feature 20 in {2.0,3.0,5.0})\n", 347 | " Predict: 0.0\n", 348 | " Else (feature 20 not in {2.0,3.0,5.0})\n", 349 | " Predict: 1.0\n", 350 | " Tree 6 (weight 1.0):\n", 351 | " If (feature 8 in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n", 352 | " Predict: 0.0\n", 353 | " Else (feature 8 not in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n", 354 | " If (feature 18 in {0.0,3.0})\n", 355 | " Predict: 0.0\n", 356 | " Else (feature 18 not in {0.0,3.0})\n", 357 | " Predict: 1.0\n", 358 | " Tree 7 (weight 1.0):\n", 359 | " If 
(feature 20 in {1.0,2.0,3.0,4.0,5.0})\n", 360 | " If (feature 18 in {0.0,1.0,3.0,4.0})\n", 361 | " Predict: 0.0\n", 362 | " Else (feature 18 not in {0.0,1.0,3.0,4.0})\n", 363 | " Predict: 1.0\n", 364 | " Else (feature 20 not in {1.0,2.0,3.0,4.0,5.0})\n", 365 | " If (feature 3 in {1.0})\n", 366 | " Predict: 0.0\n", 367 | " Else (feature 3 not in {1.0})\n", 368 | " Predict: 1.0\n", 369 | " Tree 8 (weight 1.0):\n", 370 | " If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 371 | " Predict: 0.0\n", 372 | " Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 373 | " If (feature 17 in {1.0})\n", 374 | " Predict: 0.0\n", 375 | " Else (feature 17 not in {1.0})\n", 376 | " Predict: 1.0\n", 377 | " Tree 9 (weight 1.0):\n", 378 | " If (feature 7 in {0.0})\n", 379 | " If (feature 19 in {0.0,1.0,2.0,5.0,7.0,8.0})\n", 380 | " Predict: 0.0\n", 381 | " Else (feature 19 not in {0.0,1.0,2.0,5.0,7.0,8.0})\n", 382 | " Predict: 1.0\n", 383 | " Else (feature 7 not in {0.0})\n", 384 | " If (feature 1 in {2.0,3.0})\n", 385 | " Predict: 0.0\n", 386 | " Else (feature 1 not in {2.0,3.0})\n", 387 | " Predict: 1.0\n", 388 | " Tree 10 (weight 1.0):\n", 389 | " If (feature 4 in {0.0,4.0,5.0,8.0})\n", 390 | " Predict: 0.0\n", 391 | " Else (feature 4 not in {0.0,4.0,5.0,8.0})\n", 392 | " Predict: 1.0\n", 393 | " Tree 11 (weight 1.0):\n", 394 | " If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 395 | " Predict: 0.0\n", 396 | " Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 397 | " If (feature 6 in {1.0})\n", 398 | " Predict: 0.0\n", 399 | " Else (feature 6 not in {1.0})\n", 400 | " Predict: 1.0\n", 401 | " Tree 12 (weight 1.0):\n", 402 | " If (feature 12 in {0.0,2.0,3.0})\n", 403 | " If (feature 4 in {0.0,4.0,5.0,8.0})\n", 404 | " Predict: 0.0\n", 405 | " Else (feature 4 not in {0.0,4.0,5.0,8.0})\n", 406 | " Predict: 1.0\n", 407 | " Else (feature 12 not in {0.0,2.0,3.0})\n", 408 | " If (feature 18 in {0.0})\n", 409 | " Predict: 0.0\n", 410 | " Else (feature 18 not in {0.0})\n", 411 | " Predict: 1.0\n", 412 | " Tree 13 (weight 1.0):\n", 413 | " If (feature 4 in {0.0,4.0,5.0,8.0})\n", 414 | " If (feature 17 in {2.0})\n", 415 | " Predict: 1.0\n", 416 | " Else (feature 17 not in {2.0})\n", 417 | " Predict: 0.0\n", 418 | " Else (feature 4 not in {0.0,4.0,5.0,8.0})\n", 419 | " Predict: 1.0\n", 420 | " Tree 14 (weight 1.0):\n", 421 | " If (feature 18 in {0.0,3.0,4.0})\n", 422 | " If (feature 4 in {0.0,4.0,5.0,8.0})\n", 423 | " Predict: 0.0\n", 424 | " Else (feature 4 not in {0.0,4.0,5.0,8.0})\n", 425 | " Predict: 1.0\n", 426 | " Else (feature 18 not in {0.0,3.0,4.0})\n", 427 | " If (feature 10 in {2.0})\n", 428 | " Predict: 0.0\n", 429 | " Else (feature 10 not in {2.0})\n", 430 | " Predict: 1.0\n", 431 | " Tree 15 (weight 1.0):\n", 432 | " If (feature 19 in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 433 | " Predict: 0.0\n", 434 | " Else (feature 19 not in {1.0,2.0,4.0,5.0,6.0,7.0,8.0})\n", 435 | " If (feature 4 in {0.0,8.0})\n", 436 | " Predict: 0.0\n", 437 | " Else (feature 4 not in {0.0,8.0})\n", 438 | " Predict: 1.0\n", 439 | " Tree 16 (weight 1.0):\n", 440 | " If (feature 4 in {0.0,4.0,5.0,8.0})\n", 441 | " Predict: 0.0\n", 442 | " Else (feature 4 not in {0.0,4.0,5.0,8.0})\n", 443 | " Predict: 1.0\n", 444 | " Tree 17 (weight 1.0):\n", 445 | " If (feature 11 in {0.0,2.0,3.0})\n", 446 | " If (feature 8 in {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0})\n", 447 | " Predict: 0.0\n", 448 | " Else (feature 8 not in {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0})\n", 449 | " Predict: 1.0\n", 450 | " Else (feature 
11 not in {0.0,2.0,3.0})\n", 451 | " If (feature 4 in {0.0,8.0})\n", 452 | " Predict: 0.0\n", 453 | " Else (feature 4 not in {0.0,8.0})\n", 454 | " Predict: 1.0\n", 455 | " Tree 18 (weight 1.0):\n", 456 | " If (feature 8 in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n", 457 | " If (feature 7 in {0.0})\n", 458 | " Predict: 0.0\n", 459 | " Else (feature 7 not in {0.0})\n", 460 | " Predict: 1.0\n", 461 | " Else (feature 8 not in {2.0,3.0,6.0,7.0,8.0,9.0,10.0,11.0})\n", 462 | " If (feature 19 in {1.0,2.0,4.0,6.0})\n", 463 | " Predict: 0.0\n", 464 | " Else (feature 19 not in {1.0,2.0,4.0,6.0})\n", 465 | " Predict: 1.0\n", 466 | " Tree 19 (weight 1.0):\n", 467 | " If (feature 18 in {0.0,3.0,4.0})\n", 468 | " Predict: 0.0\n", 469 | " Else (feature 18 not in {0.0,3.0,4.0})\n", 470 | " Predict: 1.0\n", 471 | "\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "print(cv_model.bestModel.toDebugString)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "[Read for more? Go to chapter 11!](./Ch11_notebook.ipynb)" 484 | ] 485 | } 486 | ], 487 | "metadata": { 488 | "kernelspec": { 489 | "display_name": "mldbook", 490 | "language": "python", 491 | "name": "mldbook" 492 | }, 493 | "language_info": { 494 | "codemirror_mode": { 495 | "name": "ipython", 496 | "version": 3 497 | }, 498 | "file_extension": ".py", 499 | "mimetype": "text/x-python", 500 | "name": "python", 501 | "nbconvert_exporter": "python", 502 | "pygments_lexer": "ipython3", 503 | "version": "3.5.3" 504 | } 505 | }, 506 | "nbformat": 4, 507 | "nbformat_minor": 2 508 | } 509 | -------------------------------------------------------------------------------- /notebooks/Ch11_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Chapter 11. 
Large datasets in the cloud with Amazon Web Services and S3\n", 8 | "====\n", 9 | "### Mastering Large Datasets with Python by JT Wolohan " 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### Uploading to S3 with Boto" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import boto3 as aws\n", 26 | "import os.path\n", 27 | "from functools import partial\n", 28 | "from glob import iglob" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "def upload_file(fp, bucket):\n", 38 | " _, file_name = os.path.split(fp)\n", 39 | " s3 = aws.client(\"s3\",\n", 40 | " aws_access_key_id = \"YOURACCESSKEYID\",\n", 41 | " aws_secret_access_key = \"YOURSECRETACCESSKEY\"\n", 42 | " )\n", 43 | " response = s3.upload_file(fp, bucket, file_name)\n", 44 | " return file_name, response" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "fs = iglob(\"/path/to/data/files/*\")\n", 54 | "uploads = map(partial(upload_file, bucket=\"your-backet-name\"), fs)\n", 55 | "for file_name, _ in uploads :\n", 56 | " print(file_name)" 57 | ] 58 | } 59 | ], 60 | "metadata": { 61 | "kernelspec": { 62 | "display_name": "mldbook", 63 | "language": "python", 64 | "name": "mldbook" 65 | }, 66 | "language_info": { 67 | "codemirror_mode": { 68 | "name": "ipython", 69 | "version": 3 70 | }, 71 | "file_extension": ".py", 72 | "mimetype": "text/x-python", 73 | "name": "python", 74 | "nbconvert_exporter": "python", 75 | "pygments_lexer": "ipython3", 76 | "version": "3.5.3" 77 | } 78 | }, 79 | "nbformat": 4, 80 | "nbformat_minor": 2 81 | } 82 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.9.159 2 | botocore==1.12.159 3 | cachetools==3.1.1 4 | certifi==2019.3.9 5 | chardet==3.0.4 6 | dill==0.2.9 7 | docutils==0.14 8 | google-api-core==1.11.1 9 | google-auth==1.6.3 10 | google-cloud-core==1.0.2 11 | google-cloud-dataproc==0.4.0 12 | google-cloud-logging==1.11.0 13 | google-cloud-storage==1.16.1 14 | google-resumable-media==0.3.2 15 | googleapis-common-protos==1.6.0 16 | grpcio==1.21.1 17 | idna==2.8 18 | jmespath==0.9.4 19 | mrjob==0.6.9 20 | multiprocess==0.70.7 21 | numpy==1.16.4 22 | pathos==0.2.3 23 | pkg-resources==0.0.0 24 | pox==0.2.5 25 | ppft==1.6.4.9 26 | protobuf==3.8.0 27 | py4j==0.10.7 28 | pyasn1==0.4.5 29 | pyasn1-modules==0.2.5 30 | pyspark==2.4.3 31 | python-dateutil==2.8.0 32 | pytz==2019.1 33 | PyYAML==5.1.1 34 | requests==2.22.0 35 | rsa==4.0 36 | s3transfer==0.2.0 37 | six==1.12.0 38 | toolz==0.9.0 39 | urllib3==1.25.3 40 | networkx 41 | --------------------------------------------------------------------------------
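One possible extension of the Ch11 upload example above, sketched under assumptions rather than taken from the book: each file upload is independent, so the same map-based pattern can be pushed through a multiprocessing Pool. The bucket name, data path, and pool size below are placeholders, and credentials are assumed to come from your AWS environment rather than being hard-coded.

import os.path
from functools import partial
from glob import glob
from multiprocessing import Pool

import boto3 as aws


def upload_file(fp, bucket):
    # Same shape as the Ch11 helper: one S3 client per call keeps the
    # function safe to run inside separate worker processes.
    _, file_name = os.path.split(fp)
    s3 = aws.client("s3")  # assumes credentials are configured in your environment
    response = s3.upload_file(fp, bucket, file_name)
    return file_name, response


if __name__ == "__main__":
    fs = glob("/path/to/data/files/*")
    with Pool(4) as P:
        uploads = P.imap_unordered(partial(upload_file, bucket="your-bucket-name"), fs)
        for file_name, _ in uploads:
            print(file_name)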