├── 9781484248584.jpg ├── Contributing.md ├── LICENSE.txt ├── README.md ├── ch1 ├── .DS_Store └── money_growth │ ├── calculate_money_growth.py │ └── calculate_money_growth_fast.py ├── ch10 ├── .DS_Store ├── artificial_network.py ├── bipartite.py ├── edges.edgelist ├── load_graph.py ├── nodes.csv ├── signed_graph.py └── usage_matrix.py ├── ch11 ├── .DS_Store ├── artificial_network_sampling.py ├── count_non_repeating_digits.py ├── count_non_repeating_digits_naive.py ├── count_occurrences_digit.py ├── count_occurrences_digit_naive.py ├── dimensionality_investigation.py ├── parallel_processing.py └── perf_test_harness.py ├── ch12 ├── .DS_Store ├── AToughGame.py ├── simple_network1.py └── simple_network2.py ├── ch2 ├── .DS_Store └── segmentation │ ├── .DS_Store │ ├── driver.py │ ├── raw_data │ └── .DS_Store │ ├── results │ └── .DS_Store │ └── scripts │ ├── .DS_Store │ ├── nyt_data.py │ └── nyt_data_chunked.py ├── ch3 ├── .DS_Store ├── bug_fixing │ ├── .DS_Store │ ├── double_preceding1.py │ ├── double_preceding2.py │ ├── double_preceding3.py │ ├── double_preceding4.py │ └── test_double_preceding.py ├── cyclomatic_complexity │ ├── sort_new.py │ └── sort_original.py ├── fibonacci │ ├── fibonacci1.py │ ├── fibonacci2.py │ └── sequencer.py ├── optimization │ ├── elevator0.py │ ├── elevator1.py │ ├── elevator2.py │ └── elevator3.py └── puzzles │ ├── puzzle1.py │ ├── puzzle2.py │ └── puzzle2b.py ├── ch4 ├── .DS_Store ├── ball_descend │ ├── .DS_Store │ ├── Simulation.ipynb │ ├── Simulation_Refactored.ipynb │ └── pathfinder │ │ ├── .DS_Store │ │ ├── __init__.py │ │ └── pathfinder.py └── hanoi │ ├── .DS_Store │ ├── Solver1.ipynb │ ├── Solver2.ipynb │ └── Solver3.ipynb ├── ch5 ├── .DS_Store └── augmented_ball_descend │ ├── .DS_Store │ ├── Terrain_Simulation_v1.1.ipynb │ ├── Terrain_Simulation_v1.2.ipynb │ ├── interactionlib │ ├── .DS_Store │ ├── __init__.py │ └── interaction_monitor.py │ ├── pathfinder │ ├── .DS_Store │ ├── __init__.py │ ├── base_pathfinder.py │ ├── non_recursive_simple_pathfinder.py │ ├── parallel_simple_pathfinder.py │ ├── pathutils.py │ └── simple_pathfinder.py │ ├── terrain_data │ ├── .DS_Store │ ├── coastline.jpg │ └── coastline_with_path.jpg │ └── testutils │ ├── .DS_Store │ ├── __init__.py │ └── create_terrain.py ├── ch6 ├── .DS_Store ├── Sample_Temperature_Plots.ipynb ├── anscombe │ ├── anscombe_altair.py │ └── anscombe_matplotlib.py ├── closest_pair │ ├── .DS_Store │ ├── __init__.py │ ├── base_closest_pair.py │ ├── fast_closest_pair.py │ └── naive_closest_pair.py └── temp_plots │ ├── .DS_Store │ ├── GHCND_sample_csv.csv │ ├── plot_stations.py │ ├── plot_temps.py │ └── temp_visualization_demo.py ├── ch7 ├── .DS_Store ├── core_concepts │ ├── .DS_Store │ ├── data_generator.py │ ├── observer.py │ └── session.py └── stock_market │ ├── .DS_Store │ ├── daily_AAPL.csv │ ├── data_preprocessing.py │ ├── data_visualization.py │ ├── driver.py │ ├── feature_engineering.py │ └── streaming_regression.py ├── ch8 ├── .DS_Store ├── lkpy_demo.py └── simple_recommender │ ├── .DS_Store │ ├── omdb_service.py │ ├── simple_movie_recommender.py │ └── tastedive_service.py └── errata.md /9781484248584.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/9781484248584.jpg -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # 
Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2019 Ervin Varga 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Practical Data Science with Python 3*](https://www.apress.com/9781484248584) by Ervin Varga (Apress, 2019). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484248584.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. 
-------------------------------------------------------------------------------- /ch1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch1/.DS_Store -------------------------------------------------------------------------------- /ch1/money_growth/calculate_money_growth.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | 4 | def calculate_money_growth(p0, r, t): 5 | # List of final amounts. 6 | p = [] 7 | for i in range(len(p0)): 8 | p.append(p0[i] * math.exp(r * t[i])) 9 | return p -------------------------------------------------------------------------------- /ch1/money_growth/calculate_money_growth_fast.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | def calculate_money_growth(p0, r, t): 5 | assert p0.size == t.size 6 | 7 | return p0 * np.exp(r * t) -------------------------------------------------------------------------------- /ch10/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch10/.DS_Store -------------------------------------------------------------------------------- /ch10/artificial_network.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import pandas as pd 4 | import networkx as nx 5 | import matplotlib.pyplot as plt 6 | 7 | G = nx.karate_club_graph() 8 | 9 | node_colors = ['orange' if props['club'] == 'Officer' else 'blue' 10 | for _, props in G.nodes(data=True)] 11 | node_sizes = [180 * G.degree(u) for u in G] 12 | 13 | plt.figure(figsize=(10, 10)) 14 | pos = nx.kamada_kawai_layout(G) 15 | nx.draw_networkx(G, pos, 16 | node_size=node_sizes, 17 | node_color=node_colors, alpha=0.8, 18 | with_labels=False, 19 | edge_color='.6') 20 | 21 | main_conns = nx.edge_betweenness_centrality(G, normalized=True) 22 | main_conns = sorted(main_conns.items(), key=operator.itemgetter(1), reverse=True)[:5] 23 | main_conns = tuple(map(operator.itemgetter(0), main_conns)) 24 | nx.draw_networkx_edges(G, pos, edgelist=main_conns, edge_color='green', alpha=0.5, width=6) 25 | nx.draw_networkx_labels(G, pos, 26 | labels={0: G.node[0]['club'], 33: G.node[33]['club']}, 27 | font_size=15, font_color='white') 28 | 29 | candidate_edges = ((8, 15), (30, 21), (29, 28), (1, 6)) 30 | nx.draw_networkx_edges(G, pos, edgelist=candidate_edges, 31 | edge_color='blue', alpha=0.5, width=2, style='dashed') 32 | nx.draw_networkx_labels(G, pos, 33 | labels={u: u for t in candidate_edges for u in t}, 34 | font_size=13, font_weight='bold', font_color='yellow') 35 | 36 | plt.axis('off') 37 | plt.tight_layout(); 38 | plt.show() 39 | 40 | # Create a data frame to store various centrality measures. 41 | df = pd.DataFrame(index=candidate_edges) 42 | 43 | # Add generic and community aware edge features for potential machine learning classification. 
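# A quick reference (summary added for orientation, not from the original
# text): each link prediction call below returns (u, v, score) triples, hence
# the operator.itemgetter(2). 'pref-att' multiplies the endpoint degrees,
# 'jaccard-c' is |common neighbors| / |union of neighborhoods|, 'aa-idx' sums
# 1/log(degree) over common neighbors, and 'ccn'/'cra' are the
# Soundarajan-Hopcroft community-aware variants driven by the 'club' label.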
44 | df['pref-att'] = list(map(operator.itemgetter(2), 45 | nx.preferential_attachment(G, candidate_edges))) 46 | df['jaccard-c'] = list(map(operator.itemgetter(2), 47 | nx.jaccard_coefficient(G, candidate_edges))) 48 | df['aa-idx'] = list(map(operator.itemgetter(2), 49 | nx.adamic_adar_index(G, candidate_edges))) 50 | df['ccn'] = list(map(operator.itemgetter(2), 51 | nx.cn_soundarajan_hopcroft(G, candidate_edges, 'club'))) 52 | df['cra'] = list(map(operator.itemgetter(2), 53 | nx.ra_index_soundarajan_hopcroft(G, candidate_edges, 'club'))) 54 | 55 | print(df) -------------------------------------------------------------------------------- /ch10/bipartite.py: -------------------------------------------------------------------------------- 1 | # Call usage_matrix.py before executing this script! 2 | from networkx.algorithms import bipartite 3 | 4 | # Add two new edges as described in the book. 5 | G.add_edge(0, 10, relation='interact') 6 | G.add_edge(10, 0, relation='interact') 7 | 8 | # Select all nodes and edges from G that participate in 'interact' relation and 9 | # create an undirected graph from them. 10 | H = nx.Graph() 11 | H.add_edges_from((u, v) for u, v, r in G.edges(data='relation') if r == 'interact') 12 | 13 | # Attach a marker to specify which nodes belong to what group. 14 | for node_id in H.nodes(): 15 | H.node[node_id]['bipartite'] = G.node[node_id]['role'] == 'actor' 16 | 17 | nx.relabel_nodes(H, {n: G.node[n]['label'].replace(' ', '\n') for n in H.nodes()}, copy=False) 18 | 19 | print("Validating that H is bipartite: ", bipartite.is_bipartite(H)) 20 | 21 | # This is a graph projection operation. Here, we seek to find out what use cases 22 | # have common actors. The weights represent the commonality factor. 23 | W = bipartite.weighted_projected_graph(H, [n for n, r in H.nodes(data='bipartite') if r == 0]) 24 | 25 | # Draw the graph using matplotlib under the hood. 26 | pos = nx.shell_layout(W) 27 | nx.draw(W, pos=pos, with_labels=True, node_size=800, font_size=12) 28 | nx.draw_networkx_edge_labels(W, pos=pos, 29 | edge_labels={(u, v): d['weight'] 30 | for u, v, d in W.edges(data=True)}) 31 | -------------------------------------------------------------------------------- /ch10/edges.edgelist: -------------------------------------------------------------------------------- 1 | # Add edges for the 'impact' relationship. 2 | 0 6 600 impact 3 | 0 7 600 impact 4 | 1 6 15 impact 5 | 2 7 100 impact 6 | 2 8 900 impact 7 | 3 8 800 impact 8 | 4 8 960 impact 9 | 10 | # Add edges for the 'include' relationship. A weight of 1 is assigned as a placeholder. 11 | 0 5 1 include 12 | 1 5 1 include 13 | 2 5 1 include 14 | 15 | # Add edges for the 'extend' relationship. 16 | 3 2 1 extend 17 | 4 2 1 extend 18 | 19 | # Add edges for the 'interact' relationship. 
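# Row format: <source-id> <target-id> <weight> <relation>, matching the
# data=(('weight', int), ('relation', str)) schema used by load_graph.py.
# Every interaction appears twice, once per direction, because the file is
# loaded into a directed multigraph (nx.MultiDiGraph).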
20 | 9 0 1 interact 21 | 0 9 1 interact 22 | 10 1 1 interact 23 | 1 10 1 interact 24 | 10 2 1 interact 25 | 2 10 1 interact 26 | 11 3 1 interact 27 | 3 11 1 interact 28 | 11 4 1 interact 29 | 4 11 1 interact 30 | 0 10 1 interact 31 | 10 0 1 interact 32 | 33 | -------------------------------------------------------------------------------- /ch10/load_graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import pandas as pd 3 | 4 | G = nx.read_edgelist('edges.edgelist', 5 | create_using=nx.MultiDiGraph, 6 | nodetype=int, 7 | data=(('weight', int), ('relation', str))) 8 | 9 | df = pd.read_csv('nodes.csv', index_col=0) 10 | for row in df.itertuples(): 11 | G.node[row.Index]['role'] = row.Role 12 | G.node[row.Index]['label'] = row.Label 13 | 14 | # Make a small report. 15 | print("Nodes: \n", G.nodes(data=True), sep='') 16 | print("-" * 20, "\nEdges: \n", G.edges(data=True), sep='') 17 | -------------------------------------------------------------------------------- /ch10/nodes.csv: -------------------------------------------------------------------------------- 1 | Id,Role,Label 2 | 0,use-case,Communicate 3 | 1,use-case,Manage Dev. 4 | 2,use-case,Exec. Data Analytics 5 | 3,use-case,Use Bus. Int. 6 | 4,use-case,Use Op. Int. 7 | 5,use-case,Send/Receive Data 8 | 6,resource,Network 9 | 7,resource,Messaging 10 | 8,resource,Database 11 | 9,actor,Device 12 | 10,actor,Application 13 | 11,actor,User 14 | -------------------------------------------------------------------------------- /ch10/signed_graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | G = nx.Graph() 4 | 5 | G.add_node(0, role='quality-attribute', label='Maintainability') 6 | G.add_node(1, role='quality-attribute', label='Reliability') 7 | G.add_node(2, role='quality-attribute', label='Performance') 8 | 9 | G.add_edge(0, 1, sign='+') 10 | G.add_edge(0, 2, sign='-') 11 | G.add_edge(1, 2, sign='-') 12 | -------------------------------------------------------------------------------- /ch10/usage_matrix.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | G = nx.MultiDiGraph() 4 | 5 | # Add all nodes with their role and label. You can immediately work with labels, but having 6 | # short node identifiers keeps your code uncluttered. 7 | G.add_node(0, role='use-case', label='Communicate') 8 | G.add_node(1, role='use-case', label='Manage Dev.') 9 | G.add_node(2, role='use-case', label='Exec. Data Analytics') 10 | G.add_node(3, role='use-case', label='Use Bus. Int.') 11 | G.add_node(4, role='use-case', label='Use Op. Int.') 12 | G.add_node(5, role='use-case', label='Send/Receive Data') 13 | G.add_node(6, role='resource', label='Network') 14 | G.add_node(7, role='resource', label='Messaging') 15 | G.add_node(8, role='resource', label='Database') 16 | G.add_node(9, role='actor', label='Device') 17 | G.add_node(10, role='actor', label='Application') 18 | G.add_node(11, role='actor', label='User') 19 | 20 | # Add edges for the 'impact' relationship. 
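# Here the weight can be read as the relative load a use case places on a
# resource (an interpretation added for orientation; the precise meaning is
# given in the book's text). The 'include'/'extend' edges below carry no weight.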
21 | G.add_edge(0, 6, weight=600, relation='impact') 22 | G.add_edge(0, 7, weight=600, relation='impact') 23 | G.add_edge(1, 6, weight=15, relation='impact') 24 | G.add_edge(2, 7, weight=100, relation='impact') 25 | G.add_edge(2, 8, weight=900, relation='impact') 26 | G.add_edge(3, 8, weight=800, relation='impact') 27 | G.add_edge(4, 8, weight=960, relation='impact') 28 | 29 | # Add edges for the 'include' relationship. 30 | G.add_edge(0, 5, relation='include') 31 | G.add_edge(1, 5, relation='include') 32 | G.add_edge(2, 5, relation='include') 33 | 34 | # Add edges for the 'extend' relationship. 35 | G.add_edge(3, 2, relation='extend') 36 | G.add_edge(4, 2, relation='extend') 37 | 38 | # Add edges for the 'interact' relationship. 39 | G.add_edge(9, 0, relation='interact') 40 | G.add_edge(0, 9, relation='interact') 41 | G.add_edge(10, 1, relation='interact') 42 | G.add_edge(1, 10, relation='interact') 43 | G.add_edge(10, 2, relation='interact') 44 | G.add_edge(2, 10, relation='interact') 45 | G.add_edge(11, 3, relation='interact') 46 | G.add_edge(3, 11, relation='interact') 47 | G.add_edge(11, 4, relation='interact') 48 | G.add_edge(4, 11, relation='interact') 49 | 50 | # Visualize the resulting graph using pydot and Graphviz. 51 | from networkx.drawing.nx_pydot import write_dot 52 | 53 | # By default NetworkX returns a deep copy of the source graph. 54 | H = G.copy() 55 | 56 | # Set some display properties for specific nodes and extract labels. 57 | node_labels = {} 58 | for node_id in H.nodes(): 59 | node_labels[node_id] = H.node[node_id]['label'] 60 | role = H.node[node_id]['role'] 61 | if role == 'resource': 62 | H.node[node_id]['style'] = 'filled' 63 | H.node[node_id]['fillcolor'] = 'cyan' 64 | H.node[node_id]['shape'] = 'component' 65 | H.node[node_id]['fixedsize'] = 'shape' 66 | elif role == 'use-case': 67 | H.node[node_id]['shape'] = 'oval' 68 | elif role == 'actor': 69 | H.node[node_id]['style'] = 'rounded' 70 | H.node[node_id]['shape'] = 'box' 71 | H.node[5]['style'] = 'dashed' 72 | 73 | nx.relabel_nodes(H, node_labels, copy=False) 74 | pos = nx.nx_pydot.graphviz_layout(H) 75 | nx.draw(H, pos=pos, with_labels=True, font_weight='bold') 76 | write_dot(H, 'usage_matrix.dot') -------------------------------------------------------------------------------- /ch11/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch11/.DS_Store -------------------------------------------------------------------------------- /ch11/artificial_network_sampling.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import networkx as nx 4 | import matplotlib.pyplot as plt 5 | 6 | G = nx.karate_club_graph() 7 | 8 | node_colors = ['orange' if props['club'] == 'Officer' else 'blue' 9 | for _, props in G.nodes(data=True)] 10 | node_sizes = [180 * G.degree(u) for u in G] 11 | 12 | plt.figure(figsize=(10, 10)) 13 | pos = nx.kamada_kawai_layout(G) 14 | nx.draw_networkx(G, pos, 15 | node_size=node_sizes, 16 | node_color=node_colors, alpha=0.8, 17 | with_labels=False, 18 | edge_color='.6') 19 | 20 | # Calculating the absolute edge betweenness centrality. 
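# "Absolute" means the exact value here: with no k argument, NetworkX uses
# every node as a source, whereas the estimate below samples only 40% of the
# nodes.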
21 | main_conns = nx.edge_betweenness_centrality(G, normalized=True) 22 | main_conns = sorted(main_conns.items(), key=operator.itemgetter(1), reverse=True)[:5] 23 | main_conns = tuple(map(operator.itemgetter(0), main_conns)) 24 | nx.draw_networkx_edges(G, pos, edgelist=main_conns, edge_color='green', alpha=0.5, width=6) 25 | 26 | # Estimating the edge betweenness centrality by sampling 40% of nodes. 27 | NUM_SAMPLES = int(0.4 * len(G)) 28 | 29 | est_main_conns = nx.edge_betweenness_centrality(G, k=NUM_SAMPLES, normalized=True, seed=10) 30 | est_main_conns = sorted(est_main_conns.items(), key=operator.itemgetter(1), reverse=True)[:5] 31 | est_main_conns = tuple(map(operator.itemgetter(0), est_main_conns)) 32 | nx.draw_networkx_edges(G, pos, edgelist=est_main_conns, 33 | edge_color='red', alpha=0.9, width=6, style='dashed') 34 | 35 | nx.draw_networkx_labels(G, pos, 36 | labels={0: G.node[0]['club'], 33: G.node[33]['club']}, 37 | font_size=15, font_color='white') 38 | 39 | candidate_edges = ((8, 15), (30, 21), (29, 28), (1, 6)) 40 | nx.draw_networkx_edges(G, pos, edgelist=candidate_edges, 41 | edge_color='blue', alpha=0.5, width=2, style='dashed') 42 | nx.draw_networkx_labels(G, pos, 43 | labels={u: u for t in candidate_edges for u in t}, 44 | font_size=13, font_weight='bold', font_color='yellow') 45 | 46 | plt.axis('off') 47 | plt.tight_layout(); 48 | plt.show() 49 | -------------------------------------------------------------------------------- /ch11/count_non_repeating_digits.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | a, b = tuple(map(int, input().split())) 4 | 5 | def variation_without_repetition(n, k): 6 | return math.factorial(n) // math.factorial(n - k) 7 | 8 | # Finds how many numbers with non-repeating digits are present in [0, k]. 9 | def count_numbers_with_non_repeating_digits(k): 10 | if k < 0: 11 | return 0 12 | if k == 0: 13 | return 1 14 | 15 | # We can find most numbers using combinatorics. 16 | digits = str(k) 17 | num_digits = len(digits) 18 | first_digit = int(digits[0]) 19 | span = 10 ** (num_digits - 1) 20 | 21 | s = (first_digit - 1) * variation_without_repetition(9, num_digits - 1) 22 | 23 | # We must take care of a lower interval regarding leading zeros. 24 | s += count_numbers_with_non_repeating_digits(span - 1) 25 | 26 | # We continue our search for the upper part. 
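# Walk the digits of k from left to right, counting numbers that share an
# ever-longer prefix with k: at position i, each unused digit strictly below
# digits[i] contributes variation_without_repetition(9 - i, num_digits - 1 - i)
# completions, while a match on digits[i] just extends the prefix.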
27 | used_digits = {first_digit} 28 | t = num_digits == 1 29 | 30 | for i in range(1, num_digits): 31 | first_digit = int(digits[i]) 32 | allowed_digits = set(range(first_digit + 1)) - used_digits 33 | v = variation_without_repetition(9 - i, num_digits - 1 - i) 34 | used_digits.add(first_digit) 35 | 36 | if first_digit not in allowed_digits: 37 | if len(allowed_digits) == 0 and i == 1: 38 | t = 0 39 | else: 40 | t += len(allowed_digits) * v 41 | break 42 | else: 43 | t += (len(allowed_digits) - (i != num_digits - 1)) * v 44 | return s + t 45 | 46 | print(count_numbers_with_non_repeating_digits(b) - \ 47 | count_numbers_with_non_repeating_digits(a - 1)) -------------------------------------------------------------------------------- /ch11/count_non_repeating_digits_naive.py: -------------------------------------------------------------------------------- 1 | a, b = tuple(map(int, input().split())) 2 | 3 | count = 0 4 | for i in range(a, b + 1): 5 | s = str(i) 6 | if len(set(s)) == len(s): 7 | count += 1 8 | 9 | print(count) -------------------------------------------------------------------------------- /ch11/count_occurrences_digit.py: -------------------------------------------------------------------------------- 1 | def setup(): 2 | MAX_EXPONENT = 50 3 | 4 | # Holds the number of occurrences of digit k in [0, (10**i) - 1], i > 0. 5 | # If the range is partial (first part of the composite key is False), then 6 | # leading zeros are omitted (this is a special case when k == 0). 7 | table_of_occurrences = {(False, 0): 0, (False, 1): 1, 8 | (True, 0): 0, (True, 1): 1} 9 | for i in range(2, MAX_EXPONENT + 1): 10 | table_of_occurrences[(True, i)] = i * 10**(i - 1) 11 | table_of_occurrences[(False, i)] = \ 12 | 10**(i - 1) + 10 * table_of_occurrences[(False, i - 1)] - 10 13 | return table_of_occurrences 14 | 15 | def count_occurrences_digit(k, n, table_of_occurrences=setup()): 16 | digits = str(n) 17 | num_digits = len(digits) 18 | count = 0 19 | is_first_digit = num_digits > 1 20 | 21 | for digit in map(int, digits): 22 | span = 10**(num_digits - 1) 23 | 24 | count += (digit - 1) * table_of_occurrences[(True, num_digits - 1)] 25 | count += table_of_occurrences[(k != 0 or not is_first_digit, num_digits - 1)] 26 | 27 | if digit > k: 28 | if k > 0 or not is_first_digit: 29 | count += span 30 | elif digit == k: 31 | count += (n % span) + 1 32 | 33 | num_digits -= 1 34 | is_first_digit = False 35 | return count 36 | 37 | if __name__ == '__main__': 38 | k, n = tuple(map(int, input().split())) 39 | print(count_occurrences_digit(k, n)) 40 | -------------------------------------------------------------------------------- /ch11/count_occurrences_digit_naive.py: -------------------------------------------------------------------------------- 1 | def count_occurrences_digit_naive(k, n): 2 | k = str(k) 3 | count = 0 4 | for i in range(n + 1): 5 | count += str(i).count(k) 6 | return count 7 | 8 | if __name__ == '__main__': 9 | k, n = tuple(map(int, input().split())) 10 | print(count_occurrences_digit_naive(k, n)) 11 | -------------------------------------------------------------------------------- /ch11/dimensionality_investigation.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import fetch_kddcup99 2 | from sklearn.manifold import TSNE 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | def retrieve_column_desc(): 8 | import requests 9 | 10 | r = 
requests.get('https://kdd.ics.uci.edu/databases/kddcup99/kddcup.names') 11 | 12 | column_desc = {} 13 | for row in r.text.split('\n')[1:]: 14 | if row.find(':') > 0: 15 | col_name, col_type = row[:-1].split(':') 16 | column_desc[col_name] = col_type.strip() 17 | return column_desc 18 | 19 | def get_numeric_columns(column_desc): 20 | return [name for name in column_desc if column_desc[name] == 'continuous'] 21 | 22 | column_desc = retrieve_column_desc() 23 | numeric_columns = get_numeric_columns(column_desc) 24 | print('Number of numeric columns:', len(numeric_columns)) 25 | 26 | X, _ = fetch_kddcup99(subset='SA', random_state=10, return_X_y=True) 27 | X = pd.DataFrame(X, columns=column_desc.keys()) 28 | X[numeric_columns] = X[numeric_columns].apply(pd.to_numeric) 29 | 30 | # We need to work on a small sample to get results in any reasonable time frame. 31 | X = X.sample(frac=0.05, random_state=10) 32 | 33 | m = TSNE(learning_rate=150, random_state=10) 34 | X_tsne = m.fit_transform(X[numeric_columns]) 35 | print('First 10 rows of the TSNE reduced dataset:') 36 | print(X_tsne[:10, :]) 37 | 38 | X['t-sne_1'] = X_tsne[:, 0] 39 | X['t-sne_2'] = X_tsne[:, 1] 40 | 41 | sns.set(rc={'figure.figsize': (10, 10)}) 42 | sns.scatterplot(x='t-sne_1', y='t-sne_2', 43 | hue='protocol_type', 44 | style='protocol_type', 45 | data=X[numeric_columns + ['protocol_type', 't-sne_1', 't-sne_2']]) 46 | plt.show() -------------------------------------------------------------------------------- /ch11/parallel_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import dask.array as da 3 | 4 | def num_divisible(a, b, c): 5 | r = a % c 6 | if r == 0: 7 | start = a 8 | else: 9 | start = a + (c - r) 10 | 11 | if start > b: 12 | return 0 13 | else: 14 | return 1 + (b - start) // c 15 | 16 | num_divisible_vect = np.vectorize(num_divisible) 17 | x = da.asanyarray([(1, 100, 10), (16789, 445267839, 7), (34, 10**18, 3000), (3, 7, 9)]) 18 | x = x.rechunk(chunks=(2, -1)) 19 | y = x.map_blocks(lambda block: num_divisible_vect(*block.T), 20 | chunks=(-1,), 21 | drop_axis=1, 22 | dtype='i8') 23 | print(y.compute()) -------------------------------------------------------------------------------- /ch11/perf_test_harness.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | def measure(f, num_repetitions=5): 7 | measurements = np.array([]) 8 | for _ in range(num_repetitions): 9 | start = time.clock() 10 | f() 11 | measurements = np.append(measurements, time.clock() - start) 12 | return measurements.mean() 13 | 14 | def execute(config): 15 | execution_times = {} 16 | 17 | for config_name in config['functions']: 18 | execution_times[config_name] = np.array([]) 19 | 20 | for x in config['span']: 21 | for config_name in config['functions']: 22 | execution_times[config_name] = np.append( 23 | execution_times[config_name], 24 | measure(lambda: config['functions'][config_name](x))) 25 | return execution_times 26 | 27 | def attach_model(execution_times, config, function_name, model_name): 28 | model_vals = np.vectorize(config['models'][model_name])(config['span']) 29 | c = np.mean(execution_times[function_name] / model_vals) 30 | execution_times[model_name] = c * model_vals 31 | 32 | def report(execution_times, x_vals, **plot_kwargs): 33 | df = pd.DataFrame(execution_times) 34 | df.index = x_vals 35 | ax = df.plot.line( 36 | figsize=(10, 8), 37 | title='Performance Test 
Report', 38 | grid=True, 39 | **plot_kwargs 40 | ) 41 | ax.set_xlabel('Span') 42 | ax.set_ylabel('Time [s]') 43 | return df 44 | 45 | if __name__ == '__main__': 46 | import math 47 | 48 | from count_occurrences_digit_naive import count_occurrences_digit_naive 49 | import count_occurrences_digit as cog 50 | 51 | table_of_occurrences = cog.setup() 52 | config = { 53 | 'functions': { 54 | 'naive(k=0)': lambda n: count_occurrences_digit_naive(0, n), 55 | 'fast(k=0)': lambda n: cog.count_occurrences_digit(0, n, table_of_occurrences) 56 | }, 57 | 'models': { 58 | 'O(n)': lambda n: n, 59 | 'O(log n)': lambda n: math.log(n) 60 | }, 61 | 'span': np.geomspace(10**2, 10**7, num=14, dtype=int) 62 | } 63 | execution_times = execute(config) 64 | attach_model(execution_times, config, 'naive(k=0)', 'O(n)') 65 | attach_model(execution_times, config, 'fast(k=0)', 'O(log n)') 66 | print(report(execution_times, config['span'], logx=True, style=['-ro', '-gs', ':r^', ':gv'])) -------------------------------------------------------------------------------- /ch12/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch12/.DS_Store -------------------------------------------------------------------------------- /ch12/AToughGame.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Solution for the AToughGame Topcoder problem.""" 5 | class AToughGame: 6 | def expectedGain(self, prob, value): 7 | """ 8 | Examples: 9 | >>> EPS = 10 ** -6 10 | >>> game = AToughGame() 11 | >>> abs(game.expectedGain((1000,500), (3,4)) - 10.0) < EPS 12 | True 13 | >>> abs(game.expectedGain((1000,1), (3,4)) - 3003.9999999999977) < EPS 14 | True 15 | >>> abs(game.expectedGain((500,500,500,500,500), (1,2,3,4,5)) - 16.626830517153095) < EPS 16 | True 17 | >>> abs(game.expectedGain((250,750), (1000,1)) - 1067.6666666666667) < EPS 18 | True 19 | >>> abs(game.expectedGain((916,932,927,988,958,996,944,968,917,939,960,965,960,998,920,990,915,972,995,916,902, 968,970,962,922,959,994,915,996,996,994,986,945,947,912,946,972,951,973,965,921,910, 938,975,942,950,900,983,960,998,982,980,902,974,952,938,900,962,920,931,964,974,953, 995,946,946,903,921,923,985,919,996,930,915,991,967,996,911,999,936,1000,962,970,929, 966,960,930,920,958,926,983), (583,428,396,17,163,815,31,536,175,165,532,781,29,963,331,987,599,497,380,180,780,25, 931,607,784,613,468,140,488,604,401,912,204,785,697,173,451,849,714,914,650,652,338, 336,177,147,22,652,901,548,370,9,118,487,779,567,818,440,10,868,316,666,690,714,623, 269,501,649,324,773,173,54,391,745,504,578,81,627,319,301,16,899,658,586,604,83,520, 81,181,943,157)) - 54204.93356505282) < EPS 20 | True 21 | """ 22 | 23 | # Combines two levels into a single aggregate. This implements the safe move 24 | # of this greedy algorithm. 
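# A reading of the algebra below (added for orientation, not from the
# original text): levels (p0, v0) and (p1, v1) collapse into one level that
# is passed with probability p0 * p1 and whose value is the expected prize
# collected across both levels, replays after falling back included. For
# instance, (1.0, 3) and (0.5, 4) merge into (0.5, 10.0), matching the
# first doctest.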
25 | def combine(level0, level1): 26 | p0, v0, p1, v1 = level0[0], level0[1], level1[0], level1[1] 27 | q0, q1 = 1 - p0, 1 - p1 28 | return p0 * p1, v1 + v0 * p1 * (p0 + q0 / p1) * (1 - p0 * q1) ** -2 29 | 30 | from functools import reduce 31 | return reduce(combine, zip(map(lambda p: p / 1000, prob), value))[1] 32 | 33 | if __name__ == '__main__': 34 | import doctest 35 | doctest.testmod() 36 | -------------------------------------------------------------------------------- /ch12/simple_network1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class NeuralNetwork: 5 | def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate): 6 | self.input_nodes = input_nodes 7 | self.hidden_nodes = hidden_nodes 8 | self.output_nodes = output_nodes 9 | 10 | # Initialize weights to small random values using Normal distribution. 11 | self.weights_input_to_hidden = np.random.normal( 12 | scale = self.input_nodes ** -0.5, 13 | size = (self.input_nodes, self.hidden_nodes)) 14 | self.weights_hidden_to_output = np.random.normal( 15 | scale = self.hidden_nodes ** -0.5, 16 | size = (self.hidden_nodes, self.output_nodes)) 17 | 18 | self.lr = learning_rate 19 | self.activation_function = lambda x : 1 / (1 + np.exp(-x)) # sigmoid 20 | 21 | def train(self, features, targets): 22 | delta_weights_i_h = np.zeros(self.weights_input_to_hidden.shape) 23 | delta_weights_h_o = np.zeros(self.weights_hidden_to_output.shape) 24 | 25 | for X, y in zip(features, targets): 26 | y_hat, hidden_outputs = self.__forward(X) 27 | delta_weights_i_h, delta_weights_h_o = self.__backward( 28 | y_hat, hidden_outputs, 29 | X, y, 30 | delta_weights_i_h, delta_weights_h_o) 31 | self.__update_weights(delta_weights_i_h, delta_weights_h_o) 32 | 33 | def run(self, X): 34 | return self.__forward(X)[0] 35 | 36 | def __forward(self, X): 37 | hidden_inputs = np.dot(X, self.weights_input_to_hidden) 38 | hidden_outputs = self.activation_function(hidden_inputs) 39 | final_inputs = np.dot(hidden_outputs, self.weights_hidden_to_output) 40 | y_hat = final_inputs 41 | return y_hat, hidden_outputs 42 | 43 | def __backward(self, y_hat, hidden_outputs, X, y, delta_weights_i_h, delta_weights_h_o): 44 | error = y - y_hat 45 | hidden_error = np.dot(self.weights_hidden_to_output, error) 46 | output_error_term = error 47 | hidden_error_term = hidden_error * hidden_outputs * (1 - hidden_outputs) 48 | delta_weights_i_h += np.dot( 49 | X[:, np.newaxis], hidden_error_term[np.newaxis, :]) 50 | delta_weights_h_o += np.dot( 51 | hidden_outputs[:, np.newaxis], output_error_term[np.newaxis, :]) 52 | return delta_weights_i_h, delta_weights_h_o 53 | 54 | def __update_weights(self, delta_weights_i_h, delta_weights_h_o): 55 | self.weights_hidden_to_output += self.lr * delta_weights_h_o 56 | self.weights_input_to_hidden += self.lr * delta_weights_i_h 57 | 58 | 59 | ######################################################### 60 | # Set your hyperparameters here 61 | ########################################################## 62 | iterations = 1000 63 | learning_rate = 0.005 64 | hidden_nodes = 20 65 | output_nodes = 1 66 | -------------------------------------------------------------------------------- /ch12/simple_network2.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | 6 | 7 | class NeuralNetwork: 8 | def __init__(self, input_nodes, hidden_nodes, output_nodes, 
learning_rate): 9 | self.model = nn.Sequential(OrderedDict([ 10 | ('fc', nn.Linear(input_nodes, hidden_nodes)), 11 | ('sigmoid', nn.Sigmoid()), 12 | ('output', nn.Linear(hidden_nodes, output_nodes))])) 13 | 14 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 15 | self.model.to(self.device) 16 | 17 | self.criterion = nn.MSELoss() 18 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr = learning_rate) 19 | 20 | def train(self, features, targets): 21 | features, targets = features.to(self.device), targets.to(self.device) 22 | 23 | self.model.train() 24 | self.optimizer.zero_grad() 25 | output = self.model(features) 26 | loss = self.criterion(output, targets) 27 | loss.backward() 28 | self.optimizer.step() 29 | 30 | def run(self, x): 31 | self.model.eval() 32 | with torch.no_grad(): 33 | return self.model(torch.tensor(x.values, dtype = torch.float) \ 34 | .to(self.device)) \ 35 | .cpu() \ 36 | .numpy() 37 | 38 | 39 | ######################################################### 40 | # Set your hyperparameters here 41 | ########################################################## 42 | iterations = 200 43 | learning_rate = 0.05 44 | hidden_nodes = 20 45 | output_nodes = 1 46 | -------------------------------------------------------------------------------- /ch2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/segmentation/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/driver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Main driver code for calling other routines. 5 | 6 | @author: Ervin Varga 7 | """ 8 | import sys 9 | import os 10 | sys.path.append(os.path.abspath('scripts')) 11 | 12 | from nyt_data import retrieve 13 | 14 | repoUrl = 'https://github.com/oreillymedia/doing_data_science/' 15 | fileUrl = 'raw/master/dds_datasets.zip' 16 | 17 | retrieve(repoUrl + fileUrl, 'raw_data') 18 | print('Raw data files are successfully retrieved.') 19 | 20 | #import numpy as np 21 | #import pandas as pd 22 | from nyt_data_chunked import traverse 23 | 24 | """ 25 | summary_data = dict() 26 | summary_data.setdefault('CTR', np.empty(31)) 27 | summary_data.setdefault('Clicks', np.empty(31)) 28 | 29 | def select_stats_unregistered(df, file_num): 30 | summary_data['CTR'][file_num] = df['CTR']['mean'][('Unknown', '0')] 31 | summary_data['Clicks'][file_num] = df['Clicks']['sum'][('Unknown', '0')] 32 | 33 | traverse('raw_data', select_stats_unregistered) 34 | print('Raw data files are successfully processed.') 35 | 36 | # Make some plots of CTR and Total Clicks over time. 
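# (Note: this plotting variant sits inside the surrounding triple-quoted
# string, i.e. it is commented out. To run it instead of the Parquet export
# below, remove the quotes and restore the commented-out numpy/pandas
# imports at the top of the file.)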
37 | df = pd.DataFrame.from_dict(summary_data) 38 | 39 | import matplotlib.pyplot as plt 40 | 41 | fig, axes = plt.subplots(nrows=2, ncols=1) 42 | df['CTR'].plot( 43 | title='Click Through Rate Over 1 Month', 44 | ax=axes[0], 45 | figsize=(8, 9), 46 | xticks=[] 47 | ); 48 | df['Clicks'].plot( 49 | xticks=range(0, 31, 2), 50 | title='Total Clicks Over 1 Month', 51 | ax=axes[1], 52 | figsize=(8, 9) 53 | ); 54 | """ 55 | 56 | def save_stats(df, file_num): 57 | targetFile = 'nyt_summary_' + str(file_num + 1) + '.parquet' 58 | df.columns = ['_'.join(column).rstrip('_') for column in df.columns.values] 59 | df.to_parquet('results/' + targetFile) 60 | 61 | traverse('raw_data', save_stats) 62 | print('Raw data files are successfully processed.') -------------------------------------------------------------------------------- /ch2/segmentation/raw_data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/segmentation/raw_data/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/segmentation/results/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/segmentation/scripts/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/scripts/nyt_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Script to download all data, summarize a single data file and traverse the 5 | data folder to process all files. 6 | 7 | @author: Ervin Varga 8 | """ 9 | 10 | import requests, zipfile, io, shutil 11 | 12 | unpackedFolder = '/dds_datasets/' 13 | unpackedZipFile = 'dds_ch2_nyt.zip' 14 | 15 | def retrieve(sourceFile, destinationFolder): 16 | def cleanup(): 17 | try: 18 | shutil.rmtree(destinationFolder + unpackedFolder) 19 | except OSError as e: 20 | print("Folder: %s, Error: %s" % (e.filename, e.strerror)) 21 | 22 | r = requests.get(sourceFile) 23 | assert r.status_code == requests.codes.ok 24 | 25 | z = zipfile.ZipFile(io.BytesIO(r.content)) 26 | z.extractall(destinationFolder) 27 | 28 | # The top archive contains another ZIP file with our data. 29 | z = zipfile.ZipFile(destinationFolder + unpackedFolder + unpackedZipFile) 30 | z.extractall(destinationFolder) 31 | 32 | cleanup() 33 | 34 | import pandas as pd 35 | import numpy as np 36 | 37 | def summarize(data_file): 38 | def q25(x): 39 | return x.quantile(0.25) 40 | 41 | def q75(x): 42 | return x.quantile(0.75) 43 | 44 | # Read and parse the CSV data file. 45 | nyt_data = pd.read_csv(data_file, dtype={'Gender': 'category'}) 46 | 47 | # Segment users into age groups. 
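# pd.cut buckets ages into right-inclusive bins, so (-1, 0] captures the age
# value 0 that this data set uses for unknown ages. Illustration only (not
# part of the original script):
#   pd.cut([0, 16, 70], bins=[-1, 0, 17, 120], labels=['Unknown', '1-17', '18+'])
#   # -> ['Unknown', '1-17', '18+']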
48 | nyt_data['Age_Group'] = pd.cut( 49 | nyt_data['Age'], 50 | bins=[-1, 0, 17, 24, 34, 44, 54, 64, 120], 51 | labels=["Unknown", 52 | "1-17", 53 | "18-24", 54 | "25-34", 55 | "35-44", 56 | "45-54", 57 | "55-64", 58 | "65+"]) 59 | nyt_data.drop('Age', axis='columns', inplace=True) 60 | 61 | # Create the click through rate feature. 62 | nyt_data['CTR'] = nyt_data['Clicks'] / nyt_data['Impressions'] 63 | nyt_data.dropna(inplace=True) 64 | nyt_data.drop((nyt_data['Clicks'] > nyt_data['Impressions']).nonzero()[0], 65 | inplace=True) 66 | 67 | # Make final description of data. 68 | compressed_nyt_data = \ 69 | nyt_data.groupby(by=['Age_Group', 'Gender'])[['CTR', 'Clicks']] \ 70 | .agg([np.mean, np.std, np.max, q25, np.median, q75, np.sum]) 71 | return compressed_nyt_data 72 | 73 | import pathlib 74 | 75 | def traverse(sourceFolder, collect): 76 | def get_file_number(data_file): 77 | return int(data_file.name[3:-4]) - 1 78 | 79 | for data_file in pathlib.Path(sourceFolder).glob('nyt*.csv'): 80 | collect(summarize(data_file.absolute()), get_file_number(data_file)) -------------------------------------------------------------------------------- /ch2/segmentation/scripts/nyt_data_chunked.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Script to download all data, summarize a single data file chunk-by-chunk and 5 | traverse the data folder to process all files. 6 | 7 | @author: Ervin Varga 8 | """ 9 | 10 | import requests, zipfile, io, shutil 11 | 12 | unpackedFolder = '/dds_datasets/' 13 | unpackedZipFile = 'dds_ch2_nyt.zip' 14 | 15 | def retrieve(sourceFile, destinationFolder): 16 | def cleanup(): 17 | try: 18 | shutil.rmtree(destinationFolder + unpackedFolder) 19 | except OSError as e: 20 | print("Folder: %s, Error: %s" % (e.filename, e.strerror)) 21 | 22 | r = requests.get(sourceFile) 23 | assert r.status_code == requests.codes.ok 24 | 25 | z = zipfile.ZipFile(io.BytesIO(r.content)) 26 | z.extractall(destinationFolder) 27 | 28 | # The top archive contains another ZIP file with our data. 29 | z = zipfile.ZipFile(destinationFolder + unpackedFolder + unpackedZipFile) 30 | z.extractall(destinationFolder) 31 | 32 | cleanup() 33 | 34 | import pandas as pd 35 | import numpy as np 36 | 37 | def summarize(data_file, chunksize): 38 | def q25(x): 39 | return x.quantile(0.25) 40 | 41 | def q75(x): 42 | return x.quantile(0.75) 43 | 44 | # Read and parse the CSV data file chunk-by-chunk. 45 | nyt_data = pd.DataFrame() 46 | for chunk_df in pd.read_csv( 47 | data_file, 48 | dtype={'Gender': 'category'}, 49 | chunksize=chunksize): 50 | 51 | # Segment users into age groups. 52 | chunk_df['Age_Group'] = pd.cut( 53 | chunk_df['Age'], 54 | bins=[-1, 0, 17, 24, 34, 44, 54, 64, 120], 55 | labels=["Unknown", 56 | "1-17", 57 | "18-24", 58 | "25-34", 59 | "35-44", 60 | "45-54", 61 | "55-64", 62 | "65+"]) 63 | 64 | # Create the click through rate feature. 65 | chunk_df['CTR'] = chunk_df['Clicks'] / chunk_df['Impressions'] 66 | chunk_df.dropna(inplace=True) 67 | chunk_df.drop((chunk_df['Clicks'] > chunk_df['Impressions']).nonzero()[0], 68 | inplace=True) 69 | 70 | # Append chunk to the main data frame. 71 | nyt_data = nyt_data.append( 72 | chunk_df[['Age_Group', 'Gender', 'Clicks', 'CTR']], 73 | ignore_index=True) 74 | 75 | # Make final description of data. 
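# The groupby/agg below yields hierarchical (two-level) columns, one
# sub-column per statistic; driver.py later flattens them with '_'.join so
# the frame can be written to Parquet.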
76 | compressed_nyt_data = \ 77 | nyt_data.groupby(by=['Age_Group', 'Gender'])[['CTR', 'Clicks']] \ 78 | .agg([np.mean, np.std, np.max, q25, np.median, q75, np.sum]) 79 | return compressed_nyt_data 80 | 81 | import pathlib 82 | 83 | def traverse(sourceFolder, collect, chunksize=10000): 84 | def get_file_number(data_file): 85 | return int(data_file.name[3:-4]) - 1 86 | 87 | for data_file in pathlib.Path(sourceFolder).glob('nyt*.csv'): 88 | collect(summarize(data_file.absolute(), chunksize), 89 | get_file_number(data_file)) -------------------------------------------------------------------------------- /ch3/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch3/.DS_Store -------------------------------------------------------------------------------- /ch3/bug_fixing/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch3/bug_fixing/.DS_Store -------------------------------------------------------------------------------- /ch3/bug_fixing/double_preceding1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from array import array 4 | 5 | def double_preceding(x: array) -> None: 6 | """Transforms the array by setting x[i] = 2 * x[i-1] and x[0] = 0. 7 | 8 | >>> x = array('i', [5, 10, 15]) 9 | >>> double_preceding(x) 10 | >>> x 11 | array('i', [0, 10, 20]) 12 | """ 13 | 14 | if x: 15 | temp = x[0]; x[0] = 0 16 | for i in range(1, len(x)): 17 | x[i] = 2 * temp; temp = x[i] 18 | -------------------------------------------------------------------------------- /ch3/bug_fixing/double_preceding2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from array import array 4 | 5 | def double_preceding(x: array) -> None: 6 | """Transforms the array by setting x[i] = 2 * x[i-1] and x[0] = 0. 7 | 8 | >>> x = array('i', [5, 10, 15]) 9 | >>> double_preceding(x) 10 | >>> x 11 | array('i', [0, 10, 20]) 12 | """ 13 | 14 | if x: 15 | temp = x[0]; x[0] = 0 16 | for i in range(1, len(x)): 17 | temp_2x = 2 * temp; temp = x[i]; x[i] = temp_2x 18 | 19 | -------------------------------------------------------------------------------- /ch3/bug_fixing/double_preceding3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from array import array 4 | 5 | def double_preceding(x: array) -> None: 6 | """Transforms the array by setting x[i] = 2 * x[i-1] and x[0] = 0. 7 | 8 | >>> x = array('i', [5, 10, 15]) 9 | >>> double_preceding(x) 10 | >>> x 11 | array('i', [0, 10, 20]) 12 | """ 13 | 14 | if x: 15 | for i in range(-1, -len(x), -1): 16 | x[i] = 2 * x[i - 1] 17 | x[0] = 0 18 | 19 | -------------------------------------------------------------------------------- /ch3/bug_fixing/double_preceding4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | 5 | def double_preceding(x: np.ndarray) -> None: 6 | """Transforms the array by setting x[i] = 2 * x[i-1] and x[0] = 0. 
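    Unlike the loop-based variants, this NumPy version shifts and doubles all
    elements at once with a reversed slice assignment instead of an explicit
    Python loop.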
7 | 8 | >>> x = np.array([5, 10, 15]) 9 | >>> double_preceding(x) 10 | >>> x 11 | array([ 0, 10, 20]) 12 | """ 13 | 14 | if x.size != 0: 15 | x[:-x.size:-1] = 2 * x[-2::-1] 16 | x[0] = 0 17 | -------------------------------------------------------------------------------- /ch3/bug_fixing/test_double_preceding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from array import array 4 | import unittest 5 | from double_preceding4 import double_preceding 6 | 7 | class TestDoublePreceding(unittest.TestCase): 8 | """Tests for double_preceding function.""" 9 | 10 | def test_already_arranged(self): 11 | """Test with already arranged values.""" 12 | argument = array('i', [5, 10, 15]) 13 | expected = array('i', [0, 10, 20]) 14 | double_preceding(argument) 15 | self.assertEqual(expected, argument) 16 | 17 | def test_identical(self): 18 | """Test with multiple identical values.""" 19 | argument = array('i', [0, 1, 1]) 20 | expected = array('i', [0, 0, 2]) 21 | double_preceding(argument) 22 | self.assertEqual(expected, argument) 23 | 24 | def test_empty(self): 25 | """Test with an empty array.""" 26 | argument = [] 27 | expected = [] 28 | double_preceding(argument) 29 | self.assertEqual(expected, argument) 30 | 31 | if __name__ == "__main__": 32 | unittest.main() -------------------------------------------------------------------------------- /ch3/cyclomatic_complexity/sort_new.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def sort(data): 4 | for i in range(len(data)): 5 | for j in range(len(data)): 6 | avg = (data[i] + data[j]) / 2.0 7 | diff = abs(data[i] - avg) 8 | data[i] = avg - diff 9 | data[j] = avg + diff 10 | return data 11 | -------------------------------------------------------------------------------- /ch3/cyclomatic_complexity/sort_original.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def sort(data): 4 | for i in range(len(data)): 5 | for j in range(len(data)): 6 | if data[i] > data[j]: 7 | data[i], data[j] = (data[j], data[i]) 8 | return data -------------------------------------------------------------------------------- /ch3/fibonacci/fibonacci1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def fibonacci(n): 4 | sequence = [] 5 | current, next = 0, 1 6 | for _ in range(n): 7 | current, next = next, current + next 8 | sequence.append(current) 9 | return sequence -------------------------------------------------------------------------------- /ch3/fibonacci/fibonacci2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def fibonacci(n, f0=0, f1=1): 4 | sequence = [] 5 | current, next = f0, f1 6 | for _ in range(n): 7 | current, next = next, current + next 8 | sequence.append(current) 9 | return sequence -------------------------------------------------------------------------------- /ch3/fibonacci/sequencer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def simple_recurrent_sequence(n, first, second, combine_fun): 4 | sequence = [] 5 | current, next = first, second 6 | for _ in range(n): 7 | current, next = next, 
combine_fun(current, next) 8 | sequence.append(current) 9 | return sequence 10 | 11 | def fibonacci(n): 12 | return simple_recurrent_sequence(n, 0, 1, lambda x, y: x + y) -------------------------------------------------------------------------------- /ch3/optimization/elevator0.py: -------------------------------------------------------------------------------- 1 | def num_days(h, u, d): 2 | total_days = 1 3 | curr_height = 0 4 | 5 | while h - curr_height > u: 6 | curr_height += u - d 7 | total_days += 1 8 | return total_days -------------------------------------------------------------------------------- /ch3/optimization/elevator1.py: -------------------------------------------------------------------------------- 1 | def num_days(h, u, d): 2 | total_days = 1 3 | height_left = h 4 | 5 | while u < height_left: 6 | days = height_left // u 7 | total_days += days 8 | height_left -= days * (u - d) 9 | return total_days 10 | -------------------------------------------------------------------------------- /ch3/optimization/elevator2.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | 3 | @lru_cache(maxsize=32) 4 | def _partial_num_days(height_left, u, d): 5 | total_days = 1 6 | 7 | while u < height_left: 8 | days = height_left // u 9 | total_days += days 10 | height_left -= days * (u - d) 11 | return total_days 12 | 13 | H_LIMIT = 1000000 14 | 15 | def num_days(h, u, d): 16 | if h > H_LIMIT: 17 | days = 2 * (num_days(h // 2, u, d) - 1) 18 | height_left = h - days * (u - d) 19 | return days + _partial_num_days(height_left, u, d) 20 | else: 21 | return _partial_num_days(h, u, d) -------------------------------------------------------------------------------- /ch3/optimization/elevator3.py: -------------------------------------------------------------------------------- 1 | def num_days(h, u, d): 2 | import math 3 | 4 | height_left_until_last_day = h - u 5 | daily_progress = u - d 6 | return 1 + math.ceil(height_left_until_last_day / daily_progress) -------------------------------------------------------------------------------- /ch3/puzzles/puzzle1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def puzzle1(n): 4 | p = 0; w = 1; s = n 5 | 6 | while w <= n: 7 | w <<= 2 8 | 9 | while w != 1: 10 | w >>= 2 11 | f = p + w 12 | p >>= 1 13 | 14 | if s >= f: 15 | p += w 16 | s -= f 17 | return p 18 | -------------------------------------------------------------------------------- /ch3/puzzles/puzzle2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def puzzle2(bytes): 4 | f = [0] * 255 5 | s = k = 0 6 | 7 | for b in bytes: 8 | f[b] += 1 9 | 10 | s += f[k] 11 | k += 1 12 | while s < len(bytes) / 2: 13 | s += f[k] 14 | k += 1 15 | return k 16 | -------------------------------------------------------------------------------- /ch3/puzzles/puzzle2b.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def puzzle2(bytes): 4 | f = [0] * 255 5 | s = k = 0 6 | 7 | for b in bytes: 8 | f[b] += 1 9 | 10 | k += 1 11 | s += f[k] 12 | while s < len(bytes) / 2: 13 | k += 1 14 | s += f[k] 15 | return k 16 | -------------------------------------------------------------------------------- /ch4/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch4/.DS_Store -------------------------------------------------------------------------------- /ch4/ball_descend/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch4/ball_descend/.DS_Store -------------------------------------------------------------------------------- /ch4/ball_descend/Simulation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simulation of a Ball's Descent in a Terrain\n", 8 | "\n", 9 | "This project simulates where a ball will land in a terrain.\n", 10 | "\n", 11 | "## Input\n", 12 | "The terrain's configuration is given as a matrix of integers representing the elevation at each spot. For simplicity, assume that the terrain is surrounded by a rectangular wall that prevents the ball from escaping. The inner dimensions of the terrain are NxM, where N and M are integers between 3 and 1000.\n", 13 | "\n", 14 | "The ball's initial position is given as a pair of integers (a, b).\n", 15 | "\n", 16 | "## Output\n", 17 | "The result is a list of coordinates denoting the ball's path through the terrain. The first element of the list is the starting position, and the last one is the ending position. The two may coincide if the ball starts in a local minimum (a dent).\n", 18 | "\n", 19 | "## Rules\n", 20 | "The ball moves according to two simple rules:\n", 21 | "- The ball rolls from the current position into the lowest neighboring one.\n", 22 | "- If the ball is surrounded by higher points, then it stops."
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Usual bootstrapping code; just run this cell.\n", 32 | "import numpy as np\n", 33 | "\n", 34 | "from typing import List, Tuple\n", 35 | "\n", 36 | "from ipywidgets import interact, widgets" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "matrix([[-2, 3, 2, 1],\n", 48 | " [-2, 4, 3, 0],\n", 49 | " [-3, 3, 1, -3],\n", 50 | " [-4, 2, -1, 1],\n", 51 | " [-5, -7, 3, 0]])" 52 | ] 53 | }, 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "terrain = np.matrix([\n", 61 | " [-2, 3, 2, 1],\n", 62 | " [-2, 4, 3, 0],\n", 63 | " [-3, 3, 1, -3],\n", 64 | " [-4, 2, -1, 1],\n", 65 | " [-5, -7, 3, 0]\n", 66 | "])\n", 67 | "terrain" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "def wall(terrain:np.matrix, position:Tuple[int,int]) -> bool:\n", 77 | " \"\"\"\n", 78 | " Checks whether the provided position is hitting the wall.\n", 79 | " \n", 80 | " Args:\n", 81 | " terrain: the terrain's configuration comprised from integer elevation levels.\n", 82 | " position: the pair of integers representing the ball's potential position.\n", 83 | "\n", 84 | " Output:\n", 85 | " True if the position is hitting the wall, or False otherwise.\n", 86 | " \n", 87 | " Examples:\n", 88 | " >>> wall(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 89 | " False\n", 90 | " >>> wall(np.matrix([[-2, 3, 2, 1]]), (-1, 0))\n", 91 | " True\n", 92 | " \"\"\"\n", 93 | " \n", 94 | " x, y = position\n", 95 | " length, width = terrain.shape\n", 96 | " return (x < 0) or (y < 0) or (x >= length) or (y >= width)\n", 97 | "\n", 98 | "def next_neighbor(terrain:np.matrix, position:Tuple[int,int]) -> Tuple[int,int]:\n", 99 | " \"\"\"\n", 100 | " Returns the position of the lowest neighbor.\n", 101 | " \n", 102 | " Args:\n", 103 | " terrain: the terrain's configuration comprised from integer elevation levels.\n", 104 | " position: the pair of integers representing the ball's current position.\n", 105 | "\n", 106 | " Output:\n", 107 | " The position (pair of coordinates) of the lowest neighbor.\n", 108 | " \n", 109 | " Example:\n", 110 | " >>> next_neighbor(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 111 | " (0, 0)\n", 112 | " \"\"\"\n", 113 | " \n", 114 | " x, y = position\n", 115 | " allowed_neighbors = []\n", 116 | " for delta_x in range(-1, 2):\n", 117 | " for delta_y in range(-1, 2):\n", 118 | " new_position = (x + delta_x, y + delta_y)\n", 119 | " if (not wall(terrain, new_position)):\n", 120 | " allowed_neighbors.append((terrain.item(new_position), new_position))\n", 121 | " return min(allowed_neighbors)[1]\n", 122 | "\n", 123 | "def find_path(terrain:np.matrix, position:Tuple[int,int]) -> List[Tuple[int,int]]:\n", 124 | " \"\"\"\n", 125 | " Find the path that the ball would follow while descending in the terrain.\n", 126 | " \n", 127 | " Args:\n", 128 | " terrain: the terrain's configuration comprised from integer elevation levels.\n", 129 | " position: the pair of integers representing the ball's current position.\n", 130 | " \n", 131 | " Output:\n", 132 | " The list of coordinates of the path.\n", 133 | " \n", 134 | " Example:\n", 135 | " >>> find_path(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 136 | " [(0, 1), (0, 0)]\n", 137 | " \"\"\"\n", 138 
| " \n", 139 | " next_position = next_neighbor(terrain, position)\n", 140 | " if (position == next_position):\n", 141 | " return [position]\n", 142 | " else:\n", 143 | " return [position] + find_path(terrain, next_position)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Result\n", 151 | "\n", 152 | "The cell below contains code to invoke the path finding function for a given starting position. The starting coordinates are expected to be correctly set.\n", 153 | "\n", 154 | "The terrain data is repeated here for convenience." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 4, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "matrix([[-2, 3, 2, 1],\n", 166 | " [-2, 4, 3, 0],\n", 167 | " [-3, 3, 1, -3],\n", 168 | " [-4, 2, -1, 1],\n", 169 | " [-5, -7, 3, 0]])" 170 | ] 171 | }, 172 | "execution_count": 4, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "terrain" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 7, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "application/vnd.jupyter.widget-view+json": { 189 | "model_id": "9a0e328db74542498357f2cf8a600957", 190 | "version_major": 2, 191 | "version_minor": 0 192 | }, 193 | "text/plain": [ 194 | "interactive(children=(IntSlider(value=1, description='Start X', max=4), IntSlider(value=1, description='Start …" 195 | ] 196 | }, 197 | "metadata": {}, 198 | "output_type": "display_data" 199 | } 200 | ], 201 | "source": [ 202 | "interact(lambda start_x, start_y: find_path(terrain, (start_x, start_y)),\n", 203 | " start_x = widgets.IntSlider(value=1, max=terrain.shape[0]-1, description='Start X'),\n", 204 | " start_y = widgets.IntSlider(value=1, max=terrain.shape[1]-1, description='Start Y'));" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 6, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Trying:\n", 217 | " find_path(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 218 | "Expecting:\n", 219 | " [(0, 1), (0, 0)]\n", 220 | "ok\n", 221 | "Trying:\n", 222 | " next_neighbor(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 223 | "Expecting:\n", 224 | " (0, 0)\n", 225 | "ok\n", 226 | "Trying:\n", 227 | " wall(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 228 | "Expecting:\n", 229 | " False\n", 230 | "ok\n", 231 | "Trying:\n", 232 | " wall(np.matrix([[-2, 3, 2, 1]]), (-1, 0))\n", 233 | "Expecting:\n", 234 | " True\n", 235 | "ok\n", 236 | "1 items had no tests:\n", 237 | " __main__\n", 238 | "3 items passed all tests:\n", 239 | " 1 tests in __main__.find_path\n", 240 | " 1 tests in __main__.next_neighbor\n", 241 | " 2 tests in __main__.wall\n", 242 | "4 tests in 4 items.\n", 243 | "4 passed and 0 failed.\n", 244 | "Test passed.\n" 245 | ] 246 | }, 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "TestResults(failed=0, attempted=4)" 251 | ] 252 | }, 253 | "execution_count": 6, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "# Just run this cell to invoke tests embedded inside function descriptors.\n", 260 | "import doctest\n", 261 | "doctest.testmod(verbose=True)" 262 | ] 263 | } 264 | ], 265 | "metadata": { 266 | "celltoolbar": "Raw Cell Format", 267 | "kernelspec": { 268 | "display_name": "Python 3", 269 | "language": "python", 270 | "name": "python3" 271 | }, 272 | "language_info": { 
273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 3 276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython3", 282 | "version": "3.6.5" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 2 287 | } 288 | -------------------------------------------------------------------------------- /ch4/ball_descend/Simulation_Refactored.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simulation of a Ball's Descend in a Terrain - Refactored Version\n", 8 | "\n", 9 | "This project simulates where a ball will land in a terrain. It simulates the influence of Newton's law of universal gravitation on the movement of a ball, given by the formula $F=g\\frac{m_1m_2}{r^2}$. Here, F is the resulting gravitational pull between the matching objects, $m_1$ and $m_2$ are their masses, r is the distance between the centers of their masses, and g is the gravitational constant.\n", 10 | "\n", 11 | "## Input\n", 12 | "The terrain's configuration is given as a matrix of integers representing elevation at each spot. For simplicity, assume that the terrain is surrounded by a rectangular wall, that prevents the ball to escape. The inner dimensions of the terrain are NxM, where N and M are integers between 3 and 1000.\n", 13 | "\n", 14 | "The ball's initial position is given as a pair of integers (a, b).\n", 15 | "\n", 16 | "## Output\n", 17 | "The result is a list of coordinates denoting the ball's path in a terrain. The first element of the list is the starting position, and the last one is the ending position. It could happen that they are the same, if the ball has emanated from a local minima (dent).\n", 18 | "\n", 19 | "## Rules\n", 20 | "The ball moves according to the next two simple rules:\n", 21 | "- The ball rolls from the current position into the lowest neighboring one.\n", 22 | "- If the ball is surrounded by higher points, then it stops." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Usual bootstrapping code; just run this cell.\n", 32 | "import numpy as np\n", 33 | "from ipywidgets import interact, widgets\n", 34 | "\n", 35 | "from pathfinder import find_path" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "matrix([[-2, 3, 2, 1],\n", 47 | " [-2, 4, 3, 0],\n", 48 | " [-3, 3, 1, -3],\n", 49 | " [-4, 2, -1, 1],\n", 50 | " [-5, -7, 3, 0]])" 51 | ] 52 | }, 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "terrain = np.matrix([\n", 60 | " [-2, 3, 2, 1],\n", 61 | " [-2, 4, 3, 0],\n", 62 | " [-3, 3, 1, -3],\n", 63 | " [-4, 2, -1, 1],\n", 64 | " [-5, -7, 3, 0]\n", 65 | "])\n", 66 | "terrain" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Result\n", 74 | "\n", 75 | "The cell below contains code to invoke the path finding function for a given starting position. The starting coordinates are expected to be correctly set.\n", 76 | "\n", 77 | "The terrain data is repeated here for convenience." 
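As a quick sanity check (a sketch that assumes the import and terrain cells above were run), the default starting position (1, 1) rolls down to the deepest dent at altitude -7:

    >>> find_path(terrain, (1, 1))
    [(1, 1), (2, 0), (3, 0), (4, 1)]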
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "matrix([[-2, 3, 2, 1],\n", 89 | " [-2, 4, 3, 0],\n", 90 | " [-3, 3, 1, -3],\n", 91 | " [-4, 2, -1, 1],\n", 92 | " [-5, -7, 3, 0]])" 93 | ] 94 | }, 95 | "execution_count": 3, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "terrain" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "application/vnd.jupyter.widget-view+json": { 112 | "model_id": "f740b2d719684aadb1d73ed473c65f5c", 113 | "version_major": 2, 114 | "version_minor": 0 115 | }, 116 | "text/plain": [ 117 | "interactive(children=(IntSlider(value=1, description='Start X', max=4), IntSlider(value=1, description='Start …" 118 | ] 119 | }, 120 | "metadata": {}, 121 | "output_type": "display_data" 122 | } 123 | ], 124 | "source": [ 125 | "interact(lambda start_x, start_y: find_path(terrain, (start_x, start_y)),\n", 126 | " start_x = widgets.IntSlider(value=1, max=terrain.shape[0]-1, description='Start X'),\n", 127 | " start_y = widgets.IntSlider(value=1, max=terrain.shape[1]-1, description='Start Y'));" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "celltoolbar": "Raw Cell Format", 133 | "kernelspec": { 134 | "display_name": "Python 3", 135 | "language": "python", 136 | "name": "python3" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.6.6" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 2 153 | } 154 | -------------------------------------------------------------------------------- /ch4/ball_descend/pathfinder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch4/ball_descend/pathfinder/.DS_Store -------------------------------------------------------------------------------- /ch4/ball_descend/pathfinder/__init__.py: -------------------------------------------------------------------------------- 1 | from pathfinder.pathfinder import find_path -------------------------------------------------------------------------------- /ch4/ball_descend/pathfinder/pathfinder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Tuple 3 | 4 | def wall(terrain:np.matrix, position:Tuple[int,int]) -> bool: 5 | """ 6 | Checks whether the provided position is hitting the wall. 7 | 8 | Args: 9 | terrain: the terrain's configuration comprised from integer elevation levels. 10 | position: the pair of integers representing the ball's potential position. 11 | 12 | Output: 13 | True if the position is hitting the wall, or False otherwise. 
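        A position "hits the wall" when either coordinate falls outside the
        terrain's interior, i.e., outside the half-open index ranges [0, N) and [0, M).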
14 | 15 | Examples: 16 | >>> wall(np.matrix([[-2, 3, 2, 1]]), (0, 1)) 17 | False 18 | >>> wall(np.matrix([[-2, 3, 2, 1]]), (-1, 0)) 19 | True 20 | """ 21 | 22 | x, y = position 23 | length, width = terrain.shape 24 | return (x < 0) or (y < 0) or (x >= length) or (y >= width) 25 | 26 | def next_neighbor(terrain:np.matrix, position:Tuple[int,int]) -> Tuple[int,int]: 27 | """ 28 | Returns the position of the lowest neighbor. 29 | 30 | Args: 31 | terrain: the terrain's configuration comprised from integer elevation levels. 32 | position: the pair of integers representing the ball's current position. 33 | 34 | Output: 35 | The position (pair of coordinates) of the lowest neighbor. 36 | 37 | Example: 38 | >>> next_neighbor(np.matrix([[-2, 3, 2, 1]]), (0, 1)) 39 | (0, 0) 40 | """ 41 | 42 | x, y = position 43 | allowed_neighbors = [] 44 | for delta_x in range(-1, 2): 45 | for delta_y in range(-1, 2): 46 | new_position = (x + delta_x, y + delta_y) 47 | if (not wall(terrain, new_position)): 48 | allowed_neighbors.append((terrain.item(new_position), new_position)) 49 | return min(allowed_neighbors)[1] 50 | 51 | def find_path(terrain:np.matrix, position:Tuple[int,int]) -> List[Tuple[int,int]]: 52 | """ 53 | Finds the path that the ball would follow while descending in the terrain. 54 | 55 | Args: 56 | terrain: the terrain's configuration comprised from integer elevation levels. 57 | position: the pair of integers representing the ball's current position. 58 | 59 | Output: 60 | The list of coordinates of the path. 61 | 62 | Example: 63 | >>> find_path(np.matrix([[-2, 3, 2, 1]]), (0, 1)) 64 | [(0, 1), (0, 0)] 65 | """ 66 | 67 | next_position = next_neighbor(terrain, position) 68 | if (position == next_position): 69 | return [position] 70 | else: 71 | return [position] + find_path(terrain, next_position) 72 | 73 | if __name__ == "__main__": 74 | import doctest 75 | doctest.testmod() -------------------------------------------------------------------------------- /ch4/hanoi/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch4/hanoi/.DS_Store -------------------------------------------------------------------------------- /ch4/hanoi/Solver1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "SyntaxError", 10 | "evalue": "EOL while scanning string literal (, line 4)", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m4\u001b[0m\n\u001b[0;31m print('Move top disk from', start, 'to\", end)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m EOL while scanning string literal\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "def solve_tower(num_disks, start, end, extra):\n", 19 | " if (num_disks > 0):\n", 20 | " solve_tower(num_disks - 1, start, extra, end)\n", 21 | " print('Move top disk from', start, 'to\", end)\n", 22 | " solve_tower(num_disks - 1, extra, end, start)\n", 23 | "\n", 24 | "solve_tower(3, 'a', 'c', 'b')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "This error message is an example that Python sometimes wrongly guesses the location of the error. **Different string markers should not be mixed for the same string**." 
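The fix is simply to use the same quote character on both sides, as the later solver versions spell it:

    print('Move top disk from', start, 'to', end)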
32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.6.5" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 2 56 | } 57 | -------------------------------------------------------------------------------- /ch4/hanoi/Solver3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "def solve_tower(num_disks:int, start:str, end:str, extra:str) -> None:\n", 10 | " \"\"\"\n", 11 | " Solves the Tower of Hanoi puzzle.\n", 12 | " \n", 13 | " Args:\n", 14 | " num_disks: the number of disks to move.\n", 15 | " start: the name of the start pole.\n", 16 | " end: the name of the target pole.\n", 17 | " extra: the name of the temporary pole.\n", 18 | " \n", 19 | " Example:\n", 20 | " >>> solve_tower(3, 'a', 'c', 'b')\n", 21 | " Move top disk from a to c\n", 22 | " Move top disk from a to b\n", 23 | " Move top disk from c to b\n", 24 | " Move top disk from a to c\n", 25 | " Move top disk from b to a\n", 26 | " Move top disk from b to c\n", 27 | " Move top disk from a to c\n", 28 | " >>> solve_tower(-1, 'a', 'c', 'b')\n", 29 | " \"\"\"\n", 30 | " if (num_disks > 0):\n", 31 | " solve_tower(num_disks - 1, start, extra, end)\n", 32 | " print('Move top disk from', start, 'to', end)\n", 33 | " solve_tower(num_disks - 1, extra, end, start)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Trying:\n", 46 | " solve_tower(3, 'a', 'c', 'b')\n", 47 | "Expecting:\n", 48 | " Move top disk from a to c\n", 49 | " Move top disk from a to b\n", 50 | " Move top disk from c to b\n", 51 | " Move top disk from a to c\n", 52 | " Move top disk from b to a\n", 53 | " Move top disk from b to c\n", 54 | " Move top disk from a to c\n", 55 | "ok\n", 56 | "Trying:\n", 57 | " solve_tower(-1, 'a', 'c', 'b')\n", 58 | "Expecting nothing\n", 59 | "ok\n", 60 | "1 items had no tests:\n", 61 | " __main__\n", 62 | "1 items passed all tests:\n", 63 | " 2 tests in __main__.solve_tower\n", 64 | "2 tests in 2 items.\n", 65 | "2 passed and 0 failed.\n", 66 | "Test passed.\n" 67 | ] 68 | }, 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "TestResults(failed=0, attempted=2)" 73 | ] 74 | }, 75 | "execution_count": 2, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "import doctest\n", 82 | "doctest.testmod(verbose=True)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "\u001b[0;31mSignature:\u001b[0m \u001b[0msolve_tower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_disks\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mextra\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m 
\u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 94 | "\u001b[0;31mDocstring:\u001b[0m\n", 95 | "Solves the Tower of Hanoi puzzle.\n", 96 | "\n", 97 | "Args:\n", 98 | "num_disks: the number of disks to move.\n", 99 | "start: the name of the start pole.\n", 100 | "end: the name of the target pole.\n", 101 | "extra: the name of the temporary pole.\n", 102 | "\n", 103 | "Example:\n", 104 | ">>> solve_tower(3, 'a', 'c', 'b')\n", 105 | "Move top disk from a to c\n", 106 | "Move top disk from a to b\n", 107 | "Move top disk from c to b\n", 108 | "Move top disk from a to c\n", 109 | "Move top disk from b to a\n", 110 | "Move top disk from b to c\n", 111 | "Move top disk from a to c\n", 112 | ">>> solve_tower(-1, 'a', 'c', 'b')\n", 113 | "\u001b[0;31mFile:\u001b[0m ~/Projects/pdsp_book/src/ch4/hanoi/\n", 114 | "\u001b[0;31mType:\u001b[0m function\n" 115 | ] 116 | }, 117 | "metadata": {}, 118 | "output_type": "display_data" 119 | } 120 | ], 121 | "source": [ 122 | "solve_tower?" 123 | ] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python 3", 129 | "language": "python", 130 | "name": "python3" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 3 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": "text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython3", 142 | "version": "3.6.5" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 2 147 | } 148 | -------------------------------------------------------------------------------- /ch5/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/interactionlib/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/interactionlib/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/interactionlib/__init__.py: -------------------------------------------------------------------------------- 1 | from interactionlib.interaction_monitor import InteractionMonitor 2 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/interactionlib/interaction_monitor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Monitors whether the user is selecting an area on the image or has chosen the 3 | starting position. 4 | """ 5 | 6 | from ipywidgets import Textarea 7 | import matplotlib.pyplot as plt 8 | 9 | class InteractionMonitor: 10 | """ 11 | Detects mouse events to figure our what is a user doing. 12 | 13 | Args: 14 | fig: the matplotlib figure to monitor. 
15 | info_area: the external informational area whose value needs to be updated. 16 | auto_stop_interaction: should interaction stop (when True) after selecting 17 | the starting position or not. 18 | """ 19 | 20 | def __init__(self, fig: plt.Figure, info_area: Textarea, 21 | auto_stop_interaction: bool = True): 22 | self._fig = fig 23 | self._info_area = info_area 24 | self._auto_stop_interaction = auto_stop_interaction 25 | self._cids = None 26 | self._selecting = False 27 | self._clicked = False 28 | self._clicked_position = None 29 | 30 | def _on_click(self, event): 31 | self._clicked = True 32 | 33 | def _on_release(self, event): 34 | if not self._selecting: 35 | self._clicked_position = (int(event.ydata), int(event.xdata)) 36 | self._info_area.value = str(self._clicked_position) 37 | if self._auto_stop_interaction: 38 | self.stop() 39 | 40 | self._selecting = False 41 | self._clicked = False 42 | 43 | def _on_motion(self, event): 44 | self._selecting = self._clicked 45 | 46 | @property 47 | def clicked_position(self): 48 | """Returns the clicked data position on the map.""" 49 | return self._clicked_position 50 | 51 | def start(self): 52 | """Starts monitoring mouse events on figure.""" 53 | self._cids = [ 54 | self._fig.canvas.mpl_connect('button_press_event', self._on_click), 55 | self._fig.canvas.mpl_connect('button_release_event', self._on_release), 56 | self._fig.canvas.mpl_connect('motion_notify_event', self._on_motion)] 57 | 58 | def stop(self): 59 | """Closes the figure and stops the interaction.""" 60 | plt.close(self._fig) 61 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/pathfinder/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/__init__.py: -------------------------------------------------------------------------------- 1 | from pathfinder.base_pathfinder import BasePathFinder 2 | from pathfinder.simple_pathfinder import SimplePathFinder 3 | from pathfinder.non_recursive_simple_pathfinder import NonRecursiveSimplePathFinder 4 | from pathfinder.parallel_simple_pathfinder import ParallelSimplePathFinder 5 | from pathfinder.pathutils import PathUtils -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/base_pathfinder.py: -------------------------------------------------------------------------------- 1 | """The base class for implementing various path finders.""" 2 | 3 | import abc 4 | from typing import List, Tuple, Set 5 | 6 | import numpy as np 7 | 8 | class BasePathFinder(metaclass=abc.ABCMeta): 9 | """ 10 | Finds the path of a ball that descends in a terrain from some starting 11 | position. 12 | 13 | Args: 14 | terrain: the terrain's configuration comprised from (altitude, slope) 15 | integer pairs. 16 | """ 17 | 18 | def __init__(self, terrain: np.ndarray): 19 | self._terrain = terrain 20 | 21 | @property 22 | def terrain(self): 23 | """Gets the current terrain data.""" 24 | return self._terrain 25 | 26 | def wall(self, position: Tuple[int, int]) -> bool: 27 | """ 28 | Checks whether the provided position is hitting the wall. 
29 | 30 | Args: 31 | position: the pair of integers representing the ball's potential position. 32 | 33 | Output: 34 | True if the position is hitting the wall, or False otherwise. 35 | 36 | Examples: 37 | >>> BasePathFinder.__abstractmethods__ = set() 38 | >>> path_finder = BasePathFinder(np.array([[(-2, 0), (3, 0), (2, 0), (1, 0)]])) 39 | >>> path_finder.wall((0, 1)) 40 | False 41 | >>> BasePathFinder.__abstractmethods__ = set() 42 | >>> path_finder = BasePathFinder(np.array([[(-2, 0), (3, 0), (2, 0), (1, 0)]])) 43 | >>> path_finder.wall((-1, 0)) 44 | True 45 | """ 46 | 47 | curr_x, curr_y = position 48 | length, width = self.terrain.shape[:2] 49 | return (curr_x < 0) or (curr_y < 0) or (curr_x >= length) or (curr_y >= width) 50 | 51 | @abc.abstractmethod 52 | def next_neighbor(self, position: Tuple[int, int], 53 | visited: Set[Tuple[int, int]]) -> Tuple[int, int]: 54 | """ 55 | Returns the position of the lowest neighbor or the current position. 56 | 57 | Args: 58 | position: the pair of integers representing the ball's current position. 59 | visited: the set of visited points. 60 | 61 | Output: 62 | The position (pair of coordinates) of the lowest neighbor. 63 | """ 64 | 65 | @abc.abstractmethod 66 | def find_path(self, position: Tuple[int, int], 67 | visited: Set[Tuple[int, int]]) -> List[Tuple[int, int]]: 68 | """ 69 | Finds the path that the ball would follow while descending in the terrain. 70 | 71 | Args: 72 | position: the pair of integers representing the ball's current position. 73 | visited: the set of visited points (may be preset to avoid certain points). 74 | 75 | Output: 76 | The list of coordinates of the path. 77 | """ 78 | 79 | def find_paths(self, positions: List[Tuple[int, int]]) -> List[List[Tuple[int, int]]]: 80 | """ 81 | Finds paths for all provided starting positions. 82 | 83 | Args: 84 | positions: the list of positions to for which to calculate path. 85 | 86 | Output: 87 | The list of paths in the same order as positions. 88 | """ 89 | 90 | return [self.find_path(position, None) for position in positions] 91 | 92 | if __name__ == "__main__": 93 | import doctest 94 | doctest.testmod() 95 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/non_recursive_simple_pathfinder.py: -------------------------------------------------------------------------------- 1 | """Simple non-recursive path finder implementation.""" 2 | 3 | from typing import List, Tuple, Set 4 | from pathfinder.simple_pathfinder import SimplePathFinder 5 | 6 | class NonRecursiveSimplePathFinder(SimplePathFinder): 7 | """Concrete path finder that doesn't use recursion.""" 8 | 9 | def find_path(self, position: Tuple[int, int], 10 | visited: Set[Tuple[int, int]] = None) -> List[Tuple[int, int]]: 11 | """ 12 | Iteratively finds the path (without using recursion). 
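        Unlike the recursive variant, this version cannot exhaust Python's call
        stack; CPython's default recursion limit is roughly 1000 frames, which a
        long descent across a large terrain could plausibly exceed.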
13 | 14 | Example: 15 | >>> path_finder = NonRecursiveSimplePathFinder(np.array([[(-1, 2), (-2, 1), (-2, 2), (1, 0)]])) 16 | >>> path_finder.find_path((0, 2)) 17 | [(0, 2), (0, 1)] 18 | """ 19 | 20 | if visited is None: 21 | visited = set() 22 | visited.add(position) 23 | calculated_path = [position] 24 | next_position = self.next_neighbor(position, visited) 25 | 26 | while position != next_position: 27 | position = next_position 28 | visited.add(position) 29 | calculated_path.append(position) 30 | next_position = self.next_neighbor(position, visited) 31 | 32 | return calculated_path 33 | 34 | if __name__ == "__main__": 35 | import doctest 36 | doctest.testmod() 37 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/parallel_simple_pathfinder.py: -------------------------------------------------------------------------------- 1 | """Efficient parallel version of the path finder system.""" 2 | 3 | from typing import Tuple, Set 4 | 5 | import numpy as np 6 | from numba import jit 7 | from pathfinder.non_recursive_simple_pathfinder import NonRecursiveSimplePathFinder 8 | 9 | class ParallelSimplePathFinder(NonRecursiveSimplePathFinder): 10 | """Concrete path finder that uses Numba to perform operations in parallel.""" 11 | 12 | @staticmethod 13 | @jit(nopython=True, parallel=True, cache=True) 14 | def _best_neighbor(terrain: np.ndarray, position: Tuple[int, int]) -> Tuple[int, int]: 15 | curr_x, curr_y = position 16 | length, width = terrain.shape[:2] 17 | current_slope = terrain[position][1] 18 | min_altitude = terrain[position][0] 19 | min_position = position 20 | 21 | for delta_x in range(-1, 2): 22 | for delta_y in range(-1, 2): 23 | new_position = (curr_x + delta_x, curr_y + delta_y) 24 | new_x, new_y = new_position 25 | if not ((new_x < 0) or 26 | (new_y < 0) or 27 | (new_x >= length) or 28 | (new_y >= width)) and not new_position == position: 29 | new_altitude = terrain[new_position][0] 30 | if new_altitude < min_altitude or (new_altitude == min_altitude and 31 | current_slope > 0): 32 | min_altitude = new_altitude 33 | min_position = new_position 34 | return min_position 35 | 36 | def next_neighbor(self, position: Tuple[int, int], 37 | visited: Set[Tuple[int, int]]) -> Tuple[int, int]: 38 | """ 39 | Uses a vectorized clockwise search of neighbors starting at south-west. 40 | 41 | Example: 42 | >>> terrain = np.array([[(-2, 0), (2, 0), (2, 1), (3, 1)]]) 43 | >>> path_finder = ParallelSimplePathFinder(terrain) 44 | >>> path_finder.next_neighbor((0, 2), set((0, 2))) 45 | (0, 1) 46 | """ 47 | 48 | best_neighbor = ParallelSimplePathFinder._best_neighbor(self.terrain, position) 49 | if not best_neighbor in visited: 50 | return best_neighbor 51 | return position 52 | 53 | if __name__ == "__main__": 54 | import doctest 55 | doctest.testmod() 56 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/pathutils.py: -------------------------------------------------------------------------------- 1 | """Contains various path related utility classes and methods.""" 2 | 3 | from typing import List, Tuple 4 | 5 | import numpy as np 6 | 7 | class PathUtils: 8 | """Encompasses static methods to handle paths.""" 9 | 10 | @staticmethod 11 | def encode_path(terrain: np.ndarray, descend_path: List[Tuple[int, int]]) -> np.ndarray: 12 | """ 13 | Encodes the path into the terrain by setting the points's 3rd (blue) component to 255. 
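        Since 255 is the maximum 8-bit channel intensity, the encoded path shows
        up as pure blue when the array is rendered as an RGB image.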
14 | 15 | Args: 16 | terrain: the terrain's configuration comprised from (altitude, slope, [aspect]) 17 | integer pairs/triples. 18 | 19 | Output: 20 | New terrain with an extra 3rd dimension to encode the path. 21 | 22 | Example: 23 | >>> terrain = np.array([[(-1, 2), (-2, 1), (-2, 2), (1, 0)]]) 24 | >>> PathUtils.encode_path(terrain, [(0, 2), (0, 1)]) 25 | array([[[ -1, 2, 0], 26 | [ -2, 1, 255], 27 | [ -2, 2, 255], 28 | [ 1, 0, 0]]]) 29 | """ 30 | 31 | # Expand terrain with an extra dimension, as needed. 32 | if terrain.shape[2] == 2: 33 | new_shape = terrain.shape[:2] + (3,) 34 | new_terrain = np.zeros(new_shape, terrain.dtype) 35 | new_terrain[:terrain.shape[0], :terrain.shape[1], :2] = terrain 36 | else: 37 | new_terrain = np.copy(terrain) 38 | 39 | for point in descend_path: 40 | new_terrain[point][2] = 255 41 | return new_terrain 42 | 43 | @staticmethod 44 | def decode_path(terrain: np.ndarray) -> List[Tuple[int, int]]: 45 | """ 46 | Decodes the path from the terrain by picking points's whose 3rd (blue) component is 255. 47 | The reconstructed path may not be unique, which depends upon the path finder logic. 48 | 49 | Args: 50 | terrain: the terrain's configuration encoded with a single path. 51 | 52 | Output: 53 | The decoded path that is guaranteed to contain all points of the encoded path. 54 | Ordering of points may differ from what was reported by the matching path finder. 55 | """ 56 | 57 | # Extra exercise to implement this method according to the specification. 58 | raise NotImplementedError 59 | 60 | if __name__ == "__main__": 61 | import doctest 62 | doctest.testmod() 63 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/simple_pathfinder.py: -------------------------------------------------------------------------------- 1 | """Simple recursive path finder implementation.""" 2 | 3 | from typing import List, Tuple, Set 4 | from pathfinder.base_pathfinder import BasePathFinder 5 | 6 | class SimplePathFinder(BasePathFinder): 7 | """Concrete path finder that uses recursion and is sequential.""" 8 | 9 | def next_neighbor(self, position: Tuple[int, int], 10 | visited: Set[Tuple[int, int]]) -> Tuple[int, int]: 11 | """ 12 | Uses a simple clockwise search of neighbors starting at south-west. 13 | 14 | Example: 15 | >>> path_finder = SimplePathFinder(np.array([[(-2, 0), (3, 0), (2, 0), (1, 0)]])) 16 | >>> path_finder.next_neighbor((0, 1), set((0, 1))) 17 | (0, 0) 18 | """ 19 | 20 | curr_x, curr_y = position 21 | current_slope = self.terrain[position][1] 22 | min_altitude = self.terrain[position][0] 23 | min_position = position 24 | for delta_x in range(-1, 2): 25 | for delta_y in range(-1, 2): 26 | new_position = (curr_x + delta_x, curr_y + delta_y) 27 | if not self.wall(new_position) and not new_position in visited: 28 | new_altitude = self.terrain[new_position][0] 29 | if new_altitude < min_altitude or (new_altitude == min_altitude and 30 | current_slope > 0): 31 | min_altitude = new_altitude 32 | min_position = new_position 33 | return min_position 34 | 35 | def find_path(self, position: Tuple[int, int], 36 | visited: Set[Tuple[int, int]] = None) -> List[Tuple[int, int]]: 37 | """ 38 | Recursively finds the path. 
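        Every visited position is recorded along the way, so plateaus (runs of
        equal altitude) cannot trap the search in an endless back-and-forth loop.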
39 | 40 | Example: 41 | >>> path_finder = SimplePathFinder(np.array([[(-1, 2), (-2, 1), (-2, 2), (1, 0)]])) 42 | >>> path_finder.find_path((0, 2)) 43 | [(0, 2), (0, 1)] 44 | """ 45 | 46 | if visited is None: 47 | visited = set() 48 | visited.add(position) 49 | next_position = self.next_neighbor(position, visited) 50 | if position == next_position: 51 | return [position] 52 | return [position] + self.find_path(next_position, visited) 53 | 54 | if __name__ == "__main__": 55 | import doctest 56 | doctest.testmod() 57 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/terrain_data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/terrain_data/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/terrain_data/coastline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/terrain_data/coastline.jpg -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/terrain_data/coastline_with_path.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/terrain_data/coastline_with_path.jpg -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/testutils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/testutils/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/testutils/__init__.py: -------------------------------------------------------------------------------- 1 | from testutils.create_terrain import create_test_terrain -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/testutils/create_terrain.py: -------------------------------------------------------------------------------- 1 | """Creates a degenerate terrain for measuring various running times.""" 2 | 3 | import numpy as np 4 | 5 | def create_test_terrain(n: int) -> np.ndarray: 6 | """Creates a square maze-like terrain with alleys of decreasing altitude. 7 | 8 | Args: 9 | n: number of rows and columns of a terrain 10 | 11 | Output: 12 | The test terrain of proper size. 
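        Altitudes increase along a serpentine corridor, while every other row is
        blocked by a wall of the maximal value n*n with a single gap at one end,
        so a ball released at the highest cell must snake through every open
        alley before settling at altitude 0.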
13 | 14 | Example: 15 | >>> terrain = create_test_terrain(9) 16 | >>> terrain[:, :, 0] 17 | array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8], 18 | [81, 81, 81, 81, 81, 81, 81, 81, 17], 19 | [26, 25, 24, 23, 22, 21, 20, 19, 18], 20 | [27, 81, 81, 81, 81, 81, 81, 81, 81], 21 | [36, 37, 38, 39, 40, 41, 42, 43, 44], 22 | [81, 81, 81, 81, 81, 81, 81, 81, 53], 23 | [62, 61, 60, 59, 58, 57, 56, 55, 54], 24 | [63, 81, 81, 81, 81, 81, 81, 81, 81], 25 | [72, 73, 74, 75, 76, 77, 78, 79, 80]]) 26 | """ 27 | 28 | size = n * n 29 | terrain = np.zeros((n, n, 2), dtype=int) 30 | terrain[:, :, 0] = np.arange(0, size).reshape((n, n)) 31 | 32 | # Reverse every 4th row to have proper ordering of elements. 33 | for i in range(2, n, 4): 34 | terrain[i, :, 0] = np.flip(terrain[i, :, 0]) 35 | 36 | # Create "walls" inside the terrain. 37 | for i in range(1, n, 4): 38 | terrain[i, :-1, 0] = size 39 | for i in range(3, n, 4): 40 | terrain[i, 1:, 0] = size 41 | 42 | return terrain 43 | 44 | if __name__ == "__main__": 45 | import doctest 46 | doctest.testmod() 47 | -------------------------------------------------------------------------------- /ch6/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch6/.DS_Store -------------------------------------------------------------------------------- /ch6/anscombe/anscombe_altair.py: -------------------------------------------------------------------------------- 1 | import altair as alt 2 | from vega_datasets import data 3 | 4 | source = data.anscombe() 5 | 6 | base = alt.Chart( 7 | source, title = "Anscombe's Quartets" 8 | ).mark_circle(color = 'red').encode( 9 | alt.X('X', scale = alt.Scale(zero = True)), 10 | alt.Y('Y', scale = alt.Scale(zero = True)), 11 | column = 'Series' 12 | ).properties( 13 | width = 150, 14 | height = 150 15 | ).interactive() 16 | 17 | base 18 | -------------------------------------------------------------------------------- /ch6/anscombe/anscombe_matplotlib.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | quartets = np.asarray([ 5 | ( 6 | [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], 7 | [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68] 8 | ), 9 | ( 10 | [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], 11 | [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74] 12 | ), 13 | ( 14 | [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], 15 | [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73] 16 | ), 17 | ( 18 | [8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 19.0, 8.0, 8.0, 8.0], 19 | [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89] 20 | ) 21 | ]) 22 | 23 | roman = ['I', 'II', 'III', 'IV'] 24 | 25 | fig = plt.figure(figsize = (12, 9)) 26 | fig.suptitle("Anscombe's Quartets", fontsize=16) 27 | axes = fig.subplots(2, 2, sharex = True, sharey = True) 28 | 29 | for quartet in range(quartets.shape[0]): 30 | x, y = quartets[quartet] 31 | coef = np.polyfit(x, y, 1) 32 | reg_line = np.poly1d(coef) 33 | 34 | ax = axes[quartet // 2, quartet % 2] 35 | ax.plot(x, y, 'ro', x, reg_line(x), '--k') 36 | ax.set_title(roman[quartet]) 37 | ax.set_xlim(3, 19.5) 38 | ax.set_ylim(2, 13) 39 | 40 | # Print summary statistics for the current dataset 41 | print("Quartet:", roman[quartet]) 42 | print("Mean X:", x.mean()) 43 | print("Variance 
X:", x.var()) 44 | print("Mean Y:", round(y.mean(), 2)) 45 | print("Variance Y:", round(y.var(), 2)) 46 | print("Pearson's correlation coef.:", round(np.corrcoef(x, y)[0][1], 2)) 47 | print() 48 | 49 | plt.show() -------------------------------------------------------------------------------- /ch6/closest_pair/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch6/closest_pair/.DS_Store -------------------------------------------------------------------------------- /ch6/closest_pair/__init__.py: -------------------------------------------------------------------------------- 1 | from closest_pair.base_closest_pair import BaseClosestPair 2 | from closest_pair.naive_closest_pair import NaiveClosestPair 3 | from closest_pair.fast_closest_pair import FastClosestPair -------------------------------------------------------------------------------- /ch6/closest_pair/base_closest_pair.py: -------------------------------------------------------------------------------- 1 | """The base class for implementing various variants to find the closest pair.""" 2 | 3 | import abc 4 | from typing import Tuple, Callable, TypeVar, Sequence, Generic 5 | import numpy as np 6 | 7 | Coordinates = TypeVar('Coordinates', Sequence[int], np.ndarray) 8 | 9 | class BaseClosestPair(Generic[Coordinates], metaclass=abc.ABCMeta): 10 | """ 11 | Finds the closest pair among 2D points given by their x and y coordinates. The 12 | distance is by default defined as a standard Euclidian distance. 13 | 14 | Args: 15 | x: the list of x coordinates of all points. 16 | y: the list of y coordinates of all points. The ordering of elements matches 17 | the list of x coordinates, i.e., the ith point is specified as (x[i], y[i]). 18 | """ 19 | 20 | _x: Coordinates 21 | _y: Coordinates 22 | 23 | def __init__(self, x: Coordinates, y: Coordinates): 24 | assert len(x) >= 2 and len(x) == len(y) 25 | self._x = x 26 | self._y = y 27 | 28 | @property 29 | def x(self) -> Coordinates: 30 | """Gets the x coordinates of points.""" 31 | return self._x 32 | 33 | @property 34 | def y(self) -> Coordinates: 35 | """Gets the y coordinates of points.""" 36 | return self._y 37 | 38 | @staticmethod 39 | def load_from_stdin() -> Tuple[Coordinates, Coordinates]: 40 | """ 41 | Loads points from standard input by enumarating x and y coordinates in succession. 42 | Each datum must be separated with space. 43 | 44 | Output: 45 | The tuple of x and y coordinates. 46 | """ 47 | 48 | import sys 49 | 50 | data = sys.stdin.read() 51 | points = list(map(int, data.split())) 52 | x = points[1::2] 53 | y = points[2::2] 54 | return x, y 55 | 56 | @staticmethod 57 | def generate_points(n: int, seed: int) -> Tuple[Coordinates, Coordinates]: 58 | """ 59 | Generates random points for stress testing. 60 | 61 | Output: 62 | The tuple of x and y coordinates. 63 | 64 | Examples: 65 | >>> BaseClosestPair.generate_points(3, 10) 66 | ([227077737, -930024104, -78967768], [36293302, 241441628, -968147565]) 67 | """ 68 | 69 | import random 70 | 71 | assert n >= 2 72 | random.seed(seed) 73 | x = [random.randint(-10**9, 10**9) for _ in range(n)] 74 | y = [random.randint(-10**9, 10**9) for _ in range(n)] 75 | 76 | return x, y 77 | 78 | @staticmethod 79 | def distance(x1: int, x2: int, y1: int, y2: int) -> float: 80 | """ 81 | Returns the Euclidian distance between two points. 
82 | 
83 |         Args:
84 |         x1: the x coordinate of the first point.
85 |         x2: the x coordinate of the second point.
86 |         y1: the y coordinate of the first point.
87 |         y2: the y coordinate of the second point.
88 | 
89 |         Output:
90 |         The distance between points defined as the square root of the sum of squared
91 |         differences of the matching coordinates.
92 | 
93 |         Examples:
94 |         >>> BaseClosestPair.distance(1, 2, 1, 2)
95 |         1.4142135623730951
96 |         >>> BaseClosestPair.distance(1, 1, 1, 1)
97 |         0.0
98 |         """
99 | 
100 |         from math import sqrt
101 | 
102 |         return sqrt((x1 - x2)**2 + (y1 - y2)**2)
103 | 
104 |     @abc.abstractmethod
105 |     def closest_pair(self, distance: Callable[[int, int, int, int], float]) -> Tuple[int, int, float]:
106 |         """
107 |         Returns the tuple with indexes of the two closest points as well as
108 |         their distance.
109 | 
110 |         Args:
111 |         distance: the function that receives four parameters (x1, x2, y1, y2) and
112 |         returns the distance between these points.
113 |         """
114 | 
115 | if __name__ == "__main__":
116 |     import doctest
117 |     doctest.testmod()
118 | 
-------------------------------------------------------------------------------- /ch6/closest_pair/fast_closest_pair.py: --------------------------------------------------------------------------------
1 | """Fast implementation of the closest pair algorithm."""
2 | 
3 | from typing import List, Tuple, Callable
4 | from closest_pair.base_closest_pair import Coordinates, BaseClosestPair
5 | 
6 | class FastClosestPair(BaseClosestPair):
7 |     _y_prime: List[int]
8 | 
9 |     def _argsort_y(self) -> List[int]:
10 |         """Finds the permutation of indices that arranges points by y coordinate."""
11 | 
12 |         return [t[0] for t in sorted(enumerate(self.y), key = lambda t: t[1])]
13 | 
14 |     def _get_x(self, i: int, s: List[int]) -> int:
15 |         return self.x[self._y_prime[s[i]]]
16 | 
17 |     def _get_y(self, i: int, s: List[int]) -> int:
18 |         return self.y[self._y_prime[s[i]]]
19 | 
20 |     def __init__(self, x: Coordinates, y: Coordinates):
21 |         super().__init__(x, y)
22 |         self._y_prime = self._argsort_y()
23 | 
24 |     def _selection(self, s: List[int], k: int) -> int:
25 |         """Returns the x value of the kth smallest point by x coordinate contained in s."""
26 | 
27 |         def split(v: int) -> Tuple[List[int], List[int], List[int]]:
28 |             """Indirectly splits points in-place around value v into 3 sets (left, equal, and right)."""
29 | 
30 |             store = 0
31 |             sl_idx = 0
32 |             for i in range(len(s)):
33 |                 if self._get_x(i, s) < v:
34 |                     s[i], s[store] = s[store], s[i]
35 |                     store += 1
36 |             sl_idx = store
37 |             for i in range(store, len(s)):
38 |                 if self._get_x(i, s) == v:
39 |                     s[i], s[store] = s[store], s[i]
40 |                     store += 1
41 |             return (s[:sl_idx], s[sl_idx:store], s[store:])
42 | 
43 |         import random
44 | 
45 |         v_idx = random.randrange(len(s))
46 |         v = self._get_x(v_idx, s)
47 |         sl, sv, sr = split(v)
48 |         sl_size = len(sl)
49 |         sv_size = len(sv)
50 | 
51 |         if k <= sl_size:
52 |             return self._selection(sl, k)
53 |         if k > sl_size and k <= sl_size + sv_size:
54 |             return self._get_x(-1, sv)
55 |         return self._selection(sr, k - sl_size - sv_size)
56 | 
57 |     @staticmethod
58 |     def _merge(sl: List[int], sr: List[int]) -> List[int]:
59 |         """
60 |         Merges the two sorted sublists into a new sorted list. The temporary
61 |         storage may be allocated upfront as a further optimization.
62 | """ 63 | 64 | sl_size = len(sl) 65 | sr_size = len(sr) 66 | s = [0] * (sl_size + sr_size) 67 | k = 0 68 | i = 0 69 | j = 0 70 | 71 | while i < sl_size and j < sr_size: 72 | if sl[i] <= sr[j]: 73 | s[k] = sl[i] 74 | k += 1 75 | i += 1 76 | else: 77 | s[k] = sr[j] 78 | k += 1 79 | j += 1 80 | while i < sl_size: 81 | s[k] = sl[i] 82 | k += 1 83 | i += 1 84 | while j < sr_size: 85 | s[k] = sr[j] 86 | k += 1 87 | j += 1 88 | 89 | return s 90 | 91 | def closest_pair(self, 92 | distance: Callable[[int, int, int, int], float] = BaseClosestPair.distance 93 | ) -> Tuple[int, int, float]: 94 | """ 95 | Computes the minimum distance in O(n*log n) time. 96 | 97 | Examples: 98 | >>> x = [0, 3, 100] 99 | >>> y = [0, 4, 110] 100 | >>> fcp = FastClosestPair(x, y) 101 | >>> fcp.closest_pair() 102 | (0, 1, 5.0) 103 | """ 104 | 105 | from math import inf 106 | 107 | def filter_points(s: List[int], d: float, x: int) -> List[int]: 108 | """Returns the list of point indexes that fall inside the [x-d, x+d] interval.""" 109 | 110 | return [s[i] for i in range(len(s)) if abs(self._get_x(i, s) - x) <= d] 111 | 112 | def find_nearest_neighbor(i: int, s: List[int]) -> Tuple[float, int, int]: 113 | """ 114 | Finds the minimum distance between the current point i and next 7 seven 115 | subsequent points by y coordinate. 116 | """ 117 | 118 | curr_x = self._get_x(i, s) 119 | curr_y = self._get_y(i, s) 120 | d = inf 121 | min_idx = i 122 | 123 | for j in range(i + 1, min(len(s), i + 7 + 1)): 124 | curr_d = distance(curr_x, self._get_x(j, s), curr_y, self._get_y(j, s)) 125 | if curr_d < d: 126 | d = curr_d 127 | min_idx = j 128 | return d, s[i], s[min_idx] 129 | 130 | def find_minimum_distance(s: List[int]) -> Tuple[int, int, float]: 131 | """Main driver function to find the closest pair.""" 132 | 133 | if len(s) == 1: 134 | # We will treat the distance from a single point as infinite. 135 | return s[0], -1, inf 136 | if len(s) == 2: 137 | return s[0], s[1], distance(self._get_x(0, s), 138 | self._get_x(1, s), 139 | self._get_y(0, s), 140 | self._get_y(1, s)) 141 | 142 | # This is the median value of input array x in regard of s. 143 | median_x = self._selection(s.copy(), len(s) // 2) 144 | 145 | # Separate points around median. 146 | sl = [] 147 | sr = [] 148 | for i in range(len(s)): 149 | if self._get_x(i, s) <= median_x: 150 | sl.append(s[i]) 151 | else: 152 | sr.append(s[i]) 153 | 154 | # Find minimum distances in left and right groups. 155 | p_l, q_l, d_l = find_minimum_distance(sl) 156 | p_r, q_r, d_r = find_minimum_distance(sr) 157 | if d_l < d_r: 158 | p_min, q_min = p_l, q_l 159 | d = d_l 160 | else: 161 | p_min, q_min = p_r, q_r 162 | d = d_r 163 | 164 | # Merge left and right indices keeping their sorted order. 165 | sm = FastClosestPair._merge(sl, sr) 166 | 167 | # Find the minimum distance inside the middle strip. 168 | sf = filter_points(sm, d, median_x) 169 | 170 | # Find the final minimum distance amond three groups (left, middle, and right). 171 | d_m, p_m, q_m = min([find_nearest_neighbor(i, sf) for i in range(len(sf))]) 172 | if d_m < d: 173 | return p_m, q_m, d_m 174 | else: 175 | return p_min, q_min, d 176 | 177 | p, q, d = find_minimum_distance(list(range(len(self._y_prime)))) 178 | # We need to map back the point indices into their original base. 
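        # For instance (a purely hypothetical run), with _y_prime == [2, 0, 1] and the
        # recursion reporting p = 0 and q = 1, the winners are original points 2 and 0.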
179 | return self._y_prime[p], self._y_prime[q], d 180 | 181 | if __name__ == "__main__": 182 | import doctest 183 | doctest.testmod() 184 | 185 | -------------------------------------------------------------------------------- /ch6/closest_pair/naive_closest_pair.py: -------------------------------------------------------------------------------- 1 | """Naive implementation of the closest pair algorithm.""" 2 | 3 | from typing import Tuple, Callable 4 | from closest_pair.base_closest_pair import BaseClosestPair 5 | 6 | class NaiveClosestPair(BaseClosestPair): 7 | def closest_pair(self, distance: Callable[[int, int, int, int], float] = BaseClosestPair.distance 8 | ) -> Tuple[int, int, float]: 9 | """ 10 | Iterates over all pairs and computes their distances. 11 | 12 | Examples: 13 | >>> x = [0, 3, 100] 14 | >>> y = [0, 4, 110] 15 | >>> ncp = NaiveClosestPair(x, y) 16 | >>> ncp.closest_pair() 17 | (0, 1, 5.0) 18 | """ 19 | 20 | from math import inf 21 | 22 | n = len(self.x) 23 | min_distance = inf 24 | for i in range(n - 1): 25 | for j in range(i + 1, n): 26 | d = distance(self.x[i], self.x[j], self.y[i], self.y[j]) 27 | if d < min_distance: 28 | min_distance = d 29 | p_i = i 30 | p_j = j 31 | 32 | return p_i, p_j, min_distance 33 | 34 | if __name__ == "__main__": 35 | import doctest 36 | doctest.testmod() 37 | -------------------------------------------------------------------------------- /ch6/temp_plots/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch6/temp_plots/.DS_Store -------------------------------------------------------------------------------- /ch6/temp_plots/GHCND_sample_csv.csv: -------------------------------------------------------------------------------- 1 | STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,TMAX,TMIN,PRCP 2 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100101,-178,-311,0 3 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100102,-244,-322,0 4 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100103,-194,-289,0 5 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100104,-167,-200,15 6 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100105,-133,-167,9999 7 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100106,-133,-172,9999 8 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100107,-150,-278,0 9 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100108,-233,-328,0 10 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100109,-233,-322,0 11 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100110,-117,-244,0 12 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100111,-67,-128,0 13 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100112,-78,-122,0 14 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100113,-17,-89,0 15 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100114,39,-72,0 16 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100115,-67,-72,0 17 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100116,22,-50,0 18 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100117,33,-44,0 19 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100118,6,-172,0 20 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100119,-56,-183,0 
21 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100120,-67,-139,0 22 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100121,-67,-94,25 23 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100122,-44,-67,0 24 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100123,-6,-44,0 25 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100124,0,-11,0 26 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100125,-11,-161,0 27 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100126,-161,-233,0 28 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100127,-167,-222,0 29 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100128,-167,-283,0 30 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100129,-189,-283,0 31 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100130,-156,-267,0 32 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100131,-150,-272,0 33 | -------------------------------------------------------------------------------- /ch6/temp_plots/plot_stations.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import mplleaflet 3 | 4 | def plot_stations(longitudes, latitudes, embedded = False): 5 | if embedded: 6 | plt.figure(figsize = (8, 8)) 7 | plt.scatter(longitudes, latitudes, 8 | c = 'b', 9 | marker = 'D', 10 | alpha = 0.7, 11 | s = 200) 12 | return mplleaflet.display() if embedded else mplleaflet.show() 13 | -------------------------------------------------------------------------------- /ch6/temp_plots/plot_temps.py: -------------------------------------------------------------------------------- 1 | from matplotlib.ticker import MultipleLocator 2 | 3 | def plot_temps(df, min_temp, max_temp, extreme_low_temps, extreme_high_temps): 4 | ax1 = df.plot.line(y = ['TMAX', 'TMIN'], 5 | figsize = (12, 9), 6 | ylim = (1.3 * min_temp, 1.3 * max_temp), 7 | rot = 45, fontsize = 12, style = ['-r', '-b'], linewidth = 0.6, 8 | legend = False, 9 | x_compat = True) 10 | ax1.lines[0].set_label('Max. temperature') 11 | ax1.lines[-1].set_label('Min. 
temperature')
12 |     ax1.set_title('Low and High Temperatures in January 2010\nNorth Dakota, United States',
13 |                   fontsize = 20, y = 1.06)
14 |     ax1.set_xlabel('Date', fontsize = 14, labelpad = 15)
15 |     ax1.set_ylabel('Temperature [\u2103]', fontsize = 14)
16 |     ax1.spines['right'].set_visible(False)
17 |     ax1.spines['top'].set_visible(False)
18 |     ax1.yaxis.set_minor_locator(MultipleLocator(5))
19 |     ax1.fill_between(df.index, df['TMAX'], df['TMIN'],
20 |                      facecolor = 'lightgray', alpha = 0.25)
21 | 
22 |     def celsius_to_fahrenheit(temp):
23 |         return 1.8 * temp + 32
24 | 
25 |     ax2 = ax1.twinx()
26 |     y_min, y_max = ax1.get_ylim()
27 |     ax2.set_ylim(celsius_to_fahrenheit(y_min), celsius_to_fahrenheit(y_max))
28 |     ax2.set_ylabel('Temperature [\u2109]', fontsize = 14, labelpad = 15)
29 |     ax2.spines['top'].set_visible(False)
30 |     ax2.yaxis.set_minor_locator(MultipleLocator(5))
31 |     for label in ax2.get_yticklabels():
32 |         label.set_fontsize(12)
33 | 
34 |     ax1.scatter(extreme_low_temps.index, extreme_low_temps,
35 |                 color = 'blue', marker = 'v', s = 100,
36 |                 label = 'Unusually low temperatures')
37 |     ax1.scatter(extreme_high_temps.index, extreme_high_temps,
38 |                 color = 'red', marker = '^', s = 100,
39 |                 label = 'Unusually high temperatures')
40 |     ax1.legend(loc = 4, frameon = False, title = 'Legend')
41 | 
--------------------------------------------------------------------------------
/ch6/temp_plots/temp_visualization_demo.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | df = pd.read_csv('GHCND_sample_csv.csv',
4 |                  usecols = [3, 4, 5, 6, 7],
5 |                  index_col = 2,
6 |                  parse_dates = True,
7 |                  infer_datetime_format = True)
8 | df['TMIN'] = df['TMIN'] / 10
9 | df['TMAX'] = df['TMAX'] / 10
10 | print(df.head())
11 | 
12 | from plot_stations import plot_stations
13 | plot_stations(df['LONGITUDE'].tolist()[0], df['LATITUDE'].tolist()[0])
14 | 
15 | min_temp = df['TMIN'].min()
16 | max_temp = df['TMAX'].max()
17 | print("\nMinimum temperature: %g\nMaximum temperature: %g\n" % (min_temp, max_temp))
18 | 
19 | LIMIT_HIGH = 0
20 | LIMIT_LOW = -30
21 | 
22 | extreme_high_temps = df['TMAX'][df['TMAX'] > LIMIT_HIGH]
23 | extreme_low_temps = df['TMIN'][df['TMIN'] < LIMIT_LOW]
24 | 
25 | print('Extreme low temperatures\n', extreme_low_temps)
26 | print('\nExtreme high temperatures\n', extreme_high_temps)
27 | 
28 | from plot_temps import plot_temps
29 | plot_temps(df, min_temp, max_temp, extreme_low_temps, extreme_high_temps)
30 | 
--------------------------------------------------------------------------------
/ch7/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch7/.DS_Store
--------------------------------------------------------------------------------
/ch7/core_concepts/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch7/core_concepts/.DS_Store
--------------------------------------------------------------------------------
/ch7/core_concepts/data_generator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Produces features and outputs based upon various criteria by simulating
5 | fake "real world" processes.
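The returned frame holds five related columns: x_normal (Gaussian), x_uniform,
x_interacting (their product), x_combined (scaled x_normal plus exponential noise),
and x_collinear (an exact multiple of x_combined).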
6 | 7 | @author: Ervin Varga 8 | """ 9 | import numpy as np 10 | import pandas as pd 11 | 12 | def generate_base_features(sample_size): 13 | x_normal = np.random.normal(6, 9, sample_size) 14 | x_uniform = np.random.uniform(0, 1, sample_size) 15 | x_interacting = x_normal * x_uniform 16 | x_combined = 3.6 * x_normal + np.random.exponential(2/3, sample_size) 17 | x_collinear = 5.6 * x_combined 18 | 19 | features = { 20 | 'x_normal': x_normal, 21 | 'x_uniform': x_uniform, 22 | 'x_interacting': x_interacting, 23 | 'x_combined': x_combined, 24 | 'x_collinear': x_collinear 25 | } 26 | return pd.DataFrame.from_dict(features) 27 | 28 | def identity(x): 29 | return x 30 | 31 | def generate_response(X, error_spread, beta, f=identity): 32 | error = np.random.normal(0, error_spread, (X.shape[0], 1)) 33 | intercept = beta[0] 34 | coef = np.array(beta[1:]).reshape(X.shape[1], 1) 35 | return f(intercept + np.dot(X, coef)) + error -------------------------------------------------------------------------------- /ch7/core_concepts/observer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Contains functions to recover parameters and demonstrate various effects 5 | pertaining to training, testing, and evaluation. 6 | 7 | @author: Ervin Varga 8 | """ 9 | import numpy as np 10 | import pandas as pd 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | 14 | plt.style.use('seaborn-whitegrid') 15 | 16 | def train_model(model, X_train, y_train): 17 | model.fit(X_train, y_train) 18 | 19 | def evaluate_model(model, X_test, y_test, plot_residuals=False, title=''): 20 | from sklearn.metrics import mean_squared_error, explained_variance_score 21 | 22 | y_pred = model.predict(X_test) 23 | 24 | if plot_residuals: 25 | _, ax = plt.subplots(figsize=(9, 9)) 26 | ax.set_title('Residuals Plot - ' + title, fontsize=19) 27 | ax.set_xlabel('Predicted values', fontsize=15) 28 | ax.set_ylabel('Residuals', fontsize=15) 29 | sns.residplot(y_pred.squeeze(), y_test.squeeze(), 30 | lowess=True, 31 | ax=ax, 32 | scatter_kws={'alpha': 0.3}, 33 | line_kws={'color': 'black', 'lw': 2, 'ls': '--'}) 34 | 35 | metrics = { 36 | 'explained_variance': explained_variance_score(y_test, y_pred), 37 | 'mse': mean_squared_error(y_test, y_pred) 38 | } 39 | return metrics 40 | 41 | def make_poly_pipeline(model, degree): 42 | from sklearn.pipeline import make_pipeline 43 | from sklearn.preprocessing import PolynomialFeatures 44 | 45 | return make_pipeline(PolynomialFeatures(degree=degree, include_bias=False), model) 46 | 47 | def print_parameters(linear_model, metrics): 48 | print('Intercept: %.3f' % linear_model.intercept_) 49 | print('Coefficients: \n', linear_model.coef_) 50 | print('Explained variance score: %.3f' % metrics['explained_variance']) 51 | print("Mean squared error: %.3f" % metrics['mse']) 52 | 53 | def plot_mse(model, X, y, title, error_spread): 54 | def collect_mse(): 55 | from sklearn.model_selection import train_test_split 56 | from sklearn.model_selection import cross_val_score 57 | 58 | metrics_all = [] 59 | for train_size_pct in range(10, 110, 10): 60 | X_train, X_test, y_train, y_test = \ 61 | train_test_split(X, y, shuffle=False, train_size=train_size_pct / 100) 62 | metrics_current = dict() 63 | metrics_current['percent_train'] = train_size_pct 64 | train_model(model, X_train, y_train) 65 | metrics_train = evaluate_model(model, X_train, y_train) 66 | metrics_current['Training score'] = metrics_train['mse'] 67 | 
metrics_cv = cross_val_score( 68 | model, 69 | X_train, y_train, 70 | scoring='neg_mean_squared_error', 71 | cv=10) 72 | metrics_current['CV score'] = -metrics_cv.mean() 73 | if X_test.shape[0] > 0: 74 | metrics_test = evaluate_model(model, X_test, y_test) 75 | metrics_current['Testing score'] = metrics_test['mse'] 76 | else: 77 | metrics_current['Testing score'] = np.NaN 78 | metrics_all.append(metrics_current) 79 | return pd.DataFrame.from_records(metrics_all) 80 | 81 | import matplotlib.ticker as mtick 82 | 83 | df = collect_mse() 84 | error_variance = error_spread**2 85 | ax = df.plot( 86 | x='percent_train', 87 | title=title, 88 | kind='line', 89 | xticks=range(10, 110, 10), 90 | sort_columns=True, 91 | style=['b+--', 'ro-', 'gx:'], 92 | markersize=10.0, 93 | grid=False, 94 | figsize=(8, 6), 95 | lw=2) 96 | ax.set_xlabel('Training set size', fontsize=15) 97 | ax.xaxis.set_major_formatter(mtick.PercentFormatter()) 98 | y_min, y_max = ax.get_ylim() 99 | # FIX ME: See Exercise 3! 100 | ax.set_ylim(max(0, y_min), min(2 * error_variance, y_max)) 101 | ax.set_ylabel('MSE', fontsize=15) 102 | ax.title.set_size(19) 103 | 104 | # Draw and annotate the minimum MSE. 105 | ax.axhline(error_variance, color='g', ls='--', lw=1) 106 | ax.annotate( 107 | 'Inherent error level', 108 | xy=(15, error_variance), 109 | textcoords='offset pixels', 110 | xytext=(10, 80), 111 | arrowprops=dict(facecolor='black', width=1, shrink=0.05)) 112 | 113 | def explain_sse(slope, intercept, x, y): 114 | # Configure the diagram. 115 | _, ax = plt.subplots(figsize=(7, 9)) 116 | ax.set_xlabel('x', fontsize=15) 117 | ax.set_ylabel('y', fontsize=15) 118 | ax.set_title(r'$SSE = \sum_{i=1}^n (y_i - \hat{y}_i)^2$', fontsize=19) 119 | ax.grid(False) 120 | ax.spines["top"].set_visible(False) 121 | ax.spines["right"].set_visible(False) 122 | ax.tick_params(direction='out', length=6, width=2, colors='black') 123 | 124 | # Show x-y pairs. 125 | ax.scatter(x, y, alpha=0.5, marker='x') 126 | 127 | # Draw the regression line. 128 | xlims = np.array([np.min(x), np.max(x)]) 129 | ax.plot(xlims, slope * xlims + intercept, lw=2, color='b') 130 | 131 | # Draw the error terms. 132 | for x_i, y_i in zip(x, y): 133 | ax.plot([x_i, x_i], [y_i, slope * x_i + intercept], color='r', lw=2, ls='--') -------------------------------------------------------------------------------- /ch7/core_concepts/session.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Contains functions that depict steps to reconstruct different world parameters 5 | from observations using various noise levels. 6 | 7 | @author: Ervin Varga 8 | """ 9 | import warnings 10 | warnings.simplefilter(action='ignore', category=FutureWarning) 11 | 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.linear_model import LinearRegression 15 | 16 | from data_generator import * 17 | from observer import * 18 | 19 | def set_session_seed(seed): 20 | np.random.seed(seed) # Enables perfect reproduction of published results. 
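# Every demo below starts by reseeding via set_session_seed, so each experiment can
# be reproduced on its own, independently of the order in which the demos are run.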
21 | 22 | def demo_metrics_and_mse(): 23 | set_session_seed(100) 24 | X = generate_base_features(1000)[['x_normal']] 25 | for noise_level in [0, 2, 15]: 26 | y = generate_response(X, noise_level, [-1.5, 4.1]) 27 | model = LinearRegression() 28 | train_model(model, X, y) 29 | metrics = evaluate_model(model, X, y) 30 | 31 | print('\nIteration with noise level: %d' % noise_level) 32 | print_parameters(model, metrics) 33 | 34 | # Visualize the regression line and error terms. 35 | if noise_level == 15: 36 | slope = model.coef_[0][0] 37 | intercept = model.intercept_ 38 | explain_sse(slope, intercept, X[:15].values, y[:15]) 39 | 40 | def demo_overfitting(): 41 | def visualize_overfitting(): 42 | train_model(optimal_model, X, y) 43 | train_model(complex_model, X, y) 44 | 45 | _, ax = plt.subplots(figsize=(9, 7)) 46 | ax.set_yticklabels([]) 47 | ax.set_xticklabels([]) 48 | ax.grid(False) 49 | 50 | X_test = np.linspace(0, 1.2, 100) 51 | plt.plot(X_test, np.sin(2 * np.pi * X_test), label='True function') 52 | plt.plot( 53 | X_test, 54 | optimal_model.predict(X_test[:, np.newaxis]), 55 | label='Optimal model', 56 | ls='-.') 57 | plt.plot( 58 | X_test, 59 | complex_model.predict(X_test[:, np.newaxis]), 60 | label='Complex model', 61 | ls='--', 62 | lw=2, 63 | color='red') 64 | plt.scatter(X, y, alpha=0.2, edgecolor='b', s=20, label='Training Samples') 65 | ax.fill_between(X_test, -2, 2, where=X_test > 1, hatch='/', alpha=0.05, color='black') 66 | plt.xlabel('x', fontsize=15) 67 | plt.ylabel('y', fontsize=15) 68 | plt.xlim((0, 1.2)) 69 | plt.ylim((-2, 2)) 70 | plt.legend(loc='upper left') 71 | plt.title('Visualization of How Overfitting Occurs', fontsize=19) 72 | plt.show() 73 | 74 | set_session_seed(172) 75 | X = generate_base_features(120)[['x_uniform']] 76 | y = generate_response(X, 0.1, [0, 2 * np.pi], f=np.sin) 77 | 78 | optimal_model = make_poly_pipeline(LinearRegression(), 5) 79 | plot_mse(optimal_model, X, y, 'Optimal Model', 0.1) 80 | complex_model = make_poly_pipeline(LinearRegression(), 35) 81 | plot_mse(complex_model, X, y, 'Complex Model', 0.1) 82 | 83 | visualize_overfitting() 84 | 85 | def demo_underfitting(): 86 | set_session_seed(15) 87 | X = generate_base_features(200) 88 | X_interacting = X[['x_interacting']] 89 | y = generate_response(X_interacting, 2, [1.7, -4.3]) 90 | plot_mse(LinearRegression(), X_interacting, y, 'Optimal Model', 2) 91 | X_weak = X[['x_normal', 'x_uniform']] 92 | plot_mse(LinearRegression(), X_weak, y, 'Weak Model', 2) 93 | 94 | def demo_collinearity(): 95 | set_session_seed(10) 96 | X = generate_base_features(1000) 97 | X_world = X[['x_normal', 'x_combined']] 98 | y = generate_response(X_world, 2, [1.1, -2.3, 3.1]) 99 | 100 | model = LinearRegression() 101 | # Showcase the first assumed model. 102 | train_model(model, X_world, y) 103 | metrics = evaluate_model(model, X_world, y) 104 | print('\nDumping stats for model 1') 105 | print_parameters(model, metrics) 106 | 107 | # Showcase the second assumed model. 108 | X_extended_world = X[['x_normal', 'x_combined', 'x_collinear']] 109 | train_model(model, X_extended_world, y) 110 | metrics = evaluate_model(model, X_extended_world, y) 111 | print('\nDumping stats for model 2') 112 | print_parameters(model, metrics) 113 | 114 | # Produce a scatter matrix plot. 
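    # Since x_collinear is generated as an exact multiple of x_combined, their panel
    # in the scatter matrix should show up as a straight line, making the collinearity
    # the second assumed model is exposed to visually obvious.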
115 |     df = X
116 |     df.columns = ['x' + str(i + 1) for i in range(len(df.columns))]
117 |     df['y'] = y
118 |     pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal='kde')
119 | 
120 | def demo_residuals():
121 |     def plot_regression_line(x, y, case_num):
122 |         _, ax = plt.subplots(figsize=(9, 9))
123 |         ax.set_title('Regression Plot - Case ' + str(case_num), fontsize=19)
124 |         ax.set_xlabel('x', fontsize=15)
125 |         ax.set_ylabel('y', fontsize=15)
126 |         sns.regplot(x.squeeze(), y.squeeze(),
127 |                     ci=None,
128 |                     ax=ax,
129 |                     scatter_kws={'alpha': 0.3},
130 |                     line_kws={'color': 'green', 'lw': 3})
131 | 
132 |     set_session_seed(100)
133 |     X = generate_base_features(1000)
134 |     X1 = X[['x_normal']]
135 |     y1 = generate_response(X1, 0.04, [1.2, 0.00003])
136 |     X2 = X1**2
137 |     y2 = generate_response(X2, 0.04, [1.2, 0.00003])
138 | 
139 |     model = LinearRegression()
140 |     # Showcase the first world with a linearly assumed model.
141 |     plot_regression_line(X1, y1, 1)
142 |     train_model(model, X1, y1)
143 |     metrics = evaluate_model(model, X1, y1, True, 'Case 1')
144 |     print('\nDumping stats for case 1')
145 |     print_parameters(model, metrics)
146 | 
147 |     # Showcase the second world with a linearly assumed model.
148 |     plot_regression_line(X1, y2, 2)
149 |     train_model(model, X1, y2)
150 |     metrics = evaluate_model(model, X1, y2, True, 'Case 2')
151 |     print('\nDumping stats for case 2')
152 |     print_parameters(model, metrics)
153 | 
154 | def demo_regularization():
155 |     from sklearn.linear_model import RidgeCV
156 | 
157 |     set_session_seed(172)
158 |     X = generate_base_features(120)[['x_uniform']]
159 |     y = generate_response(X, 0.1, [0, 2 * np.pi], f=np.sin)
160 | 
161 |     regularized_model = make_poly_pipeline(
162 |         RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 5, 10, 20], gcv_mode='auto'),
163 |         35)
164 |     plot_mse(regularized_model, X, y, 'Regularized Model', 0.1)
--------------------------------------------------------------------------------
/ch7/stock_market/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch7/stock_market/.DS_Store
--------------------------------------------------------------------------------
/ch7/stock_market/data_preprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Module to preprocess financial data and prepare for further regression analysis.
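Three helpers are provided: read_daily_equity_data keeps only the timestamp, close,
and volume columns of a daily quotes CSV; compose_trends builds min-max scaled
one-year rolling means of both series; create_log_returns derives (optionally
volatility-normalized) log returns.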
5 | 6 | @author: Ervin Varga 7 | """ 8 | import numpy as np 9 | import pandas as pd 10 | 11 | def read_daily_equity_data(file): 12 | stock_data = pd.read_csv(file, usecols=[0, 4, 5], skiprows=[1]) 13 | stock_data['timestamp'] = pd.to_datetime(stock_data['timestamp']) 14 | stock_data.set_index('timestamp', inplace=True, verify_integrity=True) 15 | stock_data.sort_index(inplace=True) 16 | return stock_data 17 | 18 | def compose_trends(ts): 19 | from sklearn.preprocessing import MinMaxScaler 20 | 21 | scaler = MinMaxScaler() 22 | scaled_ts = pd.DataFrame(scaler.fit_transform(ts), columns=ts.columns, index=ts.index) 23 | return pd.concat([scaled_ts['close'].rolling('365D').mean(), 24 | scaled_ts['volume'].rolling('365D').mean()], axis=1) 25 | 26 | def create_log_returns(ts, halflife, normalize_close=True): 27 | ts['close_ret'] = np.log(ts['close']).diff() 28 | if normalize_close: 29 | ts['close_ret'] /= ts['close_ret'].ewm(halflife=halflife).std() 30 | ts['volume_ret'] = np.log(ts['volume']).diff() 31 | return ts.dropna() -------------------------------------------------------------------------------- /ch7/stock_market/data_visualization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Contains various utility visualization routines. 5 | 6 | @author: Ervin Varga 7 | """ 8 | import matplotlib.pyplot as plt 9 | 10 | def plot_time_series(ts, title_prefix, style='b-'): 11 | ax = ts.plot(figsize=(9, 8), lw=2, fontsize=12, style=style) 12 | ax.set_title('%s Over Time' % title_prefix, fontsize=19) 13 | ax.set_xlabel('Year', fontsize=15) 14 | plt.show() 15 | 16 | def hist_time_series(ts, xlabel, bins): 17 | ax = ts.hist(figsize=(9, 8), xlabelsize=12, ylabelsize=12, bins=bins, grid=False) 18 | ax.set_title('Distribution of %s' % xlabel, fontsize=19) 19 | ax.set_xlabel(xlabel, fontsize=15) 20 | plt.show() 21 | 22 | def scatter_time_series(ts, x, y): 23 | ax = ts.plot(x=x, y=y, figsize=(9, 8), kind='scatter', fontsize=12) 24 | ax.set_title('Auto-correlation Graph', fontsize=19) 25 | ax.set_xlabel(x, fontsize=15) 26 | ax.set_ylabel(y, fontsize=15) 27 | plt.show() 28 | 29 | def heat_corr_plot(corr_matrix): 30 | import numpy as np 31 | import seaborn as sns 32 | 33 | mask = np.zeros_like(corr_matrix) 34 | mask[np.triu_indices_from(mask)] = True 35 | _, ax = plt.subplots(figsize=(9, 8)) 36 | sns.heatmap(corr_matrix, annot=True, cmap='gist_gray', fmt=".2f", lw=.5, mask=mask, ax=ax) 37 | plt.tight_layout() 38 | plt.show() -------------------------------------------------------------------------------- /ch7/stock_market/driver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | The main driver file that connects all pieces together. 5 | 6 | @author: Ervin Varga 7 | """ 8 | from data_preprocessing import * 9 | from data_visualization import * 10 | 11 | # Data Acquisition stage. 12 | stock_data = read_daily_equity_data('daily_AAPL.csv') 13 | 14 | # Data Preprocessing stage. 
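# A halflife of 23 trading days (roughly one calendar month of sessions) controls how
# quickly the exponentially weighted volatility estimate used to normalize the close
# returns forgets older observations.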
15 | stock_data = create_log_returns(stock_data, 23)
16 | 
17 | plot_time_series(stock_data['close'], 'AAPL Closing Levels')
18 | plot_time_series(stock_data['close'].rolling('365D').mean(), 'AAPL Closing Trend')
19 | plot_time_series(compose_trends(stock_data), 'AAPL Closing & Volume Trends', ['b-', 'g--'])
20 | 
21 | # To produce the non-normalized price log returns plot you must call
22 | # the create_log_returns function with normalize_close=False. Try this as an
23 | # additional exercise.
24 | plot_time_series(stock_data['close_ret'], 'AAPL Volatility-Norm. Price Log Returns')
25 | plot_time_series(stock_data['volume_ret'], 'AAPL Volume Log Returns')
26 | 
27 | hist_time_series(stock_data['close_ret'], 'Daily Stock Log Returns', 50)
28 | hist_time_series(stock_data['volume_ret'], 'Daily Volume Log Returns', 50)
29 | 
30 | # Feature Engineering stage.
31 | from feature_engineering import *
32 | 
33 | report_auto_correlation(stock_data)
34 | corr_matrix = create_features(stock_data)
35 | heat_corr_plot(corr_matrix)
36 | 
37 | # Regression Implementation stage.
38 | from pyspark.sql import SparkSession
39 | 
40 | from streaming_regression import *
41 | 
42 | sparkSession = SparkSession.builder \
43 |     .master("local[4]") \
44 |     .appName("Streaming Regression Case Study")\
45 |     .getOrCreate()
46 | fit_and_predict(sparkSession, stock_data)
--------------------------------------------------------------------------------
/ch7/stock_market/feature_engineering.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Module to help perform feature engineering.
5 | 
6 | @author: Ervin Varga
7 | """
8 | from data_visualization import *
9 | 
10 | def report_auto_correlation(ts, periods=5):
11 |     for column in filter(lambda name: name.endswith('_ret'), ts.columns):
12 |         future_column = 'future_' + column
13 |         ts[future_column] = ts[column].shift(-periods).rolling(periods).sum()
14 |         current_column = 'current_' + column
15 |         ts[current_column] = ts[column].rolling(periods).sum()
16 | 
17 |         print(ts[[current_column, future_column]].corr())
18 |         scatter_time_series(ts, current_column, future_column)
19 | 
20 | def create_features(ts):
21 |     from talib import SMA, RSI, OBV
22 | 
23 |     target = 'future_close_ret'
24 |     features = ['current_close_ret', 'current_volume_ret']
25 | 
26 |     for n in [14, 25, 50, 100]:
27 |         ts['sma_' + str(n)] = SMA(ts['close'].values, timeperiod=n) / ts['close']
28 |         ts['rsi_' + str(n)] = RSI(ts['close'].values, timeperiod=n)
29 |     ts['obv'] = OBV(ts['close'].values, ts['volume'].values.astype('float64'))  # OBV does not depend on n
30 | 
31 |     ts.drop(['close', 'volume', 'close_ret', 'volume_ret', 'future_volume_ret'],
32 |             axis='columns',
33 |             inplace=True)
34 |     ts.dropna(inplace=True)
35 |     return ts.corr()
--------------------------------------------------------------------------------
/ch7/stock_market/streaming_regression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Fits a linear model using streaming regression.
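The training and test splits are fed to Spark as queued streams, so the model is
updated and evaluated incrementally, mimicking data that arrives over time.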
5 | 
6 | @author: Ervin Varga
7 | """
8 | def fit_and_predict(sparkSession, ts):
9 |     import numpy as np
10 |     from sklearn.model_selection import train_test_split
11 |     from pyspark.streaming import StreamingContext
12 |     from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
13 | 
14 |     def to_scaled_rdd(pandasDataFrame):
15 |         import pandas as pd
16 |         from sklearn.preprocessing import RobustScaler
17 |         from pyspark.mllib.regression import LabeledPoint
18 | 
19 |         regressors = pandasDataFrame.columns[1:]
20 |         num_regressors = len(regressors)
21 |         # FIX ME: As a bonus exercise, read the last paragraph from the section about residual
22 |         # plots and make the necessary bug fix! Compare the behavior of this version with the
23 |         # fixed one and see whether you can decipher anything from the outputs.
24 |         scaler = RobustScaler()
25 |         scaled_regressors = scaler.fit_transform(pandasDataFrame[regressors])
26 |         scaled_pandasDataFrame = pd.DataFrame(scaled_regressors, columns=regressors)
27 |         scaled_pandasDataFrame['target'] = pandasDataFrame[pandasDataFrame.columns[0]].values  # by convention, the first column holds the target
28 | 
29 |         sparkDataFrame = sparkSession.createDataFrame(scaled_pandasDataFrame)
30 |         return sparkDataFrame.rdd.map(
31 |             lambda row: LabeledPoint(row[num_regressors], row[:num_regressors]))
32 | 
33 |     def report_accuracy(result_rdd):
34 |         from pyspark.mllib.evaluation import RegressionMetrics
35 | 
36 |         if not result_rdd.isEmpty():
37 |             metrics = RegressionMetrics(
38 |                 result_rdd.map(lambda t: (float(t[1]), float(t[0]))))
39 |             print("MSE = %s" % metrics.meanSquaredError)
40 |             print("RMSE = %s" % metrics.rootMeanSquaredError)
41 |             print("R-squared = %s" % metrics.r2)
42 |             print("MAE = %s" % metrics.meanAbsoluteError)
43 |             print("Explained variance = %s" % metrics.explainedVariance)
44 | 
45 |     df_train, df_test = train_test_split(ts, test_size=0.2, shuffle=False)
46 |     train_rdd = to_scaled_rdd(df_train)
47 |     test_rdd = to_scaled_rdd(df_test)
48 | 
49 |     streamContext = StreamingContext(sparkSession.sparkContext, 1)
50 |     train_stream = streamContext.queueStream([train_rdd])
51 |     test_stream = streamContext.queueStream([test_rdd])
52 | 
53 |     numFeatures = len(ts.columns) - 1
54 |     model = StreamingLinearRegressionWithSGD(stepSize=0.05, numIterations=300)
55 |     np.random.seed(0)
56 |     model.setInitialWeights(np.random.rand(numFeatures))
57 | 
58 |     model.trainOn(train_stream)
59 |     result_stream = model.predictOnValues(test_stream.map(lambda lp: (lp.label, lp.features)))
60 |     result_stream.cache()
61 |     result_stream.foreachRDD(report_accuracy)
62 | 
63 |     streamContext.start()
64 |     streamContext.awaitTermination()
--------------------------------------------------------------------------------
/ch8/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch8/.DS_Store
--------------------------------------------------------------------------------
/ch8/lkpy_demo.py:
--------------------------------------------------------------------------------
1 | from itertools import tee
2 | 
3 | import pandas as pd
4 | 
5 | from lenskit import batch
6 | from lenskit import crossfold as xf
7 | from lenskit.algorithms import funksvd, item_knn, user_knn
8 | from lenskit.metrics import topn
9 | 
10 | ratings = pd.read_csv('data/ratings.csv')
11 | ratings.rename({'userId': 'user', 'movieId': 'item'}, axis = 'columns', inplace = True)
12 | print(ratings.head())
13 | 
14 | xf_dataset_batch, xf_dataset_test = 
tee(xf.partition_users(ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)))
15 | truth = pd.concat([test for _, test in xf_dataset_test], ignore_index = True)
16 | 
17 | runner = batch.MultiEval('result', False, nprocs = 4)
18 | runner.add_algorithms(
19 |     [item_knn.ItemItem(10), item_knn.ItemItem(20), item_knn.ItemItem(30)],
20 |     False,
21 |     ['nnbrs']
22 | )
23 | runner.add_algorithms(
24 |     [user_knn.UserUser(10), user_knn.UserUser(20), user_knn.UserUser(30)],
25 |     True,
26 |     ['nnbrs']
27 | )
28 | runner.add_algorithms(
29 |     [funksvd.FunkSVD(40, damping = 0), funksvd.FunkSVD(50, damping = 5), funksvd.FunkSVD(60, damping = 10)],
30 |     False,
31 |     ['features', 'damping']
32 | )
33 | runner.add_datasets(xf_dataset_batch)
34 | runner.run()
35 | 
36 | runs = pd.read_parquet('result/runs.parquet',
37 |                        columns = ('AlgoClass','RunId','damping','features','nnbrs'))
38 | runs.rename({'AlgoClass': 'Algorithm'}, axis = 'columns', inplace = True)
39 | 
40 | def extract_config(x):
41 |     from math import isnan
42 | 
43 |     damping, features, nnbrs = x
44 |     result = ''
45 |     if not isnan(damping):
46 |         result = "damping=%.2f " % damping
47 |     if not isnan(features):
48 |         result += "features=%.2f " % features
49 |     if not isnan(nnbrs):
50 |         result += "nnbrs=%.2f" % nnbrs
51 |     return result.strip()
52 | 
53 | runs['Configuration'] = runs[['damping','features','nnbrs']].apply(extract_config, axis = 1)
54 | runs.drop(columns = ['damping','features','nnbrs'], inplace = True)
55 | 
56 | recs = pd.read_parquet('result/recommendations.parquet')
57 | recs = recs.merge(runs, on = 'RunId')
58 | recs.drop(columns = ['RunId'], inplace = True)
59 | print(recs.head(10))
60 | 
61 | user_dcg = recs.groupby(['Algorithm', 'Configuration', 'user']).rating.apply(topn.dcg)
62 | user_dcg = user_dcg.reset_index(name='DCG')
63 | ideal_dcg = topn.compute_ideal_dcgs(truth)
64 | user_ndcg = pd.merge(user_dcg, ideal_dcg)
65 | user_ndcg['nDCG'] = user_ndcg.DCG / user_ndcg.ideal_dcg
66 | user_ndcg = user_ndcg.groupby(['Algorithm', 'Configuration']).nDCG.mean()
67 | 
68 | # %matplotlib inline  (IPython magic: re-enable inside a Jupyter notebook; as a bare line it is a syntax error in plain Python)
69 | user_ndcg.plot.bar()
--------------------------------------------------------------------------------
/ch8/simple_recommender/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch8/simple_recommender/.DS_Store
--------------------------------------------------------------------------------
/ch8/simple_recommender/omdb_service.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | class OMDbService:
4 |     API_URL = 'http://www.omdbapi.com/'
5 | 
6 |     def __init__(self, api_key):
7 |         self._api_key = api_key
8 | 
9 |     def retrieve_info(self, title):
10 |         """Returns information about the movie title in JSON format."""
11 |         params = {'apikey': self._api_key, 't': title, 'type': 'movie', 'r': 'json'}
12 |         return requests.get(OMDbService.API_URL, params).json()
--------------------------------------------------------------------------------
/ch8/simple_recommender/simple_movie_recommender.py:
--------------------------------------------------------------------------------
1 | from tastedive_service import TasteDiveService
2 | from omdb_service import OMDbService
3 | 
4 | class SimpleMovieRecommender:
5 |     PRIMARY_SOURCE = 'Internet Movie Database'
6 | 
7 |     def __init__(self, omdb_api_key):
8 |         self._omdb = OMDbService(omdb_api_key)
9 |         self._td = 
TasteDiveService()
10 | 
11 |     @staticmethod
12 |     def _retrieve_rating(omdb_response):
13 |         for rating in omdb_response['Ratings']:
14 |             if rating['Source'] == SimpleMovieRecommender.PRIMARY_SOURCE:
15 |                 return float(rating['Value'].split('/')[0])
16 |         return float(omdb_response['imdbRating'])
17 | 
18 |     def recommendations(self, titles, limit = 5):
19 |         """
20 |         Return a list of recommended movie titles up to the specified limit.
21 |         The items are ordered according to their ratings (from top to bottom).
22 |         """
23 |         similar_titles = self._td.similar_titles(titles, limit)
24 |         ratings = map(lambda title: SimpleMovieRecommender._retrieve_rating(self._omdb.retrieve_info(title)),
25 |                       similar_titles)
26 |         return list(map(lambda item: item[1], sorted(zip(ratings, similar_titles), reverse = True)))
--------------------------------------------------------------------------------
/ch8/simple_recommender/tastedive_service.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | class TasteDiveService:
4 |     SUPPORTED_ARTIFACTS = ['music', 'movies', 'shows', 'podcasts', 'books', 'authors', 'games']
5 |     API_URL = 'https://tastedive.com/api/similar'
6 | 
7 |     def __init__(self, artifact_type = 'movies'):
8 |         assert artifact_type in TasteDiveService.SUPPORTED_ARTIFACTS, 'Invalid artifact type'
9 | 
10 |         self._artifact_type = artifact_type
11 | 
12 |     def _retrieve_artifacts(self, name, limit):
13 |         params = {'q': name, 'type': self._artifact_type, 'limit': limit}
14 |         return requests.get(TasteDiveService.API_URL, params).json()
15 | 
16 |     @staticmethod
17 |     def _extract_titles(response):
18 |         artifacts = response['Similar']['Results']
19 |         return [artifact['Name'] for artifact in artifacts]
20 | 
21 |     def similar_titles(self, titles, limit = 5):
22 |         """
23 |         Returns a set of similar titles up to the defined limit. Each instance of
24 |         this class is supposed to work only with one artifact type. This type is specified
25 |         during object construction.
26 |         """
27 |         assert 0 < limit <= 50, 'Limit must be in range (0, 50].'
28 | 
29 |         return {similar_title
30 |                 for title in titles
31 |                 for similar_title in TasteDiveService._extract_titles(self._retrieve_artifacts(title, limit))}
32 | 
--------------------------------------------------------------------------------
/errata.md:
--------------------------------------------------------------------------------
1 | # Errata for *Practical Data Science with Python 3*
2 | 
3 | On **page xx** [Summary of error]:
4 | 
5 | Details of error here. Highlight key pieces in **bold**.
6 | 
7 | ***
8 | 
9 | On **page xx** [Summary of error]:
10 | 
11 | Details of error here. Highlight key pieces in **bold**.
12 | 
13 | ***
--------------------------------------------------------------------------------