├── 9781484248584.jpg ├── Contributing.md ├── LICENSE.txt ├── README.md ├── ch1 ├── .DS_Store └── money_growth │ ├── calculate_money_growth.py │ └── calculate_money_growth_fast.py ├── ch10 ├── .DS_Store ├── artificial_network.py ├── bipartite.py ├── edges.edgelist ├── load_graph.py ├── nodes.csv ├── signed_graph.py └── usage_matrix.py ├── ch11 ├── .DS_Store ├── artificial_network_sampling.py ├── count_non_repeating_digits.py ├── count_non_repeating_digits_naive.py ├── count_occurrences_digit.py ├── count_occurrences_digit_naive.py ├── dimensionality_investigation.py ├── parallel_processing.py └── perf_test_harness.py ├── ch12 ├── .DS_Store ├── AToughGame.py ├── simple_network1.py └── simple_network2.py ├── ch2 ├── .DS_Store └── segmentation │ ├── .DS_Store │ ├── driver.py │ ├── raw_data │ └── .DS_Store │ ├── results │ └── .DS_Store │ └── scripts │ ├── .DS_Store │ ├── nyt_data.py │ └── nyt_data_chunked.py ├── ch3 ├── .DS_Store ├── bug_fixing │ ├── .DS_Store │ ├── double_preceding1.py │ ├── double_preceding2.py │ ├── double_preceding3.py │ ├── double_preceding4.py │ └── test_double_preceding.py ├── cyclomatic_complexity │ ├── sort_new.py │ └── sort_original.py ├── fibonacci │ ├── fibonacci1.py │ ├── fibonacci2.py │ └── sequencer.py ├── optimization │ ├── elevator0.py │ ├── elevator1.py │ ├── elevator2.py │ └── elevator3.py └── puzzles │ ├── puzzle1.py │ ├── puzzle2.py │ └── puzzle2b.py ├── ch4 ├── .DS_Store ├── ball_descend │ ├── .DS_Store │ ├── Simulation.ipynb │ ├── Simulation_Refactored.ipynb │ └── pathfinder │ │ ├── .DS_Store │ │ ├── __init__.py │ │ └── pathfinder.py └── hanoi │ ├── .DS_Store │ ├── Solver1.ipynb │ ├── Solver2.ipynb │ └── Solver3.ipynb ├── ch5 ├── .DS_Store └── augmented_ball_descend │ ├── .DS_Store │ ├── Terrain_Simulation_v1.1.ipynb │ ├── Terrain_Simulation_v1.2.ipynb │ ├── interactionlib │ ├── .DS_Store │ ├── __init__.py │ └── interaction_monitor.py │ ├── pathfinder │ ├── .DS_Store │ ├── __init__.py │ ├── base_pathfinder.py │ ├── non_recursive_simple_pathfinder.py │ ├── parallel_simple_pathfinder.py │ ├── pathutils.py │ └── simple_pathfinder.py │ ├── terrain_data │ ├── .DS_Store │ ├── coastline.jpg │ └── coastline_with_path.jpg │ └── testutils │ ├── .DS_Store │ ├── __init__.py │ └── create_terrain.py ├── ch6 ├── .DS_Store ├── Sample_Temperature_Plots.ipynb ├── anscombe │ ├── anscombe_altair.py │ └── anscombe_matplotlib.py ├── closest_pair │ ├── .DS_Store │ ├── __init__.py │ ├── base_closest_pair.py │ ├── fast_closest_pair.py │ └── naive_closest_pair.py └── temp_plots │ ├── .DS_Store │ ├── GHCND_sample_csv.csv │ ├── plot_stations.py │ ├── plot_temps.py │ └── temp_visualization_demo.py ├── ch7 ├── .DS_Store ├── core_concepts │ ├── .DS_Store │ ├── data_generator.py │ ├── observer.py │ └── session.py └── stock_market │ ├── .DS_Store │ ├── daily_AAPL.csv │ ├── data_preprocessing.py │ ├── data_visualization.py │ ├── driver.py │ ├── feature_engineering.py │ └── streaming_regression.py ├── ch8 ├── .DS_Store ├── lkpy_demo.py └── simple_recommender │ ├── .DS_Store │ ├── omdb_service.py │ ├── simple_movie_recommender.py │ └── tastedive_service.py └── errata.md /9781484248584.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/9781484248584.jpg -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # 
Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Freeware License, some rights reserved 2 | 3 | Copyright (c) 2019 Ervin Varga 4 | 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 6 | of this software and associated documentation files (the "Software"), 7 | to work with the Software within the limits of freeware distribution and fair use. 8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apress Source Code 2 | 3 | This repository accompanies [*Practical Data Science with Python 3*](https://www.apress.com/9781484248584) by Ervin Varga (Apress, 2019). 4 | 5 | [comment]: #cover 6 | ![Cover image](9781484248584.jpg) 7 | 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git. 9 | 10 | ## Releases 11 | 12 | Release v1.0 corresponds to the code in the published book, without corrections or updates. 13 | 14 | ## Contributions 15 | 16 | See the file Contributing.md for more information on how you can contribute to this repository. 
-------------------------------------------------------------------------------- /ch1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch1/.DS_Store -------------------------------------------------------------------------------- /ch1/money_growth/calculate_money_growth.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | 4 | def calculate_money_growth(p0, r, t): 5 | # List of final amounts. 6 | p = [] 7 | for i in range(len(p0)): 8 | p.append(p0[i] * math.exp(r * t[i])) 9 | return p -------------------------------------------------------------------------------- /ch1/money_growth/calculate_money_growth_fast.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | 4 | def calculate_money_growth(p0, r, t): 5 | assert p0.size == t.size 6 | 7 | return p0 * np.exp(r * t) -------------------------------------------------------------------------------- /ch10/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch10/.DS_Store -------------------------------------------------------------------------------- /ch10/artificial_network.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import pandas as pd 4 | import networkx as nx 5 | import matplotlib.pyplot as plt 6 | 7 | G = nx.karate_club_graph() 8 | 9 | node_colors = ['orange' if props['club'] == 'Officer' else 'blue' 10 | for _, props in G.nodes(data=True)] 11 | node_sizes = [180 * G.degree(u) for u in G] 12 | 13 | plt.figure(figsize=(10, 10)) 14 | pos = nx.kamada_kawai_layout(G) 15 | nx.draw_networkx(G, pos, 16 | node_size=node_sizes, 17 | node_color=node_colors, alpha=0.8, 18 | with_labels=False, 19 | edge_color='.6') 20 | 21 | main_conns = nx.edge_betweenness_centrality(G, normalized=True) 22 | main_conns = sorted(main_conns.items(), key=operator.itemgetter(1), reverse=True)[:5] 23 | main_conns = tuple(map(operator.itemgetter(0), main_conns)) 24 | nx.draw_networkx_edges(G, pos, edgelist=main_conns, edge_color='green', alpha=0.5, width=6) 25 | nx.draw_networkx_labels(G, pos, 26 | labels={0: G.node[0]['club'], 33: G.node[33]['club']}, 27 | font_size=15, font_color='white') 28 | 29 | candidate_edges = ((8, 15), (30, 21), (29, 28), (1, 6)) 30 | nx.draw_networkx_edges(G, pos, edgelist=candidate_edges, 31 | edge_color='blue', alpha=0.5, width=2, style='dashed') 32 | nx.draw_networkx_labels(G, pos, 33 | labels={u: u for t in candidate_edges for u in t}, 34 | font_size=13, font_weight='bold', font_color='yellow') 35 | 36 | plt.axis('off') 37 | plt.tight_layout(); 38 | plt.show() 39 | 40 | # Create a data frame to store various centrality measures. 41 | df = pd.DataFrame(index=candidate_edges) 42 | 43 | # Add generic and community aware edge features for potential machine learning classification. 
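# A quick reference (summary added for orientation, not from the original
# text): each link prediction call below returns (u, v, score) triples, hence
# the operator.itemgetter(2). 'pref-att' multiplies the endpoint degrees,
# 'jaccard-c' is |common neighbors| / |union of neighborhoods|, 'aa-idx' sums
# 1/log(degree) over common neighbors, and 'ccn'/'cra' are the
# Soundarajan-Hopcroft community-aware variants driven by the 'club' label.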
44 | df['pref-att'] = list(map(operator.itemgetter(2), 45 | nx.preferential_attachment(G, candidate_edges))) 46 | df['jaccard-c'] = list(map(operator.itemgetter(2), 47 | nx.jaccard_coefficient(G, candidate_edges))) 48 | df['aa-idx'] = list(map(operator.itemgetter(2), 49 | nx.adamic_adar_index(G, candidate_edges))) 50 | df['ccn'] = list(map(operator.itemgetter(2), 51 | nx.cn_soundarajan_hopcroft(G, candidate_edges, 'club'))) 52 | df['cra'] = list(map(operator.itemgetter(2), 53 | nx.ra_index_soundarajan_hopcroft(G, candidate_edges, 'club'))) 54 | 55 | print(df) -------------------------------------------------------------------------------- /ch10/bipartite.py: -------------------------------------------------------------------------------- 1 | # Call usage_matrix.py before executing this script! 2 | from networkx.algorithms import bipartite 3 | 4 | # Add two new edges as described in the book. 5 | G.add_edge(0, 10, relation='interact') 6 | G.add_edge(10, 0, relation='interact') 7 | 8 | # Select all nodes and edges from G that participate in 'interact' relation and 9 | # create an undirected graph from them. 10 | H = nx.Graph() 11 | H.add_edges_from((u, v) for u, v, r in G.edges(data='relation') if r == 'interact') 12 | 13 | # Attach a marker to specify which nodes belong to what group. 14 | for node_id in H.nodes(): 15 | H.node[node_id]['bipartite'] = G.node[node_id]['role'] == 'actor' 16 | 17 | nx.relabel_nodes(H, {n: G.node[n]['label'].replace(' ', '\n') for n in H.nodes()}, copy=False) 18 | 19 | print("Validating that H is bipartite: ", bipartite.is_bipartite(H)) 20 | 21 | # This is a graph projection operation. Here, we seek to find out what use cases 22 | # have common actors. The weights represent the commonality factor. 23 | W = bipartite.weighted_projected_graph(H, [n for n, r in H.nodes(data='bipartite') if r == 0]) 24 | 25 | # Draw the graph using matplotlib under the hood. 26 | pos = nx.shell_layout(W) 27 | nx.draw(W, pos=pos, with_labels=True, node_size=800, font_size=12) 28 | nx.draw_networkx_edge_labels(W, pos=pos, 29 | edge_labels={(u, v): d['weight'] 30 | for u, v, d in W.edges(data=True)}) 31 | -------------------------------------------------------------------------------- /ch10/edges.edgelist: -------------------------------------------------------------------------------- 1 | # Add edges for the 'impact' relationship. 2 | 0 6 600 impact 3 | 0 7 600 impact 4 | 1 6 15 impact 5 | 2 7 100 impact 6 | 2 8 900 impact 7 | 3 8 800 impact 8 | 4 8 960 impact 9 | 10 | # Add edges for the 'include' relationship. A weight of 1 is assigned as a placeholder. 11 | 0 5 1 include 12 | 1 5 1 include 13 | 2 5 1 include 14 | 15 | # Add edges for the 'extend' relationship. 16 | 3 2 1 extend 17 | 4 2 1 extend 18 | 19 | # Add edges for the 'interact' relationship. 
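# Row format: <source-id> <target-id> <weight> <relation>, matching the
# data=(('weight', int), ('relation', str)) schema used by load_graph.py.
# Every interaction appears twice, once per direction, because the file is
# loaded into a directed multigraph (nx.MultiDiGraph).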
20 | 9 0 1 interact 21 | 0 9 1 interact 22 | 10 1 1 interact 23 | 1 10 1 interact 24 | 10 2 1 interact 25 | 2 10 1 interact 26 | 11 3 1 interact 27 | 3 11 1 interact 28 | 11 4 1 interact 29 | 4 11 1 interact 30 | 0 10 1 interact 31 | 10 0 1 interact 32 | 33 | -------------------------------------------------------------------------------- /ch10/load_graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import pandas as pd 3 | 4 | G = nx.read_edgelist('edges.edgelist', 5 | create_using=nx.MultiDiGraph, 6 | nodetype=int, 7 | data=(('weight', int), ('relation', str))) 8 | 9 | df = pd.read_csv('nodes.csv', index_col=0) 10 | for row in df.itertuples(): 11 | G.node[row.Index]['role'] = row.Role 12 | G.node[row.Index]['label'] = row.Label 13 | 14 | # Make a small report. 15 | print("Nodes: \n", G.nodes(data=True), sep='') 16 | print("-" * 20, "\nEdges: \n", G.edges(data=True), sep='') 17 | -------------------------------------------------------------------------------- /ch10/nodes.csv: -------------------------------------------------------------------------------- 1 | Id,Role,Label 2 | 0,use-case,Communicate 3 | 1,use-case,Manage Dev. 4 | 2,use-case,Exec. Data Analytics 5 | 3,use-case,Use Bus. Int. 6 | 4,use-case,Use Op. Int. 7 | 5,use-case,Send/Receive Data 8 | 6,resource,Network 9 | 7,resource,Messaging 10 | 8,resource,Database 11 | 9,actor,Device 12 | 10,actor,Application 13 | 11,actor,User 14 | -------------------------------------------------------------------------------- /ch10/signed_graph.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | G = nx.Graph() 4 | 5 | G.add_node(0, role='quality-attribute', label='Maintainability') 6 | G.add_node(1, role='quality-attribute', label='Reliability') 7 | G.add_node(2, role='quality-attribute', label='Performance') 8 | 9 | G.add_edge(0, 1, sign='+') 10 | G.add_edge(0, 2, sign='-') 11 | G.add_edge(1, 2, sign='-') 12 | -------------------------------------------------------------------------------- /ch10/usage_matrix.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | G = nx.MultiDiGraph() 4 | 5 | # Add all nodes with their role and label. You can immediately work with labels, but having 6 | # short node identifiers keeps your code uncluttered. 7 | G.add_node(0, role='use-case', label='Communicate') 8 | G.add_node(1, role='use-case', label='Manage Dev.') 9 | G.add_node(2, role='use-case', label='Exec. Data Analytics') 10 | G.add_node(3, role='use-case', label='Use Bus. Int.') 11 | G.add_node(4, role='use-case', label='Use Op. Int.') 12 | G.add_node(5, role='use-case', label='Send/Receive Data') 13 | G.add_node(6, role='resource', label='Network') 14 | G.add_node(7, role='resource', label='Messaging') 15 | G.add_node(8, role='resource', label='Database') 16 | G.add_node(9, role='actor', label='Device') 17 | G.add_node(10, role='actor', label='Application') 18 | G.add_node(11, role='actor', label='User') 19 | 20 | # Add edges for the 'impact' relationship. 
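# Here the weight can be read as the relative load a use case places on a
# resource (an interpretation added for orientation; the precise meaning is
# given in the book's text). The 'include'/'extend' edges below carry no weight.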
21 | G.add_edge(0, 6, weight=600, relation='impact') 22 | G.add_edge(0, 7, weight=600, relation='impact') 23 | G.add_edge(1, 6, weight=15, relation='impact') 24 | G.add_edge(2, 7, weight=100, relation='impact') 25 | G.add_edge(2, 8, weight=900, relation='impact') 26 | G.add_edge(3, 8, weight=800, relation='impact') 27 | G.add_edge(4, 8, weight=960, relation='impact') 28 | 29 | # Add edges for the 'include' relationship. 30 | G.add_edge(0, 5, relation='include') 31 | G.add_edge(1, 5, relation='include') 32 | G.add_edge(2, 5, relation='include') 33 | 34 | # Add edges for the 'extend' relationship. 35 | G.add_edge(3, 2, relation='extend') 36 | G.add_edge(4, 2, relation='extend') 37 | 38 | # Add edges for the 'interact' relationship. 39 | G.add_edge(9, 0, relation='interact') 40 | G.add_edge(0, 9, relation='interact') 41 | G.add_edge(10, 1, relation='interact') 42 | G.add_edge(1, 10, relation='interact') 43 | G.add_edge(10, 2, relation='interact') 44 | G.add_edge(2, 10, relation='interact') 45 | G.add_edge(11, 3, relation='interact') 46 | G.add_edge(3, 11, relation='interact') 47 | G.add_edge(11, 4, relation='interact') 48 | G.add_edge(4, 11, relation='interact') 49 | 50 | # Visualize the resulting graph using pydot and Graphviz. 51 | from networkx.drawing.nx_pydot import write_dot 52 | 53 | # By default NetworkX returns a deep copy of the source graph. 54 | H = G.copy() 55 | 56 | # Set some display properties for specific nodes and extract labels. 57 | node_labels = {} 58 | for node_id in H.nodes(): 59 | node_labels[node_id] = H.node[node_id]['label'] 60 | role = H.node[node_id]['role'] 61 | if role == 'resource': 62 | H.node[node_id]['style'] = 'filled' 63 | H.node[node_id]['fillcolor'] = 'cyan' 64 | H.node[node_id]['shape'] = 'component' 65 | H.node[node_id]['fixedsize'] = 'shape' 66 | elif role == 'use-case': 67 | H.node[node_id]['shape'] = 'oval' 68 | elif role == 'actor': 69 | H.node[node_id]['style'] = 'rounded' 70 | H.node[node_id]['shape'] = 'box' 71 | H.node[5]['style'] = 'dashed' 72 | 73 | nx.relabel_nodes(H, node_labels, copy=False) 74 | pos = nx.nx_pydot.graphviz_layout(H) 75 | nx.draw(H, pos=pos, with_labels=True, font_weight='bold') 76 | write_dot(H, 'usage_matrix.dot') -------------------------------------------------------------------------------- /ch11/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch11/.DS_Store -------------------------------------------------------------------------------- /ch11/artificial_network_sampling.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import networkx as nx 4 | import matplotlib.pyplot as plt 5 | 6 | G = nx.karate_club_graph() 7 | 8 | node_colors = ['orange' if props['club'] == 'Officer' else 'blue' 9 | for _, props in G.nodes(data=True)] 10 | node_sizes = [180 * G.degree(u) for u in G] 11 | 12 | plt.figure(figsize=(10, 10)) 13 | pos = nx.kamada_kawai_layout(G) 14 | nx.draw_networkx(G, pos, 15 | node_size=node_sizes, 16 | node_color=node_colors, alpha=0.8, 17 | with_labels=False, 18 | edge_color='.6') 19 | 20 | # Calculating the absolute edge betweenness centrality. 
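# "Absolute" means the exact value here: with no k argument, NetworkX uses
# every node as a source, whereas the estimate below samples only 40% of the
# nodes.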
21 | main_conns = nx.edge_betweenness_centrality(G, normalized=True) 22 | main_conns = sorted(main_conns.items(), key=operator.itemgetter(1), reverse=True)[:5] 23 | main_conns = tuple(map(operator.itemgetter(0), main_conns)) 24 | nx.draw_networkx_edges(G, pos, edgelist=main_conns, edge_color='green', alpha=0.5, width=6) 25 | 26 | # Estimating the edge betweenness centrality by sampling 40% of nodes. 27 | NUM_SAMPLES = int(0.4 * len(G)) 28 | 29 | est_main_conns = nx.edge_betweenness_centrality(G, k=NUM_SAMPLES, normalized=True, seed=10) 30 | est_main_conns = sorted(est_main_conns.items(), key=operator.itemgetter(1), reverse=True)[:5] 31 | est_main_conns = tuple(map(operator.itemgetter(0), est_main_conns)) 32 | nx.draw_networkx_edges(G, pos, edgelist=est_main_conns, 33 | edge_color='red', alpha=0.9, width=6, style='dashed') 34 | 35 | nx.draw_networkx_labels(G, pos, 36 | labels={0: G.node[0]['club'], 33: G.node[33]['club']}, 37 | font_size=15, font_color='white') 38 | 39 | candidate_edges = ((8, 15), (30, 21), (29, 28), (1, 6)) 40 | nx.draw_networkx_edges(G, pos, edgelist=candidate_edges, 41 | edge_color='blue', alpha=0.5, width=2, style='dashed') 42 | nx.draw_networkx_labels(G, pos, 43 | labels={u: u for t in candidate_edges for u in t}, 44 | font_size=13, font_weight='bold', font_color='yellow') 45 | 46 | plt.axis('off') 47 | plt.tight_layout(); 48 | plt.show() 49 | -------------------------------------------------------------------------------- /ch11/count_non_repeating_digits.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | a, b = tuple(map(int, input().split())) 4 | 5 | def variation_without_repetition(n, k): 6 | return math.factorial(n) // math.factorial(n - k) 7 | 8 | # Finds how many numbers with non-repeating digits are present in [0, k]. 9 | def count_numbers_with_non_repeating_digits(k): 10 | if k < 0: 11 | return 0 12 | if k == 0: 13 | return 1 14 | 15 | # We can find most numbers using combinatorics. 16 | digits = str(k) 17 | num_digits = len(digits) 18 | first_digit = int(digits[0]) 19 | span = 10 ** (num_digits - 1) 20 | 21 | s = (first_digit - 1) * variation_without_repetition(9, num_digits - 1) 22 | 23 | # We must take care of a lower interval regarding leading zeros. 24 | s += count_numbers_with_non_repeating_digits(span - 1) 25 | 26 | # We continue our search for the upper part. 
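# Walk the digits of k from left to right, counting numbers that share an
# ever-longer prefix with k: at position i, each unused digit strictly below
# digits[i] contributes variation_without_repetition(9 - i, num_digits - 1 - i)
# completions, while a match on digits[i] just extends the prefix.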
27 | used_digits = {first_digit} 28 | t = num_digits == 1 29 | 30 | for i in range(1, num_digits): 31 | first_digit = int(digits[i]) 32 | allowed_digits = set(range(first_digit + 1)) - used_digits 33 | v = variation_without_repetition(9 - i, num_digits - 1 - i) 34 | used_digits.add(first_digit) 35 | 36 | if first_digit not in allowed_digits: 37 | if len(allowed_digits) == 0 and i == 1: 38 | t = 0 39 | else: 40 | t += len(allowed_digits) * v 41 | break 42 | else: 43 | t += (len(allowed_digits) - (i != num_digits - 1)) * v 44 | return s + t 45 | 46 | print(count_numbers_with_non_repeating_digits(b) - \ 47 | count_numbers_with_non_repeating_digits(a - 1)) -------------------------------------------------------------------------------- /ch11/count_non_repeating_digits_naive.py: -------------------------------------------------------------------------------- 1 | a, b = tuple(map(int, input().split())) 2 | 3 | count = 0 4 | for i in range(a, b + 1): 5 | s = str(i) 6 | if len(set(s)) == len(s): 7 | count += 1 8 | 9 | print(count) -------------------------------------------------------------------------------- /ch11/count_occurrences_digit.py: -------------------------------------------------------------------------------- 1 | def setup(): 2 | MAX_EXPONENT = 50 3 | 4 | # Holds the number of occurrences of digit k in [0, (10**i) - 1], i > 0. 5 | # If the range is partial (first part of the composite key is False), then 6 | # leading zeros are omitted (this is a special case when k == 0). 7 | table_of_occurrences = {(False, 0): 0, (False, 1): 1, 8 | (True, 0): 0, (True, 1): 1} 9 | for i in range(2, MAX_EXPONENT + 1): 10 | table_of_occurrences[(True, i)] = i * 10**(i - 1) 11 | table_of_occurrences[(False, i)] = \ 12 | 10**(i - 1) + 10 * table_of_occurrences[(False, i - 1)] - 10 13 | return table_of_occurrences 14 | 15 | def count_occurrences_digit(k, n, table_of_occurrences=setup()): 16 | digits = str(n) 17 | num_digits = len(digits) 18 | count = 0 19 | is_first_digit = num_digits > 1 20 | 21 | for digit in map(int, digits): 22 | span = 10**(num_digits - 1) 23 | 24 | count += (digit - 1) * table_of_occurrences[(True, num_digits - 1)] 25 | count += table_of_occurrences[(k != 0 or not is_first_digit, num_digits - 1)] 26 | 27 | if digit > k: 28 | if k > 0 or not is_first_digit: 29 | count += span 30 | elif digit == k: 31 | count += (n % span) + 1 32 | 33 | num_digits -= 1 34 | is_first_digit = False 35 | return count 36 | 37 | if __name__ == '__main__': 38 | k, n = tuple(map(int, input().split())) 39 | print(count_occurrences_digit(k, n)) 40 | -------------------------------------------------------------------------------- /ch11/count_occurrences_digit_naive.py: -------------------------------------------------------------------------------- 1 | def count_occurrences_digit_naive(k, n): 2 | k = str(k) 3 | count = 0 4 | for i in range(n + 1): 5 | count += str(i).count(k) 6 | return count 7 | 8 | if __name__ == '__main__': 9 | k, n = tuple(map(int, input().split())) 10 | print(count_occurrences_digit_naive(k, n)) 11 | -------------------------------------------------------------------------------- /ch11/dimensionality_investigation.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import fetch_kddcup99 2 | from sklearn.manifold import TSNE 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | def retrieve_column_desc(): 8 | import requests 9 | 10 | r = 
requests.get('https://kdd.ics.uci.edu/databases/kddcup99/kddcup.names') 11 | 12 | column_desc = {} 13 | for row in r.text.split('\n')[1:]: 14 | if row.find(':') > 0: 15 | col_name, col_type = row[:-1].split(':') 16 | column_desc[col_name] = col_type.strip() 17 | return column_desc 18 | 19 | def get_numeric_columns(column_desc): 20 | return [name for name in column_desc if column_desc[name] == 'continuous'] 21 | 22 | column_desc = retrieve_column_desc() 23 | numeric_columns = get_numeric_columns(column_desc) 24 | print('Number of numeric columns:', len(numeric_columns)) 25 | 26 | X, _ = fetch_kddcup99(subset='SA', random_state=10, return_X_y=True) 27 | X = pd.DataFrame(X, columns=column_desc.keys()) 28 | X[numeric_columns] = X[numeric_columns].apply(pd.to_numeric) 29 | 30 | # We need to work on a small sample to get results in any reasonable time frame. 31 | X = X.sample(frac=0.05, random_state=10) 32 | 33 | m = TSNE(learning_rate=150, random_state=10) 34 | X_tsne = m.fit_transform(X[numeric_columns]) 35 | print('First 10 rows of the TSNE reduced dataset:') 36 | print(X_tsne[:10, :]) 37 | 38 | X['t-sne_1'] = X_tsne[:, 0] 39 | X['t-sne_2'] = X_tsne[:, 1] 40 | 41 | sns.set(rc={'figure.figsize': (10, 10)}) 42 | sns.scatterplot(x='t-sne_1', y='t-sne_2', 43 | hue='protocol_type', 44 | style='protocol_type', 45 | data=X[numeric_columns + ['protocol_type', 't-sne_1', 't-sne_2']]) 46 | plt.show() -------------------------------------------------------------------------------- /ch11/parallel_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import dask.array as da 3 | 4 | def num_divisible(a, b, c): 5 | r = a % c 6 | if r == 0: 7 | start = a 8 | else: 9 | start = a + (c - r) 10 | 11 | if start > b: 12 | return 0 13 | else: 14 | return 1 + (b - start) // c 15 | 16 | num_divisible_vect = np.vectorize(num_divisible) 17 | x = da.asanyarray([(1, 100, 10), (16789, 445267839, 7), (34, 10**18, 3000), (3, 7, 9)]) 18 | x = x.rechunk(chunks=(2, -1)) 19 | y = x.map_blocks(lambda block: num_divisible_vect(*block.T), 20 | chunks=(-1,), 21 | drop_axis=1, 22 | dtype='i8') 23 | print(y.compute()) -------------------------------------------------------------------------------- /ch11/perf_test_harness.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | def measure(f, num_repetitions=5): 7 | measurements = np.array([]) 8 | for _ in range(num_repetitions): 9 | start = time.clock() 10 | f() 11 | measurements = np.append(measurements, time.clock() - start) 12 | return measurements.mean() 13 | 14 | def execute(config): 15 | execution_times = {} 16 | 17 | for config_name in config['functions']: 18 | execution_times[config_name] = np.array([]) 19 | 20 | for x in config['span']: 21 | for config_name in config['functions']: 22 | execution_times[config_name] = np.append( 23 | execution_times[config_name], 24 | measure(lambda: config['functions'][config_name](x))) 25 | return execution_times 26 | 27 | def attach_model(execution_times, config, function_name, model_name): 28 | model_vals = np.vectorize(config['models'][model_name])(config['span']) 29 | c = np.mean(execution_times[function_name] / model_vals) 30 | execution_times[model_name] = c * model_vals 31 | 32 | def report(execution_times, x_vals, **plot_kwargs): 33 | df = pd.DataFrame(execution_times) 34 | df.index = x_vals 35 | ax = df.plot.line( 36 | figsize=(10, 8), 37 | title='Performance Test 
Report', 38 | grid=True, 39 | **plot_kwargs 40 | ) 41 | ax.set_xlabel('Span') 42 | ax.set_ylabel('Time [s]') 43 | return df 44 | 45 | if __name__ == '__main__': 46 | import math 47 | 48 | from count_occurrences_digit_naive import count_occurrences_digit_naive 49 | import count_occurrences_digit as cog 50 | 51 | table_of_occurrences = cog.setup() 52 | config = { 53 | 'functions': { 54 | 'naive(k=0)': lambda n: count_occurrences_digit_naive(0, n), 55 | 'fast(k=0)': lambda n: cog.count_occurrences_digit(0, n, table_of_occurrences) 56 | }, 57 | 'models': { 58 | 'O(n)': lambda n: n, 59 | 'O(log n)': lambda n: math.log(n) 60 | }, 61 | 'span': np.geomspace(10**2, 10**7, num=14, dtype=int) 62 | } 63 | execution_times = execute(config) 64 | attach_model(execution_times, config, 'naive(k=0)', 'O(n)') 65 | attach_model(execution_times, config, 'fast(k=0)', 'O(log n)') 66 | print(report(execution_times, config['span'], logx=True, style=['-ro', '-gs', ':r^', ':gv'])) -------------------------------------------------------------------------------- /ch12/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch12/.DS_Store -------------------------------------------------------------------------------- /ch12/AToughGame.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """Solution for the AToughGame Topcoder problem.""" 5 | class AToughGame: 6 | def expectedGain(self, prob, value): 7 | """ 8 | Examples: 9 | >>> EPS = 10 ** -6 10 | >>> game = AToughGame() 11 | >>> abs(game.expectedGain((1000,500), (3,4)) - 10.0) < EPS 12 | True 13 | >>> abs(game.expectedGain((1000,1), (3,4)) - 3003.9999999999977) < EPS 14 | True 15 | >>> abs(game.expectedGain((500,500,500,500,500), (1,2,3,4,5)) - 16.626830517153095) < EPS 16 | True 17 | >>> abs(game.expectedGain((250,750), (1000,1)) - 1067.6666666666667) < EPS 18 | True 19 | >>> abs(game.expectedGain((916,932,927,988,958,996,944,968,917,939,960,965,960,998,920,990,915,972,995,916,902, 968,970,962,922,959,994,915,996,996,994,986,945,947,912,946,972,951,973,965,921,910, 938,975,942,950,900,983,960,998,982,980,902,974,952,938,900,962,920,931,964,974,953, 995,946,946,903,921,923,985,919,996,930,915,991,967,996,911,999,936,1000,962,970,929, 966,960,930,920,958,926,983), (583,428,396,17,163,815,31,536,175,165,532,781,29,963,331,987,599,497,380,180,780,25, 931,607,784,613,468,140,488,604,401,912,204,785,697,173,451,849,714,914,650,652,338, 336,177,147,22,652,901,548,370,9,118,487,779,567,818,440,10,868,316,666,690,714,623, 269,501,649,324,773,173,54,391,745,504,578,81,627,319,301,16,899,658,586,604,83,520, 81,181,943,157)) - 54204.93356505282) < EPS 20 | True 21 | """ 22 | 23 | # Combines two levels into a single aggregate. This implements the safe move 24 | # of this greedy algorithm. 
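# A reading of the algebra below (added for orientation, not from the
# original text): levels (p0, v0) and (p1, v1) collapse into one level that
# is passed with probability p0 * p1 and whose value is the expected prize
# collected across both levels, replays after falling back included. For
# instance, (1.0, 3) and (0.5, 4) merge into (0.5, 10.0), matching the
# first doctest.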
25 | def combine(level0, level1): 26 | p0, v0, p1, v1 = level0[0], level0[1], level1[0], level1[1] 27 | q0, q1 = 1 - p0, 1 - p1 28 | return p0 * p1, v1 + v0 * p1 * (p0 + q0 / p1) * (1 - p0 * q1) ** -2 29 | 30 | from functools import reduce 31 | return reduce(combine, zip(map(lambda p: p / 1000, prob), value))[1] 32 | 33 | if __name__ == '__main__': 34 | import doctest 35 | doctest.testmod() 36 | -------------------------------------------------------------------------------- /ch12/simple_network1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class NeuralNetwork: 5 | def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate): 6 | self.input_nodes = input_nodes 7 | self.hidden_nodes = hidden_nodes 8 | self.output_nodes = output_nodes 9 | 10 | # Initialize weights to small random values using Normal distribution. 11 | self.weights_input_to_hidden = np.random.normal( 12 | scale = self.input_nodes ** -0.5, 13 | size = (self.input_nodes, self.hidden_nodes)) 14 | self.weights_hidden_to_output = np.random.normal( 15 | scale = self.hidden_nodes ** -0.5, 16 | size = (self.hidden_nodes, self.output_nodes)) 17 | 18 | self.lr = learning_rate 19 | self.activation_function = lambda x : 1 / (1 + np.exp(-x)) # sigmoid 20 | 21 | def train(self, features, targets): 22 | delta_weights_i_h = np.zeros(self.weights_input_to_hidden.shape) 23 | delta_weights_h_o = np.zeros(self.weights_hidden_to_output.shape) 24 | 25 | for X, y in zip(features, targets): 26 | y_hat, hidden_outputs = self.__forward(X) 27 | delta_weights_i_h, delta_weights_h_o = self.__backward( 28 | y_hat, hidden_outputs, 29 | X, y, 30 | delta_weights_i_h, delta_weights_h_o) 31 | self.__update_weights(delta_weights_i_h, delta_weights_h_o) 32 | 33 | def run(self, X): 34 | return self.__forward(X)[0] 35 | 36 | def __forward(self, X): 37 | hidden_inputs = np.dot(X, self.weights_input_to_hidden) 38 | hidden_outputs = self.activation_function(hidden_inputs) 39 | final_inputs = np.dot(hidden_outputs, self.weights_hidden_to_output) 40 | y_hat = final_inputs 41 | return y_hat, hidden_outputs 42 | 43 | def __backward(self, y_hat, hidden_outputs, X, y, delta_weights_i_h, delta_weights_h_o): 44 | error = y - y_hat 45 | hidden_error = np.dot(self.weights_hidden_to_output, error) 46 | output_error_term = error 47 | hidden_error_term = hidden_error * hidden_outputs * (1 - hidden_outputs) 48 | delta_weights_i_h += np.dot( 49 | X[:, np.newaxis], hidden_error_term[np.newaxis, :]) 50 | delta_weights_h_o += np.dot( 51 | hidden_outputs[:, np.newaxis], output_error_term[np.newaxis, :]) 52 | return delta_weights_i_h, delta_weights_h_o 53 | 54 | def __update_weights(self, delta_weights_i_h, delta_weights_h_o): 55 | self.weights_hidden_to_output += self.lr * delta_weights_h_o 56 | self.weights_input_to_hidden += self.lr * delta_weights_i_h 57 | 58 | 59 | ######################################################### 60 | # Set your hyperparameters here 61 | ########################################################## 62 | iterations = 1000 63 | learning_rate = 0.005 64 | hidden_nodes = 20 65 | output_nodes = 1 66 | -------------------------------------------------------------------------------- /ch12/simple_network2.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from torch import nn 5 | 6 | 7 | class NeuralNetwork: 8 | def __init__(self, input_nodes, hidden_nodes, output_nodes, 
learning_rate): 9 | self.model = nn.Sequential(OrderedDict([ 10 | ('fc', nn.Linear(input_nodes, hidden_nodes)), 11 | ('sigmoid', nn.Sigmoid()), 12 | ('output', nn.Linear(hidden_nodes, output_nodes))])) 13 | 14 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 15 | self.model.to(self.device) 16 | 17 | self.criterion = nn.MSELoss() 18 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr = learning_rate) 19 | 20 | def train(self, features, targets): 21 | features, targets = features.to(self.device), targets.to(self.device) 22 | 23 | self.model.train() 24 | self.optimizer.zero_grad() 25 | output = self.model(features) 26 | loss = self.criterion(output, targets) 27 | loss.backward() 28 | self.optimizer.step() 29 | 30 | def run(self, x): 31 | self.model.eval() 32 | with torch.no_grad(): 33 | return self.model(torch.tensor(x.values, dtype = torch.float) \ 34 | .to(self.device)) \ 35 | .cpu() \ 36 | .numpy() 37 | 38 | 39 | ######################################################### 40 | # Set your hyperparameters here 41 | ########################################################## 42 | iterations = 200 43 | learning_rate = 0.05 44 | hidden_nodes = 20 45 | output_nodes = 1 46 | -------------------------------------------------------------------------------- /ch2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/segmentation/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/driver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Main driver code for calling other routines. 5 | 6 | @author: Ervin Varga 7 | """ 8 | import sys 9 | import os 10 | sys.path.append(os.path.abspath('scripts')) 11 | 12 | from nyt_data import retrieve 13 | 14 | repoUrl = 'https://github.com/oreillymedia/doing_data_science/' 15 | fileUrl = 'raw/master/dds_datasets.zip' 16 | 17 | retrieve(repoUrl + fileUrl, 'raw_data') 18 | print('Raw data files are successfully retrieved.') 19 | 20 | #import numpy as np 21 | #import pandas as pd 22 | from nyt_data_chunked import traverse 23 | 24 | """ 25 | summary_data = dict() 26 | summary_data.setdefault('CTR', np.empty(31)) 27 | summary_data.setdefault('Clicks', np.empty(31)) 28 | 29 | def select_stats_unregistered(df, file_num): 30 | summary_data['CTR'][file_num] = df['CTR']['mean'][('Unknown', '0')] 31 | summary_data['Clicks'][file_num] = df['Clicks']['sum'][('Unknown', '0')] 32 | 33 | traverse('raw_data', select_stats_unregistered) 34 | print('Raw data files are successfully processed.') 35 | 36 | # Make some plots of CTR and Total Clicks over time. 
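# (Note: this plotting variant sits inside the surrounding triple-quoted
# string, i.e. it is commented out. To run it instead of the Parquet export
# below, remove the quotes and restore the commented-out numpy/pandas
# imports at the top of the file.)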
37 | df = pd.DataFrame.from_dict(summary_data) 38 | 39 | import matplotlib.pyplot as plt 40 | 41 | fig, axes = plt.subplots(nrows=2, ncols=1) 42 | df['CTR'].plot( 43 | title='Click Through Rate Over 1 Month', 44 | ax=axes[0], 45 | figsize=(8, 9), 46 | xticks=[] 47 | ); 48 | df['Clicks'].plot( 49 | xticks=range(0, 31, 2), 50 | title='Total Clicks Over 1 Month', 51 | ax=axes[1], 52 | figsize=(8, 9) 53 | ); 54 | """ 55 | 56 | def save_stats(df, file_num): 57 | targetFile = 'nyt_summary_' + str(file_num + 1) + '.parquet' 58 | df.columns = ['_'.join(column).rstrip('_') for column in df.columns.values] 59 | df.to_parquet('results/' + targetFile) 60 | 61 | traverse('raw_data', save_stats) 62 | print('Raw data files are successfully processed.') -------------------------------------------------------------------------------- /ch2/segmentation/raw_data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/segmentation/raw_data/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/segmentation/results/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/scripts/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch2/segmentation/scripts/.DS_Store -------------------------------------------------------------------------------- /ch2/segmentation/scripts/nyt_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Script to download all data, summarize a single data file and traverse the 5 | data folder to process all files. 6 | 7 | @author: Ervin Varga 8 | """ 9 | 10 | import requests, zipfile, io, shutil 11 | 12 | unpackedFolder = '/dds_datasets/' 13 | unpackedZipFile = 'dds_ch2_nyt.zip' 14 | 15 | def retrieve(sourceFile, destinationFolder): 16 | def cleanup(): 17 | try: 18 | shutil.rmtree(destinationFolder + unpackedFolder) 19 | except OSError as e: 20 | print("Folder: %s, Error: %s" % (e.filename, e.strerror)) 21 | 22 | r = requests.get(sourceFile) 23 | assert r.status_code == requests.codes.ok 24 | 25 | z = zipfile.ZipFile(io.BytesIO(r.content)) 26 | z.extractall(destinationFolder) 27 | 28 | # The top archive contains another ZIP file with our data. 29 | z = zipfile.ZipFile(destinationFolder + unpackedFolder + unpackedZipFile) 30 | z.extractall(destinationFolder) 31 | 32 | cleanup() 33 | 34 | import pandas as pd 35 | import numpy as np 36 | 37 | def summarize(data_file): 38 | def q25(x): 39 | return x.quantile(0.25) 40 | 41 | def q75(x): 42 | return x.quantile(0.75) 43 | 44 | # Read and parse the CSV data file. 45 | nyt_data = pd.read_csv(data_file, dtype={'Gender': 'category'}) 46 | 47 | # Segment users into age groups. 
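# pd.cut buckets ages into right-inclusive bins, so (-1, 0] captures the age
# value 0 that this data set uses for unknown ages. Illustration only (not
# part of the original script):
#   pd.cut([0, 16, 70], bins=[-1, 0, 17, 120], labels=['Unknown', '1-17', '18+'])
#   # -> ['Unknown', '1-17', '18+']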
48 | nyt_data['Age_Group'] = pd.cut( 49 | nyt_data['Age'], 50 | bins=[-1, 0, 17, 24, 34, 44, 54, 64, 120], 51 | labels=["Unknown", 52 | "1-17", 53 | "18-24", 54 | "25-34", 55 | "35-44", 56 | "45-54", 57 | "55-64", 58 | "65+"]) 59 | nyt_data.drop('Age', axis='columns', inplace=True) 60 | 61 | # Create the click through rate feature. 62 | nyt_data['CTR'] = nyt_data['Clicks'] / nyt_data['Impressions'] 63 | nyt_data.dropna(inplace=True) 64 | nyt_data.drop((nyt_data['Clicks'] > nyt_data['Impressions']).nonzero()[0], 65 | inplace=True) 66 | 67 | # Make final description of data. 68 | compressed_nyt_data = \ 69 | nyt_data.groupby(by=['Age_Group', 'Gender'])[['CTR', 'Clicks']] \ 70 | .agg([np.mean, np.std, np.max, q25, np.median, q75, np.sum]) 71 | return compressed_nyt_data 72 | 73 | import pathlib 74 | 75 | def traverse(sourceFolder, collect): 76 | def get_file_number(data_file): 77 | return int(data_file.name[3:-4]) - 1 78 | 79 | for data_file in pathlib.Path(sourceFolder).glob('nyt*.csv'): 80 | collect(summarize(data_file.absolute()), get_file_number(data_file)) -------------------------------------------------------------------------------- /ch2/segmentation/scripts/nyt_data_chunked.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Script to download all data, summarize a single data file chunk-by-chunk and 5 | traverse the data folder to process all files. 6 | 7 | @author: Ervin Varga 8 | """ 9 | 10 | import requests, zipfile, io, shutil 11 | 12 | unpackedFolder = '/dds_datasets/' 13 | unpackedZipFile = 'dds_ch2_nyt.zip' 14 | 15 | def retrieve(sourceFile, destinationFolder): 16 | def cleanup(): 17 | try: 18 | shutil.rmtree(destinationFolder + unpackedFolder) 19 | except OSError as e: 20 | print("Folder: %s, Error: %s" % (e.filename, e.strerror)) 21 | 22 | r = requests.get(sourceFile) 23 | assert r.status_code == requests.codes.ok 24 | 25 | z = zipfile.ZipFile(io.BytesIO(r.content)) 26 | z.extractall(destinationFolder) 27 | 28 | # The top archive contains another ZIP file with our data. 29 | z = zipfile.ZipFile(destinationFolder + unpackedFolder + unpackedZipFile) 30 | z.extractall(destinationFolder) 31 | 32 | cleanup() 33 | 34 | import pandas as pd 35 | import numpy as np 36 | 37 | def summarize(data_file, chunksize): 38 | def q25(x): 39 | return x.quantile(0.25) 40 | 41 | def q75(x): 42 | return x.quantile(0.75) 43 | 44 | # Read and parse the CSV data file chunk-by-chunk. 45 | nyt_data = pd.DataFrame() 46 | for chunk_df in pd.read_csv( 47 | data_file, 48 | dtype={'Gender': 'category'}, 49 | chunksize=chunksize): 50 | 51 | # Segment users into age groups. 52 | chunk_df['Age_Group'] = pd.cut( 53 | chunk_df['Age'], 54 | bins=[-1, 0, 17, 24, 34, 44, 54, 64, 120], 55 | labels=["Unknown", 56 | "1-17", 57 | "18-24", 58 | "25-34", 59 | "35-44", 60 | "45-54", 61 | "55-64", 62 | "65+"]) 63 | 64 | # Create the click through rate feature. 65 | chunk_df['CTR'] = chunk_df['Clicks'] / chunk_df['Impressions'] 66 | chunk_df.dropna(inplace=True) 67 | chunk_df.drop((chunk_df['Clicks'] > chunk_df['Impressions']).nonzero()[0], 68 | inplace=True) 69 | 70 | # Append chunk to the main data frame. 71 | nyt_data = nyt_data.append( 72 | chunk_df[['Age_Group', 'Gender', 'Clicks', 'CTR']], 73 | ignore_index=True) 74 | 75 | # Make final description of data. 
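# The groupby/agg below yields hierarchical (two-level) columns, one
# sub-column per statistic; driver.py later flattens them with '_'.join so
# the frame can be written to Parquet.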
76 | compressed_nyt_data = \ 77 | nyt_data.groupby(by=['Age_Group', 'Gender'])[['CTR', 'Clicks']] \ 78 | .agg([np.mean, np.std, np.max, q25, np.median, q75, np.sum]) 79 | return compressed_nyt_data 80 | 81 | import pathlib 82 | 83 | def traverse(sourceFolder, collect, chunksize=10000): 84 | def get_file_number(data_file): 85 | return int(data_file.name[3:-4]) - 1 86 | 87 | for data_file in pathlib.Path(sourceFolder).glob('nyt*.csv'): 88 | collect(summarize(data_file.absolute(), chunksize), 89 | get_file_number(data_file)) -------------------------------------------------------------------------------- /ch3/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch3/.DS_Store -------------------------------------------------------------------------------- /ch3/bug_fixing/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch3/bug_fixing/.DS_Store -------------------------------------------------------------------------------- /ch3/bug_fixing/double_preceding1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from array import array 4 | 5 | def double_preceding(x: array) -> None: 6 | """Transforms the array by setting x[i] = 2 * x[i-1] and x[0] = 0. 7 | 8 | >>> x = array('i', [5, 10, 15]) 9 | >>> double_preceding(x) 10 | >>> x 11 | array('i', [0, 10, 20]) 12 | """ 13 | 14 | if x: 15 | temp = x[0]; x[0] = 0 16 | for i in range(1, len(x)): 17 | x[i] = 2 * temp; temp = x[i] 18 | -------------------------------------------------------------------------------- /ch3/bug_fixing/double_preceding2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from array import array 4 | 5 | def double_preceding(x: array) -> None: 6 | """Transforms the array by setting x[i] = 2 * x[i-1] and x[0] = 0. 7 | 8 | >>> x = array('i', [5, 10, 15]) 9 | >>> double_preceding(x) 10 | >>> x 11 | array('i', [0, 10, 20]) 12 | """ 13 | 14 | if x: 15 | temp = x[0]; x[0] = 0 16 | for i in range(1, len(x)): 17 | temp_2x = 2 * temp; temp = x[i]; x[i] = temp_2x 18 | 19 | -------------------------------------------------------------------------------- /ch3/bug_fixing/double_preceding3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from array import array 4 | 5 | def double_preceding(x: array) -> None: 6 | """Transforms the array by setting x[i] = 2 * x[i-1] and x[0] = 0. 7 | 8 | >>> x = array('i', [5, 10, 15]) 9 | >>> double_preceding(x) 10 | >>> x 11 | array('i', [0, 10, 20]) 12 | """ 13 | 14 | if x: 15 | for i in range(-1, -len(x), -1): 16 | x[i] = 2 * x[i - 1] 17 | x[0] = 0 18 | 19 | -------------------------------------------------------------------------------- /ch3/bug_fixing/double_preceding4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | 5 | def double_preceding(x: np.ndarray) -> None: 6 | """Transforms the array by setting x[i] = 2 * x[i-1] and x[0] = 0. 
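    Unlike the loop-based variants, this NumPy version shifts and doubles all
    elements at once with a reversed slice assignment instead of an explicit
    Python loop.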
7 | 8 | >>> x = np.array([5, 10, 15]) 9 | >>> double_preceding(x) 10 | >>> x 11 | array([ 0, 10, 20]) 12 | """ 13 | 14 | if x.size != 0: 15 | x[:-x.size:-1] = 2 * x[-2::-1] 16 | x[0] = 0 17 | -------------------------------------------------------------------------------- /ch3/bug_fixing/test_double_preceding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from array import array 4 | import unittest 5 | from double_preceding4 import double_preceding 6 | 7 | class TestDoublePreceding(unittest.TestCase): 8 | """Tests for double_preceding function.""" 9 | 10 | def test_already_arranged(self): 11 | """Test with already arranged values.""" 12 | argument = array('i', [5, 10, 15]) 13 | expected = array('i', [0, 10, 20]) 14 | double_preceding(argument) 15 | self.assertEqual(expected, argument) 16 | 17 | def test_identical(self): 18 | """Test with multiple identical values.""" 19 | argument = array('i', [0, 1, 1]) 20 | expected = array('i', [0, 0, 2]) 21 | double_preceding(argument) 22 | self.assertEqual(expected, argument) 23 | 24 | def test_empty(self): 25 | """Test with an empty array.""" 26 | argument = [] 27 | expected = [] 28 | double_preceding(argument) 29 | self.assertEqual(expected, argument) 30 | 31 | if __name__ == "__main__": 32 | unittest.main() -------------------------------------------------------------------------------- /ch3/cyclomatic_complexity/sort_new.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def sort(data): 4 | for i in range(len(data)): 5 | for j in range(len(data)): 6 | avg = (data[i] + data[j]) / 2.0 7 | diff = abs(data[i] - avg) 8 | data[i] = avg - diff 9 | data[j] = avg + diff 10 | return data 11 | -------------------------------------------------------------------------------- /ch3/cyclomatic_complexity/sort_original.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def sort(data): 4 | for i in range(len(data)): 5 | for j in range(len(data)): 6 | if data[i] > data[j]: 7 | data[i], data[j] = (data[j], data[i]) 8 | return data -------------------------------------------------------------------------------- /ch3/fibonacci/fibonacci1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def fibonacci(n): 4 | sequence = [] 5 | current, next = 0, 1 6 | for _ in range(n): 7 | current, next = next, current + next 8 | sequence.append(current) 9 | return sequence -------------------------------------------------------------------------------- /ch3/fibonacci/fibonacci2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def fibonacci(n, f0=0, f1=1): 4 | sequence = [] 5 | current, next = f0, f1 6 | for _ in range(n): 7 | current, next = next, current + next 8 | sequence.append(current) 9 | return sequence -------------------------------------------------------------------------------- /ch3/fibonacci/sequencer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def simple_recurrent_sequence(n, first, second, combine_fun): 4 | sequence = [] 5 | current, next = first, second 6 | for _ in range(n): 7 | current, next = next, 
combine_fun(current, next) 8 | sequence.append(current) 9 | return sequence 10 | 11 | def fibonacci(n): 12 | return simple_recurrent_sequence(n, 0, 1, lambda x, y: x + y) -------------------------------------------------------------------------------- /ch3/optimization/elevator0.py: -------------------------------------------------------------------------------- 1 | def num_days(h, u, d): 2 | total_days = 1 3 | curr_height = 0 4 | 5 | while h - curr_height > u: 6 | curr_height += u - d 7 | total_days += 1 8 | return total_days -------------------------------------------------------------------------------- /ch3/optimization/elevator1.py: -------------------------------------------------------------------------------- 1 | def num_days(h, u, d): 2 | total_days = 1 3 | height_left = h 4 | 5 | while u < height_left: 6 | days = height_left // u 7 | total_days += days 8 | height_left -= days * (u - d) 9 | return total_days 10 | -------------------------------------------------------------------------------- /ch3/optimization/elevator2.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | 3 | @lru_cache(maxsize=32) 4 | def _partial_num_days(height_left, u, d): 5 | total_days = 1 6 | 7 | while u < height_left: 8 | days = height_left // u 9 | total_days += days 10 | height_left -= days * (u - d) 11 | return total_days 12 | 13 | H_LIMIT = 1000000 14 | 15 | def num_days(h, u, d): 16 | if h > H_LIMIT: 17 | days = 2 * (num_days(h // 2, u, d) - 1) 18 | height_left = h - days * (u - d) 19 | return days + _partial_num_days(height_left, u, d) 20 | else: 21 | return _partial_num_days(h, u, d) -------------------------------------------------------------------------------- /ch3/optimization/elevator3.py: -------------------------------------------------------------------------------- 1 | def num_days(h, u, d): 2 | import math 3 | 4 | height_left_until_last_day = h - u 5 | daily_progress = u - d 6 | return 1 + math.ceil(height_left_until_last_day / daily_progress) -------------------------------------------------------------------------------- /ch3/puzzles/puzzle1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def puzzle1(n): 4 | p = 0; w = 1; s = n 5 | 6 | while w <= n: 7 | w <<= 2 8 | 9 | while w != 1: 10 | w >>= 2 11 | f = p + w 12 | p >>= 1 13 | 14 | if s >= f: 15 | p += w 16 | s -= f 17 | return p 18 | -------------------------------------------------------------------------------- /ch3/puzzles/puzzle2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def puzzle2(bytes): 4 | f = [0] * 255 5 | s = k = 0 6 | 7 | for b in bytes: 8 | f[b] += 1 9 | 10 | s += f[k] 11 | k += 1 12 | while s < len(bytes) / 2: 13 | s += f[k] 14 | k += 1 15 | return k 16 | -------------------------------------------------------------------------------- /ch3/puzzles/puzzle2b.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | def puzzle2(bytes): 4 | f = [0] * 255 5 | s = k = 0 6 | 7 | for b in bytes: 8 | f[b] += 1 9 | 10 | k += 1 11 | s += f[k] 12 | while s < len(bytes) / 2: 13 | k += 1 14 | s += f[k] 15 | return k 16 | -------------------------------------------------------------------------------- /ch4/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch4/.DS_Store -------------------------------------------------------------------------------- /ch4/ball_descend/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch4/ball_descend/.DS_Store -------------------------------------------------------------------------------- /ch4/ball_descend/Simulation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simulation of a Ball's Descent in a Terrain\n", 8 | "\n", 9 | "This project simulates where a ball will land in a terrain.\n", 10 | "\n", 11 | "## Input\n", 12 | "The terrain's configuration is given as a matrix of integers representing the elevation at each spot. For simplicity, assume that the terrain is surrounded by a rectangular wall that prevents the ball from escaping. The inner dimensions of the terrain are NxM, where N and M are integers between 3 and 1000.\n", 13 | "\n", 14 | "The ball's initial position is given as a pair of integers (a, b).\n", 15 | "\n", 16 | "## Output\n", 17 | "The result is a list of coordinates denoting the ball's path through the terrain. The first element of the list is the starting position, and the last one is the ending position. The two may coincide if the ball starts in a local minimum (a dent).\n", 18 | "\n", 19 | "## Rules\n", 20 | "The ball moves according to two simple rules:\n", 21 | "- The ball rolls from the current position into the lowest neighboring one.\n", 22 | "- If the ball is surrounded by higher points, then it stops."
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Usual bootstrapping code; just run this cell.\n", 32 | "import numpy as np\n", 33 | "\n", 34 | "from typing import List, Tuple\n", 35 | "\n", 36 | "from ipywidgets import interact, widgets" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "matrix([[-2, 3, 2, 1],\n", 48 | " [-2, 4, 3, 0],\n", 49 | " [-3, 3, 1, -3],\n", 50 | " [-4, 2, -1, 1],\n", 51 | " [-5, -7, 3, 0]])" 52 | ] 53 | }, 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "terrain = np.matrix([\n", 61 | " [-2, 3, 2, 1],\n", 62 | " [-2, 4, 3, 0],\n", 63 | " [-3, 3, 1, -3],\n", 64 | " [-4, 2, -1, 1],\n", 65 | " [-5, -7, 3, 0]\n", 66 | "])\n", 67 | "terrain" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "def wall(terrain:np.matrix, position:Tuple[int,int]) -> bool:\n", 77 | " \"\"\"\n", 78 | " Checks whether the provided position is hitting the wall.\n", 79 | " \n", 80 | " Args:\n", 81 | " terrain: the terrain's configuration comprised from integer elevation levels.\n", 82 | " position: the pair of integers representing the ball's potential position.\n", 83 | "\n", 84 | " Output:\n", 85 | " True if the position is hitting the wall, or False otherwise.\n", 86 | " \n", 87 | " Examples:\n", 88 | " >>> wall(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 89 | " False\n", 90 | " >>> wall(np.matrix([[-2, 3, 2, 1]]), (-1, 0))\n", 91 | " True\n", 92 | " \"\"\"\n", 93 | " \n", 94 | " x, y = position\n", 95 | " length, width = terrain.shape\n", 96 | " return (x < 0) or (y < 0) or (x >= length) or (y >= width)\n", 97 | "\n", 98 | "def next_neighbor(terrain:np.matrix, position:Tuple[int,int]) -> Tuple[int,int]:\n", 99 | " \"\"\"\n", 100 | " Returns the position of the lowest neighbor.\n", 101 | " \n", 102 | " Args:\n", 103 | " terrain: the terrain's configuration comprised from integer elevation levels.\n", 104 | " position: the pair of integers representing the ball's current position.\n", 105 | "\n", 106 | " Output:\n", 107 | " The position (pair of coordinates) of the lowest neighbor.\n", 108 | " \n", 109 | " Example:\n", 110 | " >>> next_neighbor(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 111 | " (0, 0)\n", 112 | " \"\"\"\n", 113 | " \n", 114 | " x, y = position\n", 115 | " allowed_neighbors = []\n", 116 | " for delta_x in range(-1, 2):\n", 117 | " for delta_y in range(-1, 2):\n", 118 | " new_position = (x + delta_x, y + delta_y)\n", 119 | " if (not wall(terrain, new_position)):\n", 120 | " allowed_neighbors.append((terrain.item(new_position), new_position))\n", 121 | " return min(allowed_neighbors)[1]\n", 122 | "\n", 123 | "def find_path(terrain:np.matrix, position:Tuple[int,int]) -> List[Tuple[int,int]]:\n", 124 | " \"\"\"\n", 125 | " Find the path that the ball would follow while descending in the terrain.\n", 126 | " \n", 127 | " Args:\n", 128 | " terrain: the terrain's configuration comprised from integer elevation levels.\n", 129 | " position: the pair of integers representing the ball's current position.\n", 130 | " \n", 131 | " Output:\n", 132 | " The list of coordinates of the path.\n", 133 | " \n", 134 | " Example:\n", 135 | " >>> find_path(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 136 | " [(0, 1), (0, 0)]\n", 137 | " \"\"\"\n", 138 
| " \n", 139 | " next_position = next_neighbor(terrain, position)\n", 140 | " if (position == next_position):\n", 141 | " return [position]\n", 142 | " else:\n", 143 | " return [position] + find_path(terrain, next_position)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Result\n", 151 | "\n", 152 | "The cell below contains code to invoke the path finding function for a given starting position. The starting coordinates are expected to be correctly set.\n", 153 | "\n", 154 | "The terrain data is repeated here for convenience." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 4, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "matrix([[-2, 3, 2, 1],\n", 166 | " [-2, 4, 3, 0],\n", 167 | " [-3, 3, 1, -3],\n", 168 | " [-4, 2, -1, 1],\n", 169 | " [-5, -7, 3, 0]])" 170 | ] 171 | }, 172 | "execution_count": 4, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "terrain" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 7, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "application/vnd.jupyter.widget-view+json": { 189 | "model_id": "9a0e328db74542498357f2cf8a600957", 190 | "version_major": 2, 191 | "version_minor": 0 192 | }, 193 | "text/plain": [ 194 | "interactive(children=(IntSlider(value=1, description='Start X', max=4), IntSlider(value=1, description='Start …" 195 | ] 196 | }, 197 | "metadata": {}, 198 | "output_type": "display_data" 199 | } 200 | ], 201 | "source": [ 202 | "interact(lambda start_x, start_y: find_path(terrain, (start_x, start_y)),\n", 203 | " start_x = widgets.IntSlider(value=1, max=terrain.shape[0]-1, description='Start X'),\n", 204 | " start_y = widgets.IntSlider(value=1, max=terrain.shape[1]-1, description='Start Y'));" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 6, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "Trying:\n", 217 | " find_path(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 218 | "Expecting:\n", 219 | " [(0, 1), (0, 0)]\n", 220 | "ok\n", 221 | "Trying:\n", 222 | " next_neighbor(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 223 | "Expecting:\n", 224 | " (0, 0)\n", 225 | "ok\n", 226 | "Trying:\n", 227 | " wall(np.matrix([[-2, 3, 2, 1]]), (0, 1))\n", 228 | "Expecting:\n", 229 | " False\n", 230 | "ok\n", 231 | "Trying:\n", 232 | " wall(np.matrix([[-2, 3, 2, 1]]), (-1, 0))\n", 233 | "Expecting:\n", 234 | " True\n", 235 | "ok\n", 236 | "1 items had no tests:\n", 237 | " __main__\n", 238 | "3 items passed all tests:\n", 239 | " 1 tests in __main__.find_path\n", 240 | " 1 tests in __main__.next_neighbor\n", 241 | " 2 tests in __main__.wall\n", 242 | "4 tests in 4 items.\n", 243 | "4 passed and 0 failed.\n", 244 | "Test passed.\n" 245 | ] 246 | }, 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "TestResults(failed=0, attempted=4)" 251 | ] 252 | }, 253 | "execution_count": 6, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "# Just run this cell to invoke tests embedded inside function descriptors.\n", 260 | "import doctest\n", 261 | "doctest.testmod(verbose=True)" 262 | ] 263 | } 264 | ], 265 | "metadata": { 266 | "celltoolbar": "Raw Cell Format", 267 | "kernelspec": { 268 | "display_name": "Python 3", 269 | "language": "python", 270 | "name": "python3" 271 | }, 272 | "language_info": { 
273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 3 276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython3", 282 | "version": "3.6.5" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 2 287 | } 288 | -------------------------------------------------------------------------------- /ch4/ball_descend/Simulation_Refactored.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simulation of a Ball's Descend in a Terrain - Refactored Version\n", 8 | "\n", 9 | "This project simulates where a ball will land in a terrain. It simulates the influence of Newton's law of universal gravitation on the movement of a ball, given by the formula $F=g\\frac{m_1m_2}{r^2}$. Here, F is the resulting gravitational pull between the matching objects, $m_1$ and $m_2$ are their masses, r is the distance between the centers of their masses, and g is the gravitational constant.\n", 10 | "\n", 11 | "## Input\n", 12 | "The terrain's configuration is given as a matrix of integers representing elevation at each spot. For simplicity, assume that the terrain is surrounded by a rectangular wall, that prevents the ball to escape. The inner dimensions of the terrain are NxM, where N and M are integers between 3 and 1000.\n", 13 | "\n", 14 | "The ball's initial position is given as a pair of integers (a, b).\n", 15 | "\n", 16 | "## Output\n", 17 | "The result is a list of coordinates denoting the ball's path in a terrain. The first element of the list is the starting position, and the last one is the ending position. It could happen that they are the same, if the ball has emanated from a local minima (dent).\n", 18 | "\n", 19 | "## Rules\n", 20 | "The ball moves according to the next two simple rules:\n", 21 | "- The ball rolls from the current position into the lowest neighboring one.\n", 22 | "- If the ball is surrounded by higher points, then it stops." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Usual bootstrapping code; just run this cell.\n", 32 | "import numpy as np\n", 33 | "from ipywidgets import interact, widgets\n", 34 | "\n", 35 | "from pathfinder import find_path" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "matrix([[-2, 3, 2, 1],\n", 47 | " [-2, 4, 3, 0],\n", 48 | " [-3, 3, 1, -3],\n", 49 | " [-4, 2, -1, 1],\n", 50 | " [-5, -7, 3, 0]])" 51 | ] 52 | }, 53 | "execution_count": 2, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "terrain = np.matrix([\n", 60 | " [-2, 3, 2, 1],\n", 61 | " [-2, 4, 3, 0],\n", 62 | " [-3, 3, 1, -3],\n", 63 | " [-4, 2, -1, 1],\n", 64 | " [-5, -7, 3, 0]\n", 65 | "])\n", 66 | "terrain" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Result\n", 74 | "\n", 75 | "The cell below contains code to invoke the path finding function for a given starting position. The starting coordinates are expected to be correctly set.\n", 76 | "\n", 77 | "The terrain data is repeated here for convenience." 
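As a quick sanity check (a sketch that assumes the import and terrain cells above were run), the default starting position (1, 1) rolls down to the deepest dent at altitude -7:

    >>> find_path(terrain, (1, 1))
    [(1, 1), (2, 0), (3, 0), (4, 1)]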
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "matrix([[-2, 3, 2, 1],\n", 89 | " [-2, 4, 3, 0],\n", 90 | " [-3, 3, 1, -3],\n", 91 | " [-4, 2, -1, 1],\n", 92 | " [-5, -7, 3, 0]])" 93 | ] 94 | }, 95 | "execution_count": 3, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "terrain" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "application/vnd.jupyter.widget-view+json": { 112 | "model_id": "f740b2d719684aadb1d73ed473c65f5c", 113 | "version_major": 2, 114 | "version_minor": 0 115 | }, 116 | "text/plain": [ 117 | "interactive(children=(IntSlider(value=1, description='Start X', max=4), IntSlider(value=1, description='Start …" 118 | ] 119 | }, 120 | "metadata": {}, 121 | "output_type": "display_data" 122 | } 123 | ], 124 | "source": [ 125 | "interact(lambda start_x, start_y: find_path(terrain, (start_x, start_y)),\n", 126 | " start_x = widgets.IntSlider(value=1, max=terrain.shape[0]-1, description='Start X'),\n", 127 | " start_y = widgets.IntSlider(value=1, max=terrain.shape[1]-1, description='Start Y'));" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "celltoolbar": "Raw Cell Format", 133 | "kernelspec": { 134 | "display_name": "Python 3", 135 | "language": "python", 136 | "name": "python3" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.6.6" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 2 153 | } 154 | -------------------------------------------------------------------------------- /ch4/ball_descend/pathfinder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch4/ball_descend/pathfinder/.DS_Store -------------------------------------------------------------------------------- /ch4/ball_descend/pathfinder/__init__.py: -------------------------------------------------------------------------------- 1 | from pathfinder.pathfinder import find_path -------------------------------------------------------------------------------- /ch4/ball_descend/pathfinder/pathfinder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Tuple 3 | 4 | def wall(terrain:np.matrix, position:Tuple[int,int]) -> bool: 5 | """ 6 | Checks whether the provided position is hitting the wall. 7 | 8 | Args: 9 | terrain: the terrain's configuration comprised from integer elevation levels. 10 | position: the pair of integers representing the ball's potential position. 11 | 12 | Output: 13 | True if the position is hitting the wall, or False otherwise. 
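        A position "hits the wall" when either coordinate falls outside the
        terrain's interior, i.e., outside the half-open index ranges [0, N) and [0, M).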
14 | 15 | Examples: 16 | >>> wall(np.matrix([[-2, 3, 2, 1]]), (0, 1)) 17 | False 18 | >>> wall(np.matrix([[-2, 3, 2, 1]]), (-1, 0)) 19 | True 20 | """ 21 | 22 | x, y = position 23 | length, width = terrain.shape 24 | return (x < 0) or (y < 0) or (x >= length) or (y >= width) 25 | 26 | def next_neighbor(terrain:np.matrix, position:Tuple[int,int]) -> Tuple[int,int]: 27 | """ 28 | Returns the position of the lowest neighbor. 29 | 30 | Args: 31 | terrain: the terrain's configuration comprised from integer elevation levels. 32 | position: the pair of integers representing the ball's current position. 33 | 34 | Output: 35 | The position (pair of coordinates) of the lowest neighbor. 36 | 37 | Example: 38 | >>> next_neighbor(np.matrix([[-2, 3, 2, 1]]), (0, 1)) 39 | (0, 0) 40 | """ 41 | 42 | x, y = position 43 | allowed_neighbors = [] 44 | for delta_x in range(-1, 2): 45 | for delta_y in range(-1, 2): 46 | new_position = (x + delta_x, y + delta_y) 47 | if (not wall(terrain, new_position)): 48 | allowed_neighbors.append((terrain.item(new_position), new_position)) 49 | return min(allowed_neighbors)[1] 50 | 51 | def find_path(terrain:np.matrix, position:Tuple[int,int]) -> List[Tuple[int,int]]: 52 | """ 53 | Finds the path that the ball would follow while descending in the terrain. 54 | 55 | Args: 56 | terrain: the terrain's configuration comprised from integer elevation levels. 57 | position: the pair of integers representing the ball's current position. 58 | 59 | Output: 60 | The list of coordinates of the path. 61 | 62 | Example: 63 | >>> find_path(np.matrix([[-2, 3, 2, 1]]), (0, 1)) 64 | [(0, 1), (0, 0)] 65 | """ 66 | 67 | next_position = next_neighbor(terrain, position) 68 | if (position == next_position): 69 | return [position] 70 | else: 71 | return [position] + find_path(terrain, next_position) 72 | 73 | if __name__ == "__main__": 74 | import doctest 75 | doctest.testmod() -------------------------------------------------------------------------------- /ch4/hanoi/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch4/hanoi/.DS_Store -------------------------------------------------------------------------------- /ch4/hanoi/Solver1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "SyntaxError", 10 | "evalue": "EOL while scanning string literal (, line 4)", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m4\u001b[0m\n\u001b[0;31m print('Move top disk from', start, 'to\", end)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m EOL while scanning string literal\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "def solve_tower(num_disks, start, end, extra):\n", 19 | " if (num_disks > 0):\n", 20 | " solve_tower(num_disks - 1, start, extra, end)\n", 21 | " print('Move top disk from', start, 'to\", end)\n", 22 | " solve_tower(num_disks - 1, extra, end, start)\n", 23 | "\n", 24 | "solve_tower(3, 'a', 'c', 'b')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "This error message is an example that Python sometimes wrongly guesses the location of the error. **Different string markers should not be mixed for the same string**." 
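The fix is simply to use the same quote character on both sides, as the later solver versions spell it:

    print('Move top disk from', start, 'to', end)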
32 | ] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.6.5" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 2 56 | } 57 | -------------------------------------------------------------------------------- /ch4/hanoi/Solver3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "def solve_tower(num_disks:int, start:str, end:str, extra:str) -> None:\n", 10 | " \"\"\"\n", 11 | " Solves the Tower of Hanoi puzzle.\n", 12 | " \n", 13 | " Args:\n", 14 | " num_disks: the number of disks to move.\n", 15 | " start: the name of the start pole.\n", 16 | " end: the name of the target pole.\n", 17 | " extra: the name of the temporary pole.\n", 18 | " \n", 19 | " Example:\n", 20 | " >>> solve_tower(3, 'a', 'c', 'b')\n", 21 | " Move top disk from a to c\n", 22 | " Move top disk from a to b\n", 23 | " Move top disk from c to b\n", 24 | " Move top disk from a to c\n", 25 | " Move top disk from b to a\n", 26 | " Move top disk from b to c\n", 27 | " Move top disk from a to c\n", 28 | " >>> solve_tower(-1, 'a', 'c', 'b')\n", 29 | " \"\"\"\n", 30 | " if (num_disks > 0):\n", 31 | " solve_tower(num_disks - 1, start, extra, end)\n", 32 | " print('Move top disk from', start, 'to', end)\n", 33 | " solve_tower(num_disks - 1, extra, end, start)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Trying:\n", 46 | " solve_tower(3, 'a', 'c', 'b')\n", 47 | "Expecting:\n", 48 | " Move top disk from a to c\n", 49 | " Move top disk from a to b\n", 50 | " Move top disk from c to b\n", 51 | " Move top disk from a to c\n", 52 | " Move top disk from b to a\n", 53 | " Move top disk from b to c\n", 54 | " Move top disk from a to c\n", 55 | "ok\n", 56 | "Trying:\n", 57 | " solve_tower(-1, 'a', 'c', 'b')\n", 58 | "Expecting nothing\n", 59 | "ok\n", 60 | "1 items had no tests:\n", 61 | " __main__\n", 62 | "1 items passed all tests:\n", 63 | " 2 tests in __main__.solve_tower\n", 64 | "2 tests in 2 items.\n", 65 | "2 passed and 0 failed.\n", 66 | "Test passed.\n" 67 | ] 68 | }, 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "TestResults(failed=0, attempted=2)" 73 | ] 74 | }, 75 | "execution_count": 2, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "import doctest\n", 82 | "doctest.testmod(verbose=True)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "\u001b[0;31mSignature:\u001b[0m \u001b[0msolve_tower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_disks\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mextra\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m 
\u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 94 | "\u001b[0;31mDocstring:\u001b[0m\n", 95 | "Solves the Tower of Hanoi puzzle.\n", 96 | "\n", 97 | "Args:\n", 98 | "num_disks: the number of disks to move.\n", 99 | "start: the name of the start pole.\n", 100 | "end: the name of the target pole.\n", 101 | "extra: the name of the temporary pole.\n", 102 | "\n", 103 | "Example:\n", 104 | ">>> solve_tower(3, 'a', 'c', 'b')\n", 105 | "Move top disk from a to c\n", 106 | "Move top disk from a to b\n", 107 | "Move top disk from c to b\n", 108 | "Move top disk from a to c\n", 109 | "Move top disk from b to a\n", 110 | "Move top disk from b to c\n", 111 | "Move top disk from a to c\n", 112 | ">>> solve_tower(-1, 'a', 'c', 'b')\n", 113 | "\u001b[0;31mFile:\u001b[0m ~/Projects/pdsp_book/src/ch4/hanoi/\n", 114 | "\u001b[0;31mType:\u001b[0m function\n" 115 | ] 116 | }, 117 | "metadata": {}, 118 | "output_type": "display_data" 119 | } 120 | ], 121 | "source": [ 122 | "solve_tower?" 123 | ] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python 3", 129 | "language": "python", 130 | "name": "python3" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 3 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": "text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython3", 142 | "version": "3.6.5" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 2 147 | } 148 | -------------------------------------------------------------------------------- /ch5/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/interactionlib/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/interactionlib/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/interactionlib/__init__.py: -------------------------------------------------------------------------------- 1 | from interactionlib.interaction_monitor import InteractionMonitor 2 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/interactionlib/interaction_monitor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Monitors whether the user is selecting an area on the image or has chosen the 3 | starting position. 4 | """ 5 | 6 | from ipywidgets import Textarea 7 | import matplotlib.pyplot as plt 8 | 9 | class InteractionMonitor: 10 | """ 11 | Detects mouse events to figure our what is a user doing. 12 | 13 | Args: 14 | fig: the matplotlib figure to monitor. 
15 | info_area: the external informational area whose value needs to be updated. 16 | auto_stop_interaction: should interaction stop (when True) after selecting 17 | the starting position or not. 18 | """ 19 | 20 | def __init__(self, fig: plt.Figure, info_area: Textarea, 21 | auto_stop_interaction: bool = True): 22 | self._fig = fig 23 | self._info_area = info_area 24 | self._auto_stop_interaction = auto_stop_interaction 25 | self._cids = None 26 | self._selecting = False 27 | self._clicked = False 28 | self._clicked_position = None 29 | 30 | def _on_click(self, event): 31 | self._clicked = True 32 | 33 | def _on_release(self, event): 34 | if not self._selecting: 35 | self._clicked_position = (int(event.ydata), int(event.xdata)) 36 | self._info_area.value = str(self._clicked_position) 37 | if self._auto_stop_interaction: 38 | self.stop() 39 | 40 | self._selecting = False 41 | self._clicked = False 42 | 43 | def _on_motion(self, event): 44 | self._selecting = self._clicked 45 | 46 | @property 47 | def clicked_position(self): 48 | """Returns the clicked data position on the map.""" 49 | return self._clicked_position 50 | 51 | def start(self): 52 | """Starts monitoring mouse events on figure.""" 53 | self._cids = [ 54 | self._fig.canvas.mpl_connect('button_press_event', self._on_click), 55 | self._fig.canvas.mpl_connect('button_release_event', self._on_release), 56 | self._fig.canvas.mpl_connect('motion_notify_event', self._on_motion)] 57 | 58 | def stop(self): 59 | """Closes the figure and stops the interaction.""" 60 | plt.close(self._fig) 61 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/pathfinder/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/__init__.py: -------------------------------------------------------------------------------- 1 | from pathfinder.base_pathfinder import BasePathFinder 2 | from pathfinder.simple_pathfinder import SimplePathFinder 3 | from pathfinder.non_recursive_simple_pathfinder import NonRecursiveSimplePathFinder 4 | from pathfinder.parallel_simple_pathfinder import ParallelSimplePathFinder 5 | from pathfinder.pathutils import PathUtils -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/base_pathfinder.py: -------------------------------------------------------------------------------- 1 | """The base class for implementing various path finders.""" 2 | 3 | import abc 4 | from typing import List, Tuple, Set 5 | 6 | import numpy as np 7 | 8 | class BasePathFinder(metaclass=abc.ABCMeta): 9 | """ 10 | Finds the path of a ball that descends in a terrain from some starting 11 | position. 12 | 13 | Args: 14 | terrain: the terrain's configuration comprised from (altitude, slope) 15 | integer pairs. 16 | """ 17 | 18 | def __init__(self, terrain: np.ndarray): 19 | self._terrain = terrain 20 | 21 | @property 22 | def terrain(self): 23 | """Gets the current terrain data.""" 24 | return self._terrain 25 | 26 | def wall(self, position: Tuple[int, int]) -> bool: 27 | """ 28 | Checks whether the provided position is hitting the wall. 
29 | 30 | Args: 31 | position: the pair of integers representing the ball's potential position. 32 | 33 | Output: 34 | True if the position is hitting the wall, or False otherwise. 35 | 36 | Examples: 37 | >>> BasePathFinder.__abstractmethods__ = set() 38 | >>> path_finder = BasePathFinder(np.array([[(-2, 0), (3, 0), (2, 0), (1, 0)]])) 39 | >>> path_finder.wall((0, 1)) 40 | False 41 | >>> BasePathFinder.__abstractmethods__ = set() 42 | >>> path_finder = BasePathFinder(np.array([[(-2, 0), (3, 0), (2, 0), (1, 0)]])) 43 | >>> path_finder.wall((-1, 0)) 44 | True 45 | """ 46 | 47 | curr_x, curr_y = position 48 | length, width = self.terrain.shape[:2] 49 | return (curr_x < 0) or (curr_y < 0) or (curr_x >= length) or (curr_y >= width) 50 | 51 | @abc.abstractmethod 52 | def next_neighbor(self, position: Tuple[int, int], 53 | visited: Set[Tuple[int, int]]) -> Tuple[int, int]: 54 | """ 55 | Returns the position of the lowest neighbor or the current position. 56 | 57 | Args: 58 | position: the pair of integers representing the ball's current position. 59 | visited: the set of visited points. 60 | 61 | Output: 62 | The position (pair of coordinates) of the lowest neighbor. 63 | """ 64 | 65 | @abc.abstractmethod 66 | def find_path(self, position: Tuple[int, int], 67 | visited: Set[Tuple[int, int]]) -> List[Tuple[int, int]]: 68 | """ 69 | Finds the path that the ball would follow while descending in the terrain. 70 | 71 | Args: 72 | position: the pair of integers representing the ball's current position. 73 | visited: the set of visited points (may be preset to avoid certain points). 74 | 75 | Output: 76 | The list of coordinates of the path. 77 | """ 78 | 79 | def find_paths(self, positions: List[Tuple[int, int]]) -> List[List[Tuple[int, int]]]: 80 | """ 81 | Finds paths for all provided starting positions. 82 | 83 | Args: 84 | positions: the list of positions to for which to calculate path. 85 | 86 | Output: 87 | The list of paths in the same order as positions. 88 | """ 89 | 90 | return [self.find_path(position, None) for position in positions] 91 | 92 | if __name__ == "__main__": 93 | import doctest 94 | doctest.testmod() 95 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/non_recursive_simple_pathfinder.py: -------------------------------------------------------------------------------- 1 | """Simple non-recursive path finder implementation.""" 2 | 3 | from typing import List, Tuple, Set 4 | from pathfinder.simple_pathfinder import SimplePathFinder 5 | 6 | class NonRecursiveSimplePathFinder(SimplePathFinder): 7 | """Concrete path finder that doesn't use recursion.""" 8 | 9 | def find_path(self, position: Tuple[int, int], 10 | visited: Set[Tuple[int, int]] = None) -> List[Tuple[int, int]]: 11 | """ 12 | Iteratively finds the path (without using recursion). 
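        Unlike the recursive variant, this version cannot exhaust Python's call
        stack; CPython's default recursion limit is roughly 1000 frames, which a
        long descent across a large terrain could plausibly exceed.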
13 | 14 | Example: 15 | >>> path_finder = NonRecursiveSimplePathFinder(np.array([[(-1, 2), (-2, 1), (-2, 2), (1, 0)]])) 16 | >>> path_finder.find_path((0, 2)) 17 | [(0, 2), (0, 1)] 18 | """ 19 | 20 | if visited is None: 21 | visited = set() 22 | visited.add(position) 23 | calculated_path = [position] 24 | next_position = self.next_neighbor(position, visited) 25 | 26 | while position != next_position: 27 | position = next_position 28 | visited.add(position) 29 | calculated_path.append(position) 30 | next_position = self.next_neighbor(position, visited) 31 | 32 | return calculated_path 33 | 34 | if __name__ == "__main__": 35 | import doctest 36 | doctest.testmod() 37 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/parallel_simple_pathfinder.py: -------------------------------------------------------------------------------- 1 | """Efficient parallel version of the path finder system.""" 2 | 3 | from typing import Tuple, Set 4 | 5 | import numpy as np 6 | from numba import jit 7 | from pathfinder.non_recursive_simple_pathfinder import NonRecursiveSimplePathFinder 8 | 9 | class ParallelSimplePathFinder(NonRecursiveSimplePathFinder): 10 | """Concrete path finder that uses Numba to perform operations in parallel.""" 11 | 12 | @staticmethod 13 | @jit(nopython=True, parallel=True, cache=True) 14 | def _best_neighbor(terrain: np.ndarray, position: Tuple[int, int]) -> Tuple[int, int]: 15 | curr_x, curr_y = position 16 | length, width = terrain.shape[:2] 17 | current_slope = terrain[position][1] 18 | min_altitude = terrain[position][0] 19 | min_position = position 20 | 21 | for delta_x in range(-1, 2): 22 | for delta_y in range(-1, 2): 23 | new_position = (curr_x + delta_x, curr_y + delta_y) 24 | new_x, new_y = new_position 25 | if not ((new_x < 0) or 26 | (new_y < 0) or 27 | (new_x >= length) or 28 | (new_y >= width)) and not new_position == position: 29 | new_altitude = terrain[new_position][0] 30 | if new_altitude < min_altitude or (new_altitude == min_altitude and 31 | current_slope > 0): 32 | min_altitude = new_altitude 33 | min_position = new_position 34 | return min_position 35 | 36 | def next_neighbor(self, position: Tuple[int, int], 37 | visited: Set[Tuple[int, int]]) -> Tuple[int, int]: 38 | """ 39 | Uses a vectorized clockwise search of neighbors starting at south-west. 40 | 41 | Example: 42 | >>> terrain = np.array([[(-2, 0), (2, 0), (2, 1), (3, 1)]]) 43 | >>> path_finder = ParallelSimplePathFinder(terrain) 44 | >>> path_finder.next_neighbor((0, 2), set((0, 2))) 45 | (0, 1) 46 | """ 47 | 48 | best_neighbor = ParallelSimplePathFinder._best_neighbor(self.terrain, position) 49 | if not best_neighbor in visited: 50 | return best_neighbor 51 | return position 52 | 53 | if __name__ == "__main__": 54 | import doctest 55 | doctest.testmod() 56 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/pathutils.py: -------------------------------------------------------------------------------- 1 | """Contains various path related utility classes and methods.""" 2 | 3 | from typing import List, Tuple 4 | 5 | import numpy as np 6 | 7 | class PathUtils: 8 | """Encompasses static methods to handle paths.""" 9 | 10 | @staticmethod 11 | def encode_path(terrain: np.ndarray, descend_path: List[Tuple[int, int]]) -> np.ndarray: 12 | """ 13 | Encodes the path into the terrain by setting the points's 3rd (blue) component to 255. 
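        Since 255 is the maximum 8-bit channel intensity, the encoded path shows
        up as pure blue when the array is rendered as an RGB image.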
14 | 15 | Args: 16 | terrain: the terrain's configuration comprised from (altitude, slope, [aspect]) 17 | integer pairs/triples. 18 | 19 | Output: 20 | New terrain with an extra 3rd dimension to encode the path. 21 | 22 | Example: 23 | >>> terrain = np.array([[(-1, 2), (-2, 1), (-2, 2), (1, 0)]]) 24 | >>> PathUtils.encode_path(terrain, [(0, 2), (0, 1)]) 25 | array([[[ -1, 2, 0], 26 | [ -2, 1, 255], 27 | [ -2, 2, 255], 28 | [ 1, 0, 0]]]) 29 | """ 30 | 31 | # Expand terrain with an extra dimension, as needed. 32 | if terrain.shape[2] == 2: 33 | new_shape = terrain.shape[:2] + (3,) 34 | new_terrain = np.zeros(new_shape, terrain.dtype) 35 | new_terrain[:terrain.shape[0], :terrain.shape[1], :2] = terrain 36 | else: 37 | new_terrain = np.copy(terrain) 38 | 39 | for point in descend_path: 40 | new_terrain[point][2] = 255 41 | return new_terrain 42 | 43 | @staticmethod 44 | def decode_path(terrain: np.ndarray) -> List[Tuple[int, int]]: 45 | """ 46 | Decodes the path from the terrain by picking points's whose 3rd (blue) component is 255. 47 | The reconstructed path may not be unique, which depends upon the path finder logic. 48 | 49 | Args: 50 | terrain: the terrain's configuration encoded with a single path. 51 | 52 | Output: 53 | The decoded path that is guaranteed to contain all points of the encoded path. 54 | Ordering of points may differ from what was reported by the matching path finder. 55 | """ 56 | 57 | # Extra exercise to implement this method according to the specification. 58 | raise NotImplementedError 59 | 60 | if __name__ == "__main__": 61 | import doctest 62 | doctest.testmod() 63 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/pathfinder/simple_pathfinder.py: -------------------------------------------------------------------------------- 1 | """Simple recursive path finder implementation.""" 2 | 3 | from typing import List, Tuple, Set 4 | from pathfinder.base_pathfinder import BasePathFinder 5 | 6 | class SimplePathFinder(BasePathFinder): 7 | """Concrete path finder that uses recursion and is sequential.""" 8 | 9 | def next_neighbor(self, position: Tuple[int, int], 10 | visited: Set[Tuple[int, int]]) -> Tuple[int, int]: 11 | """ 12 | Uses a simple clockwise search of neighbors starting at south-west. 13 | 14 | Example: 15 | >>> path_finder = SimplePathFinder(np.array([[(-2, 0), (3, 0), (2, 0), (1, 0)]])) 16 | >>> path_finder.next_neighbor((0, 1), set((0, 1))) 17 | (0, 0) 18 | """ 19 | 20 | curr_x, curr_y = position 21 | current_slope = self.terrain[position][1] 22 | min_altitude = self.terrain[position][0] 23 | min_position = position 24 | for delta_x in range(-1, 2): 25 | for delta_y in range(-1, 2): 26 | new_position = (curr_x + delta_x, curr_y + delta_y) 27 | if not self.wall(new_position) and not new_position in visited: 28 | new_altitude = self.terrain[new_position][0] 29 | if new_altitude < min_altitude or (new_altitude == min_altitude and 30 | current_slope > 0): 31 | min_altitude = new_altitude 32 | min_position = new_position 33 | return min_position 34 | 35 | def find_path(self, position: Tuple[int, int], 36 | visited: Set[Tuple[int, int]] = None) -> List[Tuple[int, int]]: 37 | """ 38 | Recursively finds the path. 
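        Every visited position is recorded along the way, so plateaus (runs of
        equal altitude) cannot trap the search in an endless back-and-forth loop.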
39 | 40 | Example: 41 | >>> path_finder = SimplePathFinder(np.array([[(-1, 2), (-2, 1), (-2, 2), (1, 0)]])) 42 | >>> path_finder.find_path((0, 2)) 43 | [(0, 2), (0, 1)] 44 | """ 45 | 46 | if visited is None: 47 | visited = set() 48 | visited.add(position) 49 | next_position = self.next_neighbor(position, visited) 50 | if position == next_position: 51 | return [position] 52 | return [position] + self.find_path(next_position, visited) 53 | 54 | if __name__ == "__main__": 55 | import doctest 56 | doctest.testmod() 57 | -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/terrain_data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/terrain_data/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/terrain_data/coastline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/terrain_data/coastline.jpg -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/terrain_data/coastline_with_path.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/terrain_data/coastline_with_path.jpg -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/testutils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch5/augmented_ball_descend/testutils/.DS_Store -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/testutils/__init__.py: -------------------------------------------------------------------------------- 1 | from testutils.create_terrain import create_test_terrain -------------------------------------------------------------------------------- /ch5/augmented_ball_descend/testutils/create_terrain.py: -------------------------------------------------------------------------------- 1 | """Creates a degenerate terrain for measuring various running times.""" 2 | 3 | import numpy as np 4 | 5 | def create_test_terrain(n: int) -> np.ndarray: 6 | """Creates a square maze-like terrain with alleys of decreasing altitude. 7 | 8 | Args: 9 | n: number of rows and columns of a terrain 10 | 11 | Output: 12 | The test terrain of proper size. 
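        Altitudes increase along a serpentine corridor, while every other row is
        blocked by a wall of the maximal value n*n with a single gap at one end,
        so a ball released at the highest cell must snake through every open
        alley before settling at altitude 0.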
13 | 14 | Example: 15 | >>> terrain = create_test_terrain(9) 16 | >>> terrain[:, :, 0] 17 | array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8], 18 | [81, 81, 81, 81, 81, 81, 81, 81, 17], 19 | [26, 25, 24, 23, 22, 21, 20, 19, 18], 20 | [27, 81, 81, 81, 81, 81, 81, 81, 81], 21 | [36, 37, 38, 39, 40, 41, 42, 43, 44], 22 | [81, 81, 81, 81, 81, 81, 81, 81, 53], 23 | [62, 61, 60, 59, 58, 57, 56, 55, 54], 24 | [63, 81, 81, 81, 81, 81, 81, 81, 81], 25 | [72, 73, 74, 75, 76, 77, 78, 79, 80]]) 26 | """ 27 | 28 | size = n * n 29 | terrain = np.zeros((n, n, 2), dtype=int) 30 | terrain[:, :, 0] = np.arange(0, size).reshape((n, n)) 31 | 32 | # Reverse every 4th row to have proper ordering of elements. 33 | for i in range(2, n, 4): 34 | terrain[i, :, 0] = np.flip(terrain[i, :, 0]) 35 | 36 | # Create "walls" inside the terrain. 37 | for i in range(1, n, 4): 38 | terrain[i, :-1, 0] = size 39 | for i in range(3, n, 4): 40 | terrain[i, 1:, 0] = size 41 | 42 | return terrain 43 | 44 | if __name__ == "__main__": 45 | import doctest 46 | doctest.testmod() 47 | -------------------------------------------------------------------------------- /ch6/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch6/.DS_Store -------------------------------------------------------------------------------- /ch6/anscombe/anscombe_altair.py: -------------------------------------------------------------------------------- 1 | import altair as alt 2 | from vega_datasets import data 3 | 4 | source = data.anscombe() 5 | 6 | base = alt.Chart( 7 | source, title = "Anscombe's Quartets" 8 | ).mark_circle(color = 'red').encode( 9 | alt.X('X', scale = alt.Scale(zero = True)), 10 | alt.Y('Y', scale = alt.Scale(zero = True)), 11 | column = 'Series' 12 | ).properties( 13 | width = 150, 14 | height = 150 15 | ).interactive() 16 | 17 | base 18 | -------------------------------------------------------------------------------- /ch6/anscombe/anscombe_matplotlib.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | quartets = np.asarray([ 5 | ( 6 | [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], 7 | [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68] 8 | ), 9 | ( 10 | [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], 11 | [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74] 12 | ), 13 | ( 14 | [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0], 15 | [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73] 16 | ), 17 | ( 18 | [8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 19.0, 8.0, 8.0, 8.0], 19 | [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89] 20 | ) 21 | ]) 22 | 23 | roman = ['I', 'II', 'III', 'IV'] 24 | 25 | fig = plt.figure(figsize = (12, 9)) 26 | fig.suptitle("Anscombe's Quartets", fontsize=16) 27 | axes = fig.subplots(2, 2, sharex = True, sharey = True) 28 | 29 | for quartet in range(quartets.shape[0]): 30 | x, y = quartets[quartet] 31 | coef = np.polyfit(x, y, 1) 32 | reg_line = np.poly1d(coef) 33 | 34 | ax = axes[quartet // 2, quartet % 2] 35 | ax.plot(x, y, 'ro', x, reg_line(x), '--k') 36 | ax.set_title(roman[quartet]) 37 | ax.set_xlim(3, 19.5) 38 | ax.set_ylim(2, 13) 39 | 40 | # Print summary statistics for the current dataset 41 | print("Quartet:", roman[quartet]) 42 | print("Mean X:", x.mean()) 43 | print("Variance 
X:", x.var()) 44 | print("Mean Y:", round(y.mean(), 2)) 45 | print("Variance Y:", round(y.var(), 2)) 46 | print("Pearson's correlation coef.:", round(np.corrcoef(x, y)[0][1], 2)) 47 | print() 48 | 49 | plt.show() -------------------------------------------------------------------------------- /ch6/closest_pair/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch6/closest_pair/.DS_Store -------------------------------------------------------------------------------- /ch6/closest_pair/__init__.py: -------------------------------------------------------------------------------- 1 | from closest_pair.base_closest_pair import BaseClosestPair 2 | from closest_pair.naive_closest_pair import NaiveClosestPair 3 | from closest_pair.fast_closest_pair import FastClosestPair -------------------------------------------------------------------------------- /ch6/closest_pair/base_closest_pair.py: -------------------------------------------------------------------------------- 1 | """The base class for implementing various variants to find the closest pair.""" 2 | 3 | import abc 4 | from typing import Tuple, Callable, TypeVar, Sequence, Generic 5 | import numpy as np 6 | 7 | Coordinates = TypeVar('Coordinates', Sequence[int], np.ndarray) 8 | 9 | class BaseClosestPair(Generic[Coordinates], metaclass=abc.ABCMeta): 10 | """ 11 | Finds the closest pair among 2D points given by their x and y coordinates. The 12 | distance is by default defined as a standard Euclidian distance. 13 | 14 | Args: 15 | x: the list of x coordinates of all points. 16 | y: the list of y coordinates of all points. The ordering of elements matches 17 | the list of x coordinates, i.e., the ith point is specified as (x[i], y[i]). 18 | """ 19 | 20 | _x: Coordinates 21 | _y: Coordinates 22 | 23 | def __init__(self, x: Coordinates, y: Coordinates): 24 | assert len(x) >= 2 and len(x) == len(y) 25 | self._x = x 26 | self._y = y 27 | 28 | @property 29 | def x(self) -> Coordinates: 30 | """Gets the x coordinates of points.""" 31 | return self._x 32 | 33 | @property 34 | def y(self) -> Coordinates: 35 | """Gets the y coordinates of points.""" 36 | return self._y 37 | 38 | @staticmethod 39 | def load_from_stdin() -> Tuple[Coordinates, Coordinates]: 40 | """ 41 | Loads points from standard input by enumarating x and y coordinates in succession. 42 | Each datum must be separated with space. 43 | 44 | Output: 45 | The tuple of x and y coordinates. 46 | """ 47 | 48 | import sys 49 | 50 | data = sys.stdin.read() 51 | points = list(map(int, data.split())) 52 | x = points[1::2] 53 | y = points[2::2] 54 | return x, y 55 | 56 | @staticmethod 57 | def generate_points(n: int, seed: int) -> Tuple[Coordinates, Coordinates]: 58 | """ 59 | Generates random points for stress testing. 60 | 61 | Output: 62 | The tuple of x and y coordinates. 63 | 64 | Examples: 65 | >>> BaseClosestPair.generate_points(3, 10) 66 | ([227077737, -930024104, -78967768], [36293302, 241441628, -968147565]) 67 | """ 68 | 69 | import random 70 | 71 | assert n >= 2 72 | random.seed(seed) 73 | x = [random.randint(-10**9, 10**9) for _ in range(n)] 74 | y = [random.randint(-10**9, 10**9) for _ in range(n)] 75 | 76 | return x, y 77 | 78 | @staticmethod 79 | def distance(x1: int, x2: int, y1: int, y2: int) -> float: 80 | """ 81 | Returns the Euclidian distance between two points. 
82 | 
83 |         Args:
84 |         x1: the x coordinate of the first point.
85 |         x2: the x coordinate of the second point.
86 |         y1: the y coordinate of the first point.
87 |         y2: the y coordinate of the second point.
88 | 
89 |         Output:
90 |         The distance between points defined as the square root of the sum of squared
91 |         differences of the matching coordinates.
92 | 
93 |         Examples:
94 |         >>> BaseClosestPair.distance(1, 2, 1, 2)
95 |         1.4142135623730951
96 |         >>> BaseClosestPair.distance(1, 1, 1, 1)
97 |         0.0
98 |         """
99 | 
100 |         from math import sqrt
101 | 
102 |         return sqrt((x1 - x2)**2 + (y1 - y2)**2)
103 | 
104 |     @abc.abstractmethod
105 |     def closest_pair(self, distance: Callable[[int, int, int, int], float]) -> Tuple[int, int, float]:
106 |         """
107 |         Returns the tuple with indexes of the two closest points as well as
108 |         their distance.
109 | 
110 |         Args:
111 |         distance: the function that receives four parameters (x1, x2, y1, y2) and
112 |         returns the distance between these points.
113 |         """
114 | 
115 | if __name__ == "__main__":
116 |     import doctest
117 |     doctest.testmod()
118 | 
-------------------------------------------------------------------------------- /ch6/closest_pair/fast_closest_pair.py: --------------------------------------------------------------------------------
1 | """Fast implementation of the closest pair algorithm."""
2 | 
3 | from typing import List, Tuple, Callable
4 | from closest_pair.base_closest_pair import Coordinates, BaseClosestPair
5 | 
6 | class FastClosestPair(BaseClosestPair):
7 |     _y_prime: List[int]
8 | 
9 |     def _argsort_y(self) -> List[int]:
10 |         """Finds the permutation of indices that arranges points by y coordinate."""
11 | 
12 |         return [t[0] for t in sorted(enumerate(self.y), key = lambda t: t[1])]
13 | 
14 |     def _get_x(self, i: int, s: List[int]) -> int:
15 |         return self.x[self._y_prime[s[i]]]
16 | 
17 |     def _get_y(self, i: int, s: List[int]) -> int:
18 |         return self.y[self._y_prime[s[i]]]
19 | 
20 |     def __init__(self, x: Coordinates, y: Coordinates):
21 |         super().__init__(x, y)
22 |         self._y_prime = self._argsort_y()
23 | 
24 |     def _selection(self, s: List[int], k: int) -> int:
25 |         """Returns the x value of the kth smallest point by x coordinate contained in s."""
26 | 
27 |         def split(v: int) -> Tuple[List[int], List[int], List[int]]:
28 |             """Indirectly splits points in-place around value v into 3 sets (left, equal, and right)."""
29 | 
30 |             store = 0
31 |             sl_idx = 0
32 |             for i in range(len(s)):
33 |                 if self._get_x(i, s) < v:
34 |                     s[i], s[store] = s[store], s[i]
35 |                     store += 1
36 |             sl_idx = store
37 |             for i in range(store, len(s)):
38 |                 if self._get_x(i, s) == v:
39 |                     s[i], s[store] = s[store], s[i]
40 |                     store += 1
41 |             return (s[:sl_idx], s[sl_idx:store], s[store:])
42 | 
43 |         import random
44 | 
45 |         v_idx = random.randrange(len(s))
46 |         v = self._get_x(v_idx, s)
47 |         sl, sv, sr = split(v)
48 |         sl_size = len(sl)
49 |         sv_size = len(sv)
50 | 
51 |         if k <= sl_size:
52 |             return self._selection(sl, k)
53 |         if k > sl_size and k <= sl_size + sv_size:
54 |             return self._get_x(-1, sv)
55 |         return self._selection(sr, k - sl_size - sv_size)
56 | 
57 |     @staticmethod
58 |     def _merge(sl: List[int], sr: List[int]) -> List[int]:
59 |         """
60 |         Merges the two sorted sublists into a new sorted list. The temporary
61 |         storage may be allocated upfront as a further optimization.
62 | """ 63 | 64 | sl_size = len(sl) 65 | sr_size = len(sr) 66 | s = [0] * (sl_size + sr_size) 67 | k = 0 68 | i = 0 69 | j = 0 70 | 71 | while i < sl_size and j < sr_size: 72 | if sl[i] <= sr[j]: 73 | s[k] = sl[i] 74 | k += 1 75 | i += 1 76 | else: 77 | s[k] = sr[j] 78 | k += 1 79 | j += 1 80 | while i < sl_size: 81 | s[k] = sl[i] 82 | k += 1 83 | i += 1 84 | while j < sr_size: 85 | s[k] = sr[j] 86 | k += 1 87 | j += 1 88 | 89 | return s 90 | 91 | def closest_pair(self, 92 | distance: Callable[[int, int, int, int], float] = BaseClosestPair.distance 93 | ) -> Tuple[int, int, float]: 94 | """ 95 | Computes the minimum distance in O(n*log n) time. 96 | 97 | Examples: 98 | >>> x = [0, 3, 100] 99 | >>> y = [0, 4, 110] 100 | >>> fcp = FastClosestPair(x, y) 101 | >>> fcp.closest_pair() 102 | (0, 1, 5.0) 103 | """ 104 | 105 | from math import inf 106 | 107 | def filter_points(s: List[int], d: float, x: int) -> List[int]: 108 | """Returns the list of point indexes that fall inside the [x-d, x+d] interval.""" 109 | 110 | return [s[i] for i in range(len(s)) if abs(self._get_x(i, s) - x) <= d] 111 | 112 | def find_nearest_neighbor(i: int, s: List[int]) -> Tuple[float, int, int]: 113 | """ 114 | Finds the minimum distance between the current point i and next 7 seven 115 | subsequent points by y coordinate. 116 | """ 117 | 118 | curr_x = self._get_x(i, s) 119 | curr_y = self._get_y(i, s) 120 | d = inf 121 | min_idx = i 122 | 123 | for j in range(i + 1, min(len(s), i + 7 + 1)): 124 | curr_d = distance(curr_x, self._get_x(j, s), curr_y, self._get_y(j, s)) 125 | if curr_d < d: 126 | d = curr_d 127 | min_idx = j 128 | return d, s[i], s[min_idx] 129 | 130 | def find_minimum_distance(s: List[int]) -> Tuple[int, int, float]: 131 | """Main driver function to find the closest pair.""" 132 | 133 | if len(s) == 1: 134 | # We will treat the distance from a single point as infinite. 135 | return s[0], -1, inf 136 | if len(s) == 2: 137 | return s[0], s[1], distance(self._get_x(0, s), 138 | self._get_x(1, s), 139 | self._get_y(0, s), 140 | self._get_y(1, s)) 141 | 142 | # This is the median value of input array x in regard of s. 143 | median_x = self._selection(s.copy(), len(s) // 2) 144 | 145 | # Separate points around median. 146 | sl = [] 147 | sr = [] 148 | for i in range(len(s)): 149 | if self._get_x(i, s) <= median_x: 150 | sl.append(s[i]) 151 | else: 152 | sr.append(s[i]) 153 | 154 | # Find minimum distances in left and right groups. 155 | p_l, q_l, d_l = find_minimum_distance(sl) 156 | p_r, q_r, d_r = find_minimum_distance(sr) 157 | if d_l < d_r: 158 | p_min, q_min = p_l, q_l 159 | d = d_l 160 | else: 161 | p_min, q_min = p_r, q_r 162 | d = d_r 163 | 164 | # Merge left and right indices keeping their sorted order. 165 | sm = FastClosestPair._merge(sl, sr) 166 | 167 | # Find the minimum distance inside the middle strip. 168 | sf = filter_points(sm, d, median_x) 169 | 170 | # Find the final minimum distance amond three groups (left, middle, and right). 171 | d_m, p_m, q_m = min([find_nearest_neighbor(i, sf) for i in range(len(sf))]) 172 | if d_m < d: 173 | return p_m, q_m, d_m 174 | else: 175 | return p_min, q_min, d 176 | 177 | p, q, d = find_minimum_distance(list(range(len(self._y_prime)))) 178 | # We need to map back the point indices into their original base. 
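        # For instance (a purely hypothetical run), with _y_prime == [2, 0, 1] and the
        # recursion reporting p = 0 and q = 1, the winners are original points 2 and 0.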
179 | return self._y_prime[p], self._y_prime[q], d 180 | 181 | if __name__ == "__main__": 182 | import doctest 183 | doctest.testmod() 184 | 185 | -------------------------------------------------------------------------------- /ch6/closest_pair/naive_closest_pair.py: -------------------------------------------------------------------------------- 1 | """Naive implementation of the closest pair algorithm.""" 2 | 3 | from typing import Tuple, Callable 4 | from closest_pair.base_closest_pair import BaseClosestPair 5 | 6 | class NaiveClosestPair(BaseClosestPair): 7 | def closest_pair(self, distance: Callable[[int, int, int, int], float] = BaseClosestPair.distance 8 | ) -> Tuple[int, int, float]: 9 | """ 10 | Iterates over all pairs and computes their distances. 11 | 12 | Examples: 13 | >>> x = [0, 3, 100] 14 | >>> y = [0, 4, 110] 15 | >>> ncp = NaiveClosestPair(x, y) 16 | >>> ncp.closest_pair() 17 | (0, 1, 5.0) 18 | """ 19 | 20 | from math import inf 21 | 22 | n = len(self.x) 23 | min_distance = inf 24 | for i in range(n - 1): 25 | for j in range(i + 1, n): 26 | d = distance(self.x[i], self.x[j], self.y[i], self.y[j]) 27 | if d < min_distance: 28 | min_distance = d 29 | p_i = i 30 | p_j = j 31 | 32 | return p_i, p_j, min_distance 33 | 34 | if __name__ == "__main__": 35 | import doctest 36 | doctest.testmod() 37 | -------------------------------------------------------------------------------- /ch6/temp_plots/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch6/temp_plots/.DS_Store -------------------------------------------------------------------------------- /ch6/temp_plots/GHCND_sample_csv.csv: -------------------------------------------------------------------------------- 1 | STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,TMAX,TMIN,PRCP 2 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100101,-178,-311,0 3 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100102,-244,-322,0 4 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100103,-194,-289,0 5 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100104,-167,-200,15 6 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100105,-133,-167,9999 7 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100106,-133,-172,9999 8 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100107,-150,-278,0 9 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100108,-233,-328,0 10 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100109,-233,-322,0 11 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100110,-117,-244,0 12 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100111,-67,-128,0 13 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100112,-78,-122,0 14 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100113,-17,-89,0 15 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100114,39,-72,0 16 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100115,-67,-72,0 17 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100116,22,-50,0 18 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100117,33,-44,0 19 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100118,6,-172,0 20 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100119,-56,-183,0 
21 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100120,-67,-139,0 22 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100121,-67,-94,25 23 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100122,-44,-67,0 24 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100123,-6,-44,0 25 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100124,0,-11,0 26 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100125,-11,-161,0 27 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100126,-161,-233,0 28 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100127,-167,-222,0 29 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100128,-167,-283,0 30 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100129,-189,-283,0 31 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100130,-156,-267,0 32 | GHCND:USC00327027,PETERSBURG 2 N ND US,466.3,48.0355,-98.01,20100131,-150,-272,0 33 | -------------------------------------------------------------------------------- /ch6/temp_plots/plot_stations.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import mplleaflet 3 | 4 | def plot_stations(longitudes, latitudes, embedded = False): 5 | if embedded: 6 | plt.figure(figsize = (8, 8)) 7 | plt.scatter(longitudes, latitudes, 8 | c = 'b', 9 | marker = 'D', 10 | alpha = 0.7, 11 | s = 200) 12 | return mplleaflet.display() if embedded else mplleaflet.show() 13 | -------------------------------------------------------------------------------- /ch6/temp_plots/plot_temps.py: -------------------------------------------------------------------------------- 1 | from matplotlib.ticker import MultipleLocator 2 | 3 | def plot_temps(df, min_temp, max_temp, extreme_low_temps, extreme_high_temps): 4 | ax1 = df.plot.line(y = ['TMAX', 'TMIN'], 5 | figsize = (12, 9), 6 | ylim = (1.3 * min_temp, 1.3 * max_temp), 7 | rot = 45, fontsize = 12, style = ['-r', '-b'], linewidth = 0.6, 8 | legend = False, 9 | x_compat = True) 10 | ax1.lines[0].set_label('Max. temperature') 11 | ax1.lines[-1].set_label('Min. 
temperature')
12 |     ax1.set_title('Low and High Temperatures in January 2010\nNorth Dakota, United States',
13 |                   fontsize = 20, y = 1.06)
14 |     ax1.set_xlabel('Date', fontsize = 14, labelpad = 15)
15 |     ax1.set_ylabel('Temperature [\u2103]', fontsize = 14)
16 |     ax1.spines['right'].set_visible(False)
17 |     ax1.spines['top'].set_visible(False)
18 |     ax1.yaxis.set_minor_locator(MultipleLocator(5))
19 |     ax1.fill_between(df.index, df['TMAX'], df['TMIN'],
20 |                      facecolor = 'lightgray', alpha = 0.25)
21 | 
22 |     def celsius_to_fahrenheit(temp):
23 |         return 1.8 * temp + 32
24 | 
25 |     ax2 = ax1.twinx()
26 |     y_min, y_max = ax1.get_ylim()
27 |     ax2.set_ylim(celsius_to_fahrenheit(y_min), celsius_to_fahrenheit(y_max))
28 |     ax2.set_ylabel('Temperature [\u2109]', fontsize = 14, labelpad = 15)
29 |     ax2.spines['top'].set_visible(False)
30 |     ax2.yaxis.set_minor_locator(MultipleLocator(5))
31 |     for label in ax2.get_yticklabels():
32 |         label.set_fontsize(12)
33 | 
34 |     ax1.scatter(extreme_low_temps.index, extreme_low_temps,
35 |                 color = 'blue', marker = 'v', s = 100,
36 |                 label = 'Unusually low temperatures')
37 |     ax1.scatter(extreme_high_temps.index, extreme_high_temps,
38 |                 color = 'red', marker = '^', s = 100,
39 |                 label = 'Unusually high temperatures')
40 |     ax1.legend(loc = 4, frameon = False, title = 'Legend')
41 | 
--------------------------------------------------------------------------------
/ch6/temp_plots/temp_visualization_demo.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | df = pd.read_csv('GHCND_sample_csv.csv',
4 |                  usecols = [3, 4, 5, 6, 7],
5 |                  index_col = 2,
6 |                  parse_dates = True,
7 |                  infer_datetime_format = True)
8 | df['TMIN'] = df['TMIN'] / 10
9 | df['TMAX'] = df['TMAX'] / 10
10 | print(df.head())
11 | 
12 | from plot_stations import plot_stations
13 | plot_stations(df['LONGITUDE'].tolist()[0], df['LATITUDE'].tolist()[0])
14 | 
15 | min_temp = df['TMIN'].min()
16 | max_temp = df['TMAX'].max()
17 | print("\nMinimum temperature: %g\nMaximum temperature: %g\n" % (min_temp, max_temp))
18 | 
19 | LIMIT_HIGH = 0
20 | LIMIT_LOW = -30
21 | 
22 | extreme_high_temps = df['TMAX'][df['TMAX'] > LIMIT_HIGH]
23 | extreme_low_temps = df['TMIN'][df['TMIN'] < LIMIT_LOW]
24 | 
25 | print('Extreme low temperatures\n', extreme_low_temps)
26 | print('\nExtreme high temperatures\n', extreme_high_temps)
27 | 
28 | from plot_temps import plot_temps
29 | plot_temps(df, min_temp, max_temp, extreme_low_temps, extreme_high_temps)
30 | 
--------------------------------------------------------------------------------
/ch7/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch7/.DS_Store
--------------------------------------------------------------------------------
/ch7/core_concepts/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch7/core_concepts/.DS_Store
--------------------------------------------------------------------------------
/ch7/core_concepts/data_generator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Produces features and outputs based upon various criteria by simulating
5 | fake "real world" processes.
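The returned frame holds five related columns: x_normal (Gaussian), x_uniform,
x_interacting (their product), x_combined (scaled x_normal plus exponential noise),
and x_collinear (an exact multiple of x_combined).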
6 | 7 | @author: Ervin Varga 8 | """ 9 | import numpy as np 10 | import pandas as pd 11 | 12 | def generate_base_features(sample_size): 13 | x_normal = np.random.normal(6, 9, sample_size) 14 | x_uniform = np.random.uniform(0, 1, sample_size) 15 | x_interacting = x_normal * x_uniform 16 | x_combined = 3.6 * x_normal + np.random.exponential(2/3, sample_size) 17 | x_collinear = 5.6 * x_combined 18 | 19 | features = { 20 | 'x_normal': x_normal, 21 | 'x_uniform': x_uniform, 22 | 'x_interacting': x_interacting, 23 | 'x_combined': x_combined, 24 | 'x_collinear': x_collinear 25 | } 26 | return pd.DataFrame.from_dict(features) 27 | 28 | def identity(x): 29 | return x 30 | 31 | def generate_response(X, error_spread, beta, f=identity): 32 | error = np.random.normal(0, error_spread, (X.shape[0], 1)) 33 | intercept = beta[0] 34 | coef = np.array(beta[1:]).reshape(X.shape[1], 1) 35 | return f(intercept + np.dot(X, coef)) + error -------------------------------------------------------------------------------- /ch7/core_concepts/observer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Contains functions to recover parameters and demonstrate various effects 5 | pertaining to training, testing, and evaluation. 6 | 7 | @author: Ervin Varga 8 | """ 9 | import numpy as np 10 | import pandas as pd 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | 14 | plt.style.use('seaborn-whitegrid') 15 | 16 | def train_model(model, X_train, y_train): 17 | model.fit(X_train, y_train) 18 | 19 | def evaluate_model(model, X_test, y_test, plot_residuals=False, title=''): 20 | from sklearn.metrics import mean_squared_error, explained_variance_score 21 | 22 | y_pred = model.predict(X_test) 23 | 24 | if plot_residuals: 25 | _, ax = plt.subplots(figsize=(9, 9)) 26 | ax.set_title('Residuals Plot - ' + title, fontsize=19) 27 | ax.set_xlabel('Predicted values', fontsize=15) 28 | ax.set_ylabel('Residuals', fontsize=15) 29 | sns.residplot(y_pred.squeeze(), y_test.squeeze(), 30 | lowess=True, 31 | ax=ax, 32 | scatter_kws={'alpha': 0.3}, 33 | line_kws={'color': 'black', 'lw': 2, 'ls': '--'}) 34 | 35 | metrics = { 36 | 'explained_variance': explained_variance_score(y_test, y_pred), 37 | 'mse': mean_squared_error(y_test, y_pred) 38 | } 39 | return metrics 40 | 41 | def make_poly_pipeline(model, degree): 42 | from sklearn.pipeline import make_pipeline 43 | from sklearn.preprocessing import PolynomialFeatures 44 | 45 | return make_pipeline(PolynomialFeatures(degree=degree, include_bias=False), model) 46 | 47 | def print_parameters(linear_model, metrics): 48 | print('Intercept: %.3f' % linear_model.intercept_) 49 | print('Coefficients: \n', linear_model.coef_) 50 | print('Explained variance score: %.3f' % metrics['explained_variance']) 51 | print("Mean squared error: %.3f" % metrics['mse']) 52 | 53 | def plot_mse(model, X, y, title, error_spread): 54 | def collect_mse(): 55 | from sklearn.model_selection import train_test_split 56 | from sklearn.model_selection import cross_val_score 57 | 58 | metrics_all = [] 59 | for train_size_pct in range(10, 110, 10): 60 | X_train, X_test, y_train, y_test = \ 61 | train_test_split(X, y, shuffle=False, train_size=train_size_pct / 100) 62 | metrics_current = dict() 63 | metrics_current['percent_train'] = train_size_pct 64 | train_model(model, X_train, y_train) 65 | metrics_train = evaluate_model(model, X_train, y_train) 66 | metrics_current['Training score'] = metrics_train['mse'] 67 | 
metrics_cv = cross_val_score( 68 | model, 69 | X_train, y_train, 70 | scoring='neg_mean_squared_error', 71 | cv=10) 72 | metrics_current['CV score'] = -metrics_cv.mean() 73 | if X_test.shape[0] > 0: 74 | metrics_test = evaluate_model(model, X_test, y_test) 75 | metrics_current['Testing score'] = metrics_test['mse'] 76 | else: 77 | metrics_current['Testing score'] = np.NaN 78 | metrics_all.append(metrics_current) 79 | return pd.DataFrame.from_records(metrics_all) 80 | 81 | import matplotlib.ticker as mtick 82 | 83 | df = collect_mse() 84 | error_variance = error_spread**2 85 | ax = df.plot( 86 | x='percent_train', 87 | title=title, 88 | kind='line', 89 | xticks=range(10, 110, 10), 90 | sort_columns=True, 91 | style=['b+--', 'ro-', 'gx:'], 92 | markersize=10.0, 93 | grid=False, 94 | figsize=(8, 6), 95 | lw=2) 96 | ax.set_xlabel('Training set size', fontsize=15) 97 | ax.xaxis.set_major_formatter(mtick.PercentFormatter()) 98 | y_min, y_max = ax.get_ylim() 99 | # FIX ME: See Exercise 3! 100 | ax.set_ylim(max(0, y_min), min(2 * error_variance, y_max)) 101 | ax.set_ylabel('MSE', fontsize=15) 102 | ax.title.set_size(19) 103 | 104 | # Draw and annotate the minimum MSE. 105 | ax.axhline(error_variance, color='g', ls='--', lw=1) 106 | ax.annotate( 107 | 'Inherent error level', 108 | xy=(15, error_variance), 109 | textcoords='offset pixels', 110 | xytext=(10, 80), 111 | arrowprops=dict(facecolor='black', width=1, shrink=0.05)) 112 | 113 | def explain_sse(slope, intercept, x, y): 114 | # Configure the diagram. 115 | _, ax = plt.subplots(figsize=(7, 9)) 116 | ax.set_xlabel('x', fontsize=15) 117 | ax.set_ylabel('y', fontsize=15) 118 | ax.set_title(r'$SSE = \sum_{i=1}^n (y_i - \hat{y}_i)^2$', fontsize=19) 119 | ax.grid(False) 120 | ax.spines["top"].set_visible(False) 121 | ax.spines["right"].set_visible(False) 122 | ax.tick_params(direction='out', length=6, width=2, colors='black') 123 | 124 | # Show x-y pairs. 125 | ax.scatter(x, y, alpha=0.5, marker='x') 126 | 127 | # Draw the regression line. 128 | xlims = np.array([np.min(x), np.max(x)]) 129 | ax.plot(xlims, slope * xlims + intercept, lw=2, color='b') 130 | 131 | # Draw the error terms. 132 | for x_i, y_i in zip(x, y): 133 | ax.plot([x_i, x_i], [y_i, slope * x_i + intercept], color='r', lw=2, ls='--') -------------------------------------------------------------------------------- /ch7/core_concepts/session.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Contains functions that depict steps to reconstruct different world parameters 5 | from observations using various noise levels. 6 | 7 | @author: Ervin Varga 8 | """ 9 | import warnings 10 | warnings.simplefilter(action='ignore', category=FutureWarning) 11 | 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.linear_model import LinearRegression 15 | 16 | from data_generator import * 17 | from observer import * 18 | 19 | def set_session_seed(seed): 20 | np.random.seed(seed) # Enables perfect reproduction of published results. 
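# Every demo below starts by reseeding via set_session_seed, so each experiment can
# be reproduced on its own, independently of the order in which the demos are run.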
21 | 22 | def demo_metrics_and_mse(): 23 | set_session_seed(100) 24 | X = generate_base_features(1000)[['x_normal']] 25 | for noise_level in [0, 2, 15]: 26 | y = generate_response(X, noise_level, [-1.5, 4.1]) 27 | model = LinearRegression() 28 | train_model(model, X, y) 29 | metrics = evaluate_model(model, X, y) 30 | 31 | print('\nIteration with noise level: %d' % noise_level) 32 | print_parameters(model, metrics) 33 | 34 | # Visualize the regression line and error terms. 35 | if noise_level == 15: 36 | slope = model.coef_[0][0] 37 | intercept = model.intercept_ 38 | explain_sse(slope, intercept, X[:15].values, y[:15]) 39 | 40 | def demo_overfitting(): 41 | def visualize_overfitting(): 42 | train_model(optimal_model, X, y) 43 | train_model(complex_model, X, y) 44 | 45 | _, ax = plt.subplots(figsize=(9, 7)) 46 | ax.set_yticklabels([]) 47 | ax.set_xticklabels([]) 48 | ax.grid(False) 49 | 50 | X_test = np.linspace(0, 1.2, 100) 51 | plt.plot(X_test, np.sin(2 * np.pi * X_test), label='True function') 52 | plt.plot( 53 | X_test, 54 | optimal_model.predict(X_test[:, np.newaxis]), 55 | label='Optimal model', 56 | ls='-.') 57 | plt.plot( 58 | X_test, 59 | complex_model.predict(X_test[:, np.newaxis]), 60 | label='Complex model', 61 | ls='--', 62 | lw=2, 63 | color='red') 64 | plt.scatter(X, y, alpha=0.2, edgecolor='b', s=20, label='Training Samples') 65 | ax.fill_between(X_test, -2, 2, where=X_test > 1, hatch='/', alpha=0.05, color='black') 66 | plt.xlabel('x', fontsize=15) 67 | plt.ylabel('y', fontsize=15) 68 | plt.xlim((0, 1.2)) 69 | plt.ylim((-2, 2)) 70 | plt.legend(loc='upper left') 71 | plt.title('Visualization of How Overfitting Occurs', fontsize=19) 72 | plt.show() 73 | 74 | set_session_seed(172) 75 | X = generate_base_features(120)[['x_uniform']] 76 | y = generate_response(X, 0.1, [0, 2 * np.pi], f=np.sin) 77 | 78 | optimal_model = make_poly_pipeline(LinearRegression(), 5) 79 | plot_mse(optimal_model, X, y, 'Optimal Model', 0.1) 80 | complex_model = make_poly_pipeline(LinearRegression(), 35) 81 | plot_mse(complex_model, X, y, 'Complex Model', 0.1) 82 | 83 | visualize_overfitting() 84 | 85 | def demo_underfitting(): 86 | set_session_seed(15) 87 | X = generate_base_features(200) 88 | X_interacting = X[['x_interacting']] 89 | y = generate_response(X_interacting, 2, [1.7, -4.3]) 90 | plot_mse(LinearRegression(), X_interacting, y, 'Optimal Model', 2) 91 | X_weak = X[['x_normal', 'x_uniform']] 92 | plot_mse(LinearRegression(), X_weak, y, 'Weak Model', 2) 93 | 94 | def demo_collinearity(): 95 | set_session_seed(10) 96 | X = generate_base_features(1000) 97 | X_world = X[['x_normal', 'x_combined']] 98 | y = generate_response(X_world, 2, [1.1, -2.3, 3.1]) 99 | 100 | model = LinearRegression() 101 | # Showcase the first assumed model. 102 | train_model(model, X_world, y) 103 | metrics = evaluate_model(model, X_world, y) 104 | print('\nDumping stats for model 1') 105 | print_parameters(model, metrics) 106 | 107 | # Showcase the second assumed model. 108 | X_extended_world = X[['x_normal', 'x_combined', 'x_collinear']] 109 | train_model(model, X_extended_world, y) 110 | metrics = evaluate_model(model, X_extended_world, y) 111 | print('\nDumping stats for model 2') 112 | print_parameters(model, metrics) 113 | 114 | # Produce a scatter matrix plot. 
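    # Since x_collinear is generated as an exact multiple of x_combined, their panel
    # in the scatter matrix should show up as a straight line, making the collinearity
    # the second assumed model is exposed to visually obvious.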
115 |     df = X
116 |     df.columns = ['x' + str(i + 1) for i in range(len(df.columns))]
117 |     df['y'] = y
118 |     pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(10, 10), diagonal='kde')
119 | 
120 | def demo_residuals():
121 |     def plot_regression_line(x, y, case_num):
122 |         _, ax = plt.subplots(figsize=(9, 9))
123 |         ax.set_title('Regression Plot - Case ' + str(case_num), fontsize=19)
124 |         ax.set_xlabel('x', fontsize=15)
125 |         ax.set_ylabel('y', fontsize=15)
126 |         sns.regplot(x.squeeze(), y.squeeze(),
127 |                     ci=None,
128 |                     ax=ax,
129 |                     scatter_kws={'alpha': 0.3},
130 |                     line_kws={'color': 'green', 'lw': 3})
131 | 
132 |     set_session_seed(100)
133 |     X = generate_base_features(1000)
134 |     X1 = X[['x_normal']]
135 |     y1 = generate_response(X1, 0.04, [1.2, 0.00003])
136 |     X2 = X1**2
137 |     y2 = generate_response(X2, 0.04, [1.2, 0.00003])
138 | 
139 |     model = LinearRegression()
140 |     # Showcase the first world with a linearly assumed model.
141 |     plot_regression_line(X1, y1, 1)
142 |     train_model(model, X1, y1)
143 |     metrics = evaluate_model(model, X1, y1, True, 'Case 1')
144 |     print('\nDumping stats for case 1')
145 |     print_parameters(model, metrics)
146 | 
147 |     # Showcase the second world with a linearly assumed model.
148 |     plot_regression_line(X1, y2, 2)
149 |     train_model(model, X1, y2)
150 |     metrics = evaluate_model(model, X1, y2, True, 'Case 2')
151 |     print('\nDumping stats for case 2')
152 |     print_parameters(model, metrics)
153 | 
154 | def demo_regularization():
155 |     from sklearn.linear_model import RidgeCV
156 | 
157 |     set_session_seed(172)
158 |     X = generate_base_features(120)[['x_uniform']]
159 |     y = generate_response(X, 0.1, [0, 2 * np.pi], f=np.sin)
160 | 
161 |     regularized_model = make_poly_pipeline(
162 |         RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 5, 10, 20], gcv_mode='auto'),
163 |         35)
164 |     plot_mse(regularized_model, X, y, 'Regularized Model', 0.1)
--------------------------------------------------------------------------------
/ch7/stock_market/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch7/stock_market/.DS_Store
--------------------------------------------------------------------------------
/ch7/stock_market/data_preprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Module to preprocess financial data and prepare for further regression analysis.
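Three helpers are provided: read_daily_equity_data keeps only the timestamp, close,
and volume columns of a daily quotes CSV; compose_trends builds min-max scaled
one-year rolling means of both series; create_log_returns derives (optionally
volatility-normalized) log returns.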
5 | 6 | @author: Ervin Varga 7 | """ 8 | import numpy as np 9 | import pandas as pd 10 | 11 | def read_daily_equity_data(file): 12 | stock_data = pd.read_csv(file, usecols=[0, 4, 5], skiprows=[1]) 13 | stock_data['timestamp'] = pd.to_datetime(stock_data['timestamp']) 14 | stock_data.set_index('timestamp', inplace=True, verify_integrity=True) 15 | stock_data.sort_index(inplace=True) 16 | return stock_data 17 | 18 | def compose_trends(ts): 19 | from sklearn.preprocessing import MinMaxScaler 20 | 21 | scaler = MinMaxScaler() 22 | scaled_ts = pd.DataFrame(scaler.fit_transform(ts), columns=ts.columns, index=ts.index) 23 | return pd.concat([scaled_ts['close'].rolling('365D').mean(), 24 | scaled_ts['volume'].rolling('365D').mean()], axis=1) 25 | 26 | def create_log_returns(ts, halflife, normalize_close=True): 27 | ts['close_ret'] = np.log(ts['close']).diff() 28 | if normalize_close: 29 | ts['close_ret'] /= ts['close_ret'].ewm(halflife=halflife).std() 30 | ts['volume_ret'] = np.log(ts['volume']).diff() 31 | return ts.dropna() -------------------------------------------------------------------------------- /ch7/stock_market/data_visualization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Contains various utility visualization routines. 5 | 6 | @author: Ervin Varga 7 | """ 8 | import matplotlib.pyplot as plt 9 | 10 | def plot_time_series(ts, title_prefix, style='b-'): 11 | ax = ts.plot(figsize=(9, 8), lw=2, fontsize=12, style=style) 12 | ax.set_title('%s Over Time' % title_prefix, fontsize=19) 13 | ax.set_xlabel('Year', fontsize=15) 14 | plt.show() 15 | 16 | def hist_time_series(ts, xlabel, bins): 17 | ax = ts.hist(figsize=(9, 8), xlabelsize=12, ylabelsize=12, bins=bins, grid=False) 18 | ax.set_title('Distribution of %s' % xlabel, fontsize=19) 19 | ax.set_xlabel(xlabel, fontsize=15) 20 | plt.show() 21 | 22 | def scatter_time_series(ts, x, y): 23 | ax = ts.plot(x=x, y=y, figsize=(9, 8), kind='scatter', fontsize=12) 24 | ax.set_title('Auto-correlation Graph', fontsize=19) 25 | ax.set_xlabel(x, fontsize=15) 26 | ax.set_ylabel(y, fontsize=15) 27 | plt.show() 28 | 29 | def heat_corr_plot(corr_matrix): 30 | import numpy as np 31 | import seaborn as sns 32 | 33 | mask = np.zeros_like(corr_matrix) 34 | mask[np.triu_indices_from(mask)] = True 35 | _, ax = plt.subplots(figsize=(9, 8)) 36 | sns.heatmap(corr_matrix, annot=True, cmap='gist_gray', fmt=".2f", lw=.5, mask=mask, ax=ax) 37 | plt.tight_layout() 38 | plt.show() -------------------------------------------------------------------------------- /ch7/stock_market/driver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | The main driver file that connects all pieces together. 5 | 6 | @author: Ervin Varga 7 | """ 8 | from data_preprocessing import * 9 | from data_visualization import * 10 | 11 | # Data Acquisition stage. 12 | stock_data = read_daily_equity_data('daily_AAPL.csv') 13 | 14 | # Data Preprocessing stage. 
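# A halflife of 23 trading days (roughly one calendar month of sessions) controls how
# quickly the exponentially weighted volatility estimate used to normalize the close
# returns forgets older observations.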
15 | stock_data = create_log_returns(stock_data, 23)
16 | 
17 | plot_time_series(stock_data['close'], 'AAPL Closing Levels')
18 | plot_time_series(stock_data['close'].rolling('365D').mean(), 'AAPL Closing Trend')
19 | plot_time_series(compose_trends(stock_data), 'AAPL Closing & Volume Trends', ['b-', 'g--'])
20 | 
21 | # To produce the non-normalized price log returns plot you must call
22 | # the create_log_returns function with normalize_close=False. Try this as an
23 | # additional exercise.
24 | plot_time_series(stock_data['close_ret'], 'AAPL Volatility-Norm. Price Log Returns')
25 | plot_time_series(stock_data['volume_ret'], 'AAPL Volume Log Returns')
26 | 
27 | hist_time_series(stock_data['close_ret'], 'Daily Stock Log Returns', 50)
28 | hist_time_series(stock_data['volume_ret'], 'Daily Volume Log Returns', 50)
29 | 
30 | # Feature Engineering stage.
31 | from feature_engineering import *
32 | 
33 | report_auto_correlation(stock_data)
34 | corr_matrix = create_features(stock_data)
35 | heat_corr_plot(corr_matrix)
36 | 
37 | # Regression Implementation stage.
38 | from pyspark.sql import SparkSession
39 | 
40 | from streaming_regression import *
41 | 
42 | sparkSession = SparkSession.builder \
43 |     .master("local[4]") \
44 |     .appName("Streaming Regression Case Study")\
45 |     .getOrCreate()
46 | fit_and_predict(sparkSession, stock_data)
--------------------------------------------------------------------------------
/ch7/stock_market/feature_engineering.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Module to help perform feature engineering.
5 | 
6 | @author: Ervin Varga
7 | """
8 | from data_visualization import *
9 | 
10 | def report_auto_correlation(ts, periods=5):
11 |     for column in filter(lambda name: name.endswith('_ret'), ts.columns):
12 |         future_column = 'future_' + column
13 |         ts[future_column] = ts[column].shift(-periods).rolling(periods).sum()
14 |         current_column = 'current_' + column
15 |         ts[current_column] = ts[column].rolling(periods).sum()
16 | 
17 |         print(ts[[current_column, future_column]].corr())
18 |         scatter_time_series(ts, current_column, future_column)
19 | 
20 | def create_features(ts):
21 |     from talib import SMA, RSI, OBV
22 | 
23 |     target = 'future_close_ret'
24 |     features = ['current_close_ret', 'current_volume_ret']
25 | 
26 |     for n in [14, 25, 50, 100]:
27 |         ts['sma_' + str(n)] = SMA(ts['close'].values, timeperiod=n) / ts['close']
28 |         ts['rsi_' + str(n)] = RSI(ts['close'].values, timeperiod=n)
29 |     ts['obv'] = OBV(ts['close'].values, ts['volume'].values.astype('float64'))  # OBV does not depend on n
30 | 
31 |     ts.drop(['close', 'volume', 'close_ret', 'volume_ret', 'future_volume_ret'],
32 |             axis='columns',
33 |             inplace=True)
34 |     ts.dropna(inplace=True)
35 |     return ts.corr()
--------------------------------------------------------------------------------
/ch7/stock_market/streaming_regression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Fits a linear model using streaming regression.
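The training and test splits are fed to Spark as queued streams, so the model is
updated and evaluated incrementally, mimicking data that arrives over time.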
5 | 
6 | @author: Ervin Varga
7 | """
8 | def fit_and_predict(sparkSession, ts):
9 |     import numpy as np
10 |     from sklearn.model_selection import train_test_split
11 |     from pyspark.streaming import StreamingContext
12 |     from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
13 | 
14 |     def to_scaled_rdd(pandasDataFrame):
15 |         import pandas as pd
16 |         from sklearn.preprocessing import RobustScaler
17 |         from pyspark.mllib.regression import LabeledPoint
18 | 
19 |         regressors = pandasDataFrame.columns[1:]
20 |         num_regressors = len(regressors)
21 |         # FIX ME: As a bonus exercise, read the last paragraph from the section about residual
22 |         # plots and make the necessary bug fix! Compare the behavior of this version with the
23 |         # fixed one and see whether you can decipher anything from the outputs.
24 |         scaler = RobustScaler()
25 |         scaled_regressors = scaler.fit_transform(pandasDataFrame[regressors])
26 |         scaled_pandasDataFrame = pd.DataFrame(scaled_regressors, columns=regressors)
27 |         scaled_pandasDataFrame['target'] = pandasDataFrame[pandasDataFrame.columns[0]].values  # by convention, the first column holds the target
28 | 
29 |         sparkDataFrame = sparkSession.createDataFrame(scaled_pandasDataFrame)
30 |         return sparkDataFrame.rdd.map(
31 |             lambda row: LabeledPoint(row[num_regressors], row[:num_regressors]))
32 | 
33 |     def report_accuracy(result_rdd):
34 |         from pyspark.mllib.evaluation import RegressionMetrics
35 | 
36 |         if not result_rdd.isEmpty():
37 |             metrics = RegressionMetrics(
38 |                 result_rdd.map(lambda t: (float(t[1]), float(t[0]))))
39 |             print("MSE = %s" % metrics.meanSquaredError)
40 |             print("RMSE = %s" % metrics.rootMeanSquaredError)
41 |             print("R-squared = %s" % metrics.r2)
42 |             print("MAE = %s" % metrics.meanAbsoluteError)
43 |             print("Explained variance = %s" % metrics.explainedVariance)
44 | 
45 |     df_train, df_test = train_test_split(ts, test_size=0.2, shuffle=False)
46 |     train_rdd = to_scaled_rdd(df_train)
47 |     test_rdd = to_scaled_rdd(df_test)
48 | 
49 |     streamContext = StreamingContext(sparkSession.sparkContext, 1)
50 |     train_stream = streamContext.queueStream([train_rdd])
51 |     test_stream = streamContext.queueStream([test_rdd])
52 | 
53 |     numFeatures = len(ts.columns) - 1
54 |     model = StreamingLinearRegressionWithSGD(stepSize=0.05, numIterations=300)
55 |     np.random.seed(0)
56 |     model.setInitialWeights(np.random.rand(numFeatures))
57 | 
58 |     model.trainOn(train_stream)
59 |     result_stream = model.predictOnValues(test_stream.map(lambda lp: (lp.label, lp.features)))
60 |     result_stream.cache()
61 |     result_stream.foreachRDD(report_accuracy)
62 | 
63 |     streamContext.start()
64 |     streamContext.awaitTermination()
--------------------------------------------------------------------------------
/ch8/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch8/.DS_Store
--------------------------------------------------------------------------------
/ch8/lkpy_demo.py:
--------------------------------------------------------------------------------
1 | from itertools import tee
2 | 
3 | import pandas as pd
4 | 
5 | from lenskit import batch
6 | from lenskit import crossfold as xf
7 | from lenskit.algorithms import funksvd, item_knn, user_knn
8 | from lenskit.metrics import topn
9 | 
10 | ratings = pd.read_csv('data/ratings.csv')
11 | ratings.rename({'userId': 'user', 'movieId': 'item'}, axis = 'columns', inplace = True)
12 | print(ratings.head())
13 | 
14 | xf_dataset_batch, xf_dataset_test = 
tee(xf.partition_users(ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)))
15 | truth = pd.concat([test for _, test in xf_dataset_test], ignore_index = True)
16 | 
17 | runner = batch.MultiEval('result', False, nprocs = 4)
18 | runner.add_algorithms(
19 |     [item_knn.ItemItem(10), item_knn.ItemItem(20), item_knn.ItemItem(30)],
20 |     False,
21 |     ['nnbrs']
22 | )
23 | runner.add_algorithms(
24 |     [user_knn.UserUser(10), user_knn.UserUser(20), user_knn.UserUser(30)],
25 |     True,
26 |     ['nnbrs']
27 | )
28 | runner.add_algorithms(
29 |     [funksvd.FunkSVD(40, damping = 0), funksvd.FunkSVD(50, damping = 5), funksvd.FunkSVD(60, damping = 10)],
30 |     False,
31 |     ['features', 'damping']
32 | )
33 | runner.add_datasets(xf_dataset_batch)
34 | runner.run()
35 | 
36 | runs = pd.read_parquet('result/runs.parquet',
37 |                        columns = ('AlgoClass','RunId','damping','features','nnbrs'))
38 | runs.rename({'AlgoClass': 'Algorithm'}, axis = 'columns', inplace = True)
39 | 
40 | def extract_config(x):
41 |     from math import isnan
42 | 
43 |     damping, features, nnbrs = x
44 |     result = ''
45 |     if not isnan(damping):
46 |         result = "damping=%.2f " % damping
47 |     if not isnan(features):
48 |         result += "features=%.2f " % features
49 |     if not isnan(nnbrs):
50 |         result += "nnbrs=%.2f" % nnbrs
51 |     return result.strip()
52 | 
53 | runs['Configuration'] = runs[['damping','features','nnbrs']].apply(extract_config, axis = 1)
54 | runs.drop(columns = ['damping','features','nnbrs'], inplace = True)
55 | 
56 | recs = pd.read_parquet('result/recommendations.parquet')
57 | recs = recs.merge(runs, on = 'RunId')
58 | recs.drop(columns = ['RunId'], inplace = True)
59 | print(recs.head(10))
60 | 
61 | user_dcg = recs.groupby(['Algorithm', 'Configuration', 'user']).rating.apply(topn.dcg)
62 | user_dcg = user_dcg.reset_index(name='DCG')
63 | ideal_dcg = topn.compute_ideal_dcgs(truth)
64 | user_ndcg = pd.merge(user_dcg, ideal_dcg)
65 | user_ndcg['nDCG'] = user_ndcg.DCG / user_ndcg.ideal_dcg
66 | user_ndcg = user_ndcg.groupby(['Algorithm', 'Configuration']).nDCG.mean()
67 | 
68 | # %matplotlib inline  (IPython magic: re-enable inside a Jupyter notebook; as a bare line it is a syntax error in plain Python)
69 | user_ndcg.plot.bar()
--------------------------------------------------------------------------------
/ch8/simple_recommender/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/practical-data-science-with-python-3/8665bd4cc3cb9da838bdb1b3d2a76fcb54de6d0e/ch8/simple_recommender/.DS_Store
--------------------------------------------------------------------------------
/ch8/simple_recommender/omdb_service.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | class OMDbService:
4 |     API_URL = 'http://www.omdbapi.com/'
5 | 
6 |     def __init__(self, api_key):
7 |         self._api_key = api_key
8 | 
9 |     def retrieve_info(self, title):
10 |         """Returns information about the movie title in JSON format."""
11 |         params = {'apikey': self._api_key, 't': title, 'type': 'movie', 'r': 'json'}
12 |         return requests.get(OMDbService.API_URL, params).json()
--------------------------------------------------------------------------------
/ch8/simple_recommender/simple_movie_recommender.py:
--------------------------------------------------------------------------------
1 | from tastedive_service import TasteDiveService
2 | from omdb_service import OMDbService
3 | 
4 | class SimpleMovieRecommender:
5 |     PRIMARY_SOURCE = 'Internet Movie Database'
6 | 
7 |     def __init__(self, omdb_api_key):
8 |         self._omdb = OMDbService(omdb_api_key)
9 |         self._td = 
TasteDiveService()
10 | 
11 |     @staticmethod
12 |     def _retrieve_rating(omdb_response):
13 |         for rating in omdb_response['Ratings']:
14 |             if rating['Source'] == SimpleMovieRecommender.PRIMARY_SOURCE:
15 |                 return float(rating['Value'].split('/')[0])
16 |         return float(omdb_response['imdbRating'])
17 | 
18 |     def recommendations(self, titles, limit = 5):
19 |         """
20 |         Return a list of recommended movie titles up to the specified limit.
21 |         The items are ordered according to their ratings (from top to bottom).
22 |         """
23 |         similar_titles = self._td.similar_titles(titles, limit)
24 |         ratings = map(lambda title: SimpleMovieRecommender._retrieve_rating(self._omdb.retrieve_info(title)),
25 |                       similar_titles)
26 |         return list(map(lambda item: item[1], sorted(zip(ratings, similar_titles), reverse = True)))
--------------------------------------------------------------------------------
/ch8/simple_recommender/tastedive_service.py:
--------------------------------------------------------------------------------
1 | import requests
2 | 
3 | class TasteDiveService:
4 |     SUPPORTED_ARTIFACTS = ['music', 'movies', 'shows', 'podcasts', 'books', 'authors', 'games']
5 |     API_URL = 'https://tastedive.com/api/similar'
6 | 
7 |     def __init__(self, artifact_type = 'movies'):
8 |         assert artifact_type in TasteDiveService.SUPPORTED_ARTIFACTS, 'Invalid artifact type'
9 | 
10 |         self._artifact_type = artifact_type
11 | 
12 |     def _retrieve_artifacts(self, name, limit):
13 |         params = {'q': name, 'type': self._artifact_type, 'limit': limit}
14 |         return requests.get(TasteDiveService.API_URL, params).json()
15 | 
16 |     @staticmethod
17 |     def _extract_titles(response):
18 |         artifacts = response['Similar']['Results']
19 |         return [artifact['Name'] for artifact in artifacts]
20 | 
21 |     def similar_titles(self, titles, limit = 5):
22 |         """
23 |         Returns a set of similar titles up to the defined limit. Each instance of
24 |         this class is supposed to work only with one artifact type. This type is specified
25 |         during object construction.
26 |         """
27 |         assert 0 < limit <= 50, 'Limit must be in range (0, 50].'
28 | 
29 |         return {similar_title
30 |                 for title in titles
31 |                 for similar_title in TasteDiveService._extract_titles(self._retrieve_artifacts(title, limit))}
32 | 
--------------------------------------------------------------------------------
/errata.md:
--------------------------------------------------------------------------------
1 | # Errata for *Practical Data Science with Python 3*
2 | 
3 | On **page xx** [Summary of error]:
4 | 
5 | Details of error here. Highlight key pieces in **bold**.
6 | 
7 | ***
8 | 
9 | On **page xx** [Summary of error]:
10 | 
11 | Details of error here. Highlight key pieces in **bold**.
12 | 
13 | ***
--------------------------------------------------------------------------------