├── .github
├── ISSUE_TEMPLATE
│ ├── bug-report.md
│ └── troubleshoot-installation-issues.md
└── workflows
│ ├── env_checks.yml
│ └── stale.yml
├── .gitignore
├── LICENSE
├── README.md
├── _img
├── cover.PNG
└── pandas_drawing.PNG
├── acknowledgements.md
├── appendix
├── README.md
├── choosing_the_appropriate_plot_flow_chart.png
├── data_analysis_workflow.png
└── ml_workflow.png
├── apt.txt
├── ch_01
├── check_environment.py
├── checking_your_setup.ipynb
├── exercises.ipynb
├── introduction_to_data_analysis.ipynb
└── stats_viz.py
├── ch_02
├── 1-pandas_data_structures.ipynb
├── 2-creating_dataframes.ipynb
├── 3-making_dataframes_from_api_requests.ipynb
├── 4-inspecting_dataframes.ipynb
├── 5-selection.ipynb
├── 6-adding_and_removing_data.ipynb
└── data
│ ├── earthquakes.csv
│ ├── example_data.csv
│ ├── parsed.csv
│ ├── quakes.db
│ └── tsunamis.csv
├── ch_03
├── 1-wide_vs_long.ipynb
├── 2-using_the_weather_api.ipynb
├── 3-cleaning_data.ipynb
├── 4-reshaping_data.ipynb
├── 5-handling_data_issues.ipynb
├── data
│ ├── bitcoin.csv
│ ├── dirty_data.csv
│ ├── long_data.csv
│ ├── nyc_temperatures.csv
│ ├── sp500.csv
│ └── wide_data.csv
└── exercises
│ ├── aapl.csv
│ ├── amzn.csv
│ ├── fb.csv
│ ├── goog.csv
│ └── nflx.csv
├── ch_04
├── 0-weather_data_collection.ipynb
├── 1-querying_and_merging.ipynb
├── 2-dataframe_operations.ipynb
├── 3-aggregations.ipynb
├── 4-time_series.ipynb
├── data
│ ├── dirty_data.csv
│ ├── fb_2018.csv
│ ├── fb_week_of_may_20_per_minute.csv
│ ├── melted_stock_data.csv
│ ├── nyc_weather_2018.csv
│ ├── stocks.db
│ ├── weather.db
│ ├── weather_by_station.csv
│ └── weather_stations.csv
├── exercises
│ ├── earthquakes.csv
│ └── faang.csv
├── understanding_window_calculations.ipynb
└── window_calc.py
├── ch_05
├── 1-introducing_matplotlib.ipynb
├── 2-plotting_with_pandas.ipynb
├── 3-pandas_plotting_subpackage.ipynb
└── data
│ ├── earthquakes.csv
│ └── fb_stock_prices_2018.csv
├── ch_06
├── 1-introduction_to_seaborn.ipynb
├── 2-formatting_plots.ipynb
├── 3-customizing_visualizations.ipynb
├── color_utils.py
├── data
│ ├── earthquakes.csv
│ └── fb_stock_prices_2018.csv
├── reg_resid_plot.py
└── std_from_mean_kde.py
├── ch_07
├── data
│ ├── amazon.csv
│ ├── apple.csv
│ ├── bitcoin.csv
│ ├── facebook.csv
│ ├── google.csv
│ ├── netflix.csv
│ ├── netflix_january_2019.csv
│ └── sp500.csv
├── financial_analysis.ipynb
└── random_walk.py
├── ch_08
├── anomaly_detection.ipynb
├── logs
│ ├── attacks.csv
│ └── log.csv
├── simulate.py
└── user_data
│ ├── user_base.txt
│ └── user_ips.json
├── ch_09
├── data
│ ├── binaries.csv
│ ├── planets.csv
│ ├── sample_roc_curves.csv
│ ├── stars.csv
│ ├── systems.csv
│ ├── winequality-red.csv
│ └── winequality-white.csv
├── planet_data_collection.ipynb
├── planets_ml.ipynb
├── preprocessing.ipynb
├── red_wine.ipynb
└── wine.ipynb
├── ch_10
├── data
│ ├── planets.csv
│ ├── stars.csv
│ ├── winequality-red.csv
│ └── winequality-white.csv
├── planets_ml.ipynb
├── red_wine.ipynb
└── wine.ipynb
├── ch_11
├── 0-simulating_the_data.ipynb
├── 1-EDA_unlabeled_data.ipynb
├── 2-unsupervised_anomaly_detection.ipynb
├── 3-EDA_labeled_data.ipynb
├── 4-supervised_anomaly_detection.ipynb
├── 5-online_learning.ipynb
├── logs
│ ├── hackers_2018.csv
│ ├── hackers_2019.csv
│ ├── logs.db
│ ├── logs_2018.csv
│ └── logs_2019.csv
├── merge_logs.py
├── run_simulations.sh
├── simulate.py
└── user_data
│ ├── user_base.txt
│ └── user_ips.json
├── ch_12
└── README.md
├── environment.yml
├── requirements.txt
├── runtime.txt
└── solutions
├── ch_01
└── solutions.ipynb
├── ch_02
└── solutions.ipynb
├── ch_03
├── faang.csv
└── solutions.ipynb
├── ch_04
└── solutions.ipynb
├── ch_05
└── solutions.ipynb
├── ch_06
└── solutions.ipynb
├── ch_07
└── solutions.ipynb
├── ch_08
├── dec_2018_attacks.csv
├── dec_2018_log.csv
└── solutions.ipynb
├── ch_09
├── exercise_1.ipynb
├── exercise_2.ipynb
├── exercise_3.ipynb
├── exercise_4.ipynb
└── exercise_5.ipynb
├── ch_10
├── exercise_1.ipynb
├── exercise_2.ipynb
├── exercise_3.ipynb
├── exercise_4.ipynb
└── exercise_5.ipynb
└── ch_11
├── exercise_1.ipynb
├── exercise_2.ipynb
├── exercise_3.ipynb
└── exercise_4.ipynb
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Something isn't working as expected.
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | ### Required attestation
11 | - [ ] I am using a virtual environment that matches the book specifications **exactly**.
12 | - [ ] I confirm that my fork is up to date with the latest changes in this repository.
13 | - [ ] I have checked that this issue has not already been reported or resolved [here](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/issues?q=is%3Aissue).
14 | - [ ] I have confirmed that my results match those obtained when running the code using [this](https://mybinder.org/v2/gh/stefmolin/binder-environments/1st_edition?urlpath=git-pull?repo=https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas) Binder environment.
15 |
16 | ---
17 |
18 | ### Background information
19 | ##### 1. Which OS are you using?
20 | TODO: Provide your OS here – make sure to differentiate between Intel and M1 chip Macs.
21 |
22 | ##### 2. Which Python version are you using?
23 | TODO: Provide your Python version here.
24 |
25 | ##### 3. Are you using `conda` or `venv`?
26 | TODO: Indicate whether you are using `conda` or `venv`.
27 |
28 | ##### 4. Package versions
29 |
30 | versions installed
31 |
32 | ```
33 | TODO: Paste the result of running `pip freeze` or `conda list`
34 | ```
35 |
36 |
37 |
38 | ##### 5. Run the `ch_01/checking_your_setup.ipynb` notebook
39 | Screenshot after running the `ch_01/checking_your_setup.ipynb` notebook:
40 |
41 | TODO: paste your screenshot here
42 |
43 | ---
44 |
45 | ### Commands run and their outputs
46 | Please provide **all** of the commands you ran as well as the traceback:
47 |
48 |
49 |
50 | ```
51 | TODO: paste commands and any traceback here
52 | ```
53 |
54 |
55 |
56 |
57 | ### Screenshots
58 | Optionally, include any screenshots that will help diagnose the issue.
59 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/troubleshoot-installation-issues.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Troubleshoot installation issues
3 | about: Help troubleshooting issues encountered when setting up the virtual environment.
4 | title: ''
5 | labels: troubleshooting
6 | assignees: ''
7 |
8 | ---
9 |
10 | ### Required attestation
11 | - [ ] I have **completely and exactly** followed the virtual environment setup instructions from the book.
12 | - [ ] I have cloned either this repository or my fork of this repository so that I have all necessary files on my local machine.
13 | - [ ] I have checked that this issue has not already been reported or resolved [here](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/issues?q=is%3Aissue).
14 | - [ ] I am aware that there is a [pre-built Binder environment](https://mybinder.org/v2/gh/stefmolin/binder-environments/1st_edition?urlpath=git-pull?repo=https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas) that I can use, but I want to install locally on my machine instead.
15 |
16 | ---
17 |
18 | ### Background information
19 | ##### 1. Which OS are you using?
20 | TODO: Provide your OS here – make sure to differentiate between Intel and M1 chip Macs.
21 |
22 | ##### 2. Which Python version are you using?
23 | TODO: Provide your Python version here.
24 |
25 | ##### 3. Are you using `conda` or `venv`?
26 | TODO: Indicate whether you are using `conda` or `venv`.
27 |
28 | ---
29 |
30 | ### Commands run and their outputs
31 | Please provide **all** of the commands you ran as well as the traceback:
32 |
33 |
34 |
35 | ```
36 | TODO: paste commands and any traceback here
37 | ```
38 |
39 |
40 |
41 |
42 | ### Screenshots
43 | Optionally, include any screenshots that will help diagnose the issue.
44 |
--------------------------------------------------------------------------------
/.github/workflows/env_checks.yml:
--------------------------------------------------------------------------------
1 | # This workflow builds the book environment on Mac, Linux, and Windows for
2 | # multiple versions of Python to confirm it can be properly installed.
3 | #
4 | # Author: Stefanie Molin
5 |
6 | name: Env Build
7 |
8 | # Controls when the workflow will run
9 | on:
10 | # Triggers the workflow on push events
11 | push:
12 | branches: [ "master" ]
13 |
14 | # Trigger on pull request always (note the trailing colon)
15 | pull_request:
16 |
17 | # Allows you to run this workflow manually from the Actions tab
18 | workflow_dispatch:
19 |
20 | # Run this every month
21 | schedule:
22 | - cron: "44 22 11 * *"
23 |
24 | concurrency:
25 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
26 | cancel-in-progress: true
27 |
28 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
29 | jobs:
30 | # This workflow contains a single job called "build"
31 | build:
32 | name: Python ${{ matrix.python-version }}, ${{ matrix.os }}
33 |
34 | # The type of runner that the job will run on
35 | runs-on: ${{ matrix.os }}
36 |
37 | defaults:
38 | run:
39 | shell: bash -el {0}
40 |
41 | strategy:
42 | fail-fast: false
43 | matrix:
44 | os: [macos-13, ubuntu-latest, windows-latest]
45 | python-version: ["3.6", "3.7"]
46 |
47 | # Steps represent a sequence of tasks that will be executed as part of the job
48 | steps:
49 | # checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
50 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
51 |
52 | # remove the Python version from the file for testing
53 | - name: strip hardcoded Python version from environment for testing
54 | run: |
55 | if [[ ${{ matrix.os }} == "macos"* ]]; then
56 | sed -i '' -e '/- python[>=]/d' environment.yml;
57 | else
58 | sed -i -e '/- python[>=]/d' environment.yml;
59 | fi;
60 |
61 | # create the conda env
62 | - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4
63 | with:
64 | python-version: ${{ matrix.python-version }}
65 | auto-update-conda: true
66 | channels: conda-forge
67 | channel-priority: true
68 | activate-environment: book_env
69 | environment-file: environment.yml
70 |
71 | - name: conda diagnostics
72 | run: |
73 | conda info
74 | conda list
75 | conda config --show-sources
76 | conda config --show
77 | printenv | sort
78 |
79 | - name: verify install
80 | run: cd ch_01 && python check_environment.py
81 |
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
2 | #
3 | # You can adjust the behavior by modifying this file.
4 | # For more information, see:
5 | # https://github.com/actions/stale
6 | name: Mark stale issues and pull requests
7 |
8 | on:
9 | schedule:
10 | - cron: '15 9 * * *'
11 |
12 | jobs:
13 | stale:
14 |
15 | runs-on: ubuntu-latest
16 | permissions:
17 | issues: write
18 | pull-requests: write
19 |
20 | steps:
21 | - uses: actions/stale@v3
22 | with:
23 | repo-token: ${{ secrets.GITHUB_TOKEN }}
24 | days-before-stale: 30
25 | days-before-close: 7
26 | stale-issue-message: 'This issue has been marked as stale due to lack of recent activity. It will be closed if no further activity occurs.'
27 | stale-pr-message: ''
28 | stale-issue-label: 'stale'
29 | stale-pr-label: 'stale'
30 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | __pycache__/
3 | *images/
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018, 2019 Stefanie Molin
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hands-On Data Analysis with Pandas
2 | [Launch on Binder](https://mybinder.org/v2/gh/stefmolin/Hands-On-Data-Analysis-with-Pandas/master?urlpath=lab) [Open in Colab](https://colab.research.google.com/github/stefmolin/Hands-On-Data-Analysis-with-Pandas/blob/master) [View on nbviewer](https://nbviewer.jupyter.org/github/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/) [Purchase on Amazon](https://www.amazon.com/Hands-Data-Analysis-Pandas-visualization/dp/1789615321)
3 |
4 |
5 |
6 | This is the code repository for my book [Hands-On Data Analysis with Pandas](https://www.packtpub.com/big-data-and-business-intelligence/hands-data-analysis-pandas), published by Packt on July 26, 2019.
7 |
8 | *The [1st_edition tag](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/1st_edition) contains all materials as they were at time of publishing the first edition.*
9 |
10 | ---
11 |
12 | **IMPORTANT NOTE** (April 29, 2021):
13 |
14 | This is the code repository for the **first edition**. For the **second edition**, use [this repository](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition) instead.
15 |
16 | ---
17 |
18 | ## Book Description
19 | Data analysis has become an essential skill in a variety of domains where knowing how to work with data and extract insights can generate significant value.
20 |
21 | *Hands-On Data Analysis with Pandas* will show you how to analyze your data, get started with machine learning, and work effectively with Python libraries often used for data science, such as pandas, NumPy, matplotlib, seaborn, and scikit-learn. Using real-world datasets, you will learn how to use the powerful pandas library to perform data wrangling to reshape, clean, and aggregate your data. Then, you will learn how to conduct exploratory data analysis by calculating summary statistics and visualizing the data to find patterns. In the concluding chapters, you will explore some applications of anomaly detection, regression, clustering, and classification, using scikit-learn, to make predictions based on past data.
22 |
23 | By the end of this book, you will be equipped with the skills you need to use pandas to ensure the veracity of your data, visualize it for effective decision-making, and reliably reproduce analysis across multiple domains.
24 |
25 | ## What You Will Learn
26 | *Prerequisite: Basic knowledge of Python or past experience with another language (R, SAS, MATLAB, etc.).*
27 | - Understand how data analysts and scientists gather and analyze data
28 | - Perform data analysis and data wrangling in Python
29 | - Combine, group, and aggregate data from multiple sources
30 | - Create data visualizations with `pandas`, `matplotlib`, and `seaborn`
31 | - Apply machine learning algorithms with `sklearn` to identify patterns and make predictions
32 | - Use Python data science libraries to analyze real-world datasets
33 | - Use `pandas` to solve several common data representation and analysis problems
34 | - Collect data from APIs
35 | - Build Python scripts, modules, and packages for reusable analysis code
36 | - Utilize computer science concepts and algorithms to write more efficient code for data analysis
37 | - Write and run simulations
38 |
39 | ## Table of Contents
40 | - [Chapter 1, *Introduction to Data Analysis*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_01), will teach you the fundamentals of data analysis, give you a foundation in statistics, and get your environment set up for working with data in Python and using Jupyter Notebooks.
41 |
42 | - [Chapter 2, *Working with Pandas DataFrames*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_02), introduces you to the `pandas` library and shows you the basics of working with `DataFrames`.
43 |
44 | - [Chapter 3, *Data Wrangling with Pandas*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_03), discusses the process of data manipulation, shows you how to explore an API to gather data, and guides you through data cleaning and reshaping with pandas.
45 |
46 | - [Chapter 4, *Aggregating Pandas DataFrames*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_04), teaches you how to query and merge DataFrames, perform complex operations on them, including rolling calculations and aggregations, and how to work effectively with time series data.
47 |
48 | - [Chapter 5, *Visualizing Data with Pandas and Matplotlib*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_05), shows you how to create your own data visualizations in Python, first using the `matplotlib` library, and then directly from `pandas` objects.
49 |
50 | - [Chapter 6, *Plotting with Seaborn and Customization Techniques*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_06), continues the discussion on data visualization by teaching you how to use the `seaborn` library for visualizing your long form data and giving you the tools you need to customize your visualizations, making them presentation-ready.
51 |
52 | - [Chapter 7, *Financial Analysis: Bitcoin and the Stock Market*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_07), walks you through the creation of a [Python package for analyzing stocks](https://github.com/stefmolin/stock-analysis), building upon everything learned in chapters 1-6 and applying it to a financial application.
53 |
54 | - [Chapter 8, *Rule-Based Anomaly Detection*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_08), covers [simulating data](https://github.com/stefmolin/login-attempt-simulator) and applying everything learned in chapters 1-6 to catching hackers attempting to authenticate to a website, using rule-based strategies for anomaly detection.
55 |
56 | - [Chapter 9, *Getting Started with Machine Learning in Python*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_09), introduces you to machine learning and building models using the `sklearn` library.
57 |
58 | - [Chapter 10, *Making Better Predictions: Optimizing Models*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_10), shows you strategies for improving the performance of your machine learning models.
59 |
60 | - [Chapter 11, *Machine Learning Anomaly Detection*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_11), revisits anomaly detection on login attempt data, using machine learning techniques, all while giving you a taste of how the workflow looks in practice.
61 |
62 | - [Chapter 12, *The Road Ahead*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_12), contains resources for taking your skills to the next level and further avenues for exploration.
63 |
64 | ## Notes on Environment Setup
65 | [Env Build status](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/actions/workflows/env_checks.yml)
66 |
67 | Environment setup instructions are in chapter 1 of the text. If you don't have the book, you must install Python 3.6 or 3.7, [set up a virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#creating-a-virtual-environment), [activate it](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment), and then [install the packages listed in requirements.txt](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#using-requirements-files). You can then launch JupyterLab and use the `ch_01/checking_your_setup.ipynb` Jupyter notebook to check your setup. Consult [this resource](https://anbasile.github.io/programming/2017/06/25/jupyter-venv/) if you have issues with using your virtual environment in Jupyter.
68 |
69 | ## Solutions
70 | Each chapter comes with exercises. The solutions for chapters 1-11 can be found [here](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/solutions).
71 |
72 | ## About the Author
73 | Stefanie Molin ([@stefmolin](https://github.com/stefmolin)) is a software engineer and data scientist at Bloomberg in New York City, where she tackles tough problems in information security, particularly those revolving around data wrangling/visualization, building tools for gathering data, and knowledge sharing. She holds a Bachelor of Science degree in operations research from Columbia University's Fu Foundation School of Engineering and Applied Science with minors in Economics and Entrepreneurship and Innovation, as well as a master’s degree in computer science, with a specialization in machine learning, from Georgia Tech. In her free time, she enjoys traveling the world, inventing new recipes, and learning new languages spoken both among people and computers.
74 |
75 | ## Acknowledgements
76 | Since the book limited the acknowledgements to 450 characters, the full version is [here](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/blob/master/acknowledgements.md).
77 |
--------------------------------------------------------------------------------
/_img/cover.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/_img/cover.PNG
--------------------------------------------------------------------------------
/_img/pandas_drawing.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/_img/pandas_drawing.PNG
--------------------------------------------------------------------------------
/acknowledgements.md:
--------------------------------------------------------------------------------
1 | # Acknowledgements
2 |
3 | Writing this book has been a tremendous amount of work, but I have grown a lot through the experience: as a writer, as a technologist, and as a person. This wouldn't have been possible without the help I received along the way. It was truly touching to see that friends, family, and colleagues were so willing to help. I'm very grateful to you all.
4 |
5 | To my family. **Mom**: for always being there when I needed to vent after working multiple 14 hour days straight writing this book and making sure I had something to eat...and not getting too upset when it took me months to find the time to look into the Japan itinerary. Y ahora, por fin, puedo tomar esa clase de Zumba contigo. **Ryan**: for the sports resources in chapter 12 and always telling it like it is. **Dad**: for sharing a countdown with me.
6 |
7 | To my friend and reviewer, **Aliki Mavromoustaki**: It's rare to meet someone over video conference and instantly form a friendship, but I am very happy our paths crossed. Even though I told you this was an offer you could refuse, you took on the extra work to review the first drafts, ευχαριστώ. We finally got to work on something together!
8 |
9 | To **Felipe Moreno**: for being an extremely supportive manager and my toughest critic. You managed to fit more writing into the margins of each page than I would have thought possible. While this led to rewriting large portions of the earlier chapters for the final drafts, I know it made a huge difference in the quality of the final outcome. Obrigada!
10 |
11 | To **Suphannee Sivakorn**: for volunteering to review the drafts. I know you thought I wouldn't make it past chapter 5, but you didn't really know me then...ขอบคุณค่ะ (ka ka ka 🕊️) for all the drawings (and occasional poem). They made me smile when I really needed it and kept me sane. Now, I have a poem for you:
12 |
13 |
14 |
15 | Roses are red,
16 | violets are blue;
17 | I'm done with this book,
18 | and so are you.
19 |
20 |
21 |
22 | Get ready to be a famous artist:
23 |
24 |
25 |
26 |
27 |
28 | To my colleagues. **Lucy Hao** and **Javon Thompson**: for reviewing the financial analysis code and chapter 7 despite never having met me. Lucy, I am very glad I got the chance to meet you when I gave my lightning talk at the Princeton office. You reassured me that the content of this book is relevant and useful to its audience, and it sounded like you got just as much out of the experience of reviewing that chapter as I did. **Alexander Comerford**: for additional resources in chapter 12 and knitting me desk gloves (paws?) for work. You kept my fingers from freezing, so I could write this book. I'm still waiting for my blanket though...
29 |
30 | Finally, thank you to everyone else who helped along the way. Whether it was support, kind words, or being a sounding board, I truly appreciate it, and it made a difference.
31 |
--------------------------------------------------------------------------------
/appendix/README.md:
--------------------------------------------------------------------------------
1 | # Appendix
2 | Here are some workflow diagrams for reference.
3 |
4 | ## Data Analysis Workflow
5 | ![Data analysis workflow](data_analysis_workflow.png)
6 |
7 | ## Choosing the Appropriate Plot
8 | ![Flow chart for choosing the appropriate plot](choosing_the_appropriate_plot_flow_chart.png)
9 |
10 | ## Machine Learning Workflow
11 | ![Machine learning workflow](ml_workflow.png)
12 |
--------------------------------------------------------------------------------
/appendix/choosing_the_appropriate_plot_flow_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/appendix/choosing_the_appropriate_plot_flow_chart.png
--------------------------------------------------------------------------------
/appendix/data_analysis_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/appendix/data_analysis_workflow.png
--------------------------------------------------------------------------------
/appendix/ml_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/appendix/ml_workflow.png
--------------------------------------------------------------------------------
/apt.txt:
--------------------------------------------------------------------------------
1 | graphviz
2 |
--------------------------------------------------------------------------------
/ch_01/check_environment.py:
--------------------------------------------------------------------------------
1 | """Check environment for following along with the text."""
2 |
3 | from distutils.version import LooseVersion as Version
4 | import importlib
5 | import sys
6 | import re
7 |
8 |
# Status tags printed in front of each check. The escape sequences are ANSI
# SGR codes: 42 = green background, 41 = red background, 0 = reset.
OK = '\x1b[42m[ OK ]\x1b[0m'
FAIL = '\x1b[41m[FAIL]\x1b[0m'

# Captures the text between the last "/" and ".git" in a requirement line,
# which is used as the package name for requirements given as git URLs.
github_package_pattern = re.compile(r'(?:\/)([\w*\-*]*)(?:\.git)')

# Requirement names whose importable module name differs from the name
# they are listed under in the requirements file.
_IMPORT_NAMES = {'imbalanced-learn': 'imblearn', 'scikit-learn': 'sklearn'}


def _parse_requirement(line):
    """
    Turn one line of the requirements file into a module name and version constraint.

    Only the forms ``name``, ``name==X``, ``name>=X`` and ``name>=X,<=Y``
    are understood, plus git URLs ending in ``<name>.git`` and local
    paths starting with ``./``.

    Parameters
    ----------
    line : str
        A single line from the requirements file.

    Returns
    -------
    tuple or None
        ``(module_name, version)`` where ``version`` is ``None`` (nothing
        to compare), a string (exact version required), or a two-item list
        ``[minimum, maximum]`` whose maximum is ``None`` when the line only
        gives a lower bound. ``None`` is returned for blank lines and for
        ``#`` comment lines, which do not describe a package.
    """
    line = line.strip()
    if not line or line.startswith('#'):
        return None

    github_package = github_package_pattern.search(line)
    if github_package:
        # packages installed straight from a git URL carry no version to compare
        return github_package.group(1).replace('-', '_'), None

    if line.startswith('./'):
        line = line.replace('./', '')

    if '>=' in line:
        pkg, _, bounds = line.partition('>=')
        min_version, _, max_version = bounds.partition(',<=')
        # the upper bound is optional; an absent one is stored as None
        version = [min_version.strip(), max_version.strip() or None]
    elif '==' in line:
        pkg, _, version = line.partition('==')
        version = version.strip()
    else:
        pkg, version = line, None

    pkg = pkg.strip()
    pkg = _IMPORT_NAMES.get(pkg, pkg)
    return pkg.replace('-', '_'), version


def run_checks(raise_exc=False):
    """
    Check that the packages we need are installed and the Python version is good.

    The requirements are read from ``../requirements.txt`` relative to the
    current working directory, and one ``[ OK ]``/``[FAIL]`` line is printed
    per item checked.

    Parameters
    ----------
    raise_exc : bool, default ``False``
        Whether to raise an exception if any of the packages doesn't
        match the requirements (used for GitHub Action).

    Raises
    ------
    Exception
        If ``raise_exc`` is ``True`` and at least one check failed.
    """
    failures = []

    # check the python version; sys.version_info is compared instead of the
    # free-form sys.version banner so no string parsing is involved
    print('Using Python in %s:' % sys.prefix)
    if (3, 6) <= sys.version_info[:2] < (3, 8):
        print(OK, 'Python is version %s\n' % sys.version)
    else:
        print(FAIL, 'Python version >= 3.6.0 and < 3.8.0 is required, but %s is installed.\n' % sys.version)
        failures.append('Python')

    # read in the requirements
    requirements = {}
    with open('../requirements.txt', 'r') as file:
        for line in file:
            parsed = _parse_requirement(line)
            if parsed is not None:
                pkg, version = parsed
                requirements[pkg] = version

    # check the requirements
    for pkg, req_version in requirements.items():
        try:
            mod = importlib.import_module(pkg)
        except ImportError:
            print(FAIL, '%s not installed.' % pkg)
            failures.append(pkg)
            continue

        if req_version:
            # not every module exposes __version__; report that instead of crashing
            version = getattr(mod, '__version__', None)
            if version is None:
                print(FAIL, '%s version could not be determined.' % pkg)
                failures.append(pkg)
                continue

            if isinstance(req_version, list):
                min_version, max_version = req_version
                installed = Version(version)
                too_old = installed < min_version
                too_new = max_version is not None and installed > max_version
                if too_old or too_new:
                    if max_version is None:
                        print(FAIL, '%s version >= %s is required, but %s installed.' % (pkg, min_version, version))
                    else:
                        print(FAIL, '%s version >= %s and <= %s is required, but %s installed.' % (pkg, min_version, max_version, version))
                    failures.append(pkg)
                    continue
            elif Version(version) != req_version:
                print(FAIL, '%s version %s is required, but %s installed.' % (pkg, req_version, version))
                failures.append(pkg)
                continue

        print(OK, '%s' % pkg)

    if failures and raise_exc:
        raise Exception(
            'Environment failed inspection due to incorrect versions '
            f'of {len(failures)} item(s): {", ".join(failures)}.'
        )


if __name__ == '__main__':
    run_checks(raise_exc=True)
90 |
--------------------------------------------------------------------------------
/ch_01/checking_your_setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Checking your setup\n",
8 | "Run through this notebook to make sure your environment is properly set up. Be sure to launch Jupyter from inside the virtual environment."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [
16 | {
17 | "name": "stdout",
18 | "output_type": "stream",
19 | "text": [
20 | "Using Python in c:\\users\\molinstefanie\\packt\\venv:\n",
21 | "\u001b[42m[ OK ]\u001b[0m Python is version 3.7.2 (tags/v3.7.2:9a3ffc0492, Dec 23 2018, 22:20:52) [MSC v.1916 32 bit (Intel)]\n",
22 | "\n",
23 | "\u001b[42m[ OK ]\u001b[0m graphviz\n",
24 | "\u001b[42m[ OK ]\u001b[0m imblearn\n",
25 | "\u001b[42m[ OK ]\u001b[0m jupyter\n",
26 | "\u001b[42m[ OK ]\u001b[0m jupyterlab\n",
27 | "\u001b[42m[ OK ]\u001b[0m matplotlib\n",
28 | "\u001b[42m[ OK ]\u001b[0m numpy\n",
29 | "\u001b[42m[ OK ]\u001b[0m pandas\n",
30 | "\u001b[42m[ OK ]\u001b[0m pandas_datareader\n",
31 | "\u001b[42m[ OK ]\u001b[0m requests\n",
32 | "\u001b[42m[ OK ]\u001b[0m sklearn\n",
33 | "\u001b[42m[ OK ]\u001b[0m scipy\n",
34 | "\u001b[42m[ OK ]\u001b[0m seaborn\n",
35 | "\u001b[42m[ OK ]\u001b[0m sqlalchemy\n",
36 | "\u001b[42m[ OK ]\u001b[0m statsmodels\n",
37 | "\u001b[42m[ OK ]\u001b[0m login_attempt_simulator\n",
38 | "\u001b[42m[ OK ]\u001b[0m ml_utils\n",
39 | "\u001b[42m[ OK ]\u001b[0m stock_analysis\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "from check_environment import run_checks\n",
45 | "run_checks()"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "*Note: Adapted from Andreas Mueller's [`check_env.ipynb` notebook](https://github.com/amueller/ml-workshop-1-of-4/blob/master/check_env.ipynb).*"
53 | ]
54 | }
55 | ],
56 | "metadata": {
57 | "kernelspec": {
58 | "display_name": "Python 3",
59 | "language": "python",
60 | "name": "python3"
61 | },
62 | "language_info": {
63 | "codemirror_mode": {
64 | "name": "ipython",
65 | "version": 3
66 | },
67 | "file_extension": ".py",
68 | "mimetype": "text/x-python",
69 | "name": "python",
70 | "nbconvert_exporter": "python",
71 | "pygments_lexer": "ipython3",
72 | "version": "3.7.2"
73 | }
74 | },
75 | "nbformat": 4,
76 | "nbformat_minor": 2
77 | }
78 |
--------------------------------------------------------------------------------
/ch_01/exercises.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Coding exercises\n",
8 | "Exercises 1-3 are thought exercises that don't require coding. \n",
9 | "\n",
10 | "## Exercise 4: Generate the data by running this cell\n",
11 | "This will give you a list of numbers to work with in the remaining exercises."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import random\n",
21 | "random.seed(0)\n",
22 | "salaries = [round(random.random()*1000000, -3) for _ in range(100)]"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Exercise 5: Calculating statistics and verifying\n",
30 | "### mean"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": []
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### median"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": []
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "### mode"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": []
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "### sample variance\n",
73 | "Remember to use Bessel's correction."
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": []
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "### sample standard deviation\n",
88 | "Remember to use Bessel's correction."
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": []
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "## Exercise 6: Calculating more statistics\n",
103 | "### range"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": []
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "### coefficient of variation"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": []
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "### interquartile range"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": []
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "### quartile coefficient of dispersion"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": []
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "## Exercise 7: Scaling data\n",
160 | "### min-max scaling"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": []
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "metadata": {},
173 | "source": [
174 | "### standardizing"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [],
182 | "source": []
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "## Exercise 8: Calculating covariance and correlation\n",
189 | "### covariance"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": []
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "### Pearson correlation coefficient ($\\rho$)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": []
212 | }
213 | ],
214 | "metadata": {
215 | "kernelspec": {
216 | "display_name": "Python 3",
217 | "language": "python",
218 | "name": "python3"
219 | },
220 | "language_info": {
221 | "codemirror_mode": {
222 | "name": "ipython",
223 | "version": 3
224 | },
225 | "file_extension": ".py",
226 | "mimetype": "text/x-python",
227 | "name": "python",
228 | "nbconvert_exporter": "python",
229 | "pygments_lexer": "ipython3",
230 | "version": "3.7.2"
231 | }
232 | },
233 | "nbformat": 4,
234 | "nbformat_minor": 2
235 | }
236 |
--------------------------------------------------------------------------------
/ch_02/data/example_data.csv:
--------------------------------------------------------------------------------
1 | time;place;magType;mag;alert;tsunami
2 | 2018-10-13 11:10:23.560;262km NW of Ozernovskiy, Russia;mww;6.7;green;1
3 | 2018-10-13 04:34:15.580;25km E of Bitung, Indonesia;mww;5.2;green;0
4 | 2018-10-13 00:13:46.220;42km WNW of Sola, Vanuatu;mww;5.7;green;0
5 | 2018-10-12 21:09:49.240;13km E of Nueva Concepcion, Guatemala;mww;5.7;green;0
6 | 2018-10-12 02:52:03.620;128km SE of Kimbe, Papua New Guinea;mww;5.6;green;1
7 |
--------------------------------------------------------------------------------
/ch_02/data/quakes.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/ch_02/data/quakes.db
--------------------------------------------------------------------------------
/ch_02/data/tsunamis.csv:
--------------------------------------------------------------------------------
1 | alert,type,title,place,magType,mag,time
2 | ,earthquake,"M 5.0 - 165km NNW of Flying Fish Cove, Christmas Island","165km NNW of Flying Fish Cove, Christmas Island",mww,5.0,1539459504090
3 | green,earthquake,"M 6.7 - 262km NW of Ozernovskiy, Russia","262km NW of Ozernovskiy, Russia",mww,6.7,1539429023560
4 | green,earthquake,"M 5.6 - 128km SE of Kimbe, Papua New Guinea","128km SE of Kimbe, Papua New Guinea",mww,5.6,1539312723620
5 | green,earthquake,"M 6.5 - 148km S of Severo-Kuril'sk, Russia","148km S of Severo-Kuril'sk, Russia",mww,6.5,1539213362130
6 | green,earthquake,"M 6.2 - 94km SW of Kokopo, Papua New Guinea","94km SW of Kokopo, Papua New Guinea",mww,6.2,1539208835130
7 | green,earthquake,"M 5.9 - 117km ESE of Kimbe, Papua New Guinea","117km ESE of Kimbe, Papua New Guinea",mww,5.9,1539205996680
8 | green,earthquake,"M 5.9 - 113km ESE of Kimbe, Papua New Guinea","113km ESE of Kimbe, Papua New Guinea",mww,5.9,1539205141060
9 | green,earthquake,"M 7.0 - 117km E of Kimbe, Papua New Guinea","117km E of Kimbe, Papua New Guinea",mww,7.0,1539204500290
10 | green,earthquake,"M 6.1 - 132km E of Kimbe, Papua New Guinea","132km E of Kimbe, Papua New Guinea",mb,6.1,1539204326420
11 | green,earthquake,"M 5.0 - 61km SSW of Chignik Lake, Alaska","61km SSW of Chignik Lake, Alaska",ml,5.0,1539152878406
12 | ,earthquake,"M 5.3 - 65km NNW of Lae, Papua New Guinea","65km NNW of Lae, Papua New Guinea",mb,5.3,1539150837980
13 | ,earthquake,"M 5.4 - 62km NW of Finschhafen, Papua New Guinea","62km NW of Finschhafen, Papua New Guinea",mww,5.4,1539136980090
14 | green,earthquake,"M 4.0 - 71km SW of Kaktovik, Alaska","71km SW of Kaktovik, Alaska",ml,4.0,1539069081499
15 | ,earthquake,"M 5.1 - 13km E of Palu, Indonesia","13km E of Palu, Indonesia",mb,5.1,1539033346530
16 | ,earthquake,"M 5.1 - 14km ENE of Sambelia, Indonesia","14km ENE of Sambelia, Indonesia",mww,5.1,1538935041200
17 | green,earthquake,"M 4.0 - 60km WNW of Valdez, Alaska","60km WNW of Valdez, Alaska",ml,4.0,1538904354275
18 | ,earthquake,"M 5.0 - 11km ESE of Kimbe, Papua New Guinea","11km ESE of Kimbe, Papua New Guinea",mww,5.0,1538842952660
19 | green,earthquake,"M 3.8 - 5km SW of Tres Pinos, CA","5km SW of Tres Pinos, CA",mw,3.83,1538746177550
20 | green,earthquake,"M 4.0 - 67km SSW of Kaktovik, Alaska","67km SSW of Kaktovik, Alaska",ml,4.0,1538658776412
21 | ,earthquake,"M 5.3 - 29km SSW of Nggongi, Indonesia","29km SSW of Nggongi, Indonesia",mb,5.3,1538570285120
22 | ,earthquake,"M 5.0 - 38km S of Nggongi Satu, Indonesia","38km S of Nggongi Satu, Indonesia",mww,5.0,1538560686080
23 | ,earthquake,"M 5.0 - 50km WSW of Kasiguncu, Indonesia","50km WSW of Kasiguncu, Indonesia",mb,5.0,1538456366290
24 | green,earthquake,"M 5.6 - 33km SSW of Nggongi Satu, Indonesia","33km SSW of Nggongi Satu, Indonesia",mww,5.6,1538455771470
25 | green,earthquake,"M 5.9 - 30km SSW of Nggongi, Indonesia","30km SSW of Nggongi, Indonesia",mww,5.9,1538439405760
26 | green,earthquake,"M 6.0 - 33km S of Nggongi Satu, Indonesia","33km S of Nggongi Satu, Indonesia",mww,6.0,1538438383070
27 | ,earthquake,"M 5.2 - 25km N of Palu, Indonesia","25km N of Palu, Indonesia",mww,5.2,1538437599550
28 | ,earthquake,"M 5.4 - 31km S of Nggongi Satu, Indonesia","31km S of Nggongi Satu, Indonesia",mb,5.4,1538436426090
29 | ,earthquake,"M 5.0 - 33km SSW of Nggongi Satu, Indonesia","33km SSW of Nggongi Satu, Indonesia",mww,5.0,1538435522580
30 | ,earthquake,"M 5.1 - 101km NNW of Palu, Indonesia","101km NNW of Palu, Indonesia",mww,5.1,1538372615190
31 | ,earthquake,"M 5.0 - 106km NNW of Lae, Papua New Guinea","106km NNW of Lae, Papua New Guinea",mb,5.0,1538344682130
32 | green,earthquake,"M 6.7 - 263km NNE of Ndoi Island, Fiji","263km NNE of Ndoi Island, Fiji",mww,6.7,1538304744240
33 | ,earthquake,"M 5.1 - 49km W of Kasiguncu, Indonesia","49km W of Kasiguncu, Indonesia",mww,5.1,1538217018480
34 | ,earthquake,"M 5.1 - 53km W of Kasiguncu, Indonesia","53km W of Kasiguncu, Indonesia",mb,5.1,1538206811760
35 | green,earthquake,"M 4.4 - 1km SE of Delta, B.C., MX","1km SE of Delta, B.C., MX",mw,4.41,1538187466720
36 | ,earthquake,"M 5.0 - 55km WSW of Kasiguncu, Indonesia","55km WSW of Kasiguncu, Indonesia",mww,5.0,1538169841560
37 | ,earthquake,"M 5.0 - 45km SSW of Palu, Indonesia","45km SSW of Palu, Indonesia",mb,5.0,1538148942250
38 | ,earthquake,"M 5.4 - 113km NNW of Palu, Indonesia","113km NNW of Palu, Indonesia",mb,5.4,1538144760960
39 | ,earthquake,"M 5.2 - 58km S of Palu, Indonesia","58km S of Palu, Indonesia",mb,5.2,1538141984430
40 | green,earthquake,"M 5.7 - 107km N of Palu, Indonesia","107km N of Palu, Indonesia",ms_20,5.7,1538141730630
41 | ,earthquake,"M 5.1 - 47km N of Palu, Indonesia","47km N of Palu, Indonesia",mb,5.1,1538137653240
42 | ,earthquake,"M 5.2 - 68km SSE of Palu, Indonesia","68km SSE of Palu, Indonesia",mb,5.2,1538132811150
43 | green,earthquake,"M 5.7 - 17km NNE of Palu, Indonesia","17km NNE of Palu, Indonesia",mb,5.7,1538131825150
44 | ,earthquake,"M 5.1 - 42km N of Palu, Indonesia","42km N of Palu, Indonesia",mb,5.1,1538131664560
45 | ,earthquake,"M 5.4 - 24km N of Palu, Indonesia","24km N of Palu, Indonesia",mb,5.4,1538131143050
46 | green,earthquake,"M 5.8 - 21km SSE of Palu, Indonesia","21km SSE of Palu, Indonesia",mb,5.8,1538130304440
47 | green,earthquake,"M 5.7 - 17km E of Palu, Indonesia","17km E of Palu, Indonesia",mb,5.7,1538129809140
48 | green,earthquake,"M 5.8 - 99km N of Palu, Indonesia","99km N of Palu, Indonesia",mb,5.8,1538129660450
49 | red,earthquake,"M 7.5 - 78km N of Palu, Indonesia","78km N of Palu, Indonesia",mww,7.5,1538128963480
50 | ,earthquake,"M 5.0 - 60km N of Palu, Indonesia","60km N of Palu, Indonesia",mb,5.0,1538123098480
51 | ,earthquake,"M 5.4 - 77km NNE of Palu, Indonesia","77km NNE of Palu, Indonesia",mb,5.4,1538118198440
52 | green,earthquake,"M 6.1 - 55km NNW of Palu, Indonesia","55km NNW of Palu, Indonesia",mww,6.1,1538118001950
53 | ,earthquake,"M 5.2 - 91km WNW of Panguna, Papua New Guinea","91km WNW of Panguna, Papua New Guinea",mww,5.2,1538063612790
54 | ,earthquake,"M 5.1 - 117km N of Saumlaki, Indonesia","117km N of Saumlaki, Indonesia",mb,5.1,1538026140750
55 | ,earthquake,"M 5.1 - 19km WNW of Langsa, Indonesia","19km WNW of Langsa, Indonesia",mww,5.1,1537984301360
56 | ,earthquake,"M 5.0 - 85km W of Manokwari, Indonesia","85km W of Manokwari, Indonesia",mww,5.0,1537954061090
57 | ,earthquake,"M 5.0 - 10km WSW of Kainantu, Papua New Guinea","10km WSW of Kainantu, Papua New Guinea",mb,5.0,1537760541200
58 | ,earthquake,"M 5.4 - 228km S of Taron, Papua New Guinea","228km S of Taron, Papua New Guinea",mb,5.4,1537427126700
59 | ,earthquake,"M 5.1 - 278km SE of Pondaguitan, Philippines","278km SE of Pondaguitan, Philippines",mb,5.1,1537411002190
60 | green,earthquake,"M 5.1 - 64km SSW of Kaktovik, Alaska","64km SSW of Kaktovik, Alaska",ml,5.1,1537274456960
61 | ,earthquake,"M 5.2 - 126km N of Dili, East Timor","126km N of Dili, East Timor",mb,5.2,1537262729590
62 | ,earthquake,"M 5.1 - 34km NW of Finschhafen, Papua New Guinea","34km NW of Finschhafen, Papua New Guinea",mb,5.1,1537236235470
63 |
--------------------------------------------------------------------------------
/ch_03/data/long_data.csv:
--------------------------------------------------------------------------------
1 | attributes,datatype,date,station,value
2 | ",,H,0700",TMAX,2018-10-01T00:00:00,GHCND:USC00280907,21.1
3 | ",,H,0700",TMIN,2018-10-01T00:00:00,GHCND:USC00280907,8.9
4 | ",,H,0700",TOBS,2018-10-01T00:00:00,GHCND:USC00280907,13.9
5 | ",,H,0700",TMAX,2018-10-02T00:00:00,GHCND:USC00280907,23.9
6 | ",,H,0700",TMIN,2018-10-02T00:00:00,GHCND:USC00280907,13.9
7 | ",,H,0700",TOBS,2018-10-02T00:00:00,GHCND:USC00280907,17.2
8 | ",,H,0700",TMAX,2018-10-03T00:00:00,GHCND:USC00280907,25.0
9 | ",,H,0700",TMIN,2018-10-03T00:00:00,GHCND:USC00280907,15.6
10 | ",,H,0700",TOBS,2018-10-03T00:00:00,GHCND:USC00280907,16.1
11 | ",,H,0700",TMAX,2018-10-04T00:00:00,GHCND:USC00280907,22.8
12 | ",,H,0700",TMIN,2018-10-04T00:00:00,GHCND:USC00280907,11.7
13 | ",,H,0700",TOBS,2018-10-04T00:00:00,GHCND:USC00280907,11.7
14 | ",,H,0700",TMAX,2018-10-05T00:00:00,GHCND:USC00280907,23.3
15 | ",,H,0700",TMIN,2018-10-05T00:00:00,GHCND:USC00280907,11.7
16 | ",,H,0700",TOBS,2018-10-05T00:00:00,GHCND:USC00280907,18.9
17 | ",,H,0700",TMAX,2018-10-06T00:00:00,GHCND:USC00280907,20.0
18 | ",,H,0700",TMIN,2018-10-06T00:00:00,GHCND:USC00280907,13.3
19 | ",,H,0700",TOBS,2018-10-06T00:00:00,GHCND:USC00280907,16.1
20 | ",,H,0700",TMAX,2018-10-07T00:00:00,GHCND:USC00280907,20.0
21 | ",,H,0700",TMIN,2018-10-07T00:00:00,GHCND:USC00280907,16.1
22 | ",,H,0700",TOBS,2018-10-07T00:00:00,GHCND:USC00280907,20.0
23 | ",,H,0700",TMAX,2018-10-08T00:00:00,GHCND:USC00280907,26.7
24 | ",,H,0700",TMIN,2018-10-08T00:00:00,GHCND:USC00280907,17.8
25 | ",,H,0700",TOBS,2018-10-08T00:00:00,GHCND:USC00280907,17.8
26 | ",,H,0700",TMAX,2018-10-09T00:00:00,GHCND:USC00280907,18.9
27 | ",,H,0700",TMIN,2018-10-09T00:00:00,GHCND:USC00280907,17.2
28 | ",,H,0700",TOBS,2018-10-09T00:00:00,GHCND:USC00280907,17.8
29 | ",,H,0700",TMAX,2018-10-10T00:00:00,GHCND:USC00280907,24.4
30 | ",,H,0700",TMIN,2018-10-10T00:00:00,GHCND:USC00280907,17.2
31 | ",,H,0700",TOBS,2018-10-10T00:00:00,GHCND:USC00280907,18.3
32 | ",,H,0700",TMAX,2018-10-11T00:00:00,GHCND:USC00280907,26.1
33 | ",,H,0700",TMIN,2018-10-11T00:00:00,GHCND:USC00280907,17.8
34 | ",,H,0700",TOBS,2018-10-11T00:00:00,GHCND:USC00280907,21.7
35 | ",,H,0700",TMAX,2018-10-12T00:00:00,GHCND:USC00280907,22.8
36 | ",,H,0700",TMIN,2018-10-12T00:00:00,GHCND:USC00280907,14.4
37 | ",,H,0700",TOBS,2018-10-12T00:00:00,GHCND:USC00280907,15.6
38 | ",,H,0700",TMAX,2018-10-13T00:00:00,GHCND:USC00280907,15.6
39 | ",,H,0700",TMIN,2018-10-13T00:00:00,GHCND:USC00280907,7.2
40 | ",,H,0700",TOBS,2018-10-13T00:00:00,GHCND:USC00280907,8.3
41 | ",,H,0700",TMAX,2018-10-14T00:00:00,GHCND:USC00280907,13.3
42 | ",,H,0700",TMIN,2018-10-14T00:00:00,GHCND:USC00280907,5.6
43 | ",,H,0700",TOBS,2018-10-14T00:00:00,GHCND:USC00280907,6.7
44 | ",,H,0700",TMAX,2018-10-15T00:00:00,GHCND:USC00280907,13.3
45 | ",,H,0700",TMIN,2018-10-15T00:00:00,GHCND:USC00280907,6.7
46 | ",,H,0700",TOBS,2018-10-15T00:00:00,GHCND:USC00280907,10.0
47 | ",,H,0700",TMAX,2018-10-16T00:00:00,GHCND:USC00280907,18.9
48 | ",,H,0700",TMIN,2018-10-16T00:00:00,GHCND:USC00280907,7.8
49 | ",,H,0700",TOBS,2018-10-16T00:00:00,GHCND:USC00280907,7.8
50 | ",,H,0700",TMAX,2018-10-17T00:00:00,GHCND:USC00280907,13.3
51 | ",,H,0700",TMIN,2018-10-17T00:00:00,GHCND:USC00280907,3.3
52 | ",,H,0700",TOBS,2018-10-17T00:00:00,GHCND:USC00280907,5.0
53 | ",,H,0700",TMAX,2018-10-18T00:00:00,GHCND:USC00280907,16.1
54 | ",,H,0700",TMIN,2018-10-18T00:00:00,GHCND:USC00280907,4.4
55 | ",,H,0700",TOBS,2018-10-18T00:00:00,GHCND:USC00280907,5.0
56 | ",,H,0700",TMAX,2018-10-19T00:00:00,GHCND:USC00280907,10.0
57 | ",,H,0700",TMIN,2018-10-19T00:00:00,GHCND:USC00280907,-1.1
58 | ",,H,0700",TOBS,2018-10-19T00:00:00,GHCND:USC00280907,0.0
59 | ",,H,0700",TMAX,2018-10-20T00:00:00,GHCND:USC00280907,15.0
60 | ",,H,0700",TMIN,2018-10-20T00:00:00,GHCND:USC00280907,-0.6
61 | ",,H,0700",TOBS,2018-10-20T00:00:00,GHCND:USC00280907,10.6
62 | ",,H,0700",TMAX,2018-10-21T00:00:00,GHCND:USC00280907,16.7
63 | ",,H,0700",TMIN,2018-10-21T00:00:00,GHCND:USC00280907,7.8
64 | ",,H,0700",TOBS,2018-10-21T00:00:00,GHCND:USC00280907,7.8
65 | ",,H,0700",TMAX,2018-10-22T00:00:00,GHCND:USC00280907,7.8
66 | ",,H,0700",TMIN,2018-10-22T00:00:00,GHCND:USC00280907,-1.1
67 | ",,H,0700",TOBS,2018-10-22T00:00:00,GHCND:USC00280907,-1.1
68 | ",,H,0700",TMAX,2018-10-23T00:00:00,GHCND:USC00280907,15.6
69 | ",,H,0700",TMIN,2018-10-23T00:00:00,GHCND:USC00280907,-1.1
70 | ",,H,0700",TOBS,2018-10-23T00:00:00,GHCND:USC00280907,10.0
71 | ",,H,0700",TMAX,2018-10-24T00:00:00,GHCND:USC00280907,16.7
72 | ",,H,0700",TMIN,2018-10-24T00:00:00,GHCND:USC00280907,4.4
73 | ",,H,0700",TOBS,2018-10-24T00:00:00,GHCND:USC00280907,6.7
74 | ",,H,0700",TMAX,2018-10-25T00:00:00,GHCND:USC00280907,11.7
75 | ",,H,0700",TMIN,2018-10-25T00:00:00,GHCND:USC00280907,2.8
76 | ",,H,0700",TOBS,2018-10-25T00:00:00,GHCND:USC00280907,2.8
77 | ",,H,0700",TMAX,2018-10-26T00:00:00,GHCND:USC00280907,9.4
78 | ",,H,0700",TMIN,2018-10-26T00:00:00,GHCND:USC00280907,-0.6
79 | ",,H,0700",TOBS,2018-10-26T00:00:00,GHCND:USC00280907,-0.6
80 | ",,H,0700",TMAX,2018-10-27T00:00:00,GHCND:USC00280907,8.9
81 | ",,H,0700",TMIN,2018-10-27T00:00:00,GHCND:USC00280907,-0.6
82 | ",,H,0700",TOBS,2018-10-27T00:00:00,GHCND:USC00280907,6.1
83 | ",,H,0700",TMAX,2018-10-28T00:00:00,GHCND:USC00280907,8.3
84 | ",,H,0700",TMIN,2018-10-28T00:00:00,GHCND:USC00280907,5.0
85 | ",,H,0700",TOBS,2018-10-28T00:00:00,GHCND:USC00280907,7.2
86 | ",,H,0700",TMAX,2018-10-29T00:00:00,GHCND:USC00280907,10.6
87 | ",,H,0700",TMIN,2018-10-29T00:00:00,GHCND:USC00280907,6.7
88 | ",,H,0700",TOBS,2018-10-29T00:00:00,GHCND:USC00280907,8.3
89 | ",,H,0700",TMAX,2018-10-30T00:00:00,GHCND:USC00280907,13.3
90 | ",,H,0700",TMIN,2018-10-30T00:00:00,GHCND:USC00280907,2.2
91 | ",,H,0700",TOBS,2018-10-30T00:00:00,GHCND:USC00280907,5.0
92 | ",,H,0700",TMAX,2018-10-31T00:00:00,GHCND:USC00280907,12.2
93 | ",,H,0700",TMIN,2018-10-31T00:00:00,GHCND:USC00280907,0.0
94 | ",,H,0700",TOBS,2018-10-31T00:00:00,GHCND:USC00280907,0.0
95 |
--------------------------------------------------------------------------------
/ch_03/data/nyc_temperatures.csv:
--------------------------------------------------------------------------------
1 | attributes,datatype,date,station,value
2 | "H,,S,",TAVG,2018-10-01T00:00:00,GHCND:USW00014732,21.2
3 | ",,W,2400",TMAX,2018-10-01T00:00:00,GHCND:USW00014732,25.6
4 | ",,W,2400",TMIN,2018-10-01T00:00:00,GHCND:USW00014732,18.3
5 | "H,,S,",TAVG,2018-10-02T00:00:00,GHCND:USW00014732,22.7
6 | ",,W,2400",TMAX,2018-10-02T00:00:00,GHCND:USW00014732,26.1
7 | ",,W,2400",TMIN,2018-10-02T00:00:00,GHCND:USW00014732,19.4
8 | "H,,S,",TAVG,2018-10-03T00:00:00,GHCND:USW00014732,21.8
9 | ",,W,2400",TMAX,2018-10-03T00:00:00,GHCND:USW00014732,25.0
10 | ",,W,2400",TMIN,2018-10-03T00:00:00,GHCND:USW00014732,18.9
11 | "H,,S,",TAVG,2018-10-04T00:00:00,GHCND:USW00014732,21.3
12 | ",,W,2400",TMAX,2018-10-04T00:00:00,GHCND:USW00014732,26.1
13 | ",,W,2400",TMIN,2018-10-04T00:00:00,GHCND:USW00014732,17.8
14 | "H,,S,",TAVG,2018-10-05T00:00:00,GHCND:USW00014732,20.3
15 | ",,W,2400",TMAX,2018-10-05T00:00:00,GHCND:USW00014732,22.8
16 | ",,W,2400",TMIN,2018-10-05T00:00:00,GHCND:USW00014732,16.1
17 | "H,,S,",TAVG,2018-10-06T00:00:00,GHCND:USW00014732,18.7
18 | ",,W,2400",TMAX,2018-10-06T00:00:00,GHCND:USW00014732,21.1
19 | ",,W,2400",TMIN,2018-10-06T00:00:00,GHCND:USW00014732,17.8
20 | "H,,S,",TAVG,2018-10-07T00:00:00,GHCND:USW00014732,22.8
21 | ",,W,2400",TMAX,2018-10-07T00:00:00,GHCND:USW00014732,27.8
22 | ",,W,2400",TMIN,2018-10-07T00:00:00,GHCND:USW00014732,21.1
23 | "H,,S,",TAVG,2018-10-08T00:00:00,GHCND:USW00014732,20.9
24 | ",,W,2400",TMAX,2018-10-08T00:00:00,GHCND:USW00014732,22.8
25 | ",,W,2400",TMIN,2018-10-08T00:00:00,GHCND:USW00014732,18.3
26 | "H,,S,",TAVG,2018-10-09T00:00:00,GHCND:USW00014732,21.8
27 | ",,W,2400",TMAX,2018-10-09T00:00:00,GHCND:USW00014732,25.6
28 | ",,W,2400",TMIN,2018-10-09T00:00:00,GHCND:USW00014732,19.4
29 | "H,,S,",TAVG,2018-10-10T00:00:00,GHCND:USW00014732,23.8
30 | ",,W,2400",TMAX,2018-10-10T00:00:00,GHCND:USW00014732,27.8
31 | ",,W,2400",TMIN,2018-10-10T00:00:00,GHCND:USW00014732,21.7
32 | "H,,S,",TAVG,2018-10-11T00:00:00,GHCND:USW00014732,23.4
33 | ",,W,2400",TMAX,2018-10-11T00:00:00,GHCND:USW00014732,26.7
34 | ",,W,2400",TMIN,2018-10-11T00:00:00,GHCND:USW00014732,21.7
35 | "H,,S,",TAVG,2018-10-12T00:00:00,GHCND:USW00014732,18.3
36 | ",,W,2400",TMAX,2018-10-12T00:00:00,GHCND:USW00014732,22.2
37 | ",,W,2400",TMIN,2018-10-12T00:00:00,GHCND:USW00014732,12.2
38 | "H,,S,",TAVG,2018-10-13T00:00:00,GHCND:USW00014732,12.2
39 | ",,W,2400",TMAX,2018-10-13T00:00:00,GHCND:USW00014732,15.0
40 | ",,W,2400",TMIN,2018-10-13T00:00:00,GHCND:USW00014732,9.4
41 | "H,,S,",TAVG,2018-10-14T00:00:00,GHCND:USW00014732,12.9
42 | ",,W,2400",TMAX,2018-10-14T00:00:00,GHCND:USW00014732,15.6
43 | ",,W,2400",TMIN,2018-10-14T00:00:00,GHCND:USW00014732,10.6
44 | "H,,S,",TAVG,2018-10-15T00:00:00,GHCND:USW00014732,15.8
45 | ",,W,2400",TMAX,2018-10-15T00:00:00,GHCND:USW00014732,21.1
46 | ",,W,2400",TMIN,2018-10-15T00:00:00,GHCND:USW00014732,12.8
47 | "H,,S,",TAVG,2018-10-16T00:00:00,GHCND:USW00014732,14.3
48 | ",,W,2400",TMAX,2018-10-16T00:00:00,GHCND:USW00014732,16.7
49 | ",,W,2400",TMIN,2018-10-16T00:00:00,GHCND:USW00014732,9.4
50 | "H,,S,",TAVG,2018-10-17T00:00:00,GHCND:USW00014732,13.2
51 | ",,W,2400",TMAX,2018-10-17T00:00:00,GHCND:USW00014732,17.8
52 | ",,W,2400",TMIN,2018-10-17T00:00:00,GHCND:USW00014732,8.9
53 | "H,,S,",TAVG,2018-10-18T00:00:00,GHCND:USW00014732,9.6
54 | ",,W,2400",TMAX,2018-10-18T00:00:00,GHCND:USW00014732,11.7
55 | ",,W,2400",TMIN,2018-10-18T00:00:00,GHCND:USW00014732,6.7
56 | "H,,S,",TAVG,2018-10-19T00:00:00,GHCND:USW00014732,11.3
57 | ",,W,2400",TMAX,2018-10-19T00:00:00,GHCND:USW00014732,17.2
58 | ",,W,2400",TMIN,2018-10-19T00:00:00,GHCND:USW00014732,7.2
59 | "H,,S,",TAVG,2018-10-20T00:00:00,GHCND:USW00014732,15.0
60 | ",,W,2400",TMAX,2018-10-20T00:00:00,GHCND:USW00014732,18.3
61 | ",,W,2400",TMIN,2018-10-20T00:00:00,GHCND:USW00014732,12.2
62 | "H,,S,",TAVG,2018-10-21T00:00:00,GHCND:USW00014732,10.7
63 | ",,W,2400",TMAX,2018-10-21T00:00:00,GHCND:USW00014732,12.2
64 | ",,W,2400",TMIN,2018-10-21T00:00:00,GHCND:USW00014732,6.1
65 | "H,,S,",TAVG,2018-10-22T00:00:00,GHCND:USW00014732,8.3
66 | ",,W,2400",TMAX,2018-10-22T00:00:00,GHCND:USW00014732,11.1
67 | ",,W,2400",TMIN,2018-10-22T00:00:00,GHCND:USW00014732,5.6
68 | "H,,S,",TAVG,2018-10-23T00:00:00,GHCND:USW00014732,12.6
69 | ",,W,2400",TMAX,2018-10-23T00:00:00,GHCND:USW00014732,19.4
70 | ",,W,2400",TMIN,2018-10-23T00:00:00,GHCND:USW00014732,8.3
71 | "H,,S,",TAVG,2018-10-24T00:00:00,GHCND:USW00014732,11.0
72 | ",,W,2400",TMAX,2018-10-24T00:00:00,GHCND:USW00014732,13.3
73 | ",,W,2400",TMIN,2018-10-24T00:00:00,GHCND:USW00014732,7.8
74 | "H,,S,",TAVG,2018-10-25T00:00:00,GHCND:USW00014732,8.8
75 | ",,W,2400",TMAX,2018-10-25T00:00:00,GHCND:USW00014732,11.1
76 | ",,W,2400",TMIN,2018-10-25T00:00:00,GHCND:USW00014732,6.1
77 | "H,,S,",TAVG,2018-10-26T00:00:00,GHCND:USW00014732,7.3
78 | ",,W,2400",TMAX,2018-10-26T00:00:00,GHCND:USW00014732,10.0
79 | ",,W,2400",TMIN,2018-10-26T00:00:00,GHCND:USW00014732,5.6
80 | "H,,S,",TAVG,2018-10-27T00:00:00,GHCND:USW00014732,9.4
81 | ",,W,2400",TMAX,2018-10-27T00:00:00,GHCND:USW00014732,11.7
82 | ",,W,2400",TMIN,2018-10-27T00:00:00,GHCND:USW00014732,7.2
83 | "H,,S,",TAVG,2018-10-28T00:00:00,GHCND:USW00014732,10.2
84 | ",,W,2400",TMAX,2018-10-28T00:00:00,GHCND:USW00014732,12.2
85 | ",,W,2400",TMIN,2018-10-28T00:00:00,GHCND:USW00014732,8.3
86 | "H,,S,",TAVG,2018-10-29T00:00:00,GHCND:USW00014732,11.8
87 | ",,W,2400",TMAX,2018-10-29T00:00:00,GHCND:USW00014732,14.4
88 | ",,W,2400",TMIN,2018-10-29T00:00:00,GHCND:USW00014732,9.4
89 | "H,,S,",TAVG,2018-10-30T00:00:00,GHCND:USW00014732,10.2
90 | ",,W,2400",TMAX,2018-10-30T00:00:00,GHCND:USW00014732,13.9
91 | ",,W,2400",TMIN,2018-10-30T00:00:00,GHCND:USW00014732,7.2
92 | "H,,S,",TAVG,2018-10-31T00:00:00,GHCND:USW00014732,12.6
93 | ",,W,2400",TMAX,2018-10-31T00:00:00,GHCND:USW00014732,17.8
94 | ",,W,2400",TMIN,2018-10-31T00:00:00,GHCND:USW00014732,7.2
95 |
--------------------------------------------------------------------------------
/ch_03/data/wide_data.csv:
--------------------------------------------------------------------------------
1 | date,TMAX,TMIN,TOBS
2 | 2018-10-01,21.1,8.9,13.9
3 | 2018-10-02,23.9,13.9,17.2
4 | 2018-10-03,25.0,15.6,16.1
5 | 2018-10-04,22.8,11.7,11.7
6 | 2018-10-05,23.3,11.7,18.9
7 | 2018-10-06,20.0,13.3,16.1
8 | 2018-10-07,20.0,16.1,20.0
9 | 2018-10-08,26.7,17.8,17.8
10 | 2018-10-09,18.9,17.2,17.8
11 | 2018-10-10,24.4,17.2,18.3
12 | 2018-10-11,26.1,17.8,21.7
13 | 2018-10-12,22.8,14.4,15.6
14 | 2018-10-13,15.6,7.2,8.3
15 | 2018-10-14,13.3,5.6,6.7
16 | 2018-10-15,13.3,6.7,10.0
17 | 2018-10-16,18.9,7.8,7.8
18 | 2018-10-17,13.3,3.3,5.0
19 | 2018-10-18,16.1,4.4,5.0
20 | 2018-10-19,10.0,-1.1,0.0
21 | 2018-10-20,15.0,-0.6,10.6
22 | 2018-10-21,16.7,7.8,7.8
23 | 2018-10-22,7.8,-1.1,-1.1
24 | 2018-10-23,15.6,-1.1,10.0
25 | 2018-10-24,16.7,4.4,6.7
26 | 2018-10-25,11.7,2.8,2.8
27 | 2018-10-26,9.4,-0.6,-0.6
28 | 2018-10-27,8.9,-0.6,6.1
29 | 2018-10-28,8.3,5.0,7.2
30 | 2018-10-29,10.6,6.7,8.3
31 | 2018-10-30,13.3,2.2,5.0
32 | 2018-10-31,12.2,0.0,0.0
33 |
--------------------------------------------------------------------------------
/ch_03/exercises/fb.csv:
--------------------------------------------------------------------------------
1 | date,open,high,low,close,volume
2 | 2018-01-02,177.68,181.58,177.55,181.42,18151903
3 | 2018-01-03,181.88,184.78,181.33,184.67,16886563
4 | 2018-01-04,184.9,186.21,184.0996,184.33,13880896
5 | 2018-01-05,185.59,186.9,184.93,186.85,13574535
6 | 2018-01-08,187.2,188.9,186.33,188.28,17994726
7 | 2018-01-09,188.7,188.8,187.1,187.87,12393057
8 | 2018-01-10,186.94,187.89,185.63,187.84,10529894
9 | 2018-01-11,188.4,188.4,187.38,187.77,9588587
10 | 2018-01-12,178.06,181.48,177.4,179.37,77551299
11 | 2018-01-16,181.5,181.75,178.04,178.39,36183842
12 | 2018-01-17,179.26,179.32,175.8,177.6,27992376
13 | 2018-01-18,178.13,180.98,177.08,179.8,23304901
14 | 2018-01-19,180.85,182.37,180.1702,181.29,26826540
15 | 2018-01-22,180.8,185.39,180.41,185.37,21059464
16 | 2018-01-23,186.05,189.55,185.55,189.35,25678781
17 | 2018-01-24,189.89,190.66,186.52,186.55,24334548
18 | 2018-01-25,187.95,188.62,186.6,187.48,17377740
19 | 2018-01-26,187.75,190.0,186.81,190.0,17759212
20 | 2018-01-29,188.75,188.84,185.6301,185.98,20453172
21 | 2018-01-30,183.01,188.18,181.84,187.12,20858556
22 | 2018-01-31,188.37,189.83,185.22,186.89,43275144
23 | 2018-02-01,188.22,195.32,187.89,193.09,54211293
24 | 2018-02-02,192.04,194.21,189.98,190.28,26677484
25 | 2018-02-05,186.93,190.61,180.61,181.26,33128206
26 | 2018-02-06,178.57,185.77,177.74,185.31,37758505
27 | 2018-02-07,184.15,185.0817,179.95,180.18,27601886
28 | 2018-02-08,181.01,181.84,171.4815,171.58,38478321
29 | 2018-02-09,174.76,176.9,167.18,176.11,39887626
30 | 2018-02-12,177.06,177.545,171.84,176.41,32092133
31 | 2018-02-13,175.62,175.97,173.1,173.15,21809350
32 | 2018-02-14,173.45,179.81,173.2119,179.52,28929704
33 | 2018-02-15,180.5,180.5,176.84,179.96,20922120
34 | 2018-02-16,178.99,179.88,176.3,177.36,21015610
35 | 2018-02-20,175.77,177.95,175.11,176.01,21204921
36 | 2018-02-21,176.71,181.27,176.4,177.91,23200804
37 | 2018-02-22,178.7,180.21,177.41,178.99,18464192
38 | 2018-02-23,179.9,183.39,179.51,183.29,19007288
39 | 2018-02-26,184.58,185.66,183.2228,184.93,17599703
40 | 2018-02-27,184.45,184.7,181.46,181.46,15849806
41 | 2018-02-28,182.3,182.88,178.14,178.32,18783039
42 | 2018-03-01,179.01,180.12,174.41,175.94,23201626
43 | 2018-03-02,173.29,177.11,172.99,176.62,20025905
44 | 2018-03-05,176.2,181.1475,175.89,180.4,16189280
45 | 2018-03-06,181.78,182.38,179.11,179.78,15086784
46 | 2018-03-07,178.74,183.82,178.07,183.71,19097293
47 | 2018-03-08,183.56,184.4,181.45,182.34,17225946
48 | 2018-03-09,183.91,185.51,183.21,185.23,18526292
49 | 2018-03-12,185.23,186.1,184.22,184.76,15301229
50 | 2018-03-13,185.61,185.99,181.11,181.88,18067477
51 | 2018-03-14,182.6,184.25,181.85,184.19,16821728
52 | 2018-03-15,183.24,184.0,182.19,183.86,15645035
53 | 2018-03-16,184.49,185.33,183.41,185.09,24403438
54 | 2018-03-19,177.01,177.17,170.06,172.56,88140060
55 | 2018-03-20,167.47,170.2,161.95,168.15,129851768
56 | 2018-03-21,164.8,173.4,163.3,169.39,106598834
57 | 2018-03-22,166.13,170.27,163.72,164.89,73742979
58 | 2018-03-23,165.44,167.1,159.02,159.39,53609706
59 | 2018-03-26,160.82,161.1,149.02,160.06,126116634
60 | 2018-03-27,156.31,162.85,150.75,152.22,79116995
61 | 2018-03-28,151.65,155.88,150.8,153.03,60029170
62 | 2018-03-29,155.15,161.42,154.14,159.79,59434293
63 | 2018-04-02,157.81,159.2,154.111,155.39,36795991
64 | 2018-04-03,156.55,157.39,150.81,156.11,42543865
65 | 2018-04-04,152.025,155.56,150.51,155.1,49885584
66 | 2018-04-05,161.56,161.575,156.65,159.34,41449609
67 | 2018-04-06,157.73,161.42,156.81,157.2,41644812
68 | 2018-04-09,157.82,160.53,156.04,157.93,34915227
69 | 2018-04-10,157.93,165.98,157.01,165.04,58947041
70 | 2018-04-11,165.36,168.65,163.25,166.32,56144633
71 | 2018-04-12,166.98,167.45,163.1,163.87,38262956
72 | 2018-04-13,164.58,165.7036,163.77,164.52,19990561
73 | 2018-04-16,165.7249,165.78,163.39,164.83,18119435
74 | 2018-04-17,165.83,169.0,165.66,168.66,22743029
75 | 2018-04-18,166.88,168.12,165.77,166.36,20969568
76 | 2018-04-19,166.2,168.33,165.2,168.1,22234961
77 | 2018-04-20,167.79,168.43,165.81,166.28,19119438
78 | 2018-04-23,167.27,168.45,165.09,165.84,23088102
79 | 2018-04-24,165.43,166.1,158.19,159.69,35079926
80 | 2018-04-25,160.1448,161.06,156.19,159.69,41083581
81 | 2018-04-26,173.22,176.27,170.8,174.16,77556934
82 | 2018-04-27,176.81,177.1,172.6,173.59,29804657
83 | 2018-04-30,173.79,175.72,171.71,172.0,20750478
84 | 2018-05-01,172.0,174.02,170.23,173.86,26025932
85 | 2018-05-02,174.24599999999998,178.08,174.2,176.07,30424450
86 | 2018-05-03,175.13,176.12,172.12,174.02,24026071
87 | 2018-05-04,173.08,176.98,173.06,176.61,17677844
88 | 2018-05-07,177.35,179.5,177.17,177.97,18697195
89 | 2018-05-08,178.25,179.04,177.11,178.92,15577211
90 | 2018-05-09,179.67,183.01,178.7807,182.66,23282811
91 | 2018-05-10,183.15,186.1292,182.5,185.53,21071403
92 | 2018-05-11,184.85,188.32,184.18,186.99,21207848
93 | 2018-05-14,187.71,187.86,186.2,186.64,15646744
94 | 2018-05-15,184.88,185.29,183.2,184.32,15429433
95 | 2018-05-16,183.6952,184.32,182.66,183.2,16975495
96 | 2018-05-17,182.68,184.06,182.22,183.76,14840675
97 | 2018-05-18,183.49,184.19,182.61,182.68,13130451
98 | 2018-05-21,183.77,185.3,183.13,184.49,13532864
99 | 2018-05-22,184.93,185.42,183.43,183.8,12731419
100 | 2018-05-23,182.5,186.91,182.18,186.9,16628100
101 | 2018-05-24,185.88,186.8,185.03,185.93,12354742
102 | 2018-05-25,186.02,186.33,184.45,184.92,10965061
103 | 2018-05-29,184.34,186.81,183.71,185.74,16398937
104 | 2018-05-30,186.54,188.0,185.25,187.67,13736866
105 | 2018-05-31,187.87,192.72,187.48,191.78,30782631
106 | 2018-06-01,193.065,194.5492,192.07,193.99,17307245
107 | 2018-06-04,191.84,193.98,191.47,193.28,18939795
108 | 2018-06-05,194.3,195.0,192.62,192.94,15544294
109 | 2018-06-06,191.0252,192.53,189.11,191.34,22558920
110 | 2018-06-07,190.75,190.97,186.77,188.18,21503171
111 | 2018-06-08,187.53,189.4754,186.43,189.1,12677092
112 | 2018-06-11,188.81,192.6,188.8,191.54,12928907
113 | 2018-06-12,192.17,193.28,191.56,192.4,11562704
114 | 2018-06-13,192.74,194.5,191.91,192.41,15853821
115 | 2018-06-14,193.1,197.28,192.91,196.81,19120866
116 | 2018-06-15,195.79,197.07,194.64,195.85,21860931
117 | 2018-06-18,194.8,199.58,194.13,198.31,16826023
118 | 2018-06-19,196.2352,197.96,193.79,197.49,19993996
119 | 2018-06-20,199.1,203.55,198.805,202.0,28230933
120 | 2018-06-21,202.76,203.39,200.09,201.5,19045717
121 | 2018-06-22,201.16,202.24,199.31,201.74,17420188
122 | 2018-06-25,200.0,200.0,193.11,196.35,25275137
123 | 2018-06-26,197.6,199.1,196.23,199.0,17897576
124 | 2018-06-27,199.18,200.75,195.8,195.84,18734408
125 | 2018-06-28,195.18,197.34,193.26,196.23,18172439
126 | 2018-06-29,197.32,197.5997,193.955,194.32,15811602
127 | 2018-07-02,193.37,197.45,192.22,197.36,13961578
128 | 2018-07-03,194.55,195.4,192.52,192.73,13489514
129 | 2018-07-05,194.74,198.65,194.03,198.45,19684193
130 | 2018-07-06,198.45,203.64,197.7,203.23,19740131
131 | 2018-07-09,204.93,205.8,202.1201,204.74,18149437
132 | 2018-07-10,204.5,204.91,202.26,203.54,13190067
133 | 2018-07-11,202.22,204.5,201.75,202.54,12927377
134 | 2018-07-12,203.43,207.08,203.19,206.92,15454706
135 | 2018-07-13,207.81,208.43,206.45,207.32,11503401
136 | 2018-07-16,207.5,208.72,206.84,207.23,11078209
137 | 2018-07-17,204.9,210.46,204.84,209.99,15349892
138 | 2018-07-18,209.82,210.99,208.44,209.36,15334907
139 | 2018-07-19,208.77,209.99,207.76,208.09,11350429
140 | 2018-07-20,208.85,211.5,208.5,209.94,16241508
141 | 2018-07-23,210.58,211.62,208.8,210.91,16731969
142 | 2018-07-24,215.11,216.2,212.6,214.67,28468681
143 | 2018-07-25,215.715,218.62,214.27,217.5,64592585
144 | 2018-07-26,174.89,180.13,173.75,176.26,169803668
145 | 2018-07-27,179.87,179.93,173.0,174.89,60073749
146 | 2018-07-30,175.3,175.3,166.56,171.06,65280787
147 | 2018-07-31,170.67,174.24,170.0,172.58,40356471
148 | 2018-08-01,173.93,175.08,170.9,171.65,34042109
149 | 2018-08-02,170.68,176.79,170.27,176.37,32399954
150 | 2018-08-03,177.69,178.85,176.15,177.78,24763434
151 | 2018-08-06,178.97,185.79,178.38,185.69,49716192
152 | 2018-08-07,186.5,188.3,183.72,183.81,33398562
153 | 2018-08-08,184.75,186.85,183.76,185.18,22205230
154 | 2018-08-09,185.8492,186.57,182.48,183.09,19732120
155 | 2018-08-10,182.04,182.1,179.42,180.26,21500410
156 | 2018-08-13,180.1,182.61,178.9,180.05,17423264
157 | 2018-08-14,180.71,181.99,178.62,181.11,19101995
158 | 2018-08-15,179.34,180.87,174.78,179.53,33020231
159 | 2018-08-16,180.42,180.5,174.01,174.7,31351784
160 | 2018-08-17,174.5,176.22,172.04,173.8,24893176
161 | 2018-08-20,174.04,174.57,170.91,172.5,21518006
162 | 2018-08-21,172.81,174.17,171.39,172.62,19578514
163 | 2018-08-22,172.21,174.24,172.13,173.64,16894083
164 | 2018-08-23,173.09,175.55,172.83,172.9,18053567
165 | 2018-08-24,173.7,174.82,172.92,174.645,14631556
166 | 2018-08-27,175.99,178.67,175.79,177.46,17921935
167 | 2018-08-28,178.1,178.2399,175.83,176.26,15910675
168 | 2018-08-29,176.295,176.79,174.75,175.9,18678301
169 | 2018-08-30,175.9,179.7901,175.7,177.64,24216532
170 | 2018-08-31,177.15,177.62,174.9815,175.73,18065159
171 | 2018-09-04,173.5,173.89,168.8,171.16,29808971
172 | 2018-09-05,169.49,171.125,166.67,167.18,31226744
173 | 2018-09-06,166.98,166.98,160.0,162.53,41514834
174 | 2018-09-07,160.31,164.6269,160.16,163.04,24300600
175 | 2018-09-10,163.51,165.01,162.16,164.18,20197680
176 | 2018-09-11,163.94,167.19,163.72,165.94,20457088
177 | 2018-09-12,163.25,164.49,161.8,162.0,24078118
178 | 2018-09-13,162.0,163.32,160.86,161.36,25453775
179 | 2018-09-14,161.715,162.84,160.34,162.32,21770405
180 | 2018-09-17,161.92,162.06,159.77,160.58,21005321
181 | 2018-09-18,159.39,161.7639,158.8656,160.3,22465236
182 | 2018-09-19,160.08,163.44,159.48,163.06,19628996
183 | 2018-09-20,164.5,166.45,164.4722,166.02,18936038
184 | 2018-09-21,166.64,167.25,162.81,162.93,45994800
185 | 2018-09-24,161.03,165.7,160.88,165.41,19222775
186 | 2018-09-25,161.99,165.59,161.15,164.91,27622806
187 | 2018-09-26,164.3,169.3,164.21,166.95,25252231
188 | 2018-09-27,167.55,171.77,167.21,168.84,27266856
189 | 2018-09-28,168.33,168.79,162.56,164.46,34265638
190 | 2018-10-01,163.03,165.88,161.26,162.44,26407677
191 | 2018-10-02,161.58,162.28,158.67,159.33,36030977
192 | 2018-10-03,160.0,163.66,159.53,162.43,23109456
193 | 2018-10-04,161.46,161.46,157.35,158.85,25739635
194 | 2018-10-05,159.21,160.9,156.2,157.33,25744047
195 | 2018-10-08,155.54,158.34,154.39,157.25,24045968
196 | 2018-10-09,157.69,160.59,157.42,157.9,18844425
197 | 2018-10-10,156.82,157.69,151.31,151.38,30609970
198 | 2018-10-11,150.13,154.81,149.16,153.35,35338901
199 | 2018-10-12,156.73,156.89,151.2998,153.74,25293492
200 | 2018-10-15,153.32,155.57,152.55,153.52,15433521
201 | 2018-10-16,155.4,159.46,155.01,158.78,19180095
202 | 2018-10-17,159.56,160.49,157.95,159.42,17592003
203 | 2018-10-18,158.51,158.66,153.28,154.92,21675084
204 | 2018-10-19,155.86,157.35,153.55,154.05,19761347
205 | 2018-10-22,154.76,157.34,154.46,154.78,15424658
206 | 2018-10-23,151.22,154.77,150.85,154.39,19095032
207 | 2018-10-24,154.28,154.65,145.6,146.04,27744597
208 | 2018-10-25,147.73,152.21,147.0,150.95,22105696
209 | 2018-10-26,145.82,149.0,143.8,145.37,31303341
210 | 2018-10-29,148.5,148.83,139.03,142.09,31336784
211 | 2018-10-30,139.935,146.64,139.7419,146.22,50528278
212 | 2018-10-31,155.0,156.4,148.96,151.79,60101251
213 | 2018-11-01,151.52,152.75,149.35,151.75,25640786
214 | 2018-11-02,151.8,154.13,148.96,150.35,24708695
215 | 2018-11-05,150.1,150.19,147.44,148.68,15969849
216 | 2018-11-06,149.31,150.97,148.0,149.94,16667124
217 | 2018-11-07,151.57,153.01,149.83,151.53,21877372
218 | 2018-11-08,150.49,150.94,146.74,147.87,24145814
219 | 2018-11-09,146.75,147.76,144.07,144.96,17326898
220 | 2018-11-12,144.48,145.04,140.4899,141.55,18542123
221 | 2018-11-13,142.0,144.88,141.62,142.16,15141710
222 | 2018-11-14,143.7,145.58,141.55,144.22,22068384
223 | 2018-11-15,142.33,144.84,140.83,143.85,30320280
224 | 2018-11-16,141.07,141.77,137.77,139.53,37250560
225 | 2018-11-19,137.61,137.75,131.21,131.55,44362729
226 | 2018-11-20,127.03,134.1592,126.85,132.43,41939475
227 | 2018-11-21,134.4,137.19,134.13,134.82,25469735
228 | 2018-11-23,133.65,134.5,131.2551,131.73,11886128
229 | 2018-11-26,133.0,137.0,132.78,136.38,24263640
230 | 2018-11-27,135.75,136.6126,133.71,135.0,20750318
231 | 2018-11-28,136.28,136.7899,131.85,136.76,29847505
232 | 2018-11-29,135.92,139.99,135.66,138.68,24238713
233 | 2018-11-30,138.26,140.966,137.36,140.61,25732577
234 | 2018-12-03,143.0,143.6799,140.76,141.09,24819226
235 | 2018-12-04,140.73,143.39,137.16,137.93,30307400
236 | 2018-12-06,133.82,139.7,133.67,139.63,28218145
237 | 2018-12-07,139.25,140.87,136.6566,137.42,21195460
238 | 2018-12-10,139.6,143.05,139.01,141.85,26422173
239 | 2018-12-11,143.88,143.88,141.1,142.08,20300349
240 | 2018-12-12,143.08,147.19,142.51,144.5,23696936
241 | 2018-12-13,145.57,145.85,143.19,145.01,18148610
242 | 2018-12-14,143.34,146.01,142.51,144.06,21785820
243 | 2018-12-17,143.08,144.92,138.42,140.19,24333959
244 | 2018-12-18,141.08,145.93,139.8301,143.66,24709084
245 | 2018-12-19,141.21,144.91,132.5,133.24,57404894
246 | 2018-12-20,130.7,135.57,130.0,133.4,40297944
247 | 2018-12-21,133.39,134.9,123.42,124.95,56901491
248 | 2018-12-24,123.1,129.74,123.02,124.06,22066002
249 | 2018-12-26,126.0,134.24,125.89,134.18,39723370
250 | 2018-12-27,132.44,134.99,129.67,134.52,31202509
251 | 2018-12-28,135.34,135.92,132.2,133.2,22627569
252 | 2018-12-31,134.45,134.64,129.95,131.09,24625308
253 |
--------------------------------------------------------------------------------
/ch_03/exercises/nflx.csv:
--------------------------------------------------------------------------------
1 | date,open,high,low,close,volume
2 | 2018-01-02,196.1,201.65,195.42,201.07,10966889
3 | 2018-01-03,202.05,206.21,201.5,205.05,8591369
4 | 2018-01-04,206.2,207.05,204.0006,205.63,6029616
5 | 2018-01-05,207.25,210.02,205.59,209.99,7033240
6 | 2018-01-08,210.02,212.5,208.44,212.05,5580178
7 | 2018-01-09,212.11,212.98,208.59,209.31,6125855
8 | 2018-01-10,207.57,213.64,206.91,212.52,5951486
9 | 2018-01-11,214.29,217.75,213.35,217.24,7659485
10 | 2018-01-12,217.18,222.55,216.0,221.23,8199423
11 | 2018-01-16,224.24,226.07,217.2,221.53,13516067
12 | 2018-01-17,221.0,221.15,216.32,217.5,9123056
13 | 2018-01-18,220.34,220.58,216.55,220.33,8225339
14 | 2018-01-19,222.75,223.49,218.5,220.46,10548567
15 | 2018-01-22,222.0,227.785,221.2,227.58,17703293
16 | 2018-01-23,255.05,257.71,248.02,250.29,27705332
17 | 2018-01-24,250.88,261.71,249.31099999999998,261.3,17352448
18 | 2018-01-25,263.0,272.3,260.23,269.7,15336378
19 | 2018-01-26,271.485,274.6,268.76,274.6,11021839
20 | 2018-01-29,274.2,286.81,273.92,284.59,17529749
21 | 2018-01-30,277.0,282.73,272.7,278.8,12482852
22 | 2018-01-31,281.94,282.289,269.58,270.3,11695072
23 | 2018-02-01,266.41,271.95,263.38,265.07,9669011
24 | 2018-02-02,263.0,270.62,262.71,267.43,9123610
25 | 2018-02-05,262.0,267.899,250.03,254.26,11896053
26 | 2018-02-06,247.7,266.7,245.0,265.72,12595801
27 | 2018-02-07,266.58,272.45,264.325,264.56,8981548
28 | 2018-02-08,267.08,267.62,250.0,250.1,9306701
29 | 2018-02-09,253.85,255.7999,236.11,249.47,16906942
30 | 2018-02-12,252.14,259.15,249.0,257.95,8534906
31 | 2018-02-13,257.29,261.41,254.7,258.27,6855151
32 | 2018-02-14,260.47,269.88,260.33,266.0,10971985
33 | 2018-02-15,270.03,280.5,267.63,280.27,10759667
34 | 2018-02-16,278.73,281.96,275.69,278.52,8312380
35 | 2018-02-20,277.74,285.812,276.61,278.55,7769023
36 | 2018-02-21,282.07,286.64,280.01,281.04,9371121
37 | 2018-02-22,283.88,284.5,274.45,278.14,8891535
38 | 2018-02-23,281.0,286.0,277.81,285.93,7301809
39 | 2018-02-26,288.75,295.6475,287.01,294.16,10268633
40 | 2018-02-27,294.77,297.36,290.59,290.61,9416489
41 | 2018-02-28,293.1,295.75,290.78,291.38,7653454
42 | 2018-03-01,292.75,295.25,283.83,290.39,11932051
43 | 2018-03-02,284.65,301.18,283.23,301.05,13345313
44 | 2018-03-05,302.85,316.91,297.6,315.0,18986099
45 | 2018-03-06,319.88,325.79,316.5,325.22,18525844
46 | 2018-03-07,320.0,323.74,314.55,321.16,17132222
47 | 2018-03-08,322.2,322.9176,314.13,317.0,11340066
48 | 2018-03-09,321.33,331.44,320.23,331.44,14500219
49 | 2018-03-12,333.56,333.98,318.6,321.3,20369152
50 | 2018-03-13,323.87,325.8409,313.278,315.88,12917224
51 | 2018-03-14,318.16,323.88,317.7,321.55,10475073
52 | 2018-03-15,323.17,323.4,318.14,321.09,5642883
53 | 2018-03-16,321.42,324.11,318.37,318.45,7333734
54 | 2018-03-19,315.8,317.0,307.34,313.48,9925162
55 | 2018-03-20,313.26,319.5,312.8,317.5,5991945
56 | 2018-03-21,316.35,319.4,314.51099999999997,316.48,5263911
57 | 2018-03-22,313.07,314.12,305.66,306.7,8063305
58 | 2018-03-23,307.41,310.73,300.36,300.94,9529948
59 | 2018-03-26,309.36,321.03,302.0,320.35,11988274
60 | 2018-03-27,322.49,322.9,297.0,300.69,12068632
61 | 2018-03-28,298.39,298.8,281.61,285.77,18972912
62 | 2018-03-29,287.0,295.35,275.9,295.35,19145522
63 | 2018-04-02,291.94,292.87,275.05,280.29,13405760
64 | 2018-04-03,285.45,291.25,278.01,283.67,12694862
65 | 2018-04-04,273.63,290.31,271.2239,288.94,12913978
66 | 2018-04-05,293.15,299.16,289.11,293.97,10655178
67 | 2018-04-06,289.1,298.85,285.65,288.85,11444777
68 | 2018-04-09,291.77,299.55,289.12,289.93,9853564
69 | 2018-04-10,297.68,298.95,291.69,298.07,10719097
70 | 2018-04-11,302.8847,311.64,301.82,303.67,14877429
71 | 2018-04-12,309.7187,311.13,306.75,309.25,10249403
72 | 2018-04-13,317.29,317.49,308.23,311.65,12046573
73 | 2018-04-16,315.99,316.1,304.0,307.78,20307921
74 | 2018-04-17,329.66,338.62,323.77,336.06,33866456
75 | 2018-04-18,336.3,338.82,331.1,334.52,11221139
76 | 2018-04-19,332.88,335.31,326.77,332.7,8438825
77 | 2018-04-20,332.22,336.51,326.0,327.77,9158655
78 | 2018-04-23,329.1499,331.22,317.08,318.69,8968015
79 | 2018-04-24,319.2168,320.249,302.31,307.02,13893217
80 | 2018-04-25,306.37,309.98,292.615,305.76,14919698
81 | 2018-04-26,310.0,316.63,305.58,313.98,9266699
82 | 2018-04-27,316.25,317.45,306.5,311.76,7074384
83 | 2018-04-30,311.07,317.88,310.118,312.46,6088787
84 | 2018-05-01,310.36,313.48,306.69,313.3,6036639
85 | 2018-05-02,311.65,317.1,310.4034,313.36,5697120
86 | 2018-05-03,312.59,312.59,305.73,311.69,6135828
87 | 2018-05-04,308.71,320.98,307.67,320.09,8209513
88 | 2018-05-07,321.9947,329.0234,319.34,326.26,7117823
89 | 2018-05-08,325.9,327.348,323.05,326.89,4735738
90 | 2018-05-09,328.79,331.95,327.51,330.3,5633444
91 | 2018-05-10,331.5,332.055,327.3438,329.6,5302254
92 | 2018-05-11,329.65,331.26,324.87,326.46,4589731
93 | 2018-05-14,327.25,330.5038,327.04,328.53,4089800
94 | 2018-05-15,325.94,326.94,322.434,326.13,4746096
95 | 2018-05-16,326.28,329.72,325.14,328.19,3671690
96 | 2018-05-17,327.53,330.45,323.1734,325.22,4935708
97 | 2018-05-18,324.9,326.42,322.8,324.18,3577717
98 | 2018-05-21,327.11,331.88,325.45,331.82,6657326
99 | 2018-05-22,334.05,336.63,331.15,331.62,5964448
100 | 2018-05-23,329.04,345.0,328.09,344.72,10049147
101 | 2018-05-24,344.34,354.0,341.12,349.29,14758553
102 | 2018-05-25,349.9,354.36,348.83,351.29,7817400
103 | 2018-05-29,351.5,356.1,346.71,349.73,9717921
104 | 2018-05-30,352.37,354.0,349.26,353.54,5685531
105 | 2018-05-31,353.8,355.53,350.21,351.6,6921687
106 | 2018-06-01,353.88,359.99,352.82,359.93,7112292
107 | 2018-06-04,362.6847,363.0,355.51,361.81,7681995
108 | 2018-06-05,363.32,369.83,361.4124,365.8,8358045
109 | 2018-06-06,367.7848,369.6799,363.33,367.45,7712302
110 | 2018-06-07,368.54,368.7,357.8,361.4,8278040
111 | 2018-06-08,358.06,362.39,356.25,360.57,5225736
112 | 2018-06-11,361.88,365.67,360.91,361.45,4432445
113 | 2018-06-12,363.6,365.98,362.0,363.83,4290969
114 | 2018-06-13,367.53,384.2537,364.11,379.93,18222799
115 | 2018-06-14,384.27,395.03,383.25,392.87,14598333
116 | 2018-06-15,390.71,398.86,387.51,391.98,13588114
117 | 2018-06-18,387.72,393.16,386.5,390.4,6824794
118 | 2018-06-19,389.5,405.29,388.5,404.98,16697104
119 | 2018-06-20,415.15,419.4675,409.6,416.76,16494572
120 | 2018-06-21,421.38,423.2056,406.3701,415.44,18389936
121 | 2018-06-22,419.98,420.5,409.651,411.09,10428621
122 | 2018-06-25,404.69,405.99,378.75,384.48,22490922
123 | 2018-06-26,393.28,404.78,389.05,399.39,15191157
124 | 2018-06-27,407.56,411.5865,390.0,390.39,16541426
125 | 2018-06-28,395.0,396.9,387.1,395.42,12219888
126 | 2018-06-29,399.19,401.3299,390.55,391.43,9252511
127 | 2018-07-02,385.45,398.38,380.0,398.18,8142457
128 | 2018-07-03,399.49,399.98,389.5,390.52,5280344
129 | 2018-07-05,393.8003,399.24,390.86,398.39,8448937
130 | 2018-07-06,397.45,408.6495,395.5225,408.25,8629606
131 | 2018-07-09,415.95,419.12,411.1,418.97,11127477
132 | 2018-07-10,417.24,419.44,413.08,415.63,9382944
133 | 2018-07-11,411.34,419.77,410.6,418.65,9713904
134 | 2018-07-12,415.1553,416.79,407.8,413.5,12743273
135 | 2018-07-13,409.19,410.0,395.08099999999996,395.8,15747266
136 | 2018-07-16,398.98,403.355,391.75,400.48,22959984
137 | 2018-07-17,346.95,385.0,344.0,379.48,58410362
138 | 2018-07-18,381.24,383.13,372.3552,375.13,21746266
139 | 2018-07-19,371.06,375.749,363.0,364.23,16878681
140 | 2018-07-20,364.92,370.5,360.14,361.05,15113740
141 | 2018-07-23,359.1453,363.9,353.6,362.66,11505232
142 | 2018-07-24,366.94,367.4,354.56,357.32,12851457
143 | 2018-07-25,357.57,363.28,355.65,362.87,8516248
144 | 2018-07-26,358.19,365.54,356.625,363.09,6993684
145 | 2018-07-27,366.85,367.0,351.65,355.21,8949491
146 | 2018-07-30,351.93,352.03,334.0201,334.96,18260710
147 | 2018-07-31,331.51,342.5,328.0,337.45,14085369
148 | 2018-08-01,335.87,344.41,334.02,338.38,7790477
149 | 2018-08-02,337.23,345.0,334.71,344.5,7131328
150 | 2018-08-03,347.75,347.86,338.4768,343.09,8848367
151 | 2018-08-06,342.8653,351.98,341.74,350.92,8198076
152 | 2018-08-07,353.23,357.31,349.01,351.83,7970930
153 | 2018-08-08,352.21,352.29,346.61,347.61,5402465
154 | 2018-08-09,347.96,352.439,345.8157,349.36,4820313
155 | 2018-08-10,346.91,349.1,344.4233,345.87,4337481
156 | 2018-08-13,339.89,347.19,339.07,341.31,6893649
157 | 2018-08-14,342.09,342.41,336.25,337.49,5805182
158 | 2018-08-15,334.03,335.4962,321.0,326.4,11784485
159 | 2018-08-16,329.9,331.17,321.2138,322.44,6689733
160 | 2018-08-17,319.01,324.365,312.96,316.78,10407908
161 | 2018-08-20,314.64,331.6,310.928,327.73,13591100
162 | 2018-08-21,331.0,341.5,329.7,338.02,14783246
163 | 2018-08-22,338.49,346.21,337.406,344.44,8930669
164 | 2018-08-23,348.11,350.08,337.65,339.17,11336425
165 | 2018-08-24,346.0,359.15,344.541,358.82,14729137
166 | 2018-08-27,367.1453,374.49,360.0,364.58,17427304
167 | 2018-08-28,367.23,369.99,360.38,368.49,9222617
168 | 2018-08-29,367.2,369.86,362.91,368.04,8118734
169 | 2018-08-30,365.0,376.8085,363.54,370.98,10981013
170 | 2018-08-31,370.66,376.0,367.0766,367.68,7943443
171 | 2018-09-04,366.47,368.88,361.26,363.6,7605161
172 | 2018-09-05,360.0,363.39,335.83,341.18,13092847
173 | 2018-09-06,347.44,356.0,341.99,346.46,13050156
174 | 2018-09-07,342.2,355.75,341.25,348.68,9105817
175 | 2018-09-10,352.27,352.5,343.08,348.41,5752184
176 | 2018-09-11,344.67,356.15,343.9001,355.93,6198063
177 | 2018-09-12,359.08,370.43,356.24,369.95,10480769
178 | 2018-09-13,371.91,374.09,366.84,368.15,8366122
179 | 2018-09-14,368.55,371.09,363.46,364.56,4756426
180 | 2018-09-17,364.22,367.33,349.57,350.35,7071945
181 | 2018-09-18,353.67,368.15,351.56,367.65,10413981
182 | 2018-09-19,373.95,377.61,359.17,366.96,11876841
183 | 2018-09-20,370.26,370.26,363.17,365.36,6768086
184 | 2018-09-21,366.59,372.22,360.74,361.19,11930568
185 | 2018-09-24,359.0,373.64,354.33,369.61,9322522
186 | 2018-09-25,370.23,371.34,364.49199999999996,369.43,6799816
187 | 2018-09-26,373.59,382.0,370.88,377.88,13799728
188 | 2018-09-27,379.87,383.2,376.0,380.71,7326246
189 | 2018-09-28,379.24,380.8,373.73,374.13,7114878
190 | 2018-10-01,375.85,386.11,375.59,381.43,8376560
191 | 2018-10-02,384.38,386.7999,373.83,377.14,8638717
192 | 2018-10-03,378.53,380.93,374.881,377.05,5798605
193 | 2018-10-04,375.88,375.92,360.4,363.65,9074350
194 | 2018-10-05,359.77,363.5,343.0,351.35,13522957
195 | 2018-10-08,345.18,352.945,338.11,349.1,12375496
196 | 2018-10-09,348.48,358.72,347.09,355.71,8754990
197 | 2018-10-10,353.52,355.15,325.39,325.89,17183120
198 | 2018-10-11,324.94,334.2,315.81,321.1,16082056
199 | 2018-10-12,339.57,341.3,328.9,339.56,14870830
200 | 2018-10-15,337.63,339.2057,326.93,333.13,11214956
201 | 2018-10-16,337.24,347.95,330.555,346.4,20156418
202 | 2018-10-17,378.33,380.0,356.5,364.7,32610947
203 | 2018-10-18,360.673,362.2,346.05,346.71,18461040
204 | 2018-10-19,351.0,355.8,332.2,332.67,16717233
205 | 2018-10-22,333.1,335.8,320.34,329.54,17097175
206 | 2018-10-23,318.0,336.58,316.77,333.16,14907326
207 | 2018-10-24,332.28,333.0,300.73,301.83,19039297
208 | 2018-10-25,307.12,319.94,305.25,312.87,13346921
209 | 2018-10-26,300.51,313.99,292.3,299.83,19616041
210 | 2018-10-29,305.26,307.89,275.4,284.84,21698841
211 | 2018-10-30,275.57,290.525,271.2093,285.81,23685702
212 | 2018-10-31,297.77,311.5,295.05,301.78,20360342
213 | 2018-11-01,304.59,318.45,296.67,317.38,15121450
214 | 2018-11-02,318.0,321.88,308.33,309.1,13404646
215 | 2018-11-05,311.1,317.53,303.74,315.44,10283044
216 | 2018-11-06,314.76,320.22,305.3,310.84,9710424
217 | 2018-11-07,312.9,328.56,311.0,327.5,13328328
218 | 2018-11-08,328.0,332.0499,316.6103,317.92,11023853
219 | 2018-11-09,311.07,312.98,298.01,303.47,13480792
220 | 2018-11-12,300.0,302.49,290.63,294.07,10924827
221 | 2018-11-13,295.0,303.55,289.1,294.4,12232162
222 | 2018-11-14,300.4,301.84,278.2969,286.73,16853574
223 | 2018-11-15,285.51,292.5,282.16,290.06,9967098
224 | 2018-11-16,287.14,291.72,281.0,286.21,9099485
225 | 2018-11-19,283.79,285.09,269.15,270.6,12993797
226 | 2018-11-20,254.63,276.34,250.0,266.98,16693809
227 | 2018-11-21,274.42,275.34,261.51,262.13,11023037
228 | 2018-11-23,260.11,265.5,256.84,258.82,5245123
229 | 2018-11-26,260.55,266.25,253.8,261.43,12498560
230 | 2018-11-27,259.24,269.08,256.14,266.63,11149501
231 | 2018-11-28,271.98,284.0,263.34,282.65,14801333
232 | 2018-11-29,282.32,290.49,275.5,288.75,15431538
233 | 2018-11-30,288.0,290.81,283.061,286.13,11860117
234 | 2018-12-03,293.19,298.72,284.58,290.3,14117370
235 | 2018-12-04,288.13,295.74,274.72,275.33,12800586
236 | 2018-12-06,268.33,283.22,267.14,282.88,13074324
237 | 2018-12-07,282.48,284.209,263.38,265.14,12466711
238 | 2018-12-10,264.19,271.18,260.6094,269.7,9605553
239 | 2018-12-11,274.08,274.5,262.76,265.32,9843199
240 | 2018-12-12,267.66,281.7695,266.48,274.88,11456716
241 | 2018-12-13,277.64,279.32,271.85,276.02,8379292
242 | 2018-12-14,271.81,277.6665,265.0,266.84,9915319
243 | 2018-12-17,266.51,272.98,261.075,262.8,9634734
244 | 2018-12-18,263.3,275.75,263.29,270.94,10350079
245 | 2018-12-19,269.96,280.87,263.77,266.77,13788448
246 | 2018-12-20,264.64,269.9,251.88,260.58,16792928
247 | 2018-12-21,263.83,264.5,241.29,246.39,21397595
248 | 2018-12-24,242.0,250.65,233.68,233.88,9547616
249 | 2018-12-26,233.92,254.5,231.23,253.67,14402735
250 | 2018-12-27,250.11,255.59,240.1,255.565,12235217
251 | 2018-12-28,257.94,261.9144,249.8,256.08,10987286
252 | 2018-12-31,260.16,270.1001,260.0,267.66,13508920
253 |
--------------------------------------------------------------------------------
/ch_04/0-weather_data_collection.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Collecting weather data from an API\n",
8 | "\n",
9 | "## About the data\n",
10 | "In this notebook, we will be collecting daily weather data from the [National Centers for Environmental Information (NCEI) API](https://www.ncdc.noaa.gov/cdo-web/webservices/v2). We will use the Global Historical Climatology Network - Daily (GHCND) data set; see the documentation [here](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf).\n",
11 | "\n",
12 | "*Note: The NCEI is part of the National Oceanic and Atmospheric Administration (NOAA) and, as you can see from the URL for the API, this resource was created when the NCEI was called the NCDC. Should the URL for this resource change in the future, you can search for the NCEI weather API to find the updated one.*\n",
13 | "\n",
14 | "## Using the NCEI API\n",
15 | "Paste your token below."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import requests\n",
25 | "\n",
26 | "def make_request(endpoint, payload=None):\n",
27 | " \"\"\"\n",
28 | " Make a request to a specific endpoint on the weather API\n",
29 | " passing headers and optional payload.\n",
30 | " \n",
31 | " Parameters:\n",
32 | " - endpoint: The endpoint of the API you want to \n",
33 | " make a GET request to.\n",
34 | " - payload: A dictionary of data to pass along \n",
35 | " with the request.\n",
36 | " \n",
37 | " Returns:\n",
38 | " Response object.\n",
39 | " \"\"\"\n",
40 | " return requests.get(\n",
41 | " f'https://www.ncdc.noaa.gov/cdo-web/api/v2/{endpoint}',\n",
42 | " headers={\n",
43 | " 'token': 'PASTE_YOUR_TOKEN_HERE'\n",
44 | " },\n",
45 | " params=payload\n",
46 | " )"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "## Collect All Data Points for 2018 in NYC (Various Stations)\n",
54 | "We can make a loop to query for all the data points one day at a time. Here we create a list of all the results:"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 2,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "text/plain": [
65 | "'Gathering data for 2018-12-31'"
66 | ]
67 | },
68 | "metadata": {},
69 | "output_type": "display_data"
70 | }
71 | ],
72 | "source": [
73 | "import datetime\n",
74 | "\n",
75 | "from IPython import display # for updating the cell dynamically\n",
76 | "\n",
77 | "current = datetime.date(2018, 1, 1)\n",
78 | "end = datetime.date(2019, 1, 1)\n",
79 | "\n",
80 | "results = []\n",
81 | "\n",
82 | "while current < end:\n",
83 | " # update the cell with status information\n",
84 | " display.clear_output(wait=True)\n",
85 | " display.display(f'Gathering data for {str(current)}')\n",
86 | " \n",
87 | " response = make_request(\n",
88 | " 'data', \n",
89 | " {\n",
90 | " 'datasetid' : 'GHCND', # Global Historical Climatology Network - Daily (GHCND) dataset\n",
91 | " 'locationid' : 'CITY:US360019', # NYC\n",
92 | " 'startdate' : current,\n",
93 | " 'enddate' : current,\n",
94 | " 'units' : 'metric',\n",
95 | " 'limit' : 1000 # max allowed\n",
96 | " }\n",
97 | " )\n",
98 | "\n",
99 | " if response.ok:\n",
100 | " # we extend the list instead of appending to avoid getting a nested list\n",
101 | " results.extend(response.json()['results'])\n",
102 | "\n",
103 | " # update the current date to avoid an infinite loop\n",
104 | " current += datetime.timedelta(days=1)"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "Now, we can create a dataframe with all this data. Notice there are multiple stations with values for each datatype on a given day. We don't know what the stations are, but we can look them up and add them to the data:"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 3,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/html": [
122 | "\n",
123 | "\n",
136 | "
\n",
137 | " \n",
138 | " \n",
139 | " | \n",
140 | " attributes | \n",
141 | " datatype | \n",
142 | " date | \n",
143 | " station | \n",
144 | " value | \n",
145 | "
\n",
146 | " \n",
147 | " \n",
148 | " \n",
149 | " 0 | \n",
150 | " ,,N, | \n",
151 | " PRCP | \n",
152 | " 2018-01-01T00:00:00 | \n",
153 | " GHCND:US1CTFR0039 | \n",
154 | " 0.0 | \n",
155 | "
\n",
156 | " \n",
157 | " 1 | \n",
158 | " ,,N, | \n",
159 | " PRCP | \n",
160 | " 2018-01-01T00:00:00 | \n",
161 | " GHCND:US1NJBG0015 | \n",
162 | " 0.0 | \n",
163 | "
\n",
164 | " \n",
165 | " 2 | \n",
166 | " ,,N, | \n",
167 | " SNOW | \n",
168 | " 2018-01-01T00:00:00 | \n",
169 | " GHCND:US1NJBG0015 | \n",
170 | " 0.0 | \n",
171 | "
\n",
172 | " \n",
173 | " 3 | \n",
174 | " ,,N, | \n",
175 | " PRCP | \n",
176 | " 2018-01-01T00:00:00 | \n",
177 | " GHCND:US1NJBG0017 | \n",
178 | " 0.0 | \n",
179 | "
\n",
180 | " \n",
181 | " 4 | \n",
182 | " ,,N, | \n",
183 | " SNOW | \n",
184 | " 2018-01-01T00:00:00 | \n",
185 | " GHCND:US1NJBG0017 | \n",
186 | " 0.0 | \n",
187 | "
\n",
188 | " \n",
189 | "
\n",
190 | "
"
191 | ],
192 | "text/plain": [
193 | " attributes datatype date station value\n",
194 | "0 ,,N, PRCP 2018-01-01T00:00:00 GHCND:US1CTFR0039 0.0\n",
195 | "1 ,,N, PRCP 2018-01-01T00:00:00 GHCND:US1NJBG0015 0.0\n",
196 | "2 ,,N, SNOW 2018-01-01T00:00:00 GHCND:US1NJBG0015 0.0\n",
197 | "3 ,,N, PRCP 2018-01-01T00:00:00 GHCND:US1NJBG0017 0.0\n",
198 | "4 ,,N, SNOW 2018-01-01T00:00:00 GHCND:US1NJBG0017 0.0"
199 | ]
200 | },
201 | "execution_count": 3,
202 | "metadata": {},
203 | "output_type": "execute_result"
204 | }
205 | ],
206 | "source": [
207 | "import pandas as pd\n",
208 | "\n",
209 | "df = pd.DataFrame(results)\n",
210 | "df.head()"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "Save this data to a file:"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 4,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "df.to_csv('data/nyc_weather_2018.csv', index=False)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "Then, write it to the database:"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 5,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "import sqlite3\n",
243 | "\n",
244 | "with sqlite3.connect('data/weather.db') as connection:\n",
245 | " df.to_sql(\n",
246 | " 'weather', connection, index=False, if_exists='replace'\n",
247 | " )"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "To support learning about merging dataframes, we will also get the data that maps station IDs to information about each station:"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 6,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "response = make_request(\n",
264 | " 'stations', \n",
265 | " {\n",
266 | " 'datasetid' : 'GHCND', # Global Historical Climatology Network - Daily (GHCND) dataset\n",
267 | " 'locationid' : 'CITY:US360019', # NYC\n",
268 | " 'limit' : 1000 # max allowed\n",
269 | " }\n",
270 | ")\n",
271 | "\n",
272 | "stations = pd.DataFrame(response.json()['results'])[['id', 'name', 'latitude', 'longitude', 'elevation']]\n",
273 | "stations.to_csv('data/weather_stations.csv', index=False)\n",
274 | "\n",
275 | "with sqlite3.connect('data/weather.db') as connection:\n",
276 | " stations.to_sql(\n",
277 | " 'stations', connection, index=False, if_exists='replace'\n",
278 | " )"
279 | ]
280 | }
281 | ],
282 | "metadata": {
283 | "kernelspec": {
284 | "display_name": "Python 3",
285 | "language": "python",
286 | "name": "python3"
287 | },
288 | "language_info": {
289 | "codemirror_mode": {
290 | "name": "ipython",
291 | "version": 3
292 | },
293 | "file_extension": ".py",
294 | "mimetype": "text/x-python",
295 | "name": "python",
296 | "nbconvert_exporter": "python",
297 | "pygments_lexer": "ipython3",
298 | "version": "3.7.2"
299 | }
300 | },
301 | "nbformat": 4,
302 | "nbformat_minor": 2
303 | }
304 |
--------------------------------------------------------------------------------
/ch_04/data/fb_2018.csv:
--------------------------------------------------------------------------------
1 | date,open,high,low,close,volume
2 | 2018-01-02,177.68,181.58,177.55,181.42,18151903
3 | 2018-01-03,181.88,184.78,181.33,184.67,16886563
4 | 2018-01-04,184.9,186.21,184.0996,184.33,13880896
5 | 2018-01-05,185.59,186.9,184.93,186.85,13574535
6 | 2018-01-08,187.2,188.9,186.33,188.28,17994726
7 | 2018-01-09,188.7,188.8,187.1,187.87,12393057
8 | 2018-01-10,186.94,187.89,185.63,187.84,10529894
9 | 2018-01-11,188.4,188.4,187.38,187.77,9588587
10 | 2018-01-12,178.06,181.48,177.4,179.37,77551299
11 | 2018-01-16,181.5,181.75,178.04,178.39,36183842
12 | 2018-01-17,179.26,179.32,175.8,177.6,27992376
13 | 2018-01-18,178.13,180.98,177.08,179.8,23304901
14 | 2018-01-19,180.85,182.37,180.1702,181.29,26826540
15 | 2018-01-22,180.8,185.39,180.41,185.37,21059464
16 | 2018-01-23,186.05,189.55,185.55,189.35,25678781
17 | 2018-01-24,189.89,190.66,186.52,186.55,24334548
18 | 2018-01-25,187.95,188.62,186.6,187.48,17377740
19 | 2018-01-26,187.75,190.0,186.81,190.0,17759212
20 | 2018-01-29,188.75,188.84,185.6301,185.98,20453172
21 | 2018-01-30,183.01,188.18,181.84,187.12,20858556
22 | 2018-01-31,188.37,189.83,185.22,186.89,43275144
23 | 2018-02-01,188.22,195.32,187.89,193.09,54211293
24 | 2018-02-02,192.04,194.21,189.98,190.28,26677484
25 | 2018-02-05,186.93,190.61,180.61,181.26,33128206
26 | 2018-02-06,178.57,185.77,177.74,185.31,37758505
27 | 2018-02-07,184.15,185.0817,179.95,180.18,27601886
28 | 2018-02-08,181.01,181.84,171.4815,171.58,38478321
29 | 2018-02-09,174.76,176.9,167.18,176.11,39887626
30 | 2018-02-12,177.06,177.545,171.84,176.41,32092133
31 | 2018-02-13,175.62,175.97,173.1,173.15,21809350
32 | 2018-02-14,173.45,179.81,173.2119,179.52,28929704
33 | 2018-02-15,180.5,180.5,176.84,179.96,20922120
34 | 2018-02-16,178.99,179.88,176.3,177.36,21015610
35 | 2018-02-20,175.77,177.95,175.11,176.01,21204921
36 | 2018-02-21,176.71,181.27,176.4,177.91,23200804
37 | 2018-02-22,178.7,180.21,177.41,178.99,18464192
38 | 2018-02-23,179.9,183.39,179.51,183.29,19007288
39 | 2018-02-26,184.58,185.66,183.2228,184.93,17599703
40 | 2018-02-27,184.45,184.7,181.46,181.46,15849806
41 | 2018-02-28,182.3,182.88,178.14,178.32,18783039
42 | 2018-03-01,179.01,180.12,174.41,175.94,23201626
43 | 2018-03-02,173.29,177.11,172.99,176.62,20025905
44 | 2018-03-05,176.2,181.1475,175.89,180.4,16189280
45 | 2018-03-06,181.78,182.38,179.11,179.78,15086784
46 | 2018-03-07,178.74,183.82,178.07,183.71,19097293
47 | 2018-03-08,183.56,184.4,181.45,182.34,17225946
48 | 2018-03-09,183.91,185.51,183.21,185.23,18526292
49 | 2018-03-12,185.23,186.1,184.22,184.76,15301229
50 | 2018-03-13,185.61,185.99,181.11,181.88,18067477
51 | 2018-03-14,182.6,184.25,181.85,184.19,16821728
52 | 2018-03-15,183.24,184.0,182.19,183.86,15645035
53 | 2018-03-16,184.49,185.33,183.41,185.09,24403438
54 | 2018-03-19,177.01,177.17,170.06,172.56,88140060
55 | 2018-03-20,167.47,170.2,161.95,168.15,129851768
56 | 2018-03-21,164.8,173.4,163.3,169.39,106598834
57 | 2018-03-22,166.13,170.27,163.72,164.89,73742979
58 | 2018-03-23,165.44,167.1,159.02,159.39,53609706
59 | 2018-03-26,160.82,161.1,149.02,160.06,126116634
60 | 2018-03-27,156.31,162.85,150.75,152.22,79116995
61 | 2018-03-28,151.65,155.88,150.8,153.03,60029170
62 | 2018-03-29,155.15,161.42,154.14,159.79,59434293
63 | 2018-04-02,157.81,159.2,154.111,155.39,36795991
64 | 2018-04-03,156.55,157.39,150.81,156.11,42543865
65 | 2018-04-04,152.025,155.56,150.51,155.1,49885584
66 | 2018-04-05,161.56,161.575,156.65,159.34,41449609
67 | 2018-04-06,157.73,161.42,156.81,157.2,41644812
68 | 2018-04-09,157.82,160.53,156.04,157.93,34915227
69 | 2018-04-10,157.93,165.98,157.01,165.04,58947041
70 | 2018-04-11,165.36,168.65,163.25,166.32,56144633
71 | 2018-04-12,166.98,167.45,163.1,163.87,38262956
72 | 2018-04-13,164.58,165.7036,163.77,164.52,19990561
73 | 2018-04-16,165.7249,165.78,163.39,164.83,18119435
74 | 2018-04-17,165.83,169.0,165.66,168.66,22743029
75 | 2018-04-18,166.88,168.12,165.77,166.36,20969568
76 | 2018-04-19,166.2,168.33,165.2,168.1,22234961
77 | 2018-04-20,167.79,168.43,165.81,166.28,19119438
78 | 2018-04-23,167.27,168.45,165.09,165.84,23088102
79 | 2018-04-24,165.43,166.1,158.19,159.69,35079926
80 | 2018-04-25,160.1448,161.06,156.19,159.69,41083581
81 | 2018-04-26,173.22,176.27,170.8,174.16,77556934
82 | 2018-04-27,176.81,177.1,172.6,173.59,29804657
83 | 2018-04-30,173.79,175.72,171.71,172.0,20750478
84 | 2018-05-01,172.0,174.02,170.23,173.86,26025932
85 | 2018-05-02,174.246,178.08,174.2,176.07,30424450
86 | 2018-05-03,175.13,176.12,172.12,174.02,24026071
87 | 2018-05-04,173.08,176.98,173.06,176.61,17677844
88 | 2018-05-07,177.35,179.5,177.17,177.97,18697195
89 | 2018-05-08,178.25,179.04,177.11,178.92,15577211
90 | 2018-05-09,179.67,183.01,178.7807,182.66,23282811
91 | 2018-05-10,183.15,186.1292,182.5,185.53,21071403
92 | 2018-05-11,184.85,188.32,184.18,186.99,21207848
93 | 2018-05-14,187.71,187.86,186.2,186.64,15646744
94 | 2018-05-15,184.88,185.29,183.2,184.32,15429433
95 | 2018-05-16,183.6952,184.32,182.66,183.2,16975495
96 | 2018-05-17,182.68,184.06,182.22,183.76,14840675
97 | 2018-05-18,183.49,184.19,182.61,182.68,13130451
98 | 2018-05-21,183.77,185.3,183.13,184.49,13532864
99 | 2018-05-22,184.93,185.42,183.43,183.8,12731419
100 | 2018-05-23,182.5,186.91,182.18,186.9,16628100
101 | 2018-05-24,185.88,186.8,185.03,185.93,12354742
102 | 2018-05-25,186.02,186.33,184.45,184.92,10965061
103 | 2018-05-29,184.34,186.81,183.71,185.74,16398937
104 | 2018-05-30,186.54,188.0,185.25,187.67,13736866
105 | 2018-05-31,187.87,192.72,187.48,191.78,30782631
106 | 2018-06-01,193.065,194.5492,192.07,193.99,17307245
107 | 2018-06-04,191.84,193.98,191.47,193.28,18939795
108 | 2018-06-05,194.3,195.0,192.62,192.94,15544294
109 | 2018-06-06,191.0252,192.53,189.11,191.34,22558920
110 | 2018-06-07,190.75,190.97,186.77,188.18,21503171
111 | 2018-06-08,187.53,189.4754,186.43,189.1,12677092
112 | 2018-06-11,188.81,192.6,188.8,191.54,12928907
113 | 2018-06-12,192.17,193.28,191.56,192.4,11562704
114 | 2018-06-13,192.74,194.5,191.91,192.41,15853821
115 | 2018-06-14,193.1,197.28,192.91,196.81,19120866
116 | 2018-06-15,195.79,197.07,194.64,195.85,21860931
117 | 2018-06-18,194.8,199.58,194.13,198.31,16826023
118 | 2018-06-19,196.2352,197.96,193.79,197.49,19993996
119 | 2018-06-20,199.1,203.55,198.805,202.0,28230933
120 | 2018-06-21,202.76,203.39,200.09,201.5,19045717
121 | 2018-06-22,201.16,202.24,199.31,201.74,17420188
122 | 2018-06-25,200.0,200.0,193.11,196.35,25275137
123 | 2018-06-26,197.6,199.1,196.23,199.0,17897576
124 | 2018-06-27,199.18,200.75,195.8,195.84,18734408
125 | 2018-06-28,195.18,197.34,193.26,196.23,18172439
126 | 2018-06-29,197.32,197.5997,193.955,194.32,15811602
127 | 2018-07-02,193.37,197.45,192.22,197.36,13961578
128 | 2018-07-03,194.55,195.4,192.52,192.73,13489514
129 | 2018-07-05,194.74,198.65,194.03,198.45,19684193
130 | 2018-07-06,198.45,203.64,197.7,203.23,19740131
131 | 2018-07-09,204.93,205.8,202.1201,204.74,18149437
132 | 2018-07-10,204.5,204.91,202.26,203.54,13190067
133 | 2018-07-11,202.22,204.5,201.75,202.54,12927377
134 | 2018-07-12,203.43,207.08,203.19,206.92,15454706
135 | 2018-07-13,207.81,208.43,206.45,207.32,11503401
136 | 2018-07-16,207.5,208.72,206.84,207.23,11078209
137 | 2018-07-17,204.9,210.46,204.84,209.99,15349892
138 | 2018-07-18,209.82,210.99,208.44,209.36,15334907
139 | 2018-07-19,208.77,209.99,207.76,208.09,11350429
140 | 2018-07-20,208.85,211.5,208.5,209.94,16241508
141 | 2018-07-23,210.58,211.62,208.8,210.91,16731969
142 | 2018-07-24,215.11,216.2,212.6,214.67,28468681
143 | 2018-07-25,215.715,218.62,214.27,217.5,64592585
144 | 2018-07-26,174.89,180.13,173.75,176.26,169803668
145 | 2018-07-27,179.87,179.93,173.0,174.89,60073749
146 | 2018-07-30,175.3,175.3,166.56,171.06,65280787
147 | 2018-07-31,170.67,174.24,170.0,172.58,40356471
148 | 2018-08-01,173.93,175.08,170.9,171.65,34042109
149 | 2018-08-02,170.68,176.79,170.27,176.37,32399954
150 | 2018-08-03,177.69,178.85,176.15,177.78,24763434
151 | 2018-08-06,178.97,185.79,178.38,185.69,49716192
152 | 2018-08-07,186.5,188.3,183.72,183.81,33398562
153 | 2018-08-08,184.75,186.85,183.76,185.18,22205230
154 | 2018-08-09,185.8492,186.57,182.48,183.09,19732120
155 | 2018-08-10,182.04,182.1,179.42,180.26,21500410
156 | 2018-08-13,180.1,182.61,178.9,180.05,17423264
157 | 2018-08-14,180.71,181.99,178.62,181.11,19101995
158 | 2018-08-15,179.34,180.87,174.78,179.53,33020231
159 | 2018-08-16,180.42,180.5,174.01,174.7,31351784
160 | 2018-08-17,174.5,176.22,172.04,173.8,24893176
161 | 2018-08-20,174.04,174.57,170.91,172.5,21518006
162 | 2018-08-21,172.81,174.17,171.39,172.62,19578514
163 | 2018-08-22,172.21,174.24,172.13,173.64,16894083
164 | 2018-08-23,173.09,175.55,172.83,172.9,18053567
165 | 2018-08-24,173.7,174.82,172.92,174.645,14631556
166 | 2018-08-27,175.99,178.67,175.79,177.46,17921935
167 | 2018-08-28,178.1,178.2399,175.83,176.26,15910675
168 | 2018-08-29,176.295,176.79,174.75,175.9,18678301
169 | 2018-08-30,175.9,179.7901,175.7,177.64,24216532
170 | 2018-08-31,177.15,177.62,174.9815,175.73,18065159
171 | 2018-09-04,173.5,173.89,168.8,171.16,29808971
172 | 2018-09-05,169.49,171.125,166.67,167.18,31226744
173 | 2018-09-06,166.98,166.98,160.0,162.53,41514834
174 | 2018-09-07,160.31,164.6269,160.16,163.04,24300600
175 | 2018-09-10,163.51,165.01,162.16,164.18,20197680
176 | 2018-09-11,163.94,167.19,163.72,165.94,20457088
177 | 2018-09-12,163.25,164.49,161.8,162.0,24078118
178 | 2018-09-13,162.0,163.32,160.86,161.36,25453775
179 | 2018-09-14,161.715,162.84,160.34,162.32,21770405
180 | 2018-09-17,161.92,162.06,159.77,160.58,21005321
181 | 2018-09-18,159.39,161.7639,158.8656,160.3,22465236
182 | 2018-09-19,160.08,163.44,159.48,163.06,19628996
183 | 2018-09-20,164.5,166.45,164.4722,166.02,18936038
184 | 2018-09-21,166.64,167.25,162.81,162.93,45994800
185 | 2018-09-24,161.03,165.7,160.88,165.41,19222775
186 | 2018-09-25,161.99,165.59,161.15,164.91,27622806
187 | 2018-09-26,164.3,169.3,164.21,166.95,25252231
188 | 2018-09-27,167.55,171.77,167.21,168.84,27266856
189 | 2018-09-28,168.33,168.79,162.56,164.46,34265638
190 | 2018-10-01,163.03,165.88,161.26,162.44,26407677
191 | 2018-10-02,161.58,162.28,158.67,159.33,36030977
192 | 2018-10-03,160.0,163.66,159.53,162.43,23109456
193 | 2018-10-04,161.46,161.46,157.35,158.85,25739635
194 | 2018-10-05,159.21,160.9,156.2,157.33,25744047
195 | 2018-10-08,155.54,158.34,154.39,157.25,24045968
196 | 2018-10-09,157.69,160.59,157.42,157.9,18844425
197 | 2018-10-10,156.82,157.69,151.31,151.38,30609970
198 | 2018-10-11,150.13,154.81,149.16,153.35,35338901
199 | 2018-10-12,156.73,156.89,151.2998,153.74,25293492
200 | 2018-10-15,153.32,155.57,152.55,153.52,15433521
201 | 2018-10-16,155.4,159.46,155.01,158.78,19180095
202 | 2018-10-17,159.56,160.49,157.95,159.42,17592003
203 | 2018-10-18,158.51,158.66,153.28,154.92,21675084
204 | 2018-10-19,155.86,157.35,153.55,154.05,19761347
205 | 2018-10-22,154.76,157.34,154.46,154.78,15424658
206 | 2018-10-23,151.22,154.77,150.85,154.39,19095032
207 | 2018-10-24,154.28,154.65,145.6,146.04,27744597
208 | 2018-10-25,147.73,152.21,147.0,150.95,22105696
209 | 2018-10-26,145.82,149.0,143.8,145.37,31303341
210 | 2018-10-29,148.5,148.83,139.03,142.09,31336784
211 | 2018-10-30,139.935,146.64,139.7419,146.22,50528278
212 | 2018-10-31,155.0,156.4,148.96,151.79,60101251
213 | 2018-11-01,151.52,152.75,149.35,151.75,25640786
214 | 2018-11-02,151.8,154.13,148.96,150.35,24708695
215 | 2018-11-05,150.1,150.19,147.44,148.68,15969849
216 | 2018-11-06,149.31,150.97,148.0,149.94,16667124
217 | 2018-11-07,151.57,153.01,149.83,151.53,21877372
218 | 2018-11-08,150.49,150.94,146.74,147.87,24145814
219 | 2018-11-09,146.75,147.76,144.07,144.96,17326898
220 | 2018-11-12,144.48,145.04,140.4899,141.55,18542123
221 | 2018-11-13,142.0,144.88,141.62,142.16,15141710
222 | 2018-11-14,143.7,145.58,141.55,144.22,22068384
223 | 2018-11-15,142.33,144.84,140.83,143.85,30320280
224 | 2018-11-16,141.07,141.77,137.77,139.53,37250560
225 | 2018-11-19,137.61,137.75,131.21,131.55,44362729
226 | 2018-11-20,127.03,134.1592,126.85,132.43,41939475
227 | 2018-11-21,134.4,137.19,134.13,134.82,25469735
228 | 2018-11-23,133.65,134.5,131.2551,131.73,11886128
229 | 2018-11-26,133.0,137.0,132.78,136.38,24263640
230 | 2018-11-27,135.75,136.6126,133.71,135.0,20750318
231 | 2018-11-28,136.28,136.7899,131.85,136.76,29847505
232 | 2018-11-29,135.92,139.99,135.66,138.68,24238713
233 | 2018-11-30,138.26,140.966,137.36,140.61,25732577
234 | 2018-12-03,143.0,143.6799,140.76,141.09,24819226
235 | 2018-12-04,140.73,143.39,137.16,137.93,30307400
236 | 2018-12-06,133.82,139.7,133.67,139.63,28218145
237 | 2018-12-07,139.25,140.87,136.6566,137.42,21195460
238 | 2018-12-10,139.6,143.05,139.01,141.85,26422173
239 | 2018-12-11,143.88,143.88,141.1,142.08,20300349
240 | 2018-12-12,143.08,147.19,142.51,144.5,23696936
241 | 2018-12-13,145.57,145.85,143.19,145.01,18148610
242 | 2018-12-14,143.34,146.01,142.51,144.06,21785820
243 | 2018-12-17,143.08,144.92,138.42,140.19,24333959
244 | 2018-12-18,141.08,145.93,139.8301,143.66,24709084
245 | 2018-12-19,141.21,144.91,132.5,133.24,57404894
246 | 2018-12-20,130.7,135.57,130.0,133.4,40297944
247 | 2018-12-21,133.39,134.9,123.42,124.95,56901491
248 | 2018-12-24,123.1,129.74,123.02,124.06,22066002
249 | 2018-12-26,126.0,134.24,125.89,134.18,39723370
250 | 2018-12-27,132.44,134.99,129.67,134.52,31202509
251 | 2018-12-28,135.34,135.92,132.2,133.2,22627569
252 | 2018-12-31,134.45,134.64,129.95,131.09,24625308
253 |
--------------------------------------------------------------------------------
/ch_04/data/stocks.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/ch_04/data/stocks.db
--------------------------------------------------------------------------------
/ch_04/data/weather.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/ch_04/data/weather.db
--------------------------------------------------------------------------------
/ch_04/understanding_window_calculations.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Understanding Window Calculations\n",
8 | "\n",
9 | "## About the Data\n",
10 | "In this notebook, we will be working with Facebook's stock price throughout 2018 (obtained using the [`stock_analysis` package](https://github.com/stefmolin/stock-analysis)).\n",
11 | "\n",
12 | "## Interactive Visualizations\n",
13 | "If you follow the installation instructions [here](https://github.com/matplotlib/jupyter-matplotlib), you can run the following interactive plot to see what different window calculations look like. This requires you to install `ipympl` and `node.js` and run a few commands from the command line as indicated in the aforementioned link. Note you will need to restart the kernel.\n",
14 | "\n",
15 | "*More information on the `interact()` function can be found [here](https://ipywidgets.readthedocs.io/en/stable/examples/Using%20Interact.html).*"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 17,
21 | "metadata": {},
22 | "outputs": [
23 | {
24 | "data": {
25 | "application/vnd.jupyter.widget-view+json": {
26 | "model_id": "555c8238c49c438289c1c676783bde85",
27 | "version_major": 2,
28 | "version_minor": 0
29 | },
30 | "text/plain": [
31 | "interactive(children=(IntSlider(value=20, description='period', max=200, step=5), Dropdown(description='window…"
32 | ]
33 | },
34 | "metadata": {},
35 | "output_type": "display_data"
36 | }
37 | ],
38 | "source": [
39 | "%matplotlib widget\n",
40 | "from ipywidgets import interact\n",
41 | "import matplotlib.pyplot as plt\n",
42 | "import pandas as pd\n",
43 | "\n",
44 | "fb = pd.read_csv('data/fb_2018.csv', index_col='date', parse_dates=True)\n",
45 | "\n",
46 | "def window_calculations(df):\n",
47 | " def plot_viz(period=20, window_type='rolling', agg='mean'):\n",
48 | " ax = df.plot(y='close')\n",
49 | " window_func = getattr(df.close, window_type)\n",
50 | " if window_type == 'rolling':\n",
51 | " kwargs = {'window': period}\n",
52 | " elif window_type == 'expanding':\n",
53 | " kwargs = {'min_periods': period}\n",
54 | " elif window_type == 'ewm':\n",
55 | " kwargs = {'span': period}\n",
56 | " if agg != 'mean':\n",
57 | " print('Changing to mean')\n",
58 | " agg = 'mean'\n",
59 | " window_func(**kwargs).agg(agg).plot(ax=ax, label=f'{window_type} {period}D {agg}')\n",
60 | "\n",
61 | " plt.suptitle('Window Calculations on Facebook Closing Price')\n",
62 | " plt.title('(Note: EWM only works with mean)')\n",
63 | " plt.ylabel('price ($)')\n",
64 | " plt.legend()\n",
65 | " \n",
66 | " return plot_viz\n",
67 | "interact(\n",
68 | " window_calculations(fb), \n",
69 | " period=(0, 200, 5), \n",
70 | " window_type=['rolling', 'expanding', 'ewm'], \n",
71 | " agg=['sum', 'min', 'max', 'mean']\n",
72 | ");"
73 | ]
74 | }
75 | ],
76 | "metadata": {
77 | "kernelspec": {
78 | "display_name": "Python 3",
79 | "language": "python",
80 | "name": "python3"
81 | },
82 | "language_info": {
83 | "codemirror_mode": {
84 | "name": "ipython",
85 | "version": 3
86 | },
87 | "file_extension": ".py",
88 | "mimetype": "text/x-python",
89 | "name": "python",
90 | "nbconvert_exporter": "python",
91 | "pygments_lexer": "ipython3",
92 | "version": "3.7.2"
93 | }
94 | },
95 | "nbformat": 4,
96 | "nbformat_minor": 2
97 | }
98 |
--------------------------------------------------------------------------------
/ch_04/window_calc.py:
--------------------------------------------------------------------------------
def window_calc(df, func, agg_dict, *args, **kwargs):
    """
    Apply a window calculation to a DataFrame and aggregate the result.

    Parameters:
        - df: The DataFrame to run the calculation on.
        - func: The window calculation method; it is handed to
                `df.pipe()`, so it receives `df` as its first argument.
        - agg_dict: Whatever should be given to `agg()` on the
                    window object: a dictionary mapping columns to
                    aggregation functions, the string name of a
                    function, or the function itself.
        - args: Positional arguments forwarded to `func`.
        - kwargs: Keyword arguments forwarded to `func`.

    Returns:
        - The result of calling `agg(agg_dict)` on the window object
          that `func` produced.
    """
    # Build the window object first (e.g. rolling/expanding/ewm) ...
    windowed = df.pipe(func, *args, **kwargs)
    # ... then collapse each window with the requested aggregation(s).
    aggregated = windowed.agg(agg_dict)
    return aggregated
--------------------------------------------------------------------------------
/ch_05/data/fb_stock_prices_2018.csv:
--------------------------------------------------------------------------------
1 | date,open,high,low,close,volume
2 | 2018-01-02,177.68,181.58,177.55,181.42,18151903
3 | 2018-01-03,181.88,184.78,181.33,184.67,16886563
4 | 2018-01-04,184.9,186.21,184.0996,184.33,13880896
5 | 2018-01-05,185.59,186.9,184.93,186.85,13574535
6 | 2018-01-08,187.2,188.9,186.33,188.28,17994726
7 | 2018-01-09,188.7,188.8,187.1,187.87,12393057
8 | 2018-01-10,186.94,187.89,185.63,187.84,10529894
9 | 2018-01-11,188.4,188.4,187.38,187.77,9588587
10 | 2018-01-12,178.06,181.48,177.4,179.37,77551299
11 | 2018-01-16,181.5,181.75,178.04,178.39,36183842
12 | 2018-01-17,179.26,179.32,175.8,177.6,27992376
13 | 2018-01-18,178.13,180.98,177.08,179.8,23304901
14 | 2018-01-19,180.85,182.37,180.1702,181.29,26826540
15 | 2018-01-22,180.8,185.39,180.41,185.37,21059464
16 | 2018-01-23,186.05,189.55,185.55,189.35,25678781
17 | 2018-01-24,189.89,190.66,186.52,186.55,24334548
18 | 2018-01-25,187.95,188.62,186.6,187.48,17377740
19 | 2018-01-26,187.75,190.0,186.81,190.0,17759212
20 | 2018-01-29,188.75,188.84,185.6301,185.98,20453172
21 | 2018-01-30,183.01,188.18,181.84,187.12,20858556
22 | 2018-01-31,188.37,189.83,185.22,186.89,43275144
23 | 2018-02-01,188.22,195.32,187.89,193.09,54211293
24 | 2018-02-02,192.04,194.21,189.98,190.28,26677484
25 | 2018-02-05,186.93,190.61,180.61,181.26,33128206
26 | 2018-02-06,178.57,185.77,177.74,185.31,37758505
27 | 2018-02-07,184.15,185.0817,179.95,180.18,27601886
28 | 2018-02-08,181.01,181.84,171.4815,171.58,38478321
29 | 2018-02-09,174.76,176.9,167.18,176.11,39887626
30 | 2018-02-12,177.06,177.545,171.84,176.41,32092133
31 | 2018-02-13,175.62,175.97,173.1,173.15,21809350
32 | 2018-02-14,173.45,179.81,173.2119,179.52,28929704
33 | 2018-02-15,180.5,180.5,176.84,179.96,20922120
34 | 2018-02-16,178.99,179.88,176.3,177.36,21015610
35 | 2018-02-20,175.77,177.95,175.11,176.01,21204921
36 | 2018-02-21,176.71,181.27,176.4,177.91,23200804
37 | 2018-02-22,178.7,180.21,177.41,178.99,18464192
38 | 2018-02-23,179.9,183.39,179.51,183.29,19007288
39 | 2018-02-26,184.58,185.66,183.2228,184.93,17599703
40 | 2018-02-27,184.45,184.7,181.46,181.46,15849806
41 | 2018-02-28,182.3,182.88,178.14,178.32,18783039
42 | 2018-03-01,179.01,180.12,174.41,175.94,23201626
43 | 2018-03-02,173.29,177.11,172.99,176.62,20025905
44 | 2018-03-05,176.2,181.1475,175.89,180.4,16189280
45 | 2018-03-06,181.78,182.38,179.11,179.78,15086784
46 | 2018-03-07,178.74,183.82,178.07,183.71,19097293
47 | 2018-03-08,183.56,184.4,181.45,182.34,17225946
48 | 2018-03-09,183.91,185.51,183.21,185.23,18526292
49 | 2018-03-12,185.23,186.1,184.22,184.76,15301229
50 | 2018-03-13,185.61,185.99,181.11,181.88,18067477
51 | 2018-03-14,182.6,184.25,181.85,184.19,16821728
52 | 2018-03-15,183.24,184.0,182.19,183.86,15645035
53 | 2018-03-16,184.49,185.33,183.41,185.09,24403438
54 | 2018-03-19,177.01,177.17,170.06,172.56,88140060
55 | 2018-03-20,167.47,170.2,161.95,168.15,129851768
56 | 2018-03-21,164.8,173.4,163.3,169.39,106598834
57 | 2018-03-22,166.13,170.27,163.72,164.89,73742979
58 | 2018-03-23,165.44,167.1,159.02,159.39,53609706
59 | 2018-03-26,160.82,161.1,149.02,160.06,126116634
60 | 2018-03-27,156.31,162.85,150.75,152.22,79116995
61 | 2018-03-28,151.65,155.88,150.8,153.03,60029170
62 | 2018-03-29,155.15,161.42,154.14,159.79,59434293
63 | 2018-04-02,157.81,159.2,154.111,155.39,36795991
64 | 2018-04-03,156.55,157.39,150.81,156.11,42543865
65 | 2018-04-04,152.025,155.56,150.51,155.1,49885584
66 | 2018-04-05,161.56,161.575,156.65,159.34,41449609
67 | 2018-04-06,157.73,161.42,156.81,157.2,41644812
68 | 2018-04-09,157.82,160.53,156.04,157.93,34915227
69 | 2018-04-10,157.93,165.98,157.01,165.04,58947041
70 | 2018-04-11,165.36,168.65,163.25,166.32,56144633
71 | 2018-04-12,166.98,167.45,163.1,163.87,38262956
72 | 2018-04-13,164.58,165.7036,163.77,164.52,19990561
73 | 2018-04-16,165.7249,165.78,163.39,164.83,18119435
74 | 2018-04-17,165.83,169.0,165.66,168.66,22743029
75 | 2018-04-18,166.88,168.12,165.77,166.36,20969568
76 | 2018-04-19,166.2,168.33,165.2,168.1,22234961
77 | 2018-04-20,167.79,168.43,165.81,166.28,19119438
78 | 2018-04-23,167.27,168.45,165.09,165.84,23088102
79 | 2018-04-24,165.43,166.1,158.19,159.69,35079926
80 | 2018-04-25,160.1448,161.06,156.19,159.69,41083581
81 | 2018-04-26,173.22,176.27,170.8,174.16,77556934
82 | 2018-04-27,176.81,177.1,172.6,173.59,29804657
83 | 2018-04-30,173.79,175.72,171.71,172.0,20750478
84 | 2018-05-01,172.0,174.02,170.23,173.86,26025932
85 | 2018-05-02,174.246,178.08,174.2,176.07,30424450
86 | 2018-05-03,175.13,176.12,172.12,174.02,24026071
87 | 2018-05-04,173.08,176.98,173.06,176.61,17677844
88 | 2018-05-07,177.35,179.5,177.17,177.97,18697195
89 | 2018-05-08,178.25,179.04,177.11,178.92,15577211
90 | 2018-05-09,179.67,183.01,178.7807,182.66,23282811
91 | 2018-05-10,183.15,186.1292,182.5,185.53,21071403
92 | 2018-05-11,184.85,188.32,184.18,186.99,21207848
93 | 2018-05-14,187.71,187.86,186.2,186.64,15646744
94 | 2018-05-15,184.88,185.29,183.2,184.32,15429433
95 | 2018-05-16,183.6952,184.32,182.66,183.2,16975495
96 | 2018-05-17,182.68,184.06,182.22,183.76,14840675
97 | 2018-05-18,183.49,184.19,182.61,182.68,13130451
98 | 2018-05-21,183.77,185.3,183.13,184.49,13532864
99 | 2018-05-22,184.93,185.42,183.43,183.8,12731419
100 | 2018-05-23,182.5,186.91,182.18,186.9,16628100
101 | 2018-05-24,185.88,186.8,185.03,185.93,12354742
102 | 2018-05-25,186.02,186.33,184.45,184.92,10965061
103 | 2018-05-29,184.34,186.81,183.71,185.74,16398937
104 | 2018-05-30,186.54,188.0,185.25,187.67,13736866
105 | 2018-05-31,187.87,192.72,187.48,191.78,30782631
106 | 2018-06-01,193.065,194.5492,192.07,193.99,17307245
107 | 2018-06-04,191.84,193.98,191.47,193.28,18939795
108 | 2018-06-05,194.3,195.0,192.62,192.94,15544294
109 | 2018-06-06,191.0252,192.53,189.11,191.34,22558920
110 | 2018-06-07,190.75,190.97,186.77,188.18,21503171
111 | 2018-06-08,187.53,189.4754,186.43,189.1,12677092
112 | 2018-06-11,188.81,192.6,188.8,191.54,12928907
113 | 2018-06-12,192.17,193.28,191.56,192.4,11562704
114 | 2018-06-13,192.74,194.5,191.91,192.41,15853821
115 | 2018-06-14,193.1,197.28,192.91,196.81,19120866
116 | 2018-06-15,195.79,197.07,194.64,195.85,21860931
117 | 2018-06-18,194.8,199.58,194.13,198.31,16826023
118 | 2018-06-19,196.2352,197.96,193.79,197.49,19993996
119 | 2018-06-20,199.1,203.55,198.805,202.0,28230933
120 | 2018-06-21,202.76,203.39,200.09,201.5,19045717
121 | 2018-06-22,201.16,202.24,199.31,201.74,17420188
122 | 2018-06-25,200.0,200.0,193.11,196.35,25275137
123 | 2018-06-26,197.6,199.1,196.23,199.0,17897576
124 | 2018-06-27,199.18,200.75,195.8,195.84,18734408
125 | 2018-06-28,195.18,197.34,193.26,196.23,18172439
126 | 2018-06-29,197.32,197.5997,193.955,194.32,15811602
127 | 2018-07-02,193.37,197.45,192.22,197.36,13961578
128 | 2018-07-03,194.55,195.4,192.52,192.73,13489514
129 | 2018-07-05,194.74,198.65,194.03,198.45,19684193
130 | 2018-07-06,198.45,203.64,197.7,203.23,19740131
131 | 2018-07-09,204.93,205.8,202.1201,204.74,18149437
132 | 2018-07-10,204.5,204.91,202.26,203.54,13190067
133 | 2018-07-11,202.22,204.5,201.75,202.54,12927377
134 | 2018-07-12,203.43,207.08,203.19,206.92,15454706
135 | 2018-07-13,207.81,208.43,206.45,207.32,11503401
136 | 2018-07-16,207.5,208.72,206.84,207.23,11078209
137 | 2018-07-17,204.9,210.46,204.84,209.99,15349892
138 | 2018-07-18,209.82,210.99,208.44,209.36,15334907
139 | 2018-07-19,208.77,209.99,207.76,208.09,11350429
140 | 2018-07-20,208.85,211.5,208.5,209.94,16241508
141 | 2018-07-23,210.58,211.62,208.8,210.91,16731969
142 | 2018-07-24,215.11,216.2,212.6,214.67,28468681
143 | 2018-07-25,215.715,218.62,214.27,217.5,64592585
144 | 2018-07-26,174.89,180.13,173.75,176.26,169803668
145 | 2018-07-27,179.87,179.93,173.0,174.89,60073749
146 | 2018-07-30,175.3,175.3,166.56,171.06,65280787
147 | 2018-07-31,170.67,174.24,170.0,172.58,40356471
148 | 2018-08-01,173.93,175.08,170.9,171.65,34042109
149 | 2018-08-02,170.68,176.79,170.27,176.37,32399954
150 | 2018-08-03,177.69,178.85,176.15,177.78,24763434
151 | 2018-08-06,178.97,185.79,178.38,185.69,49716192
152 | 2018-08-07,186.5,188.3,183.72,183.81,33398562
153 | 2018-08-08,184.75,186.85,183.76,185.18,22205230
154 | 2018-08-09,185.8492,186.57,182.48,183.09,19732120
155 | 2018-08-10,182.04,182.1,179.42,180.26,21500410
156 | 2018-08-13,180.1,182.61,178.9,180.05,17423264
157 | 2018-08-14,180.71,181.99,178.62,181.11,19101995
158 | 2018-08-15,179.34,180.87,174.78,179.53,33020231
159 | 2018-08-16,180.42,180.5,174.01,174.7,31351784
160 | 2018-08-17,174.5,176.22,172.04,173.8,24893176
161 | 2018-08-20,174.04,174.57,170.91,172.5,21518006
162 | 2018-08-21,172.81,174.17,171.39,172.62,19578514
163 | 2018-08-22,172.21,174.24,172.13,173.64,16894083
164 | 2018-08-23,173.09,175.55,172.83,172.9,18053567
165 | 2018-08-24,173.7,174.82,172.92,174.645,14631556
166 | 2018-08-27,175.99,178.67,175.79,177.46,17921935
167 | 2018-08-28,178.1,178.2399,175.83,176.26,15910675
168 | 2018-08-29,176.295,176.79,174.75,175.9,18678301
169 | 2018-08-30,175.9,179.7901,175.7,177.64,24216532
170 | 2018-08-31,177.15,177.62,174.9815,175.73,18065159
171 | 2018-09-04,173.5,173.89,168.8,171.16,29808971
172 | 2018-09-05,169.49,171.125,166.67,167.18,31226744
173 | 2018-09-06,166.98,166.98,160.0,162.53,41514834
174 | 2018-09-07,160.31,164.6269,160.16,163.04,24300600
175 | 2018-09-10,163.51,165.01,162.16,164.18,20197680
176 | 2018-09-11,163.94,167.19,163.72,165.94,20457088
177 | 2018-09-12,163.25,164.49,161.8,162.0,24078118
178 | 2018-09-13,162.0,163.32,160.86,161.36,25453775
179 | 2018-09-14,161.715,162.84,160.34,162.32,21770405
180 | 2018-09-17,161.92,162.06,159.77,160.58,21005321
181 | 2018-09-18,159.39,161.7639,158.8656,160.3,22465236
182 | 2018-09-19,160.08,163.44,159.48,163.06,19628996
183 | 2018-09-20,164.5,166.45,164.4722,166.02,18936038
184 | 2018-09-21,166.64,167.25,162.81,162.93,45994800
185 | 2018-09-24,161.03,165.7,160.88,165.41,19222775
186 | 2018-09-25,161.99,165.59,161.15,164.91,27622806
187 | 2018-09-26,164.3,169.3,164.21,166.95,25252231
188 | 2018-09-27,167.55,171.77,167.21,168.84,27266856
189 | 2018-09-28,168.33,168.79,162.56,164.46,34265638
190 | 2018-10-01,163.03,165.88,161.26,162.44,26407677
191 | 2018-10-02,161.58,162.28,158.67,159.33,36030977
192 | 2018-10-03,160.0,163.66,159.53,162.43,23109456
193 | 2018-10-04,161.46,161.46,157.35,158.85,25739635
194 | 2018-10-05,159.21,160.9,156.2,157.33,25744047
195 | 2018-10-08,155.54,158.34,154.39,157.25,24045968
196 | 2018-10-09,157.69,160.59,157.42,157.9,18844425
197 | 2018-10-10,156.82,157.69,151.31,151.38,30609970
198 | 2018-10-11,150.13,154.81,149.16,153.35,35338901
199 | 2018-10-12,156.73,156.89,151.2998,153.74,25293492
200 | 2018-10-15,153.32,155.57,152.55,153.52,15433521
201 | 2018-10-16,155.4,159.46,155.01,158.78,19180095
202 | 2018-10-17,159.56,160.49,157.95,159.42,17592003
203 | 2018-10-18,158.51,158.66,153.28,154.92,21675084
204 | 2018-10-19,155.86,157.35,153.55,154.05,19761347
205 | 2018-10-22,154.76,157.34,154.46,154.78,15424658
206 | 2018-10-23,151.22,154.77,150.85,154.39,19095032
207 | 2018-10-24,154.28,154.65,145.6,146.04,27744597
208 | 2018-10-25,147.73,152.21,147.0,150.95,22105696
209 | 2018-10-26,145.82,149.0,143.8,145.37,31303341
210 | 2018-10-29,148.5,148.83,139.03,142.09,31336784
211 | 2018-10-30,139.935,146.64,139.7419,146.22,50528278
212 | 2018-10-31,155.0,156.4,148.96,151.79,60101251
213 | 2018-11-01,151.52,152.75,149.35,151.75,25640786
214 | 2018-11-02,151.8,154.13,148.96,150.35,24708695
215 | 2018-11-05,150.1,150.19,147.44,148.68,15969849
216 | 2018-11-06,149.31,150.97,148.0,149.94,16667124
217 | 2018-11-07,151.57,153.01,149.83,151.53,21877372
218 | 2018-11-08,150.49,150.94,146.74,147.87,24145814
219 | 2018-11-09,146.75,147.76,144.07,144.96,17326898
220 | 2018-11-12,144.48,145.04,140.4899,141.55,18542123
221 | 2018-11-13,142.0,144.88,141.62,142.16,15141710
222 | 2018-11-14,143.7,145.58,141.55,144.22,22068384
223 | 2018-11-15,142.33,144.84,140.83,143.85,30320280
224 | 2018-11-16,141.07,141.77,137.77,139.53,37250560
225 | 2018-11-19,137.61,137.75,131.21,131.55,44362729
226 | 2018-11-20,127.03,134.1592,126.85,132.43,41939475
227 | 2018-11-21,134.4,137.19,134.13,134.82,25469735
228 | 2018-11-23,133.65,134.5,131.2551,131.73,11886128
229 | 2018-11-26,133.0,137.0,132.78,136.38,24263640
230 | 2018-11-27,135.75,136.6126,133.71,135.0,20750318
231 | 2018-11-28,136.28,136.7899,131.85,136.76,29847505
232 | 2018-11-29,135.92,139.99,135.66,138.68,24238713
233 | 2018-11-30,138.26,140.966,137.36,140.61,25732577
234 | 2018-12-03,143.0,143.6799,140.76,141.09,24819226
235 | 2018-12-04,140.73,143.39,137.16,137.93,30307400
236 | 2018-12-06,133.82,139.7,133.67,139.63,28218145
237 | 2018-12-07,139.25,140.87,136.6566,137.42,21195460
238 | 2018-12-10,139.6,143.05,139.01,141.85,26422173
239 | 2018-12-11,143.88,143.88,141.1,142.08,20300349
240 | 2018-12-12,143.08,147.19,142.51,144.5,23696936
241 | 2018-12-13,145.57,145.85,143.19,145.01,18148610
242 | 2018-12-14,143.34,146.01,142.51,144.06,21785820
243 | 2018-12-17,143.08,144.92,138.42,140.19,24333959
244 | 2018-12-18,141.08,145.93,139.8301,143.66,24709084
245 | 2018-12-19,141.21,144.91,132.5,133.24,57404894
246 | 2018-12-20,130.7,135.57,130.0,133.4,40297944
247 | 2018-12-21,133.39,134.9,123.42,124.95,56901491
248 | 2018-12-24,123.1,129.74,123.02,124.06,22066002
249 | 2018-12-26,126.0,134.24,125.89,134.18,39723370
250 | 2018-12-27,132.44,134.99,129.67,134.52,31202509
251 | 2018-12-28,135.34,135.92,132.2,133.2,22627569
252 | 2018-12-31,134.45,134.64,129.95,131.09,24625308
253 |
--------------------------------------------------------------------------------
/ch_06/color_utils.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import matplotlib.pyplot as plt
4 | from matplotlib.colors import ListedColormap
5 | import numpy as np
6 |
def hex_to_rgb_color_list(colors):
    """
    Take color or list of hex code colors and convert them
    to RGB colors in the range [0,1].

    Parameters:
        - colors: Color or list of color strings of the format
                  '#FFF' or '#FFFFFF'

    Returns:
        The color or list of colors in RGB representation. When only
        one color is supplied (as a string or as a one-item list), a
        single [R, G, B] list is returned instead of a list of lists.
        The input list is never modified.

    Raises:
        ValueError: If a color is not 3 or 6 characters long once the
        '#' is removed, or if it can't be parsed as hexadecimal.
    """
    if isinstance(colors, str):
        colors = [colors]

    # collect results in a new list so the caller's list is left untouched
    rgb_colors = []
    for color in colors:
        color = color.replace('#', '')
        hex_length = len(color)

        if hex_length not in [3, 6]:
            raise ValueError(
                'Your colors must be of the form #FFF or #FFFFFF'
            )

        # shorthand (#FFF) has 1 hex digit per channel; the full form has 2
        regex = '.' * (hex_length // 3)
        rgb_colors.append([
            # repeating the digit expands the shorthand, e.g. 'F' -> 'FF'
            int(val * (6 // hex_length), 16) / 255
            for val in re.findall(regex, color)
        ])

    return rgb_colors[0] if len(rgb_colors) == 1 else rgb_colors
39 |
def two_color_sequential_cmap(rgb_color_list):
    """
    Build a sequential colormap that fades from one color into another.

    Parameters:
        - rgb_color_list: A list holding exactly two colors, each written
                          as an [R, G, B] list with values in the range
                          [0, 1]; e.g. [[0, 0, 0], [1, 1, 1]] runs from
                          black to white.

    Returns:
        A matplotlib ListedColormap object with 256 entries.

    Raises:
        ValueError: If the input isn't a list, doesn't hold exactly two
        colors, or either color isn't a list of size 3.
    """
    if not isinstance(rgb_color_list, list):
        raise ValueError('Colors must be passed as a list!')
    if len(rgb_color_list) != 2:
        raise ValueError(
            'Can only specify two colors; '
            'one for each end of the spectrum.'
        )

    first, last = rgb_color_list
    if any(
        not isinstance(endpoint, list) or len(endpoint) != 3
        for endpoint in (first, last)
    ):
        raise ValueError('Each color should be represented as a list of size 3.')

    steps = 256
    # columns are red, green, blue, alpha; starting from ones leaves alpha at 1
    table = np.ones((steps, 4))
    for channel, (low, high) in enumerate(zip(first, last)):
        table[:, channel] = np.linspace(start=low, stop=high, num=steps)

    return ListedColormap(table)
77 |
def draw_cmap(cmap):
    """
    Render a horizontal colorbar so a colormap can be inspected visually.

    Parameters:
        - cmap: A matplotlib colormap

    Returns:
        The matplotlib colorbar that was drawn; the figure can be saved with:
        `plt.savefig('cmap.png', bbox_inches='tight')`
    """
    # a throwaway 1x2 image drawn with the colormap
    placeholder = plt.imshow(np.array([[0, 1]]), cmap=cmap)
    colorbar = plt.colorbar(orientation='horizontal', cmap=cmap)
    # drop the image's axes from the figure once the colorbar exists
    placeholder.axes.remove()
    return colorbar
93 |
def blended_cmap(rgb_color_list):
    """
    Create a colormap blending from one color to the next through
    every color in the list.

    Parameters:
        - rgb_color_list: A list of two or more colors represented as
                          [R, G, B] values in the range [0, 1], like
                          [[0, 0, 0], [1, 1, 1]], for black and white,
                          respectively.

    Returns:
        A matplotlib ListedColormap object with your colormap.

    Raises:
        ValueError: If the input isn't a list, holds fewer than two
        colors, or any color (not just the first two) isn't a list
        of size 3.
    """
    if not isinstance(rgb_color_list, list):
        raise ValueError('Colors must be passed as a list!')
    elif len(rgb_color_list) < 2:
        raise ValueError('Must specify at least 2 colors.')
    elif any(
        # check every color; a bad one later in the list would otherwise
        # fail deep inside the blending loop with an unrelated error
        not isinstance(color, list) or len(color) != 3
        for color in rgb_color_list
    ):
        raise ValueError(
            'Each color should be represented as a list of size 3.'
        )

    N = 256
    entries = 4 # red, green, blue, alpha
    rgbas = np.ones((N, entries))

    segment_count = len(rgb_color_list) - 1
    segment_size = N // segment_count
    remainder = N % segment_count

    # the first segment absorbs the remainder so the sizes add up to N
    segment_sizes = (
        [segment_size + remainder] + [segment_size] * (segment_count - 1)
    )

    for i in range(entries - 1): # we don't alter alphas
        rgbas[:, i] = np.concatenate([
            np.linspace(start=start[i], stop=stop[i], num=size)
            for start, stop, size in zip(
                rgb_color_list, rgb_color_list[1:], segment_sizes
            )
        ])

    return ListedColormap(rgbas)
--------------------------------------------------------------------------------
/ch_06/data/fb_stock_prices_2018.csv:
--------------------------------------------------------------------------------
1 | date,open,high,low,close,volume
2 | 2018-01-02,177.68,181.58,177.55,181.42,18151903
3 | 2018-01-03,181.88,184.78,181.33,184.67,16886563
4 | 2018-01-04,184.9,186.21,184.0996,184.33,13880896
5 | 2018-01-05,185.59,186.9,184.93,186.85,13574535
6 | 2018-01-08,187.2,188.9,186.33,188.28,17994726
7 | 2018-01-09,188.7,188.8,187.1,187.87,12393057
8 | 2018-01-10,186.94,187.89,185.63,187.84,10529894
9 | 2018-01-11,188.4,188.4,187.38,187.77,9588587
10 | 2018-01-12,178.06,181.48,177.4,179.37,77551299
11 | 2018-01-16,181.5,181.75,178.04,178.39,36183842
12 | 2018-01-17,179.26,179.32,175.8,177.6,27992376
13 | 2018-01-18,178.13,180.98,177.08,179.8,23304901
14 | 2018-01-19,180.85,182.37,180.1702,181.29,26826540
15 | 2018-01-22,180.8,185.39,180.41,185.37,21059464
16 | 2018-01-23,186.05,189.55,185.55,189.35,25678781
17 | 2018-01-24,189.89,190.66,186.52,186.55,24334548
18 | 2018-01-25,187.95,188.62,186.6,187.48,17377740
19 | 2018-01-26,187.75,190.0,186.81,190.0,17759212
20 | 2018-01-29,188.75,188.84,185.6301,185.98,20453172
21 | 2018-01-30,183.01,188.18,181.84,187.12,20858556
22 | 2018-01-31,188.37,189.83,185.22,186.89,43275144
23 | 2018-02-01,188.22,195.32,187.89,193.09,54211293
24 | 2018-02-02,192.04,194.21,189.98,190.28,26677484
25 | 2018-02-05,186.93,190.61,180.61,181.26,33128206
26 | 2018-02-06,178.57,185.77,177.74,185.31,37758505
27 | 2018-02-07,184.15,185.0817,179.95,180.18,27601886
28 | 2018-02-08,181.01,181.84,171.4815,171.58,38478321
29 | 2018-02-09,174.76,176.9,167.18,176.11,39887626
30 | 2018-02-12,177.06,177.545,171.84,176.41,32092133
31 | 2018-02-13,175.62,175.97,173.1,173.15,21809350
32 | 2018-02-14,173.45,179.81,173.2119,179.52,28929704
33 | 2018-02-15,180.5,180.5,176.84,179.96,20922120
34 | 2018-02-16,178.99,179.88,176.3,177.36,21015610
35 | 2018-02-20,175.77,177.95,175.11,176.01,21204921
36 | 2018-02-21,176.71,181.27,176.4,177.91,23200804
37 | 2018-02-22,178.7,180.21,177.41,178.99,18464192
38 | 2018-02-23,179.9,183.39,179.51,183.29,19007288
39 | 2018-02-26,184.58,185.66,183.2228,184.93,17599703
40 | 2018-02-27,184.45,184.7,181.46,181.46,15849806
41 | 2018-02-28,182.3,182.88,178.14,178.32,18783039
42 | 2018-03-01,179.01,180.12,174.41,175.94,23201626
43 | 2018-03-02,173.29,177.11,172.99,176.62,20025905
44 | 2018-03-05,176.2,181.1475,175.89,180.4,16189280
45 | 2018-03-06,181.78,182.38,179.11,179.78,15086784
46 | 2018-03-07,178.74,183.82,178.07,183.71,19097293
47 | 2018-03-08,183.56,184.4,181.45,182.34,17225946
48 | 2018-03-09,183.91,185.51,183.21,185.23,18526292
49 | 2018-03-12,185.23,186.1,184.22,184.76,15301229
50 | 2018-03-13,185.61,185.99,181.11,181.88,18067477
51 | 2018-03-14,182.6,184.25,181.85,184.19,16821728
52 | 2018-03-15,183.24,184.0,182.19,183.86,15645035
53 | 2018-03-16,184.49,185.33,183.41,185.09,24403438
54 | 2018-03-19,177.01,177.17,170.06,172.56,88140060
55 | 2018-03-20,167.47,170.2,161.95,168.15,129851768
56 | 2018-03-21,164.8,173.4,163.3,169.39,106598834
57 | 2018-03-22,166.13,170.27,163.72,164.89,73742979
58 | 2018-03-23,165.44,167.1,159.02,159.39,53609706
59 | 2018-03-26,160.82,161.1,149.02,160.06,126116634
60 | 2018-03-27,156.31,162.85,150.75,152.22,79116995
61 | 2018-03-28,151.65,155.88,150.8,153.03,60029170
62 | 2018-03-29,155.15,161.42,154.14,159.79,59434293
63 | 2018-04-02,157.81,159.2,154.111,155.39,36795991
64 | 2018-04-03,156.55,157.39,150.81,156.11,42543865
65 | 2018-04-04,152.025,155.56,150.51,155.1,49885584
66 | 2018-04-05,161.56,161.575,156.65,159.34,41449609
67 | 2018-04-06,157.73,161.42,156.81,157.2,41644812
68 | 2018-04-09,157.82,160.53,156.04,157.93,34915227
69 | 2018-04-10,157.93,165.98,157.01,165.04,58947041
70 | 2018-04-11,165.36,168.65,163.25,166.32,56144633
71 | 2018-04-12,166.98,167.45,163.1,163.87,38262956
72 | 2018-04-13,164.58,165.7036,163.77,164.52,19990561
73 | 2018-04-16,165.7249,165.78,163.39,164.83,18119435
74 | 2018-04-17,165.83,169.0,165.66,168.66,22743029
75 | 2018-04-18,166.88,168.12,165.77,166.36,20969568
76 | 2018-04-19,166.2,168.33,165.2,168.1,22234961
77 | 2018-04-20,167.79,168.43,165.81,166.28,19119438
78 | 2018-04-23,167.27,168.45,165.09,165.84,23088102
79 | 2018-04-24,165.43,166.1,158.19,159.69,35079926
80 | 2018-04-25,160.1448,161.06,156.19,159.69,41083581
81 | 2018-04-26,173.22,176.27,170.8,174.16,77556934
82 | 2018-04-27,176.81,177.1,172.6,173.59,29804657
83 | 2018-04-30,173.79,175.72,171.71,172.0,20750478
84 | 2018-05-01,172.0,174.02,170.23,173.86,26025932
85 | 2018-05-02,174.246,178.08,174.2,176.07,30424450
86 | 2018-05-03,175.13,176.12,172.12,174.02,24026071
87 | 2018-05-04,173.08,176.98,173.06,176.61,17677844
88 | 2018-05-07,177.35,179.5,177.17,177.97,18697195
89 | 2018-05-08,178.25,179.04,177.11,178.92,15577211
90 | 2018-05-09,179.67,183.01,178.7807,182.66,23282811
91 | 2018-05-10,183.15,186.1292,182.5,185.53,21071403
92 | 2018-05-11,184.85,188.32,184.18,186.99,21207848
93 | 2018-05-14,187.71,187.86,186.2,186.64,15646744
94 | 2018-05-15,184.88,185.29,183.2,184.32,15429433
95 | 2018-05-16,183.6952,184.32,182.66,183.2,16975495
96 | 2018-05-17,182.68,184.06,182.22,183.76,14840675
97 | 2018-05-18,183.49,184.19,182.61,182.68,13130451
98 | 2018-05-21,183.77,185.3,183.13,184.49,13532864
99 | 2018-05-22,184.93,185.42,183.43,183.8,12731419
100 | 2018-05-23,182.5,186.91,182.18,186.9,16628100
101 | 2018-05-24,185.88,186.8,185.03,185.93,12354742
102 | 2018-05-25,186.02,186.33,184.45,184.92,10965061
103 | 2018-05-29,184.34,186.81,183.71,185.74,16398937
104 | 2018-05-30,186.54,188.0,185.25,187.67,13736866
105 | 2018-05-31,187.87,192.72,187.48,191.78,30782631
106 | 2018-06-01,193.065,194.5492,192.07,193.99,17307245
107 | 2018-06-04,191.84,193.98,191.47,193.28,18939795
108 | 2018-06-05,194.3,195.0,192.62,192.94,15544294
109 | 2018-06-06,191.0252,192.53,189.11,191.34,22558920
110 | 2018-06-07,190.75,190.97,186.77,188.18,21503171
111 | 2018-06-08,187.53,189.4754,186.43,189.1,12677092
112 | 2018-06-11,188.81,192.6,188.8,191.54,12928907
113 | 2018-06-12,192.17,193.28,191.56,192.4,11562704
114 | 2018-06-13,192.74,194.5,191.91,192.41,15853821
115 | 2018-06-14,193.1,197.28,192.91,196.81,19120866
116 | 2018-06-15,195.79,197.07,194.64,195.85,21860931
117 | 2018-06-18,194.8,199.58,194.13,198.31,16826023
118 | 2018-06-19,196.2352,197.96,193.79,197.49,19993996
119 | 2018-06-20,199.1,203.55,198.805,202.0,28230933
120 | 2018-06-21,202.76,203.39,200.09,201.5,19045717
121 | 2018-06-22,201.16,202.24,199.31,201.74,17420188
122 | 2018-06-25,200.0,200.0,193.11,196.35,25275137
123 | 2018-06-26,197.6,199.1,196.23,199.0,17897576
124 | 2018-06-27,199.18,200.75,195.8,195.84,18734408
125 | 2018-06-28,195.18,197.34,193.26,196.23,18172439
126 | 2018-06-29,197.32,197.5997,193.955,194.32,15811602
127 | 2018-07-02,193.37,197.45,192.22,197.36,13961578
128 | 2018-07-03,194.55,195.4,192.52,192.73,13489514
129 | 2018-07-05,194.74,198.65,194.03,198.45,19684193
130 | 2018-07-06,198.45,203.64,197.7,203.23,19740131
131 | 2018-07-09,204.93,205.8,202.1201,204.74,18149437
132 | 2018-07-10,204.5,204.91,202.26,203.54,13190067
133 | 2018-07-11,202.22,204.5,201.75,202.54,12927377
134 | 2018-07-12,203.43,207.08,203.19,206.92,15454706
135 | 2018-07-13,207.81,208.43,206.45,207.32,11503401
136 | 2018-07-16,207.5,208.72,206.84,207.23,11078209
137 | 2018-07-17,204.9,210.46,204.84,209.99,15349892
138 | 2018-07-18,209.82,210.99,208.44,209.36,15334907
139 | 2018-07-19,208.77,209.99,207.76,208.09,11350429
140 | 2018-07-20,208.85,211.5,208.5,209.94,16241508
141 | 2018-07-23,210.58,211.62,208.8,210.91,16731969
142 | 2018-07-24,215.11,216.2,212.6,214.67,28468681
143 | 2018-07-25,215.715,218.62,214.27,217.5,64592585
144 | 2018-07-26,174.89,180.13,173.75,176.26,169803668
145 | 2018-07-27,179.87,179.93,173.0,174.89,60073749
146 | 2018-07-30,175.3,175.3,166.56,171.06,65280787
147 | 2018-07-31,170.67,174.24,170.0,172.58,40356471
148 | 2018-08-01,173.93,175.08,170.9,171.65,34042109
149 | 2018-08-02,170.68,176.79,170.27,176.37,32399954
150 | 2018-08-03,177.69,178.85,176.15,177.78,24763434
151 | 2018-08-06,178.97,185.79,178.38,185.69,49716192
152 | 2018-08-07,186.5,188.3,183.72,183.81,33398562
153 | 2018-08-08,184.75,186.85,183.76,185.18,22205230
154 | 2018-08-09,185.8492,186.57,182.48,183.09,19732120
155 | 2018-08-10,182.04,182.1,179.42,180.26,21500410
156 | 2018-08-13,180.1,182.61,178.9,180.05,17423264
157 | 2018-08-14,180.71,181.99,178.62,181.11,19101995
158 | 2018-08-15,179.34,180.87,174.78,179.53,33020231
159 | 2018-08-16,180.42,180.5,174.01,174.7,31351784
160 | 2018-08-17,174.5,176.22,172.04,173.8,24893176
161 | 2018-08-20,174.04,174.57,170.91,172.5,21518006
162 | 2018-08-21,172.81,174.17,171.39,172.62,19578514
163 | 2018-08-22,172.21,174.24,172.13,173.64,16894083
164 | 2018-08-23,173.09,175.55,172.83,172.9,18053567
165 | 2018-08-24,173.7,174.82,172.92,174.645,14631556
166 | 2018-08-27,175.99,178.67,175.79,177.46,17921935
167 | 2018-08-28,178.1,178.2399,175.83,176.26,15910675
168 | 2018-08-29,176.295,176.79,174.75,175.9,18678301
169 | 2018-08-30,175.9,179.7901,175.7,177.64,24216532
170 | 2018-08-31,177.15,177.62,174.9815,175.73,18065159
171 | 2018-09-04,173.5,173.89,168.8,171.16,29808971
172 | 2018-09-05,169.49,171.125,166.67,167.18,31226744
173 | 2018-09-06,166.98,166.98,160.0,162.53,41514834
174 | 2018-09-07,160.31,164.6269,160.16,163.04,24300600
175 | 2018-09-10,163.51,165.01,162.16,164.18,20197680
176 | 2018-09-11,163.94,167.19,163.72,165.94,20457088
177 | 2018-09-12,163.25,164.49,161.8,162.0,24078118
178 | 2018-09-13,162.0,163.32,160.86,161.36,25453775
179 | 2018-09-14,161.715,162.84,160.34,162.32,21770405
180 | 2018-09-17,161.92,162.06,159.77,160.58,21005321
181 | 2018-09-18,159.39,161.7639,158.8656,160.3,22465236
182 | 2018-09-19,160.08,163.44,159.48,163.06,19628996
183 | 2018-09-20,164.5,166.45,164.4722,166.02,18936038
184 | 2018-09-21,166.64,167.25,162.81,162.93,45994800
185 | 2018-09-24,161.03,165.7,160.88,165.41,19222775
186 | 2018-09-25,161.99,165.59,161.15,164.91,27622806
187 | 2018-09-26,164.3,169.3,164.21,166.95,25252231
188 | 2018-09-27,167.55,171.77,167.21,168.84,27266856
189 | 2018-09-28,168.33,168.79,162.56,164.46,34265638
190 | 2018-10-01,163.03,165.88,161.26,162.44,26407677
191 | 2018-10-02,161.58,162.28,158.67,159.33,36030977
192 | 2018-10-03,160.0,163.66,159.53,162.43,23109456
193 | 2018-10-04,161.46,161.46,157.35,158.85,25739635
194 | 2018-10-05,159.21,160.9,156.2,157.33,25744047
195 | 2018-10-08,155.54,158.34,154.39,157.25,24045968
196 | 2018-10-09,157.69,160.59,157.42,157.9,18844425
197 | 2018-10-10,156.82,157.69,151.31,151.38,30609970
198 | 2018-10-11,150.13,154.81,149.16,153.35,35338901
199 | 2018-10-12,156.73,156.89,151.2998,153.74,25293492
200 | 2018-10-15,153.32,155.57,152.55,153.52,15433521
201 | 2018-10-16,155.4,159.46,155.01,158.78,19180095
202 | 2018-10-17,159.56,160.49,157.95,159.42,17592003
203 | 2018-10-18,158.51,158.66,153.28,154.92,21675084
204 | 2018-10-19,155.86,157.35,153.55,154.05,19761347
205 | 2018-10-22,154.76,157.34,154.46,154.78,15424658
206 | 2018-10-23,151.22,154.77,150.85,154.39,19095032
207 | 2018-10-24,154.28,154.65,145.6,146.04,27744597
208 | 2018-10-25,147.73,152.21,147.0,150.95,22105696
209 | 2018-10-26,145.82,149.0,143.8,145.37,31303341
210 | 2018-10-29,148.5,148.83,139.03,142.09,31336784
211 | 2018-10-30,139.935,146.64,139.7419,146.22,50528278
212 | 2018-10-31,155.0,156.4,148.96,151.79,60101251
213 | 2018-11-01,151.52,152.75,149.35,151.75,25640786
214 | 2018-11-02,151.8,154.13,148.96,150.35,24708695
215 | 2018-11-05,150.1,150.19,147.44,148.68,15969849
216 | 2018-11-06,149.31,150.97,148.0,149.94,16667124
217 | 2018-11-07,151.57,153.01,149.83,151.53,21877372
218 | 2018-11-08,150.49,150.94,146.74,147.87,24145814
219 | 2018-11-09,146.75,147.76,144.07,144.96,17326898
220 | 2018-11-12,144.48,145.04,140.4899,141.55,18542123
221 | 2018-11-13,142.0,144.88,141.62,142.16,15141710
222 | 2018-11-14,143.7,145.58,141.55,144.22,22068384
223 | 2018-11-15,142.33,144.84,140.83,143.85,30320280
224 | 2018-11-16,141.07,141.77,137.77,139.53,37250560
225 | 2018-11-19,137.61,137.75,131.21,131.55,44362729
226 | 2018-11-20,127.03,134.1592,126.85,132.43,41939475
227 | 2018-11-21,134.4,137.19,134.13,134.82,25469735
228 | 2018-11-23,133.65,134.5,131.2551,131.73,11886128
229 | 2018-11-26,133.0,137.0,132.78,136.38,24263640
230 | 2018-11-27,135.75,136.6126,133.71,135.0,20750318
231 | 2018-11-28,136.28,136.7899,131.85,136.76,29847505
232 | 2018-11-29,135.92,139.99,135.66,138.68,24238713
233 | 2018-11-30,138.26,140.966,137.36,140.61,25732577
234 | 2018-12-03,143.0,143.6799,140.76,141.09,24819226
235 | 2018-12-04,140.73,143.39,137.16,137.93,30307400
236 | 2018-12-06,133.82,139.7,133.67,139.63,28218145
237 | 2018-12-07,139.25,140.87,136.6566,137.42,21195460
238 | 2018-12-10,139.6,143.05,139.01,141.85,26422173
239 | 2018-12-11,143.88,143.88,141.1,142.08,20300349
240 | 2018-12-12,143.08,147.19,142.51,144.5,23696936
241 | 2018-12-13,145.57,145.85,143.19,145.01,18148610
242 | 2018-12-14,143.34,146.01,142.51,144.06,21785820
243 | 2018-12-17,143.08,144.92,138.42,140.19,24333959
244 | 2018-12-18,141.08,145.93,139.8301,143.66,24709084
245 | 2018-12-19,141.21,144.91,132.5,133.24,57404894
246 | 2018-12-20,130.7,135.57,130.0,133.4,40297944
247 | 2018-12-21,133.39,134.9,123.42,124.95,56901491
248 | 2018-12-24,123.1,129.74,123.02,124.06,22066002
249 | 2018-12-26,126.0,134.24,125.89,134.18,39723370
250 | 2018-12-27,132.44,134.99,129.67,134.52,31202509
251 | 2018-12-28,135.34,135.92,132.2,133.2,22627569
252 | 2018-12-31,134.45,134.64,129.95,131.09,24625308
253 |
--------------------------------------------------------------------------------
/ch_06/reg_resid_plot.py:
--------------------------------------------------------------------------------
1 | import itertools
2 |
3 | import matplotlib.pyplot as plt
4 | import seaborn as sns
5 |
def reg_resid_plots(data):
    """
    Draw, with seaborn, a regression plot next to its residuals plot
    for each ordered pair of distinct columns in the data (one row
    of subplots per pair).

    Parameters:
        - data: A pandas DataFrame

    Returns:
        A matplotlib Figure object.
    """
    column_total = data.shape[1]
    # ordered pairs of distinct columns: n options for x, n - 1 left for y
    row_total = column_total * (column_total - 1)

    fig, grid = plt.subplots(row_total, 2, figsize=(15, 8))

    column_pairs = itertools.permutations(data.columns, 2)
    # alternate the two colors from one row to the next
    palette = itertools.cycle(['royalblue', 'darkorange'])

    for (x, y), row, color in zip(column_pairs, grid, palette):
        reg_ax, resid_ax = row
        sns.regplot(x=x, y=y, data=data, ax=reg_ax, color=color)
        sns.residplot(x=x, y=y, data=data, ax=resid_ax, color=color)

    plt.close()
    return fig
--------------------------------------------------------------------------------
/ch_06/std_from_mean_kde.py:
--------------------------------------------------------------------------------
1 | import itertools
2 |
def std_from_mean_kde(data):
    """
    Draw the KDE of a pandas series and mark the mean plus
    1, 2, and 3 standard deviations on either side of it
    with vertical reference lines.

    Parameters:
        - data: pandas Series with numeric data

    Returns:
        Matplotlib Axes object.
    """
    center = data.mean()
    spread = data.std()

    ax = data.plot(kind='kde')
    ax.axvline(center, color='b', alpha=0.2, label='mean')

    bands = list(zip(['green', 'orange', 'red'], [1, 2, 3]))

    # all lines below the mean are drawn first, then all lines above it
    for sign in ['-', '+']:
        for color, multiplier in bands:
            adjustment = multiplier * spread
            if sign == '+':
                value = center + adjustment
                # the matching line below the mean already carries the label
                label = None
            else:
                value = center - adjustment
                label = '{} {}{}{}'.format(
                    r'$\mu$', r'$\pm$', multiplier, r'$\sigma$'
                )
            ax.axvline(value, color=color, label=label, alpha=0.5)

    ax.legend()
    return ax
--------------------------------------------------------------------------------
/ch_07/data/netflix_january_2019.csv:
--------------------------------------------------------------------------------
1 | date,open,high,low,close,volume
2 | 2019-01-02,259.28,269.7499,256.58,267.66,11679528
3 | 2019-01-03,270.2,275.79,264.43,271.2,14969647
4 | 2019-01-04,281.88,297.8,278.54,297.57,19330102
5 | 2019-01-07,302.1,316.8,301.65,315.34,18620116
6 | 2019-01-08,319.98,320.593,308.01,320.27,15359180
7 | 2019-01-09,317.71,323.3508,313.5,319.96,13343152
8 | 2019-01-10,314.57,325.37,312.5,324.66,13472475
9 | 2019-01-11,330.96,341.09,328.52,337.59,19500429
10 | 2019-01-14,334.24,335.48,329.13,332.94,10499582
11 | 2019-01-15,349.6,357.22,347.0,354.64,21181234
12 | 2019-01-16,354.0,358.85,348.11,351.39,15385548
13 | 2019-01-17,349.5,355.79,346.41,353.19,18871195
14 | 2019-01-18,351.97,353.0,336.73,339.1,26621040
15 | 2019-01-22,334.89,336.88,321.03,325.16,17941416
16 | 2019-01-23,328.25,331.75,318.6,321.99,13480138
17 | 2019-01-24,320.6,331.8,319.0,326.67,11131627
18 | 2019-01-25,328.72,340.0,328.51,338.05,11166609
19 | 2019-01-28,334.7,336.3,328.88,335.66,8652082
20 | 2019-01-29,335.87,338.22,328.151,328.9,7655189
21 | 2019-01-30,332.75,341.78,330.8,340.66,9234530
22 | 2019-01-31,339.68,345.99,338.0919,339.5,8535517
23 |
--------------------------------------------------------------------------------
/ch_07/random_walk.py:
--------------------------------------------------------------------------------
1 | import random
2 | import string
3 |
4 | import matplotlib.pyplot as plt
5 |
def random_walk_stock_comparison(df, choices=[-1, 1], probs=[0.5, 0.5], seed=2):
    """
    Model a random walk from a stock's first closing price in the dataframe.
    Displays 3 random walks and the actual data in randomly assigned subplots.
    Can you find the real data?

    Parameters:
        - df: The dataframe of the real stock data. Its `close` column
              is plotted against its index.
        - choices: The choices of step sizes, defaults to [-1, 1].
        - probs: The probability of getting each step size,
                 defaults to [0.5, 0.5]. This should be the same
                 size as choices.
        - seed: The random seed for repeatability.

    Returns:
        A tuple of (message, axes): the message is a string naming the
        subplot (A through D) that holds the actual data, and axes is
        the 2x2 collection of matplotlib Axes objects that were drawn on.
    """
    random.seed(seed)

    _, axes = plt.subplots(2, 2, figsize=(15, 10))
    stock_location = random.randint(0, 3)

    # Every walk starts from the first closing price. Positional access
    # replaces the deprecated `df.first('1B')` lookup, and the value is the
    # same for each subplot, so it is computed once here.
    start_price = df.close.iat[0]
    # the starting price is the first point, so one fewer step than rows
    step_count = len(df.index) - 1

    for i, ax in enumerate(axes.flatten()):
        if i == stock_location:
            ax.plot(df.index, df.close)
        else:
            steps = random.choices(choices, weights=probs, k=step_count)
            walk = [start_price]
            for step in steps:
                walk.append(walk[-1] + step)
            ax.plot(df.index, walk)

        # label every subplot the same way so the real data isn't given away
        ax.set_ylabel('price')
        ax.set_title(string.ascii_uppercase[i])

    real_stock = f'real stock is at location {string.ascii_uppercase[stock_location]}'

    return real_stock, axes
--------------------------------------------------------------------------------
/ch_08/logs/attacks.csv:
--------------------------------------------------------------------------------
1 | start,end,source_ip
2 | 2018-11-01 10:44:29.667759,2018-11-01 10:48:37.667759,23.143.69.122
3 | 2018-11-02 03:27:23.313068,2018-11-02 03:31:18.313068,141.20.164.200
4 | 2018-11-02 06:37:54.905352,2018-11-02 06:42:08.905352,200.115.24.107
5 | 2018-11-02 08:36:24.435229,2018-11-02 08:40:24.435229,207.119.101.16
6 | 2018-11-02 11:34:18.719900,2018-11-02 11:34:39.719900,66.109.118.50
7 | 2018-11-03 14:53:58.156032,2018-11-03 14:56:48.156032,229.230.103.243
8 | 2018-11-04 03:53:18.034815,2018-11-04 03:57:28.034815,182.166.104.62
9 | 2018-11-06 23:15:35.495361,2018-11-06 23:19:47.495361,54.192.138.165
10 | 2018-11-08 20:25:31.468864,2018-11-08 20:29:20.468864,3.37.217.34
11 | 2018-11-08 23:29:01.571634,2018-11-08 23:33:10.571634,219.216.146.199
12 | 2018-11-10 04:33:57.988715,2018-11-10 04:38:10.988715,121.25.210.210
13 | 2018-11-12 20:13:38.282383,2018-11-12 20:17:50.282383,126.173.124.168
14 | 2018-11-14 23:23:46.300364,2018-11-14 23:25:01.300364,63.80.87.152
15 | 2018-11-15 08:28:06.168628,2018-11-15 08:31:53.168628,209.39.194.7
16 | 2018-11-15 19:57:06.739763,2018-11-15 20:01:17.739763,14.105.57.60
17 | 2018-11-15 23:49:21.857030,2018-11-15 23:53:10.857030,35.69.133.207
18 | 2018-11-16 06:36:06.769436,2018-11-16 06:39:52.769436,151.161.63.71
19 | 2018-11-17 08:34:25.007268,2018-11-17 08:38:32.007268,15.25.40.20
20 | 2018-11-18 12:54:00.448884,2018-11-18 12:58:15.448884,146.116.200.234
21 | 2018-11-19 11:37:56.140599,2018-11-19 11:42:07.140599,186.4.202.15
22 | 2018-11-19 13:55:09.201913,2018-11-19 13:56:14.201913,82.192.228.167
23 | 2018-11-19 18:55:45.842577,2018-11-19 18:59:53.842577,7.188.56.193
24 | 2018-11-20 05:41:42.433953,2018-11-20 05:44:05.433953,45.36.27.25
25 | 2018-11-20 22:52:41.179710,2018-11-20 22:56:51.179710,132.46.92.143
26 | 2018-11-21 19:52:41.242397,2018-11-21 19:56:51.242397,141.160.80.85
27 | 2018-11-22 14:41:01.817158,2018-11-22 14:42:29.817158,174.178.69.43
28 | 2018-11-23 22:13:01.743022,2018-11-23 22:16:33.743022,43.212.208.159
29 | 2018-11-24 06:54:14.868831,2018-11-24 06:58:24.868831,146.235.86.65
30 | 2018-11-25 04:57:27.351116,2018-11-25 04:57:55.351116,67.151.67.186
31 | 2018-11-25 21:33:56.485642,2018-11-25 21:38:05.485642,162.180.192.242
32 | 2018-11-25 22:18:47.466777,2018-11-25 22:23:05.466777,135.158.66.165
33 | 2018-11-26 06:04:15.277956,2018-11-26 06:05:12.277956,165.6.227.176
34 | 2018-11-26 19:33:12.128095,2018-11-26 19:37:23.128095,184.129.203.46
35 | 2018-11-27 09:38:57.760709,2018-11-27 09:39:52.760709,45.65.160.229
36 | 2018-11-27 10:02:21.919370,2018-11-27 10:05:30.919370,142.230.199.14
37 | 2018-11-27 20:34:49.331332,2018-11-27 20:36:27.331332,205.116.13.22
38 | 2018-11-29 06:52:45.020568,2018-11-29 06:53:52.020568,50.244.204.83
39 | 2018-11-29 13:58:14.376959,2018-11-29 14:02:27.376959,38.56.22.170
40 | 2018-11-29 23:12:42.663323,2018-11-29 23:16:57.663323,44.123.120.49
41 |
--------------------------------------------------------------------------------
/ch_08/simulate.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import datetime as dt
3 | import os
4 | import logging
5 | import random
6 |
7 | import login_attempt_simulator as sim
8 |
# Logging configuration
# Each record is rendered as "[LEVEL] [ logger name ] message".
FORMAT = '[%(levelname)s] [ %(name)s ] %(message)s'
# Configure the root logger at INFO level with the format above.
logging.basicConfig(level=logging.INFO, format=FORMAT)
# Name this module's logger after the script's file name (directory part dropped).
logger = logging.getLogger(os.path.basename(__file__))
13 |
def get_simulation_file_path(path_provided, directory, default_file):
    """
    Get the path to the file, creating the directory and using the default if necessary.

    Parameters:
        - path_provided: Path chosen by the user; when truthy it is
                         returned unchanged and nothing is created.
        - directory: Directory to place the default file in. It is created
                     (including any missing parent directories) only when
                     no path was provided.
        - default_file: File name to use inside `directory` when no path
                        was provided.

    Returns:
        The path to use for the file.
    """
    if path_provided:
        return path_provided

    # makedirs copes with nested directories and with the directory already
    # existing, which avoids the check-then-create race of exists() + mkdir()
    os.makedirs(directory, exist_ok=True)
    return os.path.join(directory, default_file)
23 |
def get_user_base_file_path(path_provided, default_file):
    """Resolve a file path that defaults to living in the user_data directory."""
    return get_simulation_file_path(
        path_provided, directory='user_data', default_file=default_file
    )
27 |
def get_log_file_path(path_provided, default_file):
    """Resolve a file path that defaults to living in the logs directory."""
    return get_simulation_file_path(
        path_provided, directory='logs', default_file=default_file
    )
31 |
if __name__ == '__main__':
    # command line argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "days", type=float, help="number of days to simulate from start"
    )
    parser.add_argument(
        "start_date", type=str,
        help="datetime to start in the form 'YYYY-MM-DD' or 'YYYY-MM-DD-HH'"
    )
    parser.add_argument(
        "-m", "--make", action='store_true', help="make userbase"
    )
    parser.add_argument(
        "-s", "--seed", type=int, help="set a seed for reproducibility"
    )
    parser.add_argument(
        "-u", "--userbase", help="file to write the userbase to"
    )
    parser.add_argument(
        "-i", "--ip", help="file to write the user-ip address map to"
    )
    parser.add_argument(
        "-l", "--log", help="file to write the attempt log to"
    )
    parser.add_argument(
        "-hl", "--hacklog", help="file to write the hack log to"
    )
    args = parser.parse_args()

    # the mapping file is needed whether or not a new user base is made
    user_ip_mapping_file = get_user_base_file_path(args.ip, 'user_ips.json')

    if args.make:
        logger.warning('Creating new user base and mapping IP addresses to them.')

        user_base_file = get_user_base_file_path(args.userbase, 'user_base.txt')

        # seed the creation of userbase
        random.seed(args.seed)

        # create usernames and write to file
        sim.utils.make_userbase(user_base_file)

        # create one or more IP addresses per user and save mapping to file
        valid_users = sim.utils.get_valid_users(user_base_file)
        sim.utils.save_user_ips(
            sim.utils.assign_ip_addresses(valid_users), user_ip_mapping_file
        )

    try:
        # each dash-separated piece becomes a positional datetime argument
        start = dt.datetime(*map(int, args.start_date.split('-')))
    except TypeError:
        # raised when fewer than year, month, and day were supplied
        logger.error('Start date must be in the format "YYYY-MM-DD"')
        raise
    except ValueError:
        # raised for non-numeric pieces or values outside the valid ranges
        logger.warning(
            f'Could not interpret {args.start_date}, '
            'using January 1, 2019 at 12AM as start instead'
        )
        start = dt.datetime(2019, 1, 1)

    end = start + dt.timedelta(days=args.days)

    try:
        logger.info(f'Simulating {args.days} days...')
        simulator = sim.LoginAttemptSimulator(
            user_ip_mapping_file, start, end, seed=args.seed
        )
        simulator.simulate(attack_prob=0.05, try_all_users_prob=0.5, vary_ips=False)

        # save logs
        logger.info('Saving logs')
        simulator.save_hack_log(get_log_file_path(args.hacklog, 'attacks.csv'))
        simulator.save_log(get_log_file_path(args.log, 'log.csv'))

        logger.info('All done!')
    except Exception:
        # Catch Exception rather than everything so Ctrl+C still interrupts;
        # record the traceback so the failure can be diagnosed, and report
        # the failure to the caller through a non-zero exit status.
        logger.exception('Oops! Something went wrong...')
        raise SystemExit(1)
109 |
--------------------------------------------------------------------------------
/ch_08/user_data/user_base.txt:
--------------------------------------------------------------------------------
1 | asmith
2 | ajones
3 | akim
4 | alopez
5 | abrown
6 | bsmith
7 | bjones
8 | bkim
9 | blopez
10 | bbrown
11 | csmith
12 | cjones
13 | ckim
14 | clopez
15 | cbrown
16 | dsmith
17 | djones
18 | dkim
19 | dlopez
20 | dbrown
21 | esmith
22 | ejones
23 | ekim
24 | elopez
25 | ebrown
26 | fsmith
27 | fjones
28 | fkim
29 | flopez
30 | fbrown
31 | gsmith
32 | gjones
33 | gkim
34 | glopez
35 | gbrown
36 | hsmith
37 | hjones
38 | hkim
39 | hlopez
40 | hbrown
41 | ismith
42 | ijones
43 | ikim
44 | ilopez
45 | ibrown
46 | jsmith
47 | jjones
48 | jkim
49 | jlopez
50 | jbrown
51 | ksmith
52 | kjones
53 | kkim
54 | klopez
55 | kbrown
56 | lsmith
57 | ljones
58 | lkim
59 | llopez
60 | lbrown
61 | msmith
62 | mjones
63 | mkim
64 | mlopez
65 | mbrown
66 | nsmith
67 | njones
68 | nkim
69 | nlopez
70 | nbrown
71 | osmith
72 | ojones
73 | okim
74 | olopez
75 | obrown
76 | psmith
77 | pjones
78 | pkim
79 | plopez
80 | pbrown
81 | qsmith
82 | qjones
83 | qkim
84 | qlopez
85 | qbrown
86 | rsmith
87 | rjones
88 | rkim
89 | rlopez
90 | rbrown
91 | ssmith
92 | sjones
93 | skim
94 | slopez
95 | sbrown
96 | tsmith
97 | tjones
98 | tkim
99 | tlopez
100 | tbrown
101 | usmith
102 | ujones
103 | ukim
104 | ulopez
105 | ubrown
106 | vsmith
107 | vjones
108 | vkim
109 | vlopez
110 | vbrown
111 | wsmith
112 | wjones
113 | wkim
114 | wlopez
115 | wbrown
116 | xsmith
117 | xjones
118 | xkim
119 | xlopez
120 | xbrown
121 | ysmith
122 | yjones
123 | ykim
124 | ylopez
125 | ybrown
126 | zsmith
127 | zjones
128 | zkim
129 | zlopez
130 | zbrown
131 | admin
132 | master
133 | dba
134 |
--------------------------------------------------------------------------------
/ch_08/user_data/user_ips.json:
--------------------------------------------------------------------------------
1 | {"asmith": ["215.20.132.248", "207.155.244.183"], "ajones": ["111.71.144.71", "48.128.75.158", "50.37.169.241"], "akim": ["51.181.222.161", "104.244.226.133", "31.7.47.204"], "alopez": ["166.32.97.113", "122.72.229.46", "41.163.250.55"], "abrown": ["149.63.170.104", "147.227.46.197"], "bsmith": ["123.148.94.96", "95.16.133.243"], "bjones": ["45.66.76.19"], "bkim": ["200.141.120.110"], "blopez": ["214.140.230.252", "182.42.166.59", "8.138.59.112"], "bbrown": ["87.170.218.31", "51.74.112.23"], "csmith": ["37.13.63.96", "61.200.46.189", "59.18.11.99"], "cjones": ["63.245.107.31"], "ckim": ["11.217.51.133", "35.113.36.154", "179.223.92.31"], "clopez": ["239.20.51.200", "102.133.183.240", "86.104.29.81"], "cbrown": ["175.128.60.226"], "dsmith": ["89.6.241.209", "159.182.198.128", "78.6.234.40"], "djones": ["23.143.69.122", "67.158.198.212"], "dkim": ["41.0.98.171", "81.122.114.229", "193.212.16.205"], "dlopez": ["214.23.84.228", "32.132.80.228", "166.159.239.25"], "dbrown": ["96.42.66.7", "205.213.161.1"], "esmith": ["7.1.50.97"], "ejones": ["101.154.143.93"], "ekim": ["140.231.59.131"], "elopez": ["177.58.79.142"], "ebrown": ["21.20.105.132"], "fsmith": ["161.187.21.253", "234.222.190.91", "106.192.149.4"], "fjones": ["77.138.170.172"], "fkim": ["47.173.18.21", "138.83.76.148"], "flopez": ["202.66.150.58", "91.36.154.206"], "fbrown": ["153.212.55.50", "63.245.59.254"], "gsmith": ["19.154.171.79", "85.192.44.33"], "gjones": ["101.113.31.197"], "gkim": ["50.201.148.229"], "glopez": ["111.216.42.188", "112.133.85.220"], "gbrown": ["183.58.32.14"], "hsmith": ["231.103.60.254", "203.131.106.21", "110.74.53.101"], "hjones": ["193.185.77.53", "103.112.4.174"], "hkim": ["162.164.18.75", "131.79.194.150", "33.115.66.20"], "hlopez": ["7.229.169.82", "76.235.190.195"], "hbrown": ["17.46.39.218", "105.148.213.246", "198.119.10.0"], "ismith": ["93.154.130.170", "33.252.134.155", "208.196.196.31"], "ijones": ["65.122.146.171"], "ikim": ["18.246.213.72"], "ilopez": 
["41.77.180.210", "18.238.197.234"], "ibrown": ["51.241.77.10"], "jsmith": ["67.165.53.177"], "jjones": ["196.251.56.30"], "jkim": ["239.172.63.151", "65.198.150.62", "96.19.200.227"], "jlopez": ["97.233.182.38", "22.20.248.130"], "jbrown": ["110.117.47.215"], "ksmith": ["156.58.74.218", "216.43.53.212", "32.50.212.79"], "kjones": ["15.228.220.213", "15.254.166.129", "40.180.36.62"], "kkim": ["15.176.178.91", "5.118.187.36"], "klopez": ["73.106.1.104", "63.3.150.188", "12.119.72.95"], "kbrown": ["57.244.176.132", "66.14.106.185"], "lsmith": ["94.41.52.157", "80.192.75.64"], "ljones": ["161.124.121.94"], "lkim": ["190.214.23.67", "67.215.153.213"], "llopez": ["72.216.152.181", "43.127.227.189", "29.192.209.4"], "lbrown": ["164.225.104.190", "150.241.46.94"], "msmith": ["141.57.78.228"], "mjones": ["94.215.221.89", "126.232.174.72"], "mkim": ["236.44.247.104", "150.0.229.236"], "mlopez": ["111.152.58.154"], "mbrown": ["79.217.241.47", "11.61.138.20", "227.52.129.181"], "nsmith": ["100.43.18.36", "134.156.174.60"], "njones": ["148.144.68.107", "53.210.206.142", "149.226.190.70"], "nkim": ["63.61.195.205"], "nlopez": ["239.71.153.181", "227.153.73.253", "26.110.13.181"], "nbrown": ["200.5.34.41", "202.3.184.21"], "osmith": ["1.138.149.116"], "ojones": ["147.97.53.222"], "okim": ["169.196.86.169", "215.222.75.229"], "olopez": ["75.161.66.106", "95.227.178.199", "218.251.199.112"], "obrown": ["224.104.25.198"], "psmith": ["119.43.95.186"], "pjones": ["88.119.152.44"], "pkim": ["145.180.210.234", "27.220.232.250", "130.243.110.172"], "plopez": ["21.22.26.83", "179.1.148.3"], "pbrown": ["32.219.113.203"], "qsmith": ["113.232.98.173", "52.43.163.165", "233.166.130.14"], "qjones": ["22.97.188.41", "107.177.96.103", "128.154.159.196"], "qkim": ["156.37.4.235", "43.41.123.50"], "qlopez": ["211.109.225.39"], "qbrown": ["201.20.92.127", "180.163.222.54"], "rsmith": ["146.103.151.226", "236.133.139.118", "8.60.50.88"], "rjones": ["212.127.111.145", "3.219.25.62", 
"196.139.60.183"], "rkim": ["144.113.122.33"], "rlopez": ["157.167.119.191", "7.167.187.12", "66.202.79.90"], "rbrown": ["39.69.105.254", "109.120.67.119", "196.180.67.255"], "ssmith": ["13.183.250.233"], "sjones": ["6.112.83.253", "70.205.97.162"], "skim": ["196.30.106.19", "161.127.175.225"], "slopez": ["115.133.176.83", "156.8.182.29", "77.180.11.251"], "sbrown": ["31.12.123.23", "6.115.167.34", "31.176.216.69"], "tsmith": ["229.222.72.183"], "tjones": ["90.168.209.195", "4.209.135.236"], "tkim": ["62.209.199.87"], "tlopez": ["70.75.40.168"], "tbrown": ["90.126.11.86"], "usmith": ["86.40.218.53", "234.76.20.129", "174.192.13.18"], "ujones": ["45.183.149.77", "234.120.182.83"], "ukim": ["207.172.138.252", "201.7.159.147", "233.202.61.206"], "ulopez": ["17.130.148.106", "174.197.128.106"], "ubrown": ["168.124.180.83"], "vsmith": ["169.4.26.79"], "vjones": ["185.149.150.165", "64.174.4.245"], "vkim": ["131.96.35.217", "142.89.86.32", "80.56.196.222"], "vlopez": ["159.145.6.219", "143.132.162.175"], "vbrown": ["221.72.3.79"], "wsmith": ["196.184.237.18", "210.116.8.185", "81.99.181.254"], "wjones": ["215.39.229.121"], "wkim": ["230.51.96.84", "225.33.218.202", "139.129.223.182"], "wlopez": ["167.46.157.15", "203.197.223.199", "19.238.181.64"], "wbrown": ["142.167.12.203", "177.184.2.35", "97.56.241.22"], "xsmith": ["12.161.201.64", "140.208.72.75"], "xjones": ["156.30.83.64", "68.246.23.22"], "xkim": ["198.92.176.42", "41.89.135.103", "133.167.129.132"], "xlopez": ["233.79.229.78", "19.90.17.161", "36.98.233.122"], "xbrown": ["81.170.69.243", "29.42.175.1"], "ysmith": ["53.218.180.231"], "yjones": ["193.185.60.69", "162.11.93.64"], "ykim": ["172.98.22.211"], "ylopez": ["31.159.199.26", "86.183.39.209", "27.225.181.130"], "ybrown": ["158.232.211.92", "15.232.135.97", "198.32.183.49"], "zsmith": ["13.179.10.90"], "zjones": ["6.165.233.253", "134.14.49.41"], "zkim": ["183.50.241.16", "78.146.18.0"], "zlopez": ["172.80.75.82", "89.81.124.168"], "zbrown": 
["123.144.168.87"], "admin": ["180.115.83.215"], "master": ["186.70.197.6", "83.2.199.88"], "dba": ["9.12.165.1"]}
--------------------------------------------------------------------------------
/ch_09/data/binaries.csv:
--------------------------------------------------------------------------------
1 | separation,name,positionangle,binaries,planets,stars
2 | 39.56,16 Cygni,133.30,1.0,1.0,3.0
3 | 3.4,16 Cygni AC,209,0.0,0.0,2.0
4 | 12.37,2M0441+2301,237.3,1.0,1.0,3.0
5 | 0.2323,2M 044145,79.61,0.0,0.0,2.0
6 | ,2M 1938+4603,,0.0,1.0,2.0
7 | 2.17,2MASS J02495639-0557352,233.1,0.0,1.0,2.0
8 | 38.00,30 Ari,275,2.0,1.0,4.0
9 | ,30 Ari A,,0.0,0.0,2.0
10 | 0.536,30 Ari BC,285.2,0.0,1.0,2.0
11 | 66.70,WDS J04376-0228,163,1.0,1.0,3.0
12 | ,GJ 3305,,0.0,0.0,2.0
13 | 84,55 Cancri,130,0.0,5.0,2.0
14 | 28.6,83 Leonis,150,0.0,2.0,2.0
15 | 49.4,91 Aquarii,313,1.0,1.0,3.0
16 | ,91 Aquarii BC,,0.0,0.0,2.0
17 | 31.60,alf Tau AB,113,0.0,1.0,2.0
18 | ,,,1.0,3.0,3.0
19 | ,Alpha Centauri,,0.0,2.0,2.0
20 | ,DP Leo,,0.0,1.0,2.0
21 | 17.25,,,0.0,1.0,2.0
22 | 24.14,,,0.0,1.0,2.0
23 | 9.78,,,0.0,1.0,2.0
24 | 21.21,EPIC 201549860,,0.0,2.0,2.0
25 | 12.3,,,0.0,1.0,2.0
26 | 1.9,EPIC 201637175,227,0.0,1.0,2.0
27 | 12.15,,,0.0,1.0,2.0
28 | 2.46,,,0.0,1.0,2.0
29 | 2.00,WDS J15576+2653,10,0.0,1.0,2.0
30 | 403.10,Epsilon Indi,88,1.0,1.0,3.0
31 | ,Epsilon Indi B,,0.0,0.0,2.0
32 | ,FL Lyr,,0.0,1.0,2.0
33 | 20407.6,MAM 1,337.91,1.0,1.0,3.0
34 | 7062.7,SHY 106,187.88,0.0,1.0,2.0
35 | 0.0754,FW Tau,3.4,0.0,1.0,2.0
36 | ,gamma Cephei,,0.0,1.0,2.0
37 | 4.60,Gamma Leonis,127,0.0,1.0,2.0
38 | 6.80,GJ 229,168,0.0,1.0,2.0
39 | ,GJ 725,,0.0,1.0,2.0
40 | ,Gliese 15,,0.0,1.0,2.0
41 | 3.90,,355,0.0,1.0,2.0
42 | 32.70,,142,1.0,7.0,3.0
43 | ,,,0.0,0.0,2.0
44 | 50.0,,135,0.0,4.0,2.0
45 | 179,,234,0.0,2.0,2.0
46 | ,Gliese 86,,0.0,1.0,2.0
47 | 0.8576,HAT-P-14,264.10,0.0,1.0,2.0
48 | 0.6916,HAT-P-16,153.83,0.0,1.0,2.0
49 | 4.9442,HAT-P-24,170.872,0.0,1.0,2.0
50 | 3.8366,HAT-P-30,4.206,0.0,1.0,2.0
51 | 2.9355,HAT-P-32,110.624,0.0,1.0,2.0
52 | 0.3063,HAT-P-33,118.05,0.0,1.0,2.0
53 | 2.667,HAT-P-57,234,1.0,1.0,3.0
54 | 0.225,HAT-P-57 BC,165,0.0,0.0,2.0
55 | 3.8587,HAT-P-7,89.8,0.0,1.0,2.0
56 | 1.040,,137.9,1.0,1.0,3.0
57 | 15,,222,0.0,0.0,2.0
58 | ,HD 106906,,0.0,1.0,2.0
59 | 8.28,,179.75,0.0,1.0,2.0
60 | 8.05,,333,0.0,1.0,2.0
61 | 3.26,,30,0.0,1.0,2.0
62 | 29.7,HD 11964,133,0.0,2.0,2.0
63 | 41.9,HD 126614 AC,299,1.0,1.0,3.0
64 | 0.499,HD 126614 AB,60.70,0.0,1.0,2.0
65 | ,HD 131399,,2.0,1.0,4.0
66 | ,HD 131399 A,,0.0,1.0,2.0
67 | 0.10,WDS J14544-3409 B,235,0.0,0.0,2.0
68 | 4.130,HD 132563,276.95,1.0,1.0,3.0
69 | ,HD 132563 A,,0.0,0.0,2.0
70 | ,,,0.0,3.0,2.0
71 | 20.4,,130,0.0,1.0,2.0
72 | 5.4,,177,0.0,2.0,2.0
73 | 64.40,Gliese 617,14,0.0,1.0,2.0
74 | 345,,245,0.0,1.0,2.0
75 | 4.80,HD 156846,73,0.0,1.0,2.0
76 | 6.2,,187,0.0,1.0,2.0
77 | ,HD 176051,,0.0,1.0,2.0
78 | 1.645,HD 177830,84.85,0.0,2.0,2.0
79 | 16.1,,82,1.0,1.0,3.0
80 | ,,,0.0,0.0,2.0
81 | 75.80,LDS 6334,152,0.0,2.0,2.0
82 | 4.547,HD 185269,8.15,1.0,1.0,3.0
83 | 0.0956,,221.1,0.0,0.0,2.0
84 | 13,,85,0.0,1.0,2.0
85 | 11.2,,246,0.0,1.0,2.0
86 | 3.5,,330,0.0,1.0,2.0
87 | 10.9,,175,1.0,1.0,3.0
88 | ,,,0.0,0.0,2.0
89 | ,,,0.0,1.0,2.0
90 | 3.676,HD 197037,182.14,0.0,1.0,2.0
91 | ,,,0.0,1.0,2.0
92 | 253.00,WDS J03201-2851,358.11,0.0,3.0,2.0
93 | 95.8,,127,0.0,1.0,2.0
94 | 2.8560,HD 217786,170.34,0.0,1.0,2.0
95 | 113,,302,0.0,1.0,2.0
96 | 839.60,WDS J00293-0555,51,1.0,1.0,3.0
97 | 0.526,HD 2638 BC,166.7,0.0,1.0,2.0
98 | 83.70,40 Eri,102,1.0,1.0,3.0
99 | ,40 Eri BC,,0.0,0.0,2.0
100 | 13.8,,36,0.0,1.0,2.0
101 | 284,,305,0.0,2.0,2.0
102 | 192,,290,1.0,1.0,3.0
103 | ,,,0.0,0.0,2.0
104 | ,,,1.0,1.0,3.0
105 | ,,,0.0,0.0,2.0
106 | 49.000,HD 4113,350.30,1.0,1.0,3.0
107 | ,HD 4113 AC,,0.0,1.0,2.0
108 | 9.4,,308,0.0,1.0,2.0
109 | ,HD 59686,,0.0,1.0,2.0
110 | ,HD 7449,,0.0,2.0,2.0
111 | 21.5,,78,0.0,1.0,2.0
112 | 20.6,,269,0.0,1.0,2.0
113 | 0.308,HD 8673,333.4,0.0,1.0,2.0
114 | 63.0,,48,0.0,1.0,2.0
115 | 377,,262,0.0,1.0,2.0
116 | ,HU Aqr,,0.0,3.0,2.0
117 | ,,,1.0,1.0,3.0
118 | ,,,0.0,1.0,2.0
119 | 0.730,K2-136,7.9,0.0,3.0,2.0
120 | 2.29,KELT-2,328.6,0.0,1.0,2.0
121 | 1.5,,,1.0,1.0,3.0
122 | 0.034,KELT-4BC,,0.0,0.0,2.0
123 | 1.05,Kepler-108,118.4,0.0,2.0,2.0
124 | 1.15,Kepler-13,281,1.0,1.0,3.0
125 | ,Kepler-13 BC,,0.0,0.0,2.0
126 | ,Kepler-16,,0.0,1.0,2.0
127 | 0.7739,Kepler-21,129.53,0.0,1.0,2.0
128 | 0.217,,217.3,0.0,5.0,2.0
129 | ,Kepler-34,,0.0,1.0,2.0
130 | ,Kepler-35,,0.0,1.0,2.0
131 | ,Kepler-38,,0.0,1.0,2.0
132 | 1.70,,35,0.0,1.0,2.0
133 | ,,,0.0,1.0,2.0
134 | ,Kepler-444,,1.0,5.0,3.0
135 | ,,,0.0,0.0,2.0
136 | 0.08,Kepler-449,68.4,0.0,2.0,2.0
137 | 0.9,Kepler-450,,0.0,3.0,2.0
138 | ,Kepler-47,,0.0,2.0,2.0
139 | 10.979,Kepler-68,145.43,0.0,3.0,2.0
140 | 0.4104,KIC 7177553,193.6,2.0,1.0,4.0
141 | ,KIC 7177553 A,,0.0,1.0,2.0
142 | ,KIC 7177553 B,,0.0,0.0,2.0
143 | ,KIC 9632895,,0.0,1.0,2.0
144 | ,KOI-1257,,0.0,1.0,2.0
145 | 0.8730,KOI-1299,20.86,0.0,2.0,2.0
146 | 2.89,KOI-2939,176.02,1.0,1.0,3.0
147 | ,KOI-2939 AB,,0.0,1.0,2.0
148 | ,,,0.0,2.0,2.0
149 | ,,,0.0,2.0,2.0
150 | ,nu Oct,,0.0,1.0,2.0
151 | ,,,0.0,1.0,2.0
152 | ,,,0.0,1.0,2.0
153 | 48,,,0.0,1.0,2.0
154 | 10.536,,,0.0,1.0,2.0
155 | 1.74,OGLE-2013-BLG-0723L,,0.0,1.0,2.0
156 | 2.96,,,0.0,1.0,2.0
157 | 0.7,PH-1,123,2.0,1.0,4.0
158 | ,PH-1 A,,0.0,1.0,2.0
159 | ,PH-1 B,,0.0,0.0,2.0
160 | 30.104,Psi-1 Draconis,15.49,1.0,1.0,3.0
161 | ,HD 162003,,0.0,0.0,2.0
162 | ,B1620-26,,0.0,1.0,2.0
163 | 0.475,Ross 458 AB,81.5,0.0,1.0,2.0
164 | 0.083,ROXs 42 B,157.9,0.0,1.0,2.0
165 | ,,,0.0,1.0,2.0
166 | 2.8,SR 12,96,0.0,1.0,2.0
167 | ,tau Boo,,0.0,1.0,2.0
168 | 1.90,tau Gem,177,0.0,1.0,2.0
169 | 1.1054,TrES-2,136.325,0.0,1.0,2.0
170 | 1.555,TrES-4,359.8,0.0,1.0,2.0
171 | 52,,150,0.0,4.0,2.0
172 | 0.1094,VHS 1256-1257,173.3,0.0,1.0,2.0
173 | 0.3425,WASP-11,214.09,0.0,1.0,2.0
174 | 1.064,,251.3,1.0,1.0,3.0
175 | 0.0843,,84,0.0,0.0,2.0
176 | ,,,0.0,1.0,2.0
177 | 1.4491,WASP-14,102.210,0.0,1.0,2.0
178 | 6.1,WASP-173,110.1,0.0,1.0,2.0
179 | 1587,WASP-1,1.953,0.0,1.0,2.0
180 | 0.757,WASP-2,104.7,0.0,1.0,2.0
181 | 1.1910,WASP-3,87.070,0.0,1.0,2.0
182 | 3.3,WASP-70,167,0.0,1.0,2.0
183 | 3.30,,150,0.0,1.0,2.0
184 | 1.48,,99.6,0.0,1.0,2.0
185 | 4.5052,WASP-8,170.948,0.0,2.0,2.0
186 | 2700,,,0.0,2.0,2.0
187 | 31,,342,0.0,4.0,2.0
188 |
--------------------------------------------------------------------------------
/ch_09/data/sample_roc_curves.csv:
--------------------------------------------------------------------------------
1 | x,y,label
2 | 0.0,0.0,good
3 | 0.01,0.21,good
4 | 0.02,0.25,good
5 | 0.03,0.29,good
6 | 0.04,0.4,good
7 | 0.05,0.47,good
8 | 0.06,0.51,good
9 | 0.07,0.52,good
10 | 0.08,0.55,good
11 | 0.09,0.56,good
12 | 0.1,0.6,good
13 | 0.12,0.61,good
14 | 0.13,0.61,good
15 | 0.15,0.69,good
16 | 0.16,0.71,good
17 | 0.17,0.73,good
18 | 0.21,0.8,good
19 | 0.22,0.81,good
20 | 0.24,0.85,good
21 | 0.25,0.87,good
22 | 0.27,0.89,good
23 | 0.3,0.89,good
24 | 0.37,0.92,good
25 | 0.41,0.94,good
26 | 0.45,0.96,good
27 | 0.47,0.96,good
28 | 0.5,0.96,good
29 | 0.56,0.98,good
30 | 0.6,0.98,good
31 | 0.66,0.98,good
32 | 0.7,0.98,good
33 | 0.75,0.98,good
34 | 0.9,1.0,good
35 | 1.0,1.0,good
36 | 0.0,0.0,better
37 | 0.01,0.26,better
38 | 0.02,0.44999999999999996,better
39 | 0.03,0.54,better
40 | 0.04,0.65,better
41 | 0.05,0.72,better
42 | 0.06,0.76,better
43 | 0.07,0.77,better
44 | 0.08,0.8,better
45 | 0.09,0.81,better
46 | 0.1,0.85,better
47 | 0.12,0.86,better
48 | 0.13,0.86,better
49 | 0.15,0.8899999999999999,better
50 | 0.16,0.9099999999999999,better
51 | 0.17,0.9299999999999999,better
52 | 0.21,0.95,better
53 | 0.22,0.95,better
54 | 0.24,0.98,better
55 | 0.25,0.98,better
56 | 0.27,0.98,better
57 | 0.3,0.98,better
58 | 0.37,0.98,better
59 | 0.41,0.99,better
60 | 0.45,0.99,better
61 | 0.47,1.0,better
62 | 0.5,1.0,better
63 | 0.56,1.0,better
64 | 0.6,1.0,better
65 | 0.66,1.0,better
66 | 0.7,1.0,better
67 | 0.75,1.0,better
68 | 1.0,1.0,better
69 | 1.0,1.0,better
70 | 0.0,0.0,best
71 | 0.01,0.31,best
72 | 0.02,0.75,best
73 | 0.03,0.9,best
74 | 0.04,0.95,best
75 | 0.05,0.96,best
76 | 0.06,0.98,best
77 | 0.07,0.98,best
78 | 0.08,0.98,best
79 | 0.09,0.98,best
80 | 0.1,0.98,best
81 | 0.12,0.99,best
82 | 0.13,0.99,best
83 | 0.15,0.99,best
84 | 0.16,0.99,best
85 | 0.17,0.99,best
86 | 0.21,1.0,best
87 | 0.22,1.0,best
88 | 0.24,1.0,best
89 | 0.25,1.0,best
90 | 0.27,1.0,best
91 | 0.3,1.0,best
92 | 0.37,1.0,best
93 | 0.41,1.0,best
94 | 0.45,1.0,best
95 | 0.47,1.0,best
96 | 0.5,1.0,best
97 | 0.56,1.0,best
98 | 0.6,1.0,best
99 | 0.66,1.0,best
100 | 0.7,1.0,best
101 | 0.75,1.0,best
102 | 1.0,1.0,best
103 | 1.0,1.0,best
104 |
--------------------------------------------------------------------------------
/ch_11/0-simulating_the_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Simulating the data\n",
8 | "Before we go into some exploratory data analysis, let's see how we simulated the data:\n",
9 | "\n",
10 | "```\n",
11 | "# example of how to run this simulation from jupyter (remove the ! to run from the command line)\n",
12 | "!python simulate.py -s 0 --stealthy -l logs/jan_2018.csv -hl logs/hackers_jan_2018.csv 31 \"2018-01-01\" 0.01 0.5\n",
13 | "```\n",
14 | "\n",
15 | "| Month | Probability of attack in a given hour | Probability of trying entire userbase | Vary IP addresses? |\n",
16 | "| --- | --- | --- | --- |\n",
17 | "| Jan 2018 | 1.00% | 50% | Yes |\n",
18 | "| Feb 2018 | 0.50% | 25% | Yes |\n",
19 | "| Mar 2018 | 0.10% | 10% | Yes |\n",
20 | "| Apr 2018 | 1.00% | 65% | Yes |\n",
21 | "| May 2018 | 0.01% | 5% | Yes |\n",
22 | "| Jun 2018 | 0.05% | 5% | Yes |\n",
23 | "| Jul 2018 | 1.00% | 15% | Yes |\n",
24 | "| Aug 2018 | 0.50% | 10% | Yes |\n",
25 | "| Sep 2018 | 0.50% | 10% | No |\n",
26 | "| Oct 2018 | 0.20% | 12% | No |\n",
27 | "| Nov 2018 | 0.70% | 17% | Yes |\n",
28 | "| Dec 2018 | 8.00% | 88% | Yes |\n",
29 | "| Jan 2019 | 0.80% | 8% | Yes |\n",
30 | "| Feb 2019 | 0.10% | 18% | Yes |\n",
31 | "| Mar 2019 | 0.10% | 18% | Yes |\n",
32 | "\n",
33 | "We use pandas to combine the files by year. First, we create a utility function for concatenating the files:\n",
34 | "\n",
35 | "```\n",
36 | "import pandas as pd\n",
37 | "\n",
38 | "def cat_csvs(format_string_file_pattern, index_col, month_list):\n",
39 | " \"\"\"\n",
40 | " Utility function for concatenating CSV files from simulation.\n",
41 | " \n",
42 | " Parameters: \n",
43 | " - format_string_file_pattern: The pattern for the file name with `{}` in the place of the month\n",
44 | " - index_col: The column with the datetimes to sort on.\n",
45 | " - month_list: The list of the months as formatted in the file names.\n",
46 | " \n",
47 | " Returns:\n",
48 | " A concatenated pandas DataFrame\n",
49 | " \"\"\"\n",
50 | " return pd.concat([\n",
51 | " pd.read_csv(\n",
52 | " format_string_file_pattern.format(file), index_col=index_col, parse_dates=True\n",
53 | " ) for file in month_list\n",
54 | " ])\n",
55 | "```\n",
56 | "\n",
57 | "Next, we concatenate the 2018 logs making sure to not record any data from early January 1, 2019 which may have been generated from the Poisson process in December 2018:\n",
58 | "```\n",
59 | "logs_2018 = cat_csvs(\n",
60 | " 'logs/{}_2018.csv', 'datetime', \n",
61 | " ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']\n",
62 | ")\n",
63 | "logs_2018['2018'].sort_index().to_csv('logs/logs_2018.csv') # sometimes the simulation overshoots the end date\n",
64 | "```\n",
65 | "\n",
66 | "Now, we concatenate the 2019 logs remembering to add back the 2019 entries that got into the December 2018 simulation and clip the April 2019 entries from the March simulation:\n",
67 | "```\n",
68 | "logs_2019 = pd.concat([cat_csvs('logs/{}_2019.csv', 'datetime', ['jan', 'feb', 'mar']), logs_2018['2019']])\n",
69 | "logs_2019['2019-Q1'].to_csv('logs/logs_2019.csv') # sometimes the simulation overshoots the end date\n",
70 | "```\n",
71 | "\n",
72 | "After we have the login attempts logs, we concatenate the 2018 hacker logs:\n",
73 | "```\n",
74 | "hackers_2018 = cat_csvs(\n",
75 | " 'logs/hackers_{}_2018.csv', 'start', \n",
76 | " ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']\n",
77 | ")\n",
78 | "hackers_2018['2018'].sort_index().to_csv('logs/hackers_2018.csv')\n",
79 | "```\n",
80 | "\n",
81 | "Concatenating the 2019 hacker logs is the same process:\n",
82 | "```\n",
83 | "hackers_2019 = pd.concat([\n",
84 | " cat_csvs('logs/hackers_{}_2019.csv', 'start', ['jan', 'feb', 'mar']), hackers_2018['2019']\n",
85 | "])\n",
86 | "hackers_2019['2019-Q1'].sort_index().to_csv('logs/hackers_2019.csv')\n",
87 | "```\n",
88 | "\n",
89 | "The process of building the CSV files from the individual simulations is contained in `merge_logs.py` and the entire process is in the bash script `run_simulations.sh`. You don't have to run either of these."
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "# Create SQLite Database"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 1,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "import sqlite3\n",
106 | "import numpy as np\n",
107 | "import pandas as pd\n",
108 | "\n",
109 | "# read in files\n",
110 | "logs_2018 = pd.read_csv('logs/logs_2018.csv', index_col='datetime')\n",
111 | "logs_2019 = pd.read_csv('logs/logs_2019.csv', index_col='datetime')\n",
112 | "hackers_2018 = pd.read_csv('logs/hackers_2018.csv', index_col='start')\n",
113 | "hackers_2019 = pd.read_csv('logs/hackers_2019.csv', index_col='start')\n",
114 | "\n",
115 | "# write to database\n",
116 | "with sqlite3.connect('logs/logs.db') as conn:\n",
117 | " logs_2018.to_sql('logs', conn, if_exists='replace')\n",
118 | " logs_2019.to_sql('logs', conn, if_exists='append')\n",
119 | " hackers_2018.to_sql('attacks', conn, if_exists='replace')\n",
120 | " hackers_2019.to_sql('attacks', conn, if_exists='append')"
121 | ]
122 | }
123 | ],
124 | "metadata": {
125 | "kernelspec": {
126 | "display_name": "Python 3",
127 | "language": "python",
128 | "name": "python3"
129 | },
130 | "language_info": {
131 | "codemirror_mode": {
132 | "name": "ipython",
133 | "version": 3
134 | },
135 | "file_extension": ".py",
136 | "mimetype": "text/x-python",
137 | "name": "python",
138 | "nbconvert_exporter": "python",
139 | "pygments_lexer": "ipython3",
140 | "version": "3.7.2"
141 | }
142 | },
143 | "nbformat": 4,
144 | "nbformat_minor": 2
145 | }
146 |
--------------------------------------------------------------------------------
/ch_11/logs/hackers_2018.csv:
--------------------------------------------------------------------------------
1 | start,end,source_ip
2 | 2018-01-05 06:03:42.470259,2018-01-05 06:03:51.470259,170.9.4.108
3 | 2018-01-11 03:08:43.284085,2018-01-11 03:09:14.284085,27.255.30.3
4 | 2018-01-17 00:41:43.985324,2018-01-17 00:45:56.985324,226.98.192.152
5 | 2018-01-21 10:34:57.842776,2018-01-21 10:38:01.842776,102.178.107.171
6 | 2018-01-21 23:12:10.852725,2018-01-21 23:12:38.852725,48.172.61.152
7 | 2018-01-23 20:56:23.219809,2018-01-23 20:58:40.219809,62.209.68.197
8 | 2018-01-26 05:56:35.872139,2018-01-26 06:00:42.872139,27.193.94.129
9 | 2018-02-12 17:33:22.974691,2018-02-12 17:35:53.974691,93.253.75.244
10 | 2018-02-18 06:54:30.130457,2018-02-18 06:58:47.130457,51.221.142.249
11 | 2018-02-19 01:36:20.192653,2018-02-19 01:37:31.192653,139.251.218.76
12 | 2018-02-23 19:56:41.865629,2018-02-23 19:57:59.865629,84.4.101.68
13 | 2018-04-02 01:55:20.503176,2018-04-02 01:59:28.503176,131.59.202.195
14 | 2018-04-03 00:16:23.223652,2018-04-03 00:20:27.223652,13.94.42.10
15 | 2018-07-03 22:52:58.170083,2018-07-03 22:56:34.170083,74.178.145.82
16 | 2018-07-19 22:25:03.009372,2018-07-19 22:29:12.009372,6.199.47.120
17 | 2018-07-19 23:50:18.364552,2018-07-19 23:53:10.364552,112.214.73.179
18 | 2018-07-28 13:34:34.220996,2018-07-28 13:36:23.220996,229.204.248.216
19 | 2018-07-31 03:30:46.577582,2018-07-31 03:34:57.577582,142.85.18.115
20 | 2018-08-03 02:28:00.788461,2018-08-03 02:29:37.788461,172.5.153.120
21 | 2018-08-04 10:21:32.855163,2018-08-04 10:23:32.855163,208.224.58.84
22 | 2018-08-05 07:47:05.202707,2018-08-05 07:47:13.202707,59.115.153.240
23 | 2018-08-26 19:48:07.338588,2018-08-26 19:49:02.338588,152.34.193.165
24 | 2018-09-09 11:04:19.470385,2018-09-09 11:08:27.470385,78.174.30.56
25 | 2018-09-11 18:57:00.687504,2018-09-11 18:58:12.687504,17.4.47.84
26 | 2018-09-12 20:52:34.155677,2018-09-12 20:55:28.155677,173.207.252.26
27 | 2018-09-17 15:30:25.969556,2018-09-17 15:33:32.969556,22.210.104.44
28 | 2018-09-18 17:46:09.932924,2018-09-18 17:48:38.932924,174.147.116.255
29 | 2018-09-19 11:08:32.108147,2018-09-19 11:11:07.108147,138.101.91.226
30 | 2018-09-21 19:31:50.003252,2018-09-21 19:35:21.003252,215.189.60.53
31 | 2018-09-24 23:26:31.709160,2018-09-24 23:30:34.709160,228.144.254.255
32 | 2018-10-06 00:04:11.688537,2018-10-06 00:06:08.688537,12.85.219.94
33 | 2018-11-01 04:12:34.982693,2018-11-01 04:16:19.982693,104.8.35.137
34 | 2018-11-07 01:50:33.596719,2018-11-07 01:51:16.596719,183.16.40.217
35 | 2018-11-11 14:21:01.413492,2018-11-11 14:25:06.413492,107.210.163.30
36 | 2018-11-15 10:06:25.095924,2018-11-15 10:09:03.095924,105.178.119.27
37 | 2018-11-20 18:50:37.325372,2018-11-20 18:54:49.325372,118.189.202.82
38 | 2018-11-23 16:26:28.482645,2018-11-23 16:28:14.482645,88.199.57.22
39 | 2018-11-24 04:34:58.053877,2018-11-24 04:35:43.053877,206.121.220.195
40 | 2018-12-01 00:06:51.879131,2018-12-01 00:07:06.879131,168.10.158.149
41 | 2018-12-01 06:43:34.778738,2018-12-01 06:45:41.778738,137.174.91.123
42 | 2018-12-01 12:29:34.979806,2018-12-01 12:33:48.979806,218.114.210.223
43 | 2018-12-01 18:19:18.507327,2018-12-01 18:23:31.507327,118.29.144.220
44 | 2018-12-02 13:22:42.567875,2018-12-02 13:26:55.567875,29.101.15.78
45 | 2018-12-02 20:05:21.387524,2018-12-02 20:09:29.387524,121.120.155.251
46 | 2018-12-02 21:09:01.894714,2018-12-02 21:13:04.894714,232.222.234.177
47 | 2018-12-02 22:40:49.319515,2018-12-02 22:45:01.319515,123.182.42.106
48 | 2018-12-03 01:28:16.456126,2018-12-03 01:32:24.456126,118.93.53.14
49 | 2018-12-03 04:59:47.275187,2018-12-03 05:02:37.275187,151.246.57.5
50 | 2018-12-03 16:52:01.590703,2018-12-03 16:56:17.590703,211.229.145.233
51 | 2018-12-04 00:20:53.519253,2018-12-04 00:25:04.519253,111.202.163.183
52 | 2018-12-04 05:21:35.664250,2018-12-04 05:25:43.664250,105.35.130.3
53 | 2018-12-04 22:17:16.477064,2018-12-04 22:21:26.477064,8.245.180.35
54 | 2018-12-06 06:49:29.023733,2018-12-06 06:53:35.023733,135.248.127.1
55 | 2018-12-06 07:39:45.474778,2018-12-06 07:43:53.474778,14.230.220.30
56 | 2018-12-07 05:06:07.981164,2018-12-07 05:10:22.981164,191.157.174.232
57 | 2018-12-07 07:01:52.126580,2018-12-07 07:03:26.126580,58.205.207.0
58 | 2018-12-09 08:15:30.388657,2018-12-09 08:19:38.388657,215.47.10.84
59 | 2018-12-10 00:54:42.208933,2018-12-10 00:55:12.208933,100.228.159.98
60 | 2018-12-10 04:10:01.520520,2018-12-10 04:12:23.520520,209.19.127.19
61 | 2018-12-10 07:19:15.373502,2018-12-10 07:23:30.373502,154.165.28.184
62 | 2018-12-10 10:16:13.278728,2018-12-10 10:20:32.278728,211.164.183.50
63 | 2018-12-10 19:56:52.369200,2018-12-10 20:01:00.369200,43.222.72.24
64 | 2018-12-11 14:54:49.328939,2018-12-11 14:59:07.328939,36.117.206.37
65 | 2018-12-12 04:50:14.055379,2018-12-12 04:54:26.055379,59.12.7.192
66 | 2018-12-12 10:15:23.076522,2018-12-12 10:19:31.076522,2.190.1.26
67 | 2018-12-12 13:20:16.800584,2018-12-12 13:24:29.800584,15.26.194.139
68 | 2018-12-12 14:13:06.789965,2018-12-12 14:17:17.789965,210.54.123.17
69 | 2018-12-13 05:54:40.470621,2018-12-13 05:58:59.470621,145.195.179.188
70 | 2018-12-13 14:52:35.667099,2018-12-13 14:56:34.667099,203.31.153.203
71 | 2018-12-13 17:51:29.542459,2018-12-13 17:55:47.542459,233.179.120.157
72 | 2018-12-14 09:46:15.839045,2018-12-14 09:50:25.839045,206.47.210.131
73 | 2018-12-14 10:39:20.772384,2018-12-14 10:43:24.772384,230.109.220.92
74 | 2018-12-14 13:08:01.549749,2018-12-14 13:12:12.549749,24.137.19.206
75 | 2018-12-14 16:25:18.228474,2018-12-14 16:29:24.228474,32.105.154.210
76 | 2018-12-15 03:59:23.509947,2018-12-15 04:03:32.509947,12.101.52.83
77 | 2018-12-15 09:54:52.168127,2018-12-15 09:59:11.168127,211.99.79.128
78 | 2018-12-16 00:27:27.587855,2018-12-16 00:31:35.587855,26.121.6.174
79 | 2018-12-16 16:42:04.730609,2018-12-16 16:46:12.730609,234.78.210.179
80 | 2018-12-16 19:21:33.557561,2018-12-16 19:25:46.557561,8.114.151.106
81 | 2018-12-18 10:36:03.439159,2018-12-18 10:40:14.439159,36.61.121.85
82 | 2018-12-19 20:58:39.747942,2018-12-19 21:02:47.747942,86.18.45.118
83 | 2018-12-19 21:44:18.269827,2018-12-19 21:48:30.269827,65.209.187.180
84 | 2018-12-20 00:43:20.356847,2018-12-20 00:47:28.356847,113.26.222.48
85 | 2018-12-20 06:07:46.630802,2018-12-20 06:11:52.630802,230.26.154.133
86 | 2018-12-20 12:26:56.839296,2018-12-20 12:31:05.839296,171.147.124.35
87 | 2018-12-21 00:31:19.644809,2018-12-21 00:35:29.644809,197.71.159.62
88 | 2018-12-21 22:22:38.012453,2018-12-21 22:26:43.012453,121.115.24.225
89 | 2018-12-22 09:45:01.755742,2018-12-22 09:49:15.755742,69.130.109.64
90 | 2018-12-22 17:02:03.504117,2018-12-22 17:06:15.504117,211.219.25.119
91 | 2018-12-23 08:29:59.530428,2018-12-23 08:34:12.530428,40.48.69.140
92 | 2018-12-23 20:43:58.797374,2018-12-23 20:48:03.797374,49.79.153.96
93 | 2018-12-24 04:45:53.184929,2018-12-24 04:50:06.184929,166.198.17.114
94 | 2018-12-24 07:26:30.035556,2018-12-24 07:30:35.035556,43.180.136.184
95 | 2018-12-24 13:23:21.875545,2018-12-24 13:27:30.875545,180.246.142.130
96 | 2018-12-24 14:19:14.642045,2018-12-24 14:23:29.642045,148.170.131.157
97 | 2018-12-24 22:35:00.889513,2018-12-24 22:39:12.889513,32.148.94.35
98 | 2018-12-25 01:47:21.899698,2018-12-25 01:51:31.899698,4.7.249.142
99 | 2018-12-25 16:15:31.808751,2018-12-25 16:17:54.808751,19.101.185.116
100 | 2018-12-26 07:54:59.939399,2018-12-26 07:59:10.939399,113.25.122.147
101 | 2018-12-26 12:09:23.860732,2018-12-26 12:13:32.860732,19.68.134.150
102 | 2018-12-26 18:31:22.538764,2018-12-26 18:35:32.538764,41.152.93.178
103 | 2018-12-26 23:27:24.500651,2018-12-26 23:31:36.500651,76.26.233.66
104 | 2018-12-27 05:47:48.433162,2018-12-27 05:51:58.433162,237.151.209.164
105 | 2018-12-27 11:26:20.357926,2018-12-27 11:30:28.357926,150.82.255.148
106 | 2018-12-28 04:25:17.262554,2018-12-28 04:29:32.262554,239.87.171.169
107 | 2018-12-28 18:00:07.773171,2018-12-28 18:04:21.773171,86.29.47.60
108 | 2018-12-29 03:27:22.432176,2018-12-29 03:31:35.432176,86.94.179.28
109 | 2018-12-29 15:39:10.376311,2018-12-29 15:39:14.376311,45.208.66.139
110 | 2018-12-29 19:52:45.347044,2018-12-29 19:56:55.347044,191.136.131.14
111 | 2018-12-31 08:09:03.009937,2018-12-31 08:13:17.009937,162.197.35.101
112 | 2018-12-31 15:41:57.004462,2018-12-31 15:46:03.004462,138.177.154.55
113 |
--------------------------------------------------------------------------------
/ch_11/logs/hackers_2019.csv:
--------------------------------------------------------------------------------
1 | start,end,source_ip
2 | 2019-01-01 12:14:53.238041,2019-01-01 12:16:03.238041,152.133.44.190
3 | 2019-01-09 19:43:31.272840,2019-01-09 19:45:54.272840,140.121.49.84
4 | 2019-01-10 02:01:29.175645,2019-01-10 02:05:41.175645,141.58.118.74
5 | 2019-01-17 19:09:13.638984,2019-01-17 19:10:12.638984,87.98.217.249
6 | 2019-01-18 07:55:38.736720,2019-01-18 07:55:54.736720,78.218.146.197
7 | 2019-01-25 14:26:46.575983,2019-01-25 14:29:11.575983,153.125.66.175
8 | 2019-01-28 08:46:43.924377,2019-01-28 08:50:44.924377,117.44.11.122
9 | 2019-02-02 04:50:49.424024,2019-02-02 04:54:54.424024,12.48.219.98
10 | 2019-02-02 22:43:48.927638,2019-02-02 22:48:06.927638,218.251.192.57
11 | 2019-02-11 04:49:19.277079,2019-02-11 04:49:19.277079,210.159.230.101
12 | 2019-02-27 18:27:56.987467,2019-02-27 18:32:06.987467,124.83.133.90
13 | 2019-02-28 00:23:46.072334,2019-02-28 00:27:53.072334,175.25.26.95
14 | 2019-03-01 07:15:03.608034,2019-03-01 07:19:10.608034,134.199.232.226
15 | 2019-03-15 19:19:01.250667,2019-03-15 19:23:07.250667,75.138.25.164
16 | 2019-03-16 07:40:57.416222,2019-03-16 07:44:57.416222,38.26.34.112
17 | 2019-03-21 17:20:17.142681,2019-03-21 17:23:52.142681,1.185.125.159
18 | 2019-03-22 07:16:36.708436,2019-03-22 07:17:39.708436,239.101.53.203
19 | 2019-03-27 09:33:02.261349,2019-03-27 09:35:43.261349,152.138.125.171
20 |
--------------------------------------------------------------------------------
/ch_11/logs/logs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/ch_11/logs/logs.db
--------------------------------------------------------------------------------
/ch_11/merge_logs.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
def cat_csvs(format_string_file_pattern, index_col, month_list):
    """
    Read one CSV file per month and stack them into a single DataFrame.

    Parameters:
        - format_string_file_pattern: File name pattern containing `{}` where
          the month goes; each entry of `month_list` is substituted in turn.
        - index_col: Name of the column to use as the index; it is parsed
          as dates when the files are read.
        - month_list: The months, spelled the way they appear in the file names.

    Returns:
        A pandas DataFrame with the rows of every file, in `month_list` order.
    """
    frames = []
    for month in month_list:
        path = format_string_file_pattern.format(month)
        # parse_dates=True asks pandas to parse the index column as dates
        frames.append(pd.read_csv(path, index_col=index_col, parse_dates=True))
    return pd.concat(frames)
20 |
def select_period(df, start, end):
    """
    Select the rows of a datetime-indexed DataFrame that fall in [start, end).

    This replaces string row-slicing through `df['2018']` / `df.get('2019')`:
    `DataFrame.get` is a column lookup, so when there is no such column it
    returns None, and `pd.concat` silently drops None entries -- the rows that
    spilled over into the next year would be lost without any error. An
    explicit mask gives the same result whether or not the index is sorted
    and returns an empty frame (instead of raising) when nothing matches.

    Parameters:
        - df: DataFrame indexed by datetimes.
        - start: Inclusive lower bound (anything `pd.Timestamp` accepts).
        - end: Exclusive upper bound (anything `pd.Timestamp` accepts).

    Returns:
        The matching rows, sorted by index.
    """
    lower, upper = pd.Timestamp(start), pd.Timestamp(end)
    in_period = (df.index >= lower) & (df.index < upper)
    return df[in_period].sort_index()


if __name__ == '__main__':
    months_2018 = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    months_2019 = ['jan', 'feb', 'mar']

    # login attempts for 2018; sometimes the simulation overshoots the end
    # date, so anything past December 31 is left for the 2019 file
    logs_2018 = cat_csvs('logs/{}_2018.csv', 'datetime', months_2018)
    select_period(logs_2018, '2018-01-01', '2019-01-01').to_csv('logs/logs_2018.csv')

    # login attempts for Q1 2019: the 2019 simulations plus whatever the
    # December 2018 simulation generated after midnight on January 1, 2019;
    # entries the March simulation generated in April are clipped
    logs_2019 = pd.concat([cat_csvs('logs/{}_2019.csv', 'datetime', months_2019), logs_2018])
    select_period(logs_2019, '2019-01-01', '2019-04-01').to_csv('logs/logs_2019.csv')

    # attack logs follow the same process as the login attempts
    hackers_2018 = cat_csvs('logs/hackers_{}_2018.csv', 'start', months_2018)
    select_period(hackers_2018, '2018-01-01', '2019-01-01').to_csv('logs/hackers_2018.csv')

    hackers_2019 = pd.concat([
        cat_csvs('logs/hackers_{}_2019.csv', 'start', months_2019), hackers_2018
    ])
    select_period(hackers_2019, '2019-01-01', '2019-04-01').to_csv('logs/hackers_2019.csv')

    print('All done!')
43 |
--------------------------------------------------------------------------------
/ch_11/run_simulations.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# Simulate login attempts for every month from January 2018 through March 2019,
# merge the per-month CSV files into yearly files, and delete the per-month files.
# Run this from the directory that contains simulate.py and merge_logs.py.

# stop at the first failing command so a failed step is never followed by the cleanup
set -euo pipefail

LOGS="logs"

# make a directory for our logs (does nothing if it already exists)
mkdir -p "$LOGS"

# run the simulations
# options: -s seed, --stealthy to vary IP addresses, -l attempt log file, -hl hack log file
# positionals: days to simulate, start date, hourly attack probability,
#              probability the attacker tries all usernames
echo 'Simulating January 2018...'
python simulate.py -s 1 --stealthy -l "$LOGS"/jan_2018.csv -hl "$LOGS"/hackers_jan_2018.csv 31 "2018-01-01" 0.01 0.5

printf '\nSimulating February 2018...\n'
python simulate.py -s 2 --stealthy -l "$LOGS"/feb_2018.csv -hl "$LOGS"/hackers_feb_2018.csv 28 "2018-02-01" 0.005 0.25

printf '\nSimulating March 2018...\n'
python simulate.py -s 3 --stealthy -l "$LOGS"/mar_2018.csv -hl "$LOGS"/hackers_mar_2018.csv 31 "2018-03-01" 0.001 0.10

printf '\nSimulating April 2018...\n'
python simulate.py -s 4 --stealthy -l "$LOGS"/apr_2018.csv -hl "$LOGS"/hackers_apr_2018.csv 30 "2018-04-01" 0.01 0.65

printf '\nSimulating May 2018...\n'
python simulate.py -s 5 --stealthy -l "$LOGS"/may_2018.csv -hl "$LOGS"/hackers_may_2018.csv 31 "2018-05-01" 0.0001 0.05

printf '\nSimulating June 2018...\n'
python simulate.py -s 6 --stealthy -l "$LOGS"/jun_2018.csv -hl "$LOGS"/hackers_jun_2018.csv 30 "2018-06-01" 0.0005 0.05

printf '\nSimulating July 2018...\n'
python simulate.py -s 7 --stealthy -l "$LOGS"/jul_2018.csv -hl "$LOGS"/hackers_jul_2018.csv 31 "2018-07-01" 0.01 0.15

printf '\nSimulating August 2018...\n'
python simulate.py -s 8 --stealthy -l "$LOGS"/aug_2018.csv -hl "$LOGS"/hackers_aug_2018.csv 31 "2018-08-01" 0.005 0.1

printf '\nSimulating September 2018...\n'
python simulate.py -s 9 -l "$LOGS"/sep_2018.csv -hl "$LOGS"/hackers_sep_2018.csv 30 "2018-09-01" 0.005 0.1

printf '\nSimulating October 2018...\n'
python simulate.py -s 10 -l "$LOGS"/oct_2018.csv -hl "$LOGS"/hackers_oct_2018.csv 31 "2018-10-01" 0.002 0.12

printf '\nSimulating November 2018...\n'
python simulate.py -s 11 --stealthy -l "$LOGS"/nov_2018.csv -hl "$LOGS"/hackers_nov_2018.csv 30 "2018-11-01" 0.007 0.17

printf '\nSimulating December 2018...\n'
python simulate.py -s 12 --stealthy -l "$LOGS"/dec_2018.csv -hl "$LOGS"/hackers_dec_2018.csv 31 "2018-12-01" 0.08 0.88

printf '\nSimulating January 2019...\n'
python simulate.py -s 13 --stealthy -l "$LOGS"/jan_2019.csv -hl "$LOGS"/hackers_jan_2019.csv 31 "2019-01-01" 0.008 0.08

printf '\nSimulating February 2019...\n'
python simulate.py -s 14 --stealthy -l "$LOGS"/feb_2019.csv -hl "$LOGS"/hackers_feb_2019.csv 28 "2019-02-01" 0.01 0.18

printf '\nSimulating March 2019...\n'
python simulate.py -s 15 --stealthy -l "$LOGS"/mar_2019.csv -hl "$LOGS"/hackers_mar_2019.csv 31 "2019-03-01" 0.01 0.18

# combine the files (the script stops here if merging fails, leaving the monthly files intact)
echo 'Merging files...'
python merge_logs.py

# remove unnecessary files: the per-month inputs, i.e. names starting with a
# three-letter month and an underscore, or containing hackers_<three letters>
echo 'Cleaning up...'
monthly_file_pattern='^[a-z]{3}_|hackers_[a-z]{3}'
(
  # work in a subshell so the caller's directory is untouched; abort if cd fails
  cd "$LOGS" || exit 1
  for file in *; do
    if [[ -f "$file" && "$file" =~ $monthly_file_pattern ]]; then
      rm -- "$file"
    fi
  done
)

echo 'Success!'
67 |
--------------------------------------------------------------------------------
/ch_11/simulate.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import datetime as dt
3 | import os
4 | import logging
5 | import random
6 |
7 | import login_attempt_simulator as sim
8 |
# Configure logging: INFO and above, each message tagged with its level and logger name
FORMAT = '[%(levelname)s] [ %(name)s ] %(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)
logger = logging.getLogger(name=os.path.basename(__file__))
13 |
def get_simulation_file_path(path_provided, directory, default_file):
    """
    Get the path to a simulation file, falling back to a default location.

    Parameters:
        - path_provided: The path the caller asked for; returned untouched when truthy.
        - directory: The directory to use (and create, if missing) for the default file.
        - default_file: The file name to use inside `directory` when no path was provided.

    Returns:
        `path_provided` if it is truthy, otherwise `directory` joined with `default_file`.

    Raises:
        OSError if `directory` cannot be created (e.g. it exists but is not a directory).
    """
    if path_provided:
        return path_provided

    # exist_ok avoids the check-then-create race and also creates missing parent directories
    os.makedirs(directory, exist_ok=True)
    return os.path.join(directory, default_file)
23 |
def get_user_base_file_path(path_provided, default_file):
    """Resolve a file path, defaulting to `default_file` inside the user_data directory."""
    return get_simulation_file_path(
        path_provided=path_provided, directory='user_data', default_file=default_file
    )
27 |
def get_log_file_path(path_provided, default_file):
    """Resolve a file path, defaulting to `default_file` inside the logs directory."""
    return get_simulation_file_path(
        path_provided=path_provided, directory='logs', default_file=default_file
    )
31 |
if __name__ == '__main__':
    # command line argument parsing
    # positional order on the command line: days, start_date, attack_prob, try_all_users_prob
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "days", type=float,
        help="number of days to simulate from start"
    )
    parser.add_argument(
        "start_date", type=str,
        help="datetime to start in the form 'YYYY-MM-DD' or 'YYYY-MM-DD-HH'"
    )
    parser.add_argument(
        "-s", "--seed", type=int, help="set a seed for reproducibility"
    )
    parser.add_argument(
        "attack_prob", type=float,
        help="probability of attack in a given hour"
    )
    parser.add_argument(
        "try_all_users_prob", type=float,
        help="probability attacker tries to guess credentials for all usernames"
    )
    parser.add_argument(
        "-st", "--stealthy", action='store_true', help="be stealthy? (vary IP addresses?)"
    )
    parser.add_argument(
        "-m", "--make", action='store_true', help="make userbase"
    )
    parser.add_argument(
        "-u", "--userbase", help="file to write the userbase to"
    )
    parser.add_argument(
        "-i", "--ip", help="file to write the user-ip address map to"
    )
    parser.add_argument(
        "-l", "--log", help="file to write the attempt log to"
    )
    parser.add_argument(
        "-hl", "--hacklog", help="file to write the hack log to"
    )
    args = parser.parse_args()
    user_ip_mapping_file = get_user_base_file_path(args.ip, 'user_ips.json')

    if args.make:
        logger.warning('Creating new user base and mapping IP addresses to them.')

        user_base_file = get_user_base_file_path(args.userbase, 'user_base.txt')

        # seed the creation of userbase
        random.seed(args.seed)

        # create usernames and write to file
        sim.utils.make_userbase(user_base_file)

        # create one or more IP addresses per user and save mapping to file
        valid_users = sim.utils.get_valid_users(user_base_file)
        sim.utils.save_user_ips(
            sim.utils.assign_ip_addresses(valid_users), user_ip_mapping_file
        )

    # each dash-separated piece of the start date becomes one datetime argument
    try:
        start = dt.datetime(*map(int, args.start_date.split('-')))
    except TypeError:
        # wrong number of pieces (e.g. only a year was given)
        logger.error('Start date must be in the format "YYYY-MM-DD"')
        raise
    except ValueError:
        # a piece was not an integer or was out of range; fall back to a default start
        logger.warning(
            f'Could not interpret {args.start_date}, '
            'using January 1, 2019 at 12AM as start instead'
        )
        start = dt.datetime(2019, 1, 1)

    end = start + dt.timedelta(days=args.days)

    try:
        logger.info(f'Simulating {args.days} days...')
        simulator = sim.LoginAttemptSimulator(
            user_ip_mapping_file, start, end, seed=args.seed
        )
        simulator.simulate(
            attack_prob=args.attack_prob,
            try_all_users_prob=args.try_all_users_prob,
            vary_ips=args.stealthy
        )

        # save logs
        logger.info('Saving logs')
        simulator.save_hack_log(get_log_file_path(args.hacklog, 'attacks.csv'))
        simulator.save_log(get_log_file_path(args.log, 'log.csv'))

        logger.info('All done!')
    except Exception:
        # log the traceback and exit non-zero so callers can tell the run failed;
        # KeyboardInterrupt and SystemExit are deliberately not caught here
        logger.exception('Oops! Something went wrong...')
        raise SystemExit(1)
126 |
--------------------------------------------------------------------------------
/ch_11/user_data/user_base.txt:
--------------------------------------------------------------------------------
1 | asmith
2 | ajones
3 | akim
4 | alopez
5 | abrown
6 | bsmith
7 | bjones
8 | bkim
9 | blopez
10 | bbrown
11 | csmith
12 | cjones
13 | ckim
14 | clopez
15 | cbrown
16 | dsmith
17 | djones
18 | dkim
19 | dlopez
20 | dbrown
21 | esmith
22 | ejones
23 | ekim
24 | elopez
25 | ebrown
26 | fsmith
27 | fjones
28 | fkim
29 | flopez
30 | fbrown
31 | gsmith
32 | gjones
33 | gkim
34 | glopez
35 | gbrown
36 | hsmith
37 | hjones
38 | hkim
39 | hlopez
40 | hbrown
41 | ismith
42 | ijones
43 | ikim
44 | ilopez
45 | ibrown
46 | jsmith
47 | jjones
48 | jkim
49 | jlopez
50 | jbrown
51 | ksmith
52 | kjones
53 | kkim
54 | klopez
55 | kbrown
56 | lsmith
57 | ljones
58 | lkim
59 | llopez
60 | lbrown
61 | msmith
62 | mjones
63 | mkim
64 | mlopez
65 | mbrown
66 | nsmith
67 | njones
68 | nkim
69 | nlopez
70 | nbrown
71 | osmith
72 | ojones
73 | okim
74 | olopez
75 | obrown
76 | psmith
77 | pjones
78 | pkim
79 | plopez
80 | pbrown
81 | qsmith
82 | qjones
83 | qkim
84 | qlopez
85 | qbrown
86 | rsmith
87 | rjones
88 | rkim
89 | rlopez
90 | rbrown
91 | ssmith
92 | sjones
93 | skim
94 | slopez
95 | sbrown
96 | tsmith
97 | tjones
98 | tkim
99 | tlopez
100 | tbrown
101 | usmith
102 | ujones
103 | ukim
104 | ulopez
105 | ubrown
106 | vsmith
107 | vjones
108 | vkim
109 | vlopez
110 | vbrown
111 | wsmith
112 | wjones
113 | wkim
114 | wlopez
115 | wbrown
116 | xsmith
117 | xjones
118 | xkim
119 | xlopez
120 | xbrown
121 | ysmith
122 | yjones
123 | ykim
124 | ylopez
125 | ybrown
126 | zsmith
127 | zjones
128 | zkim
129 | zlopez
130 | zbrown
131 | admin
132 | master
133 | dba
134 |
--------------------------------------------------------------------------------
/ch_11/user_data/user_ips.json:
--------------------------------------------------------------------------------
1 | {"asmith": ["6.252.142.27"], "ajones": ["173.50.12.181"], "akim": ["98.43.141.103", "220.40.22.86", "124.178.25.98"], "alopez": ["218.160.255.160", "19.161.178.228", "62.253.28.155"], "abrown": ["237.189.239.120"], "bsmith": ["83.113.51.172", "1.218.123.66"], "bjones": ["139.17.174.247", "227.187.186.96"], "bkim": ["12.1.71.46", "5.91.167.118", "234.179.140.103"], "blopez": ["224.156.38.13", "7.145.44.102"], "bbrown": ["165.149.191.111", "203.116.114.41", "219.243.101.100"], "csmith": ["219.34.152.98"], "cjones": ["16.250.160.227", "158.239.189.2"], "ckim": ["204.205.71.171"], "clopez": ["100.208.255.37", "81.153.10.191"], "cbrown": ["24.112.17.125"], "dsmith": ["6.132.27.197"], "djones": ["223.178.55.3"], "dkim": ["32.62.195.210", "145.135.160.219", "88.97.1.201"], "dlopez": ["81.142.166.142"], "dbrown": ["82.151.213.13"], "esmith": ["157.153.26.116", "49.47.127.7", "102.63.17.136"], "ejones": ["35.25.197.235"], "ekim": ["93.10.102.171", "4.40.46.47"], "elopez": ["116.147.197.72", "47.113.213.68", "178.117.47.173"], "ebrown": ["44.23.94.53", "95.170.61.237"], "fsmith": ["25.52.89.61", "21.79.236.54", "109.126.45.254"], "fjones": ["156.123.42.2", "108.234.173.69"], "fkim": ["196.59.227.183", "169.48.40.197"], "flopez": ["235.138.253.203"], "fbrown": ["16.118.156.50"], "gsmith": ["52.196.239.10"], "gjones": ["220.178.30.239"], "gkim": ["207.134.95.170", "124.78.101.138"], "glopez": ["116.115.247.65", "32.45.175.69", "167.7.157.50"], "gbrown": ["34.70.31.26", "60.87.41.200"], "hsmith": ["194.16.87.189", "150.105.190.166", "197.130.196.192"], "hjones": ["77.228.157.126", "103.6.163.47"], "hkim": ["222.242.117.220"], "hlopez": ["25.246.225.197"], "hbrown": ["121.208.14.158", "234.193.77.22"], "ismith": ["235.42.240.18", "168.249.146.54"], "ijones": ["92.90.216.98"], "ikim": ["18.170.45.213", "38.215.94.52"], "ilopez": ["125.182.32.22", "237.25.177.213", "237.149.204.131"], "ibrown": ["171.243.99.246", "149.210.206.193"], "jsmith": ["2.147.74.216"], "jjones": 
["195.14.154.50", "202.169.215.170"], "jkim": ["45.195.73.202", "142.163.253.58", "88.89.16.210"], "jlopez": ["196.139.212.129", "219.231.94.92"], "jbrown": ["29.74.37.92"], "ksmith": ["67.144.9.42"], "kjones": ["96.251.147.183"], "kkim": ["156.125.97.32"], "klopez": ["37.0.160.74"], "kbrown": ["156.71.60.56"], "lsmith": ["121.0.179.182"], "ljones": ["187.211.58.58", "140.159.149.19", "105.118.5.129"], "lkim": ["180.225.253.52", "108.94.111.193"], "llopez": ["236.71.190.208"], "lbrown": ["142.117.171.37"], "msmith": ["110.216.28.51"], "mjones": ["163.70.126.33", "214.202.65.75", "163.18.185.107"], "mkim": ["90.49.69.190"], "mlopez": ["172.112.56.41", "52.204.133.50", "39.208.143.86"], "mbrown": ["164.253.117.48", "181.136.173.32"], "nsmith": ["169.248.242.6", "86.134.40.87", "119.137.148.47"], "njones": ["29.82.101.154", "168.143.138.162", "139.140.155.139"], "nkim": ["215.156.73.91", "173.251.190.189", "59.143.160.204"], "nlopez": ["120.99.79.161", "182.42.181.138"], "nbrown": ["142.6.232.17"], "osmith": ["37.76.79.196"], "ojones": ["107.142.221.216", "125.15.117.120", "114.94.122.0"], "okim": ["144.196.236.37", "188.178.79.238"], "olopez": ["84.63.187.234", "27.140.145.181"], "obrown": ["230.231.48.120", "216.231.167.224"], "psmith": ["133.93.124.180", "199.209.181.250"], "pjones": ["59.57.184.103", "207.37.53.232"], "pkim": ["204.182.96.183", "150.64.165.159"], "plopez": ["156.154.161.197", "21.27.218.73", "210.235.225.68"], "pbrown": ["174.108.17.132", "87.92.168.152"], "qsmith": ["179.88.247.35"], "qjones": ["126.228.43.201"], "qkim": ["227.55.249.250"], "qlopez": ["132.124.26.73"], "qbrown": ["122.220.83.23"], "rsmith": ["103.150.207.72"], "rjones": ["237.129.153.201"], "rkim": ["129.116.213.203", "68.150.183.142"], "rlopez": ["65.170.52.77"], "rbrown": ["99.10.254.223", "100.54.46.85"], "ssmith": ["113.97.58.143", "186.89.187.107"], "sjones": ["218.187.113.154", "145.7.65.90", "141.111.232.255"], "skim": ["114.134.47.17", "227.118.165.93"], "slopez": 
["172.0.92.175"], "sbrown": ["197.253.160.32", "126.124.56.18"], "tsmith": ["58.49.140.185", "29.70.157.24"], "tjones": ["12.165.104.67"], "tkim": ["101.74.49.187", "11.76.99.35"], "tlopez": ["168.123.156.81"], "tbrown": ["9.233.172.254"], "usmith": ["134.237.148.156"], "ujones": ["206.3.217.210", "90.55.81.7"], "ukim": ["98.227.162.125", "169.29.57.78"], "ulopez": ["142.106.44.88", "158.143.215.210", "5.140.68.6"], "ubrown": ["195.152.80.113", "193.98.19.38", "206.126.112.26"], "vsmith": ["198.236.51.210", "93.36.180.212", "141.175.29.185"], "vjones": ["128.251.20.76"], "vkim": ["71.202.159.171"], "vlopez": ["15.249.62.144"], "vbrown": ["88.121.135.247"], "wsmith": ["59.243.45.6", "142.76.134.181"], "wjones": ["233.31.221.213", "118.4.128.126"], "wkim": ["105.29.8.22", "189.77.78.5"], "wlopez": ["51.185.92.40", "106.225.19.157"], "wbrown": ["208.101.11.88"], "xsmith": ["43.212.185.23"], "xjones": ["169.194.51.116", "82.41.241.235", "26.26.183.131"], "xkim": ["54.57.131.131", "26.32.56.22", "91.244.227.100"], "xlopez": ["101.44.104.162", "155.56.67.107"], "xbrown": ["30.67.241.95"], "ysmith": ["27.95.79.35"], "yjones": ["158.11.94.79"], "ykim": ["52.93.173.144", "200.56.97.220", "209.70.184.211"], "ylopez": ["180.78.246.173", "190.171.127.35", "142.7.151.142"], "ybrown": ["131.139.241.249", "25.12.71.150", "110.235.65.232"], "zsmith": ["226.227.77.216"], "zjones": ["112.33.54.152"], "zkim": ["222.178.53.26"], "zlopez": ["162.160.195.60"], "zbrown": ["230.71.97.153"], "admin": ["140.116.30.11", "41.253.247.255", "22.45.110.229"], "master": ["160.78.166.50", "158.68.77.78"], "dba": ["171.114.66.123"]}
--------------------------------------------------------------------------------
/ch_12/README.md:
--------------------------------------------------------------------------------
1 | # Additional Resources
2 |
3 | ## Finding Data
4 | Both `seaborn` and `sklearn` provide built-in sample datasets that you can experiment with. Check out the documentation for more information.
5 |
6 | ### Searching for Data
7 | The following are a few places you can search for data on a variety of topics:
8 | - [DataHub](https://datahub.io/search)
9 | - [Google Dataset Search](https://toolbox.google.com/datasetsearch)
10 | - [Open data on Amazon Web Services](https://registry.opendata.aws/)
11 | - [OpenML](https://www.openml.org)
12 | - [SNAP library of datasets collected by Stanford University](https://snap.stanford.edu/data/index.html)
13 | - [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/index.php)
14 |
15 | ### APIs
16 | - [Facebook API](https://developers.facebook.com/docs/graph-api)
17 | - [NOAA Climate data API](https://www.ncdc.noaa.gov/cdo-web/webservices/v2)
18 | - [NYTimes API](https://developer.nytimes.com/)
19 | - [Open Weather Map API](https://openweathermap.org/api)
20 | - [Twitter API](https://developer.twitter.com/en/docs.html)
21 | - [USGS Earthquake API](https://earthquake.usgs.gov/fdsnws/event/1/)
22 |
23 | ### Websites by Topic
24 | This section contains selected data resources across various topics, which can be accessed through a website. Obtaining the data for an analysis may be as simple as downloading a CSV file or may require parsing HTML with pandas. If you must resort to scraping the page (make sure you have tried the ways we discussed in this book first), be sure that you aren't violating the terms of use of the website.
25 |
26 | #### Finance
27 | In addition to the `pandas_datareader` and `stock_analysis` packages we discussed in chapter 7, consult the following:
28 | - [Google Finance](https://www.google.com/finance)
29 | - [NASDAQ historical stock prices](https://www.nasdaq.com/quotes/historical-quotes.aspx)
30 | - [Quandl](https://www.quandl.com)
31 | - [Yahoo! Finance](https://finance.yahoo.com)
32 |
33 | #### Government data
34 | - [European Union open data](http://data.europa.eu/euodp/en/data)
35 | - [NASA](https://data.nasa.gov/)
36 | - [NYC data](https://opendata.cityofnewyork.us/data/)
37 | - [UK government data](https://data.gov.uk/)
38 | - [UN data](http://data.un.org/)
39 | - [US census data](https://census.gov/data.html)
40 | - [US government data](https://www.data.gov/)
41 |
42 | #### Health and economy
43 | - [Gapminder](https://www.gapminder.org/data/)
44 | - [Health data](https://healthdata.gov/search/type/dataset)
45 | - [World Health Organization](https://www.who.int/gho/en/)
46 |
47 | #### Social networks
48 | For those interested in text-based data or graph data, check out the following resources on social networks:
49 | - [List of Twitter data resources](https://github.com/shaypal5/awesome-twitter-data)
50 | - [Social network data](https://snap.stanford.edu/data/ego-Facebook.html)
51 |
52 | #### Sports
53 | - [Baseball database (practice working with a DB)](http://www.seanlahman.com/baseball-archive/statistics/)
54 | - [Baseball player statistics](https://www.baseball-reference.com/players/)
55 | - [Basketball player statistics](https://www.basketball-reference.com/players/)
56 | - [Football (American) player statistics](https://www.pro-football-reference.com/players/)
57 | - [Football (soccer) statistics](https://www.whoscored.com/Statistics)
58 | - [Hockey player statistics](https://www.hockey-reference.com/players/)
59 |
60 | #### Miscellaneous
61 | The following resources vary in topic, but be sure to check these out if nothing so far has piqued your interest:
62 | - [Amazon reviews data](https://snap.stanford.edu/data/web-Amazon.html)
63 | - [Data extracted from Wikipedia](https://wiki.dbpedia.org/develop/datasets)
64 | - [Google Trends](https://trends.google.com/trends/)
65 | - [Movies from MovieLens](https://grouplens.org/datasets/movielens/)
66 | - [Yahoo Webscope (reference library of datasets)](https://webscope.sandbox.yahoo.com/)
67 |
68 | ## Practice working with data
69 | - [Datacamp](https://www.datacamp.com/)
70 | - [Kaggle](https://www.kaggle.com/)
71 |
72 | ## Python practice
73 | - [HackerRank](https://www.hackerrank.com)
74 | - [CodeWars](https://www.codewars.com)
75 | - [LeetCode](https://www.leetcode.com)
76 | - [CodinGame](https://www.codingame.com)
77 | - [Python Morsels](https://www.pythonmorsels.com/)
78 | - [Pramp](https://www.pramp.com)
79 | - [Khan Academy](https://www.khanacademy.org/)
80 | - [LinkedIn Learning](https://www.linkedin.com/learning/)
81 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: book_env
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - python=3.7
6 | - imbalanced-learn=0.4.3
7 | - jupyterlab=0.35.6
8 | - matplotlib=3.0.3
9 | - numpy=1.16.3
10 | - pandas=0.23.4
11 | - pandas-datareader=0.7.0
12 | - requests=2.21.0
13 | - pip
14 | - python-graphviz=0.10.1
15 | - scikit-learn=0.20.3
16 | - scipy=1.2.1
17 | - seaborn=0.9.0
18 | - sqlalchemy=1.3.3
19 | - statsmodels=0.9.0
20 | - pip:
21 | - git+https://github.com/stefmolin/login-attempt-simulator.git@pandas_book
22 | - git+https://github.com/stefmolin/ml-utils.git@pandas_book
23 | - git+https://github.com/stefmolin/stock-analysis.git@pandas_book
24 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | graphviz==0.10.1
2 | imbalanced-learn==0.4.3
3 | jupyterlab==0.35.6
4 | matplotlib==3.0.3
5 | numpy==1.16.3
6 | pandas==0.23.4
7 | pandas-datareader==0.7.0
8 | requests==2.21.0
9 | scikit-learn==0.20.3
10 | scipy==1.2.1
11 | seaborn==0.9.0
12 | sqlalchemy==1.3.3
13 | statsmodels==0.9.0
14 | git+https://github.com/stefmolin/login-attempt-simulator.git@pandas_book
15 | git+https://github.com/stefmolin/ml-utils.git@pandas_book
16 | git+https://github.com/stefmolin/stock-analysis.git@pandas_book
17 |
--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.7
2 |
--------------------------------------------------------------------------------
/solutions/ch_08/dec_2018_attacks.csv:
--------------------------------------------------------------------------------
1 | start,end,source_ip
2 | 2018-12-01 06:09:25.126252,2018-12-01 06:13:32.126252,87.106.250.46
3 | 2018-12-02 17:08:51.659516,2018-12-02 17:13:06.659516,124.100.154.251
4 | 2018-12-02 22:31:34.932660,2018-12-02 22:35:47.932660,162.193.160.52
5 | 2018-12-03 03:41:24.236627,2018-12-03 03:41:58.236627,3.187.98.222
6 | 2018-12-04 04:51:10.436412,2018-12-04 04:55:18.436412,12.203.107.201
7 | 2018-12-04 06:33:49.212236,2018-12-04 06:38:03.212236,189.129.85.67
8 | 2018-12-05 08:34:28.448176,2018-12-05 08:38:43.448176,130.0.243.199
9 | 2018-12-06 13:19:19.870073,2018-12-06 13:22:45.870073,119.224.29.115
10 | 2018-12-07 23:09:16.749674,2018-12-07 23:10:50.749674,81.169.193.182
11 | 2018-12-10 05:49:18.805774,2018-12-10 05:53:23.805774,184.99.196.253
12 | 2018-12-10 10:17:56.201382,2018-12-10 10:22:09.201382,96.187.45.99
13 | 2018-12-11 20:42:42.135463,2018-12-11 20:44:42.135463,232.159.100.161
14 | 2018-12-12 07:57:40.912757,2018-12-12 07:57:44.912757,214.199.76.60
15 | 2018-12-13 03:02:37.937394,2018-12-13 03:06:55.937394,185.2.99.58
16 | 2018-12-13 17:52:32.076269,2018-12-13 17:56:43.076269,153.60.249.72
17 | 2018-12-14 01:02:33.845786,2018-12-14 01:06:38.845786,122.197.157.59
18 | 2018-12-14 09:33:16.032817,2018-12-14 09:37:25.032817,121.147.201.245
19 | 2018-12-14 10:15:50.057193,2018-12-14 10:17:42.057193,115.94.212.74
20 | 2018-12-18 12:47:32.171941,2018-12-18 12:49:11.171941,121.90.233.205
21 | 2018-12-23 00:23:46.442783,2018-12-23 00:27:59.442783,48.85.35.228
22 | 2018-12-23 03:05:14.626905,2018-12-23 03:09:26.626905,62.38.152.194
23 | 2018-12-23 18:21:52.780180,2018-12-23 18:26:05.780180,8.46.128.35
24 | 2018-12-24 23:00:04.367951,2018-12-24 23:02:04.367951,97.32.235.17
25 | 2018-12-25 10:58:42.476592,2018-12-25 11:02:54.476592,113.162.208.193
26 | 2018-12-25 11:01:04.947973,2018-12-25 11:05:11.947973,49.76.49.155
27 | 2018-12-26 10:53:43.622708,2018-12-26 10:57:55.622708,25.215.139.208
28 | 2018-12-26 19:57:11.911521,2018-12-26 20:01:20.911521,196.229.137.227
29 | 2018-12-28 12:18:55.920246,2018-12-28 12:22:54.920246,209.190.162.180
30 | 2018-12-29 18:46:09.159686,2018-12-29 18:50:22.159686,78.6.154.160
31 | 2018-12-29 22:03:32.201530,2018-12-29 22:07:39.201530,211.41.206.218
32 | 2018-12-30 07:58:56.299054,2018-12-30 08:03:04.299054,151.19.160.7
33 | 2018-12-30 10:24:32.325926,2018-12-30 10:25:50.325926,42.36.247.249
34 | 2018-12-30 18:02:10.330542,2018-12-30 18:06:22.330542,31.166.161.10
35 | 2018-12-30 20:53:05.198396,2018-12-30 20:57:15.198396,216.47.145.212
36 | 2018-12-31 15:25:24.965666,2018-12-31 15:28:30.965666,220.140.156.223
37 |
--------------------------------------------------------------------------------
/solutions/ch_11/exercise_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Unsupervised anomaly detection with One-Class SVM\n",
8 | "\n",
9 | "## Setup"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
20 | "\n",
21 | "\n",
34 | "
\n",
35 | " \n",
36 | " \n",
37 | " | \n",
38 | " source_ip | \n",
39 | " username | \n",
40 | " success | \n",
41 | " failure_reason | \n",
42 | "
\n",
43 | " \n",
44 | " datetime | \n",
45 | " | \n",
46 | " | \n",
47 | " | \n",
48 | " | \n",
49 | "
\n",
50 | " \n",
51 | " \n",
52 | " \n",
53 | " 2018-01-01 00:06:19.353126 | \n",
54 | " 223.178.55.3 | \n",
55 | " djones | \n",
56 | " 1 | \n",
57 | " None | \n",
58 | "
\n",
59 | " \n",
60 | " 2018-01-01 00:09:07.147971 | \n",
61 | " 223.178.55.3 | \n",
62 | " djones | \n",
63 | " 1 | \n",
64 | " None | \n",
65 | "
\n",
66 | " \n",
67 | " 2018-01-01 01:08:08.610041 | \n",
68 | " 6.252.142.27 | \n",
69 | " asmith | \n",
70 | " 1 | \n",
71 | " None | \n",
72 | "
\n",
73 | " \n",
74 | " 2018-01-01 02:37:50.329298 | \n",
75 | " 124.178.25.98 | \n",
76 | " akim | \n",
77 | " 1 | \n",
78 | " None | \n",
79 | "
\n",
80 | " \n",
81 | " 2018-01-01 02:45:20.382080 | \n",
82 | " 98.43.141.103 | \n",
83 | " akim | \n",
84 | " 1 | \n",
85 | " None | \n",
86 | "
\n",
87 | " \n",
88 | "
\n",
89 | "
"
90 | ],
91 | "text/plain": [
92 | " source_ip username success failure_reason\n",
93 | "datetime \n",
94 | "2018-01-01 00:06:19.353126 223.178.55.3 djones 1 None\n",
95 | "2018-01-01 00:09:07.147971 223.178.55.3 djones 1 None\n",
96 | "2018-01-01 01:08:08.610041 6.252.142.27 asmith 1 None\n",
97 | "2018-01-01 02:37:50.329298 124.178.25.98 akim 1 None\n",
98 | "2018-01-01 02:45:20.382080 98.43.141.103 akim 1 None"
99 | ]
100 | },
101 | "execution_count": 1,
102 | "metadata": {},
103 | "output_type": "execute_result"
104 | }
105 | ],
106 | "source": [
107 | "import numpy as np\n",
108 | "import pandas as pd\n",
109 | "\n",
110 | "import sqlite3\n",
111 | "\n",
112 | "with sqlite3.connect('../../ch_11/logs/logs.db') as conn:\n",
113 | " logs_2018 = pd.read_sql(\n",
114 | " \"\"\"\n",
115 | " SELECT * \n",
116 | " FROM logs \n",
117 | " WHERE datetime BETWEEN \"2018-01-01\" AND \"2019-01-01\";\n",
118 | " \"\"\", \n",
119 | " conn, parse_dates=['datetime'], index_col='datetime'\n",
120 | " )\n",
121 | "logs_2018.head()"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 2,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "def get_X(log, day):\n",
131 | " \"\"\"\n",
132 | " Get data we can use for the X\n",
133 | " \n",
134 | " Parameters:\n",
135 | " - log: The logs dataframe\n",
136 | " - day: A day or single value we can use as a datetime index slice\n",
137 | " \n",
138 | " Returns: \n",
139 | " A pandas DataFrame\n",
140 | " \"\"\"\n",
141 | " return pd.get_dummies(log[day].assign(\n",
142 | " failures=lambda x: 1 - x.success\n",
143 | " ).query('failures > 0').resample('1min').agg(\n",
144 | " {'username':'nunique', 'failures': 'sum'}\n",
145 | " ).dropna().rename(\n",
146 | " columns={'username':'usernames_with_failures'}\n",
147 | " ).assign(\n",
148 | " day_of_week=lambda x: x.index.dayofweek, \n",
149 | " hour=lambda x: x.index.hour\n",
150 | " ).drop(columns=['failures']), columns=['day_of_week', 'hour'])"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 3,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/plain": [
161 | "Index(['usernames_with_failures', 'day_of_week_0', 'day_of_week_1',\n",
162 | " 'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',\n",
163 | " 'day_of_week_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',\n",
164 | " 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',\n",
165 | " 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',\n",
166 | " 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],\n",
167 | " dtype='object')"
168 | ]
169 | },
170 | "execution_count": 3,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "X = get_X(logs_2018, '2018-01')\n",
177 | "X.columns"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "## One-class SVM"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 4,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "name": "stderr",
194 | "output_type": "stream",
195 | "text": [
196 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\preprocessing\\data.py:645: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
197 | " return self.partial_fit(X, y)\n",
198 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\base.py:464: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
199 | " return self.fit(X, **fit_params).transform(X)\n",
200 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\svm\\classes.py:1177: DeprecationWarning: The random_state parameter is deprecated and will be removed in version 0.22.\n",
201 | " \" be removed in version 0.22.\", DeprecationWarning)\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "from sklearn.svm import OneClassSVM\n",
207 | "from sklearn.pipeline import Pipeline\n",
208 | "from sklearn.preprocessing import StandardScaler\n",
209 | "\n",
210 | "one_class_svm_pipeline = Pipeline([\n",
211 | " ('scale', StandardScaler()),\n",
212 | " ('svm', OneClassSVM(random_state=0))\n",
213 | "]).fit(X)"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 5,
219 | "metadata": {},
220 | "outputs": [
221 | {
222 | "name": "stderr",
223 | "output_type": "stream",
224 | "text": [
225 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\pipeline.py:331: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
226 | " Xt = transform.transform(Xt)\n"
227 | ]
228 | },
229 | {
230 | "data": {
231 | "text/plain": [
232 | "outlier 22823\n",
233 | "inlier 18794\n",
234 | "dtype: int64"
235 | ]
236 | },
237 | "execution_count": 5,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | }
241 | ],
242 | "source": [
243 | "preds = one_class_svm_pipeline.predict(X)\n",
244 | "pd.Series(np.where(preds == -1, 'outlier', 'inlier')).value_counts()"
245 | ]
246 | }
247 | ],
248 | "metadata": {
249 | "kernelspec": {
250 | "display_name": "Python 3",
251 | "language": "python",
252 | "name": "python3"
253 | },
254 | "language_info": {
255 | "codemirror_mode": {
256 | "name": "ipython",
257 | "version": 3
258 | },
259 | "file_extension": ".py",
260 | "mimetype": "text/x-python",
261 | "name": "python",
262 | "nbconvert_exporter": "python",
263 | "pygments_lexer": "ipython3",
264 | "version": "3.7.2"
265 | }
266 | },
267 | "nbformat": 4,
268 | "nbformat_minor": 2
269 | }
270 |
--------------------------------------------------------------------------------
/solutions/ch_11/exercise_2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Finding Outliers with k-Means\n",
8 | "\n",
9 | "## Setup"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/html": [
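20 | "<div>\n",
21 | "<style scoped>\n",
22 | "    .dataframe tbody tr th:only-of-type {\n",
23 | "        vertical-align: middle;\n",
24 | "    }\n",
25 | "\n",
26 | "    .dataframe tbody tr th {\n",
27 | "        vertical-align: top;\n",
28 | "    }\n",
29 | "\n",
30 | "    .dataframe thead th {\n",
31 | "        text-align: right;\n",
32 | "    }\n",
33 | "</style>\n",
34 | "<table border=\"1\" class=\"dataframe\">\n",
35 | "  <thead>\n",
36 | "    <tr style=\"text-align: right;\">\n",
37 | "      <th></th>\n",
38 | "      <th>source_ip</th>\n",
39 | "      <th>username</th>\n",
40 | "      <th>success</th>\n",
41 | "      <th>failure_reason</th>\n",
42 | "    </tr>\n",
43 | "    <tr>\n",
44 | "      <th>datetime</th>\n",
45 | "      <th></th>\n",
46 | "      <th></th>\n",
47 | "      <th></th>\n",
48 | "      <th></th>\n",
49 | "    </tr>\n",
50 | "  </thead>\n",
51 | "  <tbody>\n",
52 | "    <tr>\n",
53 | "      <th>2018-01-01 00:06:19.353126</th>\n",
54 | "      <td>223.178.55.3</td>\n",
55 | "      <td>djones</td>\n",
56 | "      <td>1</td>\n",
57 | "      <td>None</td>\n",
58 | "    </tr>\n",
59 | "    <tr>\n",
60 | "      <th>2018-01-01 00:09:07.147971</th>\n",
61 | "      <td>223.178.55.3</td>\n",
62 | "      <td>djones</td>\n",
63 | "      <td>1</td>\n",
64 | "      <td>None</td>\n",
65 | "    </tr>\n",
66 | "    <tr>\n",
67 | "      <th>2018-01-01 01:08:08.610041</th>\n",
68 | "      <td>6.252.142.27</td>\n",
69 | "      <td>asmith</td>\n",
70 | "      <td>1</td>\n",
71 | "      <td>None</td>\n",
72 | "    </tr>\n",
73 | "    <tr>\n",
74 | "      <th>2018-01-01 02:37:50.329298</th>\n",
75 | "      <td>124.178.25.98</td>\n",
76 | "      <td>akim</td>\n",
77 | "      <td>1</td>\n",
78 | "      <td>None</td>\n",
79 | "    </tr>\n",
80 | "    <tr>\n",
81 | "      <th>2018-01-01 02:45:20.382080</th>\n",
82 | "      <td>98.43.141.103</td>\n",
83 | "      <td>akim</td>\n",
84 | "      <td>1</td>\n",
85 | "      <td>None</td>\n",
86 | "    </tr>\n",
87 | "  </tbody>\n",
88 | "</table>\n",
89 | "</div>"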
90 | ],
91 | "text/plain": [
92 | " source_ip username success failure_reason\n",
93 | "datetime \n",
94 | "2018-01-01 00:06:19.353126 223.178.55.3 djones 1 None\n",
95 | "2018-01-01 00:09:07.147971 223.178.55.3 djones 1 None\n",
96 | "2018-01-01 01:08:08.610041 6.252.142.27 asmith 1 None\n",
97 | "2018-01-01 02:37:50.329298 124.178.25.98 akim 1 None\n",
98 | "2018-01-01 02:45:20.382080 98.43.141.103 akim 1 None"
99 | ]
100 | },
101 | "execution_count": 1,
102 | "metadata": {},
103 | "output_type": "execute_result"
104 | }
105 | ],
106 | "source": [
107 | "import numpy as np\n",
108 | "import pandas as pd\n",
109 | "\n",
110 | "import sqlite3\n",
111 | "\n",
112 | "with sqlite3.connect('../../ch_11/logs/logs.db') as conn:\n",
113 | " logs_2018 = pd.read_sql(\n",
114 | " \"\"\"\n",
115 | " SELECT * \n",
116 | " FROM logs \n",
117 | " WHERE datetime BETWEEN \"2018-01-01\" AND \"2019-01-01\";\n",
118 | " \"\"\", \n",
119 | " conn, parse_dates=['datetime'], index_col='datetime'\n",
120 | " )\n",
121 | "logs_2018.head()"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 2,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "def get_X(log, day):\n",
131 | " \"\"\"\n",
132 | " Get data we can use for the X\n",
133 | " \n",
134 | " Parameters:\n",
135 | " - log: The logs dataframe\n",
136 | " - day: A day or single value we can use as a datetime index slice\n",
137 | " \n",
138 | " Returns: \n",
139 | " A pandas DataFrame\n",
140 | " \"\"\"\n",
141 | " return pd.get_dummies(log[day].assign(\n",
142 | " failures=lambda x: 1 - x.success\n",
143 | " ).query('failures > 0').resample('1min').agg(\n",
144 | " {'username':'nunique', 'failures': 'sum'}\n",
145 | " ).dropna().rename(\n",
146 | " columns={'username':'usernames_with_failures'}\n",
147 | " ).assign(\n",
148 | " day_of_week=lambda x: x.index.dayofweek, \n",
149 | " hour=lambda x: x.index.hour\n",
150 | " ).drop(columns=['failures']), columns=['day_of_week', 'hour'])"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 3,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/plain": [
161 | "Index(['usernames_with_failures', 'day_of_week_0', 'day_of_week_1',\n",
162 | " 'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',\n",
163 | " 'day_of_week_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',\n",
164 | " 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',\n",
165 | " 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',\n",
166 | " 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],\n",
167 | " dtype='object')"
168 | ]
169 | },
170 | "execution_count": 3,
171 | "metadata": {},
172 | "output_type": "execute_result"
173 | }
174 | ],
175 | "source": [
176 | "X = get_X(logs_2018, '2018')\n",
177 | "X.columns"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "## k-Means\n",
185 | "Since we want a \"normal\" activity cluster and an \"anomaly\" cluster, we need to make 2 clusters."
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 4,
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "name": "stderr",
195 | "output_type": "stream",
196 | "text": [
197 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\preprocessing\\data.py:645: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
198 | " return self.partial_fit(X, y)\n",
199 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\base.py:464: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
200 | " return self.fit(X, **fit_params).transform(X)\n"
201 | ]
202 | }
203 | ],
204 | "source": [
205 | "from sklearn.cluster import KMeans\n",
206 | "from sklearn.pipeline import Pipeline\n",
207 | "from sklearn.preprocessing import StandardScaler\n",
208 | "\n",
209 | "kmeans_pipeline = Pipeline([\n",
210 | " ('scale', StandardScaler()),\n",
211 | " ('kmeans', KMeans(random_state=0, n_clusters=2))\n",
212 | "]).fit(X)"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "The cluster label doesn't mean anything to us, but we can examine the size of each cluster. We don't expect the clusters to be of equal size because anomalous activity doesn't happen as often as normal activity (we presume)."
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 5,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "name": "stderr",
229 | "output_type": "stream",
230 | "text": [
231 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\pipeline.py:331: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n",
232 | " Xt = transform.transform(Xt)\n"
233 | ]
234 | },
235 | {
236 | "data": {
237 | "text/plain": [
238 | "0 430546\n",
239 | "1 93600\n",
240 | "dtype: int64"
241 | ]
242 | },
243 | "execution_count": 5,
244 | "metadata": {},
245 | "output_type": "execute_result"
246 | }
247 | ],
248 | "source": [
249 | "preds = kmeans_pipeline.predict(X)\n",
250 | "pd.Series(preds).value_counts()"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "### Evaluating the clustering\n",
258 | "#### Step 1: Get the true labels"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 6,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "with sqlite3.connect('../../ch_11/logs/logs.db') as conn:\n",
268 | " hackers_2018 = pd.read_sql(\n",
269 | " 'SELECT * FROM attacks WHERE start BETWEEN \"2018-01-01\" AND \"2019-01-01\";', \n",
270 | " conn, parse_dates=['start', 'end']\n",
271 | " ).assign(\n",
272 | " duration=lambda x: x.end - x.start, \n",
273 | " start_floor=lambda x: x.start.dt.floor('min'),\n",
274 | " end_ceil=lambda x: x.end.dt.ceil('min')\n",
275 | " )"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 7,
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "def get_y(datetimes, hackers, resolution='1min'):\n",
285 | " \"\"\"\n",
286 | " Get data we can use for the y (whether or not a hacker attempted a log in during that time).\n",
287 | " \n",
288 | " Parameters:\n",
289 | " - datetimes: The datetimes to check for hackers\n",
290 | " - hackers: The dataframe indicating when the attacks started and stopped\n",
291 | " - resolution: The granularity of the datetime. Default is 1 minute.\n",
292 | " \n",
293 | " Returns:\n",
294 | " A pandas Series of booleans.\n",
295 | " \"\"\"\n",
296 | " date_ranges = hackers.apply(\n",
297 | " lambda x: pd.date_range(x.start_floor, x.end_ceil, freq=resolution), \n",
298 | " axis=1\n",
299 | " )\n",
300 | " dates = pd.Series()\n",
301 | " for date_range in date_ranges:\n",
302 | " dates = pd.concat([dates, date_range.to_series()])\n",
303 | " return datetimes.isin(dates)"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 8,
309 | "metadata": {},
310 | "outputs": [],
311 | "source": [
312 | "is_hacker = get_y(X.reset_index().datetime, hackers_2018)"
313 | ]
314 | },
315 | {
316 | "cell_type": "markdown",
317 | "metadata": {},
318 | "source": [
319 | "#### Step 2: Calculate the Fowlkes-Mallows Score\n",
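320 | "This score measures how similar the predicted clusters are to the true labels: it is the geometric mean of the pairwise precision and recall, ranging from 0 to 1, where higher values mean that more pairs of observations are grouped together in both the true labels and the predicted labels."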
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 9,
326 | "metadata": {},
327 | "outputs": [
328 | {
329 | "data": {
330 | "text/plain": [
331 | "0.8395916262911648"
332 | ]
333 | },
334 | "execution_count": 9,
335 | "metadata": {},
336 | "output_type": "execute_result"
337 | }
338 | ],
339 | "source": [
340 | "from sklearn.metrics import fowlkes_mallows_score\n",
341 | "\n",
342 | "fowlkes_mallows_score(is_hacker, preds)"
343 | ]
344 | }
345 | ],
346 | "metadata": {
347 | "kernelspec": {
348 | "display_name": "Python 3",
349 | "language": "python",
350 | "name": "python3"
351 | },
352 | "language_info": {
353 | "codemirror_mode": {
354 | "name": "ipython",
355 | "version": 3
356 | },
357 | "file_extension": ".py",
358 | "mimetype": "text/x-python",
359 | "name": "python",
360 | "nbconvert_exporter": "python",
361 | "pygments_lexer": "ipython3",
362 | "version": "3.7.2"
363 | }
364 | },
365 | "nbformat": 4,
366 | "nbformat_minor": 2
367 | }
368 |
--------------------------------------------------------------------------------