├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ └── troubleshoot-installation-issues.md └── workflows │ ├── env_checks.yml │ └── stale.yml ├── .gitignore ├── LICENSE ├── README.md ├── _img ├── cover.PNG └── pandas_drawing.PNG ├── acknowledgements.md ├── appendix ├── README.md ├── choosing_the_appropriate_plot_flow_chart.png ├── data_analysis_workflow.png └── ml_workflow.png ├── apt.txt ├── ch_01 ├── check_environment.py ├── checking_your_setup.ipynb ├── exercises.ipynb ├── introduction_to_data_analysis.ipynb └── stats_viz.py ├── ch_02 ├── 1-pandas_data_structures.ipynb ├── 2-creating_dataframes.ipynb ├── 3-making_dataframes_from_api_requests.ipynb ├── 4-inspecting_dataframes.ipynb ├── 5-selection.ipynb ├── 6-adding_and_removing_data.ipynb └── data │ ├── earthquakes.csv │ ├── example_data.csv │ ├── parsed.csv │ ├── quakes.db │ └── tsunamis.csv ├── ch_03 ├── 1-wide_vs_long.ipynb ├── 2-using_the_weather_api.ipynb ├── 3-cleaning_data.ipynb ├── 4-reshaping_data.ipynb ├── 5-handling_data_issues.ipynb ├── data │ ├── bitcoin.csv │ ├── dirty_data.csv │ ├── long_data.csv │ ├── nyc_temperatures.csv │ ├── sp500.csv │ └── wide_data.csv └── exercises │ ├── aapl.csv │ ├── amzn.csv │ ├── fb.csv │ ├── goog.csv │ └── nflx.csv ├── ch_04 ├── 0-weather_data_collection.ipynb ├── 1-querying_and_merging.ipynb ├── 2-dataframe_operations.ipynb ├── 3-aggregations.ipynb ├── 4-time_series.ipynb ├── data │ ├── dirty_data.csv │ ├── fb_2018.csv │ ├── fb_week_of_may_20_per_minute.csv │ ├── melted_stock_data.csv │ ├── nyc_weather_2018.csv │ ├── stocks.db │ ├── weather.db │ ├── weather_by_station.csv │ └── weather_stations.csv ├── exercises │ ├── earthquakes.csv │ └── faang.csv ├── understanding_window_calculations.ipynb └── window_calc.py ├── ch_05 ├── 1-introducing_matplotlib.ipynb ├── 2-plotting_with_pandas.ipynb ├── 3-pandas_plotting_subpackage.ipynb └── data │ ├── earthquakes.csv │ └── fb_stock_prices_2018.csv ├── ch_06 ├── 1-introduction_to_seaborn.ipynb ├── 2-formatting_plots.ipynb ├── 
3-customizing_visualizations.ipynb ├── color_utils.py ├── data │ ├── earthquakes.csv │ └── fb_stock_prices_2018.csv ├── reg_resid_plot.py └── std_from_mean_kde.py ├── ch_07 ├── data │ ├── amazon.csv │ ├── apple.csv │ ├── bitcoin.csv │ ├── facebook.csv │ ├── google.csv │ ├── netflix.csv │ ├── netflix_january_2019.csv │ └── sp500.csv ├── financial_analysis.ipynb └── random_walk.py ├── ch_08 ├── anomaly_detection.ipynb ├── logs │ ├── attacks.csv │ └── log.csv ├── simulate.py └── user_data │ ├── user_base.txt │ └── user_ips.json ├── ch_09 ├── data │ ├── binaries.csv │ ├── planets.csv │ ├── sample_roc_curves.csv │ ├── stars.csv │ ├── systems.csv │ ├── winequality-red.csv │ └── winequality-white.csv ├── planet_data_collection.ipynb ├── planets_ml.ipynb ├── preprocessing.ipynb ├── red_wine.ipynb └── wine.ipynb ├── ch_10 ├── data │ ├── planets.csv │ ├── stars.csv │ ├── winequality-red.csv │ └── winequality-white.csv ├── planets_ml.ipynb ├── red_wine.ipynb └── wine.ipynb ├── ch_11 ├── 0-simulating_the_data.ipynb ├── 1-EDA_unlabeled_data.ipynb ├── 2-unsupervised_anomaly_detection.ipynb ├── 3-EDA_labeled_data.ipynb ├── 4-supervised_anomaly_detection.ipynb ├── 5-online_learning.ipynb ├── logs │ ├── hackers_2018.csv │ ├── hackers_2019.csv │ ├── logs.db │ ├── logs_2018.csv │ └── logs_2019.csv ├── merge_logs.py ├── run_simulations.sh ├── simulate.py └── user_data │ ├── user_base.txt │ └── user_ips.json ├── ch_12 └── README.md ├── environment.yml ├── requirements.txt ├── runtime.txt └── solutions ├── ch_01 └── solutions.ipynb ├── ch_02 └── solutions.ipynb ├── ch_03 ├── faang.csv └── solutions.ipynb ├── ch_04 └── solutions.ipynb ├── ch_05 └── solutions.ipynb ├── ch_06 └── solutions.ipynb ├── ch_07 └── solutions.ipynb ├── ch_08 ├── dec_2018_attacks.csv ├── dec_2018_log.csv └── solutions.ipynb ├── ch_09 ├── exercise_1.ipynb ├── exercise_2.ipynb ├── exercise_3.ipynb ├── exercise_4.ipynb └── exercise_5.ipynb ├── ch_10 ├── exercise_1.ipynb ├── exercise_2.ipynb ├── exercise_3.ipynb ├── 
exercise_4.ipynb └── exercise_5.ipynb └── ch_11 ├── exercise_1.ipynb ├── exercise_2.ipynb ├── exercise_3.ipynb └── exercise_4.ipynb /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Something isn't working as expected. 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Required attestation 11 | - [ ] I am using a virtual environment that matches the book specifications **exactly**. 12 | - [ ] I confirm that my fork is up to date with the latest changes in this repository. 13 | - [ ] I have checked that this issue has not already been reported or resolved [here](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/issues?q=is%3Aissue). 14 | - [ ] I have confirmed that my results match those obtained when running the code using [this](https://mybinder.org/v2/gh/stefmolin/binder-environments/1st_edition?urlpath=git-pull?repo=https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas) Binder environment. 15 | 16 | --- 17 | 18 | ### Background information 19 | ##### 1. Which OS are you using? 20 | TODO: Provide your OS here – make sure to differentiate between Intel and M1 chip Macs. 21 | 22 | ##### 2. Which Python version are you using? 23 | TODO: Provide your Python version here. 24 | 25 | ##### 3. Are you using `conda` or `venv`? 26 | TODO: Indicate whether you are using `conda` or `venv`. 27 | 28 | ##### 4. Package versions 29 |
30 | versions installed 31 | 32 | ``` 33 | TODO: Paste the result of running `pip freeze` or `conda list` 34 | ``` 35 | 36 |
37 | 38 | ##### 5. Run the `ch_01/checking_your_setup.ipynb` notebook 39 | Screenshot after running the `ch_01/checking_your_setup.ipynb` notebook: 40 | 41 | TODO: paste your screenshot here 42 | 43 | --- 44 | 45 | ### Commands run and their outputs 46 | Please provide **all** of the commands you ran as well as the traceback: 47 | 48 |
49 | 50 | ``` 51 | TODO: paste commands and any traceback here 52 | ``` 53 | 54 | 55 |
56 | 57 | ### Screenshots 58 | Optionally, include any screenshots that will help diagnose the issue. 59 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/troubleshoot-installation-issues.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Troubleshoot installation issues 3 | about: Help troubleshooting issues encountered when setting up the virtual environment. 4 | title: '' 5 | labels: troubleshooting 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Required attestation 11 | - [ ] I have **completely and exactly** followed the virtual environment setup instructions from the book. 12 | - [ ] I have cloned either this repository or my fork of this repository so that I have all necessary files on my local machine. 13 | - [ ] I have checked that this issue has not already been reported or resolved [here](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/issues?q=is%3Aissue). 14 | - [ ] I am aware that there is a [pre-built Binder environment](https://mybinder.org/v2/gh/stefmolin/binder-environments/1st_edition?urlpath=git-pull?repo=https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas) that I can use, but I want to install locally on my machine instead. 15 | 16 | --- 17 | 18 | ### Background information 19 | ##### 1. Which OS are you using? 20 | TODO: Provide your OS here – make sure to differentiate between Intel and M1 chip Macs. 21 | 22 | ##### 2. Which Python version are you using? 23 | TODO: Provide your Python version here. 24 | 25 | ##### 3. Are you using `conda` or `venv`? 26 | TODO: Indicate whether you are using `conda` or `venv`. 27 | 28 | --- 29 | 30 | ### Commands run and their outputs 31 | Please provide **all** of the commands you ran as well as the traceback: 32 | 33 |
34 | 35 | ``` 36 | TODO: paste commands and any traceback here 37 | ``` 38 | 39 | 40 |
41 | 42 | ### Screenshots 43 | Optionally, include any screenshots that will help diagnose the issue. 44 | -------------------------------------------------------------------------------- /.github/workflows/env_checks.yml: -------------------------------------------------------------------------------- 1 | # This workflow builds the book environment on Mac, Linux, and Windows for 2 | # multiple versions of Python to confirm it can be properly installed. 3 | # 4 | # Author: Stefanie Molin 5 | 6 | name: Env Build 7 | 8 | # Controls when the workflow will run 9 | on: 10 | # Triggers the workflow on push events 11 | push: 12 | branches: [ "master" ] 13 | 14 | # Trigger on pull request always (note the trailing colon) 15 | pull_request: 16 | 17 | # Allows you to run this workflow manually from the Actions tab 18 | workflow_dispatch: 19 | 20 | # Run this every month 21 | schedule: 22 | - cron: "44 22 11 * *" 23 | 24 | concurrency: 25 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 26 | cancel-in-progress: true 27 | 28 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 29 | jobs: 30 | # This workflow contains a single job called "build" 31 | build: 32 | name: Python ${{ matrix.python-version }}, ${{ matrix.os }} 33 | 34 | # The type of runner that the job will run on 35 | runs-on: ${{ matrix.os }} 36 | 37 | defaults: 38 | run: 39 | shell: bash -el {0} 40 | 41 | strategy: 42 | fail-fast: false 43 | matrix: 44 | os: [macos-13, ubuntu-latest, windows-latest] 45 | python-version: ["3.6", "3.7"] 46 | 47 | # Steps represent a sequence of tasks that will be executed as part of the job 48 | steps: 49 | # checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 50 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 51 | 52 | # remove the Python version from the file for testing 53 | - name: strip hardcoded Python version from environment for testing 54 | run: | 55 | if [[ 
${{ matrix.os }} == "macos"* ]]; then 56 | sed -i '' -e '/- python[>=]/d' environment.yml; 57 | else 58 | sed -i -e '/- python[>=]/d' environment.yml; 59 | fi; 60 | 61 | # create the conda env 62 | - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 63 | with: 64 | python-version: ${{ matrix.python-version }} 65 | auto-update-conda: true 66 | channels: conda-forge 67 | channel-priority: true 68 | activate-environment: book_env 69 | environment-file: environment.yml 70 | 71 | - name: conda diagnostics 72 | run: | 73 | conda info 74 | conda list 75 | conda config --show-sources 76 | conda config --show 77 | printenv | sort 78 | 79 | - name: verify install 80 | run: cd ch_01 && python check_environment.py 81 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '15 9 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v3 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | days-before-stale: 30 25 | days-before-close: 7 26 | stale-issue-message: 'This issue has been marked as stale due to lack of recent activity. It will be closed if no further activity occurs.' 
27 | stale-pr-message: '' 28 | stale-issue-label: 'stale' 29 | stale-pr-label: 'stale' 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | __pycache__/ 3 | *images/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018, 2019 Stefanie Molin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hands-On Data Analysis with Pandas 2 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/stefmolin/Hands-On-Data-Analysis-with-Pandas/master?urlpath=lab) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/stefmolin/Hands-On-Data-Analysis-with-Pandas/blob/master) [![Nbviewer](https://img.shields.io/badge/render-nbviewer-lightgrey?logo=jupyter)](https://nbviewer.jupyter.org/github/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/) [![Purchase the book on Amazon](https://img.shields.io/badge/Amazon-purchase-orange?logo=amazon&logoColor=orange)](https://www.amazon.com/Hands-Data-Analysis-Pandas-visualization/dp/1789615321) 3 | Hands-On Data Analysis with Pandas 4 | 5 | 6 | This is the code repository for my book [Hands-On Data Analysis with Pandas](https://www.packtpub.com/big-data-and-business-intelligence/hands-data-analysis-pandas), published by Packt on July 26, 2019. 7 | 8 | *The [1st_edition tag](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/1st_edition) contains all materials as they were at time of publishing the first edition.* 9 | 10 | --- 11 | 12 | **IMPORTANT NOTE** (April 29, 2021): 13 | 14 | This is the code repository for the **first edition**. For the **second edition**, use [this repository](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition) instead. 15 | 16 | --- 17 | 18 | ## Book Description 19 | Data analysis has become an essential skill in a variety of domains where knowing how to work with data and extract insights can generate significant value. 
20 | 21 | *Hands-On Data Analysis with Pandas* will show you how to analyze your data, get started with machine learning, and work effectively with Python libraries often used for data science, such as pandas, NumPy, matplotlib, seaborn, and scikit-learn. Using real-world datasets, you will learn how to use the powerful pandas library to perform data wrangling to reshape, clean, and aggregate your data. Then, you will learn how to conduct exploratory data analysis by calculating summary statistics and visualizing the data to find patterns. In the concluding chapters, you will explore some applications of anomaly detection, regression, clustering, and classification, using scikit-learn, to make predictions based on past data. 22 | 23 | By the end of this book, you will be equipped with the skills you need to use pandas to ensure the veracity of your data, visualize it for effective decision-making, and reliably reproduce analysis across multiple domains. 24 | 25 | ## What You Will Learn 26 | *Prerequisite: Basic knowledge of Python or past experience with another language (R, SAS, MATLAB, etc.).* 27 | - Understand how data analysts and scientists gather and analyze data 28 | - Perform data analysis and data wrangling in Python 29 | - Combine, group, and aggregate data from multiple sources 30 | - Create data visualizations with `pandas`, `matplotlib`, and `seaborn` 31 | - Apply machine learning algorithms with `sklearn` to identify patterns and make predictions 32 | - Use Python data science libraries to analyze real-world datasets. 33 | - Use `pandas` to solve several common data representation and analysis problems 34 | - Collect data from APIs 35 | - Build Python scripts, modules, and packages for reusable analysis code. 
36 | - Utilize computer science concepts and algorithms to write more efficient code for data analysis 37 | - Write and run simulations 38 | 39 | ## Table of Contents 40 | - [Chapter 1, *Introduction to Data Analysis*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_01), will teach you the fundamentals of data analysis, give you a foundation in statistics, and get your environment set up for working with data in Python and using Jupyter Notebooks. 41 | 42 | - [Chapter 2, *Working with Pandas DataFrames*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_02), introduces you to the `pandas` library and shows you the basics of working with `DataFrames`. 43 | 44 | - [Chapter 3, *Data Wrangling with Pandas*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_03), discusses the process of data manipulation, shows you how to explore an API to gather data, and guides you through data cleaning and reshaping with pandas. 45 | 46 | - [Chapter 4, *Aggregating Pandas DataFrames*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_04), teaches you how to query and merge DataFrames, perform complex operations on them, including rolling calculations and aggregations, and how to work effectively with time series data. 47 | 48 | - [Chapter 5, *Visualizing Data with Pandas and Matplotlib*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_05), shows you how to create your own data visualizations in Python, first using the `matplotlib` library, and then directly from `pandas` objects. 
49 | 50 | - [Chapter 6, *Plotting with Seaborn and Customization Techniques*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_06), continues the discussion on data visualization by teaching you how to use the `seaborn` library for visualizing your long form data and giving you the tools you need to customize your visualizations, making them presentation-ready. 51 | 52 | - [Chapter 7, *Financial Analysis: Bitcoin and the Stock Market*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_07), walks you through the creation of a [Python package for analyzing stocks](https://github.com/stefmolin/stock-analysis), building upon everything learned in chapters 1-6 and applying it to a financial application. 53 | 54 | - [Chapter 8, *Rule-Based Anomaly Detection*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_08), covers [simulating data](https://github.com/stefmolin/login-attempt-simulator) and applying everything learned in chapters 1-6 to catching hackers attempting to authenticate to a website, using rule-based strategies for anomaly detection. 55 | 56 | - [Chapter 9, *Getting Started with Machine Learning in Python*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_09), introduces you to machine learning and building models using the `sklearn` library. 57 | 58 | - [Chapter 10, *Making Better Predictions: Optimizing Models*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_10), shows you strategies for improving the performance of your machine learning models. 59 | 60 | - [Chapter 11, *Machine Learning Anomaly Detection*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_11), revisits anomaly detection on login attempt data, using machine learning techniques, all while giving you a taste of how the workflow looks in practice. 
61 | 62 | - [Chapter 12, *The Road Ahead*](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/ch_12), contains resources for taking your skills to the next level and further avenues for exploration. 63 | 64 | ## Notes on Environment Setup 65 | [![Env Build Workflow Status](https://img.shields.io/github/actions/workflow/status/stefmolin/Hands-On-Data-Analysis-with-Pandas/env_checks.yml?label=env%20build&logo=github&logoColor=white)](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/actions/workflows/env_checks.yml) ![GitHub repo size](https://img.shields.io/github/repo-size/stefmolin/Hands-On-Data-Analysis-with-Pandas?logo=git&logoColor=white) 66 | 67 | Environment setup instructions are in chapter 1 of the text. If you don't have the book, you must install Python 3.6 or 3.7, [set up a virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#creating-a-virtual-environment), [activate it](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#activating-a-virtual-environment), and then [install the packages listed in requirements.txt](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/#using-requirements-files). You can then launch JupyterLab and use the `ch_01/checking_your_setup.ipynb` Jupyter notebook to check your setup. Consult [this resource](https://anbasile.github.io/programming/2017/06/25/jupyter-venv/) if you have issues with using your virtual environment in Jupyter. 68 | 69 | ## Solutions 70 | Each chapter comes with exercises. The solutions for chapters 1-11 can be found [here](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/tree/master/solutions). 
71 | 72 | ## About the Author 73 | Stefanie Molin ([@stefmolin](https://github.com/stefmolin)) is a software engineer and data scientist at Bloomberg in New York City, where she tackles tough problems in information security, particularly those revolving around data wrangling/visualization, building tools for gathering data, and knowledge sharing. She holds a bachelor’s of science degree in operations research from Columbia University's Fu Foundation School of Engineering and Applied Science with minors in Economics and Entrepreneurship and Innovation, as well as a master’s degree in computer science, with a specialization in machine learning, from Georgia Tech. In her free time, she enjoys traveling the world, inventing new recipes, and learning new languages spoken both among people and computers. 74 | 75 | ## Acknowledgements 76 | Since the book limited the acknowledgements to 450 characters, the full version is [here](https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/blob/master/acknowledgements.md). 
77 | -------------------------------------------------------------------------------- /_img/cover.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/_img/cover.PNG -------------------------------------------------------------------------------- /_img/pandas_drawing.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/_img/pandas_drawing.PNG -------------------------------------------------------------------------------- /acknowledgements.md: -------------------------------------------------------------------------------- 1 | # Acknowledgements 2 | 3 | Writing this book has been a tremendous amount of work, but I have grown a lot through the experience: as a writer, as a technologist, and as a person. This wouldn't have been possible without the help I received along the way. It was truly touching to see that friends, family, and colleagues were so willing to help. I'm very grateful to you all. 4 | 5 | To my family. **Mom**: for always being there when I needed to vent after working multiple 14 hour days straight writing this book and making sure I had something to eat...and not getting too upset when it took me months to find the time to look into the Japan itinerary. Y ahora, por fin, puedo tomar esa clase de Zumba contigo. **Ryan**: for the sports resources in chapter 12 and always telling it like it is. **Dad**: for sharing a countdown with me. 6 | 7 | To my friend and reviewer, **Aliki Mavromoustaki**: It's rare to meet someone over video conference and instantly form a friendship, but I am very happy our paths crossed. Even though I told you this was an offer you could refuse, you took on the extra work to review the first drafts, ευχαριστώ. 
We finally got to work on something together! 8 | 9 | To **Felipe Moreno**: for being an extremely supportive manager and my toughest critic. You managed to fit more writing into the margins of each page than I would have thought possible. While this led to rewriting large portions of the earlier chapters for the final drafts, I know it made a huge difference in the quality of the final outcome. Obrigada! 10 | 11 | To **Suphannee Sivakorn**: for volunteering to review the drafts. I know you thought I wouldn't make it past chapter 5, but you didn't really know me then...ขอบคุณค่ะ (ka ka ka 🕊️) for all the drawings (and occasional poem). They made me smile when I really needed it and kept me sane. Now, I have a poem for you: 12 | 13 |

14 | 15 | Roses are red,
16 | violets are blue;
17 | I'm done with this book,
18 | and so are you. 19 |
20 |

21 | 22 | Get ready to be a famous artist: 23 | 24 |

25 | artist rendition of the book 26 |

27 | 28 | To my colleagues. **Lucy Hao** and **Javon Thompson**: for reviewing the financial analysis code and chapter 7 despite never having met me. Lucy, I am very glad I got the chance to meet you when I gave my lightning talk at the Princeton office. You reassured me that the content of this book is relevant and useful to its audience, and it sounded like you got just as much out of the experience of reviewing that chapter as I did. **Alexander Comerford**: for additional resources in chapter 12 and knitting me desk gloves (paws?) for work. You kept my fingers from freezing, so I could write this book. I'm still waiting for my blanket though... 29 | 30 | Finally, thank you to everyone else who helped along the way, whether it was support, kind words, or being a sounding board. I truly appreciate it, and it made a difference. 31 | -------------------------------------------------------------------------------- /appendix/README.md: -------------------------------------------------------------------------------- 1 | # Appendix 2 | Here are some workflow diagrams for reference. 
3 | 4 | ## Data Analysis Workflow 5 | data analysis workflow 6 | 7 | ## Choosing the Appropriate Plot 8 | choosing the appropriate plot 9 | 10 | ## Machine Learning Workflow 11 | machine learning workflow 12 | -------------------------------------------------------------------------------- /appendix/choosing_the_appropriate_plot_flow_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/appendix/choosing_the_appropriate_plot_flow_chart.png -------------------------------------------------------------------------------- /appendix/data_analysis_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/appendix/data_analysis_workflow.png -------------------------------------------------------------------------------- /appendix/ml_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/appendix/ml_workflow.png -------------------------------------------------------------------------------- /apt.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | -------------------------------------------------------------------------------- /ch_01/check_environment.py: -------------------------------------------------------------------------------- 1 | """Check environment for following along with the text.""" 2 | 3 | from distutils.version import LooseVersion as Version 4 | import importlib 5 | import sys 6 | import re 7 | 8 | 9 | OK = '\x1b[42m[ OK ]\x1b[0m' 10 | FAIL = '\x1b[41m[FAIL]\x1b[0m' 11 | 12 | github_package_pattern = 
re.compile(r'(?:\/)([\w*\-*]*)(?:\.git)') 13 | 14 | def run_checks(raise_exc=False): 15 | """ 16 | Check that the packages we need are installed and the Python version is good. 17 | 18 | Parameters 19 | ---------- 20 | raise_exc : bool, default ``False`` 21 | Whether to raise an exception if any of the packages doesn't 22 | match the requirements (used for GitHub Action). 23 | """ 24 | failures = [] 25 | 26 | # check the python version 27 | print('Using Python in %s:' % sys.prefix) 28 | if Version(sys.version) >= '3.6.0' and Version(sys.version) < '3.8.0': 29 | print(OK, 'Python is version %s\n' % sys.version) 30 | else: 31 | print(FAIL, 'Python version >= 3.6.0 and < 3.8.0 is required, but %s is installed.\n' % sys.version) 32 | failures.append('Python') 33 | 34 | # read in the requirements 35 | with open('../requirements.txt', 'r') as file: 36 | requirements = {} 37 | for line in file.read().splitlines(): 38 | github_package = re.search(github_package_pattern, line) 39 | if github_package: 40 | pkg = github_package.group(1).replace('-', '_') 41 | version = None 42 | else: 43 | if line.startswith('./'): 44 | line = line.replace('./', '') 45 | try: 46 | if '>=' in line: 47 | pkg, versions = line.split('>=') 48 | version = versions.split(',<=') 49 | else: 50 | pkg, version = line.split('==') 51 | except ValueError: 52 | pkg, version = line, None 53 | if pkg == 'imbalanced-learn': 54 | pkg = 'imblearn' 55 | elif pkg == 'scikit-learn': 56 | pkg = 'sklearn' 57 | 58 | requirements[pkg.replace('-', '_')] = version 59 | 60 | # check the requirements 61 | for pkg, req_version in requirements.items(): 62 | try: 63 | mod = importlib.import_module(pkg) 64 | if req_version: 65 | version = mod.__version__ 66 | if isinstance(req_version, list): 67 | min_version, max_version = req_version 68 | if Version(version) < min_version or Version(version) > max_version: 69 | print(FAIL, '%s version >= %s and <= %s is required, but %s installed.' 
% (pkg, min_version, max_version, version)) 70 | failures.append(pkg) 71 | continue 72 | else: 73 | if Version(version) != req_version: 74 | print(FAIL, '%s version %s is required, but %s installed.' % (pkg, req_version, version)) 75 | failures.append(pkg) 76 | continue 77 | print(OK, '%s' % pkg) 78 | except ImportError: 79 | print(FAIL, '%s not installed.' % pkg) 80 | failures.append(pkg) 81 | 82 | if failures and raise_exc: 83 | raise Exception( 84 | 'Environment failed inspection due to incorrect versions ' 85 | f'of {len(failures)} item(s): {", ".join(failures)}.' 86 | ) 87 | 88 | if __name__ == '__main__': 89 | run_checks(raise_exc=True) 90 | -------------------------------------------------------------------------------- /ch_01/checking_your_setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Checking your setup\n", 8 | "Run through this notebook to make sure your environment is properly setup. Be sure to launch Jupyter from inside the virtual environment." 
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [ 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "Using Python in c:\\users\\molinstefanie\\packt\\venv:\n", 21 | "\u001b[42m[ OK ]\u001b[0m Python is version 3.7.2 (tags/v3.7.2:9a3ffc0492, Dec 23 2018, 22:20:52) [MSC v.1916 32 bit (Intel)]\n", 22 | "\n", 23 | "\u001b[42m[ OK ]\u001b[0m graphviz\n", 24 | "\u001b[42m[ OK ]\u001b[0m imblearn\n", 25 | "\u001b[42m[ OK ]\u001b[0m jupyter\n", 26 | "\u001b[42m[ OK ]\u001b[0m jupyterlab\n", 27 | "\u001b[42m[ OK ]\u001b[0m matplotlib\n", 28 | "\u001b[42m[ OK ]\u001b[0m numpy\n", 29 | "\u001b[42m[ OK ]\u001b[0m pandas\n", 30 | "\u001b[42m[ OK ]\u001b[0m pandas_datareader\n", 31 | "\u001b[42m[ OK ]\u001b[0m requests\n", 32 | "\u001b[42m[ OK ]\u001b[0m sklearn\n", 33 | "\u001b[42m[ OK ]\u001b[0m scipy\n", 34 | "\u001b[42m[ OK ]\u001b[0m seaborn\n", 35 | "\u001b[42m[ OK ]\u001b[0m sqlalchemy\n", 36 | "\u001b[42m[ OK ]\u001b[0m statsmodels\n", 37 | "\u001b[42m[ OK ]\u001b[0m login_attempt_simulator\n", 38 | "\u001b[42m[ OK ]\u001b[0m ml_utils\n", 39 | "\u001b[42m[ OK ]\u001b[0m stock_analysis\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "from check_environment import run_checks\n", 45 | "run_checks()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "*Note: Adapted from Andreas Mueller's [`check_env.ipynb` notebook](https://github.com/amueller/ml-workshop-1-of-4/blob/master/check_env.ipynb).*" 53 | ] 54 | } 55 | ], 56 | "metadata": { 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | }, 62 | "language_info": { 63 | "codemirror_mode": { 64 | "name": "ipython", 65 | "version": 3 66 | }, 67 | "file_extension": ".py", 68 | "mimetype": "text/x-python", 69 | "name": "python", 70 | "nbconvert_exporter": "python", 71 | "pygments_lexer": "ipython3", 72 | "version": "3.7.2" 73 | } 74 | }, 75 | 
"nbformat": 4, 76 | "nbformat_minor": 2 77 | } 78 | -------------------------------------------------------------------------------- /ch_01/exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Coding exercises\n", 8 | "Exercises 1-3 are thought exercises that don't require coding. \n", 9 | "\n", 10 | "## Exercise 4: Generate the data by running this cell\n", 11 | "This will give you a list of numbers to work with in the remaining exercises." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import random\n", 21 | "random.seed(0)\n", 22 | "salaries = [round(random.random()*1000000, -3) for _ in range(100)]" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Exercise 5: Calculating statistics and verifying\n", 30 | "### mean" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### median" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "### mode" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### sample variance\n", 73 | "Remember to use Bessel's correction." 
74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### sample standard deviation\n", 88 | "Remember to use Bessel's correction." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Exercise 6: Calculating more statistics\n", 103 | "### range" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### coefficient of variation" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "### interquartile range" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "### quartile coefficient of dispersion" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "## Exercise 7: Scaling data\n", 160 | "### min-max scaling" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "### standardizing" 175 | ] 176 | }, 177 | { 178 
| "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Exercise 8: Calculating covariance and correlation\n", 189 | "### covariance" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "### Pearson correlation coefficient ($\\rho$)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | "version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.7.2" 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 2 235 | } 236 | -------------------------------------------------------------------------------- /ch_02/data/example_data.csv: -------------------------------------------------------------------------------- 1 | time;place;magType;mag;alert;tsunami 2 | 2018-10-13 11:10:23.560;262km NW of Ozernovskiy, Russia;mww;6.7;green;1 3 | 2018-10-13 04:34:15.580;25km E of Bitung, Indonesia;mww;5.2;green;0 4 | 2018-10-13 00:13:46.220;42km WNW of Sola, Vanuatu;mww;5.7;green;0 5 | 2018-10-12 21:09:49.240;13km E of Nueva Concepcion, Guatemala;mww;5.7;green;0 6 | 2018-10-12 02:52:03.620;128km SE of Kimbe, Papua New Guinea;mww;5.6;green;1 7 | -------------------------------------------------------------------------------- /ch_02/data/quakes.db: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/ch_02/data/quakes.db -------------------------------------------------------------------------------- /ch_02/data/tsunamis.csv: -------------------------------------------------------------------------------- 1 | alert,type,title,place,magType,mag,time 2 | ,earthquake,"M 5.0 - 165km NNW of Flying Fish Cove, Christmas Island","165km NNW of Flying Fish Cove, Christmas Island",mww,5.0,1539459504090 3 | green,earthquake,"M 6.7 - 262km NW of Ozernovskiy, Russia","262km NW of Ozernovskiy, Russia",mww,6.7,1539429023560 4 | green,earthquake,"M 5.6 - 128km SE of Kimbe, Papua New Guinea","128km SE of Kimbe, Papua New Guinea",mww,5.6,1539312723620 5 | green,earthquake,"M 6.5 - 148km S of Severo-Kuril'sk, Russia","148km S of Severo-Kuril'sk, Russia",mww,6.5,1539213362130 6 | green,earthquake,"M 6.2 - 94km SW of Kokopo, Papua New Guinea","94km SW of Kokopo, Papua New Guinea",mww,6.2,1539208835130 7 | green,earthquake,"M 5.9 - 117km ESE of Kimbe, Papua New Guinea","117km ESE of Kimbe, Papua New Guinea",mww,5.9,1539205996680 8 | green,earthquake,"M 5.9 - 113km ESE of Kimbe, Papua New Guinea","113km ESE of Kimbe, Papua New Guinea",mww,5.9,1539205141060 9 | green,earthquake,"M 7.0 - 117km E of Kimbe, Papua New Guinea","117km E of Kimbe, Papua New Guinea",mww,7.0,1539204500290 10 | green,earthquake,"M 6.1 - 132km E of Kimbe, Papua New Guinea","132km E of Kimbe, Papua New Guinea",mb,6.1,1539204326420 11 | green,earthquake,"M 5.0 - 61km SSW of Chignik Lake, Alaska","61km SSW of Chignik Lake, Alaska",ml,5.0,1539152878406 12 | ,earthquake,"M 5.3 - 65km NNW of Lae, Papua New Guinea","65km NNW of Lae, Papua New Guinea",mb,5.3,1539150837980 13 | ,earthquake,"M 5.4 - 62km NW of Finschhafen, Papua New Guinea","62km NW of Finschhafen, Papua New Guinea",mww,5.4,1539136980090 14 | 
green,earthquake,"M 4.0 - 71km SW of Kaktovik, Alaska","71km SW of Kaktovik, Alaska",ml,4.0,1539069081499 15 | ,earthquake,"M 5.1 - 13km E of Palu, Indonesia","13km E of Palu, Indonesia",mb,5.1,1539033346530 16 | ,earthquake,"M 5.1 - 14km ENE of Sambelia, Indonesia","14km ENE of Sambelia, Indonesia",mww,5.1,1538935041200 17 | green,earthquake,"M 4.0 - 60km WNW of Valdez, Alaska","60km WNW of Valdez, Alaska",ml,4.0,1538904354275 18 | ,earthquake,"M 5.0 - 11km ESE of Kimbe, Papua New Guinea","11km ESE of Kimbe, Papua New Guinea",mww,5.0,1538842952660 19 | green,earthquake,"M 3.8 - 5km SW of Tres Pinos, CA","5km SW of Tres Pinos, CA",mw,3.83,1538746177550 20 | green,earthquake,"M 4.0 - 67km SSW of Kaktovik, Alaska","67km SSW of Kaktovik, Alaska",ml,4.0,1538658776412 21 | ,earthquake,"M 5.3 - 29km SSW of Nggongi, Indonesia","29km SSW of Nggongi, Indonesia",mb,5.3,1538570285120 22 | ,earthquake,"M 5.0 - 38km S of Nggongi Satu, Indonesia","38km S of Nggongi Satu, Indonesia",mww,5.0,1538560686080 23 | ,earthquake,"M 5.0 - 50km WSW of Kasiguncu, Indonesia","50km WSW of Kasiguncu, Indonesia",mb,5.0,1538456366290 24 | green,earthquake,"M 5.6 - 33km SSW of Nggongi Satu, Indonesia","33km SSW of Nggongi Satu, Indonesia",mww,5.6,1538455771470 25 | green,earthquake,"M 5.9 - 30km SSW of Nggongi, Indonesia","30km SSW of Nggongi, Indonesia",mww,5.9,1538439405760 26 | green,earthquake,"M 6.0 - 33km S of Nggongi Satu, Indonesia","33km S of Nggongi Satu, Indonesia",mww,6.0,1538438383070 27 | ,earthquake,"M 5.2 - 25km N of Palu, Indonesia","25km N of Palu, Indonesia",mww,5.2,1538437599550 28 | ,earthquake,"M 5.4 - 31km S of Nggongi Satu, Indonesia","31km S of Nggongi Satu, Indonesia",mb,5.4,1538436426090 29 | ,earthquake,"M 5.0 - 33km SSW of Nggongi Satu, Indonesia","33km SSW of Nggongi Satu, Indonesia",mww,5.0,1538435522580 30 | ,earthquake,"M 5.1 - 101km NNW of Palu, Indonesia","101km NNW of Palu, Indonesia",mww,5.1,1538372615190 31 | ,earthquake,"M 5.0 - 106km NNW of Lae, Papua New 
Guinea","106km NNW of Lae, Papua New Guinea",mb,5.0,1538344682130 32 | green,earthquake,"M 6.7 - 263km NNE of Ndoi Island, Fiji","263km NNE of Ndoi Island, Fiji",mww,6.7,1538304744240 33 | ,earthquake,"M 5.1 - 49km W of Kasiguncu, Indonesia","49km W of Kasiguncu, Indonesia",mww,5.1,1538217018480 34 | ,earthquake,"M 5.1 - 53km W of Kasiguncu, Indonesia","53km W of Kasiguncu, Indonesia",mb,5.1,1538206811760 35 | green,earthquake,"M 4.4 - 1km SE of Delta, B.C., MX","1km SE of Delta, B.C., MX",mw,4.41,1538187466720 36 | ,earthquake,"M 5.0 - 55km WSW of Kasiguncu, Indonesia","55km WSW of Kasiguncu, Indonesia",mww,5.0,1538169841560 37 | ,earthquake,"M 5.0 - 45km SSW of Palu, Indonesia","45km SSW of Palu, Indonesia",mb,5.0,1538148942250 38 | ,earthquake,"M 5.4 - 113km NNW of Palu, Indonesia","113km NNW of Palu, Indonesia",mb,5.4,1538144760960 39 | ,earthquake,"M 5.2 - 58km S of Palu, Indonesia","58km S of Palu, Indonesia",mb,5.2,1538141984430 40 | green,earthquake,"M 5.7 - 107km N of Palu, Indonesia","107km N of Palu, Indonesia",ms_20,5.7,1538141730630 41 | ,earthquake,"M 5.1 - 47km N of Palu, Indonesia","47km N of Palu, Indonesia",mb,5.1,1538137653240 42 | ,earthquake,"M 5.2 - 68km SSE of Palu, Indonesia","68km SSE of Palu, Indonesia",mb,5.2,1538132811150 43 | green,earthquake,"M 5.7 - 17km NNE of Palu, Indonesia","17km NNE of Palu, Indonesia",mb,5.7,1538131825150 44 | ,earthquake,"M 5.1 - 42km N of Palu, Indonesia","42km N of Palu, Indonesia",mb,5.1,1538131664560 45 | ,earthquake,"M 5.4 - 24km N of Palu, Indonesia","24km N of Palu, Indonesia",mb,5.4,1538131143050 46 | green,earthquake,"M 5.8 - 21km SSE of Palu, Indonesia","21km SSE of Palu, Indonesia",mb,5.8,1538130304440 47 | green,earthquake,"M 5.7 - 17km E of Palu, Indonesia","17km E of Palu, Indonesia",mb,5.7,1538129809140 48 | green,earthquake,"M 5.8 - 99km N of Palu, Indonesia","99km N of Palu, Indonesia",mb,5.8,1538129660450 49 | red,earthquake,"M 7.5 - 78km N of Palu, Indonesia","78km N of Palu, 
Indonesia",mww,7.5,1538128963480 50 | ,earthquake,"M 5.0 - 60km N of Palu, Indonesia","60km N of Palu, Indonesia",mb,5.0,1538123098480 51 | ,earthquake,"M 5.4 - 77km NNE of Palu, Indonesia","77km NNE of Palu, Indonesia",mb,5.4,1538118198440 52 | green,earthquake,"M 6.1 - 55km NNW of Palu, Indonesia","55km NNW of Palu, Indonesia",mww,6.1,1538118001950 53 | ,earthquake,"M 5.2 - 91km WNW of Panguna, Papua New Guinea","91km WNW of Panguna, Papua New Guinea",mww,5.2,1538063612790 54 | ,earthquake,"M 5.1 - 117km N of Saumlaki, Indonesia","117km N of Saumlaki, Indonesia",mb,5.1,1538026140750 55 | ,earthquake,"M 5.1 - 19km WNW of Langsa, Indonesia","19km WNW of Langsa, Indonesia",mww,5.1,1537984301360 56 | ,earthquake,"M 5.0 - 85km W of Manokwari, Indonesia","85km W of Manokwari, Indonesia",mww,5.0,1537954061090 57 | ,earthquake,"M 5.0 - 10km WSW of Kainantu, Papua New Guinea","10km WSW of Kainantu, Papua New Guinea",mb,5.0,1537760541200 58 | ,earthquake,"M 5.4 - 228km S of Taron, Papua New Guinea","228km S of Taron, Papua New Guinea",mb,5.4,1537427126700 59 | ,earthquake,"M 5.1 - 278km SE of Pondaguitan, Philippines","278km SE of Pondaguitan, Philippines",mb,5.1,1537411002190 60 | green,earthquake,"M 5.1 - 64km SSW of Kaktovik, Alaska","64km SSW of Kaktovik, Alaska",ml,5.1,1537274456960 61 | ,earthquake,"M 5.2 - 126km N of Dili, East Timor","126km N of Dili, East Timor",mb,5.2,1537262729590 62 | ,earthquake,"M 5.1 - 34km NW of Finschhafen, Papua New Guinea","34km NW of Finschhafen, Papua New Guinea",mb,5.1,1537236235470 63 | -------------------------------------------------------------------------------- /ch_03/data/long_data.csv: -------------------------------------------------------------------------------- 1 | attributes,datatype,date,station,value 2 | ",,H,0700",TMAX,2018-10-01T00:00:00,GHCND:USC00280907,21.1 3 | ",,H,0700",TMIN,2018-10-01T00:00:00,GHCND:USC00280907,8.9 4 | ",,H,0700",TOBS,2018-10-01T00:00:00,GHCND:USC00280907,13.9 5 | 
",,H,0700",TMAX,2018-10-02T00:00:00,GHCND:USC00280907,23.9 6 | ",,H,0700",TMIN,2018-10-02T00:00:00,GHCND:USC00280907,13.9 7 | ",,H,0700",TOBS,2018-10-02T00:00:00,GHCND:USC00280907,17.2 8 | ",,H,0700",TMAX,2018-10-03T00:00:00,GHCND:USC00280907,25.0 9 | ",,H,0700",TMIN,2018-10-03T00:00:00,GHCND:USC00280907,15.6 10 | ",,H,0700",TOBS,2018-10-03T00:00:00,GHCND:USC00280907,16.1 11 | ",,H,0700",TMAX,2018-10-04T00:00:00,GHCND:USC00280907,22.8 12 | ",,H,0700",TMIN,2018-10-04T00:00:00,GHCND:USC00280907,11.7 13 | ",,H,0700",TOBS,2018-10-04T00:00:00,GHCND:USC00280907,11.7 14 | ",,H,0700",TMAX,2018-10-05T00:00:00,GHCND:USC00280907,23.3 15 | ",,H,0700",TMIN,2018-10-05T00:00:00,GHCND:USC00280907,11.7 16 | ",,H,0700",TOBS,2018-10-05T00:00:00,GHCND:USC00280907,18.9 17 | ",,H,0700",TMAX,2018-10-06T00:00:00,GHCND:USC00280907,20.0 18 | ",,H,0700",TMIN,2018-10-06T00:00:00,GHCND:USC00280907,13.3 19 | ",,H,0700",TOBS,2018-10-06T00:00:00,GHCND:USC00280907,16.1 20 | ",,H,0700",TMAX,2018-10-07T00:00:00,GHCND:USC00280907,20.0 21 | ",,H,0700",TMIN,2018-10-07T00:00:00,GHCND:USC00280907,16.1 22 | ",,H,0700",TOBS,2018-10-07T00:00:00,GHCND:USC00280907,20.0 23 | ",,H,0700",TMAX,2018-10-08T00:00:00,GHCND:USC00280907,26.7 24 | ",,H,0700",TMIN,2018-10-08T00:00:00,GHCND:USC00280907,17.8 25 | ",,H,0700",TOBS,2018-10-08T00:00:00,GHCND:USC00280907,17.8 26 | ",,H,0700",TMAX,2018-10-09T00:00:00,GHCND:USC00280907,18.9 27 | ",,H,0700",TMIN,2018-10-09T00:00:00,GHCND:USC00280907,17.2 28 | ",,H,0700",TOBS,2018-10-09T00:00:00,GHCND:USC00280907,17.8 29 | ",,H,0700",TMAX,2018-10-10T00:00:00,GHCND:USC00280907,24.4 30 | ",,H,0700",TMIN,2018-10-10T00:00:00,GHCND:USC00280907,17.2 31 | ",,H,0700",TOBS,2018-10-10T00:00:00,GHCND:USC00280907,18.3 32 | ",,H,0700",TMAX,2018-10-11T00:00:00,GHCND:USC00280907,26.1 33 | ",,H,0700",TMIN,2018-10-11T00:00:00,GHCND:USC00280907,17.8 34 | ",,H,0700",TOBS,2018-10-11T00:00:00,GHCND:USC00280907,21.7 35 | ",,H,0700",TMAX,2018-10-12T00:00:00,GHCND:USC00280907,22.8 36 | 
",,H,0700",TMIN,2018-10-12T00:00:00,GHCND:USC00280907,14.4 37 | ",,H,0700",TOBS,2018-10-12T00:00:00,GHCND:USC00280907,15.6 38 | ",,H,0700",TMAX,2018-10-13T00:00:00,GHCND:USC00280907,15.6 39 | ",,H,0700",TMIN,2018-10-13T00:00:00,GHCND:USC00280907,7.2 40 | ",,H,0700",TOBS,2018-10-13T00:00:00,GHCND:USC00280907,8.3 41 | ",,H,0700",TMAX,2018-10-14T00:00:00,GHCND:USC00280907,13.3 42 | ",,H,0700",TMIN,2018-10-14T00:00:00,GHCND:USC00280907,5.6 43 | ",,H,0700",TOBS,2018-10-14T00:00:00,GHCND:USC00280907,6.7 44 | ",,H,0700",TMAX,2018-10-15T00:00:00,GHCND:USC00280907,13.3 45 | ",,H,0700",TMIN,2018-10-15T00:00:00,GHCND:USC00280907,6.7 46 | ",,H,0700",TOBS,2018-10-15T00:00:00,GHCND:USC00280907,10.0 47 | ",,H,0700",TMAX,2018-10-16T00:00:00,GHCND:USC00280907,18.9 48 | ",,H,0700",TMIN,2018-10-16T00:00:00,GHCND:USC00280907,7.8 49 | ",,H,0700",TOBS,2018-10-16T00:00:00,GHCND:USC00280907,7.8 50 | ",,H,0700",TMAX,2018-10-17T00:00:00,GHCND:USC00280907,13.3 51 | ",,H,0700",TMIN,2018-10-17T00:00:00,GHCND:USC00280907,3.3 52 | ",,H,0700",TOBS,2018-10-17T00:00:00,GHCND:USC00280907,5.0 53 | ",,H,0700",TMAX,2018-10-18T00:00:00,GHCND:USC00280907,16.1 54 | ",,H,0700",TMIN,2018-10-18T00:00:00,GHCND:USC00280907,4.4 55 | ",,H,0700",TOBS,2018-10-18T00:00:00,GHCND:USC00280907,5.0 56 | ",,H,0700",TMAX,2018-10-19T00:00:00,GHCND:USC00280907,10.0 57 | ",,H,0700",TMIN,2018-10-19T00:00:00,GHCND:USC00280907,-1.1 58 | ",,H,0700",TOBS,2018-10-19T00:00:00,GHCND:USC00280907,0.0 59 | ",,H,0700",TMAX,2018-10-20T00:00:00,GHCND:USC00280907,15.0 60 | ",,H,0700",TMIN,2018-10-20T00:00:00,GHCND:USC00280907,-0.6 61 | ",,H,0700",TOBS,2018-10-20T00:00:00,GHCND:USC00280907,10.6 62 | ",,H,0700",TMAX,2018-10-21T00:00:00,GHCND:USC00280907,16.7 63 | ",,H,0700",TMIN,2018-10-21T00:00:00,GHCND:USC00280907,7.8 64 | ",,H,0700",TOBS,2018-10-21T00:00:00,GHCND:USC00280907,7.8 65 | ",,H,0700",TMAX,2018-10-22T00:00:00,GHCND:USC00280907,7.8 66 | ",,H,0700",TMIN,2018-10-22T00:00:00,GHCND:USC00280907,-1.1 67 | 
",,H,0700",TOBS,2018-10-22T00:00:00,GHCND:USC00280907,-1.1 68 | ",,H,0700",TMAX,2018-10-23T00:00:00,GHCND:USC00280907,15.6 69 | ",,H,0700",TMIN,2018-10-23T00:00:00,GHCND:USC00280907,-1.1 70 | ",,H,0700",TOBS,2018-10-23T00:00:00,GHCND:USC00280907,10.0 71 | ",,H,0700",TMAX,2018-10-24T00:00:00,GHCND:USC00280907,16.7 72 | ",,H,0700",TMIN,2018-10-24T00:00:00,GHCND:USC00280907,4.4 73 | ",,H,0700",TOBS,2018-10-24T00:00:00,GHCND:USC00280907,6.7 74 | ",,H,0700",TMAX,2018-10-25T00:00:00,GHCND:USC00280907,11.7 75 | ",,H,0700",TMIN,2018-10-25T00:00:00,GHCND:USC00280907,2.8 76 | ",,H,0700",TOBS,2018-10-25T00:00:00,GHCND:USC00280907,2.8 77 | ",,H,0700",TMAX,2018-10-26T00:00:00,GHCND:USC00280907,9.4 78 | ",,H,0700",TMIN,2018-10-26T00:00:00,GHCND:USC00280907,-0.6 79 | ",,H,0700",TOBS,2018-10-26T00:00:00,GHCND:USC00280907,-0.6 80 | ",,H,0700",TMAX,2018-10-27T00:00:00,GHCND:USC00280907,8.9 81 | ",,H,0700",TMIN,2018-10-27T00:00:00,GHCND:USC00280907,-0.6 82 | ",,H,0700",TOBS,2018-10-27T00:00:00,GHCND:USC00280907,6.1 83 | ",,H,0700",TMAX,2018-10-28T00:00:00,GHCND:USC00280907,8.3 84 | ",,H,0700",TMIN,2018-10-28T00:00:00,GHCND:USC00280907,5.0 85 | ",,H,0700",TOBS,2018-10-28T00:00:00,GHCND:USC00280907,7.2 86 | ",,H,0700",TMAX,2018-10-29T00:00:00,GHCND:USC00280907,10.6 87 | ",,H,0700",TMIN,2018-10-29T00:00:00,GHCND:USC00280907,6.7 88 | ",,H,0700",TOBS,2018-10-29T00:00:00,GHCND:USC00280907,8.3 89 | ",,H,0700",TMAX,2018-10-30T00:00:00,GHCND:USC00280907,13.3 90 | ",,H,0700",TMIN,2018-10-30T00:00:00,GHCND:USC00280907,2.2 91 | ",,H,0700",TOBS,2018-10-30T00:00:00,GHCND:USC00280907,5.0 92 | ",,H,0700",TMAX,2018-10-31T00:00:00,GHCND:USC00280907,12.2 93 | ",,H,0700",TMIN,2018-10-31T00:00:00,GHCND:USC00280907,0.0 94 | ",,H,0700",TOBS,2018-10-31T00:00:00,GHCND:USC00280907,0.0 95 | -------------------------------------------------------------------------------- /ch_03/data/nyc_temperatures.csv: -------------------------------------------------------------------------------- 1 | 
attributes,datatype,date,station,value 2 | "H,,S,",TAVG,2018-10-01T00:00:00,GHCND:USW00014732,21.2 3 | ",,W,2400",TMAX,2018-10-01T00:00:00,GHCND:USW00014732,25.6 4 | ",,W,2400",TMIN,2018-10-01T00:00:00,GHCND:USW00014732,18.3 5 | "H,,S,",TAVG,2018-10-02T00:00:00,GHCND:USW00014732,22.7 6 | ",,W,2400",TMAX,2018-10-02T00:00:00,GHCND:USW00014732,26.1 7 | ",,W,2400",TMIN,2018-10-02T00:00:00,GHCND:USW00014732,19.4 8 | "H,,S,",TAVG,2018-10-03T00:00:00,GHCND:USW00014732,21.8 9 | ",,W,2400",TMAX,2018-10-03T00:00:00,GHCND:USW00014732,25.0 10 | ",,W,2400",TMIN,2018-10-03T00:00:00,GHCND:USW00014732,18.9 11 | "H,,S,",TAVG,2018-10-04T00:00:00,GHCND:USW00014732,21.3 12 | ",,W,2400",TMAX,2018-10-04T00:00:00,GHCND:USW00014732,26.1 13 | ",,W,2400",TMIN,2018-10-04T00:00:00,GHCND:USW00014732,17.8 14 | "H,,S,",TAVG,2018-10-05T00:00:00,GHCND:USW00014732,20.3 15 | ",,W,2400",TMAX,2018-10-05T00:00:00,GHCND:USW00014732,22.8 16 | ",,W,2400",TMIN,2018-10-05T00:00:00,GHCND:USW00014732,16.1 17 | "H,,S,",TAVG,2018-10-06T00:00:00,GHCND:USW00014732,18.7 18 | ",,W,2400",TMAX,2018-10-06T00:00:00,GHCND:USW00014732,21.1 19 | ",,W,2400",TMIN,2018-10-06T00:00:00,GHCND:USW00014732,17.8 20 | "H,,S,",TAVG,2018-10-07T00:00:00,GHCND:USW00014732,22.8 21 | ",,W,2400",TMAX,2018-10-07T00:00:00,GHCND:USW00014732,27.8 22 | ",,W,2400",TMIN,2018-10-07T00:00:00,GHCND:USW00014732,21.1 23 | "H,,S,",TAVG,2018-10-08T00:00:00,GHCND:USW00014732,20.9 24 | ",,W,2400",TMAX,2018-10-08T00:00:00,GHCND:USW00014732,22.8 25 | ",,W,2400",TMIN,2018-10-08T00:00:00,GHCND:USW00014732,18.3 26 | "H,,S,",TAVG,2018-10-09T00:00:00,GHCND:USW00014732,21.8 27 | ",,W,2400",TMAX,2018-10-09T00:00:00,GHCND:USW00014732,25.6 28 | ",,W,2400",TMIN,2018-10-09T00:00:00,GHCND:USW00014732,19.4 29 | "H,,S,",TAVG,2018-10-10T00:00:00,GHCND:USW00014732,23.8 30 | ",,W,2400",TMAX,2018-10-10T00:00:00,GHCND:USW00014732,27.8 31 | ",,W,2400",TMIN,2018-10-10T00:00:00,GHCND:USW00014732,21.7 32 | "H,,S,",TAVG,2018-10-11T00:00:00,GHCND:USW00014732,23.4 33 | 
",,W,2400",TMAX,2018-10-11T00:00:00,GHCND:USW00014732,26.7 34 | ",,W,2400",TMIN,2018-10-11T00:00:00,GHCND:USW00014732,21.7 35 | "H,,S,",TAVG,2018-10-12T00:00:00,GHCND:USW00014732,18.3 36 | ",,W,2400",TMAX,2018-10-12T00:00:00,GHCND:USW00014732,22.2 37 | ",,W,2400",TMIN,2018-10-12T00:00:00,GHCND:USW00014732,12.2 38 | "H,,S,",TAVG,2018-10-13T00:00:00,GHCND:USW00014732,12.2 39 | ",,W,2400",TMAX,2018-10-13T00:00:00,GHCND:USW00014732,15.0 40 | ",,W,2400",TMIN,2018-10-13T00:00:00,GHCND:USW00014732,9.4 41 | "H,,S,",TAVG,2018-10-14T00:00:00,GHCND:USW00014732,12.9 42 | ",,W,2400",TMAX,2018-10-14T00:00:00,GHCND:USW00014732,15.6 43 | ",,W,2400",TMIN,2018-10-14T00:00:00,GHCND:USW00014732,10.6 44 | "H,,S,",TAVG,2018-10-15T00:00:00,GHCND:USW00014732,15.8 45 | ",,W,2400",TMAX,2018-10-15T00:00:00,GHCND:USW00014732,21.1 46 | ",,W,2400",TMIN,2018-10-15T00:00:00,GHCND:USW00014732,12.8 47 | "H,,S,",TAVG,2018-10-16T00:00:00,GHCND:USW00014732,14.3 48 | ",,W,2400",TMAX,2018-10-16T00:00:00,GHCND:USW00014732,16.7 49 | ",,W,2400",TMIN,2018-10-16T00:00:00,GHCND:USW00014732,9.4 50 | "H,,S,",TAVG,2018-10-17T00:00:00,GHCND:USW00014732,13.2 51 | ",,W,2400",TMAX,2018-10-17T00:00:00,GHCND:USW00014732,17.8 52 | ",,W,2400",TMIN,2018-10-17T00:00:00,GHCND:USW00014732,8.9 53 | "H,,S,",TAVG,2018-10-18T00:00:00,GHCND:USW00014732,9.6 54 | ",,W,2400",TMAX,2018-10-18T00:00:00,GHCND:USW00014732,11.7 55 | ",,W,2400",TMIN,2018-10-18T00:00:00,GHCND:USW00014732,6.7 56 | "H,,S,",TAVG,2018-10-19T00:00:00,GHCND:USW00014732,11.3 57 | ",,W,2400",TMAX,2018-10-19T00:00:00,GHCND:USW00014732,17.2 58 | ",,W,2400",TMIN,2018-10-19T00:00:00,GHCND:USW00014732,7.2 59 | "H,,S,",TAVG,2018-10-20T00:00:00,GHCND:USW00014732,15.0 60 | ",,W,2400",TMAX,2018-10-20T00:00:00,GHCND:USW00014732,18.3 61 | ",,W,2400",TMIN,2018-10-20T00:00:00,GHCND:USW00014732,12.2 62 | "H,,S,",TAVG,2018-10-21T00:00:00,GHCND:USW00014732,10.7 63 | ",,W,2400",TMAX,2018-10-21T00:00:00,GHCND:USW00014732,12.2 64 | 
",,W,2400",TMIN,2018-10-21T00:00:00,GHCND:USW00014732,6.1 65 | "H,,S,",TAVG,2018-10-22T00:00:00,GHCND:USW00014732,8.3 66 | ",,W,2400",TMAX,2018-10-22T00:00:00,GHCND:USW00014732,11.1 67 | ",,W,2400",TMIN,2018-10-22T00:00:00,GHCND:USW00014732,5.6 68 | "H,,S,",TAVG,2018-10-23T00:00:00,GHCND:USW00014732,12.6 69 | ",,W,2400",TMAX,2018-10-23T00:00:00,GHCND:USW00014732,19.4 70 | ",,W,2400",TMIN,2018-10-23T00:00:00,GHCND:USW00014732,8.3 71 | "H,,S,",TAVG,2018-10-24T00:00:00,GHCND:USW00014732,11.0 72 | ",,W,2400",TMAX,2018-10-24T00:00:00,GHCND:USW00014732,13.3 73 | ",,W,2400",TMIN,2018-10-24T00:00:00,GHCND:USW00014732,7.8 74 | "H,,S,",TAVG,2018-10-25T00:00:00,GHCND:USW00014732,8.8 75 | ",,W,2400",TMAX,2018-10-25T00:00:00,GHCND:USW00014732,11.1 76 | ",,W,2400",TMIN,2018-10-25T00:00:00,GHCND:USW00014732,6.1 77 | "H,,S,",TAVG,2018-10-26T00:00:00,GHCND:USW00014732,7.3 78 | ",,W,2400",TMAX,2018-10-26T00:00:00,GHCND:USW00014732,10.0 79 | ",,W,2400",TMIN,2018-10-26T00:00:00,GHCND:USW00014732,5.6 80 | "H,,S,",TAVG,2018-10-27T00:00:00,GHCND:USW00014732,9.4 81 | ",,W,2400",TMAX,2018-10-27T00:00:00,GHCND:USW00014732,11.7 82 | ",,W,2400",TMIN,2018-10-27T00:00:00,GHCND:USW00014732,7.2 83 | "H,,S,",TAVG,2018-10-28T00:00:00,GHCND:USW00014732,10.2 84 | ",,W,2400",TMAX,2018-10-28T00:00:00,GHCND:USW00014732,12.2 85 | ",,W,2400",TMIN,2018-10-28T00:00:00,GHCND:USW00014732,8.3 86 | "H,,S,",TAVG,2018-10-29T00:00:00,GHCND:USW00014732,11.8 87 | ",,W,2400",TMAX,2018-10-29T00:00:00,GHCND:USW00014732,14.4 88 | ",,W,2400",TMIN,2018-10-29T00:00:00,GHCND:USW00014732,9.4 89 | "H,,S,",TAVG,2018-10-30T00:00:00,GHCND:USW00014732,10.2 90 | ",,W,2400",TMAX,2018-10-30T00:00:00,GHCND:USW00014732,13.9 91 | ",,W,2400",TMIN,2018-10-30T00:00:00,GHCND:USW00014732,7.2 92 | "H,,S,",TAVG,2018-10-31T00:00:00,GHCND:USW00014732,12.6 93 | ",,W,2400",TMAX,2018-10-31T00:00:00,GHCND:USW00014732,17.8 94 | ",,W,2400",TMIN,2018-10-31T00:00:00,GHCND:USW00014732,7.2 95 | 
-------------------------------------------------------------------------------- /ch_03/data/wide_data.csv: -------------------------------------------------------------------------------- 1 | date,TMAX,TMIN,TOBS 2 | 2018-10-01,21.1,8.9,13.9 3 | 2018-10-02,23.9,13.9,17.2 4 | 2018-10-03,25.0,15.6,16.1 5 | 2018-10-04,22.8,11.7,11.7 6 | 2018-10-05,23.3,11.7,18.9 7 | 2018-10-06,20.0,13.3,16.1 8 | 2018-10-07,20.0,16.1,20.0 9 | 2018-10-08,26.7,17.8,17.8 10 | 2018-10-09,18.9,17.2,17.8 11 | 2018-10-10,24.4,17.2,18.3 12 | 2018-10-11,26.1,17.8,21.7 13 | 2018-10-12,22.8,14.4,15.6 14 | 2018-10-13,15.6,7.2,8.3 15 | 2018-10-14,13.3,5.6,6.7 16 | 2018-10-15,13.3,6.7,10.0 17 | 2018-10-16,18.9,7.8,7.8 18 | 2018-10-17,13.3,3.3,5.0 19 | 2018-10-18,16.1,4.4,5.0 20 | 2018-10-19,10.0,-1.1,0.0 21 | 2018-10-20,15.0,-0.6,10.6 22 | 2018-10-21,16.7,7.8,7.8 23 | 2018-10-22,7.8,-1.1,-1.1 24 | 2018-10-23,15.6,-1.1,10.0 25 | 2018-10-24,16.7,4.4,6.7 26 | 2018-10-25,11.7,2.8,2.8 27 | 2018-10-26,9.4,-0.6,-0.6 28 | 2018-10-27,8.9,-0.6,6.1 29 | 2018-10-28,8.3,5.0,7.2 30 | 2018-10-29,10.6,6.7,8.3 31 | 2018-10-30,13.3,2.2,5.0 32 | 2018-10-31,12.2,0.0,0.0 33 | -------------------------------------------------------------------------------- /ch_03/exercises/fb.csv: -------------------------------------------------------------------------------- 1 | date,open,high,low,close,volume 2 | 2018-01-02,177.68,181.58,177.55,181.42,18151903 3 | 2018-01-03,181.88,184.78,181.33,184.67,16886563 4 | 2018-01-04,184.9,186.21,184.0996,184.33,13880896 5 | 2018-01-05,185.59,186.9,184.93,186.85,13574535 6 | 2018-01-08,187.2,188.9,186.33,188.28,17994726 7 | 2018-01-09,188.7,188.8,187.1,187.87,12393057 8 | 2018-01-10,186.94,187.89,185.63,187.84,10529894 9 | 2018-01-11,188.4,188.4,187.38,187.77,9588587 10 | 2018-01-12,178.06,181.48,177.4,179.37,77551299 11 | 2018-01-16,181.5,181.75,178.04,178.39,36183842 12 | 2018-01-17,179.26,179.32,175.8,177.6,27992376 13 | 2018-01-18,178.13,180.98,177.08,179.8,23304901 14 | 
2018-01-19,180.85,182.37,180.1702,181.29,26826540 15 | 2018-01-22,180.8,185.39,180.41,185.37,21059464 16 | 2018-01-23,186.05,189.55,185.55,189.35,25678781 17 | 2018-01-24,189.89,190.66,186.52,186.55,24334548 18 | 2018-01-25,187.95,188.62,186.6,187.48,17377740 19 | 2018-01-26,187.75,190.0,186.81,190.0,17759212 20 | 2018-01-29,188.75,188.84,185.6301,185.98,20453172 21 | 2018-01-30,183.01,188.18,181.84,187.12,20858556 22 | 2018-01-31,188.37,189.83,185.22,186.89,43275144 23 | 2018-02-01,188.22,195.32,187.89,193.09,54211293 24 | 2018-02-02,192.04,194.21,189.98,190.28,26677484 25 | 2018-02-05,186.93,190.61,180.61,181.26,33128206 26 | 2018-02-06,178.57,185.77,177.74,185.31,37758505 27 | 2018-02-07,184.15,185.0817,179.95,180.18,27601886 28 | 2018-02-08,181.01,181.84,171.4815,171.58,38478321 29 | 2018-02-09,174.76,176.9,167.18,176.11,39887626 30 | 2018-02-12,177.06,177.545,171.84,176.41,32092133 31 | 2018-02-13,175.62,175.97,173.1,173.15,21809350 32 | 2018-02-14,173.45,179.81,173.2119,179.52,28929704 33 | 2018-02-15,180.5,180.5,176.84,179.96,20922120 34 | 2018-02-16,178.99,179.88,176.3,177.36,21015610 35 | 2018-02-20,175.77,177.95,175.11,176.01,21204921 36 | 2018-02-21,176.71,181.27,176.4,177.91,23200804 37 | 2018-02-22,178.7,180.21,177.41,178.99,18464192 38 | 2018-02-23,179.9,183.39,179.51,183.29,19007288 39 | 2018-02-26,184.58,185.66,183.2228,184.93,17599703 40 | 2018-02-27,184.45,184.7,181.46,181.46,15849806 41 | 2018-02-28,182.3,182.88,178.14,178.32,18783039 42 | 2018-03-01,179.01,180.12,174.41,175.94,23201626 43 | 2018-03-02,173.29,177.11,172.99,176.62,20025905 44 | 2018-03-05,176.2,181.1475,175.89,180.4,16189280 45 | 2018-03-06,181.78,182.38,179.11,179.78,15086784 46 | 2018-03-07,178.74,183.82,178.07,183.71,19097293 47 | 2018-03-08,183.56,184.4,181.45,182.34,17225946 48 | 2018-03-09,183.91,185.51,183.21,185.23,18526292 49 | 2018-03-12,185.23,186.1,184.22,184.76,15301229 50 | 2018-03-13,185.61,185.99,181.11,181.88,18067477 51 | 
2018-03-14,182.6,184.25,181.85,184.19,16821728 52 | 2018-03-15,183.24,184.0,182.19,183.86,15645035 53 | 2018-03-16,184.49,185.33,183.41,185.09,24403438 54 | 2018-03-19,177.01,177.17,170.06,172.56,88140060 55 | 2018-03-20,167.47,170.2,161.95,168.15,129851768 56 | 2018-03-21,164.8,173.4,163.3,169.39,106598834 57 | 2018-03-22,166.13,170.27,163.72,164.89,73742979 58 | 2018-03-23,165.44,167.1,159.02,159.39,53609706 59 | 2018-03-26,160.82,161.1,149.02,160.06,126116634 60 | 2018-03-27,156.31,162.85,150.75,152.22,79116995 61 | 2018-03-28,151.65,155.88,150.8,153.03,60029170 62 | 2018-03-29,155.15,161.42,154.14,159.79,59434293 63 | 2018-04-02,157.81,159.2,154.111,155.39,36795991 64 | 2018-04-03,156.55,157.39,150.81,156.11,42543865 65 | 2018-04-04,152.025,155.56,150.51,155.1,49885584 66 | 2018-04-05,161.56,161.575,156.65,159.34,41449609 67 | 2018-04-06,157.73,161.42,156.81,157.2,41644812 68 | 2018-04-09,157.82,160.53,156.04,157.93,34915227 69 | 2018-04-10,157.93,165.98,157.01,165.04,58947041 70 | 2018-04-11,165.36,168.65,163.25,166.32,56144633 71 | 2018-04-12,166.98,167.45,163.1,163.87,38262956 72 | 2018-04-13,164.58,165.7036,163.77,164.52,19990561 73 | 2018-04-16,165.7249,165.78,163.39,164.83,18119435 74 | 2018-04-17,165.83,169.0,165.66,168.66,22743029 75 | 2018-04-18,166.88,168.12,165.77,166.36,20969568 76 | 2018-04-19,166.2,168.33,165.2,168.1,22234961 77 | 2018-04-20,167.79,168.43,165.81,166.28,19119438 78 | 2018-04-23,167.27,168.45,165.09,165.84,23088102 79 | 2018-04-24,165.43,166.1,158.19,159.69,35079926 80 | 2018-04-25,160.1448,161.06,156.19,159.69,41083581 81 | 2018-04-26,173.22,176.27,170.8,174.16,77556934 82 | 2018-04-27,176.81,177.1,172.6,173.59,29804657 83 | 2018-04-30,173.79,175.72,171.71,172.0,20750478 84 | 2018-05-01,172.0,174.02,170.23,173.86,26025932 85 | 2018-05-02,174.24599999999998,178.08,174.2,176.07,30424450 86 | 2018-05-03,175.13,176.12,172.12,174.02,24026071 87 | 2018-05-04,173.08,176.98,173.06,176.61,17677844 88 | 
2018-05-07,177.35,179.5,177.17,177.97,18697195 89 | 2018-05-08,178.25,179.04,177.11,178.92,15577211 90 | 2018-05-09,179.67,183.01,178.7807,182.66,23282811 91 | 2018-05-10,183.15,186.1292,182.5,185.53,21071403 92 | 2018-05-11,184.85,188.32,184.18,186.99,21207848 93 | 2018-05-14,187.71,187.86,186.2,186.64,15646744 94 | 2018-05-15,184.88,185.29,183.2,184.32,15429433 95 | 2018-05-16,183.6952,184.32,182.66,183.2,16975495 96 | 2018-05-17,182.68,184.06,182.22,183.76,14840675 97 | 2018-05-18,183.49,184.19,182.61,182.68,13130451 98 | 2018-05-21,183.77,185.3,183.13,184.49,13532864 99 | 2018-05-22,184.93,185.42,183.43,183.8,12731419 100 | 2018-05-23,182.5,186.91,182.18,186.9,16628100 101 | 2018-05-24,185.88,186.8,185.03,185.93,12354742 102 | 2018-05-25,186.02,186.33,184.45,184.92,10965061 103 | 2018-05-29,184.34,186.81,183.71,185.74,16398937 104 | 2018-05-30,186.54,188.0,185.25,187.67,13736866 105 | 2018-05-31,187.87,192.72,187.48,191.78,30782631 106 | 2018-06-01,193.065,194.5492,192.07,193.99,17307245 107 | 2018-06-04,191.84,193.98,191.47,193.28,18939795 108 | 2018-06-05,194.3,195.0,192.62,192.94,15544294 109 | 2018-06-06,191.0252,192.53,189.11,191.34,22558920 110 | 2018-06-07,190.75,190.97,186.77,188.18,21503171 111 | 2018-06-08,187.53,189.4754,186.43,189.1,12677092 112 | 2018-06-11,188.81,192.6,188.8,191.54,12928907 113 | 2018-06-12,192.17,193.28,191.56,192.4,11562704 114 | 2018-06-13,192.74,194.5,191.91,192.41,15853821 115 | 2018-06-14,193.1,197.28,192.91,196.81,19120866 116 | 2018-06-15,195.79,197.07,194.64,195.85,21860931 117 | 2018-06-18,194.8,199.58,194.13,198.31,16826023 118 | 2018-06-19,196.2352,197.96,193.79,197.49,19993996 119 | 2018-06-20,199.1,203.55,198.805,202.0,28230933 120 | 2018-06-21,202.76,203.39,200.09,201.5,19045717 121 | 2018-06-22,201.16,202.24,199.31,201.74,17420188 122 | 2018-06-25,200.0,200.0,193.11,196.35,25275137 123 | 2018-06-26,197.6,199.1,196.23,199.0,17897576 124 | 2018-06-27,199.18,200.75,195.8,195.84,18734408 125 | 
2018-06-28,195.18,197.34,193.26,196.23,18172439 126 | 2018-06-29,197.32,197.5997,193.955,194.32,15811602 127 | 2018-07-02,193.37,197.45,192.22,197.36,13961578 128 | 2018-07-03,194.55,195.4,192.52,192.73,13489514 129 | 2018-07-05,194.74,198.65,194.03,198.45,19684193 130 | 2018-07-06,198.45,203.64,197.7,203.23,19740131 131 | 2018-07-09,204.93,205.8,202.1201,204.74,18149437 132 | 2018-07-10,204.5,204.91,202.26,203.54,13190067 133 | 2018-07-11,202.22,204.5,201.75,202.54,12927377 134 | 2018-07-12,203.43,207.08,203.19,206.92,15454706 135 | 2018-07-13,207.81,208.43,206.45,207.32,11503401 136 | 2018-07-16,207.5,208.72,206.84,207.23,11078209 137 | 2018-07-17,204.9,210.46,204.84,209.99,15349892 138 | 2018-07-18,209.82,210.99,208.44,209.36,15334907 139 | 2018-07-19,208.77,209.99,207.76,208.09,11350429 140 | 2018-07-20,208.85,211.5,208.5,209.94,16241508 141 | 2018-07-23,210.58,211.62,208.8,210.91,16731969 142 | 2018-07-24,215.11,216.2,212.6,214.67,28468681 143 | 2018-07-25,215.715,218.62,214.27,217.5,64592585 144 | 2018-07-26,174.89,180.13,173.75,176.26,169803668 145 | 2018-07-27,179.87,179.93,173.0,174.89,60073749 146 | 2018-07-30,175.3,175.3,166.56,171.06,65280787 147 | 2018-07-31,170.67,174.24,170.0,172.58,40356471 148 | 2018-08-01,173.93,175.08,170.9,171.65,34042109 149 | 2018-08-02,170.68,176.79,170.27,176.37,32399954 150 | 2018-08-03,177.69,178.85,176.15,177.78,24763434 151 | 2018-08-06,178.97,185.79,178.38,185.69,49716192 152 | 2018-08-07,186.5,188.3,183.72,183.81,33398562 153 | 2018-08-08,184.75,186.85,183.76,185.18,22205230 154 | 2018-08-09,185.8492,186.57,182.48,183.09,19732120 155 | 2018-08-10,182.04,182.1,179.42,180.26,21500410 156 | 2018-08-13,180.1,182.61,178.9,180.05,17423264 157 | 2018-08-14,180.71,181.99,178.62,181.11,19101995 158 | 2018-08-15,179.34,180.87,174.78,179.53,33020231 159 | 2018-08-16,180.42,180.5,174.01,174.7,31351784 160 | 2018-08-17,174.5,176.22,172.04,173.8,24893176 161 | 2018-08-20,174.04,174.57,170.91,172.5,21518006 162 | 
2018-08-21,172.81,174.17,171.39,172.62,19578514 163 | 2018-08-22,172.21,174.24,172.13,173.64,16894083 164 | 2018-08-23,173.09,175.55,172.83,172.9,18053567 165 | 2018-08-24,173.7,174.82,172.92,174.645,14631556 166 | 2018-08-27,175.99,178.67,175.79,177.46,17921935 167 | 2018-08-28,178.1,178.2399,175.83,176.26,15910675 168 | 2018-08-29,176.295,176.79,174.75,175.9,18678301 169 | 2018-08-30,175.9,179.7901,175.7,177.64,24216532 170 | 2018-08-31,177.15,177.62,174.9815,175.73,18065159 171 | 2018-09-04,173.5,173.89,168.8,171.16,29808971 172 | 2018-09-05,169.49,171.125,166.67,167.18,31226744 173 | 2018-09-06,166.98,166.98,160.0,162.53,41514834 174 | 2018-09-07,160.31,164.6269,160.16,163.04,24300600 175 | 2018-09-10,163.51,165.01,162.16,164.18,20197680 176 | 2018-09-11,163.94,167.19,163.72,165.94,20457088 177 | 2018-09-12,163.25,164.49,161.8,162.0,24078118 178 | 2018-09-13,162.0,163.32,160.86,161.36,25453775 179 | 2018-09-14,161.715,162.84,160.34,162.32,21770405 180 | 2018-09-17,161.92,162.06,159.77,160.58,21005321 181 | 2018-09-18,159.39,161.7639,158.8656,160.3,22465236 182 | 2018-09-19,160.08,163.44,159.48,163.06,19628996 183 | 2018-09-20,164.5,166.45,164.4722,166.02,18936038 184 | 2018-09-21,166.64,167.25,162.81,162.93,45994800 185 | 2018-09-24,161.03,165.7,160.88,165.41,19222775 186 | 2018-09-25,161.99,165.59,161.15,164.91,27622806 187 | 2018-09-26,164.3,169.3,164.21,166.95,25252231 188 | 2018-09-27,167.55,171.77,167.21,168.84,27266856 189 | 2018-09-28,168.33,168.79,162.56,164.46,34265638 190 | 2018-10-01,163.03,165.88,161.26,162.44,26407677 191 | 2018-10-02,161.58,162.28,158.67,159.33,36030977 192 | 2018-10-03,160.0,163.66,159.53,162.43,23109456 193 | 2018-10-04,161.46,161.46,157.35,158.85,25739635 194 | 2018-10-05,159.21,160.9,156.2,157.33,25744047 195 | 2018-10-08,155.54,158.34,154.39,157.25,24045968 196 | 2018-10-09,157.69,160.59,157.42,157.9,18844425 197 | 2018-10-10,156.82,157.69,151.31,151.38,30609970 198 | 2018-10-11,150.13,154.81,149.16,153.35,35338901 199 | 
2018-10-12,156.73,156.89,151.2998,153.74,25293492 200 | 2018-10-15,153.32,155.57,152.55,153.52,15433521 201 | 2018-10-16,155.4,159.46,155.01,158.78,19180095 202 | 2018-10-17,159.56,160.49,157.95,159.42,17592003 203 | 2018-10-18,158.51,158.66,153.28,154.92,21675084 204 | 2018-10-19,155.86,157.35,153.55,154.05,19761347 205 | 2018-10-22,154.76,157.34,154.46,154.78,15424658 206 | 2018-10-23,151.22,154.77,150.85,154.39,19095032 207 | 2018-10-24,154.28,154.65,145.6,146.04,27744597 208 | 2018-10-25,147.73,152.21,147.0,150.95,22105696 209 | 2018-10-26,145.82,149.0,143.8,145.37,31303341 210 | 2018-10-29,148.5,148.83,139.03,142.09,31336784 211 | 2018-10-30,139.935,146.64,139.7419,146.22,50528278 212 | 2018-10-31,155.0,156.4,148.96,151.79,60101251 213 | 2018-11-01,151.52,152.75,149.35,151.75,25640786 214 | 2018-11-02,151.8,154.13,148.96,150.35,24708695 215 | 2018-11-05,150.1,150.19,147.44,148.68,15969849 216 | 2018-11-06,149.31,150.97,148.0,149.94,16667124 217 | 2018-11-07,151.57,153.01,149.83,151.53,21877372 218 | 2018-11-08,150.49,150.94,146.74,147.87,24145814 219 | 2018-11-09,146.75,147.76,144.07,144.96,17326898 220 | 2018-11-12,144.48,145.04,140.4899,141.55,18542123 221 | 2018-11-13,142.0,144.88,141.62,142.16,15141710 222 | 2018-11-14,143.7,145.58,141.55,144.22,22068384 223 | 2018-11-15,142.33,144.84,140.83,143.85,30320280 224 | 2018-11-16,141.07,141.77,137.77,139.53,37250560 225 | 2018-11-19,137.61,137.75,131.21,131.55,44362729 226 | 2018-11-20,127.03,134.1592,126.85,132.43,41939475 227 | 2018-11-21,134.4,137.19,134.13,134.82,25469735 228 | 2018-11-23,133.65,134.5,131.2551,131.73,11886128 229 | 2018-11-26,133.0,137.0,132.78,136.38,24263640 230 | 2018-11-27,135.75,136.6126,133.71,135.0,20750318 231 | 2018-11-28,136.28,136.7899,131.85,136.76,29847505 232 | 2018-11-29,135.92,139.99,135.66,138.68,24238713 233 | 2018-11-30,138.26,140.966,137.36,140.61,25732577 234 | 2018-12-03,143.0,143.6799,140.76,141.09,24819226 235 | 2018-12-04,140.73,143.39,137.16,137.93,30307400 236 | 
2018-12-06,133.82,139.7,133.67,139.63,28218145 237 | 2018-12-07,139.25,140.87,136.6566,137.42,21195460 238 | 2018-12-10,139.6,143.05,139.01,141.85,26422173 239 | 2018-12-11,143.88,143.88,141.1,142.08,20300349 240 | 2018-12-12,143.08,147.19,142.51,144.5,23696936 241 | 2018-12-13,145.57,145.85,143.19,145.01,18148610 242 | 2018-12-14,143.34,146.01,142.51,144.06,21785820 243 | 2018-12-17,143.08,144.92,138.42,140.19,24333959 244 | 2018-12-18,141.08,145.93,139.8301,143.66,24709084 245 | 2018-12-19,141.21,144.91,132.5,133.24,57404894 246 | 2018-12-20,130.7,135.57,130.0,133.4,40297944 247 | 2018-12-21,133.39,134.9,123.42,124.95,56901491 248 | 2018-12-24,123.1,129.74,123.02,124.06,22066002 249 | 2018-12-26,126.0,134.24,125.89,134.18,39723370 250 | 2018-12-27,132.44,134.99,129.67,134.52,31202509 251 | 2018-12-28,135.34,135.92,132.2,133.2,22627569 252 | 2018-12-31,134.45,134.64,129.95,131.09,24625308 253 | -------------------------------------------------------------------------------- /ch_03/exercises/nflx.csv: -------------------------------------------------------------------------------- 1 | date,open,high,low,close,volume 2 | 2018-01-02,196.1,201.65,195.42,201.07,10966889 3 | 2018-01-03,202.05,206.21,201.5,205.05,8591369 4 | 2018-01-04,206.2,207.05,204.0006,205.63,6029616 5 | 2018-01-05,207.25,210.02,205.59,209.99,7033240 6 | 2018-01-08,210.02,212.5,208.44,212.05,5580178 7 | 2018-01-09,212.11,212.98,208.59,209.31,6125855 8 | 2018-01-10,207.57,213.64,206.91,212.52,5951486 9 | 2018-01-11,214.29,217.75,213.35,217.24,7659485 10 | 2018-01-12,217.18,222.55,216.0,221.23,8199423 11 | 2018-01-16,224.24,226.07,217.2,221.53,13516067 12 | 2018-01-17,221.0,221.15,216.32,217.5,9123056 13 | 2018-01-18,220.34,220.58,216.55,220.33,8225339 14 | 2018-01-19,222.75,223.49,218.5,220.46,10548567 15 | 2018-01-22,222.0,227.785,221.2,227.58,17703293 16 | 2018-01-23,255.05,257.71,248.02,250.29,27705332 17 | 2018-01-24,250.88,261.71,249.31099999999998,261.3,17352448 18 | 
2018-01-25,263.0,272.3,260.23,269.7,15336378 19 | 2018-01-26,271.485,274.6,268.76,274.6,11021839 20 | 2018-01-29,274.2,286.81,273.92,284.59,17529749 21 | 2018-01-30,277.0,282.73,272.7,278.8,12482852 22 | 2018-01-31,281.94,282.289,269.58,270.3,11695072 23 | 2018-02-01,266.41,271.95,263.38,265.07,9669011 24 | 2018-02-02,263.0,270.62,262.71,267.43,9123610 25 | 2018-02-05,262.0,267.899,250.03,254.26,11896053 26 | 2018-02-06,247.7,266.7,245.0,265.72,12595801 27 | 2018-02-07,266.58,272.45,264.325,264.56,8981548 28 | 2018-02-08,267.08,267.62,250.0,250.1,9306701 29 | 2018-02-09,253.85,255.7999,236.11,249.47,16906942 30 | 2018-02-12,252.14,259.15,249.0,257.95,8534906 31 | 2018-02-13,257.29,261.41,254.7,258.27,6855151 32 | 2018-02-14,260.47,269.88,260.33,266.0,10971985 33 | 2018-02-15,270.03,280.5,267.63,280.27,10759667 34 | 2018-02-16,278.73,281.96,275.69,278.52,8312380 35 | 2018-02-20,277.74,285.812,276.61,278.55,7769023 36 | 2018-02-21,282.07,286.64,280.01,281.04,9371121 37 | 2018-02-22,283.88,284.5,274.45,278.14,8891535 38 | 2018-02-23,281.0,286.0,277.81,285.93,7301809 39 | 2018-02-26,288.75,295.6475,287.01,294.16,10268633 40 | 2018-02-27,294.77,297.36,290.59,290.61,9416489 41 | 2018-02-28,293.1,295.75,290.78,291.38,7653454 42 | 2018-03-01,292.75,295.25,283.83,290.39,11932051 43 | 2018-03-02,284.65,301.18,283.23,301.05,13345313 44 | 2018-03-05,302.85,316.91,297.6,315.0,18986099 45 | 2018-03-06,319.88,325.79,316.5,325.22,18525844 46 | 2018-03-07,320.0,323.74,314.55,321.16,17132222 47 | 2018-03-08,322.2,322.9176,314.13,317.0,11340066 48 | 2018-03-09,321.33,331.44,320.23,331.44,14500219 49 | 2018-03-12,333.56,333.98,318.6,321.3,20369152 50 | 2018-03-13,323.87,325.8409,313.278,315.88,12917224 51 | 2018-03-14,318.16,323.88,317.7,321.55,10475073 52 | 2018-03-15,323.17,323.4,318.14,321.09,5642883 53 | 2018-03-16,321.42,324.11,318.37,318.45,7333734 54 | 2018-03-19,315.8,317.0,307.34,313.48,9925162 55 | 2018-03-20,313.26,319.5,312.8,317.5,5991945 56 | 
2018-03-21,316.35,319.4,314.51099999999997,316.48,5263911 57 | 2018-03-22,313.07,314.12,305.66,306.7,8063305 58 | 2018-03-23,307.41,310.73,300.36,300.94,9529948 59 | 2018-03-26,309.36,321.03,302.0,320.35,11988274 60 | 2018-03-27,322.49,322.9,297.0,300.69,12068632 61 | 2018-03-28,298.39,298.8,281.61,285.77,18972912 62 | 2018-03-29,287.0,295.35,275.9,295.35,19145522 63 | 2018-04-02,291.94,292.87,275.05,280.29,13405760 64 | 2018-04-03,285.45,291.25,278.01,283.67,12694862 65 | 2018-04-04,273.63,290.31,271.2239,288.94,12913978 66 | 2018-04-05,293.15,299.16,289.11,293.97,10655178 67 | 2018-04-06,289.1,298.85,285.65,288.85,11444777 68 | 2018-04-09,291.77,299.55,289.12,289.93,9853564 69 | 2018-04-10,297.68,298.95,291.69,298.07,10719097 70 | 2018-04-11,302.8847,311.64,301.82,303.67,14877429 71 | 2018-04-12,309.7187,311.13,306.75,309.25,10249403 72 | 2018-04-13,317.29,317.49,308.23,311.65,12046573 73 | 2018-04-16,315.99,316.1,304.0,307.78,20307921 74 | 2018-04-17,329.66,338.62,323.77,336.06,33866456 75 | 2018-04-18,336.3,338.82,331.1,334.52,11221139 76 | 2018-04-19,332.88,335.31,326.77,332.7,8438825 77 | 2018-04-20,332.22,336.51,326.0,327.77,9158655 78 | 2018-04-23,329.1499,331.22,317.08,318.69,8968015 79 | 2018-04-24,319.2168,320.249,302.31,307.02,13893217 80 | 2018-04-25,306.37,309.98,292.615,305.76,14919698 81 | 2018-04-26,310.0,316.63,305.58,313.98,9266699 82 | 2018-04-27,316.25,317.45,306.5,311.76,7074384 83 | 2018-04-30,311.07,317.88,310.118,312.46,6088787 84 | 2018-05-01,310.36,313.48,306.69,313.3,6036639 85 | 2018-05-02,311.65,317.1,310.4034,313.36,5697120 86 | 2018-05-03,312.59,312.59,305.73,311.69,6135828 87 | 2018-05-04,308.71,320.98,307.67,320.09,8209513 88 | 2018-05-07,321.9947,329.0234,319.34,326.26,7117823 89 | 2018-05-08,325.9,327.348,323.05,326.89,4735738 90 | 2018-05-09,328.79,331.95,327.51,330.3,5633444 91 | 2018-05-10,331.5,332.055,327.3438,329.6,5302254 92 | 2018-05-11,329.65,331.26,324.87,326.46,4589731 93 | 
2018-05-14,327.25,330.5038,327.04,328.53,4089800 94 | 2018-05-15,325.94,326.94,322.434,326.13,4746096 95 | 2018-05-16,326.28,329.72,325.14,328.19,3671690 96 | 2018-05-17,327.53,330.45,323.1734,325.22,4935708 97 | 2018-05-18,324.9,326.42,322.8,324.18,3577717 98 | 2018-05-21,327.11,331.88,325.45,331.82,6657326 99 | 2018-05-22,334.05,336.63,331.15,331.62,5964448 100 | 2018-05-23,329.04,345.0,328.09,344.72,10049147 101 | 2018-05-24,344.34,354.0,341.12,349.29,14758553 102 | 2018-05-25,349.9,354.36,348.83,351.29,7817400 103 | 2018-05-29,351.5,356.1,346.71,349.73,9717921 104 | 2018-05-30,352.37,354.0,349.26,353.54,5685531 105 | 2018-05-31,353.8,355.53,350.21,351.6,6921687 106 | 2018-06-01,353.88,359.99,352.82,359.93,7112292 107 | 2018-06-04,362.6847,363.0,355.51,361.81,7681995 108 | 2018-06-05,363.32,369.83,361.4124,365.8,8358045 109 | 2018-06-06,367.7848,369.6799,363.33,367.45,7712302 110 | 2018-06-07,368.54,368.7,357.8,361.4,8278040 111 | 2018-06-08,358.06,362.39,356.25,360.57,5225736 112 | 2018-06-11,361.88,365.67,360.91,361.45,4432445 113 | 2018-06-12,363.6,365.98,362.0,363.83,4290969 114 | 2018-06-13,367.53,384.2537,364.11,379.93,18222799 115 | 2018-06-14,384.27,395.03,383.25,392.87,14598333 116 | 2018-06-15,390.71,398.86,387.51,391.98,13588114 117 | 2018-06-18,387.72,393.16,386.5,390.4,6824794 118 | 2018-06-19,389.5,405.29,388.5,404.98,16697104 119 | 2018-06-20,415.15,419.4675,409.6,416.76,16494572 120 | 2018-06-21,421.38,423.2056,406.3701,415.44,18389936 121 | 2018-06-22,419.98,420.5,409.651,411.09,10428621 122 | 2018-06-25,404.69,405.99,378.75,384.48,22490922 123 | 2018-06-26,393.28,404.78,389.05,399.39,15191157 124 | 2018-06-27,407.56,411.5865,390.0,390.39,16541426 125 | 2018-06-28,395.0,396.9,387.1,395.42,12219888 126 | 2018-06-29,399.19,401.3299,390.55,391.43,9252511 127 | 2018-07-02,385.45,398.38,380.0,398.18,8142457 128 | 2018-07-03,399.49,399.98,389.5,390.52,5280344 129 | 2018-07-05,393.8003,399.24,390.86,398.39,8448937 130 | 
2018-07-06,397.45,408.6495,395.5225,408.25,8629606 131 | 2018-07-09,415.95,419.12,411.1,418.97,11127477 132 | 2018-07-10,417.24,419.44,413.08,415.63,9382944 133 | 2018-07-11,411.34,419.77,410.6,418.65,9713904 134 | 2018-07-12,415.1553,416.79,407.8,413.5,12743273 135 | 2018-07-13,409.19,410.0,395.08099999999996,395.8,15747266 136 | 2018-07-16,398.98,403.355,391.75,400.48,22959984 137 | 2018-07-17,346.95,385.0,344.0,379.48,58410362 138 | 2018-07-18,381.24,383.13,372.3552,375.13,21746266 139 | 2018-07-19,371.06,375.749,363.0,364.23,16878681 140 | 2018-07-20,364.92,370.5,360.14,361.05,15113740 141 | 2018-07-23,359.1453,363.9,353.6,362.66,11505232 142 | 2018-07-24,366.94,367.4,354.56,357.32,12851457 143 | 2018-07-25,357.57,363.28,355.65,362.87,8516248 144 | 2018-07-26,358.19,365.54,356.625,363.09,6993684 145 | 2018-07-27,366.85,367.0,351.65,355.21,8949491 146 | 2018-07-30,351.93,352.03,334.0201,334.96,18260710 147 | 2018-07-31,331.51,342.5,328.0,337.45,14085369 148 | 2018-08-01,335.87,344.41,334.02,338.38,7790477 149 | 2018-08-02,337.23,345.0,334.71,344.5,7131328 150 | 2018-08-03,347.75,347.86,338.4768,343.09,8848367 151 | 2018-08-06,342.8653,351.98,341.74,350.92,8198076 152 | 2018-08-07,353.23,357.31,349.01,351.83,7970930 153 | 2018-08-08,352.21,352.29,346.61,347.61,5402465 154 | 2018-08-09,347.96,352.439,345.8157,349.36,4820313 155 | 2018-08-10,346.91,349.1,344.4233,345.87,4337481 156 | 2018-08-13,339.89,347.19,339.07,341.31,6893649 157 | 2018-08-14,342.09,342.41,336.25,337.49,5805182 158 | 2018-08-15,334.03,335.4962,321.0,326.4,11784485 159 | 2018-08-16,329.9,331.17,321.2138,322.44,6689733 160 | 2018-08-17,319.01,324.365,312.96,316.78,10407908 161 | 2018-08-20,314.64,331.6,310.928,327.73,13591100 162 | 2018-08-21,331.0,341.5,329.7,338.02,14783246 163 | 2018-08-22,338.49,346.21,337.406,344.44,8930669 164 | 2018-08-23,348.11,350.08,337.65,339.17,11336425 165 | 2018-08-24,346.0,359.15,344.541,358.82,14729137 166 | 2018-08-27,367.1453,374.49,360.0,364.58,17427304 167 | 
2018-08-28,367.23,369.99,360.38,368.49,9222617 168 | 2018-08-29,367.2,369.86,362.91,368.04,8118734 169 | 2018-08-30,365.0,376.8085,363.54,370.98,10981013 170 | 2018-08-31,370.66,376.0,367.0766,367.68,7943443 171 | 2018-09-04,366.47,368.88,361.26,363.6,7605161 172 | 2018-09-05,360.0,363.39,335.83,341.18,13092847 173 | 2018-09-06,347.44,356.0,341.99,346.46,13050156 174 | 2018-09-07,342.2,355.75,341.25,348.68,9105817 175 | 2018-09-10,352.27,352.5,343.08,348.41,5752184 176 | 2018-09-11,344.67,356.15,343.9001,355.93,6198063 177 | 2018-09-12,359.08,370.43,356.24,369.95,10480769 178 | 2018-09-13,371.91,374.09,366.84,368.15,8366122 179 | 2018-09-14,368.55,371.09,363.46,364.56,4756426 180 | 2018-09-17,364.22,367.33,349.57,350.35,7071945 181 | 2018-09-18,353.67,368.15,351.56,367.65,10413981 182 | 2018-09-19,373.95,377.61,359.17,366.96,11876841 183 | 2018-09-20,370.26,370.26,363.17,365.36,6768086 184 | 2018-09-21,366.59,372.22,360.74,361.19,11930568 185 | 2018-09-24,359.0,373.64,354.33,369.61,9322522 186 | 2018-09-25,370.23,371.34,364.49199999999996,369.43,6799816 187 | 2018-09-26,373.59,382.0,370.88,377.88,13799728 188 | 2018-09-27,379.87,383.2,376.0,380.71,7326246 189 | 2018-09-28,379.24,380.8,373.73,374.13,7114878 190 | 2018-10-01,375.85,386.11,375.59,381.43,8376560 191 | 2018-10-02,384.38,386.7999,373.83,377.14,8638717 192 | 2018-10-03,378.53,380.93,374.881,377.05,5798605 193 | 2018-10-04,375.88,375.92,360.4,363.65,9074350 194 | 2018-10-05,359.77,363.5,343.0,351.35,13522957 195 | 2018-10-08,345.18,352.945,338.11,349.1,12375496 196 | 2018-10-09,348.48,358.72,347.09,355.71,8754990 197 | 2018-10-10,353.52,355.15,325.39,325.89,17183120 198 | 2018-10-11,324.94,334.2,315.81,321.1,16082056 199 | 2018-10-12,339.57,341.3,328.9,339.56,14870830 200 | 2018-10-15,337.63,339.2057,326.93,333.13,11214956 201 | 2018-10-16,337.24,347.95,330.555,346.4,20156418 202 | 2018-10-17,378.33,380.0,356.5,364.7,32610947 203 | 2018-10-18,360.673,362.2,346.05,346.71,18461040 204 | 
2018-10-19,351.0,355.8,332.2,332.67,16717233 205 | 2018-10-22,333.1,335.8,320.34,329.54,17097175 206 | 2018-10-23,318.0,336.58,316.77,333.16,14907326 207 | 2018-10-24,332.28,333.0,300.73,301.83,19039297 208 | 2018-10-25,307.12,319.94,305.25,312.87,13346921 209 | 2018-10-26,300.51,313.99,292.3,299.83,19616041 210 | 2018-10-29,305.26,307.89,275.4,284.84,21698841 211 | 2018-10-30,275.57,290.525,271.2093,285.81,23685702 212 | 2018-10-31,297.77,311.5,295.05,301.78,20360342 213 | 2018-11-01,304.59,318.45,296.67,317.38,15121450 214 | 2018-11-02,318.0,321.88,308.33,309.1,13404646 215 | 2018-11-05,311.1,317.53,303.74,315.44,10283044 216 | 2018-11-06,314.76,320.22,305.3,310.84,9710424 217 | 2018-11-07,312.9,328.56,311.0,327.5,13328328 218 | 2018-11-08,328.0,332.0499,316.6103,317.92,11023853 219 | 2018-11-09,311.07,312.98,298.01,303.47,13480792 220 | 2018-11-12,300.0,302.49,290.63,294.07,10924827 221 | 2018-11-13,295.0,303.55,289.1,294.4,12232162 222 | 2018-11-14,300.4,301.84,278.2969,286.73,16853574 223 | 2018-11-15,285.51,292.5,282.16,290.06,9967098 224 | 2018-11-16,287.14,291.72,281.0,286.21,9099485 225 | 2018-11-19,283.79,285.09,269.15,270.6,12993797 226 | 2018-11-20,254.63,276.34,250.0,266.98,16693809 227 | 2018-11-21,274.42,275.34,261.51,262.13,11023037 228 | 2018-11-23,260.11,265.5,256.84,258.82,5245123 229 | 2018-11-26,260.55,266.25,253.8,261.43,12498560 230 | 2018-11-27,259.24,269.08,256.14,266.63,11149501 231 | 2018-11-28,271.98,284.0,263.34,282.65,14801333 232 | 2018-11-29,282.32,290.49,275.5,288.75,15431538 233 | 2018-11-30,288.0,290.81,283.061,286.13,11860117 234 | 2018-12-03,293.19,298.72,284.58,290.3,14117370 235 | 2018-12-04,288.13,295.74,274.72,275.33,12800586 236 | 2018-12-06,268.33,283.22,267.14,282.88,13074324 237 | 2018-12-07,282.48,284.209,263.38,265.14,12466711 238 | 2018-12-10,264.19,271.18,260.6094,269.7,9605553 239 | 2018-12-11,274.08,274.5,262.76,265.32,9843199 240 | 2018-12-12,267.66,281.7695,266.48,274.88,11456716 241 | 
2018-12-13,277.64,279.32,271.85,276.02,8379292 242 | 2018-12-14,271.81,277.6665,265.0,266.84,9915319 243 | 2018-12-17,266.51,272.98,261.075,262.8,9634734 244 | 2018-12-18,263.3,275.75,263.29,270.94,10350079 245 | 2018-12-19,269.96,280.87,263.77,266.77,13788448 246 | 2018-12-20,264.64,269.9,251.88,260.58,16792928 247 | 2018-12-21,263.83,264.5,241.29,246.39,21397595 248 | 2018-12-24,242.0,250.65,233.68,233.88,9547616 249 | 2018-12-26,233.92,254.5,231.23,253.67,14402735 250 | 2018-12-27,250.11,255.59,240.1,255.565,12235217 251 | 2018-12-28,257.94,261.9144,249.8,256.08,10987286 252 | 2018-12-31,260.16,270.1001,260.0,267.66,13508920 253 | -------------------------------------------------------------------------------- /ch_04/0-weather_data_collection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Collecting weather data from an API\n", 8 | "\n", 9 | "## About the data\n", 10 | "In this notebook, we will be collecting daily weather data from the [National Centers for Environmental Information (NCEI) API](https://www.ncdc.noaa.gov/cdo-web/webservices/v2). We will use the Global Historical Climatology Network - Daily (GHCND) data set; see the documentation [here](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf).\n", 11 | "\n", 12 | "*Note: The NCEI is part of the National Oceanic and Atmospheric Administration (NOAA) and, as you can see from the URL for the API, this resource was created when the NCEI was called the NCDC. Should the URL for this resource change in the future, you can search for the NCEI weather API to find the updated one.*\n", 13 | "\n", 14 | "## Using the NCEI API\n", 15 | "Paste your token below." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import requests\n", 25 | "\n", 26 | "def make_request(endpoint, payload=None):\n", 27 | " \"\"\"\n", 28 | " Make a request to a specific endpoint on the weather API\n", 29 | " passing headers and optional payload.\n", 30 | " \n", 31 | " Parameters:\n", 32 | " - endpoint: The endpoint of the API you want to \n", 33 | " make a GET request to.\n", 34 | " - payload: A dictionary of data to pass along \n", 35 | " with the request.\n", 36 | " \n", 37 | " Returns:\n", 38 | " Response object.\n", 39 | " \"\"\"\n", 40 | " return requests.get(\n", 41 | " f'https://www.ncdc.noaa.gov/cdo-web/api/v2/{endpoint}',\n", 42 | " headers={\n", 43 | " 'token': 'PASTE_YOUR_TOKEN_HERE'\n", 44 | " },\n", 45 | " params=payload\n", 46 | " )" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Collect All Data Points for 2018 In NYC (Various Stations)\n", 54 | "We can make a loop to query for all the data points one day at a time. 
Here we create a list of all the results:" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "'Gathering data for 2018-12-31'" 66 | ] 67 | }, 68 | "metadata": {}, 69 | "output_type": "display_data" 70 | } 71 | ], 72 | "source": [ 73 | "import datetime\n", 74 | "\n", 75 | "from IPython import display # for updating the cell dynamically\n", 76 | "\n", 77 | "current = datetime.date(2018, 1, 1)\n", 78 | "end = datetime.date(2019, 1, 1)\n", 79 | "\n", 80 | "results = []\n", 81 | "\n", 82 | "while current < end:\n", 83 | " # update the cell with status information\n", 84 | " display.clear_output(wait=True)\n", 85 | " display.display(f'Gathering data for {str(current)}')\n", 86 | " \n", 87 | " response = make_request(\n", 88 | " 'data', \n", 89 | " {\n", 90 | " 'datasetid' : 'GHCND', # Global Historical Climatology Network - Daily (GHCND) dataset\n", 91 | " 'locationid' : 'CITY:US360019', # NYC\n", 92 | " 'startdate' : current,\n", 93 | " 'enddate' : current,\n", 94 | " 'units' : 'metric',\n", 95 | " 'limit' : 1000 # max allowed\n", 96 | " }\n", 97 | " )\n", 98 | "\n", 99 | " if response.ok:\n", 100 | " # we extend the list instead of appending to avoid getting a nested list\n", 101 | " results.extend(response.json()['results'])\n", 102 | "\n", 103 | " # update the current date to avoid an infinite loop\n", 104 | " current += datetime.timedelta(days=1)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Now, we can create a dataframe with all this data. Notice there are multiple stations with values for each datatype on a given day. We don't know what the stations are, but we can look them up and add them to the data:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 136 | "\n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | "
attributesdatatypedatestationvalue
0,,N,PRCP2018-01-01T00:00:00GHCND:US1CTFR00390.0
1,,N,PRCP2018-01-01T00:00:00GHCND:US1NJBG00150.0
2,,N,SNOW2018-01-01T00:00:00GHCND:US1NJBG00150.0
3,,N,PRCP2018-01-01T00:00:00GHCND:US1NJBG00170.0
4,,N,SNOW2018-01-01T00:00:00GHCND:US1NJBG00170.0
\n", 190 | "
" 191 | ], 192 | "text/plain": [ 193 | " attributes datatype date station value\n", 194 | "0 ,,N, PRCP 2018-01-01T00:00:00 GHCND:US1CTFR0039 0.0\n", 195 | "1 ,,N, PRCP 2018-01-01T00:00:00 GHCND:US1NJBG0015 0.0\n", 196 | "2 ,,N, SNOW 2018-01-01T00:00:00 GHCND:US1NJBG0015 0.0\n", 197 | "3 ,,N, PRCP 2018-01-01T00:00:00 GHCND:US1NJBG0017 0.0\n", 198 | "4 ,,N, SNOW 2018-01-01T00:00:00 GHCND:US1NJBG0017 0.0" 199 | ] 200 | }, 201 | "execution_count": 3, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "import pandas as pd\n", 208 | "\n", 209 | "df = pd.DataFrame(results)\n", 210 | "df.head()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "Save this data to a file:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 4, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "df.to_csv('data/nyc_weather_2018.csv', index=False)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "and write it to the database:" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 5, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "import sqlite3\n", 243 | "\n", 244 | "with sqlite3.connect('data/weather.db') as connection:\n", 245 | " df.to_sql(\n", 246 | " 'weather', connection, index=False, if_exists='replace'\n", 247 | " )" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "For learning about merging dataframes, we will also get the data mapping station IDs to information about the station:" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 6, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "response = make_request(\n", 264 | " 'stations', \n", 265 | " {\n", 266 | " 'datasetid' : 'GHCND', # Global Historical Climatology Network - Daily (GHCND) dataset\n", 267 
| " 'locationid' : 'CITY:US360019', # NYC\n", 268 | " 'limit' : 1000 # max allowed\n", 269 | " }\n", 270 | ")\n", 271 | "\n", 272 | "stations = pd.DataFrame(response.json()['results'])[['id', 'name', 'latitude', 'longitude', 'elevation']]\n", 273 | "stations.to_csv('data/weather_stations.csv', index=False)\n", 274 | "\n", 275 | "with sqlite3.connect('data/weather.db') as connection:\n", 276 | " stations.to_sql(\n", 277 | " 'stations', connection, index=False, if_exists='replace'\n", 278 | " )" 279 | ] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.7.2" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 2 303 | } 304 | -------------------------------------------------------------------------------- /ch_04/data/fb_2018.csv: -------------------------------------------------------------------------------- 1 | date,open,high,low,close,volume 2 | 2018-01-02,177.68,181.58,177.55,181.42,18151903 3 | 2018-01-03,181.88,184.78,181.33,184.67,16886563 4 | 2018-01-04,184.9,186.21,184.0996,184.33,13880896 5 | 2018-01-05,185.59,186.9,184.93,186.85,13574535 6 | 2018-01-08,187.2,188.9,186.33,188.28,17994726 7 | 2018-01-09,188.7,188.8,187.1,187.87,12393057 8 | 2018-01-10,186.94,187.89,185.63,187.84,10529894 9 | 2018-01-11,188.4,188.4,187.38,187.77,9588587 10 | 2018-01-12,178.06,181.48,177.4,179.37,77551299 11 | 2018-01-16,181.5,181.75,178.04,178.39,36183842 12 | 2018-01-17,179.26,179.32,175.8,177.6,27992376 13 | 2018-01-18,178.13,180.98,177.08,179.8,23304901 14 | 2018-01-19,180.85,182.37,180.1702,181.29,26826540 15 | 2018-01-22,180.8,185.39,180.41,185.37,21059464 16 | 
2018-01-23,186.05,189.55,185.55,189.35,25678781 17 | 2018-01-24,189.89,190.66,186.52,186.55,24334548 18 | 2018-01-25,187.95,188.62,186.6,187.48,17377740 19 | 2018-01-26,187.75,190.0,186.81,190.0,17759212 20 | 2018-01-29,188.75,188.84,185.6301,185.98,20453172 21 | 2018-01-30,183.01,188.18,181.84,187.12,20858556 22 | 2018-01-31,188.37,189.83,185.22,186.89,43275144 23 | 2018-02-01,188.22,195.32,187.89,193.09,54211293 24 | 2018-02-02,192.04,194.21,189.98,190.28,26677484 25 | 2018-02-05,186.93,190.61,180.61,181.26,33128206 26 | 2018-02-06,178.57,185.77,177.74,185.31,37758505 27 | 2018-02-07,184.15,185.0817,179.95,180.18,27601886 28 | 2018-02-08,181.01,181.84,171.4815,171.58,38478321 29 | 2018-02-09,174.76,176.9,167.18,176.11,39887626 30 | 2018-02-12,177.06,177.545,171.84,176.41,32092133 31 | 2018-02-13,175.62,175.97,173.1,173.15,21809350 32 | 2018-02-14,173.45,179.81,173.2119,179.52,28929704 33 | 2018-02-15,180.5,180.5,176.84,179.96,20922120 34 | 2018-02-16,178.99,179.88,176.3,177.36,21015610 35 | 2018-02-20,175.77,177.95,175.11,176.01,21204921 36 | 2018-02-21,176.71,181.27,176.4,177.91,23200804 37 | 2018-02-22,178.7,180.21,177.41,178.99,18464192 38 | 2018-02-23,179.9,183.39,179.51,183.29,19007288 39 | 2018-02-26,184.58,185.66,183.2228,184.93,17599703 40 | 2018-02-27,184.45,184.7,181.46,181.46,15849806 41 | 2018-02-28,182.3,182.88,178.14,178.32,18783039 42 | 2018-03-01,179.01,180.12,174.41,175.94,23201626 43 | 2018-03-02,173.29,177.11,172.99,176.62,20025905 44 | 2018-03-05,176.2,181.1475,175.89,180.4,16189280 45 | 2018-03-06,181.78,182.38,179.11,179.78,15086784 46 | 2018-03-07,178.74,183.82,178.07,183.71,19097293 47 | 2018-03-08,183.56,184.4,181.45,182.34,17225946 48 | 2018-03-09,183.91,185.51,183.21,185.23,18526292 49 | 2018-03-12,185.23,186.1,184.22,184.76,15301229 50 | 2018-03-13,185.61,185.99,181.11,181.88,18067477 51 | 2018-03-14,182.6,184.25,181.85,184.19,16821728 52 | 2018-03-15,183.24,184.0,182.19,183.86,15645035 53 | 
2018-03-16,184.49,185.33,183.41,185.09,24403438 54 | 2018-03-19,177.01,177.17,170.06,172.56,88140060 55 | 2018-03-20,167.47,170.2,161.95,168.15,129851768 56 | 2018-03-21,164.8,173.4,163.3,169.39,106598834 57 | 2018-03-22,166.13,170.27,163.72,164.89,73742979 58 | 2018-03-23,165.44,167.1,159.02,159.39,53609706 59 | 2018-03-26,160.82,161.1,149.02,160.06,126116634 60 | 2018-03-27,156.31,162.85,150.75,152.22,79116995 61 | 2018-03-28,151.65,155.88,150.8,153.03,60029170 62 | 2018-03-29,155.15,161.42,154.14,159.79,59434293 63 | 2018-04-02,157.81,159.2,154.111,155.39,36795991 64 | 2018-04-03,156.55,157.39,150.81,156.11,42543865 65 | 2018-04-04,152.025,155.56,150.51,155.1,49885584 66 | 2018-04-05,161.56,161.575,156.65,159.34,41449609 67 | 2018-04-06,157.73,161.42,156.81,157.2,41644812 68 | 2018-04-09,157.82,160.53,156.04,157.93,34915227 69 | 2018-04-10,157.93,165.98,157.01,165.04,58947041 70 | 2018-04-11,165.36,168.65,163.25,166.32,56144633 71 | 2018-04-12,166.98,167.45,163.1,163.87,38262956 72 | 2018-04-13,164.58,165.7036,163.77,164.52,19990561 73 | 2018-04-16,165.7249,165.78,163.39,164.83,18119435 74 | 2018-04-17,165.83,169.0,165.66,168.66,22743029 75 | 2018-04-18,166.88,168.12,165.77,166.36,20969568 76 | 2018-04-19,166.2,168.33,165.2,168.1,22234961 77 | 2018-04-20,167.79,168.43,165.81,166.28,19119438 78 | 2018-04-23,167.27,168.45,165.09,165.84,23088102 79 | 2018-04-24,165.43,166.1,158.19,159.69,35079926 80 | 2018-04-25,160.1448,161.06,156.19,159.69,41083581 81 | 2018-04-26,173.22,176.27,170.8,174.16,77556934 82 | 2018-04-27,176.81,177.1,172.6,173.59,29804657 83 | 2018-04-30,173.79,175.72,171.71,172.0,20750478 84 | 2018-05-01,172.0,174.02,170.23,173.86,26025932 85 | 2018-05-02,174.246,178.08,174.2,176.07,30424450 86 | 2018-05-03,175.13,176.12,172.12,174.02,24026071 87 | 2018-05-04,173.08,176.98,173.06,176.61,17677844 88 | 2018-05-07,177.35,179.5,177.17,177.97,18697195 89 | 2018-05-08,178.25,179.04,177.11,178.92,15577211 90 | 
2018-05-09,179.67,183.01,178.7807,182.66,23282811 91 | 2018-05-10,183.15,186.1292,182.5,185.53,21071403 92 | 2018-05-11,184.85,188.32,184.18,186.99,21207848 93 | 2018-05-14,187.71,187.86,186.2,186.64,15646744 94 | 2018-05-15,184.88,185.29,183.2,184.32,15429433 95 | 2018-05-16,183.6952,184.32,182.66,183.2,16975495 96 | 2018-05-17,182.68,184.06,182.22,183.76,14840675 97 | 2018-05-18,183.49,184.19,182.61,182.68,13130451 98 | 2018-05-21,183.77,185.3,183.13,184.49,13532864 99 | 2018-05-22,184.93,185.42,183.43,183.8,12731419 100 | 2018-05-23,182.5,186.91,182.18,186.9,16628100 101 | 2018-05-24,185.88,186.8,185.03,185.93,12354742 102 | 2018-05-25,186.02,186.33,184.45,184.92,10965061 103 | 2018-05-29,184.34,186.81,183.71,185.74,16398937 104 | 2018-05-30,186.54,188.0,185.25,187.67,13736866 105 | 2018-05-31,187.87,192.72,187.48,191.78,30782631 106 | 2018-06-01,193.065,194.5492,192.07,193.99,17307245 107 | 2018-06-04,191.84,193.98,191.47,193.28,18939795 108 | 2018-06-05,194.3,195.0,192.62,192.94,15544294 109 | 2018-06-06,191.0252,192.53,189.11,191.34,22558920 110 | 2018-06-07,190.75,190.97,186.77,188.18,21503171 111 | 2018-06-08,187.53,189.4754,186.43,189.1,12677092 112 | 2018-06-11,188.81,192.6,188.8,191.54,12928907 113 | 2018-06-12,192.17,193.28,191.56,192.4,11562704 114 | 2018-06-13,192.74,194.5,191.91,192.41,15853821 115 | 2018-06-14,193.1,197.28,192.91,196.81,19120866 116 | 2018-06-15,195.79,197.07,194.64,195.85,21860931 117 | 2018-06-18,194.8,199.58,194.13,198.31,16826023 118 | 2018-06-19,196.2352,197.96,193.79,197.49,19993996 119 | 2018-06-20,199.1,203.55,198.805,202.0,28230933 120 | 2018-06-21,202.76,203.39,200.09,201.5,19045717 121 | 2018-06-22,201.16,202.24,199.31,201.74,17420188 122 | 2018-06-25,200.0,200.0,193.11,196.35,25275137 123 | 2018-06-26,197.6,199.1,196.23,199.0,17897576 124 | 2018-06-27,199.18,200.75,195.8,195.84,18734408 125 | 2018-06-28,195.18,197.34,193.26,196.23,18172439 126 | 2018-06-29,197.32,197.5997,193.955,194.32,15811602 127 | 
2018-07-02,193.37,197.45,192.22,197.36,13961578 128 | 2018-07-03,194.55,195.4,192.52,192.73,13489514 129 | 2018-07-05,194.74,198.65,194.03,198.45,19684193 130 | 2018-07-06,198.45,203.64,197.7,203.23,19740131 131 | 2018-07-09,204.93,205.8,202.1201,204.74,18149437 132 | 2018-07-10,204.5,204.91,202.26,203.54,13190067 133 | 2018-07-11,202.22,204.5,201.75,202.54,12927377 134 | 2018-07-12,203.43,207.08,203.19,206.92,15454706 135 | 2018-07-13,207.81,208.43,206.45,207.32,11503401 136 | 2018-07-16,207.5,208.72,206.84,207.23,11078209 137 | 2018-07-17,204.9,210.46,204.84,209.99,15349892 138 | 2018-07-18,209.82,210.99,208.44,209.36,15334907 139 | 2018-07-19,208.77,209.99,207.76,208.09,11350429 140 | 2018-07-20,208.85,211.5,208.5,209.94,16241508 141 | 2018-07-23,210.58,211.62,208.8,210.91,16731969 142 | 2018-07-24,215.11,216.2,212.6,214.67,28468681 143 | 2018-07-25,215.715,218.62,214.27,217.5,64592585 144 | 2018-07-26,174.89,180.13,173.75,176.26,169803668 145 | 2018-07-27,179.87,179.93,173.0,174.89,60073749 146 | 2018-07-30,175.3,175.3,166.56,171.06,65280787 147 | 2018-07-31,170.67,174.24,170.0,172.58,40356471 148 | 2018-08-01,173.93,175.08,170.9,171.65,34042109 149 | 2018-08-02,170.68,176.79,170.27,176.37,32399954 150 | 2018-08-03,177.69,178.85,176.15,177.78,24763434 151 | 2018-08-06,178.97,185.79,178.38,185.69,49716192 152 | 2018-08-07,186.5,188.3,183.72,183.81,33398562 153 | 2018-08-08,184.75,186.85,183.76,185.18,22205230 154 | 2018-08-09,185.8492,186.57,182.48,183.09,19732120 155 | 2018-08-10,182.04,182.1,179.42,180.26,21500410 156 | 2018-08-13,180.1,182.61,178.9,180.05,17423264 157 | 2018-08-14,180.71,181.99,178.62,181.11,19101995 158 | 2018-08-15,179.34,180.87,174.78,179.53,33020231 159 | 2018-08-16,180.42,180.5,174.01,174.7,31351784 160 | 2018-08-17,174.5,176.22,172.04,173.8,24893176 161 | 2018-08-20,174.04,174.57,170.91,172.5,21518006 162 | 2018-08-21,172.81,174.17,171.39,172.62,19578514 163 | 2018-08-22,172.21,174.24,172.13,173.64,16894083 164 | 
2018-08-23,173.09,175.55,172.83,172.9,18053567 165 | 2018-08-24,173.7,174.82,172.92,174.645,14631556 166 | 2018-08-27,175.99,178.67,175.79,177.46,17921935 167 | 2018-08-28,178.1,178.2399,175.83,176.26,15910675 168 | 2018-08-29,176.295,176.79,174.75,175.9,18678301 169 | 2018-08-30,175.9,179.7901,175.7,177.64,24216532 170 | 2018-08-31,177.15,177.62,174.9815,175.73,18065159 171 | 2018-09-04,173.5,173.89,168.8,171.16,29808971 172 | 2018-09-05,169.49,171.125,166.67,167.18,31226744 173 | 2018-09-06,166.98,166.98,160.0,162.53,41514834 174 | 2018-09-07,160.31,164.6269,160.16,163.04,24300600 175 | 2018-09-10,163.51,165.01,162.16,164.18,20197680 176 | 2018-09-11,163.94,167.19,163.72,165.94,20457088 177 | 2018-09-12,163.25,164.49,161.8,162.0,24078118 178 | 2018-09-13,162.0,163.32,160.86,161.36,25453775 179 | 2018-09-14,161.715,162.84,160.34,162.32,21770405 180 | 2018-09-17,161.92,162.06,159.77,160.58,21005321 181 | 2018-09-18,159.39,161.7639,158.8656,160.3,22465236 182 | 2018-09-19,160.08,163.44,159.48,163.06,19628996 183 | 2018-09-20,164.5,166.45,164.4722,166.02,18936038 184 | 2018-09-21,166.64,167.25,162.81,162.93,45994800 185 | 2018-09-24,161.03,165.7,160.88,165.41,19222775 186 | 2018-09-25,161.99,165.59,161.15,164.91,27622806 187 | 2018-09-26,164.3,169.3,164.21,166.95,25252231 188 | 2018-09-27,167.55,171.77,167.21,168.84,27266856 189 | 2018-09-28,168.33,168.79,162.56,164.46,34265638 190 | 2018-10-01,163.03,165.88,161.26,162.44,26407677 191 | 2018-10-02,161.58,162.28,158.67,159.33,36030977 192 | 2018-10-03,160.0,163.66,159.53,162.43,23109456 193 | 2018-10-04,161.46,161.46,157.35,158.85,25739635 194 | 2018-10-05,159.21,160.9,156.2,157.33,25744047 195 | 2018-10-08,155.54,158.34,154.39,157.25,24045968 196 | 2018-10-09,157.69,160.59,157.42,157.9,18844425 197 | 2018-10-10,156.82,157.69,151.31,151.38,30609970 198 | 2018-10-11,150.13,154.81,149.16,153.35,35338901 199 | 2018-10-12,156.73,156.89,151.2998,153.74,25293492 200 | 2018-10-15,153.32,155.57,152.55,153.52,15433521 201 | 
2018-10-16,155.4,159.46,155.01,158.78,19180095 202 | 2018-10-17,159.56,160.49,157.95,159.42,17592003 203 | 2018-10-18,158.51,158.66,153.28,154.92,21675084 204 | 2018-10-19,155.86,157.35,153.55,154.05,19761347 205 | 2018-10-22,154.76,157.34,154.46,154.78,15424658 206 | 2018-10-23,151.22,154.77,150.85,154.39,19095032 207 | 2018-10-24,154.28,154.65,145.6,146.04,27744597 208 | 2018-10-25,147.73,152.21,147.0,150.95,22105696 209 | 2018-10-26,145.82,149.0,143.8,145.37,31303341 210 | 2018-10-29,148.5,148.83,139.03,142.09,31336784 211 | 2018-10-30,139.935,146.64,139.7419,146.22,50528278 212 | 2018-10-31,155.0,156.4,148.96,151.79,60101251 213 | 2018-11-01,151.52,152.75,149.35,151.75,25640786 214 | 2018-11-02,151.8,154.13,148.96,150.35,24708695 215 | 2018-11-05,150.1,150.19,147.44,148.68,15969849 216 | 2018-11-06,149.31,150.97,148.0,149.94,16667124 217 | 2018-11-07,151.57,153.01,149.83,151.53,21877372 218 | 2018-11-08,150.49,150.94,146.74,147.87,24145814 219 | 2018-11-09,146.75,147.76,144.07,144.96,17326898 220 | 2018-11-12,144.48,145.04,140.4899,141.55,18542123 221 | 2018-11-13,142.0,144.88,141.62,142.16,15141710 222 | 2018-11-14,143.7,145.58,141.55,144.22,22068384 223 | 2018-11-15,142.33,144.84,140.83,143.85,30320280 224 | 2018-11-16,141.07,141.77,137.77,139.53,37250560 225 | 2018-11-19,137.61,137.75,131.21,131.55,44362729 226 | 2018-11-20,127.03,134.1592,126.85,132.43,41939475 227 | 2018-11-21,134.4,137.19,134.13,134.82,25469735 228 | 2018-11-23,133.65,134.5,131.2551,131.73,11886128 229 | 2018-11-26,133.0,137.0,132.78,136.38,24263640 230 | 2018-11-27,135.75,136.6126,133.71,135.0,20750318 231 | 2018-11-28,136.28,136.7899,131.85,136.76,29847505 232 | 2018-11-29,135.92,139.99,135.66,138.68,24238713 233 | 2018-11-30,138.26,140.966,137.36,140.61,25732577 234 | 2018-12-03,143.0,143.6799,140.76,141.09,24819226 235 | 2018-12-04,140.73,143.39,137.16,137.93,30307400 236 | 2018-12-06,133.82,139.7,133.67,139.63,28218145 237 | 2018-12-07,139.25,140.87,136.6566,137.42,21195460 238 | 
2018-12-10,139.6,143.05,139.01,141.85,26422173 239 | 2018-12-11,143.88,143.88,141.1,142.08,20300349 240 | 2018-12-12,143.08,147.19,142.51,144.5,23696936 241 | 2018-12-13,145.57,145.85,143.19,145.01,18148610 242 | 2018-12-14,143.34,146.01,142.51,144.06,21785820 243 | 2018-12-17,143.08,144.92,138.42,140.19,24333959 244 | 2018-12-18,141.08,145.93,139.8301,143.66,24709084 245 | 2018-12-19,141.21,144.91,132.5,133.24,57404894 246 | 2018-12-20,130.7,135.57,130.0,133.4,40297944 247 | 2018-12-21,133.39,134.9,123.42,124.95,56901491 248 | 2018-12-24,123.1,129.74,123.02,124.06,22066002 249 | 2018-12-26,126.0,134.24,125.89,134.18,39723370 250 | 2018-12-27,132.44,134.99,129.67,134.52,31202509 251 | 2018-12-28,135.34,135.92,132.2,133.2,22627569 252 | 2018-12-31,134.45,134.64,129.95,131.09,24625308 253 | -------------------------------------------------------------------------------- /ch_04/data/stocks.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/ch_04/data/stocks.db -------------------------------------------------------------------------------- /ch_04/data/weather.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/ch_04/data/weather.db -------------------------------------------------------------------------------- /ch_04/understanding_window_calculations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Understanding Window Calculations\n", 8 | "\n", 9 | "## About the Data\n", 10 | "In this notebook, we will be working with Facebook's stock price throughout 2018 (obtained using the [`stock_analysis` 
package](https://github.com/stefmolin/stock-analysis)).\n", 11 | "\n", 12 | "## Interactive Visualizations\n", 13 | "If you follow the installation instructions [here](https://github.com/matplotlib/jupyter-matplotlib), you can run the following interactive plot to see what different window calculations look like. This requires you to install `ipympl` and `node.js` and run a few commands from the command line as indicated in the aforementioned link. Note you will need to restart the kernel.\n", 14 | "\n", 15 | "*More information on the `interact()` function can be found [here](https://ipywidgets.readthedocs.io/en/stable/examples/Using%20Interact.html).*" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 17, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "application/vnd.jupyter.widget-view+json": { 26 | "model_id": "555c8238c49c438289c1c676783bde85", 27 | "version_major": 2, 28 | "version_minor": 0 29 | }, 30 | "text/plain": [ 31 | "interactive(children=(IntSlider(value=20, description='period', max=200, step=5), Dropdown(description='window…" 32 | ] 33 | }, 34 | "metadata": {}, 35 | "output_type": "display_data" 36 | } 37 | ], 38 | "source": [ 39 | "%matplotlib widget\n", 40 | "from ipywidgets import interact\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import pandas as pd\n", 43 | "\n", 44 | "fb = pd.read_csv('data/fb_2018.csv', index_col='date', parse_dates=True)\n", 45 | "\n", 46 | "def window_calculations(df):\n", 47 | " def plot_viz(period=20, window_type='rolling', agg='mean'):\n", 48 | " ax = df.plot(y='close')\n", 49 | " window_func = getattr(df.close, window_type)\n", 50 | " if window_type == 'rolling':\n", 51 | " kwargs = {'window': period}\n", 52 | " elif window_type == 'expanding':\n", 53 | " kwargs = {'min_periods': period}\n", 54 | " elif window_type == 'ewm':\n", 55 | " kwargs = {'span': period}\n", 56 | " if agg != 'mean':\n", 57 | " print('Changing to mean')\n", 58 | " agg = 'mean'\n", 59 | " 
window_func(**kwargs).agg(agg).plot(ax=ax, label=f'{window_type} {period}D {agg}')\n", 60 | "\n", 61 | " plt.suptitle('Window Calculations on Facebook Closing Price')\n", 62 | " plt.title('(Note: EWM only works with mean)')\n", 63 | " plt.ylabel('price ($)')\n", 64 | " plt.legend()\n", 65 | " \n", 66 | " return plot_viz\n", 67 | "interact(\n", 68 | " window_calculations(fb), \n", 69 | " period=(0, 200, 5), \n", 70 | " window_type=['rolling', 'expanding', 'ewm'], \n", 71 | " agg=['sum', 'min', 'max', 'mean']\n", 72 | ");" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.7.2" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 2 97 | } 98 | -------------------------------------------------------------------------------- /ch_04/window_calc.py: -------------------------------------------------------------------------------- 1 | def window_calc(df, func, agg_dict, *args, **kwargs): 2 | """ 3 | Run a window calculation of your choice on a DataFrame. 4 | 5 | Parameters: 6 | - df: The DataFrame to run the calculation on. 7 | - func: The window calculation method that takes df 8 | as the first argument. 9 | - agg_dict: Information to pass to `agg()`, could be a 10 | dictionary mapping the columns to the aggregation 11 | function to use, a string name for the function, 12 | or the function itself. 13 | - args: Positional arguments to pass to `func`. 14 | - kwargs: Keyword arguments to pass to `func`. 15 | 16 | Returns: 17 | - A new DataFrame object. 
18 | """ 19 | return df.pipe(func, *args, **kwargs).agg(agg_dict) -------------------------------------------------------------------------------- /ch_05/data/fb_stock_prices_2018.csv: -------------------------------------------------------------------------------- 1 | date,open,high,low,close,volume 2 | 2018-01-02,177.68,181.58,177.55,181.42,18151903 3 | 2018-01-03,181.88,184.78,181.33,184.67,16886563 4 | 2018-01-04,184.9,186.21,184.0996,184.33,13880896 5 | 2018-01-05,185.59,186.9,184.93,186.85,13574535 6 | 2018-01-08,187.2,188.9,186.33,188.28,17994726 7 | 2018-01-09,188.7,188.8,187.1,187.87,12393057 8 | 2018-01-10,186.94,187.89,185.63,187.84,10529894 9 | 2018-01-11,188.4,188.4,187.38,187.77,9588587 10 | 2018-01-12,178.06,181.48,177.4,179.37,77551299 11 | 2018-01-16,181.5,181.75,178.04,178.39,36183842 12 | 2018-01-17,179.26,179.32,175.8,177.6,27992376 13 | 2018-01-18,178.13,180.98,177.08,179.8,23304901 14 | 2018-01-19,180.85,182.37,180.1702,181.29,26826540 15 | 2018-01-22,180.8,185.39,180.41,185.37,21059464 16 | 2018-01-23,186.05,189.55,185.55,189.35,25678781 17 | 2018-01-24,189.89,190.66,186.52,186.55,24334548 18 | 2018-01-25,187.95,188.62,186.6,187.48,17377740 19 | 2018-01-26,187.75,190.0,186.81,190.0,17759212 20 | 2018-01-29,188.75,188.84,185.6301,185.98,20453172 21 | 2018-01-30,183.01,188.18,181.84,187.12,20858556 22 | 2018-01-31,188.37,189.83,185.22,186.89,43275144 23 | 2018-02-01,188.22,195.32,187.89,193.09,54211293 24 | 2018-02-02,192.04,194.21,189.98,190.28,26677484 25 | 2018-02-05,186.93,190.61,180.61,181.26,33128206 26 | 2018-02-06,178.57,185.77,177.74,185.31,37758505 27 | 2018-02-07,184.15,185.0817,179.95,180.18,27601886 28 | 2018-02-08,181.01,181.84,171.4815,171.58,38478321 29 | 2018-02-09,174.76,176.9,167.18,176.11,39887626 30 | 2018-02-12,177.06,177.545,171.84,176.41,32092133 31 | 2018-02-13,175.62,175.97,173.1,173.15,21809350 32 | 2018-02-14,173.45,179.81,173.2119,179.52,28929704 33 | 2018-02-15,180.5,180.5,176.84,179.96,20922120 34 | 
2018-02-16,178.99,179.88,176.3,177.36,21015610 35 | 2018-02-20,175.77,177.95,175.11,176.01,21204921 36 | 2018-02-21,176.71,181.27,176.4,177.91,23200804 37 | 2018-02-22,178.7,180.21,177.41,178.99,18464192 38 | 2018-02-23,179.9,183.39,179.51,183.29,19007288 39 | 2018-02-26,184.58,185.66,183.2228,184.93,17599703 40 | 2018-02-27,184.45,184.7,181.46,181.46,15849806 41 | 2018-02-28,182.3,182.88,178.14,178.32,18783039 42 | 2018-03-01,179.01,180.12,174.41,175.94,23201626 43 | 2018-03-02,173.29,177.11,172.99,176.62,20025905 44 | 2018-03-05,176.2,181.1475,175.89,180.4,16189280 45 | 2018-03-06,181.78,182.38,179.11,179.78,15086784 46 | 2018-03-07,178.74,183.82,178.07,183.71,19097293 47 | 2018-03-08,183.56,184.4,181.45,182.34,17225946 48 | 2018-03-09,183.91,185.51,183.21,185.23,18526292 49 | 2018-03-12,185.23,186.1,184.22,184.76,15301229 50 | 2018-03-13,185.61,185.99,181.11,181.88,18067477 51 | 2018-03-14,182.6,184.25,181.85,184.19,16821728 52 | 2018-03-15,183.24,184.0,182.19,183.86,15645035 53 | 2018-03-16,184.49,185.33,183.41,185.09,24403438 54 | 2018-03-19,177.01,177.17,170.06,172.56,88140060 55 | 2018-03-20,167.47,170.2,161.95,168.15,129851768 56 | 2018-03-21,164.8,173.4,163.3,169.39,106598834 57 | 2018-03-22,166.13,170.27,163.72,164.89,73742979 58 | 2018-03-23,165.44,167.1,159.02,159.39,53609706 59 | 2018-03-26,160.82,161.1,149.02,160.06,126116634 60 | 2018-03-27,156.31,162.85,150.75,152.22,79116995 61 | 2018-03-28,151.65,155.88,150.8,153.03,60029170 62 | 2018-03-29,155.15,161.42,154.14,159.79,59434293 63 | 2018-04-02,157.81,159.2,154.111,155.39,36795991 64 | 2018-04-03,156.55,157.39,150.81,156.11,42543865 65 | 2018-04-04,152.025,155.56,150.51,155.1,49885584 66 | 2018-04-05,161.56,161.575,156.65,159.34,41449609 67 | 2018-04-06,157.73,161.42,156.81,157.2,41644812 68 | 2018-04-09,157.82,160.53,156.04,157.93,34915227 69 | 2018-04-10,157.93,165.98,157.01,165.04,58947041 70 | 2018-04-11,165.36,168.65,163.25,166.32,56144633 71 | 2018-04-12,166.98,167.45,163.1,163.87,38262956 72 
| 2018-04-13,164.58,165.7036,163.77,164.52,19990561 73 | 2018-04-16,165.7249,165.78,163.39,164.83,18119435 74 | 2018-04-17,165.83,169.0,165.66,168.66,22743029 75 | 2018-04-18,166.88,168.12,165.77,166.36,20969568 76 | 2018-04-19,166.2,168.33,165.2,168.1,22234961 77 | 2018-04-20,167.79,168.43,165.81,166.28,19119438 78 | 2018-04-23,167.27,168.45,165.09,165.84,23088102 79 | 2018-04-24,165.43,166.1,158.19,159.69,35079926 80 | 2018-04-25,160.1448,161.06,156.19,159.69,41083581 81 | 2018-04-26,173.22,176.27,170.8,174.16,77556934 82 | 2018-04-27,176.81,177.1,172.6,173.59,29804657 83 | 2018-04-30,173.79,175.72,171.71,172.0,20750478 84 | 2018-05-01,172.0,174.02,170.23,173.86,26025932 85 | 2018-05-02,174.246,178.08,174.2,176.07,30424450 86 | 2018-05-03,175.13,176.12,172.12,174.02,24026071 87 | 2018-05-04,173.08,176.98,173.06,176.61,17677844 88 | 2018-05-07,177.35,179.5,177.17,177.97,18697195 89 | 2018-05-08,178.25,179.04,177.11,178.92,15577211 90 | 2018-05-09,179.67,183.01,178.7807,182.66,23282811 91 | 2018-05-10,183.15,186.1292,182.5,185.53,21071403 92 | 2018-05-11,184.85,188.32,184.18,186.99,21207848 93 | 2018-05-14,187.71,187.86,186.2,186.64,15646744 94 | 2018-05-15,184.88,185.29,183.2,184.32,15429433 95 | 2018-05-16,183.6952,184.32,182.66,183.2,16975495 96 | 2018-05-17,182.68,184.06,182.22,183.76,14840675 97 | 2018-05-18,183.49,184.19,182.61,182.68,13130451 98 | 2018-05-21,183.77,185.3,183.13,184.49,13532864 99 | 2018-05-22,184.93,185.42,183.43,183.8,12731419 100 | 2018-05-23,182.5,186.91,182.18,186.9,16628100 101 | 2018-05-24,185.88,186.8,185.03,185.93,12354742 102 | 2018-05-25,186.02,186.33,184.45,184.92,10965061 103 | 2018-05-29,184.34,186.81,183.71,185.74,16398937 104 | 2018-05-30,186.54,188.0,185.25,187.67,13736866 105 | 2018-05-31,187.87,192.72,187.48,191.78,30782631 106 | 2018-06-01,193.065,194.5492,192.07,193.99,17307245 107 | 2018-06-04,191.84,193.98,191.47,193.28,18939795 108 | 2018-06-05,194.3,195.0,192.62,192.94,15544294 109 | 
2018-06-06,191.0252,192.53,189.11,191.34,22558920 110 | 2018-06-07,190.75,190.97,186.77,188.18,21503171 111 | 2018-06-08,187.53,189.4754,186.43,189.1,12677092 112 | 2018-06-11,188.81,192.6,188.8,191.54,12928907 113 | 2018-06-12,192.17,193.28,191.56,192.4,11562704 114 | 2018-06-13,192.74,194.5,191.91,192.41,15853821 115 | 2018-06-14,193.1,197.28,192.91,196.81,19120866 116 | 2018-06-15,195.79,197.07,194.64,195.85,21860931 117 | 2018-06-18,194.8,199.58,194.13,198.31,16826023 118 | 2018-06-19,196.2352,197.96,193.79,197.49,19993996 119 | 2018-06-20,199.1,203.55,198.805,202.0,28230933 120 | 2018-06-21,202.76,203.39,200.09,201.5,19045717 121 | 2018-06-22,201.16,202.24,199.31,201.74,17420188 122 | 2018-06-25,200.0,200.0,193.11,196.35,25275137 123 | 2018-06-26,197.6,199.1,196.23,199.0,17897576 124 | 2018-06-27,199.18,200.75,195.8,195.84,18734408 125 | 2018-06-28,195.18,197.34,193.26,196.23,18172439 126 | 2018-06-29,197.32,197.5997,193.955,194.32,15811602 127 | 2018-07-02,193.37,197.45,192.22,197.36,13961578 128 | 2018-07-03,194.55,195.4,192.52,192.73,13489514 129 | 2018-07-05,194.74,198.65,194.03,198.45,19684193 130 | 2018-07-06,198.45,203.64,197.7,203.23,19740131 131 | 2018-07-09,204.93,205.8,202.1201,204.74,18149437 132 | 2018-07-10,204.5,204.91,202.26,203.54,13190067 133 | 2018-07-11,202.22,204.5,201.75,202.54,12927377 134 | 2018-07-12,203.43,207.08,203.19,206.92,15454706 135 | 2018-07-13,207.81,208.43,206.45,207.32,11503401 136 | 2018-07-16,207.5,208.72,206.84,207.23,11078209 137 | 2018-07-17,204.9,210.46,204.84,209.99,15349892 138 | 2018-07-18,209.82,210.99,208.44,209.36,15334907 139 | 2018-07-19,208.77,209.99,207.76,208.09,11350429 140 | 2018-07-20,208.85,211.5,208.5,209.94,16241508 141 | 2018-07-23,210.58,211.62,208.8,210.91,16731969 142 | 2018-07-24,215.11,216.2,212.6,214.67,28468681 143 | 2018-07-25,215.715,218.62,214.27,217.5,64592585 144 | 2018-07-26,174.89,180.13,173.75,176.26,169803668 145 | 2018-07-27,179.87,179.93,173.0,174.89,60073749 146 | 
2018-07-30,175.3,175.3,166.56,171.06,65280787 147 | 2018-07-31,170.67,174.24,170.0,172.58,40356471 148 | 2018-08-01,173.93,175.08,170.9,171.65,34042109 149 | 2018-08-02,170.68,176.79,170.27,176.37,32399954 150 | 2018-08-03,177.69,178.85,176.15,177.78,24763434 151 | 2018-08-06,178.97,185.79,178.38,185.69,49716192 152 | 2018-08-07,186.5,188.3,183.72,183.81,33398562 153 | 2018-08-08,184.75,186.85,183.76,185.18,22205230 154 | 2018-08-09,185.8492,186.57,182.48,183.09,19732120 155 | 2018-08-10,182.04,182.1,179.42,180.26,21500410 156 | 2018-08-13,180.1,182.61,178.9,180.05,17423264 157 | 2018-08-14,180.71,181.99,178.62,181.11,19101995 158 | 2018-08-15,179.34,180.87,174.78,179.53,33020231 159 | 2018-08-16,180.42,180.5,174.01,174.7,31351784 160 | 2018-08-17,174.5,176.22,172.04,173.8,24893176 161 | 2018-08-20,174.04,174.57,170.91,172.5,21518006 162 | 2018-08-21,172.81,174.17,171.39,172.62,19578514 163 | 2018-08-22,172.21,174.24,172.13,173.64,16894083 164 | 2018-08-23,173.09,175.55,172.83,172.9,18053567 165 | 2018-08-24,173.7,174.82,172.92,174.645,14631556 166 | 2018-08-27,175.99,178.67,175.79,177.46,17921935 167 | 2018-08-28,178.1,178.2399,175.83,176.26,15910675 168 | 2018-08-29,176.295,176.79,174.75,175.9,18678301 169 | 2018-08-30,175.9,179.7901,175.7,177.64,24216532 170 | 2018-08-31,177.15,177.62,174.9815,175.73,18065159 171 | 2018-09-04,173.5,173.89,168.8,171.16,29808971 172 | 2018-09-05,169.49,171.125,166.67,167.18,31226744 173 | 2018-09-06,166.98,166.98,160.0,162.53,41514834 174 | 2018-09-07,160.31,164.6269,160.16,163.04,24300600 175 | 2018-09-10,163.51,165.01,162.16,164.18,20197680 176 | 2018-09-11,163.94,167.19,163.72,165.94,20457088 177 | 2018-09-12,163.25,164.49,161.8,162.0,24078118 178 | 2018-09-13,162.0,163.32,160.86,161.36,25453775 179 | 2018-09-14,161.715,162.84,160.34,162.32,21770405 180 | 2018-09-17,161.92,162.06,159.77,160.58,21005321 181 | 2018-09-18,159.39,161.7639,158.8656,160.3,22465236 182 | 2018-09-19,160.08,163.44,159.48,163.06,19628996 183 | 
2018-09-20,164.5,166.45,164.4722,166.02,18936038 184 | 2018-09-21,166.64,167.25,162.81,162.93,45994800 185 | 2018-09-24,161.03,165.7,160.88,165.41,19222775 186 | 2018-09-25,161.99,165.59,161.15,164.91,27622806 187 | 2018-09-26,164.3,169.3,164.21,166.95,25252231 188 | 2018-09-27,167.55,171.77,167.21,168.84,27266856 189 | 2018-09-28,168.33,168.79,162.56,164.46,34265638 190 | 2018-10-01,163.03,165.88,161.26,162.44,26407677 191 | 2018-10-02,161.58,162.28,158.67,159.33,36030977 192 | 2018-10-03,160.0,163.66,159.53,162.43,23109456 193 | 2018-10-04,161.46,161.46,157.35,158.85,25739635 194 | 2018-10-05,159.21,160.9,156.2,157.33,25744047 195 | 2018-10-08,155.54,158.34,154.39,157.25,24045968 196 | 2018-10-09,157.69,160.59,157.42,157.9,18844425 197 | 2018-10-10,156.82,157.69,151.31,151.38,30609970 198 | 2018-10-11,150.13,154.81,149.16,153.35,35338901 199 | 2018-10-12,156.73,156.89,151.2998,153.74,25293492 200 | 2018-10-15,153.32,155.57,152.55,153.52,15433521 201 | 2018-10-16,155.4,159.46,155.01,158.78,19180095 202 | 2018-10-17,159.56,160.49,157.95,159.42,17592003 203 | 2018-10-18,158.51,158.66,153.28,154.92,21675084 204 | 2018-10-19,155.86,157.35,153.55,154.05,19761347 205 | 2018-10-22,154.76,157.34,154.46,154.78,15424658 206 | 2018-10-23,151.22,154.77,150.85,154.39,19095032 207 | 2018-10-24,154.28,154.65,145.6,146.04,27744597 208 | 2018-10-25,147.73,152.21,147.0,150.95,22105696 209 | 2018-10-26,145.82,149.0,143.8,145.37,31303341 210 | 2018-10-29,148.5,148.83,139.03,142.09,31336784 211 | 2018-10-30,139.935,146.64,139.7419,146.22,50528278 212 | 2018-10-31,155.0,156.4,148.96,151.79,60101251 213 | 2018-11-01,151.52,152.75,149.35,151.75,25640786 214 | 2018-11-02,151.8,154.13,148.96,150.35,24708695 215 | 2018-11-05,150.1,150.19,147.44,148.68,15969849 216 | 2018-11-06,149.31,150.97,148.0,149.94,16667124 217 | 2018-11-07,151.57,153.01,149.83,151.53,21877372 218 | 2018-11-08,150.49,150.94,146.74,147.87,24145814 219 | 2018-11-09,146.75,147.76,144.07,144.96,17326898 220 | 
2018-11-12,144.48,145.04,140.4899,141.55,18542123 221 | 2018-11-13,142.0,144.88,141.62,142.16,15141710 222 | 2018-11-14,143.7,145.58,141.55,144.22,22068384 223 | 2018-11-15,142.33,144.84,140.83,143.85,30320280 224 | 2018-11-16,141.07,141.77,137.77,139.53,37250560 225 | 2018-11-19,137.61,137.75,131.21,131.55,44362729 226 | 2018-11-20,127.03,134.1592,126.85,132.43,41939475 227 | 2018-11-21,134.4,137.19,134.13,134.82,25469735 228 | 2018-11-23,133.65,134.5,131.2551,131.73,11886128 229 | 2018-11-26,133.0,137.0,132.78,136.38,24263640 230 | 2018-11-27,135.75,136.6126,133.71,135.0,20750318 231 | 2018-11-28,136.28,136.7899,131.85,136.76,29847505 232 | 2018-11-29,135.92,139.99,135.66,138.68,24238713 233 | 2018-11-30,138.26,140.966,137.36,140.61,25732577 234 | 2018-12-03,143.0,143.6799,140.76,141.09,24819226 235 | 2018-12-04,140.73,143.39,137.16,137.93,30307400 236 | 2018-12-06,133.82,139.7,133.67,139.63,28218145 237 | 2018-12-07,139.25,140.87,136.6566,137.42,21195460 238 | 2018-12-10,139.6,143.05,139.01,141.85,26422173 239 | 2018-12-11,143.88,143.88,141.1,142.08,20300349 240 | 2018-12-12,143.08,147.19,142.51,144.5,23696936 241 | 2018-12-13,145.57,145.85,143.19,145.01,18148610 242 | 2018-12-14,143.34,146.01,142.51,144.06,21785820 243 | 2018-12-17,143.08,144.92,138.42,140.19,24333959 244 | 2018-12-18,141.08,145.93,139.8301,143.66,24709084 245 | 2018-12-19,141.21,144.91,132.5,133.24,57404894 246 | 2018-12-20,130.7,135.57,130.0,133.4,40297944 247 | 2018-12-21,133.39,134.9,123.42,124.95,56901491 248 | 2018-12-24,123.1,129.74,123.02,124.06,22066002 249 | 2018-12-26,126.0,134.24,125.89,134.18,39723370 250 | 2018-12-27,132.44,134.99,129.67,134.52,31202509 251 | 2018-12-28,135.34,135.92,132.2,133.2,22627569 252 | 2018-12-31,134.45,134.64,129.95,131.09,24625308 253 | -------------------------------------------------------------------------------- /ch_06/color_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import 
matplotlib.pyplot as plt 4 | from matplotlib.colors import ListedColormap 5 | import numpy as np 6 | 7 | def hex_to_rgb_color_list(colors): 8 | """ 9 | Take color or list of hex code colors and convert them 10 | to RGB colors in the range [0,1]. 11 | 12 | Parameters: 13 | - colors: Color or list of color strings of the format 14 | '#FFF' or '#FFFFFF' 15 | 16 | Returns: 17 | The color or list of colors in RGB representation. 18 | """ 19 | if isinstance(colors, str): 20 | colors = [colors] 21 | 22 | for i, color in enumerate( 23 | [color.replace('#', '') for color in colors] 24 | ): 25 | hex_length = len(color) 26 | 27 | if hex_length not in [3, 6]: 28 | raise ValueError( 29 | 'Your colors must be of the form #FFFFFF or #FFFFFF' 30 | ) 31 | 32 | regex = '.' * (hex_length // 3) 33 | colors[i] = [ 34 | int(val * (6 // hex_length), 16)/255 \ 35 | for val in re.findall(regex, color) 36 | ] 37 | 38 | return colors[0] if len(colors) == 1 else colors 39 | 40 | def two_color_sequential_cmap(rgb_color_list): 41 | """ 42 | Created a sequential colormap blending from one color to the other. 43 | 44 | Parameters: 45 | - rgb_color_list: A list of colors represented as [R, G, B] values 46 | in the range [0, 1], like [[0, 0, 0], [1, 1, 1]], 47 | for black and white, respectively. 48 | 49 | Returns: 50 | A matplotlib ListedColormap object with your colormap. 51 | """ 52 | if not isinstance(rgb_color_list, list): 53 | raise ValueError('Colors must be passed as a list!') 54 | elif len(rgb_color_list) != 2: 55 | raise ValueError( 56 | 'Can only specify two colors; ' 57 | 'one for each end of the spectrum.' 
58 | ) 59 | elif ( 60 | not isinstance(rgb_color_list[0], list) or not isinstance(rgb_color_list[1], list) 61 | ) or ( 62 | len(rgb_color_list[0]) != 3 or len(rgb_color_list[1]) != 3 63 | ): 64 | raise ValueError('Each color should be represented as a list of size 3.') 65 | 66 | N = 256 67 | entries = 4 # red, green, blue, alpha 68 | rgbas = np.ones((N, entries)) 69 | for i in range(entries - 1): # we don't alter alphas 70 | rgbas[:, i] = np.linspace( 71 | start=rgb_color_list[0][i], 72 | stop=rgb_color_list[1][i], 73 | num=N 74 | ) 75 | 76 | return ListedColormap(rgbas) 77 | 78 | def draw_cmap(cmap): 79 | """ 80 | Draw a colorbar for visualizing a colormap. 81 | 82 | Parameters: 83 | - cmap: A matplotlib colormap 84 | 85 | Returns: 86 | A matplotlib colorbar, which you can save with: 87 | `plt.savefig(, bbox_inches='tight')` 88 | """ 89 | img = plt.imshow(np.array([[0,1]]), cmap=cmap) 90 | cbar = plt.colorbar(orientation='horizontal', cmap=cmap) 91 | img.axes.remove() 92 | return cbar 93 | 94 | def blended_cmap(rgb_color_list): 95 | """ 96 | Created a colormap blending from one color to the other. 97 | 98 | Parameters: 99 | - rgb_color_list: A list of colors represented as [R, G, B] values 100 | in the range [0, 1], like [[0, 0, 0], [1, 1, 1]], 101 | for black and white, respectively. 102 | 103 | Returns: 104 | A matplotlib ListedColormap object with your colormap. 105 | """ 106 | if not isinstance(rgb_color_list, list): 107 | raise ValueError('Colors must be passed as a list!') 108 | elif len(rgb_color_list) < 2: 109 | raise ValueError('Must specify at least 2 colors.') 110 | elif ( 111 | not isinstance(rgb_color_list[0], list) \ 112 | or not isinstance(rgb_color_list[1], list) 113 | ) or ( 114 | len(rgb_color_list[0]) != 3 or len(rgb_color_list[1]) != 3 115 | ): 116 | raise ValueError( 117 | 'Each color should be represented as a list of size 3.' 
def blended_cmap(rgb_color_list):
    """
    Create a colormap blending from one color to the next through
    every color provided.

    Parameters:
        - rgb_color_list: A list of two or more colors represented as
                          [R, G, B] values in the range [0, 1], like
                          [[0, 0, 0], [1, 1, 1]], for black and white,
                          respectively.

    Returns:
        A matplotlib ListedColormap object with your colormap.

    Raises:
        ValueError: if the input is not a list, holds fewer than two
                    colors, or any color is not a list of size 3.
    """
    if not isinstance(rgb_color_list, list):
        raise ValueError('Colors must be passed as a list!')
    elif len(rgb_color_list) < 2:
        raise ValueError('Must specify at least 2 colors.')
    elif any(
        not isinstance(color, list) or len(color) != 3
        for color in rgb_color_list
    ):
        # every color is checked (not only the first two) so a malformed
        # later entry fails here instead of partway through the blending
        raise ValueError(
            'Each color should be represented as a list of size 3.'
        )

    N = 256
    entries = 4 # red, green, blue, alpha
    rgbas = np.ones((N, entries))

    segment_count = len(rgb_color_list) - 1
    segment_size, remainder = divmod(N, segment_count)

    # N may not divide evenly among the segments; the first segment
    # takes the leftover rows so the lengths still add up to N
    segment_lengths = (
        [segment_size + remainder] + [segment_size] * (segment_count - 1)
    )

    for i in range(entries - 1): # we don't alter alphas
        rgbas[:, i] = np.concatenate([
            np.linspace(start=begin[i], stop=end[i], num=length)
            for begin, end, length in zip(
                rgb_color_list, rgb_color_list[1:], segment_lengths
            )
        ])

    return ListedColormap(rgbas)
2018-01-29,188.75,188.84,185.6301,185.98,20453172 21 | 2018-01-30,183.01,188.18,181.84,187.12,20858556 22 | 2018-01-31,188.37,189.83,185.22,186.89,43275144 23 | 2018-02-01,188.22,195.32,187.89,193.09,54211293 24 | 2018-02-02,192.04,194.21,189.98,190.28,26677484 25 | 2018-02-05,186.93,190.61,180.61,181.26,33128206 26 | 2018-02-06,178.57,185.77,177.74,185.31,37758505 27 | 2018-02-07,184.15,185.0817,179.95,180.18,27601886 28 | 2018-02-08,181.01,181.84,171.4815,171.58,38478321 29 | 2018-02-09,174.76,176.9,167.18,176.11,39887626 30 | 2018-02-12,177.06,177.545,171.84,176.41,32092133 31 | 2018-02-13,175.62,175.97,173.1,173.15,21809350 32 | 2018-02-14,173.45,179.81,173.2119,179.52,28929704 33 | 2018-02-15,180.5,180.5,176.84,179.96,20922120 34 | 2018-02-16,178.99,179.88,176.3,177.36,21015610 35 | 2018-02-20,175.77,177.95,175.11,176.01,21204921 36 | 2018-02-21,176.71,181.27,176.4,177.91,23200804 37 | 2018-02-22,178.7,180.21,177.41,178.99,18464192 38 | 2018-02-23,179.9,183.39,179.51,183.29,19007288 39 | 2018-02-26,184.58,185.66,183.2228,184.93,17599703 40 | 2018-02-27,184.45,184.7,181.46,181.46,15849806 41 | 2018-02-28,182.3,182.88,178.14,178.32,18783039 42 | 2018-03-01,179.01,180.12,174.41,175.94,23201626 43 | 2018-03-02,173.29,177.11,172.99,176.62,20025905 44 | 2018-03-05,176.2,181.1475,175.89,180.4,16189280 45 | 2018-03-06,181.78,182.38,179.11,179.78,15086784 46 | 2018-03-07,178.74,183.82,178.07,183.71,19097293 47 | 2018-03-08,183.56,184.4,181.45,182.34,17225946 48 | 2018-03-09,183.91,185.51,183.21,185.23,18526292 49 | 2018-03-12,185.23,186.1,184.22,184.76,15301229 50 | 2018-03-13,185.61,185.99,181.11,181.88,18067477 51 | 2018-03-14,182.6,184.25,181.85,184.19,16821728 52 | 2018-03-15,183.24,184.0,182.19,183.86,15645035 53 | 2018-03-16,184.49,185.33,183.41,185.09,24403438 54 | 2018-03-19,177.01,177.17,170.06,172.56,88140060 55 | 2018-03-20,167.47,170.2,161.95,168.15,129851768 56 | 2018-03-21,164.8,173.4,163.3,169.39,106598834 57 | 
2018-03-22,166.13,170.27,163.72,164.89,73742979 58 | 2018-03-23,165.44,167.1,159.02,159.39,53609706 59 | 2018-03-26,160.82,161.1,149.02,160.06,126116634 60 | 2018-03-27,156.31,162.85,150.75,152.22,79116995 61 | 2018-03-28,151.65,155.88,150.8,153.03,60029170 62 | 2018-03-29,155.15,161.42,154.14,159.79,59434293 63 | 2018-04-02,157.81,159.2,154.111,155.39,36795991 64 | 2018-04-03,156.55,157.39,150.81,156.11,42543865 65 | 2018-04-04,152.025,155.56,150.51,155.1,49885584 66 | 2018-04-05,161.56,161.575,156.65,159.34,41449609 67 | 2018-04-06,157.73,161.42,156.81,157.2,41644812 68 | 2018-04-09,157.82,160.53,156.04,157.93,34915227 69 | 2018-04-10,157.93,165.98,157.01,165.04,58947041 70 | 2018-04-11,165.36,168.65,163.25,166.32,56144633 71 | 2018-04-12,166.98,167.45,163.1,163.87,38262956 72 | 2018-04-13,164.58,165.7036,163.77,164.52,19990561 73 | 2018-04-16,165.7249,165.78,163.39,164.83,18119435 74 | 2018-04-17,165.83,169.0,165.66,168.66,22743029 75 | 2018-04-18,166.88,168.12,165.77,166.36,20969568 76 | 2018-04-19,166.2,168.33,165.2,168.1,22234961 77 | 2018-04-20,167.79,168.43,165.81,166.28,19119438 78 | 2018-04-23,167.27,168.45,165.09,165.84,23088102 79 | 2018-04-24,165.43,166.1,158.19,159.69,35079926 80 | 2018-04-25,160.1448,161.06,156.19,159.69,41083581 81 | 2018-04-26,173.22,176.27,170.8,174.16,77556934 82 | 2018-04-27,176.81,177.1,172.6,173.59,29804657 83 | 2018-04-30,173.79,175.72,171.71,172.0,20750478 84 | 2018-05-01,172.0,174.02,170.23,173.86,26025932 85 | 2018-05-02,174.246,178.08,174.2,176.07,30424450 86 | 2018-05-03,175.13,176.12,172.12,174.02,24026071 87 | 2018-05-04,173.08,176.98,173.06,176.61,17677844 88 | 2018-05-07,177.35,179.5,177.17,177.97,18697195 89 | 2018-05-08,178.25,179.04,177.11,178.92,15577211 90 | 2018-05-09,179.67,183.01,178.7807,182.66,23282811 91 | 2018-05-10,183.15,186.1292,182.5,185.53,21071403 92 | 2018-05-11,184.85,188.32,184.18,186.99,21207848 93 | 2018-05-14,187.71,187.86,186.2,186.64,15646744 94 | 
2018-05-15,184.88,185.29,183.2,184.32,15429433 95 | 2018-05-16,183.6952,184.32,182.66,183.2,16975495 96 | 2018-05-17,182.68,184.06,182.22,183.76,14840675 97 | 2018-05-18,183.49,184.19,182.61,182.68,13130451 98 | 2018-05-21,183.77,185.3,183.13,184.49,13532864 99 | 2018-05-22,184.93,185.42,183.43,183.8,12731419 100 | 2018-05-23,182.5,186.91,182.18,186.9,16628100 101 | 2018-05-24,185.88,186.8,185.03,185.93,12354742 102 | 2018-05-25,186.02,186.33,184.45,184.92,10965061 103 | 2018-05-29,184.34,186.81,183.71,185.74,16398937 104 | 2018-05-30,186.54,188.0,185.25,187.67,13736866 105 | 2018-05-31,187.87,192.72,187.48,191.78,30782631 106 | 2018-06-01,193.065,194.5492,192.07,193.99,17307245 107 | 2018-06-04,191.84,193.98,191.47,193.28,18939795 108 | 2018-06-05,194.3,195.0,192.62,192.94,15544294 109 | 2018-06-06,191.0252,192.53,189.11,191.34,22558920 110 | 2018-06-07,190.75,190.97,186.77,188.18,21503171 111 | 2018-06-08,187.53,189.4754,186.43,189.1,12677092 112 | 2018-06-11,188.81,192.6,188.8,191.54,12928907 113 | 2018-06-12,192.17,193.28,191.56,192.4,11562704 114 | 2018-06-13,192.74,194.5,191.91,192.41,15853821 115 | 2018-06-14,193.1,197.28,192.91,196.81,19120866 116 | 2018-06-15,195.79,197.07,194.64,195.85,21860931 117 | 2018-06-18,194.8,199.58,194.13,198.31,16826023 118 | 2018-06-19,196.2352,197.96,193.79,197.49,19993996 119 | 2018-06-20,199.1,203.55,198.805,202.0,28230933 120 | 2018-06-21,202.76,203.39,200.09,201.5,19045717 121 | 2018-06-22,201.16,202.24,199.31,201.74,17420188 122 | 2018-06-25,200.0,200.0,193.11,196.35,25275137 123 | 2018-06-26,197.6,199.1,196.23,199.0,17897576 124 | 2018-06-27,199.18,200.75,195.8,195.84,18734408 125 | 2018-06-28,195.18,197.34,193.26,196.23,18172439 126 | 2018-06-29,197.32,197.5997,193.955,194.32,15811602 127 | 2018-07-02,193.37,197.45,192.22,197.36,13961578 128 | 2018-07-03,194.55,195.4,192.52,192.73,13489514 129 | 2018-07-05,194.74,198.65,194.03,198.45,19684193 130 | 2018-07-06,198.45,203.64,197.7,203.23,19740131 131 | 
2018-07-09,204.93,205.8,202.1201,204.74,18149437 132 | 2018-07-10,204.5,204.91,202.26,203.54,13190067 133 | 2018-07-11,202.22,204.5,201.75,202.54,12927377 134 | 2018-07-12,203.43,207.08,203.19,206.92,15454706 135 | 2018-07-13,207.81,208.43,206.45,207.32,11503401 136 | 2018-07-16,207.5,208.72,206.84,207.23,11078209 137 | 2018-07-17,204.9,210.46,204.84,209.99,15349892 138 | 2018-07-18,209.82,210.99,208.44,209.36,15334907 139 | 2018-07-19,208.77,209.99,207.76,208.09,11350429 140 | 2018-07-20,208.85,211.5,208.5,209.94,16241508 141 | 2018-07-23,210.58,211.62,208.8,210.91,16731969 142 | 2018-07-24,215.11,216.2,212.6,214.67,28468681 143 | 2018-07-25,215.715,218.62,214.27,217.5,64592585 144 | 2018-07-26,174.89,180.13,173.75,176.26,169803668 145 | 2018-07-27,179.87,179.93,173.0,174.89,60073749 146 | 2018-07-30,175.3,175.3,166.56,171.06,65280787 147 | 2018-07-31,170.67,174.24,170.0,172.58,40356471 148 | 2018-08-01,173.93,175.08,170.9,171.65,34042109 149 | 2018-08-02,170.68,176.79,170.27,176.37,32399954 150 | 2018-08-03,177.69,178.85,176.15,177.78,24763434 151 | 2018-08-06,178.97,185.79,178.38,185.69,49716192 152 | 2018-08-07,186.5,188.3,183.72,183.81,33398562 153 | 2018-08-08,184.75,186.85,183.76,185.18,22205230 154 | 2018-08-09,185.8492,186.57,182.48,183.09,19732120 155 | 2018-08-10,182.04,182.1,179.42,180.26,21500410 156 | 2018-08-13,180.1,182.61,178.9,180.05,17423264 157 | 2018-08-14,180.71,181.99,178.62,181.11,19101995 158 | 2018-08-15,179.34,180.87,174.78,179.53,33020231 159 | 2018-08-16,180.42,180.5,174.01,174.7,31351784 160 | 2018-08-17,174.5,176.22,172.04,173.8,24893176 161 | 2018-08-20,174.04,174.57,170.91,172.5,21518006 162 | 2018-08-21,172.81,174.17,171.39,172.62,19578514 163 | 2018-08-22,172.21,174.24,172.13,173.64,16894083 164 | 2018-08-23,173.09,175.55,172.83,172.9,18053567 165 | 2018-08-24,173.7,174.82,172.92,174.645,14631556 166 | 2018-08-27,175.99,178.67,175.79,177.46,17921935 167 | 2018-08-28,178.1,178.2399,175.83,176.26,15910675 168 | 
2018-08-29,176.295,176.79,174.75,175.9,18678301 169 | 2018-08-30,175.9,179.7901,175.7,177.64,24216532 170 | 2018-08-31,177.15,177.62,174.9815,175.73,18065159 171 | 2018-09-04,173.5,173.89,168.8,171.16,29808971 172 | 2018-09-05,169.49,171.125,166.67,167.18,31226744 173 | 2018-09-06,166.98,166.98,160.0,162.53,41514834 174 | 2018-09-07,160.31,164.6269,160.16,163.04,24300600 175 | 2018-09-10,163.51,165.01,162.16,164.18,20197680 176 | 2018-09-11,163.94,167.19,163.72,165.94,20457088 177 | 2018-09-12,163.25,164.49,161.8,162.0,24078118 178 | 2018-09-13,162.0,163.32,160.86,161.36,25453775 179 | 2018-09-14,161.715,162.84,160.34,162.32,21770405 180 | 2018-09-17,161.92,162.06,159.77,160.58,21005321 181 | 2018-09-18,159.39,161.7639,158.8656,160.3,22465236 182 | 2018-09-19,160.08,163.44,159.48,163.06,19628996 183 | 2018-09-20,164.5,166.45,164.4722,166.02,18936038 184 | 2018-09-21,166.64,167.25,162.81,162.93,45994800 185 | 2018-09-24,161.03,165.7,160.88,165.41,19222775 186 | 2018-09-25,161.99,165.59,161.15,164.91,27622806 187 | 2018-09-26,164.3,169.3,164.21,166.95,25252231 188 | 2018-09-27,167.55,171.77,167.21,168.84,27266856 189 | 2018-09-28,168.33,168.79,162.56,164.46,34265638 190 | 2018-10-01,163.03,165.88,161.26,162.44,26407677 191 | 2018-10-02,161.58,162.28,158.67,159.33,36030977 192 | 2018-10-03,160.0,163.66,159.53,162.43,23109456 193 | 2018-10-04,161.46,161.46,157.35,158.85,25739635 194 | 2018-10-05,159.21,160.9,156.2,157.33,25744047 195 | 2018-10-08,155.54,158.34,154.39,157.25,24045968 196 | 2018-10-09,157.69,160.59,157.42,157.9,18844425 197 | 2018-10-10,156.82,157.69,151.31,151.38,30609970 198 | 2018-10-11,150.13,154.81,149.16,153.35,35338901 199 | 2018-10-12,156.73,156.89,151.2998,153.74,25293492 200 | 2018-10-15,153.32,155.57,152.55,153.52,15433521 201 | 2018-10-16,155.4,159.46,155.01,158.78,19180095 202 | 2018-10-17,159.56,160.49,157.95,159.42,17592003 203 | 2018-10-18,158.51,158.66,153.28,154.92,21675084 204 | 2018-10-19,155.86,157.35,153.55,154.05,19761347 205 | 
2018-10-22,154.76,157.34,154.46,154.78,15424658 206 | 2018-10-23,151.22,154.77,150.85,154.39,19095032 207 | 2018-10-24,154.28,154.65,145.6,146.04,27744597 208 | 2018-10-25,147.73,152.21,147.0,150.95,22105696 209 | 2018-10-26,145.82,149.0,143.8,145.37,31303341 210 | 2018-10-29,148.5,148.83,139.03,142.09,31336784 211 | 2018-10-30,139.935,146.64,139.7419,146.22,50528278 212 | 2018-10-31,155.0,156.4,148.96,151.79,60101251 213 | 2018-11-01,151.52,152.75,149.35,151.75,25640786 214 | 2018-11-02,151.8,154.13,148.96,150.35,24708695 215 | 2018-11-05,150.1,150.19,147.44,148.68,15969849 216 | 2018-11-06,149.31,150.97,148.0,149.94,16667124 217 | 2018-11-07,151.57,153.01,149.83,151.53,21877372 218 | 2018-11-08,150.49,150.94,146.74,147.87,24145814 219 | 2018-11-09,146.75,147.76,144.07,144.96,17326898 220 | 2018-11-12,144.48,145.04,140.4899,141.55,18542123 221 | 2018-11-13,142.0,144.88,141.62,142.16,15141710 222 | 2018-11-14,143.7,145.58,141.55,144.22,22068384 223 | 2018-11-15,142.33,144.84,140.83,143.85,30320280 224 | 2018-11-16,141.07,141.77,137.77,139.53,37250560 225 | 2018-11-19,137.61,137.75,131.21,131.55,44362729 226 | 2018-11-20,127.03,134.1592,126.85,132.43,41939475 227 | 2018-11-21,134.4,137.19,134.13,134.82,25469735 228 | 2018-11-23,133.65,134.5,131.2551,131.73,11886128 229 | 2018-11-26,133.0,137.0,132.78,136.38,24263640 230 | 2018-11-27,135.75,136.6126,133.71,135.0,20750318 231 | 2018-11-28,136.28,136.7899,131.85,136.76,29847505 232 | 2018-11-29,135.92,139.99,135.66,138.68,24238713 233 | 2018-11-30,138.26,140.966,137.36,140.61,25732577 234 | 2018-12-03,143.0,143.6799,140.76,141.09,24819226 235 | 2018-12-04,140.73,143.39,137.16,137.93,30307400 236 | 2018-12-06,133.82,139.7,133.67,139.63,28218145 237 | 2018-12-07,139.25,140.87,136.6566,137.42,21195460 238 | 2018-12-10,139.6,143.05,139.01,141.85,26422173 239 | 2018-12-11,143.88,143.88,141.1,142.08,20300349 240 | 2018-12-12,143.08,147.19,142.51,144.5,23696936 241 | 2018-12-13,145.57,145.85,143.19,145.01,18148610 242 | 
def reg_resid_plots(data):
    """
    Using seaborn, plot the regression and residuals
    plots side-by-side for every permutation of 2 columns
    in the data.

    Parameters:
        - data: A pandas DataFrame

    Returns:
        A matplotlib Figure object.

    Raises:
        ValueError: if the data has fewer than 2 columns, since there
                    is then no pair of columns to plot.
    """
    num_cols = data.shape[1]
    if num_cols < 2:
        raise ValueError(
            'Data must have at least 2 columns to plot permutations of.'
        )
    permutation_count = num_cols * (num_cols - 1)

    # one row per (x, y) permutation: regression on the left, residuals on the right
    fig, ax = plt.subplots(permutation_count, 2, figsize=(15, 8))

    for (x, y), axes, color in zip(
        itertools.permutations(data.columns, 2),
        ax,
        itertools.cycle(['royalblue', 'darkorange'])
    ):
        for subplot, func in zip(axes, (sns.regplot, sns.residplot)):
            func(x=x, y=y, data=data, ax=subplot, color=color)

    # close this specific figure rather than whichever one pyplot considers
    # current; the caller still gets the Figure object back
    plt.close(fig)
    return fig
def std_from_mean_kde(data):
    """
    Plot the KDE of the pandas series and mark the mean along with
    vertical reference lines 1, 2, and 3 standard deviations to
    either side of it.

    Parameters:
        - data: pandas Series with numeric data

    Returns:
        Matplotlib Axes object.
    """
    center = data.mean()
    spread = data.std()

    ax = data.plot(kind='kde')
    ax.axvline(center, color='b', alpha=0.2, label='mean')

    bands = list(zip(['green', 'orange', 'red'], [1, 2, 3]))

    # lines below the mean carry the label, so each band
    # appears in the legend only once
    for color, multiplier in bands:
        ax.axvline(
            center - multiplier * spread,
            color=color,
            label='{} {}{}{}'.format(
                r'$\mu$',
                r'$\pm$',
                multiplier,
                r'$\sigma$'
            ),
            alpha=0.5
        )

    # matching lines above the mean are drawn unlabeled
    for color, multiplier in bands:
        ax.axvline(
            center + multiplier * spread,
            color=color,
            label=None,
            alpha=0.5
        )

    ax.legend()
    return ax
def random_walk_stock_comparison(df, choices=(-1, 1), probs=(0.5, 0.5), seed=2):
    """
    Model a random walk from a stock's first closing price in the dataframe.
    Displays 3 random walks and the actual data in randomly assigned subplots.
    Can you find the real data?

    Parameters:
        - df: The dataframe of the real stock data. Its `close` column
              and its index are what get plotted.
        - choices: The choices of step sizes, defaults to (-1, 1).
        - probs: The probability of getting each step size,
                 defaults to (0.5, 0.5). This should be the same
                 size as choices.
        - seed: The random seed for repeatability. Note that this
                reseeds Python's global `random` module.

    Returns:
        A tuple of a string stating which subplot (A through D) holds
        the actual data and the 2x2 array of matplotlib Axes objects.
    """
    random.seed(seed)

    _, axes = plt.subplots(2, 2, figsize=(15, 10))
    stock_location = random.randint(0, 3)

    # every simulated walk starts from the real data's first closing price
    start_price = df.close.iat[0]

    for i, ax in enumerate(axes.flatten()):
        if i == stock_location:
            ax.plot(df.index, df.close)
        else:
            # one fewer step than rows, since the first point is the start price
            steps = random.choices(
                choices, weights=probs, k=len(df.index) - 1
            )
            walk = [start_price]
            for step in steps:
                walk.append(walk[-1] + step)
            ax.plot(df.index, walk)
        ax.set_ylabel('price')

        ax.set_title(string.ascii_uppercase[i])

    real_stock = f'real stock is at location {string.ascii_uppercase[stock_location]}'

    return real_stock, axes
23:49:21.857030,2018-11-15 23:53:10.857030,35.69.133.207 18 | 2018-11-16 06:36:06.769436,2018-11-16 06:39:52.769436,151.161.63.71 19 | 2018-11-17 08:34:25.007268,2018-11-17 08:38:32.007268,15.25.40.20 20 | 2018-11-18 12:54:00.448884,2018-11-18 12:58:15.448884,146.116.200.234 21 | 2018-11-19 11:37:56.140599,2018-11-19 11:42:07.140599,186.4.202.15 22 | 2018-11-19 13:55:09.201913,2018-11-19 13:56:14.201913,82.192.228.167 23 | 2018-11-19 18:55:45.842577,2018-11-19 18:59:53.842577,7.188.56.193 24 | 2018-11-20 05:41:42.433953,2018-11-20 05:44:05.433953,45.36.27.25 25 | 2018-11-20 22:52:41.179710,2018-11-20 22:56:51.179710,132.46.92.143 26 | 2018-11-21 19:52:41.242397,2018-11-21 19:56:51.242397,141.160.80.85 27 | 2018-11-22 14:41:01.817158,2018-11-22 14:42:29.817158,174.178.69.43 28 | 2018-11-23 22:13:01.743022,2018-11-23 22:16:33.743022,43.212.208.159 29 | 2018-11-24 06:54:14.868831,2018-11-24 06:58:24.868831,146.235.86.65 30 | 2018-11-25 04:57:27.351116,2018-11-25 04:57:55.351116,67.151.67.186 31 | 2018-11-25 21:33:56.485642,2018-11-25 21:38:05.485642,162.180.192.242 32 | 2018-11-25 22:18:47.466777,2018-11-25 22:23:05.466777,135.158.66.165 33 | 2018-11-26 06:04:15.277956,2018-11-26 06:05:12.277956,165.6.227.176 34 | 2018-11-26 19:33:12.128095,2018-11-26 19:37:23.128095,184.129.203.46 35 | 2018-11-27 09:38:57.760709,2018-11-27 09:39:52.760709,45.65.160.229 36 | 2018-11-27 10:02:21.919370,2018-11-27 10:05:30.919370,142.230.199.14 37 | 2018-11-27 20:34:49.331332,2018-11-27 20:36:27.331332,205.116.13.22 38 | 2018-11-29 06:52:45.020568,2018-11-29 06:53:52.020568,50.244.204.83 39 | 2018-11-29 13:58:14.376959,2018-11-29 14:02:27.376959,38.56.22.170 40 | 2018-11-29 23:12:42.663323,2018-11-29 23:16:57.663323,44.123.120.49 41 | -------------------------------------------------------------------------------- /ch_08/simulate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime as dt 3 | import os 4 | import 
def get_simulation_file_path(path_provided, directory, default_file):
    """
    Get the path to the file, creating the directory and using
    the default if necessary.

    Parameters:
        - path_provided: The path the user asked for; anything falsy
                         (None or '') means no path was given.
        - directory: The directory to fall back on, which is created
                     (along with any missing parents) if needed.
        - default_file: The file name to use inside `directory`.

    Returns:
        `path_provided` when it was given; otherwise, the path to
        `default_file` inside `directory`.
    """
    if path_provided:
        return path_provided

    # exist_ok avoids the check-then-create race and makedirs also
    # handles nested directories, which os.mkdir() cannot
    os.makedirs(directory, exist_ok=True)
    return os.path.join(directory, default_file)

def get_user_base_file_path(path_provided, default_file):
    """Get the path for a user_data directory file."""
    return get_simulation_file_path(path_provided, 'user_data', default_file)

def get_log_file_path(path_provided, default_file):
    """Get the path for a logs directory file."""
    return get_simulation_file_path(path_provided, 'logs', default_file)
if __name__ == '__main__':
    # command line argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "days", type=float, help="number of days to simulate from start"
    )
    parser.add_argument(
        "start_date", type=str,
        help="datetime to start in the form 'YYYY-MM-DD' or 'YYYY-MM-DD-HH'"
    )
    parser.add_argument(
        "-m", "--make", action='store_true', help="make userbase"
    )
    parser.add_argument(
        "-s", "--seed", type=int, help="set a seed for reproducibility"
    )
    parser.add_argument(
        "-u", "--userbase", help="file to write the userbase to"
    )
    parser.add_argument(
        "-i", "--ip", help="file to write the user-ip address map to"
    )
    parser.add_argument(
        "-l", "--log", help="file to write the attempt log to"
    )
    parser.add_argument(
        "-hl", "--hacklog", help="file to write the hack log to"
    )
    args = parser.parse_args()
    user_ip_mapping_file = get_user_base_file_path(args.ip, 'user_ips.json')

    if args.make:
        logger.warning('Creating new user base and mapping IP addresses to them.')

        user_base_file = get_user_base_file_path(args.userbase, 'user_base.txt')

        # seed the creation of userbase
        random.seed(args.seed)

        # create usernames and write to file
        sim.utils.make_userbase(user_base_file)

        # create one or more IP addresses per user and save mapping to file
        valid_users = sim.utils.get_valid_users(user_base_file)
        sim.utils.save_user_ips(
            sim.utils.assign_ip_addresses(valid_users), user_ip_mapping_file
        )

    # the dash-separated pieces become the positional arguments of datetime()
    try:
        start = dt.datetime(*map(int, args.start_date.split('-')))
    except TypeError:
        # datetime() was given too few (or too many) pieces
        logger.error('Start date must be in the format "YYYY-MM-DD"')
        raise
    except ValueError:
        # a piece was not an integer or was out of range for a date
        logger.warning(
            f'Could not interpret {args.start_date}, '
            'using January 1, 2019 at 12AM as start instead'
        )
        start = dt.datetime(2019, 1, 1)

    end = start + dt.timedelta(days=args.days)

    try:
        logger.info(f'Simulating {args.days} days...')
        simulator = sim.LoginAttemptSimulator(
            user_ip_mapping_file, start, end, seed=args.seed
        )
        simulator.simulate(attack_prob=0.05, try_all_users_prob=0.5, vary_ips=False)

        # save logs
        logger.info('Saving logs')
        simulator.save_hack_log(get_log_file_path(args.hacklog, 'attacks.csv'))
        simulator.save_log(get_log_file_path(args.log, 'log.csv'))

        logger.info('All done!')
    except Exception:
        # Exception (not a bare except) lets Ctrl-C and sys.exit() through;
        # logger.exception() records the traceback so the cause isn't lost,
        # and the non-zero exit status tells calling scripts the run failed
        logger.exception('Oops! Something went wrong...')
        raise SystemExit(1)
{"asmith": ["215.20.132.248", "207.155.244.183"], "ajones": ["111.71.144.71", "48.128.75.158", "50.37.169.241"], "akim": ["51.181.222.161", "104.244.226.133", "31.7.47.204"], "alopez": ["166.32.97.113", "122.72.229.46", "41.163.250.55"], "abrown": ["149.63.170.104", "147.227.46.197"], "bsmith": ["123.148.94.96", "95.16.133.243"], "bjones": ["45.66.76.19"], "bkim": ["200.141.120.110"], "blopez": ["214.140.230.252", "182.42.166.59", "8.138.59.112"], "bbrown": ["87.170.218.31", "51.74.112.23"], "csmith": ["37.13.63.96", "61.200.46.189", "59.18.11.99"], "cjones": ["63.245.107.31"], "ckim": ["11.217.51.133", "35.113.36.154", "179.223.92.31"], "clopez": ["239.20.51.200", "102.133.183.240", "86.104.29.81"], "cbrown": ["175.128.60.226"], "dsmith": ["89.6.241.209", "159.182.198.128", "78.6.234.40"], "djones": ["23.143.69.122", "67.158.198.212"], "dkim": ["41.0.98.171", "81.122.114.229", "193.212.16.205"], "dlopez": ["214.23.84.228", "32.132.80.228", "166.159.239.25"], "dbrown": ["96.42.66.7", "205.213.161.1"], "esmith": ["7.1.50.97"], "ejones": ["101.154.143.93"], "ekim": ["140.231.59.131"], "elopez": ["177.58.79.142"], "ebrown": ["21.20.105.132"], "fsmith": ["161.187.21.253", "234.222.190.91", "106.192.149.4"], "fjones": ["77.138.170.172"], "fkim": ["47.173.18.21", "138.83.76.148"], "flopez": ["202.66.150.58", "91.36.154.206"], "fbrown": ["153.212.55.50", "63.245.59.254"], "gsmith": ["19.154.171.79", "85.192.44.33"], "gjones": ["101.113.31.197"], "gkim": ["50.201.148.229"], "glopez": ["111.216.42.188", "112.133.85.220"], "gbrown": ["183.58.32.14"], "hsmith": ["231.103.60.254", "203.131.106.21", "110.74.53.101"], "hjones": ["193.185.77.53", "103.112.4.174"], "hkim": ["162.164.18.75", "131.79.194.150", "33.115.66.20"], "hlopez": ["7.229.169.82", "76.235.190.195"], "hbrown": ["17.46.39.218", "105.148.213.246", "198.119.10.0"], "ismith": ["93.154.130.170", "33.252.134.155", "208.196.196.31"], "ijones": ["65.122.146.171"], "ikim": ["18.246.213.72"], "ilopez": ["41.77.180.210", 
"18.238.197.234"], "ibrown": ["51.241.77.10"], "jsmith": ["67.165.53.177"], "jjones": ["196.251.56.30"], "jkim": ["239.172.63.151", "65.198.150.62", "96.19.200.227"], "jlopez": ["97.233.182.38", "22.20.248.130"], "jbrown": ["110.117.47.215"], "ksmith": ["156.58.74.218", "216.43.53.212", "32.50.212.79"], "kjones": ["15.228.220.213", "15.254.166.129", "40.180.36.62"], "kkim": ["15.176.178.91", "5.118.187.36"], "klopez": ["73.106.1.104", "63.3.150.188", "12.119.72.95"], "kbrown": ["57.244.176.132", "66.14.106.185"], "lsmith": ["94.41.52.157", "80.192.75.64"], "ljones": ["161.124.121.94"], "lkim": ["190.214.23.67", "67.215.153.213"], "llopez": ["72.216.152.181", "43.127.227.189", "29.192.209.4"], "lbrown": ["164.225.104.190", "150.241.46.94"], "msmith": ["141.57.78.228"], "mjones": ["94.215.221.89", "126.232.174.72"], "mkim": ["236.44.247.104", "150.0.229.236"], "mlopez": ["111.152.58.154"], "mbrown": ["79.217.241.47", "11.61.138.20", "227.52.129.181"], "nsmith": ["100.43.18.36", "134.156.174.60"], "njones": ["148.144.68.107", "53.210.206.142", "149.226.190.70"], "nkim": ["63.61.195.205"], "nlopez": ["239.71.153.181", "227.153.73.253", "26.110.13.181"], "nbrown": ["200.5.34.41", "202.3.184.21"], "osmith": ["1.138.149.116"], "ojones": ["147.97.53.222"], "okim": ["169.196.86.169", "215.222.75.229"], "olopez": ["75.161.66.106", "95.227.178.199", "218.251.199.112"], "obrown": ["224.104.25.198"], "psmith": ["119.43.95.186"], "pjones": ["88.119.152.44"], "pkim": ["145.180.210.234", "27.220.232.250", "130.243.110.172"], "plopez": ["21.22.26.83", "179.1.148.3"], "pbrown": ["32.219.113.203"], "qsmith": ["113.232.98.173", "52.43.163.165", "233.166.130.14"], "qjones": ["22.97.188.41", "107.177.96.103", "128.154.159.196"], "qkim": ["156.37.4.235", "43.41.123.50"], "qlopez": ["211.109.225.39"], "qbrown": ["201.20.92.127", "180.163.222.54"], "rsmith": ["146.103.151.226", "236.133.139.118", "8.60.50.88"], "rjones": ["212.127.111.145", "3.219.25.62", "196.139.60.183"], "rkim": 
["144.113.122.33"], "rlopez": ["157.167.119.191", "7.167.187.12", "66.202.79.90"], "rbrown": ["39.69.105.254", "109.120.67.119", "196.180.67.255"], "ssmith": ["13.183.250.233"], "sjones": ["6.112.83.253", "70.205.97.162"], "skim": ["196.30.106.19", "161.127.175.225"], "slopez": ["115.133.176.83", "156.8.182.29", "77.180.11.251"], "sbrown": ["31.12.123.23", "6.115.167.34", "31.176.216.69"], "tsmith": ["229.222.72.183"], "tjones": ["90.168.209.195", "4.209.135.236"], "tkim": ["62.209.199.87"], "tlopez": ["70.75.40.168"], "tbrown": ["90.126.11.86"], "usmith": ["86.40.218.53", "234.76.20.129", "174.192.13.18"], "ujones": ["45.183.149.77", "234.120.182.83"], "ukim": ["207.172.138.252", "201.7.159.147", "233.202.61.206"], "ulopez": ["17.130.148.106", "174.197.128.106"], "ubrown": ["168.124.180.83"], "vsmith": ["169.4.26.79"], "vjones": ["185.149.150.165", "64.174.4.245"], "vkim": ["131.96.35.217", "142.89.86.32", "80.56.196.222"], "vlopez": ["159.145.6.219", "143.132.162.175"], "vbrown": ["221.72.3.79"], "wsmith": ["196.184.237.18", "210.116.8.185", "81.99.181.254"], "wjones": ["215.39.229.121"], "wkim": ["230.51.96.84", "225.33.218.202", "139.129.223.182"], "wlopez": ["167.46.157.15", "203.197.223.199", "19.238.181.64"], "wbrown": ["142.167.12.203", "177.184.2.35", "97.56.241.22"], "xsmith": ["12.161.201.64", "140.208.72.75"], "xjones": ["156.30.83.64", "68.246.23.22"], "xkim": ["198.92.176.42", "41.89.135.103", "133.167.129.132"], "xlopez": ["233.79.229.78", "19.90.17.161", "36.98.233.122"], "xbrown": ["81.170.69.243", "29.42.175.1"], "ysmith": ["53.218.180.231"], "yjones": ["193.185.60.69", "162.11.93.64"], "ykim": ["172.98.22.211"], "ylopez": ["31.159.199.26", "86.183.39.209", "27.225.181.130"], "ybrown": ["158.232.211.92", "15.232.135.97", "198.32.183.49"], "zsmith": ["13.179.10.90"], "zjones": ["6.165.233.253", "134.14.49.41"], "zkim": ["183.50.241.16", "78.146.18.0"], "zlopez": ["172.80.75.82", "89.81.124.168"], "zbrown": ["123.144.168.87"], "admin": 
["180.115.83.215"], "master": ["186.70.197.6", "83.2.199.88"], "dba": ["9.12.165.1"]} -------------------------------------------------------------------------------- /ch_09/data/binaries.csv: -------------------------------------------------------------------------------- 1 | separation,name,positionangle,binaries,planets,stars 2 | 39.56,16 Cygni,133.30,1.0,1.0,3.0 3 | 3.4,16 Cygni AC,209,0.0,0.0,2.0 4 | 12.37,2M0441+2301,237.3,1.0,1.0,3.0 5 | 0.2323,2M 044145,79.61,0.0,0.0,2.0 6 | ,2M 1938+4603,,0.0,1.0,2.0 7 | 2.17,2MASS J02495639-0557352,233.1,0.0,1.0,2.0 8 | 38.00,30 Ari,275,2.0,1.0,4.0 9 | ,30 Ari A,,0.0,0.0,2.0 10 | 0.536,30 Ari BC,285.2,0.0,1.0,2.0 11 | 66.70,WDS J04376-0228,163,1.0,1.0,3.0 12 | ,GJ 3305,,0.0,0.0,2.0 13 | 84,55 Cancri,130,0.0,5.0,2.0 14 | 28.6,83 Leonis,150,0.0,2.0,2.0 15 | 49.4,91 Aquarii,313,1.0,1.0,3.0 16 | ,91 Aquarii BC,,0.0,0.0,2.0 17 | 31.60,alf Tau AB,113,0.0,1.0,2.0 18 | ,,,1.0,3.0,3.0 19 | ,Alpha Centauri,,0.0,2.0,2.0 20 | ,DP Leo,,0.0,1.0,2.0 21 | 17.25,,,0.0,1.0,2.0 22 | 24.14,,,0.0,1.0,2.0 23 | 9.78,,,0.0,1.0,2.0 24 | 21.21,EPIC 201549860,,0.0,2.0,2.0 25 | 12.3,,,0.0,1.0,2.0 26 | 1.9,EPIC 201637175,227,0.0,1.0,2.0 27 | 12.15,,,0.0,1.0,2.0 28 | 2.46,,,0.0,1.0,2.0 29 | 2.00,WDS J15576+2653,10,0.0,1.0,2.0 30 | 403.10,Epsilon Indi,88,1.0,1.0,3.0 31 | ,Epsilon Indi B,,0.0,0.0,2.0 32 | ,FL Lyr,,0.0,1.0,2.0 33 | 20407.6,MAM 1,337.91,1.0,1.0,3.0 34 | 7062.7,SHY 106,187.88,0.0,1.0,2.0 35 | 0.0754,FW Tau,3.4,0.0,1.0,2.0 36 | ,gamma Cephei,,0.0,1.0,2.0 37 | 4.60,Gamma Leonis,127,0.0,1.0,2.0 38 | 6.80,GJ 229,168,0.0,1.0,2.0 39 | ,GJ 725,,0.0,1.0,2.0 40 | ,Gliese 15,,0.0,1.0,2.0 41 | 3.90,,355,0.0,1.0,2.0 42 | 32.70,,142,1.0,7.0,3.0 43 | ,,,0.0,0.0,2.0 44 | 50.0,,135,0.0,4.0,2.0 45 | 179,,234,0.0,2.0,2.0 46 | ,Gliese 86,,0.0,1.0,2.0 47 | 0.8576,HAT-P-14,264.10,0.0,1.0,2.0 48 | 0.6916,HAT-P-16,153.83,0.0,1.0,2.0 49 | 4.9442,HAT-P-24,170.872,0.0,1.0,2.0 50 | 3.8366,HAT-P-30,4.206,0.0,1.0,2.0 51 | 2.9355,HAT-P-32,110.624,0.0,1.0,2.0 52 | 
0.3063,HAT-P-33,118.05,0.0,1.0,2.0 53 | 2.667,HAT-P-57,234,1.0,1.0,3.0 54 | 0.225,HAT-P-57 BC,165,0.0,0.0,2.0 55 | 3.8587,HAT-P-7,89.8,0.0,1.0,2.0 56 | 1.040,,137.9,1.0,1.0,3.0 57 | 15,,222,0.0,0.0,2.0 58 | ,HD 106906,,0.0,1.0,2.0 59 | 8.28,,179.75,0.0,1.0,2.0 60 | 8.05,,333,0.0,1.0,2.0 61 | 3.26,,30,0.0,1.0,2.0 62 | 29.7,HD 11964,133,0.0,2.0,2.0 63 | 41.9,HD 126614 AC,299,1.0,1.0,3.0 64 | 0.499,HD 126614 AB,60.70,0.0,1.0,2.0 65 | ,HD 131399,,2.0,1.0,4.0 66 | ,HD 131399 A,,0.0,1.0,2.0 67 | 0.10,WDS J14544-3409 B,235,0.0,0.0,2.0 68 | 4.130,HD 132563,276.95,1.0,1.0,3.0 69 | ,HD 132563 A,,0.0,0.0,2.0 70 | ,,,0.0,3.0,2.0 71 | 20.4,,130,0.0,1.0,2.0 72 | 5.4,,177,0.0,2.0,2.0 73 | 64.40,Gliese 617,14,0.0,1.0,2.0 74 | 345,,245,0.0,1.0,2.0 75 | 4.80,HD 156846,73,0.0,1.0,2.0 76 | 6.2,,187,0.0,1.0,2.0 77 | ,HD 176051,,0.0,1.0,2.0 78 | 1.645,HD 177830,84.85,0.0,2.0,2.0 79 | 16.1,,82,1.0,1.0,3.0 80 | ,,,0.0,0.0,2.0 81 | 75.80,LDS 6334,152,0.0,2.0,2.0 82 | 4.547,HD 185269,8.15,1.0,1.0,3.0 83 | 0.0956,,221.1,0.0,0.0,2.0 84 | 13,,85,0.0,1.0,2.0 85 | 11.2,,246,0.0,1.0,2.0 86 | 3.5,,330,0.0,1.0,2.0 87 | 10.9,,175,1.0,1.0,3.0 88 | ,,,0.0,0.0,2.0 89 | ,,,0.0,1.0,2.0 90 | 3.676,HD 197037,182.14,0.0,1.0,2.0 91 | ,,,0.0,1.0,2.0 92 | 253.00,WDS J03201-2851,358.11,0.0,3.0,2.0 93 | 95.8,,127,0.0,1.0,2.0 94 | 2.8560,HD 217786,170.34,0.0,1.0,2.0 95 | 113,,302,0.0,1.0,2.0 96 | 839.60,WDS J00293-0555,51,1.0,1.0,3.0 97 | 0.526,HD 2638 BC,166.7,0.0,1.0,2.0 98 | 83.70,40 Eri,102,1.0,1.0,3.0 99 | ,40 Eri BC,,0.0,0.0,2.0 100 | 13.8,,36,0.0,1.0,2.0 101 | 284,,305,0.0,2.0,2.0 102 | 192,,290,1.0,1.0,3.0 103 | ,,,0.0,0.0,2.0 104 | ,,,1.0,1.0,3.0 105 | ,,,0.0,0.0,2.0 106 | 49.000,HD 4113,350.30,1.0,1.0,3.0 107 | ,HD 4113 AC,,0.0,1.0,2.0 108 | 9.4,,308,0.0,1.0,2.0 109 | ,HD 59686,,0.0,1.0,2.0 110 | ,HD 7449,,0.0,2.0,2.0 111 | 21.5,,78,0.0,1.0,2.0 112 | 20.6,,269,0.0,1.0,2.0 113 | 0.308,HD 8673,333.4,0.0,1.0,2.0 114 | 63.0,,48,0.0,1.0,2.0 115 | 377,,262,0.0,1.0,2.0 116 | ,HU Aqr,,0.0,3.0,2.0 117 | 
,,,1.0,1.0,3.0 118 | ,,,0.0,1.0,2.0 119 | 0.730,K2-136,7.9,0.0,3.0,2.0 120 | 2.29,KELT-2,328.6,0.0,1.0,2.0 121 | 1.5,,,1.0,1.0,3.0 122 | 0.034,KELT-4BC,,0.0,0.0,2.0 123 | 1.05,Kepler-108,118.4,0.0,2.0,2.0 124 | 1.15,Kepler-13,281,1.0,1.0,3.0 125 | ,Kepler-13 BC,,0.0,0.0,2.0 126 | ,Kepler-16,,0.0,1.0,2.0 127 | 0.7739,Kepler-21,129.53,0.0,1.0,2.0 128 | 0.217,,217.3,0.0,5.0,2.0 129 | ,Kepler-34,,0.0,1.0,2.0 130 | ,Kepler-35,,0.0,1.0,2.0 131 | ,Kepler-38,,0.0,1.0,2.0 132 | 1.70,,35,0.0,1.0,2.0 133 | ,,,0.0,1.0,2.0 134 | ,Kepler-444,,1.0,5.0,3.0 135 | ,,,0.0,0.0,2.0 136 | 0.08,Kepler-449,68.4,0.0,2.0,2.0 137 | 0.9,Kepler-450,,0.0,3.0,2.0 138 | ,Kepler-47,,0.0,2.0,2.0 139 | 10.979,Kepler-68,145.43,0.0,3.0,2.0 140 | 0.4104,KIC 7177553,193.6,2.0,1.0,4.0 141 | ,KIC 7177553 A,,0.0,1.0,2.0 142 | ,KIC 7177553 B,,0.0,0.0,2.0 143 | ,KIC 9632895,,0.0,1.0,2.0 144 | ,KOI-1257,,0.0,1.0,2.0 145 | 0.8730,KOI-1299,20.86,0.0,2.0,2.0 146 | 2.89,KOI-2939,176.02,1.0,1.0,3.0 147 | ,KOI-2939 AB,,0.0,1.0,2.0 148 | ,,,0.0,2.0,2.0 149 | ,,,0.0,2.0,2.0 150 | ,nu Oct,,0.0,1.0,2.0 151 | ,,,0.0,1.0,2.0 152 | ,,,0.0,1.0,2.0 153 | 48,,,0.0,1.0,2.0 154 | 10.536,,,0.0,1.0,2.0 155 | 1.74,OGLE-2013-BLG-0723L,,0.0,1.0,2.0 156 | 2.96,,,0.0,1.0,2.0 157 | 0.7,PH-1,123,2.0,1.0,4.0 158 | ,PH-1 A,,0.0,1.0,2.0 159 | ,PH-1 B,,0.0,0.0,2.0 160 | 30.104,Psi-1 Draconis,15.49,1.0,1.0,3.0 161 | ,HD 162003,,0.0,0.0,2.0 162 | ,B1620-26,,0.0,1.0,2.0 163 | 0.475,Ross 458 AB,81.5,0.0,1.0,2.0 164 | 0.083,ROXs 42 B,157.9,0.0,1.0,2.0 165 | ,,,0.0,1.0,2.0 166 | 2.8,SR 12,96,0.0,1.0,2.0 167 | ,tau Boo,,0.0,1.0,2.0 168 | 1.90,tau Gem,177,0.0,1.0,2.0 169 | 1.1054,TrES-2,136.325,0.0,1.0,2.0 170 | 1.555,TrES-4,359.8,0.0,1.0,2.0 171 | 52,,150,0.0,4.0,2.0 172 | 0.1094,VHS 1256-1257,173.3,0.0,1.0,2.0 173 | 0.3425,WASP-11,214.09,0.0,1.0,2.0 174 | 1.064,,251.3,1.0,1.0,3.0 175 | 0.0843,,84,0.0,0.0,2.0 176 | ,,,0.0,1.0,2.0 177 | 1.4491,WASP-14,102.210,0.0,1.0,2.0 178 | 6.1,WASP-173,110.1,0.0,1.0,2.0 179 | 1587,WASP-1,1.953,0.0,1.0,2.0 180 
| 0.757,WASP-2,104.7,0.0,1.0,2.0 181 | 1.1910,WASP-3,87.070,0.0,1.0,2.0 182 | 3.3,WASP-70,167,0.0,1.0,2.0 183 | 3.30,,150,0.0,1.0,2.0 184 | 1.48,,99.6,0.0,1.0,2.0 185 | 4.5052,WASP-8,170.948,0.0,2.0,2.0 186 | 2700,,,0.0,2.0,2.0 187 | 31,,342,0.0,4.0,2.0 188 | -------------------------------------------------------------------------------- /ch_09/data/sample_roc_curves.csv: -------------------------------------------------------------------------------- 1 | x,y,label 2 | 0.0,0.0,good 3 | 0.01,0.21,good 4 | 0.02,0.25,good 5 | 0.03,0.29,good 6 | 0.04,0.4,good 7 | 0.05,0.47,good 8 | 0.06,0.51,good 9 | 0.07,0.52,good 10 | 0.08,0.55,good 11 | 0.09,0.56,good 12 | 0.1,0.6,good 13 | 0.12,0.61,good 14 | 0.13,0.61,good 15 | 0.15,0.69,good 16 | 0.16,0.71,good 17 | 0.17,0.73,good 18 | 0.21,0.8,good 19 | 0.22,0.81,good 20 | 0.24,0.85,good 21 | 0.25,0.87,good 22 | 0.27,0.89,good 23 | 0.3,0.89,good 24 | 0.37,0.92,good 25 | 0.41,0.94,good 26 | 0.45,0.96,good 27 | 0.47,0.96,good 28 | 0.5,0.96,good 29 | 0.56,0.98,good 30 | 0.6,0.98,good 31 | 0.66,0.98,good 32 | 0.7,0.98,good 33 | 0.75,0.98,good 34 | 0.9,1.0,good 35 | 1.0,1.0,good 36 | 0.0,0.0,better 37 | 0.01,0.26,better 38 | 0.02,0.44999999999999996,better 39 | 0.03,0.54,better 40 | 0.04,0.65,better 41 | 0.05,0.72,better 42 | 0.06,0.76,better 43 | 0.07,0.77,better 44 | 0.08,0.8,better 45 | 0.09,0.81,better 46 | 0.1,0.85,better 47 | 0.12,0.86,better 48 | 0.13,0.86,better 49 | 0.15,0.8899999999999999,better 50 | 0.16,0.9099999999999999,better 51 | 0.17,0.9299999999999999,better 52 | 0.21,0.95,better 53 | 0.22,0.95,better 54 | 0.24,0.98,better 55 | 0.25,0.98,better 56 | 0.27,0.98,better 57 | 0.3,0.98,better 58 | 0.37,0.98,better 59 | 0.41,0.99,better 60 | 0.45,0.99,better 61 | 0.47,1.0,better 62 | 0.5,1.0,better 63 | 0.56,1.0,better 64 | 0.6,1.0,better 65 | 0.66,1.0,better 66 | 0.7,1.0,better 67 | 0.75,1.0,better 68 | 1.0,1.0,better 69 | 1.0,1.0,better 70 | 0.0,0.0,best 71 | 0.01,0.31,best 72 | 0.02,0.75,best 73 | 0.03,0.9,best 74 | 
0.04,0.95,best 75 | 0.05,0.96,best 76 | 0.06,0.98,best 77 | 0.07,0.98,best 78 | 0.08,0.98,best 79 | 0.09,0.98,best 80 | 0.1,0.98,best 81 | 0.12,0.99,best 82 | 0.13,0.99,best 83 | 0.15,0.99,best 84 | 0.16,0.99,best 85 | 0.17,0.99,best 86 | 0.21,1.0,best 87 | 0.22,1.0,best 88 | 0.24,1.0,best 89 | 0.25,1.0,best 90 | 0.27,1.0,best 91 | 0.3,1.0,best 92 | 0.37,1.0,best 93 | 0.41,1.0,best 94 | 0.45,1.0,best 95 | 0.47,1.0,best 96 | 0.5,1.0,best 97 | 0.56,1.0,best 98 | 0.6,1.0,best 99 | 0.66,1.0,best 100 | 0.7,1.0,best 101 | 0.75,1.0,best 102 | 1.0,1.0,best 103 | 1.0,1.0,best 104 | -------------------------------------------------------------------------------- /ch_11/0-simulating_the_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simulating the data\n", 8 | "Before we go into some exploratory data analysis, let's see how we simulated the data:\n", 9 | "\n", 10 | "```\n", 11 | "# example of how to run this simulation from jupyter (remove the ! to run from the command line)\n", 12 | "!python simulate.py -s 0 --stealthy -l logs/jan_2018.csv -hl logs/hackers_jan_2018.csv 31 \"2018-01-01\" 0.01 0.5\n", 13 | "```\n", 14 | "\n", 15 | "| Month | Probability of attack in a given hour | Probability of trying entire userbase | Vary IP addresses? 
|\n", 16 | "| --- | --- | --- | --- |\n", 17 | "| Jan 2018 | 1.00% | 50% | Yes |\n", 18 | "| Feb 2018 | 0.50% | 25% | Yes |\n", 19 | "| Mar 2018 | 0.10% | 10% | Yes |\n", 20 | "| Apr 2018 | 1.00% | 65% | Yes |\n", 21 | "| May 2018 | 0.01% | 5% | Yes |\n", 22 | "| Jun 2018 | 0.05% | 5% | Yes |\n", 23 | "| Jul 2018 | 1.00% | 15% | Yes |\n", 24 | "| Aug 2018 | 0.50% | 10% | Yes |\n", 25 | "| Sep 2018 | 0.50% | 10% | No |\n", 26 | "| Oct 2018 | 0.20% | 12% | No |\n", 27 | "| Nov 2018 | 0.70% | 17% | Yes |\n", 28 | "| Dec 2018 | 8.00% | 88% | Yes |\n", 29 | "| Jan 2019 | 0.80% | 8% | Yes |\n", 30 | "| Feb 2019 | 1.00% | 18% | Yes |\n", 31 | "| Mar 2019 | 1.00% | 18% | Yes |\n", 32 | "\n", 33 | "We use pandas to combine the files by year. First, we create a utility function for concatenating the files:\n", 34 | "\n", 35 | "```\n", 36 | "import pandas as pd\n", 37 | "\n", 38 | "def cat_csvs(format_string_file_pattern, index_col, month_list):\n", 39 | " \"\"\"\n", 40 | " Utility function for concatenating CSV files from simulation.\n", 41 | " \n", 42 | " Parameters: \n", 43 | " - format_string_file_pattern: The pattern for the file name with `{}` in the place of the month\n", 44 | " - index_col: The column with the datetimes to sort on.\n", 45 | " - month_list: The list of the months as formatted in the file names.\n", 46 | " \n", 47 | " Returns:\n", 48 | " A concatenated pandas DataFrame\n", 49 | " \"\"\"\n", 50 | " return pd.concat([\n", 51 | " pd.read_csv(\n", 52 | " format_string_file_pattern.format(file), index_col=index_col, parse_dates=True\n", 53 | " ) for file in month_list\n", 54 | " ])\n", 55 | "```\n", 56 | "\n", 57 | "Next, we concatenate the 2018 logs making sure to not record any data from early January 1, 2019 which may have been generated from the Poisson process in December 2018:\n", 58 | "```\n", 59 | "logs_2018 = cat_csvs(\n", 60 | " 'logs/{}_2018.csv', 'datetime', \n", 61 | " ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 
'nov', 'dec']\n", 62 | ")\n", 63 | "logs_2018['2018'].sort_index().to_csv('logs/logs_2018.csv') # sometimes the simulation overshoots the end date\n", 64 | "```\n", 65 | "\n", 66 | "Now, we concatenate the 2019 logs remembering to add back the 2019 entries that got into the December 2018 simulation and clip the April 2019 entries from the March simulation:\n", 67 | "```\n", 68 | "logs_2019 = pd.concat([cat_csvs('logs/{}_2019.csv', 'datetime', ['jan', 'feb', 'mar']), logs_2018['2019']])\n", 69 | "logs_2019['2019-Q1'].to_csv('logs/logs_2019.csv') # sometimes the simulation overshoots the end date\n", 70 | "```\n", 71 | "\n", 72 | "After we have the login attempts logs, we concatenate the 2018 hacker logs:\n", 73 | "```\n", 74 | "hackers_2018 = cat_csvs(\n", 75 | " 'logs/hackers_{}_2018.csv', 'start', \n", 76 | " ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']\n", 77 | ")\n", 78 | "hackers_2018['2018'].sort_index().to_csv('logs/hackers_2018.csv')\n", 79 | "```\n", 80 | "\n", 81 | "Concatenating the 2019 hacker logs is the same process:\n", 82 | "```\n", 83 | "hackers_2019 = pd.concat([\n", 84 | " cat_csvs('logs/hackers_{}_2019.csv', 'start', ['jan', 'feb', 'mar']), hackers_2018['2019']\n", 85 | "])\n", 86 | "hackers_2019['2019-Q1'].sort_index().to_csv('logs/hackers_2019.csv')\n", 87 | "```\n", 88 | "\n", 89 | "The process of building the CSV files from the individual simulations is contained in `merge_logs.py` and the entire process is in the bash script `run_simulations.sh`. You don't have to run either of these." 
90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "# Create SQLite Database" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 1, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "import sqlite3\n", 106 | "import numpy as np\n", 107 | "import pandas as pd\n", 108 | "\n", 109 | "# read in files\n", 110 | "logs_2018 = pd.read_csv('logs/logs_2018.csv', index_col='datetime')\n", 111 | "logs_2019 = pd.read_csv('logs/logs_2019.csv', index_col='datetime')\n", 112 | "hackers_2018 = pd.read_csv('logs/hackers_2018.csv', index_col='start')\n", 113 | "hackers_2019 = pd.read_csv('logs/hackers_2019.csv', index_col='start')\n", 114 | "\n", 115 | "# write to database\n", 116 | "with sqlite3.connect('logs/logs.db') as conn:\n", 117 | " logs_2018.to_sql('logs', conn, if_exists='replace')\n", 118 | " logs_2019.to_sql('logs', conn, if_exists='append')\n", 119 | " hackers_2018.to_sql('attacks', conn, if_exists='replace')\n", 120 | " hackers_2019.to_sql('attacks', conn, if_exists='append')" 121 | ] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.7.2" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /ch_11/logs/hackers_2018.csv: -------------------------------------------------------------------------------- 1 | start,end,source_ip 2 | 2018-01-05 06:03:42.470259,2018-01-05 06:03:51.470259,170.9.4.108 3 | 2018-01-11 03:08:43.284085,2018-01-11 03:09:14.284085,27.255.30.3 4 | 2018-01-17 
00:41:43.985324,2018-01-17 00:45:56.985324,226.98.192.152 5 | 2018-01-21 10:34:57.842776,2018-01-21 10:38:01.842776,102.178.107.171 6 | 2018-01-21 23:12:10.852725,2018-01-21 23:12:38.852725,48.172.61.152 7 | 2018-01-23 20:56:23.219809,2018-01-23 20:58:40.219809,62.209.68.197 8 | 2018-01-26 05:56:35.872139,2018-01-26 06:00:42.872139,27.193.94.129 9 | 2018-02-12 17:33:22.974691,2018-02-12 17:35:53.974691,93.253.75.244 10 | 2018-02-18 06:54:30.130457,2018-02-18 06:58:47.130457,51.221.142.249 11 | 2018-02-19 01:36:20.192653,2018-02-19 01:37:31.192653,139.251.218.76 12 | 2018-02-23 19:56:41.865629,2018-02-23 19:57:59.865629,84.4.101.68 13 | 2018-04-02 01:55:20.503176,2018-04-02 01:59:28.503176,131.59.202.195 14 | 2018-04-03 00:16:23.223652,2018-04-03 00:20:27.223652,13.94.42.10 15 | 2018-07-03 22:52:58.170083,2018-07-03 22:56:34.170083,74.178.145.82 16 | 2018-07-19 22:25:03.009372,2018-07-19 22:29:12.009372,6.199.47.120 17 | 2018-07-19 23:50:18.364552,2018-07-19 23:53:10.364552,112.214.73.179 18 | 2018-07-28 13:34:34.220996,2018-07-28 13:36:23.220996,229.204.248.216 19 | 2018-07-31 03:30:46.577582,2018-07-31 03:34:57.577582,142.85.18.115 20 | 2018-08-03 02:28:00.788461,2018-08-03 02:29:37.788461,172.5.153.120 21 | 2018-08-04 10:21:32.855163,2018-08-04 10:23:32.855163,208.224.58.84 22 | 2018-08-05 07:47:05.202707,2018-08-05 07:47:13.202707,59.115.153.240 23 | 2018-08-26 19:48:07.338588,2018-08-26 19:49:02.338588,152.34.193.165 24 | 2018-09-09 11:04:19.470385,2018-09-09 11:08:27.470385,78.174.30.56 25 | 2018-09-11 18:57:00.687504,2018-09-11 18:58:12.687504,17.4.47.84 26 | 2018-09-12 20:52:34.155677,2018-09-12 20:55:28.155677,173.207.252.26 27 | 2018-09-17 15:30:25.969556,2018-09-17 15:33:32.969556,22.210.104.44 28 | 2018-09-18 17:46:09.932924,2018-09-18 17:48:38.932924,174.147.116.255 29 | 2018-09-19 11:08:32.108147,2018-09-19 11:11:07.108147,138.101.91.226 30 | 2018-09-21 19:31:50.003252,2018-09-21 19:35:21.003252,215.189.60.53 31 | 2018-09-24 23:26:31.709160,2018-09-24 
23:30:34.709160,228.144.254.255 32 | 2018-10-06 00:04:11.688537,2018-10-06 00:06:08.688537,12.85.219.94 33 | 2018-11-01 04:12:34.982693,2018-11-01 04:16:19.982693,104.8.35.137 34 | 2018-11-07 01:50:33.596719,2018-11-07 01:51:16.596719,183.16.40.217 35 | 2018-11-11 14:21:01.413492,2018-11-11 14:25:06.413492,107.210.163.30 36 | 2018-11-15 10:06:25.095924,2018-11-15 10:09:03.095924,105.178.119.27 37 | 2018-11-20 18:50:37.325372,2018-11-20 18:54:49.325372,118.189.202.82 38 | 2018-11-23 16:26:28.482645,2018-11-23 16:28:14.482645,88.199.57.22 39 | 2018-11-24 04:34:58.053877,2018-11-24 04:35:43.053877,206.121.220.195 40 | 2018-12-01 00:06:51.879131,2018-12-01 00:07:06.879131,168.10.158.149 41 | 2018-12-01 06:43:34.778738,2018-12-01 06:45:41.778738,137.174.91.123 42 | 2018-12-01 12:29:34.979806,2018-12-01 12:33:48.979806,218.114.210.223 43 | 2018-12-01 18:19:18.507327,2018-12-01 18:23:31.507327,118.29.144.220 44 | 2018-12-02 13:22:42.567875,2018-12-02 13:26:55.567875,29.101.15.78 45 | 2018-12-02 20:05:21.387524,2018-12-02 20:09:29.387524,121.120.155.251 46 | 2018-12-02 21:09:01.894714,2018-12-02 21:13:04.894714,232.222.234.177 47 | 2018-12-02 22:40:49.319515,2018-12-02 22:45:01.319515,123.182.42.106 48 | 2018-12-03 01:28:16.456126,2018-12-03 01:32:24.456126,118.93.53.14 49 | 2018-12-03 04:59:47.275187,2018-12-03 05:02:37.275187,151.246.57.5 50 | 2018-12-03 16:52:01.590703,2018-12-03 16:56:17.590703,211.229.145.233 51 | 2018-12-04 00:20:53.519253,2018-12-04 00:25:04.519253,111.202.163.183 52 | 2018-12-04 05:21:35.664250,2018-12-04 05:25:43.664250,105.35.130.3 53 | 2018-12-04 22:17:16.477064,2018-12-04 22:21:26.477064,8.245.180.35 54 | 2018-12-06 06:49:29.023733,2018-12-06 06:53:35.023733,135.248.127.1 55 | 2018-12-06 07:39:45.474778,2018-12-06 07:43:53.474778,14.230.220.30 56 | 2018-12-07 05:06:07.981164,2018-12-07 05:10:22.981164,191.157.174.232 57 | 2018-12-07 07:01:52.126580,2018-12-07 07:03:26.126580,58.205.207.0 58 | 2018-12-09 08:15:30.388657,2018-12-09 
08:19:38.388657,215.47.10.84 59 | 2018-12-10 00:54:42.208933,2018-12-10 00:55:12.208933,100.228.159.98 60 | 2018-12-10 04:10:01.520520,2018-12-10 04:12:23.520520,209.19.127.19 61 | 2018-12-10 07:19:15.373502,2018-12-10 07:23:30.373502,154.165.28.184 62 | 2018-12-10 10:16:13.278728,2018-12-10 10:20:32.278728,211.164.183.50 63 | 2018-12-10 19:56:52.369200,2018-12-10 20:01:00.369200,43.222.72.24 64 | 2018-12-11 14:54:49.328939,2018-12-11 14:59:07.328939,36.117.206.37 65 | 2018-12-12 04:50:14.055379,2018-12-12 04:54:26.055379,59.12.7.192 66 | 2018-12-12 10:15:23.076522,2018-12-12 10:19:31.076522,2.190.1.26 67 | 2018-12-12 13:20:16.800584,2018-12-12 13:24:29.800584,15.26.194.139 68 | 2018-12-12 14:13:06.789965,2018-12-12 14:17:17.789965,210.54.123.17 69 | 2018-12-13 05:54:40.470621,2018-12-13 05:58:59.470621,145.195.179.188 70 | 2018-12-13 14:52:35.667099,2018-12-13 14:56:34.667099,203.31.153.203 71 | 2018-12-13 17:51:29.542459,2018-12-13 17:55:47.542459,233.179.120.157 72 | 2018-12-14 09:46:15.839045,2018-12-14 09:50:25.839045,206.47.210.131 73 | 2018-12-14 10:39:20.772384,2018-12-14 10:43:24.772384,230.109.220.92 74 | 2018-12-14 13:08:01.549749,2018-12-14 13:12:12.549749,24.137.19.206 75 | 2018-12-14 16:25:18.228474,2018-12-14 16:29:24.228474,32.105.154.210 76 | 2018-12-15 03:59:23.509947,2018-12-15 04:03:32.509947,12.101.52.83 77 | 2018-12-15 09:54:52.168127,2018-12-15 09:59:11.168127,211.99.79.128 78 | 2018-12-16 00:27:27.587855,2018-12-16 00:31:35.587855,26.121.6.174 79 | 2018-12-16 16:42:04.730609,2018-12-16 16:46:12.730609,234.78.210.179 80 | 2018-12-16 19:21:33.557561,2018-12-16 19:25:46.557561,8.114.151.106 81 | 2018-12-18 10:36:03.439159,2018-12-18 10:40:14.439159,36.61.121.85 82 | 2018-12-19 20:58:39.747942,2018-12-19 21:02:47.747942,86.18.45.118 83 | 2018-12-19 21:44:18.269827,2018-12-19 21:48:30.269827,65.209.187.180 84 | 2018-12-20 00:43:20.356847,2018-12-20 00:47:28.356847,113.26.222.48 85 | 2018-12-20 06:07:46.630802,2018-12-20 
06:11:52.630802,230.26.154.133 86 | 2018-12-20 12:26:56.839296,2018-12-20 12:31:05.839296,171.147.124.35 87 | 2018-12-21 00:31:19.644809,2018-12-21 00:35:29.644809,197.71.159.62 88 | 2018-12-21 22:22:38.012453,2018-12-21 22:26:43.012453,121.115.24.225 89 | 2018-12-22 09:45:01.755742,2018-12-22 09:49:15.755742,69.130.109.64 90 | 2018-12-22 17:02:03.504117,2018-12-22 17:06:15.504117,211.219.25.119 91 | 2018-12-23 08:29:59.530428,2018-12-23 08:34:12.530428,40.48.69.140 92 | 2018-12-23 20:43:58.797374,2018-12-23 20:48:03.797374,49.79.153.96 93 | 2018-12-24 04:45:53.184929,2018-12-24 04:50:06.184929,166.198.17.114 94 | 2018-12-24 07:26:30.035556,2018-12-24 07:30:35.035556,43.180.136.184 95 | 2018-12-24 13:23:21.875545,2018-12-24 13:27:30.875545,180.246.142.130 96 | 2018-12-24 14:19:14.642045,2018-12-24 14:23:29.642045,148.170.131.157 97 | 2018-12-24 22:35:00.889513,2018-12-24 22:39:12.889513,32.148.94.35 98 | 2018-12-25 01:47:21.899698,2018-12-25 01:51:31.899698,4.7.249.142 99 | 2018-12-25 16:15:31.808751,2018-12-25 16:17:54.808751,19.101.185.116 100 | 2018-12-26 07:54:59.939399,2018-12-26 07:59:10.939399,113.25.122.147 101 | 2018-12-26 12:09:23.860732,2018-12-26 12:13:32.860732,19.68.134.150 102 | 2018-12-26 18:31:22.538764,2018-12-26 18:35:32.538764,41.152.93.178 103 | 2018-12-26 23:27:24.500651,2018-12-26 23:31:36.500651,76.26.233.66 104 | 2018-12-27 05:47:48.433162,2018-12-27 05:51:58.433162,237.151.209.164 105 | 2018-12-27 11:26:20.357926,2018-12-27 11:30:28.357926,150.82.255.148 106 | 2018-12-28 04:25:17.262554,2018-12-28 04:29:32.262554,239.87.171.169 107 | 2018-12-28 18:00:07.773171,2018-12-28 18:04:21.773171,86.29.47.60 108 | 2018-12-29 03:27:22.432176,2018-12-29 03:31:35.432176,86.94.179.28 109 | 2018-12-29 15:39:10.376311,2018-12-29 15:39:14.376311,45.208.66.139 110 | 2018-12-29 19:52:45.347044,2018-12-29 19:56:55.347044,191.136.131.14 111 | 2018-12-31 08:09:03.009937,2018-12-31 08:13:17.009937,162.197.35.101 112 | 2018-12-31 15:41:57.004462,2018-12-31 
15:46:03.004462,138.177.154.55 113 | -------------------------------------------------------------------------------- /ch_11/logs/hackers_2019.csv: -------------------------------------------------------------------------------- 1 | start,end,source_ip 2 | 2019-01-01 12:14:53.238041,2019-01-01 12:16:03.238041,152.133.44.190 3 | 2019-01-09 19:43:31.272840,2019-01-09 19:45:54.272840,140.121.49.84 4 | 2019-01-10 02:01:29.175645,2019-01-10 02:05:41.175645,141.58.118.74 5 | 2019-01-17 19:09:13.638984,2019-01-17 19:10:12.638984,87.98.217.249 6 | 2019-01-18 07:55:38.736720,2019-01-18 07:55:54.736720,78.218.146.197 7 | 2019-01-25 14:26:46.575983,2019-01-25 14:29:11.575983,153.125.66.175 8 | 2019-01-28 08:46:43.924377,2019-01-28 08:50:44.924377,117.44.11.122 9 | 2019-02-02 04:50:49.424024,2019-02-02 04:54:54.424024,12.48.219.98 10 | 2019-02-02 22:43:48.927638,2019-02-02 22:48:06.927638,218.251.192.57 11 | 2019-02-11 04:49:19.277079,2019-02-11 04:49:19.277079,210.159.230.101 12 | 2019-02-27 18:27:56.987467,2019-02-27 18:32:06.987467,124.83.133.90 13 | 2019-02-28 00:23:46.072334,2019-02-28 00:27:53.072334,175.25.26.95 14 | 2019-03-01 07:15:03.608034,2019-03-01 07:19:10.608034,134.199.232.226 15 | 2019-03-15 19:19:01.250667,2019-03-15 19:23:07.250667,75.138.25.164 16 | 2019-03-16 07:40:57.416222,2019-03-16 07:44:57.416222,38.26.34.112 17 | 2019-03-21 17:20:17.142681,2019-03-21 17:23:52.142681,1.185.125.159 18 | 2019-03-22 07:16:36.708436,2019-03-22 07:17:39.708436,239.101.53.203 19 | 2019-03-27 09:33:02.261349,2019-03-27 09:35:43.261349,152.138.125.171 20 | -------------------------------------------------------------------------------- /ch_11/logs/logs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stefmolin/Hands-On-Data-Analysis-with-Pandas/868f5b52f2519cea4ce214598f9b1541c168bdde/ch_11/logs/logs.db -------------------------------------------------------------------------------- /ch_11/merge_logs.py: 
import pandas as pd

# Month abbreviations exactly as they appear in the per-month simulation file names.
MONTHS_2018 = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
MONTHS_2019 = ['jan', 'feb', 'mar']


def cat_csvs(format_string_file_pattern, index_col, month_list):
    """
    Utility function for concatenating CSV files from simulation.

    Parameters:
        - format_string_file_pattern: The pattern for the file name with `{}` in the place of the month
        - index_col: The column with the datetimes to sort on.
        - month_list: The list of the months as formatted in the file names.

    Returns:
        A concatenated pandas DataFrame (rows stay in file order; nothing is sorted here)
    """
    return pd.concat([
        pd.read_csv(
            format_string_file_pattern.format(file), index_col=index_col, parse_dates=True
        ) for file in month_list
    ])


def rows_in_period(df, year, quarter=None):
    """
    Select the rows of a datetime-indexed DataFrame that fall in a given year (and quarter).

    This replaces string row selection such as `df['2018']` / `df['2019-Q1']`, which
    `DataFrame.__getitem__` no longer supports as of pandas 2.0, and `df.get('2019')`,
    which is a *column* lookup and therefore returns `None` on current pandas.

    Parameters:
        - df: DataFrame whose index is a DatetimeIndex (as produced by `cat_csvs`).
        - year: The calendar year to keep.
        - quarter: Optional quarter (1-4) to additionally restrict to.

    Returns:
        The matching rows in their original order; an empty DataFrame when nothing matches.
    """
    in_period = df.index.year == year
    if quarter is not None:
        in_period = in_period & (df.index.quarter == quarter)
    return df[in_period]


def main():
    """Merge the per-month simulation outputs in `logs/` into one file per year."""
    logs_2018 = cat_csvs('logs/{}_2018.csv', 'datetime', MONTHS_2018)
    # sometimes the simulation overshoots the end date
    rows_in_period(logs_2018, 2018).sort_index().to_csv('logs/logs_2018.csv')

    # add back the 2019 entries that the December 2018 simulation spilled over into
    logs_2019 = pd.concat([
        cat_csvs('logs/{}_2019.csv', 'datetime', MONTHS_2019),
        rows_in_period(logs_2018, 2019),
    ])
    # sometimes the simulation overshoots the end date
    # NOTE(review): unlike the other three outputs this one is written without
    # sort_index(); kept as-is so the generated file does not change -- confirm intent.
    rows_in_period(logs_2019, 2019, quarter=1).to_csv('logs/logs_2019.csv')

    hackers_2018 = cat_csvs('logs/hackers_{}_2018.csv', 'start', MONTHS_2018)
    rows_in_period(hackers_2018, 2018).sort_index().to_csv('logs/hackers_2018.csv')

    hackers_2019 = pd.concat([
        cat_csvs('logs/hackers_{}_2019.csv', 'start', MONTHS_2019),
        rows_in_period(hackers_2018, 2019),
    ])
    rows_in_period(hackers_2019, 2019, quarter=1).sort_index().to_csv('logs/hackers_2019.csv')

    print('All done!')


if __name__ == '__main__':
    main()
LOGS="logs" 4 | 5 | # make a directory for our logs 6 | if ! [ -d "$LOGS" ]; then 7 | mkdir "$LOGS" 8 | fi 9 | 10 | # run the simulations 11 | echo 'Simulating January 2018...' 12 | python simulate.py -s 1 --stealthy -l "$LOGS"/jan_2018.csv -hl "$LOGS"/hackers_jan_2018.csv 31 "2018-01-01" 0.01 0.5 13 | 14 | printf '\nSimulating February 2018...\n' 15 | python simulate.py -s 2 --stealthy -l "$LOGS"/feb_2018.csv -hl "$LOGS"/hackers_feb_2018.csv 28 "2018-02-01" 0.005 0.25 16 | 17 | printf '\nSimulating March 2018...\n' 18 | python simulate.py -s 3 --stealthy -l "$LOGS"/mar_2018.csv -hl "$LOGS"/hackers_mar_2018.csv 31 "2018-03-01" 0.001 0.10 19 | 20 | printf '\nSimulating April 2018...\n' 21 | python simulate.py -s 4 --stealthy -l "$LOGS"/apr_2018.csv -hl "$LOGS"/hackers_apr_2018.csv 30 "2018-04-01" 0.01 0.65 22 | 23 | printf '\nSimulating May 2018...\n' 24 | python simulate.py -s 5 --stealthy -l "$LOGS"/may_2018.csv -hl "$LOGS"/hackers_may_2018.csv 31 "2018-05-01" 0.0001 0.05 25 | 26 | printf '\nSimulating June 2018...\n' 27 | python simulate.py -s 6 --stealthy -l "$LOGS"/jun_2018.csv -hl "$LOGS"/hackers_jun_2018.csv 30 "2018-06-01" 0.0005 0.05 28 | 29 | printf '\nSimulating July 2018...\n' 30 | python simulate.py -s 7 --stealthy -l "$LOGS"/jul_2018.csv -hl "$LOGS"/hackers_jul_2018.csv 31 "2018-07-01" 0.01 0.15 31 | 32 | printf '\nSimulating August 2018...\n' 33 | python simulate.py -s 8 --stealthy -l "$LOGS"/aug_2018.csv -hl "$LOGS"/hackers_aug_2018.csv 31 "2018-08-01" 0.005 0.1 34 | 35 | printf '\nSimulating September 2018...\n' 36 | python simulate.py -s 9 -l "$LOGS"/sep_2018.csv -hl "$LOGS"/hackers_sep_2018.csv 30 "2018-09-01" 0.005 0.1 37 | 38 | printf '\nSimulating October 2018...\n' 39 | python simulate.py -s 10 -l "$LOGS"/oct_2018.csv -hl "$LOGS"/hackers_oct_2018.csv 31 "2018-10-01" 0.002 0.12 40 | 41 | printf '\nSimulating November 2018...\n' 42 | python simulate.py -s 11 --stealthy -l "$LOGS"/nov_2018.csv -hl "$LOGS"/hackers_nov_2018.csv 30 "2018-11-01" 
0.007 0.17 43 | 44 | printf '\nSimulating December 2018...\n' 45 | python simulate.py -s 12 --stealthy -l "$LOGS"/dec_2018.csv -hl "$LOGS"/hackers_dec_2018.csv 31 "2018-12-01" 0.08 0.88 46 | 47 | printf '\nSimulating January 2019...\n' 48 | python simulate.py -s 13 --stealthy -l "$LOGS"/jan_2019.csv -hl "$LOGS"/hackers_jan_2019.csv 31 "2019-01-01" 0.008 0.08 49 | 50 | printf '\nSimulating February 2019...\n' 51 | python simulate.py -s 14 --stealthy -l "$LOGS"/feb_2019.csv -hl "$LOGS"/hackers_feb_2019.csv 28 "2019-02-01" 0.01 0.18 52 | 53 | printf '\nSimulating March 2019...\n' 54 | python simulate.py -s 15 --stealthy -l "$LOGS"/mar_2019.csv -hl "$LOGS"/hackers_mar_2019.csv 31 "2019-03-01" 0.01 0.18 55 | 56 | # combine the files 57 | echo 'Merging files...' 58 | python merge_logs.py 59 | 60 | # remove unnecessary files 61 | echo 'Cleaning up...' 62 | cd "$LOGS" 63 | echo "$(ls)" | grep -E "(^[a-z]{3}_{1})|(hackers_[a-z]{3})" | xargs rm 64 | cd .. 65 | 66 | echo 'Success!' 67 | -------------------------------------------------------------------------------- /ch_11/simulate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime as dt 3 | import os 4 | import logging 5 | import random 6 | 7 | import login_attempt_simulator as sim 8 | 9 | # Logging configuration 10 | FORMAT = '[%(levelname)s] [ %(name)s ] %(message)s' 11 | logging.basicConfig(level=logging.INFO, format=FORMAT) 12 | logger = logging.getLogger(os.path.basename(__file__)) 13 | 14 | def get_simulation_file_path(path_provided, directory, default_file): 15 | """Get the path to the file creating the directory and using the default if necessary.""" 16 | if path_provided: 17 | file = path_provided 18 | else: 19 | if not os.path.exists(directory): 20 | os.mkdir(directory) 21 | file = os.path.join(directory, default_file) 22 | return file 23 | 24 | def get_user_base_file_path(path_provided, default_file): 25 | """Get the path for a user_data 
directory file.""" 26 | return get_simulation_file_path(path_provided, 'user_data', default_file) 27 | 28 | def get_log_file_path(path_provided, default_file): 29 | """Get the path for a logs directory file.""" 30 | return get_simulation_file_path(path_provided, 'logs', default_file) 31 | 32 | if __name__ == '__main__': 33 | # command line argument parsing 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument( 36 | "days", type=float, 37 | help="number of days to simulate from start" 38 | ) 39 | parser.add_argument( 40 | "start_date", type=str, 41 | help="datetime to start in the form 'YYYY-MM-DD' or 'YYYY-MM-DD-HH'" 42 | ) 43 | parser.add_argument( 44 | "-s", "--seed", type=int, help="set a seed for reproducibility" 45 | ) 46 | parser.add_argument( 47 | "attack_prob", type=float, 48 | help="probability of attack in a given hour" 49 | ) 50 | parser.add_argument( 51 | "try_all_users_prob", type=float, 52 | help="probability attacker tries to guess credentials for all usernames" 53 | ) 54 | parser.add_argument( 55 | "-st", "--stealthy", action='store_true', help="be stealthy? 
(vary IP addresses?)" 56 | ) 57 | parser.add_argument( 58 | "-m", "--make", action='store_true', help="make userbase" 59 | ) 60 | parser.add_argument( 61 | "-u", "--userbase", help="file to write the userbase to" 62 | ) 63 | parser.add_argument( 64 | "-i", "--ip", help="file to write the user-ip address map to" 65 | ) 66 | parser.add_argument( 67 | "-l", "--log", help="file to write the attempt log to" 68 | ) 69 | parser.add_argument( 70 | "-hl", "--hacklog", help="file to write the hack log to" 71 | ) 72 | args = parser.parse_args() 73 | user_ip_mapping_file = get_user_base_file_path(args.ip, 'user_ips.json') 74 | 75 | if args.make: 76 | logger.warning('Creating new user base and mapping IP addresses to them.') 77 | 78 | user_base_file = get_user_base_file_path(args.userbase, 'user_base.txt') 79 | 80 | # seed the creation of userbase 81 | random.seed(args.seed) 82 | 83 | # create usernames and write to file 84 | sim.utils.make_userbase(user_base_file) 85 | 86 | # create one or more IP addresses per user and save mapping to file 87 | valid_users = sim.utils.get_valid_users(user_base_file) 88 | sim.utils.save_user_ips( 89 | sim.utils.assign_ip_addresses(valid_users), user_ip_mapping_file 90 | ) 91 | 92 | try: 93 | start = dt.datetime(*map(int, args.start_date.split('-'))) 94 | except TypeError: 95 | logger.error('Start date must be in the format "YYYY-MM-DD"') 96 | raise 97 | except ValueError: 98 | logger.warning( 99 | f'Could not interpret {args.start_date}, ' 100 | 'using January 1, 2019 at 12AM as start instead' 101 | ) 102 | start = dt.datetime(2019, 1, 1) 103 | 104 | 105 | end = start + dt.timedelta(days=args.days) 106 | 107 | try: 108 | logger.info(f'Simulating {args.days} days...') 109 | simulator = sim.LoginAttemptSimulator( 110 | user_ip_mapping_file, start, end, seed=args.seed 111 | ) 112 | simulator.simulate( 113 | attack_prob=args.attack_prob, 114 | try_all_users_prob=args.try_all_users_prob, 115 | vary_ips=args.stealthy 116 | ) 117 | 118 | # save logs 
119 | logger.info('Saving logs') 120 | simulator.save_hack_log(get_log_file_path(args.hacklog, 'attacks.csv')) 121 | simulator.save_log(get_log_file_path(args.log, 'log.csv')) 122 | 123 | logger.info('All done!') 124 | except: 125 | logger.error('Oops! Something went wrong...') 126 | -------------------------------------------------------------------------------- /ch_11/user_data/user_base.txt: -------------------------------------------------------------------------------- 1 | asmith 2 | ajones 3 | akim 4 | alopez 5 | abrown 6 | bsmith 7 | bjones 8 | bkim 9 | blopez 10 | bbrown 11 | csmith 12 | cjones 13 | ckim 14 | clopez 15 | cbrown 16 | dsmith 17 | djones 18 | dkim 19 | dlopez 20 | dbrown 21 | esmith 22 | ejones 23 | ekim 24 | elopez 25 | ebrown 26 | fsmith 27 | fjones 28 | fkim 29 | flopez 30 | fbrown 31 | gsmith 32 | gjones 33 | gkim 34 | glopez 35 | gbrown 36 | hsmith 37 | hjones 38 | hkim 39 | hlopez 40 | hbrown 41 | ismith 42 | ijones 43 | ikim 44 | ilopez 45 | ibrown 46 | jsmith 47 | jjones 48 | jkim 49 | jlopez 50 | jbrown 51 | ksmith 52 | kjones 53 | kkim 54 | klopez 55 | kbrown 56 | lsmith 57 | ljones 58 | lkim 59 | llopez 60 | lbrown 61 | msmith 62 | mjones 63 | mkim 64 | mlopez 65 | mbrown 66 | nsmith 67 | njones 68 | nkim 69 | nlopez 70 | nbrown 71 | osmith 72 | ojones 73 | okim 74 | olopez 75 | obrown 76 | psmith 77 | pjones 78 | pkim 79 | plopez 80 | pbrown 81 | qsmith 82 | qjones 83 | qkim 84 | qlopez 85 | qbrown 86 | rsmith 87 | rjones 88 | rkim 89 | rlopez 90 | rbrown 91 | ssmith 92 | sjones 93 | skim 94 | slopez 95 | sbrown 96 | tsmith 97 | tjones 98 | tkim 99 | tlopez 100 | tbrown 101 | usmith 102 | ujones 103 | ukim 104 | ulopez 105 | ubrown 106 | vsmith 107 | vjones 108 | vkim 109 | vlopez 110 | vbrown 111 | wsmith 112 | wjones 113 | wkim 114 | wlopez 115 | wbrown 116 | xsmith 117 | xjones 118 | xkim 119 | xlopez 120 | xbrown 121 | ysmith 122 | yjones 123 | ykim 124 | ylopez 125 | ybrown 126 | zsmith 127 | zjones 128 | zkim 129 | zlopez 130 
| zbrown 131 | admin 132 | master 133 | dba 134 | -------------------------------------------------------------------------------- /ch_11/user_data/user_ips.json: -------------------------------------------------------------------------------- 1 | {"asmith": ["6.252.142.27"], "ajones": ["173.50.12.181"], "akim": ["98.43.141.103", "220.40.22.86", "124.178.25.98"], "alopez": ["218.160.255.160", "19.161.178.228", "62.253.28.155"], "abrown": ["237.189.239.120"], "bsmith": ["83.113.51.172", "1.218.123.66"], "bjones": ["139.17.174.247", "227.187.186.96"], "bkim": ["12.1.71.46", "5.91.167.118", "234.179.140.103"], "blopez": ["224.156.38.13", "7.145.44.102"], "bbrown": ["165.149.191.111", "203.116.114.41", "219.243.101.100"], "csmith": ["219.34.152.98"], "cjones": ["16.250.160.227", "158.239.189.2"], "ckim": ["204.205.71.171"], "clopez": ["100.208.255.37", "81.153.10.191"], "cbrown": ["24.112.17.125"], "dsmith": ["6.132.27.197"], "djones": ["223.178.55.3"], "dkim": ["32.62.195.210", "145.135.160.219", "88.97.1.201"], "dlopez": ["81.142.166.142"], "dbrown": ["82.151.213.13"], "esmith": ["157.153.26.116", "49.47.127.7", "102.63.17.136"], "ejones": ["35.25.197.235"], "ekim": ["93.10.102.171", "4.40.46.47"], "elopez": ["116.147.197.72", "47.113.213.68", "178.117.47.173"], "ebrown": ["44.23.94.53", "95.170.61.237"], "fsmith": ["25.52.89.61", "21.79.236.54", "109.126.45.254"], "fjones": ["156.123.42.2", "108.234.173.69"], "fkim": ["196.59.227.183", "169.48.40.197"], "flopez": ["235.138.253.203"], "fbrown": ["16.118.156.50"], "gsmith": ["52.196.239.10"], "gjones": ["220.178.30.239"], "gkim": ["207.134.95.170", "124.78.101.138"], "glopez": ["116.115.247.65", "32.45.175.69", "167.7.157.50"], "gbrown": ["34.70.31.26", "60.87.41.200"], "hsmith": ["194.16.87.189", "150.105.190.166", "197.130.196.192"], "hjones": ["77.228.157.126", "103.6.163.47"], "hkim": ["222.242.117.220"], "hlopez": ["25.246.225.197"], "hbrown": ["121.208.14.158", "234.193.77.22"], "ismith": ["235.42.240.18", 
"168.249.146.54"], "ijones": ["92.90.216.98"], "ikim": ["18.170.45.213", "38.215.94.52"], "ilopez": ["125.182.32.22", "237.25.177.213", "237.149.204.131"], "ibrown": ["171.243.99.246", "149.210.206.193"], "jsmith": ["2.147.74.216"], "jjones": ["195.14.154.50", "202.169.215.170"], "jkim": ["45.195.73.202", "142.163.253.58", "88.89.16.210"], "jlopez": ["196.139.212.129", "219.231.94.92"], "jbrown": ["29.74.37.92"], "ksmith": ["67.144.9.42"], "kjones": ["96.251.147.183"], "kkim": ["156.125.97.32"], "klopez": ["37.0.160.74"], "kbrown": ["156.71.60.56"], "lsmith": ["121.0.179.182"], "ljones": ["187.211.58.58", "140.159.149.19", "105.118.5.129"], "lkim": ["180.225.253.52", "108.94.111.193"], "llopez": ["236.71.190.208"], "lbrown": ["142.117.171.37"], "msmith": ["110.216.28.51"], "mjones": ["163.70.126.33", "214.202.65.75", "163.18.185.107"], "mkim": ["90.49.69.190"], "mlopez": ["172.112.56.41", "52.204.133.50", "39.208.143.86"], "mbrown": ["164.253.117.48", "181.136.173.32"], "nsmith": ["169.248.242.6", "86.134.40.87", "119.137.148.47"], "njones": ["29.82.101.154", "168.143.138.162", "139.140.155.139"], "nkim": ["215.156.73.91", "173.251.190.189", "59.143.160.204"], "nlopez": ["120.99.79.161", "182.42.181.138"], "nbrown": ["142.6.232.17"], "osmith": ["37.76.79.196"], "ojones": ["107.142.221.216", "125.15.117.120", "114.94.122.0"], "okim": ["144.196.236.37", "188.178.79.238"], "olopez": ["84.63.187.234", "27.140.145.181"], "obrown": ["230.231.48.120", "216.231.167.224"], "psmith": ["133.93.124.180", "199.209.181.250"], "pjones": ["59.57.184.103", "207.37.53.232"], "pkim": ["204.182.96.183", "150.64.165.159"], "plopez": ["156.154.161.197", "21.27.218.73", "210.235.225.68"], "pbrown": ["174.108.17.132", "87.92.168.152"], "qsmith": ["179.88.247.35"], "qjones": ["126.228.43.201"], "qkim": ["227.55.249.250"], "qlopez": ["132.124.26.73"], "qbrown": ["122.220.83.23"], "rsmith": ["103.150.207.72"], "rjones": ["237.129.153.201"], "rkim": ["129.116.213.203", "68.150.183.142"], 
"rlopez": ["65.170.52.77"], "rbrown": ["99.10.254.223", "100.54.46.85"], "ssmith": ["113.97.58.143", "186.89.187.107"], "sjones": ["218.187.113.154", "145.7.65.90", "141.111.232.255"], "skim": ["114.134.47.17", "227.118.165.93"], "slopez": ["172.0.92.175"], "sbrown": ["197.253.160.32", "126.124.56.18"], "tsmith": ["58.49.140.185", "29.70.157.24"], "tjones": ["12.165.104.67"], "tkim": ["101.74.49.187", "11.76.99.35"], "tlopez": ["168.123.156.81"], "tbrown": ["9.233.172.254"], "usmith": ["134.237.148.156"], "ujones": ["206.3.217.210", "90.55.81.7"], "ukim": ["98.227.162.125", "169.29.57.78"], "ulopez": ["142.106.44.88", "158.143.215.210", "5.140.68.6"], "ubrown": ["195.152.80.113", "193.98.19.38", "206.126.112.26"], "vsmith": ["198.236.51.210", "93.36.180.212", "141.175.29.185"], "vjones": ["128.251.20.76"], "vkim": ["71.202.159.171"], "vlopez": ["15.249.62.144"], "vbrown": ["88.121.135.247"], "wsmith": ["59.243.45.6", "142.76.134.181"], "wjones": ["233.31.221.213", "118.4.128.126"], "wkim": ["105.29.8.22", "189.77.78.5"], "wlopez": ["51.185.92.40", "106.225.19.157"], "wbrown": ["208.101.11.88"], "xsmith": ["43.212.185.23"], "xjones": ["169.194.51.116", "82.41.241.235", "26.26.183.131"], "xkim": ["54.57.131.131", "26.32.56.22", "91.244.227.100"], "xlopez": ["101.44.104.162", "155.56.67.107"], "xbrown": ["30.67.241.95"], "ysmith": ["27.95.79.35"], "yjones": ["158.11.94.79"], "ykim": ["52.93.173.144", "200.56.97.220", "209.70.184.211"], "ylopez": ["180.78.246.173", "190.171.127.35", "142.7.151.142"], "ybrown": ["131.139.241.249", "25.12.71.150", "110.235.65.232"], "zsmith": ["226.227.77.216"], "zjones": ["112.33.54.152"], "zkim": ["222.178.53.26"], "zlopez": ["162.160.195.60"], "zbrown": ["230.71.97.153"], "admin": ["140.116.30.11", "41.253.247.255", "22.45.110.229"], "master": ["160.78.166.50", "158.68.77.78"], "dba": ["171.114.66.123"]} -------------------------------------------------------------------------------- /ch_12/README.md: 
-------------------------------------------------------------------------------- 1 | # Additional Resources 2 | 3 | ## Finding Data 4 | Both `seaborn` and `sklearn` provide built-in sample datasets that you can experiment with. Check out the documentation for more information. 5 | 6 | ### Searching for Data 7 | The following are a few places you can search for data on a variety of topics: 8 | - [DataHub](https://datahub.io/search) 9 | - [Google Dataset Search](https://toolbox.google.com/datasetsearch) 10 | - [Open data on Amazon Web Services](https://registry.opendata.aws/) 11 | - [OpenML](https://www.openml.org) 12 | - [SNAP library of datasets collected by Stanford University](https://snap.stanford.edu/data/index.html) 13 | - [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/index.php) 14 | 15 | ### APIs 16 | - [Facebook API](https://developers.facebook.com/docs/graph-api) 17 | - [NOAA Climate data API](https://www.ncdc.noaa.gov/cdo-web/webservices/v2) 18 | - [NYTimes API](https://developer.nytimes.com/) 19 | - [Open Weather Map API](https://openweathermap.org/api) 20 | - [Twitter API](https://developer.twitter.com/en/docs.html) 21 | - [USGS Earthquake API](https://earthquake.usgs.gov/fdsnws/event/1/) 22 | 23 | ### Websites by Topic 24 | This section contains selected data resources across various topics, which can be accessed through a website. Obtaining the data for an analysis may be as simple as downloading a CSV file or may require parsing HTML with pandas. If you must resort to scraping the page (make sure you have tried the ways we discussed in this book first), be sure that you aren't violating the terms of use of the website. 
25 | 26 | #### Finance 27 | In addition to the `pandas_datareader` and `stock_analysis` packages we discussed in chapter 7, consult the following: 28 | - [Google finance](https://www.google.com/finance) 29 | - [NASDAQ historical stock prices](https://www.nasdaq.com/quotes/historical-quotes.aspx) 30 | - [Quandl](https://www.quandl.com) 31 | - [Yahoo! finance](https://finance.yahoo.com) 32 | 33 | #### Government data 34 | - [European Union open data](http://data.europa.eu/euodp/en/data) 35 | - [NASA](https://data.nasa.gov/) 36 | - [NYC data](https://opendata.cityofnewyork.us/data/) 37 | - [UK government data](https://data.gov.uk/) 38 | - [UN data](http://data.un.org/) 39 | - [US census data](https://census.gov/data.html) 40 | - [US government data](https://www.data.gov/) 41 | 42 | #### Health and economy 43 | - [Gapminder](https://www.gapminder.org/data/) 44 | - [Health data](https://healthdata.gov/search/type/dataset) 45 | - [World Health Organization](https://www.who.int/gho/en/) 46 | 47 | #### Social networks 48 | For those interested in text-based data or graph data, check out the following resources on social networks: 49 | - [List of Twitter data resources](https://github.com/shaypal5/awesome-twitter-data) 50 | - [Social network data](https://snap.stanford.edu/data/ego-Facebook.html) 51 | 52 | #### Sports 53 | - [Baseball database (practice working with a DB)](http://www.seanlahman.com/baseball-archive/statistics/) 54 | - [Baseball player statistics](https://www.baseball-reference.com/players/) 55 | - [Basketball player statistics](https://www.basketball-reference.com/players/) 56 | - [Football (American) player statistics](https://www.pro-football-reference.com/players/) 57 | - [Football (soccer) statistics](https://www.whoscored.com/Statistics) 58 | - [Hockey player statistics](https://www.hockey-reference.com/players/) 59 | 60 | #### Miscellaneous 61 | The following resources vary in topic, but be sure to check these out if nothing so far has piqued your 
interest: 62 | - [Amazon reviews data](https://snap.stanford.edu/data/web-Amazon.html) 63 | - [Data extracted from Wikipedia](https://wiki.dbpedia.org/develop/datasets) 64 | - [Google Trends](https://trends.google.com/trends/) 65 | - [Movies from MovieLens](https://grouplens.org/datasets/movielens/) 66 | - [Yahoo Webscope (reference library of datasets)](https://webscope.sandbox.yahoo.com/) 67 | 68 | ## Practice working with data 69 | - [Datacamp](https://www.datacamp.com/) 70 | - [Kaggle](https://www.kaggle.com/) 71 | 72 | ## Python practice 73 | - [HackerRank](https://www.hackerrank.com) 74 | - [CodeWars](https://www.codewars.com) 75 | - [LeetCode](https://www.leetcode.com) 76 | - [CodinGame](https://www.codingame.com) 77 | - [Python Morsels](https://www.pythonmorsels.com/) 78 | - [Pramp](https://www.pramp.com) 79 | - [Khan Academy](https://www.khanacademy.org/) 80 | - [LinkedIn Learning](https://www.linkedin.com/learning/) 81 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: book_env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - imbalanced-learn=0.4.3 7 | - jupyterlab=0.35.6 8 | - matplotlib=3.0.3 9 | - numpy=1.16.3 10 | - pandas=0.23.4 11 | - pandas-datareader=0.7.0 12 | - requests=2.21.0 13 | - pip 14 | - python-graphviz=0.10.1 15 | - scikit-learn=0.20.3 16 | - scipy=1.2.1 17 | - seaborn=0.9.0 18 | - sqlalchemy=1.3.3 19 | - statsmodels=0.9.0 20 | - pip: 21 | - git+https://github.com/stefmolin/login-attempt-simulator.git@pandas_book 22 | - git+https://github.com/stefmolin/ml-utils.git@pandas_book 23 | - git+https://github.com/stefmolin/stock-analysis.git@pandas_book 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz==0.10.1 2 | 
imbalanced-learn==0.4.3 3 | jupyterlab==0.35.6 4 | matplotlib==3.0.3 5 | numpy==1.16.3 6 | pandas==0.23.4 7 | pandas-datareader==0.7.0 8 | requests==2.21.0 9 | scikit-learn==0.20.3 10 | scipy==1.2.1 11 | seaborn==0.9.0 12 | sqlalchemy==1.3.3 13 | statsmodels==0.9.0 14 | git+https://github.com/stefmolin/login-attempt-simulator.git@pandas_book 15 | git+https://github.com/stefmolin/ml-utils.git@pandas_book 16 | git+https://github.com/stefmolin/stock-analysis.git@pandas_book 17 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.7 2 | -------------------------------------------------------------------------------- /solutions/ch_08/dec_2018_attacks.csv: -------------------------------------------------------------------------------- 1 | start,end,source_ip 2 | 2018-12-01 06:09:25.126252,2018-12-01 06:13:32.126252,87.106.250.46 3 | 2018-12-02 17:08:51.659516,2018-12-02 17:13:06.659516,124.100.154.251 4 | 2018-12-02 22:31:34.932660,2018-12-02 22:35:47.932660,162.193.160.52 5 | 2018-12-03 03:41:24.236627,2018-12-03 03:41:58.236627,3.187.98.222 6 | 2018-12-04 04:51:10.436412,2018-12-04 04:55:18.436412,12.203.107.201 7 | 2018-12-04 06:33:49.212236,2018-12-04 06:38:03.212236,189.129.85.67 8 | 2018-12-05 08:34:28.448176,2018-12-05 08:38:43.448176,130.0.243.199 9 | 2018-12-06 13:19:19.870073,2018-12-06 13:22:45.870073,119.224.29.115 10 | 2018-12-07 23:09:16.749674,2018-12-07 23:10:50.749674,81.169.193.182 11 | 2018-12-10 05:49:18.805774,2018-12-10 05:53:23.805774,184.99.196.253 12 | 2018-12-10 10:17:56.201382,2018-12-10 10:22:09.201382,96.187.45.99 13 | 2018-12-11 20:42:42.135463,2018-12-11 20:44:42.135463,232.159.100.161 14 | 2018-12-12 07:57:40.912757,2018-12-12 07:57:44.912757,214.199.76.60 15 | 2018-12-13 03:02:37.937394,2018-12-13 03:06:55.937394,185.2.99.58 16 | 2018-12-13 17:52:32.076269,2018-12-13 17:56:43.076269,153.60.249.72 17 | 
2018-12-14 01:02:33.845786,2018-12-14 01:06:38.845786,122.197.157.59 18 | 2018-12-14 09:33:16.032817,2018-12-14 09:37:25.032817,121.147.201.245 19 | 2018-12-14 10:15:50.057193,2018-12-14 10:17:42.057193,115.94.212.74 20 | 2018-12-18 12:47:32.171941,2018-12-18 12:49:11.171941,121.90.233.205 21 | 2018-12-23 00:23:46.442783,2018-12-23 00:27:59.442783,48.85.35.228 22 | 2018-12-23 03:05:14.626905,2018-12-23 03:09:26.626905,62.38.152.194 23 | 2018-12-23 18:21:52.780180,2018-12-23 18:26:05.780180,8.46.128.35 24 | 2018-12-24 23:00:04.367951,2018-12-24 23:02:04.367951,97.32.235.17 25 | 2018-12-25 10:58:42.476592,2018-12-25 11:02:54.476592,113.162.208.193 26 | 2018-12-25 11:01:04.947973,2018-12-25 11:05:11.947973,49.76.49.155 27 | 2018-12-26 10:53:43.622708,2018-12-26 10:57:55.622708,25.215.139.208 28 | 2018-12-26 19:57:11.911521,2018-12-26 20:01:20.911521,196.229.137.227 29 | 2018-12-28 12:18:55.920246,2018-12-28 12:22:54.920246,209.190.162.180 30 | 2018-12-29 18:46:09.159686,2018-12-29 18:50:22.159686,78.6.154.160 31 | 2018-12-29 22:03:32.201530,2018-12-29 22:07:39.201530,211.41.206.218 32 | 2018-12-30 07:58:56.299054,2018-12-30 08:03:04.299054,151.19.160.7 33 | 2018-12-30 10:24:32.325926,2018-12-30 10:25:50.325926,42.36.247.249 34 | 2018-12-30 18:02:10.330542,2018-12-30 18:06:22.330542,31.166.161.10 35 | 2018-12-30 20:53:05.198396,2018-12-30 20:57:15.198396,216.47.145.212 36 | 2018-12-31 15:25:24.965666,2018-12-31 15:28:30.965666,220.140.156.223 37 | -------------------------------------------------------------------------------- /solutions/ch_11/exercise_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unsupervised anomaly detection with One-Class SVM\n", 8 | "\n", 9 | "## Setup" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 
20 | "
\n", 21 | "\n", 34 | "\n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | "
source_ipusernamesuccessfailure_reason
datetime
2018-01-01 00:06:19.353126223.178.55.3djones1None
2018-01-01 00:09:07.147971223.178.55.3djones1None
2018-01-01 01:08:08.6100416.252.142.27asmith1None
2018-01-01 02:37:50.329298124.178.25.98akim1None
2018-01-01 02:45:20.38208098.43.141.103akim1None
\n", 89 | "
" 90 | ], 91 | "text/plain": [ 92 | " source_ip username success failure_reason\n", 93 | "datetime \n", 94 | "2018-01-01 00:06:19.353126 223.178.55.3 djones 1 None\n", 95 | "2018-01-01 00:09:07.147971 223.178.55.3 djones 1 None\n", 96 | "2018-01-01 01:08:08.610041 6.252.142.27 asmith 1 None\n", 97 | "2018-01-01 02:37:50.329298 124.178.25.98 akim 1 None\n", 98 | "2018-01-01 02:45:20.382080 98.43.141.103 akim 1 None" 99 | ] 100 | }, 101 | "execution_count": 1, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "import numpy as np\n", 108 | "import pandas as pd\n", 109 | "\n", 110 | "import sqlite3\n", 111 | "\n", 112 | "with sqlite3.connect('../../ch_11/logs/logs.db') as conn:\n", 113 | " logs_2018 = pd.read_sql(\n", 114 | " \"\"\"\n", 115 | " SELECT * \n", 116 | " FROM logs \n", 117 | " WHERE datetime BETWEEN \"2018-01-01\" AND \"2019-01-01\";\n", 118 | " \"\"\", \n", 119 | " conn, parse_dates=['datetime'], index_col='datetime'\n", 120 | " )\n", 121 | "logs_2018.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 2, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "def get_X(log, day):\n", 131 | " \"\"\"\n", 132 | " Get data we can use for the X\n", 133 | " \n", 134 | " Parameters:\n", 135 | " - log: The logs dataframe\n", 136 | " - day: A day or single value we can use as a datetime index slice\n", 137 | " \n", 138 | " Returns: \n", 139 | " A pandas DataFrame\n", 140 | " \"\"\"\n", 141 | " return pd.get_dummies(log[day].assign(\n", 142 | " failures=lambda x: 1 - x.success\n", 143 | " ).query('failures > 0').resample('1min').agg(\n", 144 | " {'username':'nunique', 'failures': 'sum'}\n", 145 | " ).dropna().rename(\n", 146 | " columns={'username':'usernames_with_failures'}\n", 147 | " ).assign(\n", 148 | " day_of_week=lambda x: x.index.dayofweek, \n", 149 | " hour=lambda x: x.index.hour\n", 150 | " ).drop(columns=['failures']), columns=['day_of_week', 'hour'])" 151 | ] 
152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 3, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "Index(['usernames_with_failures', 'day_of_week_0', 'day_of_week_1',\n", 162 | " 'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',\n", 163 | " 'day_of_week_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',\n", 164 | " 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',\n", 165 | " 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',\n", 166 | " 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],\n", 167 | " dtype='object')" 168 | ] 169 | }, 170 | "execution_count": 3, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "X = get_X(logs_2018, '2018-01')\n", 177 | "X.columns" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## One-class SVM" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 4, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\preprocessing\\data.py:645: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n", 197 | " return self.partial_fit(X, y)\n", 198 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\base.py:464: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n", 199 | " return self.fit(X, **fit_params).transform(X)\n", 200 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\svm\\classes.py:1177: DeprecationWarning: The random_state parameter is deprecated and will be removed in version 0.22.\n", 201 | " \" be removed in version 0.22.\", DeprecationWarning)\n" 202 | ] 203 | } 204 | ], 205 | 
"source": [ 206 | "from sklearn.svm import OneClassSVM\n", 207 | "from sklearn.pipeline import Pipeline\n", 208 | "from sklearn.preprocessing import StandardScaler\n", 209 | "\n", 210 | "one_class_svm_pipeline = Pipeline([\n", 211 | " ('scale', StandardScaler()),\n", 212 | " ('svm', OneClassSVM(random_state=0))\n", 213 | "]).fit(X)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 5, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stderr", 223 | "output_type": "stream", 224 | "text": [ 225 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\pipeline.py:331: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n", 226 | " Xt = transform.transform(Xt)\n" 227 | ] 228 | }, 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "outlier 22823\n", 233 | "inlier 18794\n", 234 | "dtype: int64" 235 | ] 236 | }, 237 | "execution_count": 5, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "preds = one_class_svm_pipeline.predict(X)\n", 244 | "pd.Series(np.where(preds == -1, 'outlier', 'inlier')).value_counts()" 245 | ] 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "Python 3", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.7.2" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 2 269 | } 270 | -------------------------------------------------------------------------------- /solutions/ch_11/exercise_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | 
"source": [ 7 | "# Finding Outliers with k-Means\n", 8 | "\n", 9 | "## Setup" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "
\n", 21 | "\n", 34 | "\n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | "
source_ipusernamesuccessfailure_reason
datetime
2018-01-01 00:06:19.353126223.178.55.3djones1None
2018-01-01 00:09:07.147971223.178.55.3djones1None
2018-01-01 01:08:08.6100416.252.142.27asmith1None
2018-01-01 02:37:50.329298124.178.25.98akim1None
2018-01-01 02:45:20.38208098.43.141.103akim1None
\n", 89 | "
" 90 | ], 91 | "text/plain": [ 92 | " source_ip username success failure_reason\n", 93 | "datetime \n", 94 | "2018-01-01 00:06:19.353126 223.178.55.3 djones 1 None\n", 95 | "2018-01-01 00:09:07.147971 223.178.55.3 djones 1 None\n", 96 | "2018-01-01 01:08:08.610041 6.252.142.27 asmith 1 None\n", 97 | "2018-01-01 02:37:50.329298 124.178.25.98 akim 1 None\n", 98 | "2018-01-01 02:45:20.382080 98.43.141.103 akim 1 None" 99 | ] 100 | }, 101 | "execution_count": 1, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "import numpy as np\n", 108 | "import pandas as pd\n", 109 | "\n", 110 | "import sqlite3\n", 111 | "\n", 112 | "with sqlite3.connect('../../ch_11/logs/logs.db') as conn:\n", 113 | " logs_2018 = pd.read_sql(\n", 114 | " \"\"\"\n", 115 | " SELECT * \n", 116 | " FROM logs \n", 117 | " WHERE datetime BETWEEN \"2018-01-01\" AND \"2019-01-01\";\n", 118 | " \"\"\", \n", 119 | " conn, parse_dates=['datetime'], index_col='datetime'\n", 120 | " )\n", 121 | "logs_2018.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 2, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "def get_X(log, day):\n", 131 | " \"\"\"\n", 132 | " Get data we can use for the X\n", 133 | " \n", 134 | " Parameters:\n", 135 | " - log: The logs dataframe\n", 136 | " - day: A day or single value we can use as a datetime index slice\n", 137 | " \n", 138 | " Returns: \n", 139 | " A pandas DataFrame\n", 140 | " \"\"\"\n", 141 | " return pd.get_dummies(log[day].assign(\n", 142 | " failures=lambda x: 1 - x.success\n", 143 | " ).query('failures > 0').resample('1min').agg(\n", 144 | " {'username':'nunique', 'failures': 'sum'}\n", 145 | " ).dropna().rename(\n", 146 | " columns={'username':'usernames_with_failures'}\n", 147 | " ).assign(\n", 148 | " day_of_week=lambda x: x.index.dayofweek, \n", 149 | " hour=lambda x: x.index.hour\n", 150 | " ).drop(columns=['failures']), columns=['day_of_week', 'hour'])" 151 | ] 
152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 3, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "Index(['usernames_with_failures', 'day_of_week_0', 'day_of_week_1',\n", 162 | " 'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',\n", 163 | " 'day_of_week_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4',\n", 164 | " 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11',\n", 165 | " 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17',\n", 166 | " 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],\n", 167 | " dtype='object')" 168 | ] 169 | }, 170 | "execution_count": 3, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "X = get_X(logs_2018, '2018')\n", 177 | "X.columns" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## k-Means\n", 185 | "Since we want a \"normal\" activity cluster and an \"anomaly\" cluster, we need to make 2 clusters." 
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 4, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stderr", 195 | "output_type": "stream", 196 | "text": [ 197 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\preprocessing\\data.py:645: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n", 198 | " return self.partial_fit(X, y)\n", 199 | "c:\\users\\molinstefanie\\packt\\venv\\lib\\site-packages\\sklearn\\base.py:464: DataConversionWarning: Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.\n", 200 | " return self.fit(X, **fit_params).transform(X)\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "from sklearn.cluster import KMeans\n", 206 | "from sklearn.pipeline import Pipeline\n", 207 | "from sklearn.preprocessing import StandardScaler\n", 208 | "\n", 209 | "kmeans_pipeline = Pipeline([\n", 210 | " ('scale', StandardScaler()),\n", 211 | " ('kmeans', KMeans(random_state=0, n_clusters=2))\n", 212 | "]).fit(X)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "The cluster label doesn't mean anything to us, but we can examine the size of each cluster. We don't expect the clusters to be of equal size because anomalous activity doesn't happen as often as normal activity (we presume)." 
def get_y(datetimes, hackers, resolution='1min'):
    """
    Get data we can use for the y (whether or not a hacker attempted a log in during that time).

    Parameters:
        - datetimes: The datetimes to check for hackers (a pandas Series).
        - hackers: The dataframe indicating when the attacks started and stopped.
                   Its `start_floor` and `end_ceil` columns bound each attack window.
        - resolution: The granularity of the datetime. Default is 1 minute.

    Returns:
        A pandas Series of booleans (same index as `datetimes`); True where the
        datetime falls on a `resolution` step inside any attack window.
        All False when `hackers` has no rows.
    """
    # Pair the two columns directly rather than using a row-wise DataFrame.apply(),
    # so a `hackers` frame with no rows simply yields no ranges.
    date_ranges = [
        pd.date_range(start, end, freq=resolution).to_series()
        for start, end in zip(hackers.start_floor, hackers.end_ceil)
    ]

    if not date_ranges:
        # No attacks in the period: nothing can match (pd.concat rejects an empty list).
        return datetimes.isin([])

    # Concatenate once instead of growing a Series inside a loop.
    return datetimes.isin(pd.concat(date_ranges))
321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 9, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "0.8395916262911648" 332 | ] 333 | }, 334 | "execution_count": 9, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "from sklearn.metrics import fowlkes_mallows_score\n", 341 | "\n", 342 | "fowlkes_mallows_score(is_hacker, preds)" 343 | ] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 3", 349 | "language": "python", 350 | "name": "python3" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 3 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython3", 362 | "version": "3.7.2" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 2 367 | } 368 | --------------------------------------------------------------------------------