├── .dvc ├── .gitignore └── config ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── Dockerfile_cd4ml_setup ├── LICENSE ├── README.md ├── Strata-london-2019-slides.pdf ├── data └── README.md ├── deploy.sh ├── instructions ├── 1-setup.md ├── 2-deployment-pipeline.md ├── 3-machine-learning-pipeline.md ├── 4-tracking-experiments.md ├── 5-model-monitoring.md └── images │ ├── 1-open-terminal.png │ ├── 1-sample-app.png │ ├── 3-app-pipeline.png │ ├── 3-ml-pipeline.png │ ├── 4-mlflow-setup.png │ ├── 4-mlflow.png │ ├── 5-fluentd-setup.png │ ├── 5-kibana.png │ └── gear.png ├── jupyter_notebooks ├── Exploratory_Analysis.ipynb ├── Feature_Engineering.ipynb ├── Negative_Sales.ipynb └── README.md ├── kubernetes └── web.yml ├── requirements.txt ├── results └── metrics.json ├── run_decisiontree_pipeline.sh ├── setup-git.sh ├── src ├── __init__.py ├── app.py ├── decision_tree.py ├── download_data.py ├── evaluation.py ├── splitter.py ├── tracking.py └── webapp │ ├── static │ └── index.js │ └── templates │ └── index.html ├── start.bat ├── start.sh ├── test ├── app_test.py ├── evaluation_test.py ├── splitter_test.py └── test.py └── undeploy.sh /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /state 2 | /lock 3 | /config.local 4 | /updater 5 | /updater.lock 6 | /state-journal 7 | /state-wal 8 | /cache 9 | /tmp 10 | -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- 1 | [core] 2 | remote = default 3 | ['remote "default"'] 4 | url = gs://continuous-intelligence 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/** 2 | data/** 3 | data/*.csv 4 | test/__pycache__/ 5 | *.pyc 6 | **/.cache/* 7 | .idea 8 | .vscode 9 | .ipynb_checkpoints 10 | **/env/** 11 | .pytest_cache 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. 4 | 5 | We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality. 6 | 7 | Examples of unacceptable behavior by participants include: 8 | 9 | * The use of sexualized language or imagery 10 | * Personal attacks 11 | * Trolling or insulting/derogatory comments 12 | * Public or private harassment 13 | * Publishing other's private information, such as physical or electronic addresses, without explicit permission 14 | * Other unethical or unprofessional conduct 15 | 16 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. 
By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team. 17 | 18 | This code of conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers. 21 | 22 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.2.0, available at https://www.contributor-covenant.org/version/1/2/0/code-of-conduct.html 23 | 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM eu.gcr.io/continuous-intelligence/cd4ml-workshop:latest 2 | 3 | USER root 4 | 5 | RUN mkdir -p /app/continuous-intelligence/{src,data} 6 | 7 | COPY start.sh /app/continuous-intelligence 8 | COPY src /app/continuous-intelligence/src 9 | COPY data/decision_tree /app/continuous-intelligence/data/decision_tree 10 | 11 | RUN chmod +x /app/continuous-intelligence/start.sh 12 | 13 | EXPOSE 5005 14 | 15 | CMD ["/app/continuous-intelligence/start.sh"] 16 | -------------------------------------------------------------------------------- /Dockerfile_cd4ml_setup: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.7.12-alpine 2 | 3 | USER root 4 | 5 | # Always use the local requirements.txt to override the cloned one 6 | COPY requirements.txt /requirements.txt 7 | 8 | ENV PATH=$PATH:/opt/conda/bin 9 | 10 | RUN mkdir -p /app/continuous-intelligence \ 11 | && apk --no-cache add git nano bash \ 12 | && git clone https://github.com/ThoughtWorksInc/continuous-intelligence-workshop.git /app/continuous-intelligence \ 13 | && mv /requirements.txt /app/continuous-intelligence/requirements.txt \ 14 | && cd /app/continuous-intelligence \ 15 | && mkdir -p /app/continuous-intelligence/data/raw \ 16 | && pip install --no-cache-dir --no-compile -r requirements.txt \ 17 | && conda list && conda clean -tipy \ 18 | && python /app/continuous-intelligence/src/download_data.py \ 19 | && python /app/continuous-intelligence/src/download_data.py --model 20 | 21 | CMD ["/app/continuous-intelligence/start.sh"] 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018-2019 ThoughtWorks Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Continuous Intelligence and CD4ML Workshop
2 | 
3 | *NOTE: We are archiving this repository, as it has not been maintained or updated recently.
4 | We will keep it read-only for anyone interested in forking and evolving it independently.*
5 | 
6 | This workshop contains the sample application and machine learning code used for
7 | the Continuous Delivery for Machine Learning (CD4ML) and Continuous Intelligence
8 | workshop. This material has been developed and is continuously evolved by
9 | [ThoughtWorks](https://www.thoughtworks.com/open-source) and has been presented at
10 | conferences such as Yottabyte 2018, World AI Summit 2018, Strata London 2019,
11 | and others.
12 | 
13 | ## Prerequisites
14 | 
15 | To run this workshop, you will need:
16 | 
17 | * A valid GitHub account
18 | * A working Docker setup (if running on Windows, make sure to use Linux containers)
19 | 
20 | ## Workshop Instructions
21 | 
22 | The workshop is divided into several steps, which build on top of each other.
23 | Instructions for each exercise can be found under the
24 | [`instructions`](./instructions) folder.
25 | 
26 | *WARNING: the exercises build on top of each other, so you cannot skip ahead
27 | without completing the earlier steps.*
28 | 
29 | *WARNING 2: the workshop requires infrastructure that we only provision when
30 | needed, so you won't be able to run on your own the exercises that depend on
31 | that shared infrastructure. We are working on a setup that allows running the
32 | workshop locally, but that is still work in progress.*
33 | 
34 | ## The Machine Learning Problem
35 | 
36 | We built a simplified solution to a Kaggle problem posted by Corporación Favorita,
37 | a large Ecuador-based grocery retailer interested in improving its
38 | [Sales Forecasting](https://www.kaggle.com/c/favorita-grocery-sales-forecasting/overview)
39 | using data. For the purposes of this workshop, we have combined and simplified
40 | their data sets, as our goal is not to find the best predictions, but to
41 | demonstrate how to implement CD4ML.
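If you want a quick feel for the data before starting the exercises, the sketch below loads the simplified dataset and prints its shape and first rows. It assumes you have already fetched the raw file with `python src/download_data.py` (the same script the ML pipeline uses in exercise 3), which places it at `data/raw/store47-2016.csv`; the exact column names are an assumption based on the exploratory notebooks.

```python
import pandas as pd

# Minimal sketch: inspect the simplified Favorita sales extract.
# Assumes `python src/download_data.py` has already downloaded the file.
sales = pd.read_csv("data/raw/store47-2016.csv")

print(sales.shape)   # number of (store, item, day) records and columns
print(sales.head())  # expect columns like date, item_nbr, unit_sales, onpromotion
print(sales["unit_sales"].describe())  # unit_sales is the value the model forecasts
```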
42 | 43 | ## Collaborators 44 | 45 | The material, ideas, and content developed for this workshop were contributions 46 | from (in alphabetical order): 47 | 48 | * [Arif Wider](https://github.com/arifwider) 49 | * [Arun Manivannan](https://github.com/arunma) 50 | * [Christoph Windheuser](https://github.com/ciwin) 51 | * [Danilo Sato](https://github.com/dtsato) 52 | * [Danni Yu](https://github.com/danniyu) 53 | * [David Tan](https://github.com/davified) 54 | * [Emily Grasmeder](https://github.com/emilyagras) 55 | * [Emily Gorcenski](https://github.com/Gorcenski) 56 | * [Jin Yang](https://github.com/yytina) 57 | * [Jonathan Heng](https://github.com/jonheng) 58 | -------------------------------------------------------------------------------- /Strata-london-2019-slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/Strata-london-2019-slides.pdf -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This is where data files will be downloaded to on your local machines 2 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -xe 3 | 4 | IMAGE_VERSION=${GO_PIPELINE_LABEL:-latest} 5 | PROJECT_ID=${GCLOUD_PROJECT_ID:-continuous-intelligence} 6 | TENANT_NAMESPACE=${TENANT:-admin} 7 | FLUENTD_HOST=${FLUENTD_HOST:-""} 8 | FLUENTD_PORT=${FLUENTD_PORT:-""} 9 | echo "Deploying image version: $IMAGE_VERSION" 10 | 11 | cat kubernetes/web.yml \ 12 | | sed "s/\\\$tenant\\\$/$TENANT_NAMESPACE/" \ 13 | | sed "s/\\\$fluentd_host\\\$/$FLUENTD_HOST/" \ 14 | | sed "s/\\\$fluentd_port\\\$/$FLUENTD_PORT/" \ 15 | | sed "s/\(image: \).*$/\1eu.gcr.io\/$PROJECT_ID\/ci-workshop-app:$TENANT_NAMESPACE.$IMAGE_VERSION/" \ 16 | | kubectl apply -f - 17 | 18 | echo "Access your application at: http://$TENANT_NAMESPACE.app.cd4ml.net" 19 | -------------------------------------------------------------------------------- /instructions/1-setup.md: -------------------------------------------------------------------------------- 1 | # Exercise 1: Development Environment Setup 2 | 3 | ## Goals 4 | 5 | * Fork your copy of the repository in Github 6 | * Login to your development environment in Jupyterlab 7 | * Configure Git 8 | 9 | ## Step by Step instructions 10 | 11 | We have provisioned all the infrastructure required for the workshop. Each 12 | participant is assigned a numeric ID (from 1 to 100), which will be used 13 | throughout the workshop. 14 | 15 | 1. Visit the main repository at https://github.com/ThoughtWorksInc/cd4ml-workshop 16 | and **fork it** to your personal GitHub account. **Don't clone the main 17 | repository**. 18 | 19 | 2. Create a personal access token in GitHub: 20 | 21 | * Log in to Github 22 | * Open your [Personal Access Tokens](https://github.com/settings/tokens) 23 | settings (*Profile → Settings → Developer Settings → Personal 24 | Access Tokens*) 25 | * Click "Generate new token", choose a name and give it **repo** rights 26 | * Copy the access token value 27 | 28 | 3. Go to Jupyter Lab at https://jupyterhub.cd4ml.net and login with the username 29 | and password provided. 30 | 31 | 4. 
Open a terminal by clicking on the icon: 32 | 33 | ![Open terminal](./images/1-open-terminal.png) 34 | 35 | 5. Setup Git by running the following commands and answering the questions with 36 | your details: 37 | 38 | ```bash 39 | cd cd4ml-workshop 40 | ./setup-git.sh 41 | ``` 42 | 43 | 6. To test and see the application running in production, open a browser tab, go 44 | to http://userX.app.cd4ml.net (replace `X` with your user ID), and you should 45 | see the application like: 46 | 47 | ![Sample application](./images/1-sample-app.png) 48 | 49 | 7. Done! Go to [the next exercise](./2-deployment-pipeline.md) 50 | -------------------------------------------------------------------------------- /instructions/2-deployment-pipeline.md: -------------------------------------------------------------------------------- 1 | # Exercise 2: Configure your Deployment Pipeline 2 | 3 | ## Goals 4 | 5 | * Learn about [GoCD](https://www.gocd.org/) 6 | * Configure a [Deployment Pipeline](https://martinfowler.com/bliki/DeploymentPipeline.html) 7 | to build and deploy your application to production 8 | * Test the application running in production 9 | 10 | ## Step by Step instructions 11 | 12 | 1. Go to GoCD at http://gocd.cd4ml.net and login with the username and password 13 | provided. 14 | 15 | 2. Click on the little gear symbol (![gear](./images/gear.png)) next to 16 | `ci-workshop-app-X` to edit your deployment pipeline configuration. 17 | 18 | 3. Go to the *"Materials"* tab and edit the existing GitHub URL so that it 19 | points to your forked repository URL - probably just replacing `ThoughtWorksInc` 20 | with your GitHub username. 21 | 22 | 4. Save and go back to the [Dashboard](http://gocd.cd4ml.net) page 23 | 24 | 5. Make a small change to your forked code, e.g., change the 25 | `src/webapp/templates/index.html` file, then add, commit, and push your changes 26 | to see your project being built and deployed in GoCD: 27 | ```bash 28 | git add . 29 | git commit -m"Sample change" 30 | git push 31 | ``` 32 | 33 | 6. Once the pipeline succeeds, you can access your application's URL at 34 | http://userX.app.cd4ml.net (replace `X` with your user ID) 35 | 36 | 7. Done! Go to [the next exercise](./3-machine-learning-pipeline.md) 37 | -------------------------------------------------------------------------------- /instructions/3-machine-learning-pipeline.md: -------------------------------------------------------------------------------- 1 | # Exercise 3: Create your Machine Learning Pipeline 2 | 3 | ## Goals 4 | 5 | * Learn about [DVC](https://dvc.org/) 6 | * Configure a Machine Learning Pipeline with DVC to fetch raw data and train a 7 | ML model 8 | * Create a pipeline in GoCD to automate your ML training pipeline 9 | * Add automated tests to evaluate and govern your ML models 10 | * Combine both GoCD pipelines to promote and deploy the new model to production 11 | 12 | ## Step by Step instructions 13 | 14 | 1. Configure DVC to use your GCP bucket for remote storage **(replace `X` with 15 | your user ID)**: 16 | ```bash 17 | dvc remote modify default url gs://cd4ml-continuous-intelligence-bucket-X 18 | ``` 19 | 20 | 2. 
Create your Machine Learning pipeline with dvc: 21 | ```bash 22 | dvc run -f input.dvc -d src/download_data.py -o data/raw/store47-2016.csv python src/download_data.py 23 | dvc run -f split.dvc -d data/raw/store47-2016.csv -d src/splitter.py -o data/splitter/train.csv -o data/splitter/validation.csv python src/splitter.py 24 | dvc run -d data/splitter/train.csv -d data/splitter/validation.csv -d src/decision_tree.py -o data/decision_tree/model.pkl -M results/metrics.json python src/decision_tree.py 25 | ``` 26 | 27 | 3. Add, commit, and push your changes: 28 | ```bash 29 | git add . 30 | git commit -m "Creating ML pipeline" 31 | git push 32 | ``` 33 | 34 | 4. Create machine learning training pipeline in GoCD: 35 | 36 | * Go to GoCD's [*"Admin" > "Pipelines"*](http://gocd.cd4ml.net/go/admin/pipelines) 37 | menu and create a new pipeline. 38 | 39 | * Configure your Github repository URL (e.g. `https://github.com//cd4ml-workshop.git`) as a Git material. 40 | 41 | * Give it a name related to your username, e.g. `ml-pipeline-X`, replacing `X` 42 | with your user ID). 43 | 44 | * Click on *"Advanced Settings"* for "Part 2", enable the "Use Template" 45 | toggle and select the existing `ml-pipeline-gcp-template` template. 46 | 47 | ![GoCD Configuration for ML pipeline](./images/3-ml-pipeline.png) 48 | 49 | * Click *"Save + Run This Pipeline"* 50 | 51 | 5. Combine both pipelines: 52 | 53 | * Go back to edit your original `ci-workshop-app-X` pipeline again. 54 | 55 | * In the *"Materials"* tab add your new pipeline as a new material 56 | (double-click to get the correct auto suggestion). 57 | 58 | * Expand the *"build-and-publish"* stage, and click on the *"build"* job. 59 | 60 | * Update the second build task to pull the latest model using DVC instead of downloading a static version from Google Storage, by replacing the 61 | `python src/download_data.py --model` command with 62 | `GOOGLE_APPLICATION_CREDENTIALS=./secret.json dvc pull model.pkl.dvc` 63 | 64 | ![Configuration for application pipeline](./images/3-app-pipeline.png) 65 | 66 | * Save and go back to the main [Dashboard](http://gocd.cd4ml.net) page 67 | 68 | 6. Wait for the machine learning pipeline to train and publish your model. 69 | 70 | **WARNING: The pipeline should fail because the model training accuracy is not 71 | good enough!** 72 | 73 | ### Improving our Model 74 | 75 | 7. In your code, change the model training approach to use a Random Forest 76 | algorithm, by editing the `src/decision_tree.py` file and replacing the `Model.DECISION_TREE` with `Model.RANDOM_FOREST` on the last line of the file. 77 | 78 | 8. Re-run your dvc pipeline locally: 79 | ```bash 80 | dvc repro model.pkl.dvc 81 | ``` 82 | 83 | 9. Add, commit, and push your changes, and watch your pipeline execute and go 84 | green: 85 | ```bash 86 | git add . 87 | git commit -m "Improving model algorithm" 88 | git push 89 | ``` 90 | 91 | 10. Once the machine learning pipeline succeeds, it will trigger a new 92 | application deployment pipeline, which will pull the new improved model and 93 | deploy it to production. Visit your application again to verify that you get 94 | better predictions! 95 | 96 | 11. Done! 
Go to [the next exercise](./4-tracking-experiments.md) 97 | -------------------------------------------------------------------------------- /instructions/4-tracking-experiments.md: -------------------------------------------------------------------------------- 1 | # Exercise 4: Tracking Experiments with MLflow 2 | 3 | ## Goals 4 | 5 | * Learn about [MLflow](https://mlflow.org/) 6 | * Configure our Machine Learning training Pipeline to track experiments and 7 | results 8 | 9 | ## Step by Step instructions 10 | 11 | 1. In [GoCD](http://gocd.cd4ml.net), click on the little gear symbol 12 | (![gear](./images/gear.png)) next to `ml-pipeline-X` to edit your machine 13 | learning pipeline configuration. 14 | 15 | 2. Open the *"Environment Variables"* tab and configure the URL to the MLFlow 16 | tracking server and your tenant: 17 | 18 | * `MLFLOW_TRACKING_URL = http://mlflow.cd4ml.net` 19 | * `TENANT = userX` (replace `X` with your user ID) 20 | 21 | ![Configure ML pipeline to track with MLflow](./images/4-mlflow-setup.png) 22 | 23 | 3. Save and return to the [Dashboard](http://gocd.cd4ml.net) page. 24 | 25 | 4. Trigger a new ML training pipeline and wait for it to succeed. 26 | 27 | 5. Visit the MLflow URL http://mlflow.cd4ml.net to find your experiment results. 28 | 29 | ![Tracking experiment runs with MLflow](./images/4-mlflow.png) 30 | 31 | 6. Done! Go to [the next exercise](./5-model-monitoring.md) 32 | -------------------------------------------------------------------------------- /instructions/5-model-monitoring.md: -------------------------------------------------------------------------------- 1 | # Exercise 5: Model Monitoring and Observability 2 | 3 | ## Goals 4 | 5 | * Learn about EFK Stack ([Elasticsearch](https://www.elastic.co/products/elasticsearch), 6 | [FluentD](https://www.fluentd.org/), and [Kibana](https://www.elastic.co/products/kibana)) 7 | * Configure and deploy our application to log prediction events to Elastic Search 8 | * Visualize events on Kibana dashboard 9 | * Learn how to close the data feedback loop 10 | 11 | ## Introduction 12 | 13 | To close the data feedback loop, we can log events in production to collect data 14 | about how our model is performing against real data. This data can later be 15 | curated and labeled to improve the dataset used during training. This allows us 16 | to continuously improve our models in production! 17 | 18 | In this workshop, we use the EFK stack for our monitoring and observability 19 | infrastructure. It is composed of three main components: 20 | 21 | * [Elasticsearch](https://www.elastic.co/products/elasticsearch): an open 22 | source search engine. 23 | * [FluentD](https://www.fluentd.org/): an open source data collector for unified 24 | logging layer. 25 | * [Kibana](https://www.elastic.co/products/kibana): an open source web UI that 26 | makes it easy to explore and visualize the data indexed by Elasticsearch. 27 | 28 | ## Step by Step instructions 29 | 30 | 1. In [GoCD](http://gocd.cd4ml.net), click on the little gear symbol 31 | (![gear](./images/gear.png)) next to `ci-workshop-app-X` to edit your 32 | deployment pipeline configuration. 33 | 34 | 2. Open the *"Environment Variables"* tab and configure the FluentD host and 35 | port: 36 | 37 | * `FLUENTD_HOST = elastic-stack-fluentd-elasticsearch.elk.svc.cluster.local` 38 | * `FLUENTD_PORT = 24224` 39 | 40 | ![Configure deployment pipeline to log to FluentD](./images/5-fluentd-setup.png) 41 | 42 | 3. Save and return to the [Dashboard](http://gocd.cd4ml.net) page. 
43 | 44 | 4. Trigger a new application deployment pipeline and wait for it to succeed. 45 | 46 | 5. Visit your application in production to make a few predictions. 47 | 48 | 6. Visit the Kibana URL http://kibana.cd4ml.net and click on the *"Discover"* 49 | menu. 50 | 51 | 7. In the search field, find the entries tagged with your user, with a query 52 | string `tag:"userX.prediction"` (substitute `X` with your user ID). 53 | 54 | 8. Click *"Refresh"* and you should see your predictions logged! 55 | 56 | ![Prediction events in Kibana](./images/5-kibana.png) 57 | 58 | 9. Done! 59 | 60 | **NOTE: after the end of the workshop, we delete all the infrastructure and GoCD pipelines for security and cost reasons.** 61 | 62 | You don’t need to use the same tools we chose to implement CD4ML. Get in touch 63 | with us if you want to learn how to run this workshop with your teams in your 64 | company! 65 | -------------------------------------------------------------------------------- /instructions/images/1-open-terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/1-open-terminal.png -------------------------------------------------------------------------------- /instructions/images/1-sample-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/1-sample-app.png -------------------------------------------------------------------------------- /instructions/images/3-app-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/3-app-pipeline.png -------------------------------------------------------------------------------- /instructions/images/3-ml-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/3-ml-pipeline.png -------------------------------------------------------------------------------- /instructions/images/4-mlflow-setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/4-mlflow-setup.png -------------------------------------------------------------------------------- /instructions/images/4-mlflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/4-mlflow.png -------------------------------------------------------------------------------- /instructions/images/5-fluentd-setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/5-fluentd-setup.png -------------------------------------------------------------------------------- /instructions/images/5-kibana.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/5-kibana.png -------------------------------------------------------------------------------- /instructions/images/gear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/gear.png -------------------------------------------------------------------------------- /jupyter_notebooks/Exploratory_Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exploratory Analysis using Jupyter Notebook\n", 8 | "For further reading, we recommend: \n", 9 | "- [the pandas documentation](http://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html#getting) for information about using DataFrames\n", 10 | "- [this blog post](https://towardsdatascience.com/introduction-to-data-visualization-in-python-89a54c97fbed) for a jumpstart into visualizations\n", 11 | "- [the matplotlib documentation](https://matplotlib.org/users/pyplot_tutorial.html) for more info about visualizations" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "#### Loading data from our GPC bucket" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import s3fs\n", 37 | "s3 = s3fs.S3FileSystem(anon=True)\n", 38 | "s3.ls('twde-datalab/raw')\n", 39 | "\n", 40 | "s3.get('twde-datalab/raw/quito_stores_sample2016-2017.csv', \n", 41 | " '../data/quito_stores_sample2016-2017.csv')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "train = pd.read_csv('../data/quito_stores_sample2016-2017.csv')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | "
iddatestore_nbritem_nbrunit_salesonpromotioncitystatecluster
0882114712016-08-16441035207.0TrueQuitoPichincha5
1882114722016-08-16441036657.0FalseQuitoPichincha5
2882114732016-08-164410557413.0FalseQuitoPichincha5
3882114742016-08-164410557518.0FalseQuitoPichincha5
4882114752016-08-16441055778.0FalseQuitoPichincha5
\n", 153 | "
" 154 | ], 155 | "text/plain": [ 156 | " id date store_nbr item_nbr unit_sales onpromotion city \\\n", 157 | "0 88211471 2016-08-16 44 103520 7.0 True Quito \n", 158 | "1 88211472 2016-08-16 44 103665 7.0 False Quito \n", 159 | "2 88211473 2016-08-16 44 105574 13.0 False Quito \n", 160 | "3 88211474 2016-08-16 44 105575 18.0 False Quito \n", 161 | "4 88211475 2016-08-16 44 105577 8.0 False Quito \n", 162 | "\n", 163 | " state cluster \n", 164 | "0 Pichincha 5 \n", 165 | "1 Pichincha 5 \n", 166 | "2 Pichincha 5 \n", 167 | "3 Pichincha 5 \n", 168 | "4 Pichincha 5 " 169 | ] 170 | }, 171 | "execution_count": 4, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "train.head()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "#### With just this glimpse, you can start to fill out your list of assumptions, hypotheses, and questions. Some of mine are:\n", 185 | "- Question: What is the span of dates we are provided?\n", 186 | "- Question: How many distinct store_nbr values are there?\n", 187 | "- Question: How many distinct item_nbr values are there?\n", 188 | "- Hypothesis: unit_sales are always positive\n", 189 | "- Hypothesis: onpromotion is always either True or False\n", 190 | "- Hypothesis: city and state are always going to be Quito and Pichincha\n", 191 | "- Hypothesis: cluster is always 5\n", 192 | "- Question: What does cluster mean and is it important to know?\n", 193 | "- Question: How many records does the data contain?\n", 194 | "- Question: What other data files are available?" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "### Here's some examples of how to address those first questions" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# Access an entire dataframe column like you would\n", 211 | "# the value in a python dictionary:\n", 212 | "# (The returned object has similar pandas built-in \n", 213 | "# functions, like 'head' and 'max')\n", 214 | "print(data['date'].min())\n", 215 | "print(data['date'].max())" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "# Dataframe columns also have a 'unique' method,\n", 225 | "# which can answer several of our questions from above\n", 226 | "data['store_nbr'].unique()" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "print(data['item_nbr'].unique())\n", 236 | "print(\"There are too many item numbers to display, so let's just count them for now:\")\n", 237 | "print(\"\\n{} different item_nbr values in our data\"\n", 238 | " .format(len(data['item_nbr'].unique())))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "#### It might be helpful to know the 'shape' of our data. We could count by hand (for now) the columns, but how many rows do we have altogether?" 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "print(data.shape)\n", 255 | "print(\"There are {} rows and {} columns in our data\".format(data.shape[0], data.shape[1]))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "#### Moving along to answer our intial questions... Let's have a look at unit_sales. Keep in mind that unit sales is the variable we want to predict with our science.\n", 263 | "\n", 264 | "Each row in our data is essentially telling us a `unit_sales` number for a given `item_nbr` at a given `store_nbr` on a given `date`. That is, \"how many of an item was sold at a store on a day\"." 265 | ] 266 | } 267 | ], 268 | "metadata": { 269 | "kernelspec": { 270 | "display_name": "continuous-intelligence-workshop", 271 | "language": "python", 272 | "name": "continuous-intelligence-workshop" 273 | }, 274 | "language_info": { 275 | "codemirror_mode": { 276 | "name": "ipython", 277 | "version": 3 278 | }, 279 | "file_extension": ".py", 280 | "mimetype": "text/x-python", 281 | "name": "python", 282 | "nbconvert_exporter": "python", 283 | "pygments_lexer": "ipython3", 284 | "version": "3.7.3" 285 | } 286 | }, 287 | "nbformat": 4, 288 | "nbformat_minor": 2 289 | } 290 | -------------------------------------------------------------------------------- /jupyter_notebooks/Feature_Engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Engineering\n", 8 | "\n", 9 | "Feature engineering is an answer to the question, \"How can I make the most of the data I have?\"\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Let's get started, then. How does one do feature engineering?\n", 17 | "\n", 18 | "I'll assume you're familiar with pandas and the decision tree pipeline that we're using for this project. That's the algorithm we're going to engineer the data for; not all algorithms will want the data engineered the same way, though often the benefits will work for many algorithms." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# load the data output by src/merger.py\n", 37 | "original_data = pd.read_csv('./merger/bigTable.csv')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Index(['id', 'date', 'store_nbr', 'item_nbr', 'unit_sales', 'onpromotion',\n", 50 | " 'city', 'state', 'cluster', 'family', 'class', 'perishable',\n", 51 | " 'transactions', 'year', 'month', 'day', 'dayofweek',\n", 52 | " 'days_til_end_of_data', 'cpi', 'dayoff', 'percent_in_transactions',\n", 53 | " 'item_store_sales_variance'],\n", 54 | " dtype='object')\n", 55 | "(5877318, 22)\n" 56 | ] 57 | }, 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | "
iddatestore_nbritem_nbrunit_salesonpromotioncitystateclusterfamily...transactionsyearmonthdaydayofweekdays_til_end_of_datacpidayoffpercent_in_transactionsitem_store_sales_variance
0882114712016-08-16441035207.0TrueQuitoPichincha5GROCERY I...394120168161364105.123322False0.00177639.466659
1883063562016-08-17441035202.0FalseQuitoPichincha5GROCERY I...425620168172363105.123322False0.00047039.466659
2883990032016-08-18441035204.0FalseQuitoPichincha5GROCERY I...377620168183362105.123322False0.00105939.466659
3884923682016-08-19441035206.0FalseQuitoPichincha5GROCERY I...418520168194361105.123322False0.00143439.466659
4885916262016-08-204410352013.0TrueQuitoPichincha5GROCERY I...483020168205360105.123322True0.00269239.466659
\n", 225 | "

5 rows × 22 columns

\n", 226 | "
" 227 | ], 228 | "text/plain": [ 229 | " id date store_nbr item_nbr unit_sales onpromotion city \\\n", 230 | "0 88211471 2016-08-16 44 103520 7.0 True Quito \n", 231 | "1 88306356 2016-08-17 44 103520 2.0 False Quito \n", 232 | "2 88399003 2016-08-18 44 103520 4.0 False Quito \n", 233 | "3 88492368 2016-08-19 44 103520 6.0 False Quito \n", 234 | "4 88591626 2016-08-20 44 103520 13.0 True Quito \n", 235 | "\n", 236 | " state cluster family ... transactions \\\n", 237 | "0 Pichincha 5 GROCERY I ... 3941 \n", 238 | "1 Pichincha 5 GROCERY I ... 4256 \n", 239 | "2 Pichincha 5 GROCERY I ... 3776 \n", 240 | "3 Pichincha 5 GROCERY I ... 4185 \n", 241 | "4 Pichincha 5 GROCERY I ... 4830 \n", 242 | "\n", 243 | " year month day dayofweek days_til_end_of_data cpi dayoff \\\n", 244 | "0 2016 8 16 1 364 105.123322 False \n", 245 | "1 2016 8 17 2 363 105.123322 False \n", 246 | "2 2016 8 18 3 362 105.123322 False \n", 247 | "3 2016 8 19 4 361 105.123322 False \n", 248 | "4 2016 8 20 5 360 105.123322 True \n", 249 | "\n", 250 | " percent_in_transactions item_store_sales_variance \n", 251 | "0 0.001776 39.466659 \n", 252 | "1 0.000470 39.466659 \n", 253 | "2 0.001059 39.466659 \n", 254 | "3 0.001434 39.466659 \n", 255 | "4 0.002692 39.466659 \n", 256 | "\n", 257 | "[5 rows x 22 columns]" 258 | ] 259 | }, 260 | "execution_count": 4, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "print(original_data.columns)\n", 267 | "print(original_data.shape)\n", 268 | "original_data.head()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 14, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "import sys, os\n", 278 | "sys.path.append(os.path.join('src'))\n", 279 | "from src import splitter" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 15, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "Loading data from merger output\n", 292 | "Splitting data 70:30 train:validation\n", 293 | "Writing to ./splitter/train.csv\n", 294 | "Writing to ./splitter/validation.csv\n", 295 | "Finished splitting\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "# Now we run splitter and decision_tree with our original data\n", 301 | "splitter.main()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 16, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "from src import decision_tree" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 17, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "Loading data from splitter/train.csv\n", 323 | "Loading data from splitter/validation.csv\n", 324 | "Encoding categorical variables\n", 325 | "Joining tables for consistent encoding\n", 326 | "Creating decision tree model\n", 327 | "Making prediction on validation data\n", 328 | "Calculating estimated error\n" 329 | ] 330 | }, 331 | { 332 | "name": "stderr", 333 | "output_type": "stream", 334 | "text": [ 335 | "src/evaluation.py:12: RuntimeWarning: divide by zero encountered in log\n", 336 | " log_square_errors = (np.log(predictions + 1) - np.log(targets + 1)) ** 2\n", 337 | "src/evaluation.py:12: RuntimeWarning: invalid value encountered in log\n", 338 | " log_square_errors = (np.log(predictions + 1) - np.log(targets + 1)) ** 2\n" 339 | ] 340 | }, 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 
344 | "text": [ 345 | "Writing to ./decision_tree/model.pkl\n", 346 | "Writing to ./decision_tree/score_and_metadata.csv\n", 347 | "Done deciding with trees\n", 348 | "Decision tree analysis done with a validation score (error rate) of 0.00268005495579566.\n" 349 | ] 350 | } 351 | ], 352 | "source": [ 353 | "decision_tree.main()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 18, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "original_validation_score = 0.00268005495579566" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "So now we have a baseline for how well our decision tree performed before we added a feature.\n", 370 | "\n", 371 | "Let's see what happens if we add a `two_weeks_before_christmas` and a `two_weeks_after_christmas` column, as per our Exploratory Analysis discussion." 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 23, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# Re-read the data and use datetime objects for the date\n", 381 | "engineered_data = pd.read_csv('./merger/bigTable.csv')\n", 382 | "engineered_data.date = pd.to_datetime(engineered_data.date)\n" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 24, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "# Create a before_christmas_window\n", 392 | "start_date = pd.to_datetime('2016-12-11')\n", 393 | "end_date = pd.to_datetime('2016-12-25')\n", 394 | "before_christmas = (engineered_data['date'] > start_date) & (engineered_data['date'] <= end_date)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 25, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "# Create an after_christmas_window\n", 404 | "start_date = pd.to_datetime('2016-12-25')\n", 405 | "end_date = pd.to_datetime('2017-01-08')\n", 406 | "after_christmas = (engineered_data['date'] > start_date) & (engineered_data['date'] <= end_date)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 37, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "engineered_data['two_weeks_before_christmas'] = before_christmas\n", 416 | "engineered_data['two_weeks_after_christmas'] = after_christmas" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "#### Just as a spot check, let's look at the date of the first few records in our new columns" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 38, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "117 2016-12-12\n", 436 | "118 2016-12-13\n", 437 | "119 2016-12-14\n", 438 | "120 2016-12-15\n", 439 | "121 2016-12-16\n", 440 | "Name: date, dtype: datetime64[ns]\n", 441 | "130 2016-12-26\n", 442 | "131 2016-12-27\n", 443 | "132 2016-12-28\n", 444 | "133 2016-12-29\n", 445 | "134 2016-12-30\n", 446 | "Name: date, dtype: datetime64[ns]\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "print(engineered_data[engineered_data.two_weeks_before_christmas == True].date.head())\n", 452 | "print(engineered_data[engineered_data.two_weeks_after_christmas == True].date.head())" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "Seems okay to me. Let's see how it changes the results now." 
460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 41, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "engineered_data.to_csv('./merger/bigTable.csv', index=False)\n" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 42, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "Loading data from merger output\n", 481 | "Splitting data 70:30 train:validation\n", 482 | "Writing to ./splitter/train.csv\n", 483 | "Writing to ./splitter/validation.csv\n", 484 | "Finished splitting\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "splitter.main()\n" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 43, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "Loading data from splitter/train.csv\n", 502 | "Loading data from splitter/validation.csv\n", 503 | "Encoding categorical variables\n", 504 | "Joining tables for consistent encoding\n", 505 | "Creating decision tree model\n", 506 | "Making prediction on validation data\n", 507 | "Calculating estimated error\n" 508 | ] 509 | }, 510 | { 511 | "name": "stderr", 512 | "output_type": "stream", 513 | "text": [ 514 | "src/evaluation.py:12: RuntimeWarning: divide by zero encountered in log\n", 515 | " log_square_errors = (np.log(predictions + 1) - np.log(targets + 1)) ** 2\n", 516 | "src/evaluation.py:12: RuntimeWarning: invalid value encountered in log\n", 517 | " log_square_errors = (np.log(predictions + 1) - np.log(targets + 1)) ** 2\n" 518 | ] 519 | }, 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "Writing to ./decision_tree/model.pkl\n", 525 | "Writing to ./decision_tree/score_and_metadata.csv\n", 526 | "Done deciding with trees\n", 527 | "Decision tree analysis done with a validation score (error rate) of 0.003692818003915606.\n" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "decision_tree.main()" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 44, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "engineered_validation_score = 0.003692818003915606" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 45, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "name": "stdout", 551 | "output_type": "stream", 552 | "text": [ 553 | "-0.0010127630481199463\n" 554 | ] 555 | } 556 | ], 557 | "source": [ 558 | "print(original_validation_score - engineered_validation_score)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "So as it turns out, adding a boolean about before/after Christmas slightly hurt our performance. \n", 566 | "\n", 567 | "- Now we should iterate on the features \n", 568 | " - for example, maybe two weeks is too wide a window\n", 569 | "- or maybe it's time to question if the scoring algorithm provided to us by the kaggle competition\n", 570 | " - should we replace nwrmsle with another error measurement?" 
571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.7.3" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 2 602 | } 603 | -------------------------------------------------------------------------------- /jupyter_notebooks/Negative_Sales.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "pd.options.display.float_format = '{:,.2f}'.format # use 2 decimals, not scientific notation" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import s3fs\n", 20 | "s3 = s3fs.S3FileSystem(anon=True)\n", 21 | "s3.ls('twde-datalab/raw')\n", 22 | "\n", 23 | "#may require `mkdir data/`\n", 24 | "s3.get('twde-datalab/raw/quito_stores_sample2016-2017.csv', \n", 25 | " '../data/quito_stores_sample2016-2017.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "data = pd.read_csv('../data/quito_stores_sample2016-2017.csv')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "#### Starting with `.describe()` is never a bad place to start data exploration " 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "data.unit_sales.describe()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "### I have questions about those *negative sales*. How do you negative sell something?\n", 58 | "That's got my gears turning. Here are some of my new questions about the data:\n", 59 | "- Question: What does a negative sale mean?\n", 60 | "- Question: How often are sales negative?\n", 61 | "- Question: How many times are sales above 5,000?\n", 62 | "- Question: How do the unit_sales numbers vary with the `date` column?\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "#### A good way to answer these questions is with some visualizations.\n", 70 | "\n", 71 | "It might be difficult to get an intuitive feel of the data by knowing the exact answer to many of those questions. What we actually want to learn is the personality of the data. We want to know what it looks like in a glance." 
72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Let's import the python libraries that do the heavy lifting of data visualization" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "%matplotlib inline\n", 88 | "import random\n", 89 | "import matplotlib.pyplot as plt\n", 90 | "import seaborn as sns" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "and then let's look at a box plot of unit sales. A box plot conveys the mean and the middle 50% of the data." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "sns.boxplot(data.unit_sales)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "**This box plot is awful.** We can't even recognize the inter-quartile-range. \n", 114 | "\n", 115 | "Let's make a decision: **Ignore \"very large\" values** (perhaps to be explored later)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "outliers = data[data.unit_sales > 1000]" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "majority_of_data = data[data.unit_sales <= 1000]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "and once again look at the boxplot of the non-outlier (for lack of a better term) data" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "sns.boxplot(majority_of_data.unit_sales)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "I'm surprised. This plot isn't any better than the first one. Let's try a different visualization... Maybe kernel density estimation plot. \n", 157 | "\n", 158 | "This shows us the probability of a data point being a certain value." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "sns.kdeplot(majority_of_data.unit_sales, clip=[-100,500])" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "We can see that the likelihood of getting a certain unit_sales value tapers off dramatically and has almost vanished by a unit_sales of 100." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "I'd also like to know how do sales change over time. \n", 189 | "**Is there a weekly cycle? A monthly cycle?**\n", 190 | "Let's look at that with a line graph." 
191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "#### First, we convert the date column into a datetime object, and set it as the index\n", 198 | "Then we find the weekly average of the data and plot it" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "data.index = pd.to_datetime(data.date)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "data.unit_sales.resample('W').mean().plot(x='index',y='unit_sales')" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Here we can see something that seems interesting around December-January. We also see what appears to be a couple of sales cycles throughout the year. \n", 224 | "\n", 225 | "**What do you think causes the huge drop-off in August-September?**\n", 226 | "\n", 227 | "I'm curious to see if returns happen more frequently after Christmas, so I'm going to repeat the above plot, but only focusing on returns." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "data[data.unit_sales < 0].unit_sales.resample('W').mean().plot(x='index',y='unit_sales')" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "This graph is surprising to me. Is it surprising to you? I still strongly suspect that returns happen at a statistically significantly different rate after Christmas, given that purchases spike around Dec-Jan anyway. My next thought is about those outliers. Maybe `mean` isn't the right measurement to use, since means can be skewed by outliers. \n", 244 | "\n", 245 | "Let's see the same graph as above, only this time using `median` as the measurement. " 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "data[data.unit_sales < 0].unit_sales.resample('W').median().plot(x='index',y='unit_sales')" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "There we go. Look at that. When we use a statistic that is robust to outliers, we can see that return behavior is very different around Christmas.\n", 262 | "\n", 263 | "What can we do with this knowledge? If we're to predict sales and returns for the end of December and beginning of January, our model should incorporate the effect of Christmas on sales. Perhaps it'd be useful to add columns called `is_two_weeks_before_christmas` and `is_two_weeks_after_christmas` (a pandas sketch for these flags appears at the very end of this listing).
" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "# import os\n", 280 | "# import urllib.request\n", 281 | "# import argparse\n", 282 | "\n", 283 | "# def load_data(path, key):\n", 284 | "# gcsBucket = \"continuous-intelligence\"\n", 285 | "\n", 286 | "# if not os.path.exists(path):\n", 287 | "# os.makedirs(path)\n", 288 | "\n", 289 | "# if not os.path.exists(os.path.join(path, key)):\n", 290 | "# url = \"https://storage.googleapis.com/%s/%s\" % (gcsBucket, key)\n", 291 | "# urllib.request.urlretrieve(url, os.path.join(path, key))\n", 292 | "\n", 293 | "# load_data(path='data/raw', key='store47-2016.csv')\n", 294 | "\n" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [] 303 | } 304 | ], 305 | "metadata": { 306 | "kernelspec": { 307 | "display_name": "continuous-intelligence-workshop", 308 | "language": "python", 309 | "name": "continuous-intelligence-workshop" 310 | }, 311 | "language_info": { 312 | "codemirror_mode": { 313 | "name": "ipython", 314 | "version": 3 315 | }, 316 | "file_extension": ".py", 317 | "mimetype": "text/x-python", 318 | "name": "python", 319 | "nbconvert_exporter": "python", 320 | "pygments_lexer": "ipython3", 321 | "version": "3.7.3" 322 | } 323 | }, 324 | "nbformat": 4, 325 | "nbformat_minor": 2 326 | } 327 | -------------------------------------------------------------------------------- /jupyter_notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Exploratory Analysis with Jupyter Notebook 2 | ----------- 3 | 4 | One major difference between the software development process you're used to and the process you'll use doing data science is the **scientific process** part of it. 5 | 6 | From the outset, we can't know exactly what sort of product we're going to build or how we'll use the algorithms we end up using. The data is a huge source of uncertainty and until we poke and prod it, we'll be working in the dark. 7 | 8 | ### Asking relevant questions 9 | The dataset for this project is sales data for a grocery store in Ecuador. We've reduced the dimensionality of the data so it can be quickly analyzed on most people's laptop computers, but there is still enough data to make meaningful inquiries and accurate predictions. 10 | 11 | As you approach a data problem, you should try to identify your hypotheses and your questions. You should simultaneously try to test your hypotheses and seek answers to your questions; these will likely produce a feedback loop where each new answer kindles another question. That's how science works. 12 | 13 | At this point, you might not know how many stores you're trying to predict sales for. Do you know how many items the stores sell? What is the date range of the data provided? 14 | 15 | ### Reading a Jupyter Notebook 16 | This directory contains a [basic exploratory analysis in a Jupyter Notebook](https://github.com/ThoughtWorksInc/twde-datalab/blob/master/jupyter_notebooks/Exploratory_Analysis.ipynb). GitHub can render Jupyter notebooks, so by following the link you can read the notebook. 17 | 18 | ### Working with a Jupyter Notebook 19 | The real fun obviously lies in using the notebook.
For this you have to run a Jupyter notebook server locally. If you installed Python 3 using Anaconda, then Jupyter should already be on your path. 20 | 21 | 1. `cd jupyter_notebooks` 22 | 1. `jupyter notebook` 23 | 24 | The notebook server should start up and a browser window should open on your machine, allowing you to choose a notebook from this directory. 25 | 26 | -------------------------------------------------------------------------------- /kubernetes/web.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: $tenant$ 5 | --- 6 | apiVersion: v1 7 | kind: Service 8 | metadata: 9 | name: ci-workshop-web 10 | namespace: $tenant$ 11 | labels: 12 | app: ci-workshop 13 | spec: 14 | ports: 15 | - port: 80 16 | targetPort: 5005 17 | selector: 18 | app: ci-workshop 19 | tier: frontend 20 | --- 21 | apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2 22 | kind: Deployment 23 | metadata: 24 | name: ci-workshop-web 25 | namespace: $tenant$ 26 | labels: 27 | app: ci-workshop 28 | spec: 29 | selector: 30 | matchLabels: 31 | app: ci-workshop 32 | tier: frontend 33 | strategy: 34 | type: RollingUpdate 35 | template: 36 | metadata: 37 | labels: 38 | app: ci-workshop 39 | tier: frontend 40 | spec: 41 | containers: 42 | - image: eu.gcr.io/continuous-intelligence/ci-workshop-app 43 | imagePullPolicy: IfNotPresent 44 | name: ci-workshop-web 45 | env: 46 | - name: TENANT 47 | value: $tenant$ 48 | - name: FLUENTD_HOST 49 | value: '$fluentd_host$' 50 | - name: FLUENTD_PORT 51 | value: '$fluentd_port$' 52 | ports: 53 | - containerPort: 5005 54 | name: ci-workshop-web 55 | livenessProbe: 56 | httpGet: 57 | path: / 58 | port: ci-workshop-web 59 | initialDelaySeconds: 5 60 | readinessProbe: 61 | httpGet: 62 | path: / 63 | port: ci-workshop-web 64 | initialDelaySeconds: 5 65 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dvc[gs]==0.71.0 2 | numpy==1.17.4 3 | pandas==0.25.3 4 | pylint==2.4.4 5 | pytest==5.3.0 6 | scikit-learn==0.21.3 7 | flask==1.1.1 8 | mlflow==1.4.0 9 | lime==0.1.1.36 10 | fluent-logger==0.9.4 11 | s3fs==0.1.2 12 | seaborn==0.9.0 13 | joblib==0.14.0 14 | -------------------------------------------------------------------------------- /results/metrics.json: -------------------------------------------------------------------------------- 1 | {"nwrmsle": 0.8261571666321248, "r2_score": -0.8594341254951023} -------------------------------------------------------------------------------- /run_decisiontree_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | python3 src/download_data.py 6 | python3 src/splitter.py 7 | python3 src/decision_tree.py 8 | -------------------------------------------------------------------------------- /setup-git.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | read -p 'Name: ' NAME 4 | read -p 'E-mail: ' EMAIL 5 | read -p 'Github username: ' GITHUB_USER 6 | read -sp 'Github personal access token: ' GITHUB_TOKEN 7 | echo 8 | echo "Setting up git..." 
9 | git config --global user.name "$NAME" 10 | git config --global user.email "$EMAIL" 11 | git remote set-url origin https://$GITHUB_USER:$GITHUB_TOKEN@github.com/$GITHUB_USER/cd4ml-workshop 12 | git pull --rebase 13 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/src/__init__.py -------------------------------------------------------------------------------- /src/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, jsonify, request 2 | from datetime import datetime 3 | import joblib 4 | import pandas as pd 5 | import decision_tree 6 | import os 7 | from fluent import sender, event 8 | 9 | app = Flask(__name__, template_folder='webapp/templates', static_folder='webapp/static') 10 | 11 | products = { 12 | "99197": { 13 | "class": 1067, 14 | "family": "GROCERY I", 15 | "perishable": 0 16 | }, 17 | "105574": { 18 | "class": 1045, 19 | "family": "GROCERY I", 20 | "perishable": 0 21 | }, 22 | "1963838": { 23 | "class": 3024, 24 | "family": "CLEANING", 25 | "perishable": 0 26 | } 27 | } 28 | 29 | TENANT = os.getenv('TENANT', 'local') 30 | FLUENTD_HOST = os.getenv('FLUENTD_HOST') 31 | FLUENTD_PORT = os.getenv('FLUENTD_PORT') 32 | 33 | @app.route('/') 34 | def index(): 35 | return render_template('index.html') 36 | 37 | @app.route('/prediction') 38 | def get_prediction(): 39 | loaded_model = joblib.load('data/decision_tree/model.pkl') 40 | 41 | date_string = request.args.get('date') 42 | 43 | date = datetime.strptime(date_string, '%Y-%m-%d') 44 | 45 | product = products[request.args.get("item_nbr")] 46 | data = { 47 | "date": date_string, 48 | "item_nbr": request.args.get("item_nbr"), 49 | "family": product['family'], 50 | "class": product['class'], 51 | "perishable": product['perishable'], 52 | "transactions": 1000, 53 | "year": date.year, 54 | "month": date.month, 55 | "day": date.day, 56 | "dayofweek": date.weekday(), 57 | "days_til_end_of_data": 0, 58 | "dayoff": date.weekday() >= 5 59 | } 60 | df = pd.DataFrame(data=data, index=['row1']) 61 | 62 | df = decision_tree.encode_categorical_columns(df) 63 | pred = loaded_model.predict(df) 64 | if FLUENTD_HOST: 65 | logger = sender.FluentSender(TENANT, host=FLUENTD_HOST, port=int(FLUENTD_PORT)) 66 | log_payload = {'prediction': pred[0], **data} 67 | print('logging {}'.format(log_payload)) 68 | if not logger.emit('prediction', log_payload): 69 | print(logger.last_error) 70 | logger.clear_last_error() 71 | 72 | return "%d" % pred[0] 73 | 74 | if __name__ == '__main__': 75 | app.run(host = '0.0.0.0', port=5005) 76 | -------------------------------------------------------------------------------- /src/decision_tree.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import numpy as np 3 | import pandas as pd 4 | import sys, os, json 5 | from sklearn.preprocessing import LabelEncoder 6 | import joblib 7 | sys.path.append(os.path.join('..', 'src')) 8 | sys.path.append(os.path.join('src')) 9 | from sklearn import tree, ensemble, metrics 10 | import evaluation 11 | import tracking 12 | 13 | class Model(Enum): 14 | DECISION_TREE = 0 15 | RANDOM_FOREST = 1 16 | ADABOOST = 2 17 | GRADIENT_BOOST = 3 18 | 19 | 20 | def load_data(): 21 | filename = "data/splitter/train.csv" 22 | 
print("Loading data from {}".format(filename)) 23 | train = pd.read_csv(filename) 24 | 25 | filename = 'data/splitter/validation.csv' 26 | print("Loading data from {}".format(filename)) 27 | validate = pd.read_csv(filename) 28 | 29 | return train, validate 30 | 31 | 32 | def join_tables(train, validate): 33 | print("Joining tables for consistent encoding") 34 | return train.append(validate).drop('date', axis=1) 35 | 36 | 37 | def encode_categorical_columns(df): 38 | obj_df = df.select_dtypes(include=['object', 'bool']).copy().fillna('-1') 39 | lb = LabelEncoder() 40 | for col in obj_df.columns: 41 | df[col] = lb.fit_transform(obj_df[col]) 42 | return df 43 | 44 | 45 | def encode(train, validate): 46 | print("Encoding categorical variables") 47 | train_ids = train.id 48 | validate_ids = validate.id 49 | 50 | joined = join_tables(train, validate) 51 | 52 | encoded = encode_categorical_columns(joined.fillna(-1)) 53 | 54 | print("Not predicting returns...") 55 | encoded.loc[encoded.unit_sales < 0, 'unit_sales'] = 0 56 | 57 | validate = encoded[encoded['id'].isin(validate_ids)] 58 | train = encoded[encoded['id'].isin(train_ids)] 59 | return train, validate 60 | 61 | 62 | def train_model(train, model=Model.DECISION_TREE, seed=None): 63 | print("Training model using regressor: {}".format(model.name)) 64 | train_dropped = train.drop('unit_sales', axis=1) 65 | target = train['unit_sales'] 66 | 67 | if model == Model.RANDOM_FOREST: 68 | params = {'n_estimators': 10} 69 | clf = ensemble.RandomForestRegressor(random_state=seed, **params) 70 | elif model == Model.ADABOOST: 71 | params = {'n_estimators': 50, 'learning_rate': 1.0, 'loss':'linear'} 72 | clf = ensemble.AdaBoostRegressor(random_state=seed, **params) 73 | elif model == Model.GRADIENT_BOOST: 74 | params = {'n_estimators': 200, 'max_depth': 4} 75 | clf = ensemble.GradientBoostingRegressor(random_state=seed, **params) 76 | else: 77 | params = {'criterion': 'mse'} 78 | clf = tree.DecisionTreeRegressor(random_state=seed) 79 | 80 | trained_model = clf.fit(train_dropped, target) 81 | return (trained_model,params) 82 | 83 | 84 | def overwrite_unseen_prediction_with_zero(preds, train, validate): 85 | cols_item_store = ['item_nbr', 'store_nbr'] 86 | cols_to_use = validate.columns.drop('unit_sales') if 'unit_sales' in validate.columns else validate.columns 87 | validate_train_joined = pd.merge(validate[cols_to_use], train, on=cols_item_store, how='left') 88 | unseen = validate_train_joined[validate_train_joined['unit_sales'].isnull()] 89 | validate['preds'] = preds 90 | validate.loc[validate.id.isin(unseen['id_x']), 'preds'] = 0 91 | preds = validate['preds'].tolist() 92 | return preds 93 | 94 | 95 | def make_predictions(model, validate): 96 | print("Making prediction on validation data") 97 | validate_dropped = validate.drop('unit_sales', axis=1).fillna(-1) 98 | validate_preds = model.predict(validate_dropped) 99 | return validate_preds 100 | 101 | 102 | def write_predictions_and_score(evaluation_metrics, model, columns_used): 103 | key = "decision_tree" 104 | if not os.path.exists('data/{}'.format(key)): 105 | os.makedirs('data/{}'.format(key)) 106 | filename = 'data/{}/model.pkl'.format(key) 107 | print("Writing to {}".format(filename)) 108 | joblib.dump(model, filename) 109 | 110 | filename = 'results/metrics.json' 111 | print("Writing to {}".format(filename)) 112 | if not os.path.exists('results'): 113 | os.makedirs('results') 114 | with open(filename, 'w+') as score_file: 115 | json.dump(evaluation_metrics, score_file) 116 | 117 | 118 | def 
main(model=Model.DECISION_TREE, seed=None): 119 | original_train, original_validate = load_data() 120 | train, validate = encode(original_train, original_validate) 121 | with tracking.track() as track: 122 | track.set_model(model) 123 | model, params = train_model(train, model, seed) 124 | track.log_params(params) 125 | validation_predictions = make_predictions(model, validate) 126 | 127 | print("Calculating metrics") 128 | evaluation_metrics = { 129 | 'nwrmsle': evaluation.nwrmsle(validation_predictions, validate['unit_sales'].values, validate['perishable'].values), 130 | 'r2_score': metrics.r2_score(y_true=validate['unit_sales'].values, y_pred=validation_predictions) 131 | } 132 | track.log_metrics(evaluation_metrics) 133 | 134 | write_predictions_and_score(evaluation_metrics, model, original_train.columns) 135 | 136 | print("Evaluation done with metrics {}.".format(json.dumps(evaluation_metrics))) 137 | 138 | 139 | if __name__ == "__main__": 140 | main(model=Model.DECISION_TREE, seed=8675309) 141 | -------------------------------------------------------------------------------- /src/download_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request 3 | import argparse 4 | 5 | def load_data(path, key): 6 | gcsBucket = "continuous-intelligence" 7 | 8 | if not os.path.exists(path): 9 | os.makedirs(path) 10 | 11 | if not os.path.exists(os.path.join(path, key)): 12 | url = "https://storage.googleapis.com/%s/%s" % (gcsBucket, key) 13 | urllib.request.urlretrieve(url, os.path.join(path, key)) 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser(description='Download files from Google Storage.') 18 | parser.add_argument('--model', action='store_true', default=False, help='Downloads model (data/decision_tree/model.pkl) instead of input file (data/raw/store47-2016.csv)') 19 | args = parser.parse_args() 20 | 21 | if args.model: 22 | print("Loading model...") 23 | load_data(path='data/decision_tree', key='model.pkl') 24 | else: 25 | print("Loading input data...") 26 | load_data(path='data/raw', key='store47-2016.csv') 27 | print("Finished downloading") 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /src/evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def nwrmsle(predictions, targets, weights): 5 | if type(predictions) == list: 6 | predictions = np.array([np.nan if x < 0 else x for x in predictions]) 7 | elif type(predictions) == pd.Series: 8 | predictions[predictions < 0] = np.nan 9 | targetsf = targets.astype(float) 10 | targetsf[targets < 0] = np.nan 11 | weights = 1 + 0.25 * weights 12 | log_square_errors = (np.log(predictions + 1) - np.log(targetsf + 1)) ** 2 13 | return(np.sqrt(np.sum(weights * log_square_errors) / np.sum(weights))) 14 | -------------------------------------------------------------------------------- /src/splitter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | def get_validation_period(latest_date_train, days_back=15): 5 | # for Kaggle we want from Wednesday to Thursday for a 15 day period 6 | offset = (latest_date_train.weekday() - 3) % 7 7 | end_of_validation_period = latest_date_train - pd.DateOffset(days=offset) 8 | begin_of_validation_period = end_of_validation_period - pd.DateOffset(days=days_back) 9 | return 
(begin_of_validation_period, end_of_validation_period) 10 | 11 | 12 | def split_validation_train_by_validation_period(train, validation_begin_date, validation_end_date): 13 | train_validation = train[(train['date'] >= validation_begin_date) & (train['date'] <= validation_end_date)] 14 | train_train = train[train['date'] < validation_begin_date] 15 | return train_train, train_validation 16 | 17 | 18 | def write_data(table, filename): 19 | if not os.path.exists('data/splitter'): 20 | os.makedirs('data/splitter') 21 | 22 | print("Writing to data/splitter/{}".format(filename)) 23 | table.to_csv('data/splitter/' + filename, index=False) 24 | 25 | 26 | def main(): 27 | print("Loading data...") 28 | train = pd.read_csv("data/raw/store47-2016.csv") 29 | 30 | train['date'] = pd.to_datetime(train['date'], format="%Y-%m-%d") 31 | 32 | latest_date = train['date'].max() 33 | 34 | begin_of_validation, end_of_validation = get_validation_period(latest_date, days_back=57) 35 | 36 | print("Splitting data between {} and {}".format(begin_of_validation, end_of_validation)) 37 | train_train, train_validation = split_validation_train_by_validation_period(train, begin_of_validation, 38 | end_of_validation) 39 | write_data(train_train, 'train.csv') 40 | 41 | write_data(train_validation, 'validation.csv') 42 | 43 | print("Finished splitting") 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /src/tracking.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import os 3 | 4 | MLFLOW_TRACKING_URL = os.getenv('MLFLOW_TRACKING_URL') 5 | TENANT = os.getenv('TENANT','local') 6 | RUN_LABEL = os.getenv('GO_PIPELINE_LABEL', '0') 7 | USE_MLFLOW = MLFLOW_TRACKING_URL is not None 8 | 9 | class track: 10 | def __enter__(self): 11 | if USE_MLFLOW: 12 | mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URL) 13 | mlflow.set_experiment(TENANT) 14 | mlflow.start_run(run_name=RUN_LABEL) 15 | return self 16 | 17 | def __exit__(self, type, value, traceback): 18 | if USE_MLFLOW: 19 | mlflow.end_run() 20 | 21 | def set_model(self, model): 22 | if USE_MLFLOW: 23 | mlflow.log_param('model', model.name) 24 | 25 | def log_params(self, params): 26 | if USE_MLFLOW: 27 | for param in params: 28 | mlflow.log_param(param, params[param]) 29 | 30 | def log_metrics(self, metrics): 31 | if USE_MLFLOW: 32 | for metric in metrics: 33 | mlflow.log_metric(metric, metrics[metric]) 34 | -------------------------------------------------------------------------------- /src/webapp/static/index.js: -------------------------------------------------------------------------------- 1 | $( document ).ready(function() { 2 | $('#date').datepicker({ 3 | dateFormat: "yy-mm-dd" 4 | }); 5 | 6 | $('button[type="submit"]').click(function() { 7 | var data = []; 8 | var valid = true; 9 | $( 'input[type="text"], select' ).each(function() { 10 | if (this.id === 'date' && !/[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])/.test($(this).val())) { 11 | alert("Invalid date format"); 12 | valid = false; 13 | return false; 14 | } else if (!$(this).val()) { 15 | alert("Need to provide a value for " + $(this).prev().text()); 16 | valid = false; 17 | return false; 18 | } else { 19 | data.push(this.id + "=" + $(this).val()); 20 | } 21 | }); 22 | 23 | if (valid) { 24 | var dataStr = '?' + data.join('&'); 25 | var prefix = (window.location.pathname == "/" ? 
"" : window.location.pathname) 26 | $.ajax(prefix + "/prediction" + dataStr, { 27 | beforeSend: function() { 28 | $('#prediction').text('loading...') 29 | $('button[type="submit"]').attr("disabled", "true") 30 | } 31 | }) 32 | .done(function(result) { 33 | $('#prediction').text(result); 34 | $('button[type="submit"]').removeAttr("disabled") 35 | }) 36 | .fail(function() { 37 | alert("Request failed.") 38 | $('button[type="submit"]').removeAttr("disabled") 39 | }); 40 | } 41 | }) 42 | }); 43 | -------------------------------------------------------------------------------- /src/webapp/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Sales Forecasting 9 | 10 | 11 |
[lines 12-40 of the template: HTML markup stripped during extraction; the body renders the "Sales forecast" heading, a date text field, an item_nbr dropdown, a Submit button, and a "Prediction:" output element wired up by src/webapp/static/index.js]
-------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | cd C:\app\continuous-intelligence\ && python src/app.py 2 | -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd /app/continuous-intelligence && python src/app.py 4 | -------------------------------------------------------------------------------- /test/app_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | def test_endpoint(): 4 | query_params = '?date=2017-06-14&item_nbr=99197' 5 | resp = requests.get('http://localhost:5005/prediction' + query_params) 6 | 7 | assert resp.status_code == 200 8 | -------------------------------------------------------------------------------- /test/evaluation_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | from pytest import approx 5 | sys.path.append(os.path.join('..', 'src')) 6 | sys.path.append(os.path.join('src')) 7 | import evaluation 8 | 9 | 10 | def test_calculates_nwrmsle_for_perfect_match(): 11 | estimate = np.array([1, 2, 3]) 12 | actual = np.array([1, 2, 3]) 13 | weights = np.array([1, 1, 1]) 14 | calculated_nwrmsle = evaluation.nwrmsle(estimate, actual, weights) 15 | 16 | assert calculated_nwrmsle == 0.0 17 | 18 | 19 | def test_calculates_nwrmsle_for_imperfect_match(): 20 | estimate = np.array([0, 0, 0]) 21 | actual = np.array([1, 1, 1]) 22 | weights = np.array([1, 1, 1]) 23 | calculated_nwrmsle = evaluation.nwrmsle(estimate, actual, weights) 24 | 25 | # Assert by-hand calculation of nwrmsle (ln 2) is reasonably close to python calculation 26 | assert calculated_nwrmsle == approx(0.69314718) 27 | -------------------------------------------------------------------------------- /test/splitter_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pandas as pd 4 | sys.path.append(os.path.join('..', 'src')) 5 | sys.path.append(os.path.join('src')) 6 | import splitter 7 | 8 | def test_get_validation_period(): 9 | latest_date = pd.to_datetime('2017-11-22') 10 | actual_begin_date, actual_end_date = splitter.get_validation_period(latest_date) 11 | expected_begin_date = pd.to_datetime('2017-11-01') 12 | expected_end_date = pd.to_datetime('2017-11-16') 13 | assert actual_begin_date == expected_begin_date 14 | assert actual_end_date == expected_end_date 15 | 16 | def test_split_validation_train_by_validation_period(): 17 | date1 = pd.to_datetime('2017-11-12') 18 | date2 = pd.to_datetime('2017-11-25') 19 | date3 = pd.to_datetime('2017-11-30') 20 | date4 = pd.to_datetime('2017-12-01') 21 | validation_begin_date = pd.to_datetime('2017-11-15') 22 | validation_end_date = pd.to_datetime('2017-11-30') 23 | d = {'date': [date1, date2, date3, date4], 'col2': [3, 4, 5, 6]} 24 | df = pd.DataFrame(data=d) 25 | df_train, df_validation = splitter.split_validation_train_by_validation_period(df, validation_begin_date, validation_end_date) 26 | assert df_train.shape[0] == 1 27 | assert df_validation.shape[0] == 2 28 | -------------------------------------------------------------------------------- /test/test.py:
-------------------------------------------------------------------------------- 1 | import unittest 2 | import json 3 | 4 | class TestAccuracy(unittest.TestCase): 5 | METRICS_FILE = "results/metrics.json" 6 | 7 | def test_80percent_error_score(self): 8 | with open(self.METRICS_FILE, 'r') as file: 9 | metrics = json.load(file) 10 | self.assertLessEqual(metrics['nwrmsle'], 0.80) 11 | self.assertGreater(metrics['r2_score'], 0.0) 12 | 13 | 14 | if __name__ == "__main__": 15 | unittest.main() 16 | -------------------------------------------------------------------------------- /undeploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -xe 3 | 4 | TENANT_NAMESPACE=${TENANT:-admin} 5 | cat kubernetes/web.yml | sed "s/\\\$tenant\\\$/$TENANT_NAMESPACE/" | kubectl delete -f - 6 | --------------------------------------------------------------------------------
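
Appendix: the exploratory notebook above ends by suggesting `is_two_weeks_before_christmas` and `is_two_weeks_after_christmas` columns but leaves its final code cells empty. The snippet below is a minimal, untested sketch of one way to derive those flags with pandas; it assumes a DataFrame with a DatetimeIndex (as built in the notebook via `data.index = pd.to_datetime(data.date)`), and the helper name `add_christmas_flags` plus the 14-day cut-off are illustrative assumptions, not part of the repository code.

```python
import pandas as pd


def add_christmas_flags(df):
    """Flag rows that fall within two weeks before or after Christmas.

    Assumes df has a DatetimeIndex; column names follow the notebook's
    suggestion. Christmas day itself is left unflagged.
    """
    dates = df.index
    # Christmas of the row's own year and of the previous year (so early
    # January is compared against the Christmas that just passed).
    christmas_this_year = pd.to_datetime(["%d-12-25" % y for y in dates.year])
    christmas_last_year = pd.to_datetime(["%d-12-25" % (y - 1) for y in dates.year])

    days_until = (christmas_this_year - dates).days        # positive while Christmas is ahead
    days_since_this = (dates - christmas_this_year).days   # positive in late December
    days_since_last = (dates - christmas_last_year).days   # small and positive in early January

    out = df.copy()
    out['is_two_weeks_before_christmas'] = (days_until > 0) & (days_until <= 14)
    out['is_two_weeks_after_christmas'] = (
        ((days_since_this > 0) & (days_since_this <= 14))
        | ((days_since_last > 0) & (days_since_last <= 14))
    )
    return out


# Example usage inside the notebook, after the datetime index has been set:
# data = add_christmas_flags(data)
```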