├── .dvc ├── .gitignore └── config ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── Dockerfile_cd4ml_setup ├── LICENSE ├── README.md ├── Strata-london-2019-slides.pdf ├── data └── README.md ├── deploy.sh ├── instructions ├── 1-setup.md ├── 2-deployment-pipeline.md ├── 3-machine-learning-pipeline.md ├── 4-tracking-experiments.md ├── 5-model-monitoring.md └── images │ ├── 1-open-terminal.png │ ├── 1-sample-app.png │ ├── 3-app-pipeline.png │ ├── 3-ml-pipeline.png │ ├── 4-mlflow-setup.png │ ├── 4-mlflow.png │ ├── 5-fluentd-setup.png │ ├── 5-kibana.png │ └── gear.png ├── jupyter_notebooks ├── Exploratory_Analysis.ipynb ├── Feature_Engineering.ipynb ├── Negative_Sales.ipynb └── README.md ├── kubernetes └── web.yml ├── requirements.txt ├── results └── metrics.json ├── run_decisiontree_pipeline.sh ├── setup-git.sh ├── src ├── __init__.py ├── app.py ├── decision_tree.py ├── download_data.py ├── evaluation.py ├── splitter.py ├── tracking.py └── webapp │ ├── static │ └── index.js │ └── templates │ └── index.html ├── start.bat ├── start.sh ├── test ├── app_test.py ├── evaluation_test.py ├── splitter_test.py └── test.py └── undeploy.sh /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /state 2 | /lock 3 | /config.local 4 | /updater 5 | /updater.lock 6 | /state-journal 7 | /state-wal 8 | /cache 9 | /tmp 10 | -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- 1 | [core] 2 | remote = default 3 | ['remote "default"'] 4 | url = gs://continuous-intelligence 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/** 2 | data/** 3 | data/*.csv 4 | test/__pycache__/ 5 | *.pyc 6 | **/.cache/* 7 | .idea 8 | .vscode 9 | .ipynb_checkpoints 10 | **/env/** 11 | .pytest_cache 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. 4 | 5 | We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality. 6 | 7 | Examples of unacceptable behavior by participants include: 8 | 9 | * The use of sexualized language or imagery 10 | * Personal attacks 11 | * Trolling or insulting/derogatory comments 12 | * Public or private harassment 13 | * Publishing other's private information, such as physical or electronic addresses, without explicit permission 14 | * Other unethical or unprofessional conduct 15 | 16 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. 
By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team. 17 | 18 | This code of conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. 19 | 20 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers. 21 | 22 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.2.0, available at https://www.contributor-covenant.org/version/1/2/0/code-of-conduct.html 23 | 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM eu.gcr.io/continuous-intelligence/cd4ml-workshop:latest 2 | 3 | USER root 4 | 5 | RUN mkdir -p /app/continuous-intelligence/{src,data} 6 | 7 | COPY start.sh /app/continuous-intelligence 8 | COPY src /app/continuous-intelligence/src 9 | COPY data/decision_tree /app/continuous-intelligence/data/decision_tree 10 | 11 | RUN chmod +x /app/continuous-intelligence/start.sh 12 | 13 | EXPOSE 5005 14 | 15 | CMD ["/app/continuous-intelligence/start.sh"] 16 | -------------------------------------------------------------------------------- /Dockerfile_cd4ml_setup: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3:4.7.12-alpine 2 | 3 | USER root 4 | 5 | # Always use the local requirements.txt to override the cloned one 6 | COPY requirements.txt /requirements.txt 7 | 8 | ENV PATH=$PATH:/opt/conda/bin 9 | 10 | RUN mkdir -p /app/continuous-intelligence \ 11 | && apk --no-cache add git nano bash \ 12 | && git clone https://github.com/ThoughtWorksInc/continuous-intelligence-workshop.git /app/continuous-intelligence \ 13 | && mv /requirements.txt /app/continuous-intelligence/requirements.txt \ 14 | && cd /app/continuous-intelligence \ 15 | && mkdir -p /app/continuous-intelligence/data/raw \ 16 | && pip install --no-cache-dir --no-compile -r requirements.txt \ 17 | && conda list && conda clean -tipy \ 18 | && python /app/continuous-intelligence/src/download_data.py \ 19 | && python /app/continuous-intelligence/src/download_data.py --model 20 | 21 | CMD ["/app/continuous-intelligence/start.sh"] 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018-2019 ThoughtWorks Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Continuous Intelligence and CD4ML Workshop
2 | 
3 | *NOTE: We are archiving this repository, as it has not been maintained or updated recently.
4 | We will keep it read-only for anyone interested in forking and evolving it independently.*
5 | 
6 | This workshop contains the sample application and machine learning code used for
7 | the Continuous Delivery for Machine Learning (CD4ML) and Continuous Intelligence
8 | workshop. This material has been developed and is continuously evolved by
9 | [ThoughtWorks](https://www.thoughtworks.com/open-source) and has been presented at
10 | conferences such as Yottabyte 2018, World AI Summit 2018, Strata London 2019,
11 | and others.
12 | 
13 | ## Prerequisites
14 | 
15 | To run this workshop, you will need:
16 | 
17 | * A valid GitHub account
18 | * A working Docker setup (if running on Windows, make sure to use Linux containers)
19 | 
20 | ## Workshop Instructions
21 | 
22 | The workshop is divided into several steps, which build on top of each other.
23 | Instructions for each exercise can be found under the
24 | [`instructions`](./instructions) folder.
25 | 
26 | *WARNING: the exercises build on top of each other, so you cannot skip ahead
27 | without completing the earlier steps.*
28 | 
29 | *WARNING 2: the workshop requires infrastructure that we only provision when
30 | needed, so you won't be able to run on your own the exercises that depend on
31 | that shared infrastructure. We are working on a setup that allows running the
32 | workshop locally, but that is still work in progress.*
33 | 
34 | ## The Machine Learning Problem
35 | 
36 | We built a simplified solution to a Kaggle problem posted by Corporación Favorita,
37 | a large Ecuador-based grocery retailer interested in improving its
38 | [Sales Forecasting](https://www.kaggle.com/c/favorita-grocery-sales-forecasting/overview)
39 | using data. For the purposes of this workshop, we have combined and simplified
40 | their data sets, as our goal is not to find the best predictions, but to
41 | demonstrate how to implement CD4ML.
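If you want a quick feel for the data before starting the exercises, the sketch below loads the simplified dataset and prints its shape and first rows. It assumes you have already fetched the raw file with `python src/download_data.py` (the same script the ML pipeline uses in exercise 3), which places it at `data/raw/store47-2016.csv`; the exact column names are an assumption based on the exploratory notebooks.

```python
import pandas as pd

# Minimal sketch: inspect the simplified Favorita sales extract.
# Assumes `python src/download_data.py` has already downloaded the file.
sales = pd.read_csv("data/raw/store47-2016.csv")

print(sales.shape)   # number of (store, item, day) records and columns
print(sales.head())  # expect columns like date, item_nbr, unit_sales, onpromotion
print(sales["unit_sales"].describe())  # unit_sales is the value the model forecasts
```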
42 | 43 | ## Collaborators 44 | 45 | The material, ideas, and content developed for this workshop were contributions 46 | from (in alphabetical order): 47 | 48 | * [Arif Wider](https://github.com/arifwider) 49 | * [Arun Manivannan](https://github.com/arunma) 50 | * [Christoph Windheuser](https://github.com/ciwin) 51 | * [Danilo Sato](https://github.com/dtsato) 52 | * [Danni Yu](https://github.com/danniyu) 53 | * [David Tan](https://github.com/davified) 54 | * [Emily Grasmeder](https://github.com/emilyagras) 55 | * [Emily Gorcenski](https://github.com/Gorcenski) 56 | * [Jin Yang](https://github.com/yytina) 57 | * [Jonathan Heng](https://github.com/jonheng) 58 | -------------------------------------------------------------------------------- /Strata-london-2019-slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/Strata-london-2019-slides.pdf -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This is where data files will be downloaded to on your local machines 2 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -xe 3 | 4 | IMAGE_VERSION=${GO_PIPELINE_LABEL:-latest} 5 | PROJECT_ID=${GCLOUD_PROJECT_ID:-continuous-intelligence} 6 | TENANT_NAMESPACE=${TENANT:-admin} 7 | FLUENTD_HOST=${FLUENTD_HOST:-""} 8 | FLUENTD_PORT=${FLUENTD_PORT:-""} 9 | echo "Deploying image version: $IMAGE_VERSION" 10 | 11 | cat kubernetes/web.yml \ 12 | | sed "s/\\\$tenant\\\$/$TENANT_NAMESPACE/" \ 13 | | sed "s/\\\$fluentd_host\\\$/$FLUENTD_HOST/" \ 14 | | sed "s/\\\$fluentd_port\\\$/$FLUENTD_PORT/" \ 15 | | sed "s/\(image: \).*$/\1eu.gcr.io\/$PROJECT_ID\/ci-workshop-app:$TENANT_NAMESPACE.$IMAGE_VERSION/" \ 16 | | kubectl apply -f - 17 | 18 | echo "Access your application at: http://$TENANT_NAMESPACE.app.cd4ml.net" 19 | -------------------------------------------------------------------------------- /instructions/1-setup.md: -------------------------------------------------------------------------------- 1 | # Exercise 1: Development Environment Setup 2 | 3 | ## Goals 4 | 5 | * Fork your copy of the repository in Github 6 | * Login to your development environment in Jupyterlab 7 | * Configure Git 8 | 9 | ## Step by Step instructions 10 | 11 | We have provisioned all the infrastructure required for the workshop. Each 12 | participant is assigned a numeric ID (from 1 to 100), which will be used 13 | throughout the workshop. 14 | 15 | 1. Visit the main repository at https://github.com/ThoughtWorksInc/cd4ml-workshop 16 | and **fork it** to your personal GitHub account. **Don't clone the main 17 | repository**. 18 | 19 | 2. Create a personal access token in GitHub: 20 | 21 | * Log in to Github 22 | * Open your [Personal Access Tokens](https://github.com/settings/tokens) 23 | settings (*Profile → Settings → Developer Settings → Personal 24 | Access Tokens*) 25 | * Click "Generate new token", choose a name and give it **repo** rights 26 | * Copy the access token value 27 | 28 | 3. Go to Jupyter Lab at https://jupyterhub.cd4ml.net and login with the username 29 | and password provided. 30 | 31 | 4. 
Open a terminal by clicking on the icon: 32 | 33 | ![Open terminal](./images/1-open-terminal.png) 34 | 35 | 5. Setup Git by running the following commands and answering the questions with 36 | your details: 37 | 38 | ```bash 39 | cd cd4ml-workshop 40 | ./setup-git.sh 41 | ``` 42 | 43 | 6. To test and see the application running in production, open a browser tab, go 44 | to http://userX.app.cd4ml.net (replace `X` with your user ID), and you should 45 | see the application like: 46 | 47 | ![Sample application](./images/1-sample-app.png) 48 | 49 | 7. Done! Go to [the next exercise](./2-deployment-pipeline.md) 50 | -------------------------------------------------------------------------------- /instructions/2-deployment-pipeline.md: -------------------------------------------------------------------------------- 1 | # Exercise 2: Configure your Deployment Pipeline 2 | 3 | ## Goals 4 | 5 | * Learn about [GoCD](https://www.gocd.org/) 6 | * Configure a [Deployment Pipeline](https://martinfowler.com/bliki/DeploymentPipeline.html) 7 | to build and deploy your application to production 8 | * Test the application running in production 9 | 10 | ## Step by Step instructions 11 | 12 | 1. Go to GoCD at http://gocd.cd4ml.net and login with the username and password 13 | provided. 14 | 15 | 2. Click on the little gear symbol (![gear](./images/gear.png)) next to 16 | `ci-workshop-app-X` to edit your deployment pipeline configuration. 17 | 18 | 3. Go to the *"Materials"* tab and edit the existing GitHub URL so that it 19 | points to your forked repository URL - probably just replacing `ThoughtWorksInc` 20 | with your GitHub username. 21 | 22 | 4. Save and go back to the [Dashboard](http://gocd.cd4ml.net) page 23 | 24 | 5. Make a small change to your forked code, e.g., change the 25 | `src/webapp/templates/index.html` file, then add, commit, and push your changes 26 | to see your project being built and deployed in GoCD: 27 | ```bash 28 | git add . 29 | git commit -m"Sample change" 30 | git push 31 | ``` 32 | 33 | 6. Once the pipeline succeeds, you can access your application's URL at 34 | http://userX.app.cd4ml.net (replace `X` with your user ID) 35 | 36 | 7. Done! Go to [the next exercise](./3-machine-learning-pipeline.md) 37 | -------------------------------------------------------------------------------- /instructions/3-machine-learning-pipeline.md: -------------------------------------------------------------------------------- 1 | # Exercise 3: Create your Machine Learning Pipeline 2 | 3 | ## Goals 4 | 5 | * Learn about [DVC](https://dvc.org/) 6 | * Configure a Machine Learning Pipeline with DVC to fetch raw data and train a 7 | ML model 8 | * Create a pipeline in GoCD to automate your ML training pipeline 9 | * Add automated tests to evaluate and govern your ML models 10 | * Combine both GoCD pipelines to promote and deploy the new model to production 11 | 12 | ## Step by Step instructions 13 | 14 | 1. Configure DVC to use your GCP bucket for remote storage **(replace `X` with 15 | your user ID)**: 16 | ```bash 17 | dvc remote modify default url gs://cd4ml-continuous-intelligence-bucket-X 18 | ``` 19 | 20 | 2. 
Create your Machine Learning pipeline with dvc: 21 | ```bash 22 | dvc run -f input.dvc -d src/download_data.py -o data/raw/store47-2016.csv python src/download_data.py 23 | dvc run -f split.dvc -d data/raw/store47-2016.csv -d src/splitter.py -o data/splitter/train.csv -o data/splitter/validation.csv python src/splitter.py 24 | dvc run -d data/splitter/train.csv -d data/splitter/validation.csv -d src/decision_tree.py -o data/decision_tree/model.pkl -M results/metrics.json python src/decision_tree.py 25 | ``` 26 | 27 | 3. Add, commit, and push your changes: 28 | ```bash 29 | git add . 30 | git commit -m "Creating ML pipeline" 31 | git push 32 | ``` 33 | 34 | 4. Create machine learning training pipeline in GoCD: 35 | 36 | * Go to GoCD's [*"Admin" > "Pipelines"*](http://gocd.cd4ml.net/go/admin/pipelines) 37 | menu and create a new pipeline. 38 | 39 | * Configure your Github repository URL (e.g. `https://github.com//cd4ml-workshop.git`) as a Git material. 40 | 41 | * Give it a name related to your username, e.g. `ml-pipeline-X`, replacing `X` 42 | with your user ID). 43 | 44 | * Click on *"Advanced Settings"* for "Part 2", enable the "Use Template" 45 | toggle and select the existing `ml-pipeline-gcp-template` template. 46 | 47 | ![GoCD Configuration for ML pipeline](./images/3-ml-pipeline.png) 48 | 49 | * Click *"Save + Run This Pipeline"* 50 | 51 | 5. Combine both pipelines: 52 | 53 | * Go back to edit your original `ci-workshop-app-X` pipeline again. 54 | 55 | * In the *"Materials"* tab add your new pipeline as a new material 56 | (double-click to get the correct auto suggestion). 57 | 58 | * Expand the *"build-and-publish"* stage, and click on the *"build"* job. 59 | 60 | * Update the second build task to pull the latest model using DVC instead of downloading a static version from Google Storage, by replacing the 61 | `python src/download_data.py --model` command with 62 | `GOOGLE_APPLICATION_CREDENTIALS=./secret.json dvc pull model.pkl.dvc` 63 | 64 | ![Configuration for application pipeline](./images/3-app-pipeline.png) 65 | 66 | * Save and go back to the main [Dashboard](http://gocd.cd4ml.net) page 67 | 68 | 6. Wait for the machine learning pipeline to train and publish your model. 69 | 70 | **WARNING: The pipeline should fail because the model training accuracy is not 71 | good enough!** 72 | 73 | ### Improving our Model 74 | 75 | 7. In your code, change the model training approach to use a Random Forest 76 | algorithm, by editing the `src/decision_tree.py` file and replacing the `Model.DECISION_TREE` with `Model.RANDOM_FOREST` on the last line of the file. 77 | 78 | 8. Re-run your dvc pipeline locally: 79 | ```bash 80 | dvc repro model.pkl.dvc 81 | ``` 82 | 83 | 9. Add, commit, and push your changes, and watch your pipeline execute and go 84 | green: 85 | ```bash 86 | git add . 87 | git commit -m "Improving model algorithm" 88 | git push 89 | ``` 90 | 91 | 10. Once the machine learning pipeline succeeds, it will trigger a new 92 | application deployment pipeline, which will pull the new improved model and 93 | deploy it to production. Visit your application again to verify that you get 94 | better predictions! 95 | 96 | 11. Done! 
Go to [the next exercise](./4-tracking-experiments.md) 97 | -------------------------------------------------------------------------------- /instructions/4-tracking-experiments.md: -------------------------------------------------------------------------------- 1 | # Exercise 4: Tracking Experiments with MLflow 2 | 3 | ## Goals 4 | 5 | * Learn about [MLflow](https://mlflow.org/) 6 | * Configure our Machine Learning training Pipeline to track experiments and 7 | results 8 | 9 | ## Step by Step instructions 10 | 11 | 1. In [GoCD](http://gocd.cd4ml.net), click on the little gear symbol 12 | (![gear](./images/gear.png)) next to `ml-pipeline-X` to edit your machine 13 | learning pipeline configuration. 14 | 15 | 2. Open the *"Environment Variables"* tab and configure the URL to the MLFlow 16 | tracking server and your tenant: 17 | 18 | * `MLFLOW_TRACKING_URL = http://mlflow.cd4ml.net` 19 | * `TENANT = userX` (replace `X` with your user ID) 20 | 21 | ![Configure ML pipeline to track with MLflow](./images/4-mlflow-setup.png) 22 | 23 | 3. Save and return to the [Dashboard](http://gocd.cd4ml.net) page. 24 | 25 | 4. Trigger a new ML training pipeline and wait for it to succeed. 26 | 27 | 5. Visit the MLflow URL http://mlflow.cd4ml.net to find your experiment results. 28 | 29 | ![Tracking experiment runs with MLflow](./images/4-mlflow.png) 30 | 31 | 6. Done! Go to [the next exercise](./5-model-monitoring.md) 32 | -------------------------------------------------------------------------------- /instructions/5-model-monitoring.md: -------------------------------------------------------------------------------- 1 | # Exercise 5: Model Monitoring and Observability 2 | 3 | ## Goals 4 | 5 | * Learn about EFK Stack ([Elasticsearch](https://www.elastic.co/products/elasticsearch), 6 | [FluentD](https://www.fluentd.org/), and [Kibana](https://www.elastic.co/products/kibana)) 7 | * Configure and deploy our application to log prediction events to Elastic Search 8 | * Visualize events on Kibana dashboard 9 | * Learn how to close the data feedback loop 10 | 11 | ## Introduction 12 | 13 | To close the data feedback loop, we can log events in production to collect data 14 | about how our model is performing against real data. This data can later be 15 | curated and labeled to improve the dataset used during training. This allows us 16 | to continuously improve our models in production! 17 | 18 | In this workshop, we use the EFK stack for our monitoring and observability 19 | infrastructure. It is composed of three main components: 20 | 21 | * [Elasticsearch](https://www.elastic.co/products/elasticsearch): an open 22 | source search engine. 23 | * [FluentD](https://www.fluentd.org/): an open source data collector for unified 24 | logging layer. 25 | * [Kibana](https://www.elastic.co/products/kibana): an open source web UI that 26 | makes it easy to explore and visualize the data indexed by Elasticsearch. 27 | 28 | ## Step by Step instructions 29 | 30 | 1. In [GoCD](http://gocd.cd4ml.net), click on the little gear symbol 31 | (![gear](./images/gear.png)) next to `ci-workshop-app-X` to edit your 32 | deployment pipeline configuration. 33 | 34 | 2. Open the *"Environment Variables"* tab and configure the FluentD host and 35 | port: 36 | 37 | * `FLUENTD_HOST = elastic-stack-fluentd-elasticsearch.elk.svc.cluster.local` 38 | * `FLUENTD_PORT = 24224` 39 | 40 | ![Configure deployment pipeline to log to FluentD](./images/5-fluentd-setup.png) 41 | 42 | 3. Save and return to the [Dashboard](http://gocd.cd4ml.net) page. 
43 | 44 | 4. Trigger a new application deployment pipeline and wait for it to succeed. 45 | 46 | 5. Visit your application in production to make a few predictions. 47 | 48 | 6. Visit the Kibana URL http://kibana.cd4ml.net and click on the *"Discover"* 49 | menu. 50 | 51 | 7. In the search field, find the entries tagged with your user, with a query 52 | string `tag:"userX.prediction"` (substitute `X` with your user ID). 53 | 54 | 8. Click *"Refresh"* and you should see your predictions logged! 55 | 56 | ![Prediction events in Kibana](./images/5-kibana.png) 57 | 58 | 9. Done! 59 | 60 | **NOTE: after the end of the workshop, we delete all the infrastructure and GoCD pipelines for security and cost reasons.** 61 | 62 | You don’t need to use the same tools we chose to implement CD4ML. Get in touch 63 | with us if you want to learn how to run this workshop with your teams in your 64 | company! 65 | -------------------------------------------------------------------------------- /instructions/images/1-open-terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/1-open-terminal.png -------------------------------------------------------------------------------- /instructions/images/1-sample-app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/1-sample-app.png -------------------------------------------------------------------------------- /instructions/images/3-app-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/3-app-pipeline.png -------------------------------------------------------------------------------- /instructions/images/3-ml-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/3-ml-pipeline.png -------------------------------------------------------------------------------- /instructions/images/4-mlflow-setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/4-mlflow-setup.png -------------------------------------------------------------------------------- /instructions/images/4-mlflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/4-mlflow.png -------------------------------------------------------------------------------- /instructions/images/5-fluentd-setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/5-fluentd-setup.png -------------------------------------------------------------------------------- /instructions/images/5-kibana.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/5-kibana.png -------------------------------------------------------------------------------- /instructions/images/gear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/instructions/images/gear.png -------------------------------------------------------------------------------- /jupyter_notebooks/Exploratory_Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Exploratory Analysis using Jupyter Notebook\n", 8 | "For further reading, we recommend: \n", 9 | "- [the pandas documentation](http://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html#getting) for information about using DataFrames\n", 10 | "- [this blog post](https://towardsdatascience.com/introduction-to-data-visualization-in-python-89a54c97fbed) for a jumpstart into visualizations\n", 11 | "- [the matplotlib documentation](https://matplotlib.org/users/pyplot_tutorial.html) for more info about visualizations" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "#### Loading data from our GPC bucket" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import s3fs\n", 37 | "s3 = s3fs.S3FileSystem(anon=True)\n", 38 | "s3.ls('twde-datalab/raw')\n", 39 | "\n", 40 | "s3.get('twde-datalab/raw/quito_stores_sample2016-2017.csv', \n", 41 | " '../data/quito_stores_sample2016-2017.csv')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "train = pd.read_csv('../data/quito_stores_sample2016-2017.csv')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | "
iddatestore_nbritem_nbrunit_salesonpromotioncitystatecluster
0882114712016-08-16441035207.0TrueQuitoPichincha5
1882114722016-08-16441036657.0FalseQuitoPichincha5
2882114732016-08-164410557413.0FalseQuitoPichincha5
3882114742016-08-164410557518.0FalseQuitoPichincha5
4882114752016-08-16441055778.0FalseQuitoPichincha5
\n", 153 | "
" 154 | ], 155 | "text/plain": [ 156 | " id date store_nbr item_nbr unit_sales onpromotion city \\\n", 157 | "0 88211471 2016-08-16 44 103520 7.0 True Quito \n", 158 | "1 88211472 2016-08-16 44 103665 7.0 False Quito \n", 159 | "2 88211473 2016-08-16 44 105574 13.0 False Quito \n", 160 | "3 88211474 2016-08-16 44 105575 18.0 False Quito \n", 161 | "4 88211475 2016-08-16 44 105577 8.0 False Quito \n", 162 | "\n", 163 | " state cluster \n", 164 | "0 Pichincha 5 \n", 165 | "1 Pichincha 5 \n", 166 | "2 Pichincha 5 \n", 167 | "3 Pichincha 5 \n", 168 | "4 Pichincha 5 " 169 | ] 170 | }, 171 | "execution_count": 4, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "train.head()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "#### With just this glimpse, you can start to fill out your list of assumptions, hypotheses, and questions. Some of mine are:\n", 185 | "- Question: What is the span of dates we are provided?\n", 186 | "- Question: How many distinct store_nbr values are there?\n", 187 | "- Question: How many distinct item_nbr values are there?\n", 188 | "- Hypothesis: unit_sales are always positive\n", 189 | "- Hypothesis: onpromotion is always either True or False\n", 190 | "- Hypothesis: city and state are always going to be Quito and Pichincha\n", 191 | "- Hypothesis: cluster is always 5\n", 192 | "- Question: What does cluster mean and is it important to know?\n", 193 | "- Question: How many records does the data contain?\n", 194 | "- Question: What other data files are available?" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "### Here's some examples of how to address those first questions" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# Access an entire dataframe column like you would\n", 211 | "# the value in a python dictionary:\n", 212 | "# (The returned object has similar pandas built-in \n", 213 | "# functions, like 'head' and 'max')\n", 214 | "print(data['date'].min())\n", 215 | "print(data['date'].max())" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "# Dataframe columns also have a 'unique' method,\n", 225 | "# which can answer several of our questions from above\n", 226 | "data['store_nbr'].unique()" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "print(data['item_nbr'].unique())\n", 236 | "print(\"There are too many item numbers to display, so let's just count them for now:\")\n", 237 | "print(\"\\n{} different item_nbr values in our data\"\n", 238 | " .format(len(data['item_nbr'].unique())))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "#### It might be helpful to know the 'shape' of our data. We could count by hand (for now) the columns, but how many rows do we have altogether?" 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "print(data.shape)\n", 255 | "print(\"There are {} rows and {} columns in our data\".format(data.shape[0], data.shape[1]))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "#### Moving along to answer our intial questions... Let's have a look at unit_sales. Keep in mind that unit sales is the variable we want to predict with our science.\n", 263 | "\n", 264 | "Each row in our data is essentially telling us a `unit_sales` number for a given `item_nbr` at a given `store_nbr` on a given `date`. That is, \"how many of an item was sold at a store on a day\"." 265 | ] 266 | } 267 | ], 268 | "metadata": { 269 | "kernelspec": { 270 | "display_name": "continuous-intelligence-workshop", 271 | "language": "python", 272 | "name": "continuous-intelligence-workshop" 273 | }, 274 | "language_info": { 275 | "codemirror_mode": { 276 | "name": "ipython", 277 | "version": 3 278 | }, 279 | "file_extension": ".py", 280 | "mimetype": "text/x-python", 281 | "name": "python", 282 | "nbconvert_exporter": "python", 283 | "pygments_lexer": "ipython3", 284 | "version": "3.7.3" 285 | } 286 | }, 287 | "nbformat": 4, 288 | "nbformat_minor": 2 289 | } 290 | -------------------------------------------------------------------------------- /jupyter_notebooks/Feature_Engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Engineering\n", 8 | "\n", 9 | "Feature engineering is an answer to the question, \"How can I make the most of the data I have?\"\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Let's get started, then. How does one do feature engineering?\n", 17 | "\n", 18 | "I'll assume you're familiar with pandas and the decision tree pipeline that we're using for this project. That's the algorithm we're going to engineer the data for; not all algorithms will want the data engineered the same way, though often the benefits will work for many algorithms." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# load the data output by src/merger.py\n", 37 | "original_data = pd.read_csv('./merger/bigTable.csv')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Index(['id', 'date', 'store_nbr', 'item_nbr', 'unit_sales', 'onpromotion',\n", 50 | " 'city', 'state', 'cluster', 'family', 'class', 'perishable',\n", 51 | " 'transactions', 'year', 'month', 'day', 'dayofweek',\n", 52 | " 'days_til_end_of_data', 'cpi', 'dayoff', 'percent_in_transactions',\n", 53 | " 'item_store_sales_variance'],\n", 54 | " dtype='object')\n", 55 | "(5877318, 22)\n" 56 | ] 57 | }, 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | "
iddatestore_nbritem_nbrunit_salesonpromotioncitystateclusterfamily...transactionsyearmonthdaydayofweekdays_til_end_of_datacpidayoffpercent_in_transactionsitem_store_sales_variance
0882114712016-08-16441035207.0TrueQuitoPichincha5GROCERY I...394120168161364105.123322False0.00177639.466659
1883063562016-08-17441035202.0FalseQuitoPichincha5GROCERY I...425620168172363105.123322False0.00047039.466659
2883990032016-08-18441035204.0FalseQuitoPichincha5GROCERY I...377620168183362105.123322False0.00105939.466659
3884923682016-08-19441035206.0FalseQuitoPichincha5GROCERY I...418520168194361105.123322False0.00143439.466659
4885916262016-08-204410352013.0TrueQuitoPichincha5GROCERY I...483020168205360105.123322True0.00269239.466659
\n", 225 | "

5 rows × 22 columns

\n", 226 | "
" 227 | ], 228 | "text/plain": [ 229 | " id date store_nbr item_nbr unit_sales onpromotion city \\\n", 230 | "0 88211471 2016-08-16 44 103520 7.0 True Quito \n", 231 | "1 88306356 2016-08-17 44 103520 2.0 False Quito \n", 232 | "2 88399003 2016-08-18 44 103520 4.0 False Quito \n", 233 | "3 88492368 2016-08-19 44 103520 6.0 False Quito \n", 234 | "4 88591626 2016-08-20 44 103520 13.0 True Quito \n", 235 | "\n", 236 | " state cluster family ... transactions \\\n", 237 | "0 Pichincha 5 GROCERY I ... 3941 \n", 238 | "1 Pichincha 5 GROCERY I ... 4256 \n", 239 | "2 Pichincha 5 GROCERY I ... 3776 \n", 240 | "3 Pichincha 5 GROCERY I ... 4185 \n", 241 | "4 Pichincha 5 GROCERY I ... 4830 \n", 242 | "\n", 243 | " year month day dayofweek days_til_end_of_data cpi dayoff \\\n", 244 | "0 2016 8 16 1 364 105.123322 False \n", 245 | "1 2016 8 17 2 363 105.123322 False \n", 246 | "2 2016 8 18 3 362 105.123322 False \n", 247 | "3 2016 8 19 4 361 105.123322 False \n", 248 | "4 2016 8 20 5 360 105.123322 True \n", 249 | "\n", 250 | " percent_in_transactions item_store_sales_variance \n", 251 | "0 0.001776 39.466659 \n", 252 | "1 0.000470 39.466659 \n", 253 | "2 0.001059 39.466659 \n", 254 | "3 0.001434 39.466659 \n", 255 | "4 0.002692 39.466659 \n", 256 | "\n", 257 | "[5 rows x 22 columns]" 258 | ] 259 | }, 260 | "execution_count": 4, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "print(original_data.columns)\n", 267 | "print(original_data.shape)\n", 268 | "original_data.head()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 14, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "import sys, os\n", 278 | "sys.path.append(os.path.join('src'))\n", 279 | "from src import splitter" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 15, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "Loading data from merger output\n", 292 | "Splitting data 70:30 train:validation\n", 293 | "Writing to ./splitter/train.csv\n", 294 | "Writing to ./splitter/validation.csv\n", 295 | "Finished splitting\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "# Now we run splitter and decision_tree with our original data\n", 301 | "splitter.main()" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 16, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "from src import decision_tree" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 17, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "Loading data from splitter/train.csv\n", 323 | "Loading data from splitter/validation.csv\n", 324 | "Encoding categorical variables\n", 325 | "Joining tables for consistent encoding\n", 326 | "Creating decision tree model\n", 327 | "Making prediction on validation data\n", 328 | "Calculating estimated error\n" 329 | ] 330 | }, 331 | { 332 | "name": "stderr", 333 | "output_type": "stream", 334 | "text": [ 335 | "src/evaluation.py:12: RuntimeWarning: divide by zero encountered in log\n", 336 | " log_square_errors = (np.log(predictions + 1) - np.log(targets + 1)) ** 2\n", 337 | "src/evaluation.py:12: RuntimeWarning: invalid value encountered in log\n", 338 | " log_square_errors = (np.log(predictions + 1) - np.log(targets + 1)) ** 2\n" 339 | ] 340 | }, 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 
344 | "text": [ 345 | "Writing to ./decision_tree/model.pkl\n", 346 | "Writing to ./decision_tree/score_and_metadata.csv\n", 347 | "Done deciding with trees\n", 348 | "Decision tree analysis done with a validation score (error rate) of 0.00268005495579566.\n" 349 | ] 350 | } 351 | ], 352 | "source": [ 353 | "decision_tree.main()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 18, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "original_validation_score = 0.00268005495579566" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "So now we have a baseline for how well our decision tree performed before we added a feature.\n", 370 | "\n", 371 | "Let's see what happens if we add a `two_weeks_before_christmas` and a `two_weeks_after_christmas` column, as per our Exploratory Analysis discussion." 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 23, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# Re-read the data and use datetime objects for the date\n", 381 | "engineered_data = pd.read_csv('./merger/bigTable.csv')\n", 382 | "engineered_data.date = pd.to_datetime(engineered_data.date)\n" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 24, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "# Create a before_christmas_window\n", 392 | "start_date = pd.to_datetime('2016-12-11')\n", 393 | "end_date = pd.to_datetime('2016-12-25')\n", 394 | "before_christmas = (engineered_data['date'] > start_date) & (engineered_data['date'] <= end_date)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 25, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "# Create an after_christmas_window\n", 404 | "start_date = pd.to_datetime('2016-12-25')\n", 405 | "end_date = pd.to_datetime('2017-01-08')\n", 406 | "after_christmas = (engineered_data['date'] > start_date) & (engineered_data['date'] <= end_date)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 37, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "engineered_data['two_weeks_before_christmas'] = before_christmas\n", 416 | "engineered_data['two_weeks_after_christmas'] = after_christmas" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "#### Just as a spot check, let's look at the date of the first few records in our new columns" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 38, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "117 2016-12-12\n", 436 | "118 2016-12-13\n", 437 | "119 2016-12-14\n", 438 | "120 2016-12-15\n", 439 | "121 2016-12-16\n", 440 | "Name: date, dtype: datetime64[ns]\n", 441 | "130 2016-12-26\n", 442 | "131 2016-12-27\n", 443 | "132 2016-12-28\n", 444 | "133 2016-12-29\n", 445 | "134 2016-12-30\n", 446 | "Name: date, dtype: datetime64[ns]\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "print(engineered_data[engineered_data.two_weeks_before_christmas == True].date.head())\n", 452 | "print(engineered_data[engineered_data.two_weeks_after_christmas == True].date.head())" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "Seems okay to me. Let's see how it changes the results now." 
460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 41, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "engineered_data.to_csv('./merger/bigTable.csv', index=False)\n" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 42, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "Loading data from merger output\n", 481 | "Splitting data 70:30 train:validation\n", 482 | "Writing to ./splitter/train.csv\n", 483 | "Writing to ./splitter/validation.csv\n", 484 | "Finished splitting\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "splitter.main()\n" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 43, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "Loading data from splitter/train.csv\n", 502 | "Loading data from splitter/validation.csv\n", 503 | "Encoding categorical variables\n", 504 | "Joining tables for consistent encoding\n", 505 | "Creating decision tree model\n", 506 | "Making prediction on validation data\n", 507 | "Calculating estimated error\n" 508 | ] 509 | }, 510 | { 511 | "name": "stderr", 512 | "output_type": "stream", 513 | "text": [ 514 | "src/evaluation.py:12: RuntimeWarning: divide by zero encountered in log\n", 515 | " log_square_errors = (np.log(predictions + 1) - np.log(targets + 1)) ** 2\n", 516 | "src/evaluation.py:12: RuntimeWarning: invalid value encountered in log\n", 517 | " log_square_errors = (np.log(predictions + 1) - np.log(targets + 1)) ** 2\n" 518 | ] 519 | }, 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "Writing to ./decision_tree/model.pkl\n", 525 | "Writing to ./decision_tree/score_and_metadata.csv\n", 526 | "Done deciding with trees\n", 527 | "Decision tree analysis done with a validation score (error rate) of 0.003692818003915606.\n" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "decision_tree.main()" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 44, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "engineered_validation_score = 0.003692818003915606" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 45, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "name": "stdout", 551 | "output_type": "stream", 552 | "text": [ 553 | "-0.0010127630481199463\n" 554 | ] 555 | } 556 | ], 557 | "source": [ 558 | "print(original_validation_score - engineered_validation_score)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "So as it turns out, adding a boolean about before/after Christmas slightly hurt our performance. \n", 566 | "\n", 567 | "- Now we should iterate on the features \n", 568 | " - for example, maybe two weeks is too wide a window\n", 569 | "- or maybe it's time to question if the scoring algorithm provided to us by the kaggle competition\n", 570 | " - should we replace nwrmsle with another error measurement?" 
571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.7.3" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 2 602 | } 603 | -------------------------------------------------------------------------------- /jupyter_notebooks/Negative_Sales.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "pd.options.display.float_format = '{:,.2f}'.format # use 2 decimals, not scientific notation" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import s3fs\n", 20 | "s3 = s3fs.S3FileSystem(anon=True)\n", 21 | "s3.ls('twde-datalab/raw')\n", 22 | "\n", 23 | "#may require `mkdir data/`\n", 24 | "s3.get('twde-datalab/raw/quito_stores_sample2016-2017.csv', \n", 25 | " '../data/quito_stores_sample2016-2017.csv')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "data = pd.read_csv('../data/quito_stores_sample2016-2017.csv')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "#### Starting with `.describe()` is never a bad place to start data exploration " 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "data.unit_sales.describe()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "### I have questions about those *negative sales*. How do you negative sell something?\n", 58 | "That's got my gears turning. Here are some of my new questions about the data:\n", 59 | "- Question: What does a negative sale mean?\n", 60 | "- Question: How often are sales negative?\n", 61 | "- Question: How many times are sales above 5,000?\n", 62 | "- Question: How do the unit_sales numbers vary with the `date` column?\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "#### A good way to answer these questions is with some visualizations.\n", 70 | "\n", 71 | "It might be difficult to get an intuitive feel of the data by knowing the exact answer to many of those questions. What we actually want to learn is the personality of the data. We want to know what it looks like in a glance." 
72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Let's import the python libraries that do the heavy lifting of data visualization" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "%matplotlib inline\n", 88 | "import random\n", 89 | "import matplotlib.pyplot as plt\n", 90 | "import seaborn as sns" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "and then let's look at a box plot of unit sales. A box plot conveys the mean and the middle 50% of the data." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "sns.boxplot(data.unit_sales)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "**This box plot is awful.** We can't even recognize the inter-quartile-range. \n", 114 | "\n", 115 | "Let's make a decision: **Ignore \"very large\" values** (perhaps to be explored later)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "outliers = data[data.unit_sales > 1000]" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "majority_of_data = data[data.unit_sales <= 1000]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "and once again look at the boxplot of the non-outlier (for lack of a better term) data" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "sns.boxplot(majority_of_data.unit_sales)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "I'm surprised. This plot isn't any better than the first one. Let's try a different visualization... Maybe kernel density estimation plot. \n", 157 | "\n", 158 | "This shows us the probability of a data point being a certain value." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "sns.kdeplot(majority_of_data.unit_sales, clip=[-100,500])" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "We can see that the likelihood of getting a certain unit_sales value tapers off dramatically and has almost vanished by a unit_sales of 100." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "I'd also like to know how do sales change over time. \n", 189 | "**Is there a weekly cycle? A monthly cycle?**\n", 190 | "Let's look at that with a line graph." 
191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "#### First, we convert the date column into a datetime object, and set it as the index\n", 198 | "Then we find the weekly average of the data and plot it" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "data.index = pd.to_datetime(data.date)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "data.unit_sales.resample('W').mean().plot(x='index',y='unit_sales')" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Here we can see something that seems interesting around December-January. We also see what appears to be a couple of sales cycles throughout the year. \n", 224 | "\n", 225 | "**What do you think causes the huge drop-off in August-September?**\n", 226 | "\n", 227 | "I'm curious to see if returns happen more frequently after Christmas, so I'm going to repeat the above plot, but only focusing on returns." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "data[data.unit_sales < 0].unit_sales.resample('W').mean().plot(x='index',y='unit_sales')" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "This graph is surprising to me. Is it surprising to you? I still strongly suspect that returns happen at a statistically significantly different rate after Christmas, given that purchases spike around Dec-Jan anyway. My next thought is about those outliers. Maybe `mean` isn't the right measurement to use, since means can be skewed by outliers. \n", 244 | "\n", 245 | "Let's see the same graph as above, only this time using `median` as the measurement. " 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "data[data.unit_sales < 0].unit_sales.resample('W').median().plot(x='index',y='unit_sales')" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "There we go. Look at that. When we use a statistic that is robust to outliers, we can see that return behavior is very different around Christmas.\n", 262 | "\n", 263 | "What can we do with this knowledge? If we're to predict sales and returns for the end of December and beginning of January, our model should incorporate the effect of Christmas on sales. Perhaps it'd be useful to add columns called `is_two_weeks_before_christmas` and `is_two_weeks_after_christmas` (a pandas sketch for these flags appears at the very end of this listing).
" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "# import os\n", 280 | "# import urllib.request\n", 281 | "# import argparse\n", 282 | "\n", 283 | "# def load_data(path, key):\n", 284 | "# gcsBucket = \"continuous-intelligence\"\n", 285 | "\n", 286 | "# if not os.path.exists(path):\n", 287 | "# os.makedirs(path)\n", 288 | "\n", 289 | "# if not os.path.exists(os.path.join(path, key)):\n", 290 | "# url = \"https://storage.googleapis.com/%s/%s\" % (gcsBucket, key)\n", 291 | "# urllib.request.urlretrieve(url, os.path.join(path, key))\n", 292 | "\n", 293 | "# load_data(path='data/raw', key='store47-2016.csv')\n", 294 | "\n" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [] 303 | } 304 | ], 305 | "metadata": { 306 | "kernelspec": { 307 | "display_name": "continuous-intelligence-workshop", 308 | "language": "python", 309 | "name": "continuous-intelligence-workshop" 310 | }, 311 | "language_info": { 312 | "codemirror_mode": { 313 | "name": "ipython", 314 | "version": 3 315 | }, 316 | "file_extension": ".py", 317 | "mimetype": "text/x-python", 318 | "name": "python", 319 | "nbconvert_exporter": "python", 320 | "pygments_lexer": "ipython3", 321 | "version": "3.7.3" 322 | } 323 | }, 324 | "nbformat": 4, 325 | "nbformat_minor": 2 326 | } 327 | -------------------------------------------------------------------------------- /jupyter_notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Exploratory Analysis with Jupyter Notebook 2 | ----------- 3 | 4 | One major difference between the software development process you're used to and the process you'll use doing data science is the **scientific process** part of it. 5 | 6 | From the outset, we can't know exactly what sort of product we're going to build or how we'll use the algorithms we end up using. The data is a huge source of uncertainty and until we poke and prod it, we'll be working in the dark. 7 | 8 | ### Asking relevant questions 9 | The dataset for this project is sales data for a grocery store in Ecuador. We've reduced the dimensionality of the data so it can be quickly analyzed on most people's laptop computers, but there is still enough data to make meaningful inquiries and accurate predictions. 10 | 11 | As you approach a data problem, you should try to identify your hypotheses and your questions. You should simultaneously try to test your hypotheses and seek answers to your questions; these will likely produce a feedback loop where each new answer kindles another question. That's how science works. 12 | 13 | At this point, you might not know how many stores you're trying to predict sales for. Do you know how many items the stores sell? What is the date range of the data provided? 14 | 15 | ### Reading a Jupyter Notebook 16 | This directory contains a [basic exploratory analysis in a Jupyter Notebook](https://github.com/ThoughtWorksInc/twde-datalab/blob/master/jupyter_notebooks/Exploratory_Analysis.ipynb). GitHub can render Jupyter notebooks, so by following the link you can read the notebook. 17 | 18 | ### Working with a Jupyter Notebook 19 | The real fun obviously lies in using the notebook.
For this you have to run a Jupyter notebook server locally. If you installed Python 3 using Anaconda, then Jupyter should already be on your path. 20 | 21 | 1. `cd jupyter_notebooks` 22 | 1. `jupyter notebook` 23 | 24 | The notebook server should start up and a browser window should open on your machine, allowing you to choose a notebook from this directory. 25 | 26 | -------------------------------------------------------------------------------- /kubernetes/web.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: $tenant$ 5 | --- 6 | apiVersion: v1 7 | kind: Service 8 | metadata: 9 | name: ci-workshop-web 10 | namespace: $tenant$ 11 | labels: 12 | app: ci-workshop 13 | spec: 14 | ports: 15 | - port: 80 16 | targetPort: 5005 17 | selector: 18 | app: ci-workshop 19 | tier: frontend 20 | --- 21 | apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2 22 | kind: Deployment 23 | metadata: 24 | name: ci-workshop-web 25 | namespace: $tenant$ 26 | labels: 27 | app: ci-workshop 28 | spec: 29 | selector: 30 | matchLabels: 31 | app: ci-workshop 32 | tier: frontend 33 | strategy: 34 | type: RollingUpdate 35 | template: 36 | metadata: 37 | labels: 38 | app: ci-workshop 39 | tier: frontend 40 | spec: 41 | containers: 42 | - image: eu.gcr.io/continuous-intelligence/ci-workshop-app 43 | imagePullPolicy: IfNotPresent 44 | name: ci-workshop-web 45 | env: 46 | - name: TENANT 47 | value: $tenant$ 48 | - name: FLUENTD_HOST 49 | value: '$fluentd_host$' 50 | - name: FLUENTD_PORT 51 | value: '$fluentd_port$' 52 | ports: 53 | - containerPort: 5005 54 | name: ci-workshop-web 55 | livenessProbe: 56 | httpGet: 57 | path: / 58 | port: ci-workshop-web 59 | initialDelaySeconds: 5 60 | readinessProbe: 61 | httpGet: 62 | path: / 63 | port: ci-workshop-web 64 | initialDelaySeconds: 5 65 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | dvc[gs]==0.71.0 2 | numpy==1.17.4 3 | pandas==0.25.3 4 | pylint==2.4.4 5 | pytest==5.3.0 6 | scikit-learn==0.21.3 7 | flask==1.1.1 8 | mlflow==1.4.0 9 | lime==0.1.1.36 10 | fluent-logger==0.9.4 11 | s3fs==0.1.2 12 | seaborn==0.9.0 13 | joblib==0.14.0 14 | -------------------------------------------------------------------------------- /results/metrics.json: -------------------------------------------------------------------------------- 1 | {"nwrmsle": 0.8261571666321248, "r2_score": -0.8594341254951023} -------------------------------------------------------------------------------- /run_decisiontree_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | python3 src/download_data.py 6 | python3 src/splitter.py 7 | python3 src/decision_tree.py 8 | -------------------------------------------------------------------------------- /setup-git.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | read -p 'Name: ' NAME 4 | read -p 'E-mail: ' EMAIL 5 | read -p 'Github username: ' GITHUB_USER 6 | read -sp 'Github personal access token: ' GITHUB_TOKEN 7 | echo 8 | echo "Setting up git..." 
9 | git config --global user.name "$NAME" 10 | git config --global user.email "$EMAIL" 11 | git remote set-url origin https://$GITHUB_USER:$GITHUB_TOKEN@github.com/$GITHUB_USER/cd4ml-workshop 12 | git pull --rebase 13 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThoughtWorksInc/cd4ml-workshop/64c5ea4f89489e168a1ad09d6f46a7baffd59fef/src/__init__.py -------------------------------------------------------------------------------- /src/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, jsonify, request 2 | from datetime import datetime 3 | import joblib 4 | import pandas as pd 5 | import decision_tree 6 | import os 7 | from fluent import sender, event 8 | 9 | app = Flask(__name__, template_folder='webapp/templates', static_folder='webapp/static') 10 | 11 | products = { 12 | "99197": { 13 | "class": 1067, 14 | "family": "GROCERY I", 15 | "perishable": 0 16 | }, 17 | "105574": { 18 | "class": 1045, 19 | "family": "GROCERY I", 20 | "perishable": 0 21 | }, 22 | "1963838": { 23 | "class": 3024, 24 | "family": "CLEANING", 25 | "perishable": 0 26 | } 27 | } 28 | 29 | TENANT = os.getenv('TENANT', 'local') 30 | FLUENTD_HOST = os.getenv('FLUENTD_HOST') 31 | FLUENTD_PORT = os.getenv('FLUENTD_PORT') 32 | 33 | @app.route('/') 34 | def index(): 35 | return render_template('index.html') 36 | 37 | @app.route('/prediction') 38 | def get_prediction(): 39 | loaded_model = joblib.load('data/decision_tree/model.pkl') 40 | 41 | date_string = request.args.get('date') 42 | 43 | date = datetime.strptime(date_string, '%Y-%m-%d') 44 | 45 | product = products[request.args.get("item_nbr")] 46 | data = { 47 | "date": date_string, 48 | "item_nbr": request.args.get("item_nbr"), 49 | "family": product['family'], 50 | "class": product['class'], 51 | "perishable": product['perishable'], 52 | "transactions": 1000, 53 | "year": date.year, 54 | "month": date.month, 55 | "day": date.day, 56 | "dayofweek": date.weekday(), 57 | "days_til_end_of_data": 0, 58 | "dayoff": date.weekday() >= 5 59 | } 60 | df = pd.DataFrame(data=data, index=['row1']) 61 | 62 | df = decision_tree.encode_categorical_columns(df) 63 | pred = loaded_model.predict(df) 64 | if FLUENTD_HOST: 65 | logger = sender.FluentSender(TENANT, host=FLUENTD_HOST, port=int(FLUENTD_PORT)) 66 | log_payload = {'prediction': pred[0], **data} 67 | print('logging {}'.format(log_payload)) 68 | if not logger.emit('prediction', log_payload): 69 | print(logger.last_error) 70 | logger.clear_last_error() 71 | 72 | return "%d" % pred[0] 73 | 74 | if __name__ == '__main__': 75 | app.run(host = '0.0.0.0', port=5005) 76 | -------------------------------------------------------------------------------- /src/decision_tree.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import numpy as np 3 | import pandas as pd 4 | import sys, os, json 5 | from sklearn.preprocessing import LabelEncoder 6 | import joblib 7 | sys.path.append(os.path.join('..', 'src')) 8 | sys.path.append(os.path.join('src')) 9 | from sklearn import tree, ensemble, metrics 10 | import evaluation 11 | import tracking 12 | 13 | class Model(Enum): 14 | DECISION_TREE = 0 15 | RANDOM_FOREST = 1 16 | ADABOOST = 2 17 | GRADIENT_BOOST = 3 18 | 19 | 20 | def load_data(): 21 | filename = "data/splitter/train.csv" 22 | 
print("Loading data from {}".format(filename)) 23 | train = pd.read_csv(filename) 24 | 25 | filename = 'data/splitter/validation.csv' 26 | print("Loading data from {}".format(filename)) 27 | validate = pd.read_csv(filename) 28 | 29 | return train, validate 30 | 31 | 32 | def join_tables(train, validate): 33 | print("Joining tables for consistent encoding") 34 | return train.append(validate).drop('date', axis=1) 35 | 36 | 37 | def encode_categorical_columns(df): 38 | obj_df = df.select_dtypes(include=['object', 'bool']).copy().fillna('-1') 39 | lb = LabelEncoder() 40 | for col in obj_df.columns: 41 | df[col] = lb.fit_transform(obj_df[col]) 42 | return df 43 | 44 | 45 | def encode(train, validate): 46 | print("Encoding categorical variables") 47 | train_ids = train.id 48 | validate_ids = validate.id 49 | 50 | joined = join_tables(train, validate) 51 | 52 | encoded = encode_categorical_columns(joined.fillna(-1)) 53 | 54 | print("Not predicting returns...") 55 | encoded.loc[encoded.unit_sales < 0, 'unit_sales'] = 0 56 | 57 | validate = encoded[encoded['id'].isin(validate_ids)] 58 | train = encoded[encoded['id'].isin(train_ids)] 59 | return train, validate 60 | 61 | 62 | def train_model(train, model=Model.DECISION_TREE, seed=None): 63 | print("Training model using regressor: {}".format(model.name)) 64 | train_dropped = train.drop('unit_sales', axis=1) 65 | target = train['unit_sales'] 66 | 67 | if model == Model.RANDOM_FOREST: 68 | params = {'n_estimators': 10} 69 | clf = ensemble.RandomForestRegressor(random_state=seed, **params) 70 | elif model == Model.ADABOOST: 71 | params = {'n_estimators': 50, 'learning_rate': 1.0, 'loss':'linear'} 72 | clf = ensemble.AdaBoostRegressor(random_state=seed, **params) 73 | elif model == Model.GRADIENT_BOOST: 74 | params = {'n_estimators': 200, 'max_depth': 4} 75 | clf = ensemble.GradientBoostingRegressor(random_state=seed, **params) 76 | else: 77 | params = {'criterion': 'mse'} 78 | clf = tree.DecisionTreeRegressor(random_state=seed) 79 | 80 | trained_model = clf.fit(train_dropped, target) 81 | return (trained_model,params) 82 | 83 | 84 | def overwrite_unseen_prediction_with_zero(preds, train, validate): 85 | cols_item_store = ['item_nbr', 'store_nbr'] 86 | cols_to_use = validate.columns.drop('unit_sales') if 'unit_sales' in validate.columns else validate.columns 87 | validate_train_joined = pd.merge(validate[cols_to_use], train, on=cols_item_store, how='left') 88 | unseen = validate_train_joined[validate_train_joined['unit_sales'].isnull()] 89 | validate['preds'] = preds 90 | validate.loc[validate.id.isin(unseen['id_x']), 'preds'] = 0 91 | preds = validate['preds'].tolist() 92 | return preds 93 | 94 | 95 | def make_predictions(model, validate): 96 | print("Making prediction on validation data") 97 | validate_dropped = validate.drop('unit_sales', axis=1).fillna(-1) 98 | validate_preds = model.predict(validate_dropped) 99 | return validate_preds 100 | 101 | 102 | def write_predictions_and_score(evaluation_metrics, model, columns_used): 103 | key = "decision_tree" 104 | if not os.path.exists('data/{}'.format(key)): 105 | os.makedirs('data/{}'.format(key)) 106 | filename = 'data/{}/model.pkl'.format(key) 107 | print("Writing to {}".format(filename)) 108 | joblib.dump(model, filename) 109 | 110 | filename = 'results/metrics.json' 111 | print("Writing to {}".format(filename)) 112 | if not os.path.exists('results'): 113 | os.makedirs('results') 114 | with open(filename, 'w+') as score_file: 115 | json.dump(evaluation_metrics, score_file) 116 | 117 | 118 | def 
main(model=Model.DECISION_TREE, seed=None): 119 | original_train, original_validate = load_data() 120 | train, validate = encode(original_train, original_validate) 121 | with tracking.track() as track: 122 | track.set_model(model) 123 | model, params = train_model(train, model, seed) 124 | track.log_params(params) 125 | validation_predictions = make_predictions(model, validate) 126 | 127 | print("Calculating metrics") 128 | evaluation_metrics = { 129 | 'nwrmsle': evaluation.nwrmsle(validation_predictions, validate['unit_sales'].values, validate['perishable'].values), 130 | 'r2_score': metrics.r2_score(y_true=validate['unit_sales'].values, y_pred=validation_predictions) 131 | } 132 | track.log_metrics(evaluation_metrics) 133 | 134 | write_predictions_and_score(evaluation_metrics, model, original_train.columns) 135 | 136 | print("Evaluation done with metrics {}.".format(json.dumps(evaluation_metrics))) 137 | 138 | 139 | if __name__ == "__main__": 140 | main(model=Model.DECISION_TREE, seed=8675309) 141 | -------------------------------------------------------------------------------- /src/download_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request 3 | import argparse 4 | 5 | def load_data(path, key): 6 | gcsBucket = "continuous-intelligence" 7 | 8 | if not os.path.exists(path): 9 | os.makedirs(path) 10 | 11 | if not os.path.exists(os.path.join(path, key)): 12 | url = "https://storage.googleapis.com/%s/%s" % (gcsBucket, key) 13 | urllib.request.urlretrieve(url, os.path.join(path, key)) 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser(description='Download files from Google Storage.') 18 | parser.add_argument('--model', action='store_true', default=False, help='Downloads model (data/decision_tree/model.pkl) instead of input file (data/raw/store47-2016.csv)') 19 | args = parser.parse_args() 20 | 21 | if args.model: 22 | print("Loading model...") 23 | load_data(path='data/decision_tree', key='model.pkl') 24 | else: 25 | print("Loading input data...") 26 | load_data(path='data/raw', key='store47-2016.csv') 27 | print("Finished downloading") 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /src/evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def nwrmsle(predictions, targets, weights): 5 | if type(predictions) == list: 6 | predictions = np.array([np.nan if x < 0 else x for x in predictions]) 7 | elif type(predictions) == pd.Series: 8 | predictions[predictions < 0] = np.nan 9 | targetsf = targets.astype(float) 10 | targetsf[targets < 0] = np.nan 11 | weights = 1 + 0.25 * weights 12 | log_square_errors = (np.log(predictions + 1) - np.log(targetsf + 1)) ** 2 13 | return(np.sqrt(np.sum(weights * log_square_errors) / np.sum(weights))) 14 | -------------------------------------------------------------------------------- /src/splitter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | def get_validation_period(latest_date_train, days_back=15): 5 | # for Kaggle we want from Wednesday to Thursday for a 15 day period 6 | offset = (latest_date_train.weekday() - 3) % 7 7 | end_of_validation_period = latest_date_train - pd.DateOffset(days=offset) 8 | begin_of_validation_period = end_of_validation_period - pd.DateOffset(days=days_back) 9 | return 
(begin_of_validation_period, end_of_validation_period) 10 | 11 | 12 | def split_validation_train_by_validation_period(train, validation_begin_date, validation_end_date): 13 | train_validation = train[(train['date'] >= validation_begin_date) & (train['date'] <= validation_end_date)] 14 | train_train = train[train['date'] < validation_begin_date] 15 | return train_train, train_validation 16 | 17 | 18 | def write_data(table, filename): 19 | if not os.path.exists('data/splitter'): 20 | os.makedirs('data/splitter') 21 | 22 | print("Writing to data/splitter/{}".format(filename)) 23 | table.to_csv('data/splitter/' + filename, index=False) 24 | 25 | 26 | def main(): 27 | print("Loading data...") 28 | train = pd.read_csv("data/raw/store47-2016.csv") 29 | 30 | train['date'] = pd.to_datetime(train['date'], format="%Y-%m-%d") 31 | 32 | latest_date = train['date'].max() 33 | 34 | begin_of_validation, end_of_validation = get_validation_period(latest_date, days_back=57) 35 | 36 | print("Splitting data between {} and {}".format(begin_of_validation, end_of_validation)) 37 | train_train, train_validation = split_validation_train_by_validation_period(train, begin_of_validation, 38 | end_of_validation) 39 | write_data(train_train, 'train.csv') 40 | 41 | write_data(train_validation, 'validation.csv') 42 | 43 | print("Finished splitting") 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /src/tracking.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import os 3 | 4 | MLFLOW_TRACKING_URL = os.getenv('MLFLOW_TRACKING_URL') 5 | TENANT = os.getenv('TENANT','local') 6 | RUN_LABEL = os.getenv('GO_PIPELINE_LABEL', '0') 7 | USE_MLFLOW = MLFLOW_TRACKING_URL is not None 8 | 9 | class track: 10 | def __enter__(self): 11 | if USE_MLFLOW: 12 | mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URL) 13 | mlflow.set_experiment(TENANT) 14 | mlflow.start_run(run_name=RUN_LABEL) 15 | return self 16 | 17 | def __exit__(self, type, value, traceback): 18 | if USE_MLFLOW: 19 | mlflow.end_run() 20 | 21 | def set_model(self, model): 22 | if USE_MLFLOW: 23 | mlflow.log_param('model', model.name) 24 | 25 | def log_params(self, params): 26 | if USE_MLFLOW: 27 | for param in params: 28 | mlflow.log_param(param, params[param]) 29 | 30 | def log_metrics(self, metrics): 31 | if USE_MLFLOW: 32 | for metric in metrics: 33 | mlflow.log_metric(metric, metrics[metric]) 34 | -------------------------------------------------------------------------------- /src/webapp/static/index.js: -------------------------------------------------------------------------------- 1 | $( document ).ready(function() { 2 | $('#date').datepicker({ 3 | dateFormat: "yy-mm-dd" 4 | }); 5 | 6 | $('button[type="submit"]').click(function() { 7 | var data = []; 8 | var valid = true; 9 | $( 'input[type="text"], select' ).each(function() { 10 | if (this.id === 'date' && !/[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])/.test($(this).val())) { 11 | alert("Invalid date format"); 12 | valid = false; 13 | return false; 14 | } else if (!$(this).val()) { 15 | alert("Need to provide a value for " + $(this).prev().text()); 16 | valid = false; 17 | return false; 18 | } else { 19 | data.push(this.id + "=" + $(this).val()); 20 | } 21 | }); 22 | 23 | if (valid) { 24 | var dataStr = '?' + data.join('&'); 25 | var prefix = (window.location.pathname == "/" ? 
"" : window.location.pathname) 26 | $.ajax(prefix + "/prediction" + dataStr, { 27 | beforeSend: function() { 28 | $('#prediction').text('loading...') 29 | $('button[type="submit"]').attr("disabled", "true") 30 | } 31 | }) 32 | .done(function(result) { 33 | $('#prediction').text(result); 34 | $('button[type="submit"]').removeAttr("disabled") 35 | }) 36 | .fail(function() { 37 | alert("Request failed.") 38 | $('button[type="submit"]').removeAttr("disabled") 39 | }); 40 | } 41 | }) 42 | }); 43 | -------------------------------------------------------------------------------- /src/webapp/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Sales Forecasting 9 | 10 | 11 |
[lines 12-40 of the template: HTML markup stripped during extraction; the body renders the "Sales forecast" heading, a date text field, an item_nbr dropdown, a Submit button, and a "Prediction:" output element wired up by src/webapp/static/index.js]
-------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | cd C:\app\continuous-intelligence\ && python src/app.py 2 | -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd /app/continuous-intelligence && python src/app.py 4 | -------------------------------------------------------------------------------- /test/app_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | def test_endpoint(): 4 | query_params = '?date=2017-06-14&item_nbr=99197' 5 | resp = requests.get('http://localhost:5005/prediction' + query_params) 6 | 7 | assert resp.status_code == 200 8 | -------------------------------------------------------------------------------- /test/evaluation_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | from pytest import approx 5 | sys.path.append(os.path.join('..', 'src')) 6 | sys.path.append(os.path.join('src')) 7 | import evaluation 8 | 9 | 10 | def test_calculates_nwrmsle_for_perfect_match(): 11 | estimate = np.array([1, 2, 3]) 12 | actual = np.array([1, 2, 3]) 13 | weights = np.array([1, 1, 1]) 14 | calculated_nwrmsle = evaluation.nwrmsle(estimate, actual, weights) 15 | 16 | assert calculated_nwrmsle == 0.0 17 | 18 | 19 | def test_calculates_nwrmsle_for_imperfect_match(): 20 | estimate = np.array([0, 0, 0]) 21 | actual = np.array([1, 1, 1]) 22 | weights = np.array([1, 1, 1]) 23 | calculated_nwrmsle = evaluation.nwrmsle(estimate, actual, weights) 24 | 25 | # Assert by-hand calculation of nwrmsle (ln 2) is reasonably close to python calculation 26 | assert calculated_nwrmsle == approx(0.69314718) 27 | -------------------------------------------------------------------------------- /test/splitter_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pandas as pd 4 | sys.path.append(os.path.join('..', 'src')) 5 | sys.path.append(os.path.join('src')) 6 | import splitter 7 | 8 | def test_get_validation_period(): 9 | latest_date = pd.to_datetime('2017-11-22') 10 | actual_begin_date, actual_end_date = splitter.get_validation_period(latest_date) 11 | expected_begin_date = pd.to_datetime('2017-11-01') 12 | expected_end_date = pd.to_datetime('2017-11-16') 13 | assert actual_begin_date == expected_begin_date 14 | assert actual_end_date == expected_end_date 15 | 16 | def test_split_validation_train_by_validation_period(): 17 | date1 = pd.to_datetime('2017-11-12') 18 | date2 = pd.to_datetime('2017-11-25') 19 | date3 = pd.to_datetime('2017-11-30') 20 | date4 = pd.to_datetime('2017-12-01') 21 | validation_begin_date = pd.to_datetime('2017-11-15') 22 | validation_end_date = pd.to_datetime('2017-11-30') 23 | d = {'date': [date1, date2, date3, date4], 'col2': [3, 4, 5, 6]} 24 | df = pd.DataFrame(data=d) 25 | df_train, df_validation = splitter.split_validation_train_by_validation_period(df, validation_begin_date, validation_end_date) 26 | assert df_train.shape[0] == 1 27 | assert df_validation.shape[0] == 2 28 | -------------------------------------------------------------------------------- /test/test.py:
-------------------------------------------------------------------------------- 1 | import unittest 2 | import json 3 | 4 | class TestAccuracy(unittest.TestCase): 5 | METRICS_FILE = "results/metrics.json" 6 | 7 | def test_80percent_error_score(self): 8 | with open(self.METRICS_FILE, 'r') as file: 9 | metrics = json.load(file) 10 | self.assertLessEqual(metrics['nwrmsle'], 0.80) 11 | self.assertGreater(metrics['r2_score'], 0.0) 12 | 13 | 14 | if __name__ == "__main__": 15 | unittest.main() 16 | -------------------------------------------------------------------------------- /undeploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -xe 3 | 4 | TENANT_NAMESPACE=${TENANT:-admin} 5 | cat kubernetes/web.yml | sed "s/\\\$tenant\\\$/$TENANT_NAMESPACE/" | kubectl delete -f - 6 | --------------------------------------------------------------------------------
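
Appendix: the exploratory notebook above ends by suggesting `is_two_weeks_before_christmas` and `is_two_weeks_after_christmas` columns but leaves its final code cells empty. The snippet below is a minimal, untested sketch of one way to derive those flags with pandas; it assumes a DataFrame with a DatetimeIndex (as built in the notebook via `data.index = pd.to_datetime(data.date)`), and the helper name `add_christmas_flags` plus the 14-day cut-off are illustrative assumptions, not part of the repository code.

```python
import pandas as pd


def add_christmas_flags(df):
    """Flag rows that fall within two weeks before or after Christmas.

    Assumes df has a DatetimeIndex; column names follow the notebook's
    suggestion. Christmas day itself is left unflagged.
    """
    dates = df.index
    # Christmas of the row's own year and of the previous year (so early
    # January is compared against the Christmas that just passed).
    christmas_this_year = pd.to_datetime(["%d-12-25" % y for y in dates.year])
    christmas_last_year = pd.to_datetime(["%d-12-25" % (y - 1) for y in dates.year])

    days_until = (christmas_this_year - dates).days        # positive while Christmas is ahead
    days_since_this = (dates - christmas_this_year).days   # positive in late December
    days_since_last = (dates - christmas_last_year).days   # small and positive in early January

    out = df.copy()
    out['is_two_weeks_before_christmas'] = (days_until > 0) & (days_until <= 14)
    out['is_two_weeks_after_christmas'] = (
        ((days_since_this > 0) & (days_since_this <= 14))
        | ((days_since_last > 0) & (days_since_last <= 14))
    )
    return out


# Example usage inside the notebook, after the datetime index has been set:
# data = add_christmas_flags(data)
```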