├── .circleci
│   └── config.yml
├── .gitignore
├── LICENSE
├── README.md
├── bodywork.yaml
├── flake8.ini
├── mypy.ini
├── notebooks
│   ├── requirements_nb.txt
│   └── time_to_dispatch_model.ipynb
├── pipeline
│   ├── __init__.py
│   ├── serve_model.py
│   └── train_model.py
├── requirements_cicd.txt
├── requirements_pipe.txt
├── tests
│   ├── __init__.py
│   ├── resources
│   │   ├── dataset.csv
│   │   └── model.pkl
│   ├── test_serve_model.py
│   └── test_train_model.py
└── tox.ini

/.circleci/config.yml:
--------------------------------------------------------------------------------
version: 2.1

orbs:
  aws-eks: circleci/aws-eks@1.0.3

jobs:
  run-static-code-analysis:
    docker:
      - image: circleci/python:3.9
    steps:
      - checkout
      - run:
          name: Installing Python dependencies
          command: pip install -r requirements_cicd.txt
      - run:
          name: Running tests
          command: tox -e py39-static_code_analysis
  run-tests:
    docker:
      - image: circleci/python:3.9
    steps:
      - checkout
      - run:
          name: Installing Python dependencies
          command: pip install -r requirements_cicd.txt
      - run:
          name: Running tests
          command: tox -e py39-unit_and_functional_tests
  trigger-bodywork-deployment:
    executor:
      name: aws-eks/python
      tag: "3.9"
    steps:
      - aws-eks/update-kubeconfig-with-authenticator:
          cluster-name: bodywork-dev
      - checkout
      - run:
          name: Installing Python dependencies
          command: pip install -r requirements_cicd.txt
      - run:
          name: Trigger Deployment
          command: bodywork create deployment https://github.com/bodywork-ml/ml-pipeline-engineering --branch master

workflows:
  version: 2
  test-build-deploy:
    jobs:
      - run-static-code-analysis:
          filters:
            branches:
              ignore: master
      - run-tests:
          requires:
            - run-static-code-analysis
          filters:
            branches:
              ignore: master
      - trigger-bodywork-deployment:
          filters:
            branches:
              only: master

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Bodywork Machine Learning

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Engineering ML Pipelines - Part Two

This is the second part in a series of articles demonstrating best practices for engineering ML pipelines and deploying them to production.
In the [first part](https://www.bodyworkml.com/posts/engineering-ml-pipelines-part-1) we focused on project setup - everything from codebase structure to configuring a CI/CD pipeline and making an initial deployment of a skeleton pipeline.

In this part we are going to focus on developing a fully-operational pipeline and will cover:

- A simple approach to data and model versioning, using cloud object storage.
- How to factor-out common code and make it reusable between projects.
- Defending against errors and handling failure.
- How to enable configurable pipelines that can run in multiple environments without code changes.
- Developing the automated model-training stage and how to write tests for it.
- Developing and testing the serve-model stage that exposes the trained model via a web API.
- Updating the deployment configuration and releasing the changes to production.
- Running the pipeline on a schedule.

All of the code referred to in this series of posts is available on [GitHub](https://github.com/bodywork-ml/ml-pipeline-engineering), with a dedicated branch for each part, so you can explore the code in its various stages of development. Have a quick look before reading on.

## A Simple Strategy for Dataset and Model Versioning

To recap, the data engineering team will deliver the latest tranche of training data to an AWS S3 bucket, in CSV format. They will take responsibility for verifying that these files have the correct schema and contain no unexpected errors. Each filename will contain the timestamp of its creation, in ISO format, so that the datasets in the bucket will look as follows:

```text
s3://time-to-dispatch/
|-- datasets/
    |-- time_to_dispatch_2021-07-03T23:05:32.csv
    |-- time_to_dispatch_2021-07-02T23:05:13.csv
    |-- time_to_dispatch_2021-07-01T23:04:52.csv
    |-- ...
```

The train-model stage of the pipeline will only need to download the latest file for training a new model. We could stop here and rely solely on the filenames as a lightweight versioning strategy, but it is safer to enable [versioning](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Versioning.html) for the S3 bucket and to keep track of the hash of the dataset used for training, which is computed automatically for every object stored on S3 (the MD5 hash of an object is stored as its [Entity Tag or ETag](https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html)). This allows us to defend against accidental deletes and/or overwrites and enables us to locate the precise dataset associated with a trained model.

Because this concept of a dataset is bigger than just an arbitrarily named file on S3, we will need to develop a custom `Dataset` class for representing files on S3 and retrieving their hashes, together with functions/methods for getting and putting `Datasets` to S3. All of this can be developed on top of the [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html) AWS client library for Python.
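To make this concrete, below is a minimal sketch of how the latest dataset and its ETag might be located with boto3. The function name is ours and the single-page listing is a simplifying assumption - the package implementation discussed in the next section is what the pipeline actually uses.

```python
from typing import Tuple

import boto3


def latest_csv_dataset_key_and_hash(bucket: str, folder: str = "datasets") -> Tuple[str, str]:
    """Return the key and MD5 hash (ETag) of the newest CSV dataset in a bucket."""
    s3_client = boto3.client("s3")
    response = s3_client.list_objects_v2(Bucket=bucket, Prefix=folder)
    csv_objects = [
        obj for obj in response.get("Contents", []) if obj["Key"].endswith(".csv")
    ]
    if not csv_objects:
        raise FileNotFoundError(f"no CSV datasets found in s3://{bucket}/{folder}")
    # ISO-format timestamps in the filenames sort lexicographically, so the
    # latest dataset has the 'largest' key. Note that list_objects_v2 returns
    # at most 1000 objects per call - a production implementation would paginate.
    latest_object = max(csv_objects, key=lambda obj: obj["Key"])
    return latest_object["Key"], latest_object["ETag"].strip('"')
```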
Trained models will be serialised to file using Python's [pickle](https://docs.python.org/3.8/library/pickle.html) module (this works well for SciKit-Learn models), and uploaded to the same AWS bucket, using the same timestamped file-naming convention:

```text
s3://time-to-dispatch/
|-- models/
    |-- time_to_dispatch_2021-07-03T23:45:23.pkl
    |-- time_to_dispatch_2021-07-02T23:45:31.pkl
    |-- time_to_dispatch_2021-07-01T23:44:25.pkl
    |-- ...
```

When triggered, the serve-model stage of the pipeline will only need to download the most recently persisted model, to ensure that it generates predictions using the model produced by the latest run of the train-model stage. As with the datasets, we could stop here and rely solely on the filenames as a lightweight versioning strategy, but auditing and debugging predictions will be made much easier if we can access model metadata, such as the details of the exact dataset used for training.

The concept of a model becomes bigger than just the trained model in isolation, so we will also need to develop a custom `Model` class. This needs to 'wrap' the trained model object, so that it can be associated with all of the metadata that we need to operate our basic model versioning system. As with the custom `Dataset` class, we will need to develop functions/methods for getting and putting the `Model` object to S3.

There is a significant development effort required for implementing the functionality described above and it is likely that this will be repeated in many projects. We are going to cover how to handle reusable code in the section below, but you can see our implementations for the `Dataset` and `Model` classes using the links below, which we have also reproduced at the end of this article.

- [Dataset class](https://github.com/bodywork-ml/bodywork-pipeline-utils/blob/main/src/bodywork_pipeline_utils/aws/datasets.py)
- [Model class](https://github.com/bodywork-ml/bodywork-pipeline-utils/blob/main/src/bodywork_pipeline_utils/aws/models.py)

## Reusing Common Code

The canonical way to distribute reusable Python modules is to implement them within a Python package that can be installed into any project that benefits from the functionality. This is what we have done for the dataset and model versioning functionality described in the previous section, and for configuring the logger used in both stages (so we can enforce a common log format across projects). You can explore the codebase for this package, named `bodywork-pipeline-utils`, on [GitHub](https://github.com/bodywork-ml/bodywork-pipeline-utils). The functions and classes within it are shown below,

```text
|-- aws
    |-- Dataset
    |-- get_latest_csv_dataset_from_s3
    |-- get_latest_parquet_dataset_from_s3
    |-- put_csv_dataset_to_s3
    |-- put_parquet_dataset_to_s3
    |-- Model
    |-- get_latest_pkl_model_from_s3
|-- logging
    |-- configure_logger
```

A discussion of best practices for developing a Python package is beyond the scope of these articles, but you can use `bodywork-pipeline-utils` as a template and/or refer to the [Python Packaging Authority](https://www.pypa.io/en/latest/). The Scikit-Learn team has also published their insights into [API design for machine learning software](https://arxiv.org/abs/1309.0238), which we recommend reading.
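For orientation, this is roughly how a train-model stage uses the package - a sketch only, in which the estimator, the feature encoding and the metadata are illustrative stand-ins for the real training logic developed later in this article:

```python
from sklearn.tree import DecisionTreeRegressor

from bodywork_pipeline_utils import aws, logging

log = logging.configure_logger()

# Download the most recent CSV dataset from the bucket's datasets/ folder.
dataset = aws.get_latest_csv_dataset_from_s3("time-to-dispatch", "datasets")
log.info(f"Retrieved dataset from s3://time-to-dispatch/{dataset.key}")

# Train an estimator (illustrative - the real feature engineering comes later).
X = dataset.data[["orders_placed", "product_code"]].replace(
    {"SKU001": 0, "SKU002": 1, "SKU003": 2, "SKU004": 3, "SKU005": 4}
)
model = DecisionTreeRegressor(random_state=42).fit(X, dataset.data["hours_to_dispatch"])

# Wrap the estimator together with the dataset used to train it, then
# persist everything to the bucket's models/ folder.
wrapped_model = aws.Model("time-to-dispatch", model, dataset, {"r_squared": 0.9})
s3_location = wrapped_model.put_model_to_s3("time-to-dispatch", "models")
log.info(f"Model serialised and persisted to s3://{s3_location}")
```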
### Distributing Python Packages within your Company

The easiest way to distribute Python packages within an organisation is directly from your Version Control System (VCS) - e.g. a remote Git repository hosted on GitHub. You do not **need** to host an internal PyPI server, unless you have a specific reason to do so. To install a Python package from a remote Git repo you can use,

```plaintext
$ pip install git+https://github.com/bodywork-ml/bodywork-pipeline-utils@v0.1.5
```

Where `v0.1.5` is the release tag, but could also be a Git commit hash. This will need to be specified in `requirements_pipe.txt` as,

```text
git+https://github.com/bodywork-ml/bodywork-pipeline-utils@v0.1.5
```

Pip supports many VCSs and protocols - e.g. private Git repositories can be accessed via SSH by using `git+ssh` and ensuring that the machine making the request has the appropriate SSH keys available. Refer to the [documentation for pip](https://pip.pypa.io/en/stable/cli/pip_install/#vcs-support) for more information.

## Defending Against Errors and Handling Failures

Pipelines can experience many types of error - here are some examples:

- Invalid configuration, such as specifying the wrong storage location for datasets and models.
- Access to datasets and models becomes temporarily unavailable.
- Errors in an unverified dataset cause model-training to fail.
- An unexpected jump in [concept drift](https://en.wikipedia.org/wiki/Concept_drift) causes model metrics to breach performance thresholds.

When developing pipeline stages, it is critical that error events such as these are identified and logged to aid with debugging, and that the pipeline is not allowed to proceed. Our chosen pattern for handling errors is demonstrated in this snippet from `train_model.py`,

```python
import sys

# ...

if __name__ == "__main__":

    # ...

    try:
        main(
            s3_bucket,
            r2_metric_error_threshold,
            r2_metric_warning_threshold,
            HYPERPARAM_GRID
        )
        sys.exit(0)
    except Exception as e:
        log.error(f"Error encountered when training model - {e}")
        sys.exit(1)
```

The pipeline is defined in the `main` function, which is executed within a `try... except` block. If it executes without error, then we signal this to Kubernetes with an exit-code of `0`. If any error is encountered, then the exception is caught, we log the details and signal this to Kubernetes with an exit-code of `1` (so it can attempt a retry, if this has been configured).

Exceptions within `main` are likely to be raised from within 3rd party packages that we've installed - e.g. if `bodywork-pipeline-utils` can't access AWS or if Scikit-Learn fails to train a model. We recommend reading the documentation (or source code) for external functions and classes to understand what exceptions they raise and if the pipeline would benefit from custom handling and logging.
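Where custom handling is worthwhile, a thin wrapper can attach context at the point of failure, so that the exception reaching the `try... except` block above already names the resource involved. A sketch of the pattern (not code from the project):

```python
from bodywork_pipeline_utils import aws


def get_dataset_or_raise(s3_bucket: str) -> aws.Dataset:
    """Fetch the latest dataset, adding pipeline context to any AWS failure."""
    try:
        return aws.get_latest_csv_dataset_from_s3(s3_bucket, "datasets")
    except Exception as e:
        msg = f"could not retrieve a dataset from s3://{s3_bucket}/datasets"
        raise RuntimeError(msg) from e  # chaining keeps the AWS error in the logs
```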
Sometimes, however, we need to look for the error ourselves and raise the exception manually, as shown below when the key test metric falls below a pre-configured threshold level,

```python
def main(
    s3_bucket: str,
    metric_error_threshold: float,
    metric_warning_threshold: float,
    hyperparam_grid: Dict[str, Any]
) -> None:
    """Main training job."""
    log.info("Starting train-model stage.")

    # ...

    if metrics.r_squared >= metric_error_threshold:
        if metrics.r_squared < metric_warning_threshold:
            log.warning("Metrics breached warning threshold - check for drift.")
        s3_location = persist_model(s3_bucket, model, dataset, metrics)
        log.info(f"Model serialised and persisted to s3://{s3_location}")
    else:
        msg = (
            f"r-squared metric ({metrics.r_squared:.3f}) is below deployment "
            f"threshold {metric_error_threshold}"
        )
        raise RuntimeError(msg)
```

This works as follows:

- If the r-squared metric is above the error threshold and the warning threshold, then persist the trained model.
- If the r-squared metric is above the error threshold, but below the warning threshold, then log a warning message and then persist the trained model.
- If the r-squared metric is below the error threshold, then raise an exception, which will cause the stage to log an error and exit with a non-zero exit code (halting the pipeline), using the logic in the `try... except` block discussed earlier in this section.

Using logs to communicate pipeline state will take on additional importance later on in Part Three of this series, when we add monitoring, observability and alerting to our pipeline.

## Configurable Pipelines

Pipelines can benefit from parametrisation to make them re-usable across deployment environments (and potentially tenants, if this makes sense for your project). For example, passing the S3 bucket as an external argument to each stage enables the pipeline to operate in a staging environment as well as in production. Similarly, external arguments can be used to set the thresholds that define when warnings and alerts are triggered, based on model-training metrics, which can make testing the pipeline much easier.

Each stage of our pipeline is defined by an executable Python module. The easiest way to pass arguments to a module is via the command line. For example,

```text
$ python -m pipeline.train_model time-to-dispatch 0.9 0.8
```

Passes the array of strings `["time-to-dispatch", "0.9", "0.8"]` to `train_model.py`, which can be retrieved from `sys.argv`, as demonstrated in the excerpt from `train_model.py` below.

```python
import sys

# ...

if __name__ == "__main__":
    try:
        args = sys.argv
        s3_bucket = args[1]
        r2_metric_error_threshold = float(args[2])
        if r2_metric_error_threshold <= 0 or r2_metric_error_threshold > 1:
            raise ValueError()
        r2_metric_warning_threshold = float(args[3])
        if r2_metric_warning_threshold <= 0 or r2_metric_warning_threshold > 1:
            raise ValueError()
    except (ValueError, IndexError):
        log.error(
            "Invalid arguments passed to train_model.py. "
            "Expected S3_BUCKET R_SQUARED_ERROR_THRESHOLD R_SQUARED_WARNING_THRESHOLD, "
            "where all thresholds must be in the range [0, 1]."
        )
        sys.exit(1)

    try:
        main(
            s3_bucket,
            r2_metric_error_threshold,
            r2_metric_warning_threshold,
            HYPERPARAM_GRID
        )
    except Exception as e:
        log.error(f"Error encountered when training model - {e}")
        sys.exit(1)
```

Note how we cast the numeric arguments to `float` types before performing basic input validation, to ensure that users can't accidentally specify invalid arguments that could lead to unintended consequences.

When deployed by Bodywork, `train_model.py` will be executed in a dedicated container on Kubernetes. The required arguments can be passed via the `args` parameter in the `bodywork.yaml` file that describes the deployment, as shown below.

```yaml
# bodywork.yaml
...
stages:
  train_model:
    executable_module_path: pipeline/train_model.py
    args: ["time-to-dispatch", "0.9", "0.8"]
...
```

## Engineering the Model Training Job

The core task here is to engineer the ML solution in the [time_to_dispatch_model.ipynb notebook](https://github.com/bodywork-ml/ml-pipeline-engineering/blob/master/notebooks/time_to_dispatch_model.ipynb), provided to us by the data scientist who worked on this task, into the pipeline stage defined in [pipeline/train_model.py](https://github.com/bodywork-ml/ml-pipeline-engineering/blob/part-two/pipeline/train_model.py) (reproduced in the Appendix below). The central workflow is defined in the `main` function,

```python
from typing import Any, Dict, List, NamedTuple, Tuple

from bodywork_pipeline_utils import aws, logging
from bodywork_pipeline_utils.aws import Dataset

# ...

log = logging.configure_logger()

# ...

def main(
    s3_bucket: str,
    metric_error_threshold: float,
    metric_warning_threshold: float,
    hyperparam_grid: Dict[str, Any]
) -> None:
    """Main training job."""
    log.info("Starting train-model stage.")
    dataset = aws.get_latest_csv_dataset_from_s3(s3_bucket, "datasets")
    log.info(f"Retrieved dataset from s3://{s3_bucket}/{dataset.key}")

    feature_and_labels = prepare_data(dataset.data)
    model, metrics = train_model(feature_and_labels, hyperparam_grid)
    validate_trained_model_logic(model, feature_and_labels)
    log.info(
        f"Trained model: r-squared={metrics.r_squared:.3f}, "
        f"MAE={metrics.mean_absolute_error:.3f}"
    )

    if metrics.r_squared >= metric_error_threshold:
        if metrics.r_squared < metric_warning_threshold:
            log.warning("Metrics breached warning threshold - check for drift.")
        s3_location = persist_model(s3_bucket, model, dataset, metrics)
        log.info(f"Model serialised and persisted to s3://{s3_location}")
    else:
        msg = (
            f"r-squared metric ({metrics.r_squared:.3f}) is below deployment "
            f"threshold {metric_error_threshold}"
        )
        raise RuntimeError(msg)
```

This splits the job into smaller sub-tasks, such as preparing the data, that can be delegated to specialised functions that are easier to write (unit) tests for.
All interaction with cloud object storage (AWS S3), for retrieving datasets and persisting trained models, is handled by functions imported from the [bodywork-pipeline-utils](https://github.com/bodywork-ml/bodywork-pipeline-utils) package, leaving three key functions that we will discuss in turn:

- `prepare_data`
- `train_model`
- `validate_trained_model_logic`

The `persist_model` function creates the `Model` object and calls its `put_model_to_s3` method. It will be tested implicitly in the functional tests for `main`, which we will look at later on.

### Prepare Data

The purpose of this function is to start with the dataset as a `DataFrame`, split the features from the labels, and then partition each of these into 'test' and 'train' subsets. We return the results as a `NamedTuple` called `FeatureAndLabels`, which facilitates easier access within functions that consume these data structures.

```python
from typing import Any, Dict, List, NamedTuple, Tuple

from sklearn.model_selection import GridSearchCV, train_test_split

# ...

class FeatureAndLabels(NamedTuple):
    """Container for features and labels split by test and train sets."""

    X_train: DataFrame
    X_test: DataFrame
    y_train: DataFrame
    y_test: DataFrame

# ...

def prepare_data(data: DataFrame) -> FeatureAndLabels:
    """Split the data into features and labels for training and testing."""
    X = data.drop("hours_to_dispatch", axis=1)
    y = data["hours_to_dispatch"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=data["product_code"].values, random_state=42
    )
    return FeatureAndLabels(X_train, X_test, y_train, y_test)
```

This is tested in [tests/test_train_model.py](https://github.com/bodywork-ml/ml-pipeline-engineering/blob/part-two/tests/test_train_model.py) as follows,

```python
from pandas import read_csv, DataFrame
from pytest import fixture, raises

from bodywork_pipeline_utils.aws import Dataset

# ...
@fixture(scope="session")
def dataset() -> Dataset:
    data = read_csv("tests/resources/dataset.csv")
    dataset = Dataset(data, datetime(2021, 7, 15), "tests", "resources", "foobar")
    return dataset


def test_prepare_data_splits_labels_and_features_into_test_and_train(dataset: Dataset):
    label_column = "hours_to_dispatch"
    n_rows_in_dataset = dataset.data.shape[0]
    n_cols_in_dataset = dataset.data.shape[1]
    prepared_data = prepare_data(dataset.data)

    assert prepared_data.X_train.shape[1] == n_cols_in_dataset - 1
    assert label_column not in prepared_data.X_train.columns

    assert prepared_data.X_test.shape[1] == n_cols_in_dataset - 1
    assert label_column not in prepared_data.X_test.columns

    assert prepared_data.y_train.ndim == 1
    assert prepared_data.y_train.name == label_column

    assert prepared_data.y_test.ndim == 1
    assert prepared_data.y_test.name == label_column

    assert (prepared_data.X_train.shape[0] + prepared_data.X_test.shape[0]
            == n_rows_in_dataset)

    assert (prepared_data.y_train.shape[0] + prepared_data.y_test.shape[0]
            == n_rows_in_dataset)
```

To help with testing, we have saved a snapshot of CSV data to `tests/resources/dataset.csv` within the project repository, and made it available as a `DataFrame` to all tests in this module, via a [Pytest fixture](https://docs.pytest.org/en/6.2.x/fixture.html) called `dataset`. There is only one unit test for this function and it tests that `prepare_data` splits labels from features, for both 'test' and 'train' sets, and that it doesn't lose any rows of data in the process. If we refactor `prepare_data` in the future, then this test will help prevent us from accidentally leaking the label into the features.

### Train Model

Given a `FeatureAndLabels` object together with a grid of hyper-parameters, this function will yield a trained model, together with the model's performance metrics for the 'test' set. The hyper-parameter grid is an input to this function, so that when testing we can use a single point, but can specify many more points for the actual job, when training time is less of a constraint. The metrics are contained within a `NamedTuple` called `TaskMetrics`, to make passing them between functions easier and less prone to error.

```python
from sklearn.model_selection import GridSearchCV, train_test_split

# ...

PRODUCT_CODE_MAP = {"SKU001": 0, "SKU002": 1, "SKU003": 2, "SKU004": 3, "SKU005": 4}

# ...

class TaskMetrics(NamedTuple):
    """Container for the task's performance metrics."""

    r_squared: float
    mean_absolute_error: float

# ...
def train_model(
    data: FeatureAndLabels, hyperparam_grid: Dict[str, Any]
) -> Tuple[BaseEstimator, TaskMetrics]:
    """Train a model and compute performance metrics."""
    grid_search = GridSearchCV(
        estimator=DecisionTreeRegressor(),
        param_grid=hyperparam_grid,
        scoring="r2",
        cv=5,
        refit=True,
    )
    grid_search.fit(preprocess(data.X_train), data.y_train)
    best_model = grid_search.best_estimator_
    y_test_pred = best_model.predict(preprocess(data.X_test))
    performance_metrics = TaskMetrics(
        r2_score(data.y_test, y_test_pred),
        mean_absolute_error(data.y_test, y_test_pred)
    )
    return (best_model, performance_metrics)


def preprocess(df: DataFrame) -> ndarray:
    """Create features for training model."""
    processed = df.copy()
    processed["product_code"] = df["product_code"].apply(lambda e: PRODUCT_CODE_MAP[e])
    return processed.values
```

We have further delegated the task of pre-processing the features for the model (in this case just mapping categories to integers) to a dedicated function called `preprocess`. The `train_model` function is tested in [tests/test_train_model.py](https://github.com/bodywork-ml/ml-pipeline-engineering/blob/part-two/tests/test_train_model.py) as follows,

```python
from sklearn.utils.validation import check_is_fitted

# ...

@fixture(scope="session")
def prepared_data(dataset: Dataset) -> FeatureAndLabels:
    return FeatureAndLabels(
        dataset.data[["orders_placed", "product_code"]][:800],
        dataset.data[["orders_placed", "product_code"]][800:999],
        dataset.data["hours_to_dispatch"][:800],
        dataset.data["hours_to_dispatch"][800:999]
    )

# ...

def test_train_model_yields_model_and_metrics(prepared_data: FeatureAndLabels):
    model, metrics = train_model(prepared_data, {"random_state": [42]})
    try:
        check_is_fitted(model)
        assert True
    except NotFittedError:
        assert False

    assert metrics.r_squared >= 0.9
    assert metrics.mean_absolute_error <= 1.25
```

Which tests that `train_model` returns a fitted model and acceptable performance metrics, given a reasonably sized tranche of data.

Note that we haven't relied on `prepare_data` to create the `FeatureAndLabels` object - we have created this manually in another fixture that relies on the `dataset` fixture discussed earlier. This is a deliberate choice, made with the aim of decoupling the outcome of this test from the behaviour of `prepare_data`. Tests that are dependent on multiple functions can be 'brittle' and lead to cascades of failing tests when only a single function or method is raising an error. We cannot stress enough how important it is to structure your code in such a way that it can be easily tested.

For completeness, we also provide a simple test for `preprocess`,

```python
from pandas import read_csv, DataFrame

# ...

def test_preprocess_processes_features():
    data = DataFrame({"orders_placed": [30], "product_code": ["SKU004"]})
    processed_data = preprocess(data)
    assert processed_data[0, 0] == 30
    assert processed_data[0, 1] == 3
```

### Validating Trained Models

The goal of the pipeline is to automate the process of training a new model and deploying it - i.e.
to take the data scientist out-of-the-loop. Consequently, we need to exercise caution before deploying the latest model. Although the final go/no-go decision on deploying the model will be based on performance metrics, we should also sense-check the model based on basic behaviours we expect it to have. The `validate_trained_model_logic` function performs three logical tests of the model and will raise an exception if it finds an issue (thereby terminating the pipeline before deployment). The three checks are:

1. Does the `hours_to_dispatch` prediction increase with `orders_placed`, for each product?
2. Are all predictions for the 'test' set positive?
3. Are all predictions for the 'test' set within 25% of the highest `hours_to_dispatch` observation?

```python
def validate_trained_model_logic(model: BaseEstimator, data: FeatureAndLabels) -> None:
    """Verify that a trained model passes basic logical expectations."""
    issues_detected: List[str] = []

    orders_placed_sensitivity_checks = [
        model.predict(array([[100, product], [150, product]])).tolist()
        for product in range(len(PRODUCT_CODE_MAP))
    ]
    if not all(e[0] < e[1] for e in orders_placed_sensitivity_checks):
        issues_detected.append(
            "hours_to_dispatch predictions do not increase with orders_placed"
        )

    test_set_predictions = model.predict(preprocess(data.X_test)).reshape(-1)
    if len(test_set_predictions[test_set_predictions < 0]) > 0:
        issues_detected.append(
            "negative hours_to_dispatch predictions found for test set"
        )
    if len(test_set_predictions[test_set_predictions > data.y_test.max() * 1.25]) > 0:
        issues_detected.append(
            "outlier hours_to_dispatch predictions found for test set"
        )

    if issues_detected:
        msg = "Trained model failed verification: " + ", ".join(issues_detected) + "."
        raise RuntimeError(msg)
```

Note that we perform all three checks before raising the exception, so that the error message, and the logs that will be generated from it, can be maximally informative when it comes to debugging.

The associated test can also be found in [tests/test_train_model.py](https://github.com/bodywork-ml/ml-pipeline-engineering/blob/part-two/tests/test_train_model.py). This is the most complex test thus far, because we have to use Scikit-Learn's `DummyRegressor` to create models that will fail the checks in different combinations, as can be seen below.

```python
from pytest import fixture, raises
from sklearn.dummy import DummyRegressor

# ...

def test_validate_trained_model_logic_raises_exception_for_failing_models(
    prepared_data: FeatureAndLabels
):
    dummy_model = DummyRegressor(strategy="constant", constant=1.0)
    dummy_model.fit(prepared_data.X_train, prepared_data.y_train)
    expected_exception_str = (
        "Trained model failed verification: "
        "hours_to_dispatch predictions do not increase with orders_placed."
    )
    with raises(RuntimeError, match=expected_exception_str):
        validate_trained_model_logic(dummy_model, prepared_data)

    dummy_model = DummyRegressor(strategy="constant", constant=-1.0)
    dummy_model.fit(prepared_data.X_train, prepared_data.y_train)
    expected_exception_str = (
        "Trained model failed verification: "
        "hours_to_dispatch predictions do not increase with orders_placed, "
        "negative hours_to_dispatch predictions found for test set."
    )
    with raises(RuntimeError, match=expected_exception_str):
        validate_trained_model_logic(dummy_model, prepared_data)

    dummy_model = DummyRegressor(strategy="constant", constant=1000.0)
    dummy_model.fit(prepared_data.X_train, prepared_data.y_train)
    expected_exception_str = (
        "Trained model failed verification: "
        "hours_to_dispatch predictions do not increase with orders_placed, "
        "outlier hours_to_dispatch predictions found for test set."
    )
    with raises(RuntimeError, match=expected_exception_str):
        validate_trained_model_logic(dummy_model, prepared_data)
```

### End-to-End Functional Tests

We've tested the individual sub-tasks within `main`, but how do we know that we've assembled them correctly, so that `persist_model` will upload the expected `Model` object to cloud storage? We now need to turn our attention to testing `main` from end-to-end - i.e. functional tests for the train-model stage.

The `main` function will try to access AWS S3 to get a dataset and then save a pickled `Model` to S3. We could set up an S3 bucket for testing this integration, but this constitutes an integration test and is not our current aim. We will disable the calls to AWS by mocking the `bodywork_pipeline_utils.aws` module, using the `patch` function from the Python standard library's [unittest.mock](https://docs.python.org/3/library/unittest.mock.html) module.

Decorating our test with `@patch("pipeline.train_model.aws")` causes `bodywork_pipeline_utils.aws` (which we import into `train_model.py`) to be replaced by a `MagicMock` object called `mock_aws`. This allows us to perform a number of useful tasks:

- Hard-code the return value from `aws.get_latest_csv_dataset_from_s3`, so that it returns our local test dataset instead of a remote dataset on S3.
- Check if the `put_model_to_s3` method of the `aws.Model` object created in `persist_model` was called.

You can see this in action below.

```python
from unittest.mock import MagicMock, patch

from pytest import fixture, raises
from _pytest.logging import LogCaptureFixture

# ...
@patch("pipeline.train_model.aws")
def test_train_job_happy_path(
    mock_aws: MagicMock,
    dataset: Dataset,
    caplog: LogCaptureFixture,
):
    mock_aws.get_latest_csv_dataset_from_s3.return_value = dataset
    main("project-bucket", 0.8, 0.9, {"random_state": [42]})
    mock_aws.Model().put_model_to_s3.assert_called_once()
    logs = caplog.text
    assert "Starting train-model stage" in logs
    assert "Retrieved dataset from s3" in logs
    assert "Trained model" in logs
    assert "Model serialised and persisted to s3" in logs
```

This test also makes use of Pytest's [caplog](https://docs.pytest.org/en/6.2.x/reference.html?highlight=caplog#pytest.logging.caplog) fixture, enabling us to test that `main` yields the expected log records when everything goes according to plan (i.e. the 'happy path'). This gives us confidence that model artefacts will be persisted as expected, when run in production.

What about the 'unhappy paths' - when performance metrics fall below warning and error thresholds? We need to test that `main` will behave as we expect it to, and so we will have to write tests for these scenarios, as well.

```python
@patch("pipeline.train_model.aws")
def test_train_job_raises_exception_when_metrics_below_error_threshold(
    mock_aws: MagicMock,
    dataset: Dataset,
):
    mock_aws.get_latest_csv_dataset_from_s3.return_value = dataset
    with raises(RuntimeError, match="below deployment threshold"):
        main("project-bucket", 1, 0.9, {"random_state": [42]})


@patch("pipeline.train_model.aws")
def test_train_job_logs_warning_when_metrics_below_warning_threshold(
    mock_aws: MagicMock,
    dataset: Dataset,
    caplog: LogCaptureFixture,
):
    mock_aws.get_latest_csv_dataset_from_s3.return_value = dataset
    main("project-bucket", 0.5, 0.9, {"random_state": [42]})
    assert "WARNING" in caplog.text
    assert "breached warning threshold" in caplog.text
```

These tests work by setting the thresholds artificially high (or low) and checking that exceptions are raised or that warning messages are logged. Note that this testing strategy only works because `main` accepts the thresholds as arguments, which was one of the key motivations for designing it in this way.

### Input Validation for the Stage

The train-model stage works by executing `train_model.py`, which requires three arguments to be passed to it (as discussed earlier on). These inputs are validated and this validation needs to be tested, for completeness. This is a long and boring test, so we will not reproduce the whole thing here, but instead discuss the testing strategy (which is a bit more interesting).

The approach to testing input validation is to run `train_model.py` as Bodywork would run it within a container on Kubernetes, by calling `python pipeline/train_model.py` from the command line. We can replicate this using `subprocess.run` from the Python standard library and capturing the output. We can then pass invalid arguments and check the output for the expected error messages. You can see this pattern in action below, for the case when no arguments are passed.

```python
from subprocess import run

# ...
def test_run_job_handles_error_for_invalid_args():
    process_one = run(
        ["python", "pipeline/train_model.py"], capture_output=True, encoding="utf-8"
    )
    assert process_one.returncode != 0
    assert "ERROR" in process_one.stdout
    assert "Invalid arguments passed to train_model.py" in process_one.stdout

# ...
```

## Developing the Model Serving Stage

In Part One of this series we developed a skeleton web service that returned a hard-coded value whenever the API was called. Our task in this part is to extend this to downloading the latest model persisted to cloud object storage (AWS S3), and then use the model for generating predictions. Unlike the train-model stage, the effort required for this task is relatively small, and so we will reproduce `serve_model.py` in full and then discuss it in more detail afterwards.

```python
import sys
from enum import Enum
from typing import Dict, Union

import uvicorn
from bodywork_pipeline_utils import aws, logging
from fastapi import FastAPI, status
from numpy import array
from pydantic import BaseModel, Field

from pipeline.train_model import PRODUCT_CODE_MAP

app = FastAPI(debug=False)
log = logging.configure_logger()


class ProductCode(Enum):
    SKU001 = "SKU001"
    SKU002 = "SKU002"
    SKU003 = "SKU003"
    SKU004 = "SKU004"
    SKU005 = "SKU005"


class Data(BaseModel):
    product_code: ProductCode
    orders_placed: float = Field(..., ge=0.0)


class Prediction(BaseModel):
    est_hours_to_dispatch: float
    model_version: str


@app.post(
    "/api/v0.1/time_to_dispatch",
    status_code=status.HTTP_200_OK,
    response_model=Prediction,
)
def time_to_dispatch(data: Data) -> Dict[str, Union[str, float]]:
    features = array([[data.orders_placed, PRODUCT_CODE_MAP[data.product_code.value]]])
    prediction = wrapped_model.model.predict(features).tolist()[0]
    return {"est_hours_to_dispatch": prediction, "model_version": str(wrapped_model)}


if __name__ == "__main__":
    try:
        args = sys.argv
        s3_bucket = args[1]
        wrapped_model = aws.get_latest_pkl_model_from_s3(s3_bucket, "models")
        log.info(f"Successfully loaded model: {wrapped_model}")
    except IndexError:
        log.error("Invalid arguments passed to serve_model.py - expected S3_BUCKET")
        sys.exit(1)
    except Exception as e:
        log.error(f"Could not get latest model and start web server - {e}")
        sys.exit(1)
    uvicorn.run(app, host="0.0.0.0", workers=1)
```

The key changes from the version in Part One are as follows:

- We now pass the name of the AWS S3 bucket as an argument to `serve_model.py`.
- In the `if __name__ == "__main__"` block we now attempt to retrieve the latest `Model` object that was persisted to AWS S3, before starting the FastAPI server.
- We placed a new constraint on the `Data.orders_placed` field to ensure that all values sent to the API must be greater-than-or-equal-to zero, and another new constraint on `Data.product_code` that forces this field to be one of the values specified in the `ProductCode` [enumeration](https://docs.python.org/3/library/enum.html).
- We now use the model to generate predictions, using the `PRODUCT_CODE_MAP` dictionary from `train_model.py` to map product codes to integers, before calling the model.
- We use the string representation of the `Model` object in the response's `model_version` field, which contains the full information on which S3 object is being used, as well as other metadata such as the dataset used to train the model, the type of model, etc. This verbose information is designed to facilitate easy debugging of problematic responses.

If we start the server locally,

```text
$ python -m pipeline.serve_model time-to-dispatch

2021-07-24 09:56:42,718 - INFO - serve_model.<module> - Successfully loaded model: name:time-to-dispatch|model_type:<class 'sklearn.tree._classes.DecisionTreeRegressor'>|model_timestamp:2021-07-20 14:44:13.558375|model_hash:b4860f56fa24193934fe1ea51b66818d|train_dataset_key:datasets/time_to_dispatch_2021-07-01T16|45|38.csv|train_dataset_hash:"759eccda4ceb7a07cda66ad4ef7cdfbc"|pipeline_git_commit_hash:NA
2021-07-24 09:56:42,718 - INFO - serve_model.<module> - Successfully loaded model: name:time-to-dispatch|model_type:<class 'sklearn.tree._classes.DecisionTreeRegressor'>|model_timestamp:2021-07-20 14:44:13.558375|model_hash:b4860f56fa24193934fe1ea51b66818d|train_dataset_key:datasets/time_to_dispatch_2021-07-01T16|45|38.csv|train_dataset_hash:"759eccda4ceb7a07cda66ad4ef7cdfbc"|pipeline_git_commit_hash:NA
INFO:     Started server process [88289]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
```

Then we can send a test request,

```text
$ curl http://localhost:8000/api/v0.1/time_to_dispatch \
    --request POST \
    --header "Content-Type: application/json" \
    --data '{"product_code": "SKU001", "orders_placed": 10}'
```

Which should return a response along the lines of,

```json
{
    "est_hours_to_dispatch": 0.6527543057985115,
    "model_version": "name:time-to-dispatch|model_type:<class 'sklearn.tree._classes.DecisionTreeRegressor'>|model_timestamp:2021-07-20 14:44:13.558375|model_hash:b4860f56fa24193934fe1ea51b66818d|train_dataset_key:datasets/time_to_dispatch_2021-07-01T16|45|38.csv|train_dataset_hash:\"759eccda4ceb7a07cda66ad4ef7cdfbc\"|pipeline_git_commit_hash:ed3113197adcbdbe338bf406841b930e895c42d6"
}
```

### Updating the Tests

We only need to add one more (small) test to [tests/test_serve_model.py](https://github.com/bodywork-ml/ml-pipeline-engineering/blob/part-two/tests/test_serve_model.py), but we will have to modify the existing tests to take into account that we are now using a trained model to generate predictions, as opposed to returning fixed values. This introduces a complication, because we need to inject a working model into the module.

To facilitate testing, we have persisted a valid `Model` object to `tests/resources/model.pkl`, which will be loaded in a function called `wrapped_model` and injected into the module at test-time as a new object, using `unittest.mock.patch`. We are unable to use `patch` as we did when testing `train_model.py`, because the model is only loaded when `serve_model.py` is executed, whereas our tests rely only on the FastAPI test client.
The modified test for a valid request is shown below,

```python
import pickle
from subprocess import run
from unittest.mock import patch

from bodywork_pipeline_utils.aws import Model
from fastapi.testclient import TestClient
from numpy import array

from pipeline.serve_model import app

test_client = TestClient(app)

def wrapped_model() -> Model:
    with open("tests/resources/model.pkl", "r+b") as file:
        wrapped_model = pickle.load(file)
    return wrapped_model


@patch("pipeline.serve_model.wrapped_model", new=wrapped_model(), create=True)
def test_web_api_returns_valid_response_given_valid_data():
    prediction_request = {"product_code": "SKU001", "orders_placed": 100}
    prediction_response = test_client.post(
        "/api/v0.1/time_to_dispatch", json=prediction_request
    )
    model_obj = wrapped_model()
    expected_prediction = model_obj.model.predict(array([[100, 0]])).tolist()[0]
    assert prediction_response.status_code == 200
    assert prediction_response.json()["est_hours_to_dispatch"] == expected_prediction
    assert prediction_response.json()["model_version"] == str(model_obj)
```

This works by checking the output from the API against the output from the model loaded from the test resources, to make sure that they are identical. Next, we modify the test that covers the API data validation, to reflect the extra constraints we have placed on requests.

```python
@patch("pipeline.serve_model.wrapped_model", new=wrapped_model(), create=True)
def test_web_api_returns_error_code_given_invalid_data():
    prediction_request = {"product_code": "SKU001", "foo": 100}
    prediction_response = test_client.post(
        "/api/v0.1/time_to_dispatch", json=prediction_request
    )
    assert prediction_response.status_code == 422
    assert "value_error.missing" in prediction_response.text

    prediction_request = {"product_code": "SKU000", "orders_placed": 100}
    prediction_response = test_client.post(
        "/api/v0.1/time_to_dispatch", json=prediction_request
    )
    assert prediction_response.status_code == 422
    assert "not a valid enumeration member" in prediction_response.text

    prediction_request = {"product_code": "SKU001", "orders_placed": -100}
    prediction_response = test_client.post(
        "/api/v0.1/time_to_dispatch", json=prediction_request
    )
    assert prediction_response.status_code == 422
    assert "ensure this value is greater than or equal to 0" in prediction_response.text
```

Finally, we add one more test to cover the input validation for the `serve_model.py` module, using the same strategy as we did for the equivalent test for `train_model.py`.

```python
from subprocess import run

# ...

def test_web_server_raises_exception_if_passed_invalid_args():
    process = run(
        ["python", "-m", "pipeline.serve_model"], capture_output=True, encoding="utf-8"
    )
    assert process.returncode != 0
    assert "ERROR" in process.stdout
    assert "Invalid arguments passed to serve_model.py" in process.stdout
```

## Updating the Deployment and Releasing to Production

The last task we need to complete before we can commit all changes, push to GitHub and trigger the CI/CD pipeline, is to update the deployment configuration in `bodywork.yaml`.
This requires four changes:

- Arguments now need to be passed to each stage.
- The Python package requirements for each stage need to be updated.
- AWS credentials need to be injected into each stage, as required by `bodywork_pipeline_utils.aws`.
- CPU and memory resources need to be updated, together with max completion/startup timeouts.

```yaml
version: "1.1"
pipeline:
  name: time-to-dispatch
  docker_image: bodyworkml/bodywork-core:3.0
  DAG: train_model >> serve_model
  secrets_group: dev
stages:
  train_model:
    executable_module_path: pipeline/train_model.py
    args: ["time-to-dispatch", "0.9", "0.8"]
    requirements:
      - numpy>=1.21.0
      - pandas>=1.2.5
      - scikit-learn>=1.0.0
      - git+https://github.com/bodywork-ml/bodywork-pipeline-utils@v0.1.5
    cpu_request: 1.0
    memory_request_mb: 1000
    batch:
      max_completion_time_seconds: 180
      retries: 1
    secrets:
      AWS_ACCESS_KEY_ID: aws-credentials
      AWS_SECRET_ACCESS_KEY: aws-credentials
      AWS_DEFAULT_REGION: aws-credentials
  serve_model:
    executable_module_path: pipeline/serve_model.py
    args: ["time-to-dispatch"]
    requirements:
      - numpy>=1.21.0
      - scikit-learn>=1.0.0
      - fastapi>=0.65.2
      - uvicorn>=0.14.0
      - git+https://github.com/bodywork-ml/bodywork-pipeline-utils@v0.1.5
    cpu_request: 0.5
    memory_request_mb: 250
    service:
      max_startup_time_seconds: 180
      replicas: 2
      port: 8000
      ingress: true
    secrets:
      AWS_ACCESS_KEY_ID: aws-credentials
      AWS_SECRET_ACCESS_KEY: aws-credentials
      AWS_DEFAULT_REGION: aws-credentials
logging:
  log_level: INFO
```

This will instruct Bodywork to look for `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION` in a secret record called `aws-credentials`, so that it can inject these secrets into the containers running the stages of our pipeline (as environment variables, which boto3 will detect automatically). So, these will have to be created, which can be done as follows,

```text
$ bw create secret aws-credentials \
    --group=dev \
    --data AWS_ACCESS_KEY_ID=put-your-key-in-here \
    --data AWS_SECRET_ACCESS_KEY=put-your-other-key-in-here \
    --data AWS_DEFAULT_REGION=wherever-your-cluster-is
```

Now you're ready to push this branch to your remote Git repo! If your tests pass and your colleagues approve the merge, the CD part of the CI/CD pipeline we set up in Part One will ensure the new pipeline is deployed to Kubernetes by Bodywork and executed immediately. Bodywork will perform a rolling deployment that ensures zero down-time and automatically rolls back failed deployments to the previous version.
When Bodywork has finished, test the new web API,

```text
$ curl http://CLUSTER_IP/pipelines/time-to-dispatch--serve-model/api/v0.1/time_to_dispatch \
    --request POST \
    --header "Content-Type: application/json" \
    --data '{"product_code": "SKU001", "orders_placed": 10}'
```

Where you should observe the same response you received when testing locally,

```json
{
    "est_hours_to_dispatch": 0.6527543057985115,
    "model_version": "name:time-to-dispatch|model_type:<class 'sklearn.tree._classes.DecisionTreeRegressor'>|model_timestamp:2021-07-20 14:44:13.558375|model_hash:b4860f56fa24193934fe1ea51b66818d|train_dataset_key:datasets/time_to_dispatch_2021-07-01T16|45|38.csv|train_dataset_hash:\"759eccda4ceb7a07cda66ad4ef7cdfbc\"|pipeline_git_commit_hash:ed3113197adcbdbe338bf406841b930e895c42d6"
}
```

See our guide to [accessing services](https://bodywork.readthedocs.io/en/latest/kubernetes/#accessing-services) for information on how to determine `CLUSTER_IP`.

## Running the Pipeline on a Schedule

At this point, the pipeline will have deployed a model using the most recent dataset made available for this task. We know, however, that new data will arrive every Friday evening, and so we'd like to schedule the pipeline to run just after the data is expected. We can achieve this using Bodywork cronjobs, as follows,

```text
$ bw create cronjob https://github.com/bodywork-ml/ml-pipeline-engineering \
    --name=weekly-update \
    --branch master \
    --schedule="45 23 * * 5" \
    --retries=2
```

## Wrap-Up

In this second part we have gone from a skeleton "Hello, Production!" deployment to a fully-functional train-and-deploy pipeline that automates re-training and re-deployment in a production environment, on a periodic basis. We have factored-out common code so that it can be re-used across projects and discussed various strategies for developing automated tests for both stages of the pipeline, ensuring that subsequent modifications can be reliably integrated and deployed, with relative ease.

In the final part of this series we will cover monitoring and observability, and aim to answer the question, "*How will I know when something has gone wrong?*".

## Appendix

For reference.

### The `Dataset` Class

Reproduced from the [bodywork-pipeline-utils](https://github.com/bodywork-ml/bodywork-pipeline-utils) package, which is available to download from [PyPI](https://pypi.org/project/bodywork-pipeline-utils/).

```python
from datetime import datetime
from tempfile import NamedTemporaryFile
from typing import Any, NamedTuple

from pandas import DataFrame, read_csv, read_parquet

from bodywork_pipeline_utils.aws.artefacts import (
    find_latest_artefact_on_s3,
    make_timestamped_filename,
    put_file_to_s3,
)


class Dataset(NamedTuple):
    """Container for downloaded datasets and associated metadata."""

    data: DataFrame
    datetime: datetime
    bucket: str
    key: str
    hash: str


def get_latest_csv_dataset_from_s3(bucket: str, folder: str = "") -> Dataset:
    """Get the latest CSV dataset from S3.

    Args:
        bucket: S3 bucket to look in.
        folder: Folder within bucket to limit search, defaults to "".

    Returns:
        Dataset object.
951 | """ 952 | artefact = find_latest_artefact_on_s3("csv", bucket, folder) 953 | data = read_csv(artefact.get()) 954 | return Dataset(data, artefact.timestamp, bucket, artefact.obj_key, artefact.etag) 955 | 956 | 957 | def get_latest_parquet_dataset_from_s3(bucket: str, folder: str = "") -> Dataset: 958 | """Get the latest Parquet dataset from S3. 959 | 960 | Args: 961 | bucket: S3 bucket to look in. 962 | folder: Folder within bucket to limit search, defaults to "". 963 | 964 | Returns: 965 | Dataset object. 966 | """ 967 | artefact = find_latest_artefact_on_s3("parquet", bucket, folder) 968 | data = read_parquet(artefact.get()) 969 | return Dataset(data, artefact.timestamp, bucket, artefact.obj_key, artefact.etag) 970 | 971 | 972 | def put_csv_dataset_to_s3( 973 | data: DataFrame, 974 | filename_prefix: str, 975 | ref_datetime: datetime, 976 | bucket: str, 977 | folder: str = "", 978 | **kwargs: Any, 979 | ) -> None: 980 | """Upload DataFrame to S3 as a CSV file. 981 | 982 | Args: 983 | data: The DataFrame to upload. 984 | filename_prefix: Prefix before datetime filename element. 985 | ref_datetime: The reference date associated with data. 986 | bucket: Location on S3 to persist the data. 987 | folder: Folder within the bucket, defaults to "". 988 | kwargs: Keywork arguments to pass to pandas.to_csv. 989 | """ 990 | filename = make_timestamped_filename(filename_prefix, ref_datetime, "csv") 991 | with NamedTemporaryFile() as temp_file: 992 | data.to_csv(temp_file, **kwargs) 993 | put_file_to_s3(temp_file.name, bucket, folder, filename) 994 | 995 | 996 | def put_parquet_dataset_to_s3( 997 | data: DataFrame, 998 | filename_prefix: str, 999 | ref_datetime: datetime, 1000 | bucket: str, 1001 | folder: str = "", 1002 | **kwargs: Any, 1003 | ) -> None: 1004 | """Upload DataFrame to S3 as a Parquet file. 1005 | 1006 | Args: 1007 | data: The DataFrame to upload. 1008 | filename_prefix: Prefix before datetime filename element. 1009 | ref_datetime: The reference date associated with data. 1010 | bucket: Location on S3 to persist the data. 1011 | folder: Folder within the bucket, defaults to "". 1012 | kwargs: Keywork arguments to pass to pandas.to_csv. 1013 | """ 1014 | filename = make_timestamped_filename(filename_prefix, ref_datetime, "parquet") 1015 | with NamedTemporaryFile() as temp_file: 1016 | data.to_parquet(temp_file, **kwargs) 1017 | put_file_to_s3(temp_file.name, bucket, folder, filename) 1018 | ``` 1019 | 1020 | ### The `Model` Class 1021 | 1022 | Reproduced from the [bodywork-pipeline-utils](https://github.com/bodywork-ml/bodywork-pipeline-utils) package, which is available to download from [PyPI](https://pypi.org/project/bodywork-pipeline-utils/). 1023 | 1024 | ```python 1025 | from datetime import datetime 1026 | from hashlib import md5 1027 | from os import environ 1028 | from pickle import dump, dumps, loads, PicklingError, UnpicklingError 1029 | from tempfile import NamedTemporaryFile 1030 | from typing import Any, cast, Dict, Optional 1031 | 1032 | from bodywork_pipeline_utils.aws.datasets import Dataset 1033 | from bodywork_pipeline_utils.aws.artefacts import ( 1034 | find_latest_artefact_on_s3, 1035 | make_timestamped_filename, 1036 | put_file_to_s3, 1037 | ) 1038 | 1039 | 1040 | class Model: 1041 | """Base class for representing ML models and metadata.""" 1042 | 1043 | def __init__( 1044 | self, 1045 | name: str, 1046 | model: Any, 1047 | train_dataset: Dataset, 1048 | metadata: Optional[Dict[str, Any]] = None, 1049 | ): 1050 | """Constructor. 
1051 | 1052 | Args: 1053 | name: Model name. 1054 | model: Trained model object. 1055 | train_dataset: Dataset object used to train the model. 1056 | metadata: Arbitrary model metadata. 1057 | """ 1058 | self._name = name 1059 | self._train_dataset_key = train_dataset.key 1060 | self._train_dataset_hash = train_dataset.hash 1061 | self._model_hash = self._compute_model_hash(model) 1062 | self._model = model 1063 | self._model_type = type(model) 1064 | self._creation_time = datetime.now() 1065 | self._pipeline_git_commit_hash = environ.get("GIT_COMMIT_HASH", "NA") 1066 | self._metadata = metadata 1067 | 1068 | def __eq__(self, other: object) -> bool: 1069 | """Model equality operator.""" 1070 | if isinstance(other, Model): 1071 | conditions = [ 1072 | self._train_dataset_hash == other._train_dataset_hash, 1073 | self._train_dataset_key == other._train_dataset_key, 1074 | self._creation_time == other._creation_time, 1075 | self._pipeline_git_commit_hash == other._pipeline_git_commit_hash, 1076 | ] 1077 | if all(conditions): 1078 | return True 1079 | else: 1080 | return False 1081 | else: 1082 | return False 1083 | 1084 | def __repr__(self) -> str: 1085 | """Stdout representation.""" 1086 | info = ( 1087 | f"name: {self._name}\n" 1088 | f"model_type: {self._model_type}\n" 1089 | f"model_timestamp: {self._creation_time}\n" 1090 | f"model_hash: {self._model_hash}\n" 1091 | f"train_dataset_key: {self._train_dataset_key}\n" 1092 | f"train_dataset_hash: {self._train_dataset_hash}\n" 1093 | f"pipeline_git_commit_hash: {self._pipeline_git_commit_hash}" 1094 | ) 1095 | return info 1096 | 1097 | def __str__(self) -> str: 1098 | """String representation.""" 1099 | info = ( 1100 | f"name:{self._name}|" 1101 | f"model_type:{self._model_type}|" 1102 | f"model_timestamp:{self._creation_time}|" 1103 | f"model_hash:{self._model_hash}|" 1104 | f"train_dataset_key:{self._train_dataset_key}|" 1105 | f"train_dataset_hash:{self._train_dataset_hash}|" 1106 | f"pipeline_git_commit_hash:{self._pipeline_git_commit_hash}" 1107 | ) 1108 | return info 1109 | 1110 | @property 1111 | def metadata(self) -> Optional[Dict[str, Any]]: 1112 | return self._metadata 1113 | 1114 | @property 1115 | def model(self) -> Any: 1116 | return self._model 1117 | 1118 | @staticmethod 1119 | def _compute_model_hash(model: Any) -> str: 1120 | """Compute a hash for a model object.""" 1121 | try: 1122 | model_bytestream = dumps(model, protocol=5) 1123 | hash = md5(model_bytestream) 1124 | return hash.hexdigest() 1125 | except PicklingError: 1126 | msg = "Could not pickle model into bytes before hashing." 1127 | raise RuntimeError(msg) 1128 | except Exception as e: 1129 | msg = "Could not hash model." 1130 | raise RuntimeError(msg) from e 1131 | 1132 | def put_model_to_s3(self, bucket: str, folder: str = "") -> str: 1133 | """Upload model to S3 as a pickle file. 1134 | 1135 | Args: 1136 | bucket: Location on S3 to persist the data. 1137 | folder: Folder within the bucket, defaults to "". 1138 | """ 1139 | filename = make_timestamped_filename(self._name, self._creation_time, "pkl") 1140 | with NamedTemporaryFile() as temp_file: 1141 | dump(self, temp_file, protocol=5) 1142 | put_file_to_s3(temp_file.name, bucket, folder, filename) 1143 | return f"{bucket}/{folder}/{filename}" 1144 | 1145 | 1146 | def get_latest_pkl_model_from_s3(bucket: str, folder: str = "") -> Model: 1147 | """Get the latest model from S3. 1148 | 1149 | Args: 1150 | bucket: S3 bucket to look in. 1151 | folder: Folder within bucket to limit search, defaults to "".
1152 | 1153 | Returns: 1154 | Model object. 1155 | """ 1156 | artefact = find_latest_artefact_on_s3("pkl", bucket, folder) 1157 | try: 1158 | artefact_bytes = artefact.get().read() 1159 | model = cast(Model, loads(artefact_bytes)) 1160 | return model 1161 | except UnpicklingError: 1162 | msg = f"Artefact at {bucket}/{artefact.obj_key} could not be unpickled." 1163 | raise RuntimeError(msg) 1164 | except AttributeError: 1165 | msg = f"Artefact at {bucket}/{artefact.obj_key} is not of type Model." 1166 | raise RuntimeError(msg) 1167 | ``` 1168 | 1169 | ### `train_model.py` 1170 | 1171 | Reproduced from the [ml-pipeline-engineering](https://github.com/bodywork-ml/ml-pipeline-engineering/tree/part-two) repository. 1172 | 1173 | ```python 1174 | """ 1175 | - Download training dataset from AWS S3. 1176 | - Prepare data and train model. 1177 | - Persist model to AWS S3. 1178 | """ 1179 | import sys 1180 | from typing import Any, Dict, List, NamedTuple, Tuple 1181 | 1182 | from bodywork_pipeline_utils import aws, logging 1183 | from bodywork_pipeline_utils.aws import Dataset 1184 | from numpy import array, ndarray 1185 | from pandas import DataFrame 1186 | from sklearn.base import BaseEstimator 1187 | from sklearn.model_selection import GridSearchCV, train_test_split 1188 | from sklearn.metrics import mean_absolute_error, r2_score 1189 | from sklearn.tree import DecisionTreeRegressor 1190 | 1191 | PRODUCT_CODE_MAP = {"SKU001": 0, "SKU002": 1, "SKU003": 2, "SKU004": 3, "SKU005": 4} 1192 | HYPERPARAM_GRID = { 1193 | "random_state": [42], 1194 | "criterion": ["squared_error", "absolute_error"], 1195 | "max_depth": [2, 4, 6, 8, 10, None], 1196 | "min_samples_split": [2, 4, 6, 8, 10], 1197 | "min_samples_leaf": [2, 4, 6, 8, 10], 1198 | } 1199 | 1200 | log = logging.configure_logger() 1201 | 1202 | 1203 | class FeatureAndLabels(NamedTuple): 1204 | """Container for features and labels split by test and train sets.""" 1205 | 1206 | X_train: DataFrame 1207 | X_test: DataFrame 1208 | y_train: DataFrame 1209 | y_test: DataFrame 1210 | 1211 | 1212 | class TaskMetrics(NamedTuple): 1213 | """Container for the task's performance metrics.""" 1214 | 1215 | r_squared: float 1216 | mean_absolute_error: float 1217 | 1218 | 1219 | def main( 1220 | s3_bucket: str, 1221 | metric_error_threshold: float, 1222 | metric_warning_threshold: float, 1223 | hyperparam_grid: Dict[str, Any], 1224 | ) -> None: 1225 | """Main training job.""" 1226 | log.info("Starting train-model stage.") 1227 | dataset = aws.get_latest_csv_dataset_from_s3(s3_bucket, "datasets") 1228 | log.info(f"Retrieved dataset from s3://{s3_bucket}/{dataset.key}") 1229 | 1230 | feature_and_labels = prepare_data(dataset.data) 1231 | model, metrics = train_model(feature_and_labels, hyperparam_grid) 1232 | validate_trained_model_logic(model, feature_and_labels) 1233 | log.info( 1234 | f"Trained model: r-squared={metrics.r_squared:.3f}, " 1235 | f"MAE={metrics.mean_absolute_error:.3f}" 1236 | ) 1237 | 1238 | if metrics.r_squared >= metric_error_threshold: 1239 | if metrics.r_squared < metric_warning_threshold: 1240 | log.warning("Metrics breached warning threshold - check for drift.") 1241 | s3_location = persist_model(s3_bucket, model, dataset, metrics) 1242 | log.info(f"Model serialised and persisted to s3://{s3_location}") 1243 | else: 1244 | msg = ( 1245 | f"r-squared metric ({metrics.r_squared:.3f}) is below deployment " 1246 | f"threshold {metric_error_threshold}" 1247 | ) 1248 | raise RuntimeError(msg) 1249 | 1250 | 1251 | def prepare_data(data: DataFrame) -> 
FeatureAndLabels: 1252 | """Split the data into features and labels for training and testing.""" 1253 | X = data.drop("hours_to_dispatch", axis=1) 1254 | y = data["hours_to_dispatch"] 1255 | X_train, X_test, y_train, y_test = train_test_split( 1256 | X, y, test_size=0.2, stratify=data["product_code"].values, random_state=42 1257 | ) 1258 | return FeatureAndLabels(X_train, X_test, y_train, y_test) 1259 | 1260 | 1261 | def train_model( 1262 | data: FeatureAndLabels, hyperparam_grid: Dict[str, Any] 1263 | ) -> Tuple[BaseEstimator, TaskMetrics]: 1264 | """Train a model and compute performance metrics.""" 1265 | grid_search = GridSearchCV( 1266 | estimator=DecisionTreeRegressor(), 1267 | param_grid=hyperparam_grid, 1268 | scoring="r2", 1269 | cv=5, 1270 | refit=True, 1271 | ) 1272 | grid_search.fit(preprocess(data.X_train), data.y_train) 1273 | best_model = grid_search.best_estimator_ 1274 | y_test_pred = best_model.predict(preprocess(data.X_test)) 1275 | performance_metrics = TaskMetrics( 1276 | r2_score(data.y_test, y_test_pred), 1277 | mean_absolute_error(data.y_test, y_test_pred), 1278 | ) 1279 | return (best_model, performance_metrics) 1280 | 1281 | 1282 | def validate_trained_model_logic(model: BaseEstimator, data: FeatureAndLabels) -> None: 1283 | """Verify that a trained model passes basic logical expectations.""" 1284 | issues_detected: List[str] = [] 1285 | 1286 | orders_placed_sensitivity_checks = [ 1287 | model.predict(array([[100, product], [150, product]])).tolist() 1288 | for product in range(len(PRODUCT_CODE_MAP)) 1289 | ] 1290 | if not all(e[0] < e[1] for e in orders_placed_sensitivity_checks): 1291 | issues_detected.append( 1292 | "hours_to_dispatch predictions do not increase with orders_placed" 1293 | ) 1294 | 1295 | test_set_predictions = model.predict(preprocess(data.X_test)).reshape(-1) 1296 | if len(test_set_predictions[test_set_predictions < 0]) > 0: 1297 | issues_detected.append( 1298 | "negative hours_to_dispatch predictions found for test set" 1299 | ) 1300 | if len(test_set_predictions[test_set_predictions > data.y_test.max() * 1.25]) > 0: 1301 | issues_detected.append( 1302 | "outlier hours_to_dispatch predictions found for test set" 1303 | ) 1304 | 1305 | if issues_detected: 1306 | msg = "Trained model failed verification: " + ", ".join(issues_detected) + "." 
1307 | raise RuntimeError(msg) 1308 | 1309 | 1310 | def preprocess(df: DataFrame) -> ndarray: 1311 | """Create features for training model.""" 1312 | processed = df.copy() 1313 | processed["product_code"] = df["product_code"].apply(lambda e: PRODUCT_CODE_MAP[e]) 1314 | return processed.values 1315 | 1316 | 1317 | def persist_model( 1318 | bucket: str, model: BaseEstimator, dataset: Dataset, metrics: TaskMetrics 1319 | ) -> str: 1320 | """Persist the model and metadata to S3.""" 1321 | metadata = { 1322 | "r_squared": metrics.r_squared, 1323 | "mean_absolute_error": metrics.mean_absolute_error, 1324 | } 1325 | wrapped_model = aws.Model("time-to-dispatch", model, dataset, metadata) 1326 | s3_location = wrapped_model.put_model_to_s3(bucket, "models") 1327 | return s3_location 1328 | 1329 | 1330 | if __name__ == "__main__": 1331 | try: 1332 | args = sys.argv 1333 | s3_bucket = args[1] 1334 | r2_metric_error_threshold = float(args[2]) 1335 | if r2_metric_error_threshold <= 0 or r2_metric_error_threshold > 1: 1336 | raise ValueError() 1337 | r2_metric_warning_threshold = float(args[3]) 1338 | if r2_metric_warning_threshold <= 0 or r2_metric_warning_threshold > 1: 1339 | raise ValueError() 1340 | except (ValueError, IndexError): 1341 | log.error( 1342 | "Invalid arguments passed to train_model.py. " 1343 | "Expected S3_BUCKET R_SQUARED_ERROR_THRESHOLD R_SQUARED_WARNING_THRESHOLD, " 1344 | "where all thresholds must be in the range (0, 1]." 1345 | ) 1346 | sys.exit(1) 1347 | 1348 | try: 1349 | main( 1350 | s3_bucket, 1351 | r2_metric_error_threshold, 1352 | r2_metric_warning_threshold, 1353 | HYPERPARAM_GRID, 1354 | ) 1355 | except Exception as e: 1356 | log.error(f"Error encountered when training model - {e}") 1357 | sys.exit(1) 1358 | ``` 1359 | -------------------------------------------------------------------------------- /bodywork.yaml: -------------------------------------------------------------------------------- 1 | version: "1.1" 2 | pipeline: 3 | name: time-to-dispatch 4 | docker_image: bodyworkml/bodywork-core:3.0 5 | DAG: train_model >> serve_model 6 | secrets_group: dev 7 | stages: 8 | train_model: 9 | executable_module_path: pipeline/train_model.py 10 | args: ["bodywork-time-to-dispatch", "0.9", "0.8"] 11 | requirements: 12 | - numpy>=1.21.0 13 | - pandas>=1.2.5 14 | - scikit-learn>=1.0.0 15 | - git+https://github.com/bodywork-ml/bodywork-pipeline-utils@v0.1.5 16 | cpu_request: 1.0 17 | memory_request_mb: 1000 18 | batch: 19 | max_completion_time_seconds: 180 20 | retries: 1 21 | secrets: 22 | AWS_ACCESS_KEY_ID: aws-credentials 23 | AWS_SECRET_ACCESS_KEY: aws-credentials 24 | AWS_DEFAULT_REGION: aws-credentials 25 | serve_model: 26 | executable_module_path: pipeline/serve_model.py 27 | args: ["bodywork-time-to-dispatch"] 28 | requirements: 29 | - numpy>=1.21.0 30 | - scikit-learn>=1.0.0 31 | - fastapi>=0.65.2 32 | - uvicorn>=0.14.0 33 | - git+https://github.com/bodywork-ml/bodywork-pipeline-utils@v0.1.5 34 | cpu_request: 0.5 35 | memory_request_mb: 250 36 | service: 37 | max_startup_time_seconds: 180 38 | replicas: 2 39 | port: 8000 40 | ingress: true 41 | secrets: 42 | AWS_ACCESS_KEY_ID: aws-credentials 43 | AWS_SECRET_ACCESS_KEY: aws-credentials 44 | AWS_DEFAULT_REGION: aws-credentials 45 | logging: 46 | log_level: INFO 47 | -------------------------------------------------------------------------------- /flake8.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | filename = *.py 3 | exclude = 4 | __pycache__ 5 | max_line_length
= 89 6 | ignore = W503,W605 7 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | files = pipeline 3 | warn_return_any = True 4 | disallow_untyped_calls=True 5 | 6 | [mypy-uvicorn.*] 7 | ignore_missing_imports = True 8 | 9 | [mypy-pydantic.*] 10 | ignore_missing_imports = True 11 | 12 | [mypy-fastapi.*] 13 | ignore_missing_imports = True 14 | 15 | [mypy-sklearn.*] 16 | ignore_missing_imports = True 17 | 18 | [mypy-pandas.*] 19 | ignore_missing_imports = True 20 | -------------------------------------------------------------------------------- /notebooks/requirements_nb.txt: -------------------------------------------------------------------------------- 1 | jupyterlab==3.0.16 2 | seaborn==0.11.1 3 | numpy==1.21.0 4 | pandas==1.3.0 5 | scikit-learn==0.24.2 6 | boto3==1.17.101 7 | joblib==1.0.1 8 | -------------------------------------------------------------------------------- /pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bodywork-ml/ml-pipeline-engineering/521001735bddf166c75a2b6d72f7a71d5530ca6b/pipeline/__init__.py -------------------------------------------------------------------------------- /pipeline/serve_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | - Get model and load into memory. 3 | - Start web API server. 4 | """ 5 | import sys 6 | from enum import Enum 7 | from typing import Dict, Union 8 | 9 | import uvicorn 10 | from bodywork_pipeline_utils import aws, logging 11 | from fastapi import FastAPI, status 12 | from numpy import array 13 | from pydantic import BaseModel, Field 14 | 15 | from pipeline.train_model import PRODUCT_CODE_MAP 16 | 17 | app = FastAPI(debug=False) 18 | log = logging.configure_logger() 19 | 20 | 21 | class ProductCode(Enum): 22 | SKU001 = "SKU001" 23 | SKU002 = "SKU002" 24 | SKU003 = "SKU003" 25 | SKU004 = "SKU004" 26 | SKU005 = "SKU005" 27 | 28 | 29 | class Data(BaseModel): 30 | product_code: ProductCode 31 | orders_placed: float = Field(..., ge=0.0) 32 | 33 | 34 | class Prediction(BaseModel): 35 | est_hours_to_dispatch: float 36 | model_version: str 37 | 38 | 39 | @app.post( 40 | "/api/v0.1/time_to_dispatch", 41 | status_code=status.HTTP_200_OK, 42 | response_model=Prediction, 43 | ) 44 | def time_to_dispatch(data: Data) -> Dict[str, Union[str, float]]: 45 | features = array([[data.orders_placed, PRODUCT_CODE_MAP[data.product_code.value]]]) 46 | prediction = wrapped_model.model.predict(features).tolist()[0] 47 | return {"est_hours_to_dispatch": prediction, "model_version": str(wrapped_model)} 48 | 49 | 50 | if __name__ == "__main__": 51 | try: 52 | args = sys.argv 53 | s3_bucket = args[1] 54 | wrapped_model = aws.get_latest_pkl_model_from_s3(s3_bucket, "models") 55 | log.info(f"Successfully loaded model: {wrapped_model}") 56 | except IndexError: 57 | log.error("Invalid arguments passed to serve_model.py - expected S3_BUCKET") 58 | sys.exit(1) 59 | except Exception as e: 60 | log.error(f"Could not get latest model and start web server - {e}") 61 | sys.exit(1) 62 | uvicorn.run(app, host="0.0.0.0", workers=1) 63 | -------------------------------------------------------------------------------- /pipeline/train_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | - Download training dataset from AWS S3. 
3 | - Prepare data and train model. 4 | - Persist model to AWS S3. 5 | """ 6 | import sys 7 | from typing import Any, Dict, List, NamedTuple, Tuple 8 | 9 | from bodywork_pipeline_utils import aws, logging 10 | from bodywork_pipeline_utils.aws import Dataset 11 | from numpy import array, ndarray 12 | from pandas import DataFrame 13 | from sklearn.base import BaseEstimator 14 | from sklearn.model_selection import GridSearchCV, train_test_split 15 | from sklearn.metrics import mean_absolute_error, r2_score 16 | from sklearn.tree import DecisionTreeRegressor 17 | 18 | PRODUCT_CODE_MAP = {"SKU001": 0, "SKU002": 1, "SKU003": 2, "SKU004": 3, "SKU005": 4} 19 | HYPERPARAM_GRID = { 20 | "random_state": [42], 21 | "criterion": ["squared_error", "absolute_error"], 22 | "max_depth": [2, 4, 6, 8, 10, None], 23 | "min_samples_split": [2, 4, 6, 8, 10], 24 | "min_samples_leaf": [2, 4, 6, 8, 10], 25 | } 26 | 27 | log = logging.configure_logger() 28 | 29 | 30 | class FeatureAndLabels(NamedTuple): 31 | """Container for features and labels split by test and train sets.""" 32 | 33 | X_train: DataFrame 34 | X_test: DataFrame 35 | y_train: DataFrame 36 | y_test: DataFrame 37 | 38 | 39 | class TaskMetrics(NamedTuple): 40 | """Container for the task's performance metrics.""" 41 | 42 | r_squared: float 43 | mean_absolute_error: float 44 | 45 | 46 | def main( 47 | s3_bucket: str, 48 | metric_error_threshold: float, 49 | metric_warning_threshold: float, 50 | hyperparam_grid: Dict[str, Any], 51 | ) -> None: 52 | """Main training job.""" 53 | log.info("Starting train-model stage.") 54 | dataset = aws.get_latest_csv_dataset_from_s3(s3_bucket, "datasets") 55 | log.info(f"Retrieved dataset from s3://{s3_bucket}/{dataset.key}") 56 | 57 | feature_and_labels = prepare_data(dataset.data) 58 | model, metrics = train_model(feature_and_labels, hyperparam_grid) 59 | validate_trained_model_logic(model, feature_and_labels) 60 | log.info( 61 | f"Trained model: r-squared={metrics.r_squared:.3f}, " 62 | f"MAE={metrics.mean_absolute_error:.3f}" 63 | ) 64 | 65 | if metrics.r_squared >= metric_error_threshold: 66 | if metrics.r_squared < metric_warning_threshold: 67 | log.warning("Metrics breached warning threshold - check for drift.") 68 | s3_location = persist_model(s3_bucket, model, dataset, metrics) 69 | log.info(f"Model serialised and persisted to s3://{s3_location}") 70 | else: 71 | msg = ( 72 | f"r-squared metric ({metrics.r_squared:.3f}) is below deployment " 73 | f"threshold {metric_error_threshold}" 74 | ) 75 | raise RuntimeError(msg) 76 | 77 | 78 | def prepare_data(data: DataFrame) -> FeatureAndLabels: 79 | """Split the data into features and labels for training and testing.""" 80 | X = data.drop("hours_to_dispatch", axis=1) 81 | y = data["hours_to_dispatch"] 82 | X_train, X_test, y_train, y_test = train_test_split( 83 | X, y, test_size=0.2, stratify=data["product_code"].values, random_state=42 84 | ) 85 | return FeatureAndLabels(X_train, X_test, y_train, y_test) 86 | 87 | 88 | def train_model( 89 | data: FeatureAndLabels, hyperparam_grid: Dict[str, Any] 90 | ) -> Tuple[BaseEstimator, TaskMetrics]: 91 | """Train a model and compute performance metrics.""" 92 | grid_search = GridSearchCV( 93 | estimator=DecisionTreeRegressor(), 94 | param_grid=hyperparam_grid, 95 | scoring="r2", 96 | cv=5, 97 | refit=True, 98 | ) 99 | grid_search.fit(preprocess(data.X_train), data.y_train) 100 | best_model = grid_search.best_estimator_ 101 | y_test_pred = best_model.predict(preprocess(data.X_test)) 102 | performance_metrics = TaskMetrics( 103 | 
r2_score(data.y_test, y_test_pred), 104 | mean_absolute_error(data.y_test, y_test_pred), 105 | ) 106 | return (best_model, performance_metrics) 107 | 108 | 109 | def validate_trained_model_logic(model: BaseEstimator, data: FeatureAndLabels) -> None: 110 | """Verify that a trained model passes basic logical expectations.""" 111 | issues_detected: List[str] = [] 112 | 113 | orders_placed_sensitivity_checks = [ 114 | model.predict(array([[100, product], [150, product]])).tolist() 115 | for product in range(len(PRODUCT_CODE_MAP)) 116 | ] 117 | if not all(e[0] < e[1] for e in orders_placed_sensitivity_checks): 118 | issues_detected.append( 119 | "hours_to_dispatch predictions do not increase with orders_placed" 120 | ) 121 | 122 | test_set_predictions = model.predict(preprocess(data.X_test)).reshape(-1) 123 | if len(test_set_predictions[test_set_predictions < 0]) > 0: 124 | issues_detected.append( 125 | "negative hours_to_dispatch predictions found for test set" 126 | ) 127 | if len(test_set_predictions[test_set_predictions > data.y_test.max() * 1.25]) > 0: 128 | issues_detected.append( 129 | "outlier hours_to_dispatch predictions found for test set" 130 | ) 131 | 132 | if issues_detected: 133 | msg = "Trained model failed verification: " + ", ".join(issues_detected) + "." 134 | raise RuntimeError(msg) 135 | 136 | 137 | def preprocess(df: DataFrame) -> ndarray: 138 | """Create features for training model.""" 139 | processed = df.copy() 140 | processed["product_code"] = df["product_code"].apply(lambda e: PRODUCT_CODE_MAP[e]) 141 | return processed.values 142 | 143 | 144 | def persist_model( 145 | bucket: str, model: BaseEstimator, dataset: Dataset, metrics: TaskMetrics 146 | ) -> str: 147 | """Persist the model and metadata to S3.""" 148 | metadata = { 149 | "r_squared": metrics.r_squared, 150 | "mean_absolute_error": metrics.mean_absolute_error, 151 | } 152 | wrapped_model = aws.Model("time-to-dispatch", model, dataset, metadata) 153 | s3_location = wrapped_model.put_model_to_s3(bucket, "models") 154 | return s3_location 155 | 156 | 157 | if __name__ == "__main__": 158 | try: 159 | args = sys.argv 160 | s3_bucket = args[1] 161 | r2_metric_error_threshold = float(args[2]) 162 | if r2_metric_error_threshold <= 0 or r2_metric_error_threshold > 1: 163 | raise ValueError() 164 | r2_metric_warning_threshold = float(args[3]) 165 | if r2_metric_warning_threshold <= 0 or r2_metric_warning_threshold > 1: 166 | raise ValueError() 167 | except (ValueError, IndexError): 168 | log.error( 169 | "Invalid arguments passed to train_model.py. " 170 | "Expected S3_BUCKET R_SQUARED_ERROR_THRESHOLD R_SQUARED_WARNING_THRESHOLD, " 171 | "where all thresholds must be in the range (0, 1]."
172 | ) 173 | sys.exit(1) 174 | 175 | try: 176 | main( 177 | s3_bucket, 178 | r2_metric_error_threshold, 179 | r2_metric_warning_threshold, 180 | HYPERPARAM_GRID, 181 | ) 182 | except Exception as e: 183 | log.error(f"Error encountered when training model - {e}") 184 | sys.exit(1) 185 | -------------------------------------------------------------------------------- /requirements_cicd.txt: -------------------------------------------------------------------------------- 1 | bodywork>=3.0 2 | tox==3.23.1 3 | pytest==6.2.4 4 | mypy==0.910 5 | flake8==3.9.2 6 | -------------------------------------------------------------------------------- /requirements_pipe.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.21.0 2 | pandas>=1.2.5 3 | scikit-learn>=1.0.0 4 | boto3>=1.17.101 5 | joblib>=1.0.1 6 | fastapi>=0.65.2 7 | uvicorn>=0.14.0 8 | git+https://github.com/bodywork-ml/bodywork-pipeline-utils@v0.1.5 9 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bodywork-ml/ml-pipeline-engineering/521001735bddf166c75a2b6d72f7a71d5530ca6b/tests/__init__.py -------------------------------------------------------------------------------- /tests/resources/dataset.csv: -------------------------------------------------------------------------------- 1 | hours_to_dispatch,orders_placed,product_code 2 | 15.862958518302616,112.0,SKU004 3 | 2.929956101665746,7.0,SKU003 4 | 7.021039716106525,63.0,SKU003 5 | 13.71305161611768,100.0,SKU003 6 | 18.53816179488018,173.0,SKU002 7 | 9.935676120715334,93.0,SKU002 8 | 8.245560909208793,66.0,SKU002 9 | 17.39642656594881,141.0,SKU003 10 | 0.1629555631196309,0.0,SKU003 11 | 11.009943991017714,108.0,SKU002 12 | 12.385968056460712,94.0,SKU003 13 | 17.262157238784948,191.0,SKU002 14 | 7.410422376401399,75.0,SKU002 15 | 2.917672665590633,21.0,SKU002 16 | 5.03491940063904,36.0,SKU004 17 | 17.970246531347822,128.0,SKU003 18 | 2.5436340623018943,32.0,SKU001 19 | 23.088489288479785,226.0,SKU002 20 | 2.4496465326457493,12.0,SKU001 21 | 1.6404646888147605,10.0,SKU004 22 | 5.762189514575706,34.0,SKU003 23 | 4.372671021799478,38.0,SKU002 24 | 4.230965820231027,36.0,SKU002 25 | 11.433358787587952,123.0,SKU002 26 | 7.49790581653461,85.0,SKU001 27 | 36.97888907606447,251.0,SKU004 28 | 10.85091550416944,88.0,SKU003 29 | 1.8777761818938097,4.0,SKU004 30 | 13.361167140713476,137.0,SKU002 31 | 11.6266091206628,75.0,SKU004 32 | 5.578331826429631,75.0,SKU001 33 | 26.30158502391177,150.0,SKU005 34 | 22.81845261384039,131.0,SKU005 35 | 24.984630208520024,208.0,SKU003 36 | 20.17488784897143,157.0,SKU003 37 | 13.214870543481592,84.0,SKU004 38 | 1.0956526049995454,13.0,SKU003 39 | 2.307616717267485,37.0,SKU001 40 | 10.797471455676334,99.0,SKU002 41 | 6.338436652300741,66.0,SKU002 42 | 9.194911019061465,61.0,SKU004 43 | 7.238964678187514,81.0,SKU002 44 | 10.154996427291902,68.0,SKU004 45 | 10.729288708038965,93.0,SKU003 46 | 7.541237178053222,65.0,SKU003 47 | 8.90378515553683,129.0,SKU001 48 | 12.160720282978485,113.0,SKU002 49 | 11.6988415098023,174.0,SKU001 50 | 34.28573487530505,194.0,SKU005 51 | 0.3017043397731734,7.0,SKU001 52 | 2.465330978365856,7.0,SKU004 53 | 3.140692082573108,16.0,SKU004 54 | 3.102564136245278,58.0,SKU001 55 | 17.470765997504852,144.0,SKU003 56 | 4.018416528334628,23.0,SKU005 57 | 7.16991009064842,103.0,SKU001 58 | 1.1612917662269222,2.0,SKU003 59 | 
6.7986689707683805,88.0,SKU001 60 | 6.289491143317837,65.0,SKU002 61 | 3.704708529000893,18.0,SKU004 62 | 4.995678312387475,62.0,SKU002 63 | 10.722982934677388,110.0,SKU002 64 | 25.68441194876899,145.0,SKU005 65 | 13.415470182036998,108.0,SKU003 66 | 31.552245831101317,231.0,SKU004 67 | 6.026671572856513,57.0,SKU002 68 | 4.847926245866279,29.0,SKU004 69 | 0.0,3.0,SKU003 70 | 0.0,16.0,SKU001 71 | 12.028813194046224,127.0,SKU002 72 | 23.093995566923944,186.0,SKU003 73 | 3.691138626559598,25.0,SKU004 74 | 12.729481813819286,118.0,SKU003 75 | 1.297944679294036,15.0,SKU001 76 | 8.393653619155266,59.0,SKU003 77 | 16.011341279509153,100.0,SKU004 78 | 2.59478210118922,14.0,SKU002 79 | 4.392437161679324,56.0,SKU002 80 | 16.747848002415914,91.0,SKU005 81 | 0.8144907052504978,12.0,SKU004 82 | 27.056817791887987,150.0,SKU005 83 | 6.164313702554062,39.0,SKU005 84 | 1.6818942169802238,18.0,SKU001 85 | 4.120380418437928,17.0,SKU004 86 | 8.109204507144886,72.0,SKU002 87 | 20.84228802373505,145.0,SKU004 88 | 0.0,7.0,SKU001 89 | 1.9303395046119096,8.0,SKU004 90 | 6.864448063592905,37.0,SKU004 91 | 10.08124618690896,55.0,SKU005 92 | 0.0,1.0,SKU004 93 | 20.255414235346567,116.0,SKU005 94 | 20.27826536328478,113.0,SKU005 95 | 3.861381932938465,32.0,SKU003 96 | 0.0,8.0,SKU001 97 | 5.8455688969343855,27.0,SKU005 98 | 9.083784010790072,48.0,SKU004 99 | 9.87952135202481,48.0,SKU005 100 | 2.6174075675727293,27.0,SKU005 101 | 6.203738491360637,29.0,SKU005 102 | 8.320419154749057,77.0,SKU002 103 | 3.8520099232330023,45.0,SKU002 104 | 4.93971791098266,69.0,SKU002 105 | 7.606719621952104,77.0,SKU002 106 | 4.322610703110861,32.0,SKU003 107 | 28.18541965546742,167.0,SKU005 108 | 1.720982427383454,4.0,SKU003 109 | 4.777492816372326,24.0,SKU004 110 | 10.95788021409431,140.0,SKU001 111 | 16.208974305913703,141.0,SKU003 112 | 0.1172714887269685,4.0,SKU002 113 | 18.118738850841535,174.0,SKU002 114 | 1.3813963029728922,5.0,SKU004 115 | 3.0481919314253108,21.0,SKU005 116 | 3.28366172415688,51.0,SKU001 117 | 7.302468976467001,64.0,SKU003 118 | 8.830355119892593,52.0,SKU005 119 | 22.858916758577003,133.0,SKU005 120 | 6.442199901968948,57.0,SKU003 121 | 8.893435364926903,72.0,SKU003 122 | 8.095938693777509,64.0,SKU003 123 | 11.30001445966552,82.0,SKU003 124 | 0.181166247256127,20.0,SKU002 125 | 2.7443008606581576,9.0,SKU005 126 | 7.372461748755517,46.0,SKU005 127 | 19.5343700519312,177.0,SKU002 128 | 3.2808828198587365,15.0,SKU005 129 | 16.359765113839835,126.0,SKU003 130 | 12.037823070633085,162.0,SKU001 131 | 19.956356322969143,102.0,SKU005 132 | 8.199840510813804,110.0,SKU001 133 | 1.7400091164323852,3.0,SKU001 134 | 8.165242435330812,87.0,SKU002 135 | 2.2482671589296355,10.0,SKU004 136 | 7.987951661708194,35.0,SKU005 137 | 12.807445040702358,137.0,SKU002 138 | 5.048276258279048,46.0,SKU002 139 | 4.345552232001843,67.0,SKU001 140 | 17.32362823757579,97.0,SKU005 141 | 5.650649709987079,38.0,SKU002 142 | 6.864084520010048,62.0,SKU003 143 | 8.885887871200717,87.0,SKU002 144 | 8.112742414672743,72.0,SKU003 145 | 6.633067863582399,45.0,SKU004 146 | 5.819063640723796,42.0,SKU003 147 | 18.46566910492376,150.0,SKU003 148 | 5.366660348261927,51.0,SKU001 149 | 1.5839651494174565,11.0,SKU002 150 | 2.657766184388877,27.0,SKU004 151 | 2.683487766960106,24.0,SKU002 152 | 14.485046919181697,104.0,SKU003 153 | 1.851738939873872,10.0,SKU003 154 | 4.715906415247824,45.0,SKU002 155 | 8.585069795505504,59.0,SKU004 156 | 6.8041695442719865,46.0,SKU004 157 | 14.533763788564467,194.0,SKU001 158 | 4.136377812066431,53.0,SKU003 159 | 
14.251391961387188,110.0,SKU003 160 | 3.183318958593049,29.0,SKU001 161 | 8.494530369337136,92.0,SKU002 162 | 12.877091420512322,101.0,SKU003 163 | 2.8080978122708906,25.0,SKU002 164 | 1.9315612620939957,7.0,SKU004 165 | 26.17637031354232,150.0,SKU005 166 | 0.7825204597180833,3.0,SKU001 167 | 5.903697981354479,57.0,SKU003 168 | 4.72506587800898,77.0,SKU001 169 | 19.48239101805117,127.0,SKU004 170 | 1.2996810030889012,10.0,SKU004 171 | 1.8792779461315847,23.0,SKU004 172 | 20.558065422866484,135.0,SKU004 173 | 0.0,4.0,SKU005 174 | 5.60188630406615,29.0,SKU002 175 | 11.03690953317154,132.0,SKU002 176 | 8.391763080926554,48.0,SKU005 177 | 19.748068796545788,209.0,SKU002 178 | 1.7241468713500514,5.0,SKU004 179 | 11.61654111751677,120.0,SKU002 180 | 3.48922451701595,33.0,SKU001 181 | 11.89580111383792,111.0,SKU002 182 | 0.1865320899776454,17.0,SKU001 183 | 5.128498355662528,47.0,SKU003 184 | 1.6668946462343506,27.0,SKU001 185 | 8.408407905276698,113.0,SKU001 186 | 18.620942005652573,125.0,SKU004 187 | 7.018946453282507,42.0,SKU004 188 | 9.895436698188028,64.0,SKU004 189 | 3.054328694049185,49.0,SKU001 190 | 43.49681039806271,254.0,SKU005 191 | 16.560091311762058,116.0,SKU004 192 | 15.12449855774384,99.0,SKU004 193 | 5.345904724657064,74.0,SKU001 194 | 6.420714569387397,35.0,SKU003 195 | 6.916502321597927,55.0,SKU003 196 | 3.251430487508451,31.0,SKU002 197 | 15.666749577934503,140.0,SKU003 198 | 4.263488365568152,37.0,SKU004 199 | 13.940034447009156,86.0,SKU005 200 | 11.289691370036754,118.0,SKU002 201 | 0.0,5.0,SKU005 202 | 17.717033034360902,108.0,SKU005 203 | 7.742302908251123,47.0,SKU004 204 | 1.3475912569832134,29.0,SKU001 205 | 8.55296706024533,115.0,SKU001 206 | 1.305508611597023,10.0,SKU001 207 | 2.922826228648638,16.0,SKU003 208 | 12.948165729627428,169.0,SKU001 209 | 16.24260115669779,106.0,SKU004 210 | 24.2940978550361,187.0,SKU003 211 | 1.0240875793413569,8.0,SKU001 212 | 24.60215877133846,135.0,SKU005 213 | 2.4888804901190773,31.0,SKU004 214 | 18.473360212539777,124.0,SKU004 215 | 5.613173819183727,56.0,SKU002 216 | 14.0770503392745,95.0,SKU004 217 | 10.742639960969846,61.0,SKU005 218 | 13.883324615697909,95.0,SKU004 219 | 10.082667874441793,93.0,SKU002 220 | 2.7302920824662404,22.0,SKU004 221 | 0.0638448704277721,2.0,SKU004 222 | 5.82934797043336,44.0,SKU002 223 | 13.281088574206432,80.0,SKU004 224 | 2.1282929240819475,16.0,SKU003 225 | 28.01153931325399,191.0,SKU004 226 | 11.222612584820242,171.0,SKU001 227 | 9.38819181841278,127.0,SKU001 228 | 10.310432067559262,78.0,SKU003 229 | 12.18259733641105,86.0,SKU004 230 | 5.693388940692294,32.0,SKU005 231 | 4.367692922599419,31.0,SKU004 232 | 9.123814854821635,145.0,SKU001 233 | 3.545650139847896,34.0,SKU002 234 | 8.411459375510937,64.0,SKU003 235 | 0.2668927482994054,1.0,SKU002 236 | 9.680960511847315,46.0,SKU005 237 | 3.881656884748386,28.0,SKU004 238 | 35.48713804128805,205.0,SKU005 239 | 13.033645815260677,109.0,SKU002 240 | 3.46593097775765,31.0,SKU004 241 | 25.899484686297257,152.0,SKU005 242 | 4.9398573782398,61.0,SKU001 243 | 6.928966596167616,77.0,SKU001 244 | 23.55935985889228,181.0,SKU003 245 | 5.745663961154962,90.0,SKU001 246 | 16.984222291576053,205.0,SKU001 247 | 4.115665435322434,37.0,SKU002 248 | 8.644985453993431,126.0,SKU001 249 | 1.1250851342805293,25.0,SKU001 250 | 7.182435249111576,71.0,SKU002 251 | 5.555570863823387,45.0,SKU001 252 | 4.037866614978047,51.0,SKU001 253 | 9.639194237931838,60.0,SKU003 254 | 9.739015316212305,118.0,SKU001 255 | 0.0,4.0,SKU001 256 | 3.1796620953989443,14.0,SKU003 257 | 
0.6807187736959351,16.0,SKU002 258 | 1.6829389426859902,31.0,SKU001 259 | 21.022362043649217,119.0,SKU005 260 | 1.7930007607038714,18.0,SKU002 261 | 3.999007274494224,31.0,SKU002 262 | 10.094864276173404,128.0,SKU001 263 | 2.497468308019819,21.0,SKU003 264 | 10.675191572399372,90.0,SKU002 265 | 3.430929659806873,31.0,SKU003 266 | 36.77393478062781,216.0,SKU005 267 | 1.9740924868564145,12.0,SKU003 268 | 16.16131968408653,108.0,SKU004 269 | 3.2089227621304808,24.0,SKU003 270 | 5.4802625732311405,47.0,SKU002 271 | 11.5264444639151,94.0,SKU003 272 | 23.33960922381077,172.0,SKU003 273 | 14.272877042744264,93.0,SKU004 274 | 0.2748816211286949,5.0,SKU004 275 | 4.004430822179706,46.0,SKU001 276 | 10.68945824251119,108.0,SKU002 277 | 3.991644872275173,52.0,SKU001 278 | 0.4236529053558329,10.0,SKU004 279 | 20.81919676576796,174.0,SKU003 280 | 10.517626424751372,63.0,SKU004 281 | 33.401244733679384,198.0,SKU005 282 | 10.65645357217296,86.0,SKU003 283 | 5.258560603316176,66.0,SKU001 284 | 3.117430510947059,14.0,SKU003 285 | 7.55262341169911,46.0,SKU005 286 | 21.558817494534672,139.0,SKU004 287 | 6.936986600380907,71.0,SKU002 288 | 2.475021545341073,47.0,SKU002 289 | 0.477741327328371,4.0,SKU002 290 | 1.893333180210396,13.0,SKU004 291 | 8.348717378832655,79.0,SKU002 292 | 3.9475597887610743,26.0,SKU005 293 | 20.42162643513978,140.0,SKU004 294 | 7.768889806193853,66.0,SKU002 295 | 4.373832948584255,63.0,SKU001 296 | 31.310320195083776,221.0,SKU004 297 | 14.434994539118012,93.0,SKU004 298 | 2.5178776300730297,13.0,SKU003 299 | 6.260557045901718,54.0,SKU002 300 | 6.154243238864765,90.0,SKU001 301 | 5.273493497389223,30.0,SKU005 302 | 19.823095019520395,129.0,SKU004 303 | 11.63178066428944,61.0,SKU005 304 | 24.52893312840891,137.0,SKU005 305 | 3.2238168988686478,14.0,SKU005 306 | 0.1059674203300821,7.0,SKU003 307 | 0.0,11.0,SKU005 308 | 13.232637968473709,122.0,SKU002 309 | 0.4681796091973343,14.0,SKU002 310 | 6.514809282151354,89.0,SKU001 311 | 0.8567851158846693,1.0,SKU001 312 | 2.929787838543067,17.0,SKU005 313 | 9.991489893624628,53.0,SKU005 314 | 9.549325650185471,57.0,SKU003 315 | 16.786123468541096,93.0,SKU005 316 | 2.554598428233418,19.0,SKU005 317 | 15.284380144810132,96.0,SKU005 318 | 17.97743094135668,115.0,SKU004 319 | 26.2580561112093,175.0,SKU004 320 | 10.77629434691818,97.0,SKU003 321 | 21.52210726793813,163.0,SKU003 322 | 10.272529586942888,95.0,SKU002 323 | 0.6911694010466809,6.0,SKU003 324 | 4.115855753886536,19.0,SKU002 325 | 24.942246883632887,137.0,SKU005 326 | 7.736777298153168,70.0,SKU003 327 | 10.541811838821062,75.0,SKU003 328 | 8.001689159124139,62.0,SKU003 329 | 4.378094806449489,25.0,SKU002 330 | 7.960547028612911,70.0,SKU002 331 | 8.727978911634546,110.0,SKU001 332 | 2.561818093183416,21.0,SKU002 333 | 6.374381476306286,76.0,SKU002 334 | 2.7060322142191446,21.0,SKU004 335 | 21.142138765262786,167.0,SKU003 336 | 0.0,11.0,SKU002 337 | 8.664366134865414,66.0,SKU003 338 | 1.499326842652462,18.0,SKU003 339 | 10.270422330139228,71.0,SKU004 340 | 10.87595640536798,81.0,SKU003 341 | 2.197205641248261,28.0,SKU002 342 | 22.57036261654559,177.0,SKU003 343 | 10.299638264316016,135.0,SKU001 344 | 3.6463702419894863,63.0,SKU001 345 | 6.208309508959573,36.0,SKU004 346 | 3.4964592679113657,26.0,SKU002 347 | 10.694545024558856,65.0,SKU004 348 | 7.8888239061412735,80.0,SKU001 349 | 1.5803965088066247,20.0,SKU002 350 | 10.726491532959418,101.0,SKU003 351 | 7.36458281748981,80.0,SKU002 352 | 12.68781544067522,67.0,SKU005 353 | 17.454153348800556,175.0,SKU002 354 | 9.19307549293427,85.0,SKU003 355 
| 0.0,3.0,SKU005 356 | 7.629607753791996,41.0,SKU004 357 | 5.908881815462458,53.0,SKU002 358 | 12.53220533953272,87.0,SKU004 359 | 13.558186702259178,117.0,SKU002 360 | 10.53547245528302,105.0,SKU001 361 | 11.157766928714867,135.0,SKU001 362 | 0.9732493949071142,2.0,SKU003 363 | 5.941217265855522,72.0,SKU001 364 | 4.723018952386578,22.0,SKU005 365 | 6.386781230091902,25.0,SKU005 366 | 4.367105778965916,53.0,SKU002 367 | 34.32838125983244,231.0,SKU004 368 | 19.56394138159653,112.0,SKU005 369 | 7.928146450661046,74.0,SKU002 370 | 16.73860607656345,101.0,SKU004 371 | 7.13573485473717,90.0,SKU001 372 | 3.7632148358356634,31.0,SKU002 373 | 10.296954953315964,64.0,SKU004 374 | 5.50347973528762,44.0,SKU003 375 | 3.20307836891112,29.0,SKU003 376 | 15.414513616181582,88.0,SKU005 377 | 8.670451909348959,62.0,SKU004 378 | 0.5407870659046616,9.0,SKU003 379 | 7.947779554441141,43.0,SKU005 380 | 4.739077425445343,52.0,SKU002 381 | 12.730005392516968,109.0,SKU002 382 | 26.184756356786984,174.0,SKU004 383 | 10.315484508738212,60.0,SKU004 384 | 6.650502637018809,53.0,SKU004 385 | 2.1304580117125997,22.0,SKU004 386 | 4.180408299913875,34.0,SKU003 387 | 10.009547151845608,73.0,SKU003 388 | 55.50575199057534,320.0,SKU005 389 | 10.389779530582286,84.0,SKU003 390 | 2.7272197205794484,17.0,SKU002 391 | 4.971531214342614,31.0,SKU005 392 | 4.9500338357202205,46.0,SKU003 393 | 12.768715521893444,80.0,SKU004 394 | 8.1960111944939,76.0,SKU003 395 | 15.479118782265914,211.0,SKU001 396 | 8.364292512651044,47.0,SKU005 397 | 11.62696224890297,89.0,SKU003 398 | 9.772730638533464,72.0,SKU003 399 | 6.626126506983514,56.0,SKU002 400 | 12.938477889427045,88.0,SKU004 401 | 6.662577133623284,38.0,SKU004 402 | 12.572400025149328,121.0,SKU002 403 | 0.9324498126785576,2.0,SKU002 404 | 3.938606597014239,61.0,SKU001 405 | 5.3252655428017,27.0,SKU005 406 | 22.522449861471024,151.0,SKU004 407 | 16.4407457036259,164.0,SKU002 408 | 14.389248682810274,142.0,SKU002 409 | 18.191756214698653,92.0,SKU005 410 | 7.775962057300561,87.0,SKU002 411 | 9.194193567378449,107.0,SKU001 412 | 8.963461306180786,43.0,SKU005 413 | 9.963916005938966,122.0,SKU001 414 | 2.869920156415172,27.0,SKU003 415 | 40.08993933675594,256.0,SKU004 416 | 0.0,1.0,SKU002 417 | 2.4514119960532006,10.0,SKU005 418 | 1.5988384098510506,20.0,SKU001 419 | 7.364300571839185,52.0,SKU004 420 | 1.8543966387112796,9.0,SKU004 421 | 4.725315904253543,28.0,SKU002 422 | 4.641195684533036,29.0,SKU004 423 | 12.340885633184662,104.0,SKU002 424 | 28.84258208205075,232.0,SKU003 425 | 1.0439279716620553,10.0,SKU002 426 | 22.762194964508183,320.0,SKU001 427 | 5.815890445572074,42.0,SKU004 428 | 8.781362958463662,60.0,SKU004 429 | 4.310224690968447,31.0,SKU004 430 | 4.969331145347111,59.0,SKU002 431 | 4.254881986637297,29.0,SKU005 432 | 8.036536514774797,83.0,SKU002 433 | 8.121644694079865,92.0,SKU001 434 | 16.35042997795203,178.0,SKU002 435 | 9.914110021984513,80.0,SKU004 436 | 16.966486521467324,150.0,SKU003 437 | 7.589348213086975,112.0,SKU001 438 | 8.154114634538775,60.0,SKU003 439 | 11.191335506260994,154.0,SKU001 440 | 3.0811150841864614,46.0,SKU001 441 | 0.9850673258902936,9.0,SKU004 442 | 9.565600308525944,114.0,SKU002 443 | 1.6804942877016738,5.0,SKU004 444 | 45.806010770907264,297.0,SKU004 445 | 6.154968397708967,43.0,SKU004 446 | 7.47368124335578,59.0,SKU003 447 | 4.608152607705204,60.0,SKU001 448 | 6.042119282862837,41.0,SKU005 449 | 0.8142454515864972,13.0,SKU002 450 | 5.515671274408636,25.0,SKU004 451 | 4.636951342237169,62.0,SKU001 452 | 1.6989431993082362,15.0,SKU002 453 | 
13.939385046429257,84.0,SKU005 454 | 16.633702727796386,98.0,SKU005 455 | 1.3748868282729232,5.0,SKU004 456 | 7.790245949725851,49.0,SKU005 457 | 13.570577248505932,68.0,SKU005 458 | 17.01401232607066,186.0,SKU002 459 | 5.153621388887575,58.0,SKU002 460 | 4.520454762150347,40.0,SKU004 461 | 5.584374934591344,57.0,SKU001 462 | 8.667599129124753,58.0,SKU004 463 | 7.663320362290976,64.0,SKU003 464 | 4.172983800174736,22.0,SKU005 465 | 4.553141616040267,42.0,SKU001 466 | 8.461516907921657,82.0,SKU002 467 | 9.455084768902928,66.0,SKU003 468 | 15.282702340161531,125.0,SKU003 469 | 10.744775183731608,67.0,SKU005 470 | 16.59705587021233,164.0,SKU002 471 | 4.025060565556266,63.0,SKU001 472 | 4.168071143236177,35.0,SKU002 473 | 2.9989485785996948,19.0,SKU005 474 | 0.2141547706695837,2.0,SKU003 475 | 9.524654002871673,67.0,SKU005 476 | 21.451581292246857,144.0,SKU004 477 | 10.141468104171029,83.0,SKU003 478 | 14.274431674612371,75.0,SKU005 479 | 0.8862518385459388,0.0,SKU001 480 | 1.9488519968818392,26.0,SKU001 481 | 6.776616058908596,112.0,SKU001 482 | 3.303929832236766,25.0,SKU004 483 | 7.973493504138554,68.0,SKU002 484 | 4.02759904129777,56.0,SKU001 485 | 19.394191500289462,136.0,SKU004 486 | 9.228460880087797,108.0,SKU001 487 | 3.627279515882541,35.0,SKU004 488 | 4.344678744879769,59.0,SKU001 489 | 28.20286711748844,276.0,SKU002 490 | 18.971122155401016,108.0,SKU005 491 | 7.296003311530962,114.0,SKU001 492 | 24.112720092003546,143.0,SKU005 493 | 0.0,4.0,SKU001 494 | 3.905275120785187,31.0,SKU002 495 | 31.92699363718175,181.0,SKU005 496 | 0.6098164064947426,7.0,SKU004 497 | 15.660564064690233,203.0,SKU001 498 | 4.857893408942699,75.0,SKU001 499 | 9.896842908650775,61.0,SKU004 500 | 1.2646853764956043,8.0,SKU002 501 | 10.485850980053828,86.0,SKU003 502 | 3.69085076264708,51.0,SKU001 503 | 16.938736363478096,115.0,SKU004 504 | 4.0713179504656765,25.0,SKU003 505 | 2.252173725602784,27.0,SKU001 506 | 2.407723683751742,15.0,SKU005 507 | 10.50703090160446,77.0,SKU004 508 | 1.5903262544325123,6.0,SKU005 509 | 3.480364362983944,29.0,SKU002 510 | 16.982540190136135,140.0,SKU003 511 | 0.7737568495302025,8.0,SKU004 512 | 8.994570133390438,121.0,SKU001 513 | 2.1000103393854936,43.0,SKU001 514 | 17.164833691020714,137.0,SKU003 515 | 5.489790319590373,41.0,SKU005 516 | 14.429071269999652,90.0,SKU005 517 | 11.351802007447793,79.0,SKU004 518 | 16.48709668692267,114.0,SKU004 519 | 4.099041746598781,21.0,SKU005 520 | 4.4229427407808375,28.0,SKU004 521 | 13.246496020579574,68.0,SKU005 522 | 29.829938091965712,238.0,SKU003 523 | 1.2958095889074637,7.0,SKU004 524 | 30.51302566618488,181.0,SKU005 525 | 3.992880482046787,34.0,SKU004 526 | 6.221340254182712,82.0,SKU001 527 | 3.15362015553725,41.0,SKU001 528 | 0.6744036444229824,4.0,SKU002 529 | 7.25127964492274,34.0,SKU005 530 | 1.6227147774633446,27.0,SKU002 531 | 11.730431304053536,57.0,SKU005 532 | 3.990540523245081,37.0,SKU004 533 | 12.447830636294984,69.0,SKU005 534 | 10.203780436283118,153.0,SKU001 535 | 7.796424623983194,47.0,SKU004 536 | 6.870359730082846,111.0,SKU001 537 | 18.171554597867797,139.0,SKU004 538 | 22.78913384088108,129.0,SKU005 539 | 30.673808972572395,176.0,SKU005 540 | 14.588719419402594,180.0,SKU001 541 | 8.032853049019158,64.0,SKU003 542 | 31.46691083409173,184.0,SKU005 543 | 21.624066078629284,177.0,SKU003 544 | 10.9170856906305,51.0,SKU004 545 | 6.059236930192439,55.0,SKU003 546 | 3.015091853405324,33.0,SKU001 547 | 2.2231789273582274,18.0,SKU002 548 | 1.941026196031544,14.0,SKU002 549 | 5.147735011932456,62.0,SKU001 550 | 0.0,2.0,SKU002 551 | 
0.0,11.0,SKU001 552 | 23.28422603694228,132.0,SKU005 553 | 0.6825809570000236,6.0,SKU003 554 | 17.999180637375808,179.0,SKU002 555 | 34.17138148592573,192.0,SKU005 556 | 0.6121960215110213,10.0,SKU002 557 | 15.452139894769374,107.0,SKU004 558 | 4.74997962778599,51.0,SKU001 559 | 5.27139857940564,75.0,SKU001 560 | 0.0182848903759793,3.0,SKU005 561 | 6.2522744205333725,46.0,SKU004 562 | 13.30320367694808,102.0,SKU003 563 | 29.03587403710914,195.0,SKU004 564 | 32.88652482117852,204.0,SKU004 565 | 6.625614493080897,53.0,SKU003 566 | 12.834975141078669,112.0,SKU003 567 | 7.513248406283697,53.0,SKU004 568 | 5.420861632513154,48.0,SKU002 569 | 10.503333664123913,111.0,SKU002 570 | 7.673479071093201,51.0,SKU004 571 | 12.59506737700984,116.0,SKU002 572 | 2.3985363726032856,10.0,SKU003 573 | 18.111252062818885,149.0,SKU003 574 | 10.423127651768716,107.0,SKU002 575 | 7.596733186262165,60.0,SKU003 576 | 5.841498473238032,60.0,SKU001 577 | 5.034210778801493,49.0,SKU003 578 | 4.090750907544717,25.0,SKU004 579 | 19.358753681668283,110.0,SKU005 580 | 6.981936885031976,79.0,SKU002 581 | 3.491469329534961,23.0,SKU002 582 | 17.475123332162237,132.0,SKU003 583 | 7.217509200758076,45.0,SKU003 584 | 19.958733791150248,107.0,SKU005 585 | 24.968684848432016,143.0,SKU005 586 | 6.670960160594753,40.0,SKU004 587 | 7.433014392348529,95.0,SKU001 588 | 9.209661692452968,67.0,SKU004 589 | 4.980452733589054,63.0,SKU002 590 | 14.15008093283544,110.0,SKU003 591 | 13.409421784804092,89.0,SKU004 592 | 3.702855792950124,32.0,SKU003 593 | 10.420397433087096,84.0,SKU003 594 | 1.995271893760099,2.0,SKU004 595 | 0.0,13.0,SKU004 596 | 10.49552514624718,56.0,SKU005 597 | 5.60372634389699,63.0,SKU001 598 | 15.146479015867802,81.0,SKU005 599 | 7.635422068987944,120.0,SKU001 600 | 8.39059684339341,58.0,SKU004 601 | 0.1630920800438741,7.0,SKU001 602 | 8.248994967111672,80.0,SKU002 603 | 8.32338765086459,54.0,SKU005 604 | 16.88627770526568,94.0,SKU005 605 | 19.114838284480825,114.0,SKU005 606 | 0.5046580051989842,6.0,SKU003 607 | 15.467590543956096,113.0,SKU003 608 | 26.000559907780783,168.0,SKU004 609 | 8.162900781962719,54.0,SKU004 610 | 0.5973080227421457,16.0,SKU003 611 | 0.670757196613704,2.0,SKU005 612 | 15.951858526065324,124.0,SKU003 613 | 0.05728405150732,29.0,SKU001 614 | 27.97797085402032,158.0,SKU005 615 | 1.0154538977614134,17.0,SKU003 616 | 14.090707924750255,104.0,SKU004 617 | 1.0019533462906232,17.0,SKU003 618 | 0.9374933296478438,31.0,SKU002 619 | 3.6651019026820735,33.0,SKU003 620 | 26.1795960276642,155.0,SKU005 621 | 2.025698736307619,7.0,SKU003 622 | 27.81427437344677,188.0,SKU004 623 | 2.627959247436115,45.0,SKU001 624 | 6.5625302586566985,82.0,SKU002 625 | 14.78871315777592,87.0,SKU005 626 | 11.758628596666425,69.0,SKU004 627 | 1.7199578142947116,2.0,SKU002 628 | 14.798700051197551,121.0,SKU003 629 | 3.560031851972313,36.0,SKU002 630 | 11.43879487052075,143.0,SKU001 631 | 2.9543626258469624,40.0,SKU002 632 | 1.00884448501833,6.0,SKU002 633 | 9.25435527046808,55.0,SKU003 634 | 5.428411204444368,37.0,SKU003 635 | 15.977006774214306,87.0,SKU005 636 | 12.89264750424594,92.0,SKU003 637 | 1.1370251077375266,2.0,SKU002 638 | 1.5507611088589917,24.0,SKU001 639 | 6.275533543290628,60.0,SKU003 640 | 23.25048278836277,134.0,SKU005 641 | 2.8942606555853905,22.0,SKU005 642 | 21.12947863125401,214.0,SKU002 643 | 4.878522460941999,46.0,SKU003 644 | 2.9141375129800897,26.0,SKU003 645 | 1.3837131099243467,16.0,SKU005 646 | 11.727823135150278,99.0,SKU003 647 | 1.2912292974666384,9.0,SKU002 648 | 13.50050534481568,171.0,SKU001 649 | 
1.4968755687044806,11.0,SKU002 650 | 5.456730315985535,47.0,SKU003 651 | 18.257702061661604,181.0,SKU002 652 | 12.712016043052971,72.0,SKU004 653 | 4.112928732189596,32.0,SKU004 654 | 19.933522946898087,147.0,SKU003 655 | 4.313069667805731,46.0,SKU003 656 | 10.606252479784215,164.0,SKU001 657 | 11.335818988311248,115.0,SKU002 658 | 5.756061834556352,77.0,SKU001 659 | 12.956580975371262,113.0,SKU003 660 | 7.109912140915538,60.0,SKU003 661 | 19.63911969764996,110.0,SKU005 662 | 5.22239370236369,83.0,SKU001 663 | 10.092039094100183,73.0,SKU003 664 | 10.752895883368838,64.0,SKU004 665 | 15.460325288477993,81.0,SKU005 666 | 1.8361435000457405,27.0,SKU002 667 | 6.130759256822609,39.0,SKU005 668 | 2.3531842161938163,25.0,SKU002 669 | 8.166433522290717,56.0,SKU004 670 | 11.746588476970976,121.0,SKU002 671 | 11.625722606595469,145.0,SKU001 672 | 16.972793479130615,164.0,SKU002 673 | 2.4187916310203845,33.0,SKU002 674 | 2.920887140754707,23.0,SKU005 675 | 1.4185034125468814,18.0,SKU004 676 | 7.442870989363708,83.0,SKU002 677 | 8.58425483256257,103.0,SKU001 678 | 9.596003931588692,54.0,SKU005 679 | 17.04497177670908,107.0,SKU004 680 | 7.526315276676786,62.0,SKU004 681 | 13.488687876092998,106.0,SKU003 682 | 12.273918233197836,160.0,SKU001 683 | 8.536379035807345,42.0,SKU005 684 | 2.616682353559852,17.0,SKU005 685 | 3.0373272859583897,47.0,SKU001 686 | 30.266344490366357,192.0,SKU004 687 | 3.9276631101258417,41.0,SKU002 688 | 3.6902138664682567,47.0,SKU001 689 | 3.150997237534521,15.0,SKU005 690 | 0.5089468784834459,29.0,SKU001 691 | 10.670705549808863,71.0,SKU004 692 | 24.45296897476211,206.0,SKU003 693 | 2.6857747415556084,22.0,SKU003 694 | 8.46128206816337,88.0,SKU002 695 | 0.0,1.0,SKU002 696 | 7.507497607315963,46.0,SKU004 697 | 1.1803125966257908,11.0,SKU002 698 | 2.7905189912728625,31.0,SKU003 699 | 5.948942026324017,47.0,SKU004 700 | 12.30945993057853,76.0,SKU004 701 | 7.10483158686062,48.0,SKU004 702 | 0.0,12.0,SKU001 703 | 6.104082425478678,65.0,SKU002 704 | 16.265169604038626,94.0,SKU005 705 | 3.1320635454877017,35.0,SKU003 706 | 18.54388441002704,192.0,SKU002 707 | 4.175352411085883,26.0,SKU003 708 | 3.8405114958931295,14.0,SKU002 709 | 30.7119289608115,174.0,SKU005 710 | 0.0,1.0,SKU003 711 | 5.053190340643344,30.0,SKU005 712 | 3.679040559738356,28.0,SKU003 713 | 6.160399318330828,72.0,SKU001 714 | 2.452013913619434,22.0,SKU001 715 | 1.7305281418140326,3.0,SKU003 716 | 7.8673536763553535,63.0,SKU003 717 | 11.795181831816215,67.0,SKU005 718 | 0.0,1.0,SKU004 719 | 21.37491486306533,179.0,SKU003 720 | 3.804553410490541,52.0,SKU002 721 | 1.4637585615456534,23.0,SKU002 722 | 17.69129655804075,188.0,SKU002 723 | 1.550822624048076,35.0,SKU001 724 | 16.26163077938886,87.0,SKU005 725 | 6.279051109353285,109.0,SKU001 726 | 6.418647890850076,66.0,SKU002 727 | 23.319548930848065,180.0,SKU003 728 | 14.60014468260054,149.0,SKU002 729 | 16.48574902205995,121.0,SKU003 730 | 2.958064932999906,20.0,SKU005 731 | 18.6560471259509,113.0,SKU004 732 | 15.19587931265685,134.0,SKU003 733 | 9.109229885038662,104.0,SKU001 734 | 6.133641170448227,70.0,SKU002 735 | 3.5996277817862548,22.0,SKU002 736 | 2.2982532647277845,28.0,SKU002 737 | 14.458535307476346,107.0,SKU003 738 | 28.082170323012534,153.0,SKU005 739 | 3.45871337388066,32.0,SKU003 740 | 4.619194895379417,48.0,SKU001 741 | 5.434317628398099,55.0,SKU002 742 | 8.282200849864763,78.0,SKU003 743 | 10.463867672012556,100.0,SKU002 744 | 5.69153263193154,89.0,SKU001 745 | 3.393825100327883,27.0,SKU004 746 | 4.633921129495124,37.0,SKU004 747 | 
3.833749540208236,23.0,SKU004 748 | 0.3854686797168094,14.0,SKU002 749 | 9.448314870528366,85.0,SKU003 750 | 1.5995477253547217,26.0,SKU002 751 | 7.570626772008124,47.0,SKU005 752 | 14.2382514364412,110.0,SKU003 753 | 5.467838976799268,61.0,SKU001 754 | 3.080214280973185,15.0,SKU004 755 | 15.465509496144884,124.0,SKU003 756 | 11.7619538524447,95.0,SKU003 757 | 0.1028561736996098,2.0,SKU004 758 | 5.6659761741285415,70.0,SKU001 759 | 3.411100146192058,23.0,SKU002 760 | 11.981602004859765,121.0,SKU002 761 | 2.0920902231661405,12.0,SKU003 762 | 12.257308054525144,90.0,SKU004 763 | 14.515143830263568,78.0,SKU005 764 | 4.33285059709926,55.0,SKU001 765 | 8.969324841055782,100.0,SKU002 766 | 2.9426419390369905,25.0,SKU001 767 | 3.179987046234479,59.0,SKU001 768 | 0.0,1.0,SKU001 769 | 14.9812941887312,94.0,SKU004 770 | 10.398082442127023,67.0,SKU004 771 | 3.4150588883789936,31.0,SKU004 772 | 16.56720754137663,130.0,SKU003 773 | 7.714454313598001,44.0,SKU004 774 | 3.3072928332055547,24.0,SKU003 775 | 3.896212276013321,15.0,SKU002 776 | 0.0581682546940762,9.0,SKU002 777 | 0.0,20.0,SKU002 778 | 0.0066255935910888,14.0,SKU002 779 | 1.4308956536494937,37.0,SKU002 780 | 13.986193409846951,79.0,SKU004 781 | 7.442428088374623,43.0,SKU005 782 | 7.812029046805771,39.0,SKU004 783 | 1.4073300397359725,10.0,SKU005 784 | 12.38614596460642,89.0,SKU004 785 | 9.56336946975962,113.0,SKU002 786 | 4.910069094426747,71.0,SKU001 787 | 8.159278053970185,42.0,SKU005 788 | 12.313823190390137,167.0,SKU001 789 | 10.919107879980544,125.0,SKU002 790 | 11.036158720144549,59.0,SKU005 791 | 4.211564742719324,45.0,SKU001 792 | 0.4844784104643779,7.0,SKU004 793 | 1.7048529358771054,24.0,SKU002 794 | 6.519963882198034,55.0,SKU004 795 | 34.25446545388516,224.0,SKU004 796 | 18.18266821679577,186.0,SKU002 797 | 2.442026385146449,29.0,SKU005 798 | 5.969025642706006,60.0,SKU003 799 | 12.130626303167238,93.0,SKU003 800 | 11.754778148605787,121.0,SKU002 801 | 9.438891168630438,85.0,SKU003 802 | 0.9947877162864348,13.0,SKU001 803 | 24.25665581075613,143.0,SKU005 804 | 1.6337835461517902,15.0,SKU004 805 | 9.56678052927533,66.0,SKU004 806 | 3.303294218585963,16.0,SKU002 807 | 2.5433563843790328,6.0,SKU004 808 | 8.133682487254523,135.0,SKU001 809 | 2.441467075951645,33.0,SKU001 810 | 4.294996573576606,57.0,SKU002 811 | 10.140384110670562,75.0,SKU004 812 | 2.6793364610190635,36.0,SKU003 813 | 4.854400303912137,46.0,SKU003 814 | 3.468442161599195,27.0,SKU005 815 | 1.5519778149038552,5.0,SKU002 816 | 7.275955861919659,46.0,SKU005 817 | 9.619229826099886,93.0,SKU003 818 | 0.009306007227194,0.0,SKU003 819 | 19.8723125819095,160.0,SKU003 820 | 0.6836436482919016,5.0,SKU005 821 | 14.860262716889652,143.0,SKU002 822 | 3.6283354719017193,32.0,SKU001 823 | 9.65963688218493,133.0,SKU001 824 | 4.060534207936721,30.0,SKU003 825 | 10.59449505951253,108.0,SKU002 826 | 7.160437213963959,67.0,SKU002 827 | 5.489317553808848,46.0,SKU004 828 | 35.40246343380261,232.0,SKU004 829 | 17.420696187432984,98.0,SKU005 830 | 4.386694009309784,36.0,SKU004 831 | 9.069104683051677,39.0,SKU005 832 | 6.57695306659079,106.0,SKU001 833 | 6.952260083072916,53.0,SKU003 834 | 1.923565827681676,29.0,SKU002 835 | 8.611044177015675,107.0,SKU001 836 | 4.599312165375223,35.0,SKU001 837 | 7.327915618637981,37.0,SKU005 838 | 7.776654231791658,63.0,SKU003 839 | 5.993697832184908,29.0,SKU005 840 | 1.4956878369304687,28.0,SKU001 841 | 15.930490661841764,154.0,SKU002 842 | 15.526443912438454,103.0,SKU004 843 | 15.155184025054677,94.0,SKU005 844 | 3.057375753958269,18.0,SKU001 845 | 
8.381123685910485,48.0,SKU004 846 | 8.119132810364215,60.0,SKU004 847 | 6.466563631884142,73.0,SKU001 848 | 4.11691219480187,42.0,SKU003 849 | 2.245557469727032,24.0,SKU001 850 | 6.906692521521395,69.0,SKU002 851 | 5.415045637340494,25.0,SKU004 852 | 16.927724801540037,166.0,SKU002 853 | 5.472997906940382,40.0,SKU005 854 | 5.025049804941838,27.0,SKU004 855 | 5.549081388919629,48.0,SKU003 856 | 5.936002675556806,86.0,SKU001 857 | 13.636471735687548,78.0,SKU005 858 | 13.03345747085613,136.0,SKU002 859 | 2.292125802529632,3.0,SKU005 860 | 6.204284118386802,43.0,SKU003 861 | 6.662431090292584,104.0,SKU001 862 | 15.168679057336584,91.0,SKU005 863 | 1.0188993445578178,23.0,SKU002 864 | 20.320728603689115,111.0,SKU005 865 | 0.4920720717755844,7.0,SKU001 866 | 21.359015774516102,128.0,SKU005 867 | 18.86397273853941,173.0,SKU002 868 | 20.312916844978904,264.0,SKU001 869 | 0.9634931785171452,6.0,SKU003 870 | 17.83124437053494,142.0,SKU003 871 | 11.44970789748826,85.0,SKU004 872 | 17.131182814400816,95.0,SKU005 873 | 3.8254439799843807,42.0,SKU002 874 | 17.51498025551254,95.0,SKU005 875 | 10.94486444275418,137.0,SKU001 876 | 0.3715529157687786,3.0,SKU004 877 | 6.402999141180072,65.0,SKU002 878 | 14.558470919996346,74.0,SKU005 879 | 13.018162899943844,144.0,SKU002 880 | 3.3769588613665062,53.0,SKU001 881 | 3.7487906370561936,17.0,SKU004 882 | 3.328268410671886,18.0,SKU002 883 | 0.0,1.0,SKU001 884 | 12.454914816299173,105.0,SKU003 885 | 8.280969995209578,90.0,SKU002 886 | 27.720423663262302,194.0,SKU004 887 | 7.316255876549341,53.0,SKU002 888 | 0.3192254374186071,5.0,SKU002 889 | 0.9710566449427528,7.0,SKU001 890 | 12.94876199230672,71.0,SKU005 891 | 17.393731913725947,91.0,SKU005 892 | 3.771391827802317,42.0,SKU002 893 | 12.90484234574758,88.0,SKU004 894 | 25.944833327103808,167.0,SKU004 895 | 3.391892380001714,69.0,SKU001 896 | 1.891895068310235,4.0,SKU002 897 | 26.150553980007206,244.0,SKU002 898 | 4.365715116011769,24.0,SKU005 899 | 0.0,2.0,SKU001 900 | 19.14914790107668,146.0,SKU003 901 | 26.05862841454268,175.0,SKU004 902 | 23.546064895426618,156.0,SKU004 903 | 4.587806894095571,44.0,SKU002 904 | 8.398536141276084,121.0,SKU001 905 | 11.869960772227486,92.0,SKU003 906 | 2.1948199433241173,24.0,SKU004 907 | 10.268335892905675,166.0,SKU001 908 | 5.919626397999135,44.0,SKU003 909 | 3.592704911354881,25.0,SKU002 910 | 0.5703641991318462,6.0,SKU005 911 | 9.082470790652732,64.0,SKU004 912 | 17.10898531356723,110.0,SKU004 913 | 13.711390094965042,110.0,SKU003 914 | 0.0,2.0,SKU005 915 | 4.879966997628959,61.0,SKU001 916 | 12.495666733672897,125.0,SKU002 917 | 0.0,5.0,SKU001 918 | 5.2870019870098535,43.0,SKU003 919 | 7.543093315991967,49.0,SKU004 920 | 5.394393406583969,40.0,SKU004 921 | 8.48477007419964,101.0,SKU001 922 | 6.929625520095475,64.0,SKU002 923 | 23.183326547227978,188.0,SKU003 924 | 12.026683122498143,119.0,SKU002 925 | 13.406336364530503,110.0,SKU003 926 | 8.228747432803644,85.0,SKU002 927 | 6.145516177461079,84.0,SKU001 928 | 35.49579295283252,213.0,SKU005 929 | 3.0314864852334167,24.0,SKU003 930 | 3.9474864033821766,48.0,SKU001 931 | 2.641071207189748,1.0,SKU004 932 | 2.978284502448213,17.0,SKU004 933 | 5.664276862588904,63.0,SKU002 934 | 3.158022979071008,32.0,SKU004 935 | 2.327412994749542,19.0,SKU005 936 | 0.5329757865934462,8.0,SKU003 937 | 2.46143173405841,29.0,SKU002 938 | 7.661341344177124,63.0,SKU003 939 | 6.519462711873654,55.0,SKU003 940 | 2.7927903808037757,40.0,SKU001 941 | 6.7039314389735605,48.0,SKU004 942 | 12.661912164001784,101.0,SKU003 943 | 15.199011684136298,92.0,SKU005 
944 | 3.2839904247744345,26.0,SKU005 945 | 18.72715149079252,126.0,SKU004 946 | 4.776321099961804,29.0,SKU004 947 | 0.0,7.0,SKU002 948 | 10.56763428990645,111.0,SKU002 949 | 2.934815334838208,31.0,SKU002 950 | 17.936191745190765,121.0,SKU004 951 | 2.596337752816326,15.0,SKU003 952 | 4.994252467822371,54.0,SKU001 953 | 0.5104089892001464,3.0,SKU002 954 | 1.8375191727409692,8.0,SKU001 955 | 6.164618147488205,37.0,SKU005 956 | 12.425895830054529,77.0,SKU004 957 | 31.384021020955316,183.0,SKU005 958 | 5.013624222933439,47.0,SKU002 959 | 7.808464606730919,57.0,SKU003 960 | 10.513191024484003,105.0,SKU002 961 | 8.594821929176447,127.0,SKU001 962 | 7.713204554256867,94.0,SKU002 963 | 10.422759825991283,79.0,SKU003 964 | 3.7518005990197487,32.0,SKU003 965 | 6.772088312451085,46.0,SKU004 966 | 18.40825492956788,113.0,SKU004 967 | 1.4422736854652418,19.0,SKU003 968 | 9.778602105966128,72.0,SKU003 969 | 9.755535955004616,140.0,SKU001 970 | 2.624175167043714,28.0,SKU002 971 | 6.740829100453989,52.0,SKU004 972 | 6.2981411213596665,94.0,SKU001 973 | 9.208171636016122,88.0,SKU002 974 | 7.6231897540307,42.0,SKU004 975 | 4.602389425242959,56.0,SKU001 976 | 27.38669854150025,155.0,SKU005 977 | 8.20260789759186,32.0,SKU005 978 | 5.869211374804695,35.0,SKU004 979 | 11.217755169722237,162.0,SKU001 980 | 6.977915829180232,87.0,SKU002 981 | 4.769917104172526,44.0,SKU002 982 | 13.903440762885136,204.0,SKU001 983 | 6.785798485862577,44.0,SKU005 984 | 6.010606554023705,76.0,SKU001 985 | 16.326670748792033,87.0,SKU005 986 | 2.512858822431763,16.0,SKU005 987 | 3.9847232596693214,52.0,SKU002 988 | 6.402818220334475,44.0,SKU004 989 | 4.355660110269191,37.0,SKU003 990 | 10.22190096441918,135.0,SKU001 991 | 20.07781867444539,122.0,SKU005 992 | 1.6789783555371454,34.0,SKU001 993 | 24.350608034136915,156.0,SKU004 994 | 11.478198499843325,71.0,SKU004 995 | 11.214147145144503,80.0,SKU003 996 | 13.114857432801996,120.0,SKU003 997 | 4.033392084446712,28.0,SKU005 998 | 6.699360500397856,46.0,SKU003 999 | 7.784547973465105,41.0,SKU005 1000 | 8.66469389334129,83.0,SKU002 1001 | 13.830253663243903,81.0,SKU005 1002 | -------------------------------------------------------------------------------- /tests/resources/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bodywork-ml/ml-pipeline-engineering/521001735bddf166c75a2b6d72f7a71d5530ca6b/tests/resources/model.pkl -------------------------------------------------------------------------------- /tests/test_serve_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for web API. 
3 | """ 4 | import pickle 5 | from subprocess import run 6 | from unittest.mock import patch 7 | 8 | from bodywork_pipeline_utils.aws import Model 9 | from fastapi.testclient import TestClient 10 | from numpy import array 11 | 12 | from pipeline.serve_model import app 13 | 14 | test_client = TestClient(app) 15 | 16 | 17 | def wrapped_model() -> Model: 18 | with open("tests/resources/model.pkl", "r+b") as file: 19 | wrapped_model = pickle.load(file) 20 | return wrapped_model 21 | 22 | 23 | @patch("pipeline.serve_model.wrapped_model", new=wrapped_model(), create=True) 24 | def test_web_api_returns_valid_response_given_valid_data(): 25 | prediction_request = {"product_code": "SKU001", "orders_placed": 100} 26 | prediction_response = test_client.post( 27 | "/api/v0.1/time_to_dispatch", json=prediction_request 28 | ) 29 | model_obj = wrapped_model() 30 | expected_prediction = model_obj.model.predict(array([[100, 0]])).tolist()[0] 31 | assert prediction_response.status_code == 200 32 | assert prediction_response.json()["est_hours_to_dispatch"] == expected_prediction 33 | assert prediction_response.json()["model_version"] == str(model_obj) 34 | 35 | 36 | @patch("pipeline.serve_model.wrapped_model", new=wrapped_model(), create=True) 37 | def test_web_api_returns_error_code_given_invalid_data(): 38 | prediction_request = {"product_code": "SKU001", "foo": 100} 39 | prediction_response = test_client.post( 40 | "/api/v0.1/time_to_dispatch", json=prediction_request 41 | ) 42 | assert prediction_response.status_code == 422 43 | assert "value_error.missing" in prediction_response.text 44 | 45 | prediction_request = {"product_code": "SKU000", "orders_placed": 100} 46 | prediction_response = test_client.post( 47 | "/api/v0.1/time_to_dispatch", json=prediction_request 48 | ) 49 | assert prediction_response.status_code == 422 50 | assert "not a valid enumeration member" in prediction_response.text 51 | 52 | prediction_request = {"product_code": "SKU001", "orders_placed": -100} 53 | prediction_response = test_client.post( 54 | "/api/v0.1/time_to_dispatch", json=prediction_request 55 | ) 56 | assert prediction_response.status_code == 422 57 | assert "ensure this value is greater than or equal to 0" in prediction_response.text 58 | 59 | 60 | def test_web_server_raises_exception_if_passed_invalid_args(): 61 | process = run( 62 | ["python", "-m", "pipeline.serve_model"], capture_output=True, encoding="utf-8" 63 | ) 64 | assert process.returncode != 0 65 | assert "ERROR" in process.stdout 66 | assert "Invalid arguments passed to serve_model.py" in process.stdout 67 | -------------------------------------------------------------------------------- /tests/test_train_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for model training stage. 
3 | """ 4 | from datetime import datetime 5 | from subprocess import run 6 | from unittest.mock import MagicMock, patch 7 | 8 | from bodywork_pipeline_utils.aws import Dataset 9 | from pandas import read_csv, DataFrame 10 | from pytest import fixture, raises 11 | from _pytest.logging import LogCaptureFixture 12 | from sklearn.dummy import DummyRegressor 13 | from sklearn.exceptions import NotFittedError 14 | from sklearn.utils.validation import check_is_fitted 15 | 16 | from pipeline.train_model import ( 17 | FeatureAndLabels, 18 | main, 19 | prepare_data, 20 | preprocess, 21 | train_model, 22 | validate_trained_model_logic, 23 | ) 24 | 25 | 26 | @fixture(scope="session") 27 | def dataset() -> Dataset: 28 | data = read_csv("tests/resources/dataset.csv") 29 | dataset = Dataset(data, datetime(2021, 7, 15), "tests", "resources", "foobar") 30 | return dataset 31 | 32 | 33 | @fixture(scope="session") 34 | def prepared_data(dataset: Dataset) -> FeatureAndLabels: 35 | return FeatureAndLabels( 36 | dataset.data[["orders_placed", "product_code"]][:800], 37 | dataset.data[["orders_placed", "product_code"]][800:999], 38 | dataset.data["hours_to_dispatch"][:800], 39 | dataset.data["hours_to_dispatch"][800:999], 40 | ) 41 | 42 | 43 | def test_prepare_data_splits_labels_and_features_into_test_and_train(dataset: Dataset): 44 | label_column = "hours_to_dispatch" 45 | n_rows_in_dataset = dataset.data.shape[0] 46 | n_cols_in_dataset = dataset.data.shape[1] 47 | prepared_data = prepare_data(dataset.data) 48 | 49 | assert prepared_data.X_train.shape[1] == n_cols_in_dataset - 1 50 | assert label_column not in prepared_data.X_train.columns 51 | 52 | assert prepared_data.X_test.shape[1] == n_cols_in_dataset - 1 53 | assert label_column not in prepared_data.X_test.columns 54 | 55 | assert prepared_data.y_train.ndim == 1 56 | assert prepared_data.y_train.name == label_column 57 | 58 | assert prepared_data.y_test.ndim == 1 59 | assert prepared_data.y_test.name == label_column 60 | 61 | assert ( 62 | prepared_data.X_train.shape[0] + prepared_data.X_test.shape[0] 63 | == n_rows_in_dataset 64 | ) 65 | 66 | assert ( 67 | prepared_data.y_train.shape[0] + prepared_data.y_test.shape[0] 68 | == n_rows_in_dataset 69 | ) 70 | 71 | 72 | def test_preprocess_processes_features(): 73 | data = DataFrame({"orders_placed": [30], "product_code": ["SKU004"]}) 74 | processed_data = preprocess(data) 75 | assert processed_data[0, 0] == 30 76 | assert processed_data[0, 1] == 3 77 | 78 | 79 | def test_train_model_yields_model_and_metrics(prepared_data: FeatureAndLabels): 80 | model, metrics = train_model(prepared_data, {"random_state": [42]}) 81 | try: 82 | check_is_fitted(model) 83 | assert True 84 | except NotFittedError: 85 | assert False 86 | 87 | assert metrics.r_squared >= 0.9 88 | assert metrics.mean_absolute_error <= 1.25 89 | 90 | 91 | def test_validate_trained_model_logic_raises_exception_for_failing_models( 92 | prepared_data: FeatureAndLabels, 93 | ): 94 | dummy_model = DummyRegressor(strategy="constant", constant=-1.0) 95 | dummy_model.fit(prepared_data.X_train, prepared_data.y_train) 96 | expected_exception_str = ( 97 | "Trained model failed verification: " 98 | "hours_to_dispatch predictions do not increase with orders_placed." 
99 | ) 100 | with raises(RuntimeError, match=expected_exception_str): 101 | validate_trained_model_logic(dummy_model, prepared_data) 102 | 103 | dummy_model = DummyRegressor(strategy="constant", constant=-1.0) 104 | dummy_model.fit(prepared_data.X_train, prepared_data.y_train) 105 | expected_exception_str = ( 106 | "Trained model failed verification: " 107 | "hours_to_dispatch predictions do not increase with orders_placed, " 108 | "negative hours_to_dispatch predictions found for test set." 109 | ) 110 | with raises(RuntimeError, match=expected_exception_str): 111 | validate_trained_model_logic(dummy_model, prepared_data) 112 | 113 | dummy_model = DummyRegressor(strategy="constant", constant=1000.0) 114 | dummy_model.fit(prepared_data.X_train, prepared_data.y_train) 115 | expected_exception_str = ( 116 | "Trained model failed verification: " 117 | "hours_to_dispatch predictions do not increase with orders_placed, " 118 | "outlier hours_to_dispatch predictions found for test set." 119 | ) 120 | with raises(RuntimeError, match=expected_exception_str): 121 | validate_trained_model_logic(dummy_model, prepared_data) 122 | 123 | 124 | @patch("pipeline.train_model.aws") 125 | def test_train_job_happy_path( 126 | mock_aws: MagicMock, 127 | dataset: Dataset, 128 | caplog: LogCaptureFixture, 129 | ): 130 | mock_aws.get_latest_csv_dataset_from_s3.return_value = dataset 131 | main("project-bucket", 0.8, 0.9, {"random_state": [42]}) 132 | mock_aws.Model().put_model_to_s3.assert_called_once() 133 | logs = caplog.text 134 | assert "Starting train-model stage" in logs 135 | assert "Retrieved dataset from s3" in logs 136 | assert "Trained model" in logs 137 | assert "Model serialised and persisted to s3" in logs 138 | 139 | 140 | @patch("pipeline.train_model.aws") 141 | def test_train_job_raises_exception_when_metrics_below_error_threshold( 142 | mock_aws: MagicMock, 143 | dataset: Dataset, 144 | ): 145 | mock_aws.get_latest_csv_dataset_from_s3.return_value = dataset 146 | with raises(RuntimeError, match="below deployment threshold"): 147 | main("project-bucket", 1, 0.9, {"random_state": [42]}) 148 | 149 | 150 | @patch("pipeline.train_model.aws") 151 | def test_train_job_logs_warning_when_metrics_below_warning_threshold( 152 | mock_aws: MagicMock, 153 | dataset: Dataset, 154 | caplog: LogCaptureFixture, 155 | ): 156 | mock_aws.get_latest_csv_dataset_from_s3.return_value = dataset 157 | main("project-bucket", 0.5, 0.9, {"random_state": [42]}) 158 | assert "WARNING" in caplog.text 159 | assert "breached warning threshold" in caplog.text 160 | 161 | 162 | def test_run_job_handles_error_for_invalid_args(): 163 | process_one = run( 164 | ["python", "pipeline/train_model.py"], capture_output=True, encoding="utf-8" 165 | ) 166 | assert process_one.returncode != 0 167 | assert "ERROR" in process_one.stdout 168 | assert "Invalid arguments passed to train_model.py" in process_one.stdout 169 | 170 | process_two = run( 171 | ["python", "-m", "pipeline.train_model", "my-bucket", "-1", "0.5"], 172 | capture_output=True, 173 | encoding="utf-8", 174 | ) 175 | assert process_two.returncode != 0 176 | assert "ERROR" in process_two.stdout 177 | assert "Invalid arguments passed to train_model.py" in process_two.stdout 178 | 179 | process_three = run( 180 | ["python", "-m", "pipeline.train_model", "my-bucket", "2", "0.5"], 181 | capture_output=True, 182 | encoding="utf-8", 183 | ) 184 | assert process_three.returncode != 0 185 | assert "ERROR" in process_three.stdout 186 | assert "Invalid arguments passed to train_model.py" 
in process_three.stdout 187 | 188 | process_four = run( 189 | ["python", "-m", "pipeline.train_model", "my-bucket", "0.5", "-1"], 190 | capture_output=True, 191 | encoding="utf-8", 192 | ) 193 | assert process_four.returncode != 0 194 | assert "ERROR" in process_four.stdout 195 | assert "Invalid arguments passed to train_model.py" in process_four.stdout 196 | 197 | process_five = run( 198 | ["python", "-m", "pipeline.train_model", "my-bucket", "0.5", "2"], 199 | capture_output=True, 200 | encoding="utf-8", 201 | ) 202 | assert process_five.returncode != 0 203 | assert "ERROR" in process_five.stdout 204 | assert "Invalid arguments passed to train_model.py" in process_five.stdout 205 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = {py39}-{unit_and_functional_tests,static_code_analysis} 3 | 4 | [testenv] 5 | skip_install = true 6 | deps = 7 | -rrequirements_cicd.txt 8 | -rrequirements_pipe.txt 9 | commands = 10 | unit_and_functional_tests: pytest tests/ --disable-warnings {posargs} 11 | static_code_analysis: mypy --config-file mypy.ini 12 | static_code_analysis: flake8 --config flake8.ini pipeline 13 | --------------------------------------------------------------------------------
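
A closing note on the service contract: the tests in tests/test_serve_model.py pin down the request and response schema for the /api/v0.1/time_to_dispatch endpoint (a product_code drawn from SKU001-SKU005 and a non-negative orders_placed in; est_hours_to_dispatch and model_version out). The sketch below shows how a client might call a running instance of the service. It is illustrative only: the SERVICE_URL value is an assumption (substitute the address of your own uvicorn or Bodywork deployment), not something defined in this repository.

    # Minimal client sketch for the time_to_dispatch endpoint.
    # SERVICE_URL is a hypothetical address -- replace it with your deployment's.
    import requests

    SERVICE_URL = "http://localhost:8000"  # assumption: a locally served instance

    request_payload = {
        "product_code": "SKU001",  # must be one of SKU001-SKU005
        "orders_placed": 100,      # must be >= 0
    }
    response = requests.post(
        f"{SERVICE_URL}/api/v0.1/time_to_dispatch", json=request_payload
    )
    response.raise_for_status()  # a 422 status signals a schema violation
    print(response.json())  # {"est_hours_to_dispatch": <float>, "model_version": <str>}

The same checks that CircleCI runs can be reproduced locally with tox -e py39-unit_and_functional_tests and tox -e py39-static_code_analysis, assuming a Python 3.9 interpreter and tox are available.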