├── .gitattributes
├── .gitignore
├── README.md
├── config
│   ├── requirements_aws_deployment.txt
│   ├── requirements_baseline.txt
│   └── requirements_mlflow_approach.txt
├── data
│   └── raw
│       ├── download_links.md
│       ├── renfe.parquet
│       └── renfe_light.parquet
├── docs
│   ├── airflow_setup.md
│   ├── aws_setup.md
│   ├── images
│   │   ├── dag_list.png
│   │   └── python.png
│   └── mlflow_setup.md
├── notebooks
│   ├── aws_deployment.ipynb
│   ├── baseline.ipynb
│   ├── mlflow_approach.ipynb
│   └── mlflow_deployment.ipynb
├── scripts
│   └── dags
│       ├── dynamic.py
│       ├── renfe_dag.py
│       ├── renfe_script.py
│       └── simple_dag.py
└── vm
    └── ubuntu.ova

/.gitattributes:
--------------------------------------------------------------------------------
data/raw/renfe.parquet filter=lfs diff=lfs merge=lfs -text
vm/ubuntu.ova filter=lfs diff=lfs merge=lfs -text
data/raw/renfe_light.parquet filter=lfs diff=lfs merge=lfs -text
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
.ipynb_checkpoints
__pycache__
output
embedding.parquet
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ML-IN-PRODUCTION-MADRID

This repository contains all the materials from the workshop about __putting Machine Learning models into production__
we taught in September 2019 at [IronHack](http://www.ironhack.com/en).

## Approach

This is a practical workshop with the goal of learning the following concepts:

* How to set up MLFlow, a tool for ML experiment tracking and model deployment, from zero to hero.
* How to track ML experiments with MLFlow.
* How to put models into production with MLFlow.
* How to deploy models to production on AWS SageMaker with just a couple of lines of code.
* How to set up Apache Airflow, a powerful tool to design, schedule and monitor workflows.
* How to create workflows that take advantage of deployed models.

In order to follow the tutorials in a standard setup, a Linux VM is included in this repository,
with the repository itself and conda preinstalled. Please download VirtualBox and import `vm/ubuntu.ova`.

VM login credentials are:

- username: ubuntu
- password: ubuntu

In case you want to follow the examples in this repo using your own setup, we highly recommend using an
[Ubuntu 18.04](http://releases.ubuntu.com/18.04/ubuntu-18.04.3-live-server-amd64.iso)
machine with [conda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh) installed.

### Calendar

- Friday 27/09/2019 from 17 to 20h
  * Introduction to Machine Learning in Production
  * Introduction to MLFlow, MLFlow full setup
  * Introduction to the Dataset and Business Case (Renfe AVE ticket price forecasting)
  * MLFlow training API

- Saturday 28/09/2019 from 10 to 20h
  * MLFlow deployment API
  * Python virtual environment distribution
  * AWS model deployment with SageMaker
  * Introduction to Apache Airflow
  * Airflow orchestration

### Business Case

All examples will use our [dataset](https://www.kaggle.com/thegurusteam/spanish-high-speed-rail-system-ticket-pricing)
about high speed train tickets in Spain.
The following use cases are covered here:

- Unsupervised learning - high speed train ticket clustering using the following techniques:
  * Dimensionality reduction with UMAP
  * HDBSCAN clustering
  * Model put into production with MLFlow, so that a REST API returns a cluster ID for new tickets

- Supervised learning - high speed train ticket price forecasting using the following algorithms:
  * The XGBoost implementation of AWS SageMaker (both cloud training and model deployment)
  * scikit-learn Random Forest (local training and cloud deployment on AWS SageMaker)

- Model deployment:
  * Putting models into production on virtually any Linux machine or server
  * Putting models into production in the cloud with AWS SageMaker

- Scheduling:
  * Orchestration of (batch) clustering and price forecasting for new data using Apache Airflow
--------------------------------------------------------------------------------
/config/requirements_aws_deployment.txt:
--------------------------------------------------------------------------------
alembic==1.2.1
asn1crypto==0.24.0
attrs==19.1.0
backcall==0.1.0
bcrypt==3.1.7
bleach==3.1.0
boto3==1.9.238
botocore==1.12.238
cached-property==1.5.1
certifi==2019.9.11
cffi==1.12.3
chardet==3.0.4
Click==7.0
cloudpickle==1.2.2
configparser==4.0.2
cryptography==2.7
cycler==0.10.0
databricks-cli==0.9.0
decorator==4.4.0
defusedxml==0.6.0
docker==3.7.3
docker-compose==1.24.1
docker-pycreds==0.4.0
dockerpty==0.4.1
docopt==0.6.2
docutils==0.15.2
entrypoints==0.3
fabric==2.5.0
Flask==1.1.1
fsspec==0.5.1
gitdb2==2.0.5
GitPython==3.0.2
gorilla==0.3.0
gunicorn==19.9.0
idna==2.7
invoke==1.3.0
ipykernel==5.1.2
ipython==7.8.0
ipython-genutils==0.2.0
itsdangerous==1.1.0
jedi==0.15.1
Jinja2==2.10.1
jmespath==0.9.4
joblib==0.13.2
json5==0.8.5
jsonschema==3.0.2
jupyter-client==5.3.3
jupyter-core==4.5.0
jupyterlab==1.1.4
jupyterlab-server==1.0.6
kiwisolver==1.1.0
Mako==1.1.0
MarkupSafe==1.1.1
matplotlib==3.1.1
mistune==0.8.4
mlflow==1.2.0
nbconvert==5.6.0
nbformat==4.4.0
notebook==6.0.1
numpy==1.17.2
pandas==0.25.1
pandocfilters==1.4.2
paramiko==2.6.0
parso==0.5.1
pexpect==4.7.0
pickleshare==0.7.5
prometheus-client==0.7.1
prompt-toolkit==2.0.9
protobuf==3.9.2
protobuf3-to-dict==0.1.5
psycopg2-binary==2.8.3
ptyprocess==0.6.0
pyarrow==0.14.1
pycparser==2.19
Pygments==2.4.2
PyNaCl==1.3.0
pyparsing==2.4.2
pyrsistent==0.15.4
python-dateutil==2.8.0
python-editor==1.0.4
pytz==2019.2
PyYAML==5.1
pyzmq==18.1.0
querystring-parser==1.2.4
requests==2.20.1
s3fs==0.3.4
s3transfer==0.2.1
sagemaker==1.42.3
scikit-learn==0.21.3
scipy==1.3.1
Send2Trash==1.5.0
simplejson==3.16.0
six==1.12.0
smmap2==2.0.5
SQLAlchemy==1.3.8
sqlparse==0.3.0
tabulate==0.8.5
terminado==0.8.2
testpath==0.4.2
texttable==0.9.1
tornado==6.0.3
traitlets==4.3.2
urllib3==1.24.3
wcwidth==0.1.7
webencodings==0.5.1
websocket-client==0.56.0
Werkzeug==0.16.0
--------------------------------------------------------------------------------
/config/requirements_baseline.txt:
--------------------------------------------------------------------------------
pandas==0.25.1
pyarrow==0.14.1
scikit-learn==0.21.3
umap-learn==0.3.10
matplotlib==3.1.1
hdbscan==0.8.22
--------------------------------------------------------------------------------
/config/requirements_mlflow_approach.txt:
--------------------------------------------------------------------------------
mlflow==1.2.0
--------------------------------------------------------------------------------
/data/raw/download_links.md:
--------------------------------------------------------------------------------
https://www.kaggle.com/thegurusteam/spanish-high-speed-rail-system-ticket-pricing
--------------------------------------------------------------------------------
/data/raw/renfe.parquet:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:858e23648675cbe6763ed91e6a7a67fd89b1230f20e703ddb171897acb9a1461
size 62013038
--------------------------------------------------------------------------------
/data/raw/renfe_light.parquet:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:f7ee46f90a0686a3683148f4a58fd8c589713b257d48639b506d48bd18cf1ab4
size 19349833
--------------------------------------------------------------------------------
/docs/airflow_setup.md:
--------------------------------------------------------------------------------
# Airflow setup guide

## Introduction

The goal of this session is to achieve full autonomy installing and configuring Airflow with a standard, easy
configuration (yet effective for most projects), as well as to learn the basics of workflow creation and to get
familiar with the Airflow web interface.

This introduction explains the basic concepts of Airflow using a business case that will be developed during
this session.

The case study is the creation of a pipeline that, using this dataset about high speed train tickets, tries to
predict a ticket's price and the cluster the ticket belongs to (two models, one supervised and one unsupervised,
have been trained beforehand and are available through different endpoints). The pipeline:

1. Mocks user alarms for Renfe tickets. Those alarms are just tickets scraped at a very particular point in time
   that we want to track.
2. Predicts the future price (next week) and compares it to the original price.
3. Performs a clustering of the ticket (just for learning purposes).

## Apache Airflow: motivation, advantages and disadvantages

### Current situation and motivation for Airflow's existence

Traditionally, the management and orchestration of processes has been carried out using various tools and
methodologies, for example:

* Shell scripts.
* Python scripts.
* R scripts.
* Jupyter Notebooks.
* Cron.
* Oozie.

At the same time, the Data practice has evolved due to:

* More data accumulated by companies. They want to use that data.
* Companies whose activity was not oriented to the storage and exploitation of data from the beginning, but that
  want to invest in and transform into data-oriented organizations.
* Data accumulated in very heterogeneous sources (relational databases, Big Data infrastructure, cloud infrastructure).

All this causes increasing complexity when working with data and designing processes for its exploitation:

* It is more complicated and critical to monitor (executions, failures).
* It is more complicated to find bugs and fix them (searching the logs, etc.).
* It is more complicated to maintain the processes and introduce changes without affecting critical parts.

Apache Airflow has gained great popularity in recent years, especially due to the rise of Data projects using Machine
Learning libraries written in Python or whose main API is in Python, which is becoming the
[reference language](https://stackoverflow.blog/2017/09/06/incredible-growth-python/) in the field of data analysis
and artificial intelligence (sorry, R).

If programming languages were divided in terms of their efficiency and
[speed](http://www.bioinformatics.org/benchmark/results.html) of execution,
there would be 3 distinct groups:

1. Fast languages: C, C++, Fortran.
2. Languages with intermediate speed: Java, Scala.
3. Slow languages: Python, Ruby, Perl.

Most of the code written in Python for scientific computing and data analysis uses, under the hood, extensions
written in C or C++ (as in the case of NumPy or TensorFlow). Python does a good job as a wrapper for and nexus of
code written in other (faster) languages. At the same time, its learning curve is gentle compared to that of
other languages, so it attracts users with little experience in programming and software development, but with
solid knowledge of data analytics. Growth in recent years has been exponential.

![Python growth over time](./images/python.png)

## Apache Airflow Introduction

### Airflow Features

Python's properties as a 'glue' language fit perfectly with the concept proposed by Apache Airflow, which is why its
use has increased since its [release](https://airbnb.io/projects/airflow/) by the Airbnb engineering team.

Apache Airflow is defined as:

> A platform to programmatically author, schedule and monitor workflows.

The main features of Apache Airflow are the following:

* Usability:
  * Web interface.
  * Creation and use of connections with diverse infrastructure.
  * Review of the logs of each task independently.
  * Visualization of the code executed in each task.
* Robustness:
  * Task retry management.
  * Dependencies between tasks.
* Elegance:
  * Definition of execution graphs (DAGs) as .py files.
  * Minimal knowledge about scripting required.

### Airflow operating scheme

Airflow consists of 4 main components:

* Webserver:
  * Process monitoring and visualization.
  * Querying execution logs.
  * Definition of 'connections'.
* Scheduler:
  * Launches tasks and processes at the right time and in the right order.
* Metadata database:
  * Stores the status of tasks and processes: queued, running, retrying, finished, failed.
  * Stores connections, users, passwords, etc.
* Worker (optional):
  * This component is responsible for executing the tasks, offering functionality beyond the default
    execution. This additional functionality usually has to do with distributing the execution.

### Main components of Apache Airflow workflows

The main components of Apache Airflow workflows are the following:

* DAG: acronym for Directed Acyclic Graph: a set of tasks arranged according to certain dependencies
  between them and executed with a certain periodicity.
* Tasks: execution units.
* Operators: define the type of task. They are subdivided into three types:
  * Operators: execute some arbitrary action.
  * Transfers: move information between different locations.
  * Sensors: wait for changes in the system before starting the execution.
* Hooks: communication interfaces with the infrastructure.
* Plugins: extensions of all the previous elements.

### Ways to execute processes in Airflow: executors

There is a big difference in the way processes are executed in Apache Airflow. The element that ultimately executes
tasks in Airflow is called the executor. There are several types of executors, each with its strengths
and drawbacks:

* Sequential executor: the default executor of Apache Airflow. It is characterized by executing tasks sequentially
  (without any parallelization). It is good for prototyping processes and for development.
* Local executor: uses Python's built-in multiprocessing library. Its great advantage is that it does not require any
  external elements to work, and it supports parallelization of tasks on a local machine. It is a good option when
  Airflow tasks need some processing power and the scheduler runs on a single computer.
* [Celery](http://www.celeryproject.org/) executor: Celery is by definition a distributed task queue. Its
  main feature is that it allows tasks to be distributed over several machines, coordinated with the help of a
  broker like Redis or RabbitMQ.
* [Dask](https://dask.org/) executor: Dask has been one of the great revelations in the analytics community, allowing
  Python to be distributed natively. Its main feature is that, beyond distributing tasks across the components of a
  cluster, Dask distributes the tasks themselves, using distributed pandas and numpy arrays. Please note the
  difference between distributing tasks and distributed tasks.
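
To make these components concrete, here is a minimal sketch of a DAG for the business case described in the
introduction, written against the Airflow 1.10-era Python API. The endpoint URL, payload and task logic are
hypothetical placeholders rather than the actual workshop services (see `scripts/dags/` in this repository for
the workshop's own DAGs):

```python
from datetime import datetime, timedelta

import requests
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

# hypothetical REST endpoint of the already-deployed price model
PREDICT_URL = 'http://localhost:8000/invocations'

default_args = {
    'owner': 'airflow',
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}


def mock_alarms(**context):
    # pretend a user set an alarm on this scraped ticket
    return {'train_type': 'AVE', 'train_class': 'Turista', 'fare': 'Promo'}


def predict_price(**context):
    # pull the mocked ticket from XCom and call the deployed model over REST
    ticket = context['ti'].xcom_pull(task_ids='mock_alarms')
    response = requests.post(PREDICT_URL, json=ticket)
    print('predicted price:', response.json())


with DAG('renfe_alarms',
         default_args=default_args,
         start_date=datetime(2019, 9, 27),
         schedule_interval='@daily') as dag:

    alarms = PythonOperator(task_id='mock_alarms',
                            python_callable=mock_alarms,
                            provide_context=True)
    predict = PythonOperator(task_id='predict_price',
                             python_callable=predict_price,
                             provide_context=True)

    # dependencies: mock the alarm first, then forecast its price
    alarms >> predict
```

The clustering call of the business case would hang off the same alarm task in exactly the same way.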

## Apache Airflow setup

### Step 1: Apache Airflow installation in a virtual environment

It is recommended to install Airflow in a conda virtual environment, along with the rest of the dependencies
of this session. Miniconda is already installed in the virtual machine provided. For more information about the
conda virtual environment manager, the following content is recommended reading:

* https://medium.freecodecamp.org/why-you-need-python-environments-and-how-to-manage-them-with-conda-85f155f4353c
* https://conda.io/docs/user-guide/tasks/manage-environments.html#creating-an-environment-from-an-environment-yml-file

A conda environment `.yml` file includes the requirements needed to create an environment, that is, the
libraries to install. An example of a `.yml` file would be the following:

```yml
name: airflow_env
channels:
  - default
dependencies:
  - pip:
    - apache-airflow[postgres]
```

To create this environment, execute the following commands:

* `sudo apt install gcc`
* `conda env create -f /path/to/env_file.yml`

There are several ways to install Apache Airflow depending on the integrations that will be used. In this workshop we
will install the Airflow base plus the libraries needed for the integration of Airflow with Postgres. For more
information about Airflow installation options, please check the docs:

* http://airflow.apache.org/installation.html

The way of installing Airflow differs depending on whether it is done using the default Python package manager (pip)
or the Anaconda environment manager (conda). Depending on the chosen option, additional libraries may or may not have
to be installed. In general, the conda installation could be described as more 'automatic', performing more thorough
dependency management and installing `gcc` automatically.

### Step 2. Database init (for metadata)

Once the setup of all requirements has been completed, a database for the Airflow metadata (graphs, tasks, connections,
execution status, statistics, etc.) must be initialized.

Airflow is able to use a large number of technologies as its database backend. The choice of such
technology has profound implications for what Apache Airflow can do:

* SQLite: used by default. It creates a `.db` file where the metadata is stored, without any additional requirement.
  It has the drawback of not allowing parallel task execution (because the `.db` file does not support concurrent
  connections).
* Postgres, MySQL, and other enterprise databases: allow parallel task execution (they support connection concurrency).

In this practical session, SQLite will be used for initial testing; later on, we will set up Postgres. Postgres is
already installed on the Virtual Machine provided. In case you do not have a Postgres installation, install it by
running the following commands:

* `sudo apt update`
* `sudo apt install postgresql postgresql-contrib pgadmin3`

To initialize the metadata using the default backend (SQLite), the Python virtual environment must first be activated,
using the following command:

* `conda activate airflow_env`

Once the environment is activated, the prompt will change and show the name of the environment in parentheses. All
actions performed (installation of libraries) will be carried out in said environment.

To initialize the metadata database, the following command must be executed with the environment activated:

* `airflow initdb`

By default, the directory `~/airflow/` will be created and inside it there will be a file called `airflow.db`: the
SQLite database that contains the metadata. Its internal structure can be inspected by connecting programmatically to
the database (for example with the standard Python sqlite module, as sketched below) or with a SQLite viewer such as
DB Browser for SQLite, DBeaver, or a commercial tool like JetBrains DataGrip.
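
As a minimal sketch of that programmatic route, assuming the default `~/airflow/airflow.db` location:

```python
import os
import sqlite3

# open the metadata database created by `airflow initdb`
conn = sqlite3.connect(os.path.expanduser('~/airflow/airflow.db'))

# list the tables Airflow uses to store its metadata
for (name,) in conn.execute(
        "SELECT name FROM sqlite_master WHERE type = 'table' ORDER BY name;"):
    print(name)  # e.g. connection, dag, task_instance, ...

conn.close()
```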

**Note**: The location of the airflow folder can be modified by setting the `AIRFLOW_HOME` environment variable
before the initialization of the metadata.

### Step 3. Boot the web server and scheduler

Once the database has been initialized, the installation is complete and all that remains is to start each of the
Airflow components:

- webserver

To start the web server (optionally it can be launched inside screen or tmux, making sure that the environment is
deactivated at the time the multiplexer session is created):

* `tmux new -s airflow_webserver` (optional)
* `conda activate airflow_env`
* `airflow webserver`

To exit the tmux session and leave the web server running in the background, press `ctrl + b` and then just the
`d` key.

- scheduler

The process is repeated for the scheduler:

* `tmux new -s airflow_scheduler` (optional)
* `conda activate airflow_env`
* `airflow scheduler`

Once the web server and scheduler have been started, the Apache Airflow web interface can be accessed at the
following URL (the default; it can be modified by adding the ip and port options to the launch command):

* `http://localhost:8080`

![Apache Airflow Web Interface](./images/dag_list.png)

## Airflow tuning beyond boilerplate

### Parallel task execution

In this section, some of the Airflow configuration options will be modified to:

* Allow the execution of tasks in parallel.
* Optimize the use of system resources in resource-limited (such as the virtual machine) and shared environments.

Both the web server and the scheduler must be stopped. To access the scheduler execution terminal, in case it has
been launched in tmux, execute:

* `tmux attach -t airflow_webserver`
* Press `ctrl + c` to stop the process
* Repeat the process for the scheduler

Next, you have to create a Postgres database that will be used by Airflow for the storage of metadata. Postgres should
already be installed in the virtual machine. You can use pgAdmin III to create the user and database, but we will
cover the process using the psql CLI:

1. Switch to the postgres user: `sudo su postgres` (enter your password).
2. Open psql, the command line application to manage Postgres on the computer: `psql`.
3. Execute the following query: `CREATE USER airflow WITH ENCRYPTED PASSWORD 'airflow';`
4. Create the airflow database: `CREATE DATABASE airflow;`
5. Give permissions to the airflow user on the airflow database: `GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow;`
6. Quit psql with `\q`.

Once the database has been created, the Airflow backend must be modified in the config file, so that it
starts using this new database instead of the default one (SQLite).

The config file is located at `~/airflow/airflow.cfg`; around line 40 there is the option to specify the metadata
database:

```ini
# The SqlAlchemy connection string to the metadata database.
# SqlAlchemy supports many different database engines; more information
# on their website
sql_alchemy_conn = sqlite:////home//airflow/airflow.db
```

The SQLite database is currently configured, which is nothing more than a local file called `airflow.db` in the
`~/airflow` folder. To use the Postgres database that has just been created, the connection string must be replaced
with:

```ini
sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@localhost:5432/airflow
```

The config file is then saved and the new database is initialized (remember to activate the conda virtual env first):

* `airflow initdb`

For Airflow to support parallel execution of tasks that have no dependency on each other, in addition to
providing a backend that supports concurrent writing, you must select a type of executor that supports this
characteristic. There are several types of executors to choose from.

Basic options:

* Sequential: a type of executor that executes the tasks sequentially, one after another. Ideal for prototyping
  and testing.
* Local: basic executor that supports parallel execution using multiprocessing.

Advanced options:

* Celery: uses Celery executors, requiring a broker, such as RabbitMQ or Redis, to distribute the tasks
  among the executors. This configuration is more advanced and beyond the scope of this workshop.
* Dask: uses executors from [Dask](https://dask.pydata.org/en/latest/), a framework that provides distribution of
  analytical tasks with an API very similar to pandas, numpy, etc. This setting is more advanced and out of the scope
  of this workshop.

To use the Local executor, the configuration file must be modified around line 35:

```ini
# The executor class that airflow should use. Choices include
# SequentialExecutor, LocalExecutor, CeleryExecutor, DaskExecutor
executor = SequentialExecutor
```

Line 35 must be replaced by the following:

* `executor = LocalExecutor`

### Improve performance in shared & low-resource environments

In this section a series of changes will be made to improve the performance of Airflow on the virtual machine and on
computers with limited resources.

The following options will be modified in the configuration (a consolidated snippet follows this list):

* `parallelism` (~line 54): number of tasks marked as active in the database at the same time. It will be reduced to 8.
* `dag_concurrency` (~line 57): number of active tasks per worker (in the case of a basic local executor there will
  only be one worker). It will be reduced to 4.
* `max_active_runs_per_dag` (~line 67): number of times the same DAG can be active. It will be reduced to 2.
* `max_threads` (~line 333): the number of processes that the scheduler can start. Allows direct control over CPU
  usage. It should match the number of processor cores; in this case, it will be set to 2.
* `min_file_process_interval` (~line 298): time interval at which files in the dags folder are read. By default they
  are read continuously (value 0). It will be set to 60 to read every minute.
* `dagbag_import_timeout` (~line 84): the time after which a timeout error is raised when reading the dags. It will
  be set to 60.
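
Taken together, the modified `airflow.cfg` entries from this section and the previous one would look roughly like
this (the exact line numbers vary between Airflow versions, so search for the keys instead of relying on positions):

```ini
executor = LocalExecutor
sql_alchemy_conn = postgresql+psycopg2://airflow:airflow@localhost:5432/airflow
parallelism = 8
dag_concurrency = 4
max_active_runs_per_dag = 2
dagbag_import_timeout = 60
min_file_process_interval = 60
max_threads = 2
```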

### Setup Airflow as a system service

To set up Airflow as a service using systemd (__mandatory__ in production environments), create the following files in
the `/etc/systemd/system/` path.

* Scheduler

```
[Unit]
Description=Airflow scheduler
After=network.target

[Service]
Environment=AIRFLOW_HOME=/home/user/airflow
Restart=on-failure
RestartSec=30
StandardOutput=file:/var/log/airflow/scheduler/stdout.log
StandardError=file:/var/log/airflow/scheduler/stderr.log
ExecStart=/bin/bash -c 'PATH=/home/user/miniconda3/envs/airflow_env/bin/:$PATH exec airflow scheduler'

[Install]
WantedBy=multi-user.target
```

* Webserver

```
[Unit]
Description=Airflow webserver
After=network.target

[Service]
Environment=AIRFLOW_HOME=/home/user/airflow
Restart=on-failure
RestartSec=30
StandardOutput=file:/var/log/airflow/webserver/stdout.log
StandardError=file:/var/log/airflow/webserver/stderr.log
ExecStart=/bin/bash -c 'PATH=/home/user/miniconda3/envs/airflow_env/bin/:$PATH exec airflow webserver'

[Install]
WantedBy=multi-user.target
```
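
The guide does not fix the unit file names; assuming they were saved as `airflow-scheduler.service` and
`airflow-webserver.service`, they can be enabled and started the same way as the MLFlow service described in
`mlflow_setup.md`:

```bash
sudo mkdir -p /var/log/airflow/scheduler /var/log/airflow/webserver
sudo systemctl daemon-reload
sudo systemctl enable airflow-scheduler airflow-webserver
sudo systemctl start airflow-scheduler airflow-webserver
```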
--------------------------------------------------------------------------------
/docs/aws_setup.md:
--------------------------------------------------------------------------------
# Deployment of Machine Learning models on AWS

## Introduction

AWS is one of the most popular cloud platforms available on the market. It powers a large share of the internet and
offers a wide range of services for training and deploying Machine Learning models. Its most famous one is
[SageMaker](https://aws.amazon.com/sagemaker/), an integrated solution for training and deploying models. Another
interesting option is using spot instances to train complex models on expensive hardware for a fraction of the price;
this feature is now also integrated into SageMaker.

## Getting ready for AWS

### Create an account

The first thing needed is to create an account. It requires a credit card and takes time to become active. Please
check [this](https://aws.amazon.com/premiumsupport/knowledge-center/create-and-activate-aws-account/) link to learn
how to create an AWS account. It takes a couple of days to have it fully working.

### Core components of AWS

There are some core components of AWS; many other services are just wrappers around them. The most
important are:

* EC2: computing
* S3: storage

### Getting credentials to use AWS programmatically

The first thing needed to use AWS from Python scripts is getting AWS credentials. Programmatic access is managed
through the service known as IAM: the Identity and Access Management service. Please be aware of the region you are
working in: for this workshop, the region `eu-west-1`, located in Ireland, will be used. To access IAM, follow
[this](https://console.aws.amazon.com/iam/home?region=eu-west-1#/home) link.

Once in AWS:

* Create a new user with admin permissions
* Check the programmatic access checkbox
* Attach administrative policies directly to the user
* Copy the `Access key ID` and `Secret access key` to a safe place

### Creating credentials and config files

Using these credentials, a couple of files must be created inside a folder called `.aws` in the user's `home` folder:

* credentials (text file with no extension):

```ini
[default]
aws_access_key_id=
aws_secret_access_key=
```

* config (text file with no extension):

```ini
[default]
region=eu-west-1
output=json
```

### Create a SageMaker execution role

From the IAM console, create an execution role with full AWS SageMaker execution access.

### Installing the AWS CLI

Next, install the AWS command line interface:

* `sudo apt install awscli`

### Install boto3 and the sagemaker API in your virtual environment

These libraries can lead to errors while installing, so please be careful and create a new virtual environment. A
requirements file to run the `aws_deployment.ipynb` notebook is provided in the config folder
(`config/requirements_aws_deployment.txt`). First create a new environment called `aws_env`:

* `conda create -n aws_env`
* `conda activate aws_env`
* `conda install python`
* `pip install -r /path/to/requirements_aws_deployment.txt`
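
Before opening the notebook, a quick sanity check of the credentials and region can save some debugging later. This
is a minimal sketch that assumes the `.aws` files above are in place; it only prints the region and lists your S3
buckets:

```python
import boto3

# picks up ~/.aws/credentials and ~/.aws/config created above
session = boto3.session.Session()
print('region:', session.region_name)  # should print eu-west-1

s3 = session.client('s3')
for bucket in s3.list_buckets().get('Buckets', []):
    print(bucket['Name'])
```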
--------------------------------------------------------------------------------
/docs/images/dag_list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whiteboxml/teaching-ml-in-production/acc9b65d8293dc921ac1817ebc72ce7733fb29bb/docs/images/dag_list.png
--------------------------------------------------------------------------------
/docs/images/python.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whiteboxml/teaching-ml-in-production/acc9b65d8293dc921ac1817ebc72ce7733fb29bb/docs/images/python.png
--------------------------------------------------------------------------------
/docs/mlflow_setup.md:
--------------------------------------------------------------------------------
# MLFlow Setup - Tracking Server

This guide will show you how to set up a production MLFlow Tracking Server, so
that you can keep track of your experiments and enjoy the full user
experience MLFlow offers.

The first thing we need to configure is the environment.

## Environment

Let's create a new conda environment, as it will be the place where MLFlow
will be installed:

```bash
conda create -n mlflow_env
conda activate mlflow_env
```

Then we have to install the MLFlow library:

```bash
conda install python
pip install mlflow
```

Run the following command to check that the installation was successful:

```bash
mlflow --help
```

We'd like our Tracking Server to have a Postgres database as a backend for
storing metadata, so the first step will be installing PostgreSQL:

```bash
sudo apt-get install postgresql postgresql-contrib postgresql-server-dev-all
```

Check the installation by connecting to the database:

```bash
sudo -u postgres psql
```

After the installation is successful, let's create a user and a database
for the Tracking Server:

```sql
CREATE DATABASE mlflow;
CREATE USER mlflow WITH ENCRYPTED PASSWORD 'mlflow';
GRANT ALL PRIVILEGES ON DATABASE mlflow TO mlflow;
```

As we'll need to interact with Postgres from Python, we have to install
the psycopg2 library. However, to ensure a successful installation, we need
to install the gcc Linux package first:

```bash
sudo apt install gcc
pip install psycopg2
```

The last step will be creating a directory on our local machine where our
Tracking Server will log the Machine Learning models and other artifacts.
Remember that the Postgres database is only used for storing metadata
about those models (imagine adding a model or a virtual environment
to a database). This directory is called the artifact URI:

```bash
mkdir ~/mlruns
```

## Run

Everything is now set up to run the Tracking Server. Run the following
command:

```bash
mlflow server --backend-store-uri postgresql://mlflow:mlflow@localhost/mlflow --default-artifact-root file:/home/your_user/mlruns -h 0.0.0.0 -p 8000
```

The Tracking Server should now be available at the following URL:
http://0.0.0.0:8000. However, if you Ctrl-C or exit the terminal, the
server will go down.
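
To exercise the server end to end from a client, a minimal sketch like the following can be used; it assumes
`mlflow` is installed in the client environment, the server above is reachable on port 8000, and the experiment
name is just an example:

```python
import mlflow

# point the client at the Tracking Server started above
mlflow.set_tracking_uri('http://localhost:8000')
mlflow.set_experiment('sanity-check')  # hypothetical experiment name

with mlflow.start_run():
    mlflow.log_param('alpha', 0.5)
    mlflow.log_metric('rmse', 1.23)

# the run should now appear in the web UI at http://localhost:8000
```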

## Production

If you want the Tracking Server to be up and running after restarts and to
be resilient to failures, it is very useful to run it as a systemd service.

You need to go into the /etc/systemd/system folder and create a new file called
mlflow-tracking.service with the following content:

```
[Unit]
Description=MLFlow tracking server
After=network.target

[Service]
Restart=on-failure
RestartSec=30
StandardOutput=file:/path_to_your_logging_folder/stdout.log
StandardError=file:/path_to_your_logging_folder/stderr.log
ExecStart=/bin/bash -c 'PATH=/path_to_your_conda_installation/envs/mlflow_env/bin/:$PATH exec mlflow server --backend-store-uri postgresql://mlflow:mlflow@localhost/mlflow --default-artifact-root file:/home/your_user/mlruns -h 0.0.0.0 -p 8000'

[Install]
WantedBy=multi-user.target
```

After that, you need to reload systemd, then enable and start the service with
the following commands:

```bash
sudo mkdir -p /path_to_your_logging_folder
sudo systemctl daemon-reload
sudo systemctl enable mlflow-tracking
sudo systemctl start mlflow-tracking
```

Check that everything worked as expected with the following command:

```bash
sudo systemctl status mlflow-tracking
```

You should see an output similar to this:

```
● mlflow-tracking.service - MLFlow tracking server
   Loaded: loaded (/etc/systemd/system/mlflow-tracking.service; enabled; vendor preset: enabled)
   Active: active (running) since Fri 2019-09-27 09:02:11 UTC; 14s ago
 Main PID: 10357 (mlflow)
    Tasks: 10 (limit: 4915)
   CGroup: /system.slice/mlflow-tracking.service
           ├─10357 /path_to_your_conda_installation/envs/mlflow_env/bin/python /home/ubuntu/miniconda3/envs/mlflow_env/bin/mlflow s
           ├─10377 /path_to_your_conda_installation/envs/mlflow_env/bin/python /home/ubuntu/miniconda3/envs/mlflow_env/bin/gunicorn
           ├─10381 /path_to_your_conda_installation/envs/mlflow_env/bin/python /home/ubuntu/miniconda3/envs/mlflow_env/bin/gunicorn
           ├─10383 /path_to_your_conda_installation/envs/mlflow_env/bin/python /home/ubuntu/miniconda3/envs/mlflow_env/bin/gunicorn
           ├─10385 /path_to_your_conda_installation/envs/mlflow_env/bin/python /home/ubuntu/miniconda3/envs/mlflow_env/bin/gunicorn
           └─10386 /path_to_your_conda_installation/envs/mlflow_env/bin/python /home/ubuntu/miniconda3/envs/mlflow_env/bin/gunicorn

Sep 27 09:02:11 ubuntu systemd[1]: Started MLFlow tracking server.
```

You can now restart your machine and the MLFlow Tracking Server will be
up and running after this restart.
--------------------------------------------------------------------------------
/notebooks/aws_deployment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# renfe-guru sagemaker example"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "## 0. 
python general imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "import logging\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "\n", 27 | "%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stderr", 37 | "output_type": "stream", 38 | "text": [ 39 | "2019-09-26 13:20:28,799 - root - INFO - hi!\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "def get_logger():\n", 45 | " logger = logging.getLogger()\n", 46 | " logger.setLevel(logging.DEBUG)\n", 47 | " ch = logging.StreamHandler()\n", 48 | " ch.setLevel(logging.INFO)\n", 49 | " formatter = logging.Formatter(\n", 50 | " '%(asctime)s - %(name)s - %(levelname)s - %(message)s')\n", 51 | " ch.setFormatter(formatter)\n", 52 | " logger.addHandler(ch)\n", 53 | "\n", 54 | " return logger\n", 55 | "\n", 56 | "logger = get_logger()\n", 57 | "\n", 58 | "logger.info(\"hi!\")" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## 1. data loading" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "the dataset can be downloaded here: https://www.kaggle.com/thegurusteam/spanish-high-speed-rail-system-ticket-pricing" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "renfe = pd.read_parquet('../data/raw/renfe.parquet') # about 60MB in .parquet file, but 3.5Gb in memory, be careful!" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/html": [ 92 | "
\n", 93 | "\n", 106 | "\n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | "
insert_dateorigindestinationstart_dateend_datetrain_typepricetrain_classfare
02019-08-21 03:42:10SEVILLAMADRID2019-08-29 13:40:002019-08-29 16:10:00AVE47.30TuristaPromo
12019-08-21 03:42:10SEVILLAMADRID2019-08-29 14:45:002019-08-29 17:15:00AVE53.40TuristaPromo
22019-08-21 03:42:10SEVILLAMADRID2019-08-29 14:58:002019-08-29 17:50:00ALVIANaNPreferentePromo
32019-08-21 03:42:10SEVILLAMADRID2019-08-29 15:45:002019-08-29 18:15:00AVE61.45PreferentePromo
42019-08-21 03:42:10SEVILLAMADRID2019-08-29 16:45:002019-08-29 19:17:00AVE60.30TuristaPromo
\n", 184 | "
" 185 | ], 186 | "text/plain": [ 187 | " insert_date origin destination start_date \\\n", 188 | "0 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 13:40:00 \n", 189 | "1 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 14:45:00 \n", 190 | "2 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 14:58:00 \n", 191 | "3 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 15:45:00 \n", 192 | "4 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 16:45:00 \n", 193 | "\n", 194 | " end_date train_type price train_class fare \n", 195 | "0 2019-08-29 16:10:00 AVE 47.30 Turista Promo \n", 196 | "1 2019-08-29 17:15:00 AVE 53.40 Turista Promo \n", 197 | "2 2019-08-29 17:50:00 ALVIA NaN Preferente Promo \n", 198 | "3 2019-08-29 18:15:00 AVE 61.45 Preferente Promo \n", 199 | "4 2019-08-29 19:17:00 AVE 60.30 Turista Promo " 200 | ] 201 | }, 202 | "execution_count": 4, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "renfe.head()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 5, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "insert_date datetime64[ns]\n", 220 | "origin object\n", 221 | "destination object\n", 222 | "start_date datetime64[ns]\n", 223 | "end_date datetime64[ns]\n", 224 | "train_type object\n", 225 | "price float64\n", 226 | "train_class object\n", 227 | "fare object\n", 228 | "dtype: object" 229 | ] 230 | }, 231 | "execution_count": 5, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "renfe.dtypes" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 6, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "\n", 250 | "RangeIndex: 10800510 entries, 0 to 10800509\n", 251 | "Data columns (total 9 columns):\n", 252 | "insert_date datetime64[ns]\n", 253 | "origin object\n", 254 | "destination object\n", 255 | "start_date datetime64[ns]\n", 256 | "end_date datetime64[ns]\n", 257 | "train_type object\n", 258 | "price float64\n", 259 | "train_class object\n", 260 | "fare object\n", 261 | "dtypes: datetime64[ns](3), float64(1), object(5)\n", 262 | "memory usage: 741.6+ MB\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "renfe.info() # with deep memory usage will take a while..." 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## 2. data wrangling" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "first of all, null values will be dropped. 
all null values are due to:\n", 282 | "- scrapping errors, specially at the beggining of the process\n", 283 | "- trains with no ticket available (usually full, canceled, etc.)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 7, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# filtering null values, inplace to modify original df\n", 293 | "\n", 294 | "renfe.dropna(inplace=True)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "as the goal is to predict ticket price in advance, some interesting features can be derived:\n", 302 | "- trip duration (in hours)\n", 303 | "- time to departure (in days)\n", 304 | "- hour of departure (24h)\n", 305 | "- week day of departure" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 8, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "# feature engineering / generation\n", 315 | "\n", 316 | "def add_features(renfe_df):\n", 317 | "\n", 318 | " renfe_df['duration'] = (renfe_df['end_date'] - renfe_df['start_date']).dt.seconds / 3600\n", 319 | " renfe_df['time_to_departure'] = (renfe_df['start_date'].dt.tz_localize('Europe/Madrid').dt.tz_convert('UTC') \\\n", 320 | " - renfe_df['insert_date'].dt.tz_localize('UTC')).dt.days\n", 321 | " renfe_df['hour'] = renfe_df['start_date'].dt.hour\n", 322 | " renfe_df['weekday'] = renfe_df['start_date'].dt.dayofweek\n", 323 | "\n", 324 | "add_features(renfe)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "following, perform train - validation - test splits:" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 9, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "# train - test split\n", 341 | "\n", 342 | "from sklearn.model_selection import train_test_split\n", 343 | "\n", 344 | "renfe_train_validation, renfe_test = train_test_split(renfe)\n", 345 | "renfe_train, renfe_validation = train_test_split(renfe_train_validation)\n", 346 | "\n", 347 | "# to avoid chained assignment 'pandas warning'\n", 348 | "\n", 349 | "renfe_train = renfe_train.copy()\n", 350 | "renfe_validation = renfe_validation.copy()\n", 351 | "renfe_test = renfe_test.copy()" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 10, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stderr", 361 | "output_type": "stream", 362 | "text": [ 363 | "2019-09-26 13:21:06,323 - root - INFO - n obs in training set are: 5699503\n", 364 | "2019-09-26 13:21:06,325 - root - INFO - n obs in validation set are: 1899835\n", 365 | "2019-09-26 13:21:06,326 - root - INFO - n obs in test set are: 2533113\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "logger.info(f'n obs in training set are: {renfe_train.shape[0]}')\n", 371 | "logger.info(f'n obs in validation set are: {renfe_validation.shape[0]}')\n", 372 | "logger.info(f'n obs in test set are: {renfe_test.shape[0]}')" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "data looks like this so far:" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 11, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "data": { 389 | "text/html": [ 390 | "
\n", 391 | "\n", 404 | "\n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | "
401403815005734233318687165510472288
insert_date2019-04-17 13:23:182019-09-05 13:02:062019-04-19 05:04:092019-05-21 21:50:012019-08-16 19:45:56
originVALENCIAMADRIDMADRIDBARCELONAPONFERRADA
destinationMADRIDVALENCIASEVILLAMADRIDMADRID
start_date2019-05-24 10:40:002019-10-28 16:05:002019-05-08 19:30:002019-07-14 16:25:002019-09-04 06:11:00
end_date2019-05-24 12:32:002019-10-28 22:47:002019-05-08 22:05:002019-07-14 18:55:002019-09-04 10:15:00
train_typeAVEREGIONALAVEAVEALVIA
price33.6528.3553.488.9533.5
train_classTuristaTuristaTuristaTuristaTurista
farePromoAdulto idaPromoPromoPromo
duration1.866676.72.583332.54.06667
time_to_departure3653195318
hour101619166
weekday40262
\n", 522 | "
" 523 | ], 524 | "text/plain": [ 525 | " 4014038 1500573 \\\n", 526 | "insert_date 2019-04-17 13:23:18 2019-09-05 13:02:06 \n", 527 | "origin VALENCIA MADRID \n", 528 | "destination MADRID VALENCIA \n", 529 | "start_date 2019-05-24 10:40:00 2019-10-28 16:05:00 \n", 530 | "end_date 2019-05-24 12:32:00 2019-10-28 22:47:00 \n", 531 | "train_type AVE REGIONAL \n", 532 | "price 33.65 28.35 \n", 533 | "train_class Turista Turista \n", 534 | "fare Promo Adulto ida \n", 535 | "duration 1.86667 6.7 \n", 536 | "time_to_departure 36 53 \n", 537 | "hour 10 16 \n", 538 | "weekday 4 0 \n", 539 | "\n", 540 | " 4233318 6871655 \\\n", 541 | "insert_date 2019-04-19 05:04:09 2019-05-21 21:50:01 \n", 542 | "origin MADRID BARCELONA \n", 543 | "destination SEVILLA MADRID \n", 544 | "start_date 2019-05-08 19:30:00 2019-07-14 16:25:00 \n", 545 | "end_date 2019-05-08 22:05:00 2019-07-14 18:55:00 \n", 546 | "train_type AVE AVE \n", 547 | "price 53.4 88.95 \n", 548 | "train_class Turista Turista \n", 549 | "fare Promo Promo \n", 550 | "duration 2.58333 2.5 \n", 551 | "time_to_departure 19 53 \n", 552 | "hour 19 16 \n", 553 | "weekday 2 6 \n", 554 | "\n", 555 | " 10472288 \n", 556 | "insert_date 2019-08-16 19:45:56 \n", 557 | "origin PONFERRADA \n", 558 | "destination MADRID \n", 559 | "start_date 2019-09-04 06:11:00 \n", 560 | "end_date 2019-09-04 10:15:00 \n", 561 | "train_type ALVIA \n", 562 | "price 33.5 \n", 563 | "train_class Turista \n", 564 | "fare Promo \n", 565 | "duration 4.06667 \n", 566 | "time_to_departure 18 \n", 567 | "hour 6 \n", 568 | "weekday 2 " 569 | ] 570 | }, 571 | "execution_count": 11, 572 | "metadata": {}, 573 | "output_type": "execute_result" 574 | } 575 | ], 576 | "source": [ 577 | "renfe_train.head().T" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "following, there are some categorical columns that have to be encoded (most ML algorithms will need that):" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 12, 590 | "metadata": {}, 591 | "outputs": [ 592 | { 593 | "name": "stderr", 594 | "output_type": "stream", 595 | "text": [ 596 | "2019-09-26 13:21:09,363 - root - INFO - transforming training set...\n", 597 | "2019-09-26 13:21:27,345 - root - INFO - transforming validation set...\n", 598 | "2019-09-26 13:21:32,909 - root - INFO - transforming test set...\n" 599 | ] 600 | } 601 | ], 602 | "source": [ 603 | "# preprocessing\n", 604 | "\n", 605 | "from sklearn.preprocessing import OrdinalEncoder\n", 606 | "import joblib\n", 607 | "\n", 608 | "encode_cols = ['train_type', 'train_class', 'fare', 'origin', 'destination']\n", 609 | "encoder = OrdinalEncoder()\n", 610 | "encoder.fit(renfe[encode_cols]) # warning, it should be fit only on training data!\n", 611 | "joblib.dump(encoder, '../output/pickle_data/encoder.joblib')\n", 612 | "\n", 613 | "for split, df in {'training': renfe_train, \n", 614 | " 'validation': renfe_validation, \n", 615 | " 'test': renfe_test}.items():\n", 616 | " logger.info(f'transforming {split} set...')\n", 617 | " df.loc[:,encode_cols] = encoder.transform(df.loc[:,encode_cols])" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "with those columns encoded, data looks like this:" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 13, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "data": { 634 | "text/html": [ 635 | "
\n", 636 | "\n", 649 | "\n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | "
401403815005734233318687165510472288
insert_date2019-04-17 13:23:182019-09-05 13:02:062019-04-19 05:04:092019-05-21 21:50:012019-08-16 19:45:56
origin52203
destination25422
start_date2019-05-24 10:40:002019-10-28 16:05:002019-05-08 19:30:002019-07-14 16:25:002019-09-04 06:11:00
end_date2019-05-24 12:32:002019-10-28 22:47:002019-05-08 22:05:002019-07-14 18:55:002019-09-04 10:15:00
train_type213220
price33.6528.3553.488.9533.5
train_class44444
fare81888
duration1.866676.72.583332.54.06667
time_to_departure3653195318
hour101619166
weekday40262
\n", 767 | "
" 768 | ], 769 | "text/plain": [ 770 | " 4014038 1500573 \\\n", 771 | "insert_date 2019-04-17 13:23:18 2019-09-05 13:02:06 \n", 772 | "origin 5 2 \n", 773 | "destination 2 5 \n", 774 | "start_date 2019-05-24 10:40:00 2019-10-28 16:05:00 \n", 775 | "end_date 2019-05-24 12:32:00 2019-10-28 22:47:00 \n", 776 | "train_type 2 13 \n", 777 | "price 33.65 28.35 \n", 778 | "train_class 4 4 \n", 779 | "fare 8 1 \n", 780 | "duration 1.86667 6.7 \n", 781 | "time_to_departure 36 53 \n", 782 | "hour 10 16 \n", 783 | "weekday 4 0 \n", 784 | "\n", 785 | " 4233318 6871655 \\\n", 786 | "insert_date 2019-04-19 05:04:09 2019-05-21 21:50:01 \n", 787 | "origin 2 0 \n", 788 | "destination 4 2 \n", 789 | "start_date 2019-05-08 19:30:00 2019-07-14 16:25:00 \n", 790 | "end_date 2019-05-08 22:05:00 2019-07-14 18:55:00 \n", 791 | "train_type 2 2 \n", 792 | "price 53.4 88.95 \n", 793 | "train_class 4 4 \n", 794 | "fare 8 8 \n", 795 | "duration 2.58333 2.5 \n", 796 | "time_to_departure 19 53 \n", 797 | "hour 19 16 \n", 798 | "weekday 2 6 \n", 799 | "\n", 800 | " 10472288 \n", 801 | "insert_date 2019-08-16 19:45:56 \n", 802 | "origin 3 \n", 803 | "destination 2 \n", 804 | "start_date 2019-09-04 06:11:00 \n", 805 | "end_date 2019-09-04 10:15:00 \n", 806 | "train_type 0 \n", 807 | "price 33.5 \n", 808 | "train_class 4 \n", 809 | "fare 8 \n", 810 | "duration 4.06667 \n", 811 | "time_to_departure 18 \n", 812 | "hour 6 \n", 813 | "weekday 2 " 814 | ] 815 | }, 816 | "execution_count": 13, 817 | "metadata": {}, 818 | "output_type": "execute_result" 819 | } 820 | ], 821 | "source": [ 822 | "renfe_train.head().T" 823 | ] 824 | }, 825 | { 826 | "cell_type": "markdown", 827 | "metadata": {}, 828 | "source": [ 829 | "## 3. upload data to S3" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "to use sagemaker using aws apis (sagemaker or boto) data must be formated in a particular way and stored in aws s3" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": 14, 842 | "metadata": {}, 843 | "outputs": [ 844 | { 845 | "data": { 846 | "text/html": [ 847 | "
\n", 848 | "\n", 861 | "\n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | "
pricetrain_typetrain_classfaredurationtime_to_departurehourweekday
401403833.652.04.08.01.86666736104
150057328.3513.04.01.06.70000053160
423331853.402.04.08.02.58333319192
687165588.952.04.08.02.50000053166
1047228833.500.04.08.04.0666671862
\n", 933 | "
" 934 | ], 935 | "text/plain": [ 936 | " price train_type train_class fare duration time_to_departure \\\n", 937 | "4014038 33.65 2.0 4.0 8.0 1.866667 36 \n", 938 | "1500573 28.35 13.0 4.0 1.0 6.700000 53 \n", 939 | "4233318 53.40 2.0 4.0 8.0 2.583333 19 \n", 940 | "6871655 88.95 2.0 4.0 8.0 2.500000 53 \n", 941 | "10472288 33.50 0.0 4.0 8.0 4.066667 18 \n", 942 | "\n", 943 | " hour weekday \n", 944 | "4014038 10 4 \n", 945 | "1500573 16 0 \n", 946 | "4233318 19 2 \n", 947 | "6871655 16 6 \n", 948 | "10472288 6 2 " 949 | ] 950 | }, 951 | "execution_count": 14, 952 | "metadata": {}, 953 | "output_type": "execute_result" 954 | } 955 | ], 956 | "source": [ 957 | "# target must be in the first position of csv columns for xgboost via sagemaker API\n", 958 | "\n", 959 | "target = 'price'\n", 960 | "features = ['train_type', 'train_class', 'fare', 'duration', 'time_to_departure', 'hour', 'weekday']\n", 961 | "\n", 962 | "renfe_train[[target] + features].head()" 963 | ] 964 | }, 965 | { 966 | "cell_type": "markdown", 967 | "metadata": {}, 968 | "source": [ 969 | "data must be pushed to s3, aws credentials must be properly set for this purpose (`/home/user/.aws/credentials`)." 970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": 15, 975 | "metadata": {}, 976 | "outputs": [], 977 | "source": [ 978 | "BUCKET = 'ml-in-production-madrid-sagemaker'\n", 979 | "\n", 980 | "s3_train = 's3://' + BUCKET + '/' + 'train'\n", 981 | "s3_validation = 's3://' + BUCKET + '/' + 'validation'\n", 982 | "s3_model_output = 's3://' + BUCKET + '/' + 'model'\n", 983 | "\n", 984 | "renfe_train[[target] + features].to_csv(s3_train + '/train.csv', index=False, header=False) # .csv file without header\n", 985 | "renfe_validation[[target] + features].to_csv(s3_validation + '/validation.csv', index=False, header=False)" 986 | ] 987 | }, 988 | { 989 | "cell_type": "markdown", 990 | "metadata": {}, 991 | "source": [ 992 | "## 4. train model with sagemaker api" 993 | ] 994 | }, 995 | { 996 | "cell_type": "markdown", 997 | "metadata": {}, 998 | "source": [ 999 | "first, using sagemaker api for xgboost image, can launch a model training job using the following code. please note that `role` and `region` information must be specified. can be fetched from aws programatically or hardcoded. __please check your aws console while training__" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": 19, 1005 | "metadata": {}, 1006 | "outputs": [ 1007 | { 1008 | "name": "stderr", 1009 | "output_type": "stream", 1010 | "text": [ 1011 | "2019-09-26 13:47:03,272 - sagemaker - INFO - Creating training-job with name: ml-in-production-madrid-sgemaker-api-2019-09-26-13-47-03\n" 1012 | ] 1013 | }, 1014 | { 1015 | "name": "stdout", 1016 | "output_type": "stream", 1017 | "text": [ 1018 | "2019-09-26 13:47:03 Starting - Starting the training job...\n", 1019 | "2019-09-26 13:47:06 Starting - Launching requested ML instances...\n", 1020 | "2019-09-26 13:48:01 Starting - Preparing the instances for training......\n", 1021 | "2019-09-26 13:49:05 Downloading - Downloading input data\n", 1022 | "2019-09-26 13:49:05 Training - Downloading the training image...\n", 1023 | "2019-09-26 13:49:24 Training - Training image download completed. 
Training in progress.\u001b[31mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\u001b[0m\n", 1024 | "\u001b[31mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:linear to Json.\u001b[0m\n", 1025 | "\u001b[31mReturning the value itself\u001b[0m\n", 1026 | "\u001b[31mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\u001b[0m\n", 1027 | "\u001b[31mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", 1028 | "\u001b[31mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", 1029 | "\u001b[31mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", 1030 | "\u001b[31mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", 1031 | "\u001b[31m[13:49:26] 5699503x7 matrix with 39896521 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,\u001b[0m\n", 1032 | "\u001b[31mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", 1033 | "\u001b[31m[13:49:27] 1899835x7 matrix with 13298845 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,\u001b[0m\n", 1034 | "\u001b[31mINFO:root:Single node training.\u001b[0m\n", 1035 | "\u001b[31mINFO:root:Train matrix has 5699503 rows\u001b[0m\n", 1036 | "\u001b[31mINFO:root:Validation matrix has 1899835 rows\u001b[0m\n", 1037 | "\u001b[31m[13:49:27] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\u001b[0m\n", 1038 | "\u001b[31m[13:49:27] WARNING: /workspace/src/learner.cc:686: Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.\u001b[0m\n", 1039 | "\u001b[31m[0]#011train-rmse:53.9982#011validation-rmse:54.0441\u001b[0m\n", 1040 | "\u001b[31m[1]#011train-rmse:44.2293#011validation-rmse:44.3132\u001b[0m\n", 1041 | "\u001b[31m[2]#011train-rmse:36.7184#011validation-rmse:36.7152\u001b[0m\n", 1042 | "\u001b[31m[3]#011train-rmse:30.5009#011validation-rmse:30.535\u001b[0m\n", 1043 | "\u001b[31m[4]#011train-rmse:25.9117#011validation-rmse:25.9343\u001b[0m\n", 1044 | "\u001b[31m[5]#011train-rmse:22.2993#011validation-rmse:22.3438\u001b[0m\n", 1045 | "\n", 1046 | "2019-09-26 13:50:06 Uploading - Uploading generated training model\u001b[31m[6]#011train-rmse:19.5827#011validation-rmse:19.6235\u001b[0m\n", 1047 | "\u001b[31m[7]#011train-rmse:17.5192#011validation-rmse:17.5463\u001b[0m\n", 1048 | "\n", 1049 | "2019-09-26 13:50:11 Completed - Training job completed\n", 1050 | "Training seconds: 85\n", 1051 | "Billable seconds: 85\n" 1052 | ] 1053 | } 1054 | ], 1055 | "source": [ 1056 | "from datetime import datetime\n", 1057 | "import sagemaker\n", 1058 | "from sagemaker.amazon.amazon_estimator import get_image_uri\n", 1059 | "from sagemaker.estimator import Estimator\n", 1060 | "\n", 1061 | "ROLE = 'arn:aws:iam::090554204572:role/service-role/AmazonSageMaker-ExecutionRole-20190610T164237'\n", 1062 | "REGION = 'eu-west-1'\n", 1063 | "TRAINING_JOB_NAME = 'ml-in-production-madrid-sgemaker-api'\n", 1064 | "\n", 1065 | "train_channel = sagemaker.session.s3_input(s3_train, content_type='text/csv')\n", 1066 | "valid_channel = sagemaker.session.s3_input(s3_validation, content_type='text/csv')\n", 1067 | "\n", 1068 | "data_channels = {'train': train_channel, \n", 1069 | " 'validation': valid_channel}\n", 1070 | "\n", 1071 | "container = get_image_uri(REGION, 'xgboost', '0.90-1')\n", 1072 | "\n", 
1073 | "xgb_model = Estimator(container,\n", 1074 | " ROLE, \n", 1075 | " train_instance_count=1, \n", 1076 | " train_instance_type='ml.m4.xlarge',\n", 1077 | " train_volume_size = 5,\n", 1078 | " output_path=s3_model_output,\n", 1079 | " sagemaker_session=sagemaker.Session()\n", 1080 | " )\n", 1081 | "\n", 1082 | "xgb_model.set_hyperparameters(max_depth = 4,\n", 1083 | " eta = .2,\n", 1084 | " gamma = 4,\n", 1085 | " min_child_weight = 8,\n", 1086 | " silent = 0,\n", 1087 | " objective = \"reg:linear\",\n", 1088 | " num_round = 8)\n", 1089 | "\n", 1090 | "xgb_model.fit(inputs=data_channels, \n", 1091 | " logs=True, \n", 1092 | " job_name=TRAINING_JOB_NAME + \\\n", 1093 | " '-' + datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\")\n", 1094 | ")" 1095 | ] 1096 | }, 1097 | { 1098 | "cell_type": "markdown", 1099 | "metadata": {}, 1100 | "source": [ 1101 | "## 5. train model with boto3 api" 1102 | ] 1103 | }, 1104 | { 1105 | "cell_type": "markdown", 1106 | "metadata": {}, 1107 | "source": [ 1108 | "a training job can be created using boto3 __and not sagemaker api__. a dictionary with all details must be specified:" 1109 | ] 1110 | }, 1111 | { 1112 | "cell_type": "code", 1113 | "execution_count": 23, 1114 | "metadata": {}, 1115 | "outputs": [ 1116 | { 1117 | "name": "stdout", 1118 | "output_type": "stream", 1119 | "text": [ 1120 | "InProgress\n", 1121 | "InProgress\n", 1122 | "InProgress\n", 1123 | "InProgress\n", 1124 | "InProgress\n", 1125 | "InProgress\n", 1126 | "InProgress\n", 1127 | "InProgress\n", 1128 | "InProgress\n", 1129 | "Completed\n" 1130 | ] 1131 | } 1132 | ], 1133 | "source": [ 1134 | "import boto3\n", 1135 | "import time\n", 1136 | "\n", 1137 | "TRAINING_JOB_NAME = 'ml-in-production-madrid-boto3-api' \\\n", 1138 | " + '-' + datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\")\n", 1139 | "\n", 1140 | "create_training_params = \\\n", 1141 | "{\n", 1142 | " \"AlgorithmSpecification\": {\n", 1143 | " \"TrainingImage\": container,\n", 1144 | " \"TrainingInputMode\": \"File\"\n", 1145 | " },\n", 1146 | " \"RoleArn\": ROLE,\n", 1147 | " \"OutputDataConfig\": {\n", 1148 | " \"S3OutputPath\": s3_model_output\n", 1149 | " },\n", 1150 | " \"ResourceConfig\": {\n", 1151 | " \"InstanceCount\": 1,\n", 1152 | " \"InstanceType\": \"ml.m4.4xlarge\",\n", 1153 | " \"VolumeSizeInGB\": 5\n", 1154 | " },\n", 1155 | " \"TrainingJobName\": TRAINING_JOB_NAME,\n", 1156 | " \"HyperParameters\": {\n", 1157 | " \"max_depth\":\"4\",\n", 1158 | " \"eta\":\"0.2\",\n", 1159 | " \"gamma\":\"4\",\n", 1160 | " \"min_child_weight\":\"4\",\n", 1161 | " \"subsample\":\"0.7\",\n", 1162 | " \"silent\":\"0\",\n", 1163 | " \"objective\":\"reg:linear\",\n", 1164 | " \"num_round\":\"8\"\n", 1165 | " },\n", 1166 | " \"StoppingCondition\": {\n", 1167 | " \"MaxRuntimeInSeconds\": 3600\n", 1168 | " },\n", 1169 | " \"InputDataConfig\": [\n", 1170 | " {\n", 1171 | " \"ChannelName\": \"train\",\n", 1172 | " \"DataSource\": {\n", 1173 | " \"S3DataSource\": {\n", 1174 | " \"S3DataType\": \"S3Prefix\",\n", 1175 | " \"S3Uri\": s3_train,\n", 1176 | " \"S3DataDistributionType\": \"FullyReplicated\"\n", 1177 | " }\n", 1178 | " },\n", 1179 | " \"ContentType\": \"text/csv\",\n", 1180 | " \"CompressionType\": \"None\"\n", 1181 | " },\n", 1182 | " {\n", 1183 | " \"ChannelName\": \"validation\",\n", 1184 | " \"DataSource\": {\n", 1185 | " \"S3DataSource\": {\n", 1186 | " \"S3DataType\": \"S3Prefix\",\n", 1187 | " \"S3Uri\": s3_validation,\n", 1188 | " \"S3DataDistributionType\": \"FullyReplicated\"\n", 1189 | " }\n", 1190 | " },\n", 1191 | " 
\"ContentType\": \"text/csv\",\n", 1192 | " \"CompressionType\": \"None\"\n", 1193 | " }\n", 1194 | " ]\n", 1195 | "}\n", 1196 | "\n", 1197 | "client = boto3.client('sagemaker', region_name=REGION)\n", 1198 | "client.create_training_job(**create_training_params)\n", 1199 | "status = client.describe_training_job(TrainingJobName=TRAINING_JOB_NAME)['TrainingJobStatus']\n", 1200 | "\n", 1201 | "# this loop will query status until completed, there is no more info available, go to aws console for more...\n", 1202 | "while status !='Completed' and status!='Failed':\n", 1203 | " time.sleep(16)\n", 1204 | " status = client.describe_training_job(TrainingJobName=TRAINING_JOB_NAME)['TrainingJobStatus']\n", 1205 | " logger.info('training job created with boto3 api is:' + status)" 1206 | ] 1207 | }, 1208 | { 1209 | "cell_type": "markdown", 1210 | "metadata": {}, 1211 | "source": [ 1212 | "## 6. serving model with sagemaker api" 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "markdown", 1217 | "metadata": {}, 1218 | "source": [ 1219 | "to deploy a model and create an endpoint using sagemaker api, `deploy` method of sagemaker estimator can be used. takes some time... go grab some cofee!" 1220 | ] 1221 | }, 1222 | { 1223 | "cell_type": "code", 1224 | "execution_count": 26, 1225 | "metadata": {}, 1226 | "outputs": [], 1227 | "source": [ 1228 | "ENDPOINT_NAME = 'ml-in-production-madrid-sagemaker-api-endpoint'\n", 1229 | "MODEL_NAME = 'ml-in-production-madrid-sagemaker-api-model'\n", 1230 | "\n", 1231 | "xgb_predictor = xgb_model.deploy(initial_instance_count=1,\n", 1232 | " instance_type='ml.t2.medium',\n", 1233 | " endpoint_name=ENDPOINT_NAME,\n", 1234 | " model_name=MODEL_NAME)" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "markdown", 1239 | "metadata": {}, 1240 | "source": [ 1241 | "## 7. serving model with boto3 api" 1242 | ] 1243 | }, 1244 | { 1245 | "cell_type": "markdown", 1246 | "metadata": {}, 1247 | "source": [ 1248 | "not straightforward, involves 3 steps, too low level to explain here, does not worth the pain having sagemaker api and mlflow... let's try mlflow with an sklearn model instead :-D" 1249 | ] 1250 | }, 1251 | { 1252 | "cell_type": "markdown", 1253 | "metadata": {}, 1254 | "source": [ 1255 | "## 8. 
invoking endpoint (just boto3 api option)" 1256 | ] 1257 | }, 1258 | { 1259 | "cell_type": "code", 1260 | "execution_count": 27, 1261 | "metadata": {}, 1262 | "outputs": [ 1263 | { 1264 | "name": "stderr", 1265 | "output_type": "stream", 1266 | "text": [ 1267 | "2019-09-26 14:08:13,386 - root - INFO - getting a sample of 100 elements from test split\n", 1268 | "2019-09-26 14:08:13,498 - root - INFO - calling endpoint...\n" 1269 | ] 1270 | }, 1271 | { 1272 | "data": { 1273 | "text/plain": [ 1274 | "{'ResponseMetadata': {'RequestId': '047786de-f29f-484e-8b60-556d2f5c72d5',\n", 1275 | " 'HTTPStatusCode': 200,\n", 1276 | " 'HTTPHeaders': {'x-amzn-requestid': '047786de-f29f-484e-8b60-556d2f5c72d5',\n", 1277 | " 'x-amzn-invoked-production-variant': 'AllTraffic',\n", 1278 | " 'date': 'Thu, 26 Sep 2019 14:08:14 GMT',\n", 1279 | " 'content-type': 'text/csv; charset=utf-8',\n", 1280 | " 'content-length': '1842'},\n", 1281 | " 'RetryAttempts': 0},\n", 1282 | " 'ContentType': 'text/csv; charset=utf-8',\n", 1283 | " 'InvokedProductionVariant': 'AllTraffic',\n", 1284 | " 'Body': }" 1285 | ] 1286 | }, 1287 | "metadata": {}, 1288 | "output_type": "display_data" 1289 | } 1290 | ], 1291 | "source": [ 1292 | "runtime = boto3.client('runtime.sagemaker')\n", 1293 | "\n", 1294 | "logger.info('getting a sample of 100 elements from test split')\n", 1295 | "test_sample = renfe_test.sample(100)\n", 1296 | "\n", 1297 | "logger.info('calling endpoint...')\n", 1298 | "response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,\n", 1299 | " Body=test_sample[features].to_csv(header=False, index=False)) # data must be passed as .csv (string)\n", 1300 | "display(response)\n", 1301 | "y_pred = list(map(lambda x: float(x), response['Body'].read().decode().split(','))) # result is a string and must be parsed" 1302 | ] 1303 | }, 1304 | { 1305 | "cell_type": "markdown", 1306 | "metadata": {}, 1307 | "source": [ 1308 | "let's compare result with reality:" 1309 | ] 1310 | }, 1311 | { 1312 | "cell_type": "code", 1313 | "execution_count": 32, 1314 | "metadata": {}, 1315 | "outputs": [ 1316 | { 1317 | "name": "stderr", 1318 | "output_type": "stream", 1319 | "text": [ 1320 | "2019-09-26 14:10:12,971 - root - INFO - mae for xbgoost model is: 11.515720779418944\n" 1321 | ] 1322 | }, 1323 | { 1324 | "data": { 1325 | "text/html": [ 1326 | "
\n", 1327 | "\n", 1340 | "\n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | "
y_truey_pred
66848949.5562.541649
944938645.8038.042965
58644885.1062.541649
143135145.3034.568508
233850185.1062.541649
.........
534174928.3524.286119
354498100.4061.160225
847904185.1062.541649
683788876.3067.757019
167258521.9534.568508
\n", 1406 | "

100 rows × 2 columns

\n", 1407 | "
" 1408 | ], 1409 | "text/plain": [ 1410 | " y_true y_pred\n", 1411 | "668489 49.55 62.541649\n", 1412 | "9449386 45.80 38.042965\n", 1413 | "586448 85.10 62.541649\n", 1414 | "1431351 45.30 34.568508\n", 1415 | "2338501 85.10 62.541649\n", 1416 | "... ... ...\n", 1417 | "5341749 28.35 24.286119\n", 1418 | "354498 100.40 61.160225\n", 1419 | "8479041 85.10 62.541649\n", 1420 | "6837888 76.30 67.757019\n", 1421 | "1672585 21.95 34.568508\n", 1422 | "\n", 1423 | "[100 rows x 2 columns]" 1424 | ] 1425 | }, 1426 | "metadata": {}, 1427 | "output_type": "display_data" 1428 | } 1429 | ], 1430 | "source": [ 1431 | "from sklearn.metrics import mean_absolute_error\n", 1432 | "y_true = test_sample['price']\n", 1433 | "logger.info(f\"mae for xbgoost model is: {mean_absolute_error(y_true=y_true, y_pred=y_pred)}\")\n", 1434 | "\n", 1435 | "display(pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}))" 1436 | ] 1437 | }, 1438 | { 1439 | "cell_type": "markdown", 1440 | "metadata": {}, 1441 | "source": [ 1442 | "## 9. deploy model using mlflow the easy way (sklearn version)" 1443 | ] 1444 | }, 1445 | { 1446 | "cell_type": "code", 1447 | "execution_count": 54, 1448 | "metadata": {}, 1449 | "outputs": [ 1450 | { 1451 | "name": "stderr", 1452 | "output_type": "stream", 1453 | "text": [ 1454 | "[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.\n", 1455 | "[Parallel(n_jobs=32)]: Done 136 tasks | elapsed: 2.7min\n", 1456 | "[Parallel(n_jobs=32)]: Done 256 out of 256 | elapsed: 4.4min finished\n" 1457 | ] 1458 | }, 1459 | { 1460 | "data": { 1461 | "text/plain": [ 1462 | "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,\n", 1463 | " max_features='auto', max_leaf_nodes=None,\n", 1464 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 1465 | " min_samples_leaf=1, min_samples_split=2,\n", 1466 | " min_weight_fraction_leaf=0.0, n_estimators=256, n_jobs=32,\n", 1467 | " oob_score=False, random_state=None, verbose=1,\n", 1468 | " warm_start=False)" 1469 | ] 1470 | }, 1471 | "execution_count": 54, 1472 | "metadata": {}, 1473 | "output_type": "execute_result" 1474 | } 1475 | ], 1476 | "source": [ 1477 | "from sklearn.ensemble import RandomForestRegressor\n", 1478 | "\n", 1479 | "X = renfe_train[features]\n", 1480 | "y = renfe_train[target]\n", 1481 | "\n", 1482 | "rf = RandomForestRegressor(n_estimators=256, \n", 1483 | " n_jobs=32, # adapt to your processor(s)\n", 1484 | " verbose=1,\n", 1485 | " max_depth=8) # limit max depth to keep serialized model under 100MB (or it will be unable to deploy in AWS)\n", 1486 | "rf.fit(X, y)" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "markdown", 1491 | "metadata": {}, 1492 | "source": [ 1493 | "check results for this model:" 1494 | ] 1495 | }, 1496 | { 1497 | "cell_type": "code", 1498 | "execution_count": 57, 1499 | "metadata": {}, 1500 | "outputs": [ 1501 | { 1502 | "name": "stderr", 1503 | "output_type": "stream", 1504 | "text": [ 1505 | "[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.\n", 1506 | "[Parallel(n_jobs=32)]: Done 136 tasks | elapsed: 0.1s\n", 1507 | "[Parallel(n_jobs=32)]: Done 256 out of 256 | elapsed: 0.1s finished\n", 1508 | "2019-09-26 14:55:48,768 - root - INFO - mae for xbgoost model is: 6.751278555302093\n" 1509 | ] 1510 | }, 1511 | { 1512 | "data": { 1513 | "text/html": [ 1514 | "
\n", 1515 | "\n", 1528 | "\n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | "
y_truey_pred
66848949.5575.643374
944938645.8045.800000
58644885.1075.643374
143135145.3042.568147
233850185.1075.643374
.........
534174928.3528.350000
354498100.4094.133290
847904185.1075.643374
683788876.3076.984304
167258521.9535.448023
\n", 1594 | "

100 rows × 2 columns

\n", 1595 | "
" 1596 | ], 1597 | "text/plain": [ 1598 | " y_true y_pred\n", 1599 | "668489 49.55 75.643374\n", 1600 | "9449386 45.80 45.800000\n", 1601 | "586448 85.10 75.643374\n", 1602 | "1431351 45.30 42.568147\n", 1603 | "2338501 85.10 75.643374\n", 1604 | "... ... ...\n", 1605 | "5341749 28.35 28.350000\n", 1606 | "354498 100.40 94.133290\n", 1607 | "8479041 85.10 75.643374\n", 1608 | "6837888 76.30 76.984304\n", 1609 | "1672585 21.95 35.448023\n", 1610 | "\n", 1611 | "[100 rows x 2 columns]" 1612 | ] 1613 | }, 1614 | "metadata": {}, 1615 | "output_type": "display_data" 1616 | } 1617 | ], 1618 | "source": [ 1619 | "y_pred=rf.predict(test_sample[features])\n", 1620 | "y_true=test_sample['price']\n", 1621 | "\n", 1622 | "logger.info(f\"mae for xbgoost model is: {mean_absolute_error(y_true=y_true, y_pred=y_pred)}\")\n", 1623 | "\n", 1624 | "display(pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}))" 1625 | ] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": 39, 1630 | "metadata": {}, 1631 | "outputs": [ 1632 | { 1633 | "name": "stderr", 1634 | "output_type": "stream", 1635 | "text": [ 1636 | "2019-09-26 14:23:43,825 - root - INFO - model saved!\n" 1637 | ] 1638 | } 1639 | ], 1640 | "source": [ 1641 | "import mlflow.sklearn\n", 1642 | "\n", 1643 | "MODEL_PATH = '../output/price_pred_model'\n", 1644 | "!rm -rf $MODEL_PATH # '!' can be used to execute bash commands in jupyter cells\n", 1645 | "\n", 1646 | "mlflow.sklearn.save_model(sk_model=rf, path=MODEL_PATH)\n", 1647 | "logger.info(\"model saved!\")" 1648 | ] 1649 | }, 1650 | { 1651 | "cell_type": "markdown", 1652 | "metadata": {}, 1653 | "source": [ 1654 | "using `mlflow.sagemaker` module, this model can be deployed directly in AWS, with just one line of code..." 1655 | ] 1656 | }, 1657 | { 1658 | "cell_type": "code", 1659 | "execution_count": 46, 1660 | "metadata": {}, 1661 | "outputs": [ 1662 | { 1663 | "name": "stderr", 1664 | "output_type": "stream", 1665 | "text": [ 1666 | "2019/09/26 14:30:53 INFO mlflow.sagemaker: Using the python_function flavor for deployment!\n", 1667 | "2019/09/26 14:30:54 INFO mlflow.sagemaker: tag response: {'ResponseMetadata': {'RequestId': 'D677046E82A7F687', 'HostId': '/Pz7uHFsEwkCO7dIjL+VZie+kqgRZBZ3mtseIIj8RCzShyVD8+o8rlyspi8Ka4+LvQQdnpRq9ew=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': '/Pz7uHFsEwkCO7dIjL+VZie+kqgRZBZ3mtseIIj8RCzShyVD8+o8rlyspi8Ka4+LvQQdnpRq9ew=', 'x-amz-request-id': 'D677046E82A7F687', 'date': 'Thu, 26 Sep 2019 14:30:55 GMT', 'content-length': '0', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}\n", 1668 | "2019/09/26 14:30:54 INFO mlflow.sagemaker: Creating new endpoint with name: ml-in-prod-mad-mlf-api-ep ...\n", 1669 | "2019/09/26 14:30:54 INFO mlflow.sagemaker: Created model with arn: arn:aws:sagemaker:eu-west-1:090554204572:model/ml-in-prod-mad-mlf-api-ep-model-tkd7gkgbsj6tshwhvjp2cw\n", 1670 | "2019/09/26 14:30:54 INFO mlflow.sagemaker: Created endpoint configuration with arn: arn:aws:sagemaker:eu-west-1:090554204572:endpoint-config/ml-in-prod-mad-mlf-api-ep-config-bb4rjblctcivhckd9guywg\n", 1671 | "2019/09/26 14:30:55 INFO mlflow.sagemaker: Created endpoint with arn: arn:aws:sagemaker:eu-west-1:090554204572:endpoint/ml-in-prod-mad-mlf-api-ep\n", 1672 | "2019/09/26 14:30:55 INFO mlflow.sagemaker: Waiting for the deployment operation to complete...\n", 1673 | "2019/09/26 14:30:55 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. 
Current endpoint status: \"Creating\"\n", 1674 | "2019/09/26 14:31:15 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1675 | "2019/09/26 14:31:35 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1676 | "2019/09/26 14:31:56 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1677 | "2019/09/26 14:32:16 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1678 | "2019/09/26 14:32:36 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1679 | "2019/09/26 14:32:57 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1680 | "2019/09/26 14:33:17 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1681 | "2019/09/26 14:33:37 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1682 | "2019/09/26 14:33:58 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1683 | "2019/09/26 14:34:18 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1684 | "2019/09/26 14:34:38 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1685 | "2019/09/26 14:34:59 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1686 | "2019/09/26 14:35:19 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1687 | "2019/09/26 14:35:39 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1688 | "2019/09/26 14:35:59 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1689 | "2019/09/26 14:36:20 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1690 | "2019/09/26 14:36:40 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1691 | "2019/09/26 14:37:00 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1692 | "2019/09/26 14:37:21 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1693 | "2019/09/26 14:37:41 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1694 | "2019/09/26 14:38:01 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1695 | "2019/09/26 14:38:21 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1696 | "2019/09/26 14:38:42 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1697 | "2019/09/26 14:39:02 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. 
Current endpoint status: \"Creating\"\n", 1698 | "2019/09/26 14:39:22 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1699 | "2019/09/26 14:39:43 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1700 | "2019/09/26 14:40:03 INFO mlflow.sagemaker: Waiting for endpoint to reach the \"InService\" state. Current endpoint status: \"Creating\"\n", 1701 | "2019/09/26 14:40:08 INFO mlflow.sagemaker: The deployment operation completed successfully with message: \"The SageMaker endpoint was created successfully.\"\n" 1702 | ] 1703 | } 1704 | ], 1705 | "source": [ 1706 | "import mlflow.sagemaker\n", 1707 | "\n", 1708 | "ENDPOINT_NAME = 'ml-in-prod-mad-mlf-api-ep'\n", 1709 | "\n", 1710 | "mlflow.sagemaker.deploy(app_name=ENDPOINT_NAME, \n", 1711 | "                        model_uri=MODEL_PATH, \n", 1712 | "                        execution_role_arn=ROLE, \n", 1713 | "                        bucket=BUCKET,\n", 1714 | "                        region_name=REGION, \n", 1715 | "                        mode='create', # try 'replace'\n", 1716 | "                        instance_type='ml.t2.medium', \n", 1717 | "                        instance_count=1)" 1718 | ] 1719 | }, 1720 | { 1721 | "cell_type": "markdown", 1722 | "metadata": {}, 1723 | "source": [ 1724 | "the endpoint can be invoked the usual way, using boto3. authenticating against aws with plain curl or requests is much harder than just storing your credentials and using the aws sdk:" 1725 | ] 1726 | }, 1727 | { 1728 | "cell_type": "code", 1729 | "execution_count": null, 1730 | "metadata": {}, 1731 | "outputs": [], 1732 | "source": [ 1733 | "import json\n", 1734 | "response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,\n", 1735 | "                                   ContentType='application/json',\n", 1736 | "                                   Body=test_sample[features].to_json(orient='split'))\n", 1737 | "\n", 1738 | "y_pred = json.loads(response['Body'].read().decode())\n", 1739 | "y_true = test_sample[target]\n", 1740 | "\n", 1741 | "logger.info(f\"mae for random forest model is: {mean_absolute_error(y_true=y_true, y_pred=y_pred)}\")\n", 1742 | "\n", 1743 | "display(pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}))" 1744 | ] 1745 | }, 1746 | { 1747 | "cell_type": "markdown", 1748 | "metadata": {}, 1749 | "source": [ 1750 | "## 10. references" 1751 | ] 1752 | }, 1753 | { 1754 | "cell_type": "markdown", 1755 | "metadata": {}, 1756 | "source": [ 1757 | "- https://www.mlflow.org/docs/latest/models.html#built-in-deployment-tools\n", 1758 | "- https://aws.amazon.com/sagemaker/features/" 1759 | ] 1760 | } 1761 | ], 1762 | "metadata": { 1763 | "kernelspec": { 1764 | "display_name": "Python 3", 1765 | "language": "python", 1766 | "name": "python3" 1767 | }, 1768 | "language_info": { 1769 | "codemirror_mode": { 1770 | "name": "ipython", 1771 | "version": 3 1772 | }, 1773 | "file_extension": ".py", 1774 | "mimetype": "text/x-python", 1775 | "name": "python", 1776 | "nbconvert_exporter": "python", 1777 | "pygments_lexer": "ipython3", 1778 | "version": "3.7.4" 1779 | } 1780 | }, 1781 | "nbformat": 4, 1782 | "nbformat_minor": 4 1783 | } 1784 | -------------------------------------------------------------------------------- /notebooks/baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MACHINE LEARNING IN PRODUCTION MADRID - BASELINE" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This is the baseline for the business case we've set up for the workshop. 
It is mainly a clustering of the Spanish High Speed Rail dataset from Kaggle. We'll load this dataset, which we've been building for months by scraping www.renfe.com (the official website for buying train tickets in Spain). Here is a link to the current version in Kaggle (we usually update it once a month):\n", 15 | "\n", 16 | "https://www.kaggle.com/thegurusteam/spanish-high-speed-rail-system-ticket-pricing" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## ETL" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "We've made a dump from our database and converted it into a parquet file to improve loading times and avoid the overhead of a PostgreSQL connection." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | "
insert_dateorigindestinationstart_dateend_datetrain_typepricetrain_classfare
02019-08-21 03:42:10SEVILLAMADRID2019-08-29 13:40:002019-08-29 16:10:00AVE47.30TuristaPromo
12019-08-21 03:42:10SEVILLAMADRID2019-08-29 14:45:002019-08-29 17:15:00AVE53.40TuristaPromo
22019-08-21 03:42:10SEVILLAMADRID2019-08-29 14:58:002019-08-29 17:50:00ALVIANaNPreferentePromo
32019-08-21 03:42:10SEVILLAMADRID2019-08-29 15:45:002019-08-29 18:15:00AVE61.45PreferentePromo
42019-08-21 03:42:10SEVILLAMADRID2019-08-29 16:45:002019-08-29 19:17:00AVE60.30TuristaPromo
\n", 133 | "
" 134 | ], 135 | "text/plain": [ 136 | " insert_date origin destination start_date \\\n", 137 | "0 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 13:40:00 \n", 138 | "1 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 14:45:00 \n", 139 | "2 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 14:58:00 \n", 140 | "3 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 15:45:00 \n", 141 | "4 2019-08-21 03:42:10 SEVILLA MADRID 2019-08-29 16:45:00 \n", 142 | "\n", 143 | " end_date train_type price train_class fare \n", 144 | "0 2019-08-29 16:10:00 AVE 47.30 Turista Promo \n", 145 | "1 2019-08-29 17:15:00 AVE 53.40 Turista Promo \n", 146 | "2 2019-08-29 17:50:00 ALVIA NaN Preferente Promo \n", 147 | "3 2019-08-29 18:15:00 AVE 61.45 Preferente Promo \n", 148 | "4 2019-08-29 19:17:00 AVE 60.30 Turista Promo " 149 | ] 150 | }, 151 | "execution_count": 1, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "import pandas as pd\n", 158 | "\n", 159 | "df = pd.read_parquet('../data/raw/renfe.parquet')\n", 160 | "\n", 161 | "df.head()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Preprocessing" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Null values" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 2, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Dataset size: 10800510\n", 188 | "\n", 189 | "Percentage of null values:\n" 190 | ] 191 | }, 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "insert_date 0.000000\n", 196 | "origin 0.000000\n", 197 | "destination 0.000000\n", 198 | "start_date 0.000000\n", 199 | "end_date 0.000000\n", 200 | "train_type 0.000000\n", 201 | "price 6.185439\n", 202 | "train_class 0.281070\n", 203 | "fare 0.281070\n", 204 | "dtype: float64" 205 | ] 206 | }, 207 | "execution_count": 2, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "print(f'Dataset size: {len(df)}\\n')\n", 214 | "\n", 215 | "print('Percentage of null values:')\n", 216 | "df.isnull().sum() / len(df) * 100" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Since there is a few number of records with null values compared with the total dataset size, we can remove them to ease posterior analysis." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 3, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "df.dropna(inplace=True)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Encoding" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "We should encode the string columns as future algorithms can only handle numbers. We've used the OrdinalEncoder approach with no fixed order in columns as fare in which a certain ordering (from cheapest to most expensive could have influence). We've left it that way for the sake of simplicity." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 4, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/html": [ 257 | "
\n", 258 | "\n", 271 | "\n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | "
insert_dateorigindestinationstart_dateend_datetrain_typepricetrain_classfare
02019-08-21 03:42:104.02.02019-08-29 13:40:002019-08-29 16:10:002.047.304.08.0
12019-08-21 03:42:104.02.02019-08-29 14:45:002019-08-29 17:15:002.053.404.08.0
32019-08-21 03:42:104.02.02019-08-29 15:45:002019-08-29 18:15:002.061.452.08.0
42019-08-21 03:42:104.02.02019-08-29 16:45:002019-08-29 19:17:002.060.304.08.0
52019-08-21 03:42:104.02.02019-08-29 17:45:002019-08-29 20:17:002.060.304.08.0
\n", 349 | "
" 350 | ], 351 | "text/plain": [ 352 | " insert_date origin destination start_date \\\n", 353 | "0 2019-08-21 03:42:10 4.0 2.0 2019-08-29 13:40:00 \n", 354 | "1 2019-08-21 03:42:10 4.0 2.0 2019-08-29 14:45:00 \n", 355 | "3 2019-08-21 03:42:10 4.0 2.0 2019-08-29 15:45:00 \n", 356 | "4 2019-08-21 03:42:10 4.0 2.0 2019-08-29 16:45:00 \n", 357 | "5 2019-08-21 03:42:10 4.0 2.0 2019-08-29 17:45:00 \n", 358 | "\n", 359 | " end_date train_type price train_class fare \n", 360 | "0 2019-08-29 16:10:00 2.0 47.30 4.0 8.0 \n", 361 | "1 2019-08-29 17:15:00 2.0 53.40 4.0 8.0 \n", 362 | "3 2019-08-29 18:15:00 2.0 61.45 2.0 8.0 \n", 363 | "4 2019-08-29 19:17:00 2.0 60.30 4.0 8.0 \n", 364 | "5 2019-08-29 20:17:00 2.0 60.30 4.0 8.0 " 365 | ] 366 | }, 367 | "execution_count": 4, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "from sklearn.preprocessing import OrdinalEncoder\n", 374 | "\n", 375 | "encoder_m = OrdinalEncoder()\n", 376 | "\n", 377 | "columns_to_encode = ['origin', 'destination', 'train_type', 'train_class', 'fare']\n", 378 | "df.loc[:, columns_to_encode] = encoder_m.fit_transform(df[columns_to_encode])\n", 379 | "\n", 380 | "df.head()" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "### Feature Engineering" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "The main task here is to extract the datetime columns info into numbers (duration, time to departure...)." 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 5, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/html": [ 405 | "
\n", 406 | "\n", 419 | "\n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | "
insert_dateorigindestinationstart_dateend_datetrain_typepricetrain_classfaredurationtime_to_departurehourweekday
02019-08-21 03:42:104.02.02019-08-29 13:40:002019-08-29 16:10:002.047.304.08.02.5000008133
12019-08-21 03:42:104.02.02019-08-29 14:45:002019-08-29 17:15:002.053.404.08.02.5000008143
32019-08-21 03:42:104.02.02019-08-29 15:45:002019-08-29 18:15:002.061.452.08.02.5000008153
42019-08-21 03:42:104.02.02019-08-29 16:45:002019-08-29 19:17:002.060.304.08.02.5333338163
52019-08-21 03:42:104.02.02019-08-29 17:45:002019-08-29 20:17:002.060.304.08.02.5333338173
\n", 521 | "
" 522 | ], 523 | "text/plain": [ 524 | " insert_date origin destination start_date \\\n", 525 | "0 2019-08-21 03:42:10 4.0 2.0 2019-08-29 13:40:00 \n", 526 | "1 2019-08-21 03:42:10 4.0 2.0 2019-08-29 14:45:00 \n", 527 | "3 2019-08-21 03:42:10 4.0 2.0 2019-08-29 15:45:00 \n", 528 | "4 2019-08-21 03:42:10 4.0 2.0 2019-08-29 16:45:00 \n", 529 | "5 2019-08-21 03:42:10 4.0 2.0 2019-08-29 17:45:00 \n", 530 | "\n", 531 | " end_date train_type price train_class fare duration \\\n", 532 | "0 2019-08-29 16:10:00 2.0 47.30 4.0 8.0 2.500000 \n", 533 | "1 2019-08-29 17:15:00 2.0 53.40 4.0 8.0 2.500000 \n", 534 | "3 2019-08-29 18:15:00 2.0 61.45 2.0 8.0 2.500000 \n", 535 | "4 2019-08-29 19:17:00 2.0 60.30 4.0 8.0 2.533333 \n", 536 | "5 2019-08-29 20:17:00 2.0 60.30 4.0 8.0 2.533333 \n", 537 | "\n", 538 | " time_to_departure hour weekday \n", 539 | "0 8 13 3 \n", 540 | "1 8 14 3 \n", 541 | "3 8 15 3 \n", 542 | "4 8 16 3 \n", 543 | "5 8 17 3 " 544 | ] 545 | }, 546 | "execution_count": 5, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "df['duration'] = (df['end_date'] - df['start_date']).dt.seconds / 3600\n", 553 | "\n", 554 | "df['time_to_departure'] = (df['start_date'].dt.tz_localize('Europe/Madrid').dt.tz_convert('UTC') \\\n", 555 | " - df['insert_date'].dt.tz_localize('UTC')).dt.days\n", 556 | "\n", 557 | "df['hour'] = df['start_date'].dt.hour\n", 558 | "\n", 559 | "df['weekday'] = df['start_date'].dt.dayofweek\n", 560 | "\n", 561 | "df.head()" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "Thus, we can remove these columns as the posterior algorithms don't know how to handle them." 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 6, 574 | "metadata": {}, 575 | "outputs": [ 576 | { 577 | "data": { 578 | "text/html": [ 579 | "
\n", 580 | "\n", 593 | "\n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | "
origindestinationtrain_typepricetrain_classfaredurationtime_to_departurehourweekday
04.02.02.047.304.08.02.5000008133
14.02.02.053.404.08.02.5000008143
34.02.02.061.452.08.02.5000008153
44.02.02.060.304.08.02.5333338163
54.02.02.060.304.08.02.5333338173
\n", 677 | "
" 678 | ], 679 | "text/plain": [ 680 | " origin destination train_type price train_class fare duration \\\n", 681 | "0 4.0 2.0 2.0 47.30 4.0 8.0 2.500000 \n", 682 | "1 4.0 2.0 2.0 53.40 4.0 8.0 2.500000 \n", 683 | "3 4.0 2.0 2.0 61.45 2.0 8.0 2.500000 \n", 684 | "4 4.0 2.0 2.0 60.30 4.0 8.0 2.533333 \n", 685 | "5 4.0 2.0 2.0 60.30 4.0 8.0 2.533333 \n", 686 | "\n", 687 | " time_to_departure hour weekday \n", 688 | "0 8 13 3 \n", 689 | "1 8 14 3 \n", 690 | "3 8 15 3 \n", 691 | "4 8 16 3 \n", 692 | "5 8 17 3 " 693 | ] 694 | }, 695 | "execution_count": 6, 696 | "metadata": {}, 697 | "output_type": "execute_result" 698 | } 699 | ], 700 | "source": [ 701 | "columns_to_remove = ['insert_date', 'start_date', 'end_date']\n", 702 | "df = df[[x for x in df.columns if x not in columns_to_remove]]\n", 703 | "\n", 704 | "df.head()" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": {}, 710 | "source": [ 711 | "### UMAP" 712 | ] 713 | }, 714 | { 715 | "cell_type": "markdown", 716 | "metadata": {}, 717 | "source": [ 718 | "In order to visualize the posterior clusterization, we've made an UMAP projection over 2D with a small set of our dataset." 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 7, 724 | "metadata": {}, 725 | "outputs": [ 726 | { 727 | "data": { 728 | "text/html": [ 729 | "
\n", 730 | "\n", 743 | "\n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | "
xy
0-2.046220-3.009452
1-3.55975112.496255
2-2.933941-6.047703
315.1888333.740645
4-1.575144-10.619226
\n", 779 | "
" 780 | ], 781 | "text/plain": [ 782 | " x y\n", 783 | "0 -2.046220 -3.009452\n", 784 | "1 -3.559751 12.496255\n", 785 | "2 -2.933941 -6.047703\n", 786 | "3 15.188833 3.740645\n", 787 | "4 -1.575144 -10.619226" 788 | ] 789 | }, 790 | "execution_count": 7, 791 | "metadata": {}, 792 | "output_type": "execute_result" 793 | } 794 | ], 795 | "source": [ 796 | "import warnings\n", 797 | "from umap import UMAP\n", 798 | "\n", 799 | "df_umap = df.sample(10000, random_state=42)\n", 800 | "\n", 801 | "with warnings.catch_warnings():\n", 802 | " warnings.simplefilter('ignore')\n", 803 | "\n", 804 | " umap_m = UMAP(random_state=42)\n", 805 | " df_embedding = pd.DataFrame(umap_m.fit_transform(df_umap), columns=['x', 'y'])\n", 806 | "\n", 807 | "df_embedding.head()" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 8, 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 | "data": { 817 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXkAAAD4CAYAAAAJmJb0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAf/klEQVR4nO3dfXBU15km8OdV08INi0emLBijWCMPZnCFgUgTlYFiKkVSUfCYsi074xAX7HpqU5CtSqoWkyWRLGqALARNcDBTs6nZMTuuZNfYwY5xh4koY/mDzRaF2YhtWYLEDBAL7IYBMqDBZWQQ4t0/1A2tVn/c7r73nvvx/KpUqD/U9zRqPX36nHPfI6oKIiIKpirTDSAiIucw5ImIAowhT0QUYAx5IqIAY8gTEQXYBNMNyHTnnXdqQ0OD6WYQEfnK4cOHf6+qtblu81TINzQ0oKenx3QziIh8RURO5buNwzVERAHGkCciCjCGPBFRgDHkiYgCjCFPRBRgnlpdEyb3dezFpyO3isPdFhG8v/lBgy0ioiBiT96A7IAHgE9HFPd17DXUIiIKKoa8AdkBX+x6IqJyMeSJiAKMY/Ie09DWNebyQOdSQy0hoiBgyBtwW0QsD800tHUx6EuU/UZZCP9vKegqHq4RkbtF5B0R+Y2IHBWR/5y6fqqIdIvI8dS/d1Te3GB4f/ODuC0ippsROPM3d5cU8EBpbwhEfmRHT/46gO+o6v8TkSkADotIN4C/AvCWqnaKSBuANgDfs+F4gZC9XJJhU554Iom1r/Ri+IbplhB5U8Uhr6pnAZxNff+xiPwWQB2ARwAsTt3tpwD2gyFPNmjZth/Hz39iuhlEvmDrmLyINABoAnAIwPTUGwAA/AuA6Xl+ZhWAVQBQX19vZ3MoYJbvOIgDJy+abgaRr4iqPWuzReTfAfjfADar6m4RGVTVmozbL6lqwXH55uZmDXM9+VxDNpwYHJXrBDInRETwxPy7sal1ruPHIrKLiBxW1eZct9nSkxeRKIBXAexU1d2pq8+JyF2qelZE7gJw3o5jBRkDfSwT8xQjqnjh3dMAwKCnQLBjdY0A+EcAv1XVbRk37QHwZOr7JwH8otJjUXiYnoh+6dCHRo9PZBc7evKLAPx7AP0i0pu67mkAnQBeFpFvADgF4Gs2HIvIFSOqiCeS2LrvGM4MDmFGTQyTqqvKmvBl8TkyybYxeTuEfUyebjHdkxcAt0UjGBoeseXxGPTkJMfH5Im8bMWC+jHj6/PWv47LVwuH96TqCD65Zk/AAyw+R+awQBkFXvYEat/GB3D7xEjO+0ZEsGJBPa7YGPBEJrEnT5400Lk055DNhCrBM49/Dq/0nLa0Zj7fiqW+jQ8U/Ll33r+A5OCQtcYSeRhDnjyr0JLS1qa6nCdH3T4xUjTArVi7ZDbad/fbOiZPZAJDnnxr58qFjj12a1MdAGD1rt4i97SGk65kCsfkifJobarD9CnVFT/O9mWNNrSGqDwMeaICDnW0YEIFIy2TolU3PxUQmcCQJyrixJalZffof/DYPJtbQ1QahjyRBYc6WjBr2uSSfmbWtMnsxZNxDHkii7rXLMaKBdbLYR8//wnWxfsdbBFRcQx5ohJsap1bUrXQdEVLIlMY8kRlKCXoTdfhoXBjyBOVqZTJ2Pmbux1sCVF+DHmiMh3qaLF833MfX3OwJUT5MeSJXBJPJE03gUKIIU/kErtKJBCVwpaQF5HnReS8iBzJuG6DiCRFpDf1xeIdFDj5Shbn07JtvzMNIcrDrgJlPwHw3wD8z6zrn1XVZ2w6BgVUPJHEd3/+Hq4V2Fgje+MPr+jb+IClTUjSytk+kKgStoS8qv5KRBrseCwKvnXx/pLXj7/w7mm88O5pT4Z9urQxl0qSFzldavjbIvIfAPQA+I6qXsq+g4isArAKAOrrrZ9NSP5UaRCmw14APLuskWUDiIpwcuL17wHMBNAI4CyAH+W6k6o+p6rNqtpcW1vrYHPINDt7uorRicyGti4s6nzbEytXamJR000gGsexkFfVc6o6oqo3AOwAcL9Tx6JwSw4O3Qz8hrYuY4G/4eE5XK5GnuPYa1JE7sq4+CiAI/nuS2SndOC7XRystakO27hBCHmMLWPyIvISgMUA7hSRjwCsB7BYRBox+sl6AMA37TgWkVXp8ftMTk/ctjbV4aldvci/TojIXXatrnkix9X/aMdjE9kpM/idCvzlC+pZfZI8Q1S90+dobm7Wnp4e083wtfmbu8uukyICqAIREYyooq4mhrVLZtu6gsWrywztDvx8y0RLqV5JZJWIHFbV5py3MeT9L55IYuM/HcWlK8O2P3YsGsGWx+baGvQt2/Z79qSgRTOnYufKhaabQVQShnxAudUrrquJ4UDblxx7/HJOjnKDCLB8vvdOviLKVijkueLLp9wc9jgzOOTo429qnYvtyxo992JUHR3Db2jrwj0GVusQ2cHpM14pAGbUxBw/RmtT3ZghIa/17hW3Jm05rk5+4rXOE3lMLBrB2iWzXT9uei/V7R5cd35vuzcnj4lyYchTXnU1MdsnXUvV2lSHgc6lGOhcikUzpxprR6brCs+UUiAqhhOvPuX0mPztEyM3qyt6USn
lfd3gxeqYFB6FJl45Jk85Xb46MuaNxGvj0Ok3oHvaujxxdml6/oBBT17D4RqfcnvooqGtC/M3d7t6TCs+8NC4/UuHPjTdBKJxGPI+tXPlQteD/tzH14xWecwnc9x++pRqY+0Y8dDQJ1EaQ97Hdq5ciFjU/V/h6l29uK9jr+vHteJQRwsGOpdi1rTJrh87IuL6MYmK4Zi8z215bB5W7+p1/bifjiga2roQi1Zhy2PzPLdDU/eaxWMuu7Hu/on5dzv6+ETl4OqaAPBSLRivTdDmE08kbS0JzNU1ZBJr14SAl6o7+iXoiYKCtWtCwCsrTACgceMbnpucJQorhnxAtDbVeeaM0MGhYbTv7mfQE3mALSEvIs+LyHkROZJx3VQR6RaR46l/77DjWF4RTySxqPNt3NPW5ZlT3HeuXIgVC+pNNwMAMDQ8gg17jppuBlHo2dWT/wmA7HPg2wC8paqzALyVuhwILdv2Y/WuXiQHh6AAkoNDeGpXryeCPl3Yywu9+sGhYcz569c98f9CFFa2TbyKSAOAX6rqn6YuHwOwWFXPishdAParasFyhl6feC02uVkdEfzz5gddao01Xjntn6tPiJzjyuqaHCE/qKo1qe8FwKX05ayfWwVgFQDU19d//tSpU7a0x25WV6/cMSmK9Q/NsbxuPJ5IYuu+YzgzOIQZDuypmsn0CpxZ0yaPW79ORJUzHvKpy5dUteC4vJd78pUG5AQBnvlao6W9WLcva0RrU53tbwDxRNLIiVPZvHoCFZFfmQr5QA3XmO4Fp5X6SSGbV06cqgKwLfVmRkSVMbVOfg+AJ1PfPwngFw4eKzQuXRnG6gomebvXLMbtEyM2t6p0NwC07+4z3QyiwLNrCeVLAA4CmC0iH4nINwB0AmgRkeMAvpy6TDapJOj7Nj7gidU3Q8M3uDk2kcNY1qAEXhmyyVbphKbp58UyCESV4c5QAXf8/CdjgromFsWGh62P2w90LnWlSiMRuY9lDQJocGh03L6UoZD0SVSmarETkTMY8iXw25YQL7x7Gg1tXWgosfRC95rFnhizJ6LKMeRL8IGPx46Tg0MlTdbuXLnwZs/eK/VwiKh0DPmQWb2rFy3b9pf0M5ta52L7skZMcmCrQU66EjmLE68hlJ6oLSVgW5vq8k7kLt9xEAdOXrSreURkI4Z8iaZPqca5j6+5cqy6mhi+eF8tdv3f0xi+Yf/j39vehRNbKu9J71y5cMzleCKJta/0OtJmIioNQ75E7Q9+1rb6LysW1OOX753F4NBoLZt8JQs2tc51ZInjdYdOkcju9d/XsRefjow/GIdqiJzHk6FKtKjzbSQHh2x5rEpDzo5hEgYtkf9xj1cbnbEY8MWWIE6cUPl//c6VC7mmnYgK4nBNlmK940nRKlyxMNhcrIf9N1+dV3LbculesxjxRBLfebkXOUZEiCjk2JPPYGX4w0rAW2Fnid3Wpjqc3MIzVYloPIZ8yrz1r/t+GWCpZ6ryrFai4At9yC/fcRANbV24fHXE1eM6VWJ358qF2L6sEZOrC9eMXzRz6rilj0QUPKEek5+/udu1Ne/ZXnj3NJr/aKojOyMVOnGJiMIldCEfTySxYc/Rm2vTTUqvt2cgE5FTHA95ERkA8DGAEQDX863ldIPpzTFyYdATkZPc6sl/UVV/79KxxvHK5tX5bNhzlCFPRI4I/MSr1wMegCeGjogomNwIeQXwhogcFpFVLhxvDDsCXjJ2C6mJRbkenYh8w42Q/3NV/TMAfwHgWyLyhcwbRWSViPSISM+FCxdcaI51A51LsX1ZI26bcGs54uDQsO2fDO6YFLX18YiI0hwPeVVNpv49D+A1APdn3f6cqjaranNtba3TzbEsvRvS1n3HMDTs7Br69Q/NcfTxiSi8HJ14FZHJAKpU9ePU918B8H0nj5mtCkAphQgiInhi/t3Y1DoXgPWCZOVaNNOZtfJE5I54Ipmz/LgAWL6g/maWmOL06prpAF6T0UHtCQBeVNXXHT7mGNuWNVqq/74ixy9jXbwfTtf84lmnRP4RTySxdd8xS+XGFbi5B0RmttzT1jUuV+pqYli7ZLYjHb5Q1JP34vr4NNZzJ/KudfF+vHjoNG5UEJMREZzc8qCllX7llhspVE8+FGe8DnQu9WTQc5VOfk7uG5vrUxuFl9PbVY6oWs6fAycvYl2839bXZ6B78ve2dzm2xV2lJghs2V/Vy+KJJNp392HIZ5u9snhbcPnhvJl0z78UoerJxxNJPLWr1/Gx9HzSm2+/ejiZd1XOrGmT0b1msbsNs0l6TPLM4BBmZI0jBmXzkgMnL2L5joMM+oCIJ5L43qt9uHrdH52NEZs73oHoyXtpKCY9xl4oDL0s30qBMOJ8ib/l20DeDwTAByW8/gLdk/dSwGduwuHHcr8MeAoKL+VCOex8awp87Ro7VWF00q6uJjbutiCM4z69u890E4gq5veAt5vve/JOcXLdqgmFeunpYQm79q8lIu8IdcgHofdtRbFhmIa2Lo4/Z+H+t+GV70xVN1fmSPG7WBaakC91IiNItu47VvQ+y3cchAjgoXl4Y2ZNmxyKN38aK73qLb1owuSwj51Z5fuQz3eiE3umt1ipv3Pg5EWsWFB/8zTsMKqJRbHh4TmBGaKj0hw//4mrwS4A7p02ecynAydGF3wf8gADvZgZNTFLtTY2tc4NdMhvX9bIAA8Br57hnqk6IvjhX37OlddjIEKeClu7ZHbgl0ZOnFCFv/nqPIY4ARgN+kWdb1vq3LjljklRrH/I/U+KDPkQSL+oCgV9eqJxu8WqnaaEZbKcKud0mfBibp8YQd/GB4y2AWDIh0b65KxcKwQyg7O1qQ4/fue4sfoefi75QN5idZjSDtn7UHhJIMoakP2cHNNkb5zc4OQZ3F47jybQZQ3IGXZMXrGkL5lkZZiyVH58TbMnT5YVqvE+KVqFHzzGiU/ypnXx/opXjnk54Av15B0PeRF5AMDfAogA+B+q2pnvvgx5InJbod2fTK2IKZWxkBeRCIB/BtAC4CMAvwbwhKr+Jtf9GfJERKUrFPJOV6G8H8AJVf2dql4D8DMAjzh8TCIiSnE65OsAfJhx+aPUdTeJyCoR6RGRngsXLjjcHCKicDFeT15Vn1PVZlVtrq2tNd0cIqJAcTrkkwDuzrj8mdR1RETkAqdD/tcAZonIPSJSDeDrAPY4fEwiIkpx9GQoVb0uIt8GsA+jSyifV9WjTh6TiIhucfyMV1XdC2Cv08chIqLxjE+8EhGRcxjyREQBxgJlHpSvel45G2PEE0m07+7D0PCNnLd7uR4HEVWOBco8xsnyqPkw6In8jaWGS5BrUw0AiFYBWx93fo/Q9t19jj5+Li8eOs2QJwoohnyGfAEPAMM3RutS5+pl27EJRjyRRMdr/XmHVZyUq/oeEQUDQz5DuVveHTh5Ect3HMwb9HbUsnZa0/ffwKUrwzlv45Z8RP7FkLdJ5mYahT4ReFW+gAdG3/xatu1n0BP5EEPeRiYmTd3itzctIhrFdfIZZk2bXNHPBzXgici/GPIZutcsrjjoiYi8hMM1WTLHnY
...[remainder of base64 PNG data omitted: scatter plot of the 2-D UMAP embedding]\n", 818 | "text/plain": [ 819 | "
" 820 | ] 821 | }, 822 | "metadata": { 823 | "needs_background": "light" 824 | }, 825 | "output_type": "display_data" 826 | } 827 | ], 828 | "source": [ 829 | "import matplotlib.pyplot as plt\n", 830 | "\n", 831 | "%matplotlib inline\n", 832 | "\n", 833 | "plt.scatter(x=df_embedding['x'], y=df_embedding['y']);" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "### Save Dataset" 841 | ] 842 | }, 843 | { 844 | "cell_type": "markdown", 845 | "metadata": {}, 846 | "source": [ 847 | "This is a good checkpoint to save this preprocessing as the feature engineering has finished here and now we're going to tune the model." 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 9, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [ 856 | "output_folder = '../output/processed'\n", 857 | "\n", 858 | "!if [ ! -d $output_folder ]; then mkdir -p $output_folder; fi\n", 859 | "\n", 860 | "df_embedding.to_parquet(f'{output_folder}/embedding.parquet')" 861 | ] 862 | }, 863 | { 864 | "cell_type": "markdown", 865 | "metadata": {}, 866 | "source": [ 867 | "## Clustering" 868 | ] 869 | }, 870 | { 871 | "cell_type": "markdown", 872 | "metadata": {}, 873 | "source": [ 874 | "A density based model has been chosen for doing the clustering due to the irregular shapes coming from UMAP projection. Here algorithms like KMeans would perform badly as we'll see in following lessons." 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": {}, 880 | "source": [ 881 | "### HDBSCAN" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": 10, 887 | "metadata": {}, 888 | "outputs": [ 889 | { 890 | "name": "stderr", 891 | "output_type": "stream", 892 | "text": [ 893 | "/home/ubuntu/miniconda3/envs/mlinproduction_env/lib/python3.7/site-packages/sklearn/externals/six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).\n", 894 | " \"(https://pypi.org/project/six/).\", DeprecationWarning)\n", 895 | "/home/ubuntu/miniconda3/envs/mlinproduction_env/lib/python3.7/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. 
If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", 896 | " warnings.warn(msg, category=DeprecationWarning)\n" 897 | ] 898 | }, 899 | { 900 | "name": "stdout", 901 | "output_type": "stream", 902 | "text": [ 903 | "Cluster Ids: [-1  0  1  2  3  4  5  6  7  8  9 10 11]\n" 904 | ] 905 | } 906 | ], 907 | "source": [ 908 | "import numpy as np\n", 909 | "from hdbscan import HDBSCAN\n", 910 | " \n", 911 | "hdbscan_m = HDBSCAN(min_cluster_size=50, \n", 912 | "                    metric='euclidean', \n", 913 | "                    prediction_data=True)\n", 914 | "\n", 915 | "clustering = hdbscan_m.fit_predict(df_embedding)\n", 916 | "\n", 917 | "print(f'Cluster Ids: {np.sort(np.unique(clustering))}')" 918 | ] 919 | }, 920 | { 921 | "cell_type": "markdown", 922 | "metadata": {}, 923 | "source": [ 924 | "## Results" 925 | ] 926 | }, 927 | { 928 | "cell_type": "markdown", 929 | "metadata": {}, 930 | "source": [ 931 | "As output from the clustering we've selected two variables: the total number of clusters that HDBSCAN has found, and the number of points that HDBSCAN has labelled as noise, i.e. points it decided not to assign to any cluster." 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 11, 937 | "metadata": {}, 938 | "outputs": [ 939 | { 940 | "name": "stdout", 941 | "output_type": "stream", 942 | "text": [ 943 | "Number of Clusters: 12\n", 944 | "Noise: 133\n" 945 | ] 946 | } 947 | ], 948 | "source": [ 949 | "number_of_clusters = len(np.unique(clustering[clustering != -1]))\n", 950 | "print(f'Number of Clusters: {number_of_clusters}')\n", 951 | "\n", 952 | "number_of_outliers = len(clustering[clustering == -1])\n", 953 | "print(f'Noise: {number_of_outliers}')" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": 12, 959 | "metadata": {}, 960 | "outputs": [ 961 | { 962 | "data": { 963 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAcgAAAECCAYAAAB60kc4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzdd3hURRcH4N9sTTa9N1IoAaQFJTTpHSE06UWainyKDbD3jgVFbAgCoggiKFKlF6mBAKEECKEFCAnpPVvvfH+ExIRs6t5t4bzPw2P2lpmjhpzMvTNnGOcchBBCCClPYu0ACCGEEFtECZIQQggxghIkIYQQYgQlSEIIIcQISpCEEEKIEZQgCSGEECNk1g6gLG9vbx4WFmbtMAghxK6cOHEinXPuY+046hubSpBhYWGIiYmxdhiEEGJXGGOJ1o6hPqJHrIQQQogRlCAJIYQQIyhBEkIIIUbY1DtIQgghtu3EiRMtZDLZHgBe1o5FBBl6vb53u3btzhs7SQnSCtJTMvBEi9koyC4EAHgFeuCXq99CoVBYOTJCCKmaTCbb4+/v7+nt7a2VSqXWDqfODAYD0tPTPVNSUvYA8Dd2DT1itbDs9ByMD5xZmhwBION2FgY7TATtrEIIsQNe3t7eOntOjgAglUrh7e2tQxUjYUqQFvZM5KuVnntt0EcWjIQQQurG3pNjier+PShBWljqjfRKz53Yfhr9JKPRTzIak5o8bcGoCCHEdImJiWzw4MGOwcHBqpYtW6p69OjheObMGUl4eLiqLu0tXLhQce3aNavlKUqQlsZqdtmdq2noJxlt3ljqMY1Oh9eObMXXsfutHQoh9wWDwYBhw4Y5du/e3XDz5s3CuLi4wnnz5mmTkpLqnGdWrlwpu3nzZg1/ahbTarV17a4CmqRjYQOn9ca2ZXtqfP1I32n4M3W5GSOqXw4nXcHEfX+UO7Yg7jA85Q7I1KnLHR/bsDXmPRxlyfAIqbe2bNkil8lkeOWVV0ozVOfOnQ0XL14sTZALFy5UxMTESH755Rc1APTq1ctxzpw5+gEDBujGjRvneObMGQYAEydONISEhAjnzp3D5MmTlQ4ODjh+/HjhqVOnpHPmzFEWFhbCw8ODr1y5Uh0WFsY7dOigatWqlRAdHS0ZOXKkITQ0VJg3b55cIpHAxcUFMTExhRUjrp7JCZIxFgzgFwB+ADiAxZzzrxljngDWAAgDcB3AGM55lqn92bs5P/0P+9YegjpPU6Prc9PzzRxR/XAu/TaGbF9R6fl7kyMArLl2Fq09AzCxeTtzhkbIfeH06dOSiIgIoS73HjlyRJqcnIyEhIRCAEhLS2M+Pj580aJF8i+++ELTvXt3g1qtxvPPP6/cvHlzUVBQEF+yZIl87ty5DuvWrSsCikeOcXFxhQAQHh6u2r59u7pRo0ZCWlparUagZYkxgtQDmMM5P8kYcwFwgjG2E8BUALs55/MYY68CeBXAKyL0Z/c25azEsX9O4tMp30IilSD7To61Q7JLWr0e43auxKnM5Dq38eaJHZQgCbGy5s2bCzdv3mRTpkxxiIqKMowYMUJ37zVnzpyRJiQkoE+fPo5A8SNdX1/f0qn/48eP15d83b59e2HSpEnKkSNHGiZMmFChrZoyOUFyzpMBJN/9Oo8xdgFAEIBhAHrevWwFgH2gBFmqwyMP4c/UZQBQ5btGJq3zLz/1Eucc7dcuQIaRESEhxHpat24tbNiwQV7VNTKZDILw3yBTrVYzAPD19eVnz54tXL9+vfzHH3+UrVmzRlYyMizBOUeTJk0QGxtr9HGpi4tLabJctWqVes+ePdINGzbI27Vr53jixImigICAWq+jE3WSDmMsDMCDAKIB+N1NngCQguJHsMbumcEYi2GMxaSlpYkZjt3YkPdrpef+Ua+2YCS27YHfPkOjVfPES44CoIwrxKAuLyAnK1ecNgm5Tw0ZMkSn1Wrx+eefl1Y8OXr0qLTsLNRGjRoJcXFxEr1ej4SEBMnZs2cBALdv32YGgwHTpk3Tffzxx9pz584xAHBycuI5OTkSAGjTpo0hMzMTu3btkgGAWq3G8ePHja7TiIuLk/Tu3dvw9ddfqz09PXH16tU65TrREiRjzBnAnwBe4JyX+2nDi1fAG83enPPFnPNIznmkj8/9uZ2ZyskB2/VrKhzfqllVb9Yb1VWuWo0RW39Gw98+gRoGcRtngMbPFfHDA9Hllc8w0m86YnacFrcPQu4TUqkUGzZsUO/du1caHBysatKkieqVV15RNGjQoHTI2K9fP31wcDAPDw9XPf3008oWLVoAABITEyXdunVTNW/eXDV58mTl+++/rwOAyZMn65999ll58+bNVXq9HqtXr9a8+uqrimbNmqlatWql+vfff43+gHzxxReV4eHhqvDwcFX79u2FTp061emHBxOjegtjTA5gM4DtnPMv7x6LB9CTc57MGAsAsI9z3qyqdiIjIzntB0kAYNDGJbiQV/maUZNxABoGVlD8RIhpDPBZcxUuJzMw7tXhePzjiebrmxCRMcZOcM4jLdHX6dOndREREeKtpbCy06dPKyIiIow+GjZ5BMkYYwCWArhQkhzv2ghgyt2vpwDYYGpf5P4wYONi8ybHEkX/vYLnSinyIr0BAGs+3YDcjDzEpd7BhvgLiEu9U+umNXp9uXcthBD7I8Ys1i4AHgNwljEWe/fY6wDmAfiDMfY4gEQAY0Toi9wHLuVlmK/xkgcm+VIwofwEKGYoPmmQM4xeswq3oIaEMQicw8PBEcn5ecbfE1RjadRw9GrU2LS4CSEWJ8Ys1oOovD5MH1PbJ8RUc1p3w4ar53C5IAsQBLBUGaAo/+qCaQxwPVY8SSxjaAgK9QXQl0mHRfl5de7/8c1/Y9uEKWjq7V3nNgghlkeVdEi9N6tNV8xq07X088Sot3C0uwsADi5lYBxwPpEO1dniOhZ57X3A6zRWrNzsHVuxecJkUdskhJgXJUhSr/X0D6tw7LfNH2Dtku2Yv+RvGFQyqC7lQJFSvORKIpWAKWWiJ8iETDM+NiaEmAUlSGJzzo6ajdbrvjR6LnrELPiqXBC5bgEyNEVGrykRpnLHst7jjJ4b/eQAjH5yALQaHU7uOoOY7bEIaOSHQU/2xVM7N+PQjURRU2Sgi4uIrRFCLIESJLE5zkolrk18DUvOHsWiC0cR4ROEZb3KVxs6PvJ5vHx4C9ZdP/vffVI5egY1Rs+AxhjeqBWkkuonaSuUcnQa3A6dBv9Xbu7dHr0x8o9VyNHUrF5uTczvP0i0tgghliHKOkix0DpIYisyiwoxbPVKJJkwOafEa1264cl2HUSIihDjaB1k3Zl1HSQh9ZGnowrbJk01uZ2hTZpSciTECkaNGuXo6empqutmzQAlSEIq5aRQ4M8xE0xqY/7AwSJFQ4h92rRou2Js0JOqftLRqrFBT6o2LdquqP4u002fPl23YcMGk96TUIIkpAoP+gdg87hJdbp3etuHavQelJD6atOi7YpFs1fIMpOzAQ5kJmdj0ewVMkskyUGDBul9fHxMeodIf3
sJqUYLXz/snTy91ve93q2n+MEQYkdWfrBOplWX345Rq9Zh5Qfr7GKCKCVIQmog1N0DF//3XK3+wrT6xvhSFULuF5kp2bU6XhudO3dWlezYUfbPr7/+WuWelLVhF1mcEFugkMtx+bk5WH7yOD44+G+116sBPLXhT/w4bKT5gyPEBnn6uyMzuWIy9PR3N7ntI0eOGN04WUw0giSklqY91B7BzjVb+L8z8TrOpiRXfyEh9dCkt0bpFQ7lB3QKBzkmvTVKb6WQaoUSJCF1sGx4zUeFw/5YBVtab0yIpQyZOUA788spes8Ad4ABngHumPnlFP2QmQPMvo4yKirKsVu3bg7Xrl2Dn5+f6ssvv6z1xCB6xEpIHTT29KrV9QuPHsbznbuYKRpCbNeQmQO0lkiI99q8eXPVtShrgEaQhFjAwuNHrR0CIaSWKEESUkeTW7Wp8bUcQEqe6WXrCCGWI0qCZIwtY4ylMsbOlTn2LmMsiTEWe/cPVWsmlVJrdPj8x+0Y+dQivPTRWhw6fhlFatsu9/hu7361uv7h5YvNFAkhxBzEegf5M4BvAfxyz/GvOOdfiNQHqWfUGh2u38zAp4u2I+FaWunxO+n5OHIysfRztw6N8ckrI6wRYrWuPjcHr+7Yhj8uxtXo+pzCQrip6lwakhBiQaIkSM75v4yxMDHaIvXb7kMX8OHXW6Ez1HxW54FjV9B15Bdwd3XA7989CWeV0owR1t68/gMR7O6O+UcPVXvthwf24fMB9DCFEHtg7neQsxhjZ+4+gvUwc1/EhhkMArqO/ALvfLmlVsmxrOxcNQY+9g26jvwCv/4ZDa3OdpZS+Tk71+i6TZcumjkSQohYzJkgfwDQGEBbAMkA5hu7iDE2gzEWwxiLSUtLM3YJqQf6Tlggans/rjqA3uMWYMj077D38EVorJwsu4aE1ug6La2HJMRumG0dJOf8TsnXjLElADZXct1iAIuB4g2TzRUPsS6dXjBLu1k5RXhr/n/fWj06NsEHc4dBImFm6a8y/s4u6Bkahn2J1y3aLyHEfMw2gmSMBZT5OALAucquJUQs+6Mvo/vo+eg97ksMnvotPvxmq8X6Xjr0UXQMCLJYf4Tcz9auXSsPCwtThYSEqF577TWzTEwQZQTJGFsNoCcAb8bYLQDvAOjJGGuL4iVg1wE8JUZfhNSEVidAq1Nj277z2LbvPADAzVmJtYtmQOVonkk+jDH88uhoNPuu8sfJjzRsbJa+CbFVm1ceVKxeuFOWmZYLTx9XjH+unz5qUleT1nDpdDrMnj1bvn37dnXDhg2Ftm3bqkaNGiVt166dQay4AfFmsY43cnipGG2T2kvPL8DhK4lo6uuN5gG+Fc7rDQIOX0nE0as3Eezphp5NG8HP1dmsjyU7PRiGo6eum639msjJ16D/pG8AAA+1aoCPXx4BZydxk6VcKkWAygnJhQVGz383ZLio/RFiyzavPKhY/MEGmU5TPEcgMzUXiz/YIAMAU5Lk/v37ZaGhobxFixYCAIwYMcKwdu1amU0mSGIbMvML0eXTHyscb+rrhds5ecjXGP9+fB97AACeKkc0D/CBs1IBnSCge3gYhrVtAUeF6durffHmKIyauQgpafkmtyWGk+duYeDk4mQZGuiOXxZMg1QqFaXtg48/haGrfkVcxn+TzmSMIX7Wi6K0T4i9WL1wZ2lyLKHT6LF64U6ZKQnyxo0bLDAwsPRzcHAwj46OFv2VISVIO5aVX4gBC5YhT6MDQ/GzbGMupWbUqL3MwiIcvnKj9PPRKzfw65FY/DFzPJyUtS6EX8G6RTORX6DG+GeXIivH5DrCokm8nY0eY74CALw8sw+G9nvQpPYYY9g0cbIYoRFi1zLTcmt13NZQgrRTT//yF/Ym/FdtxhzTf4t0eiRl5+K3o7GY0aODKG06Ozlg07JnSj8fPnkVL3/0lyhti+GzRbvx2aLdAIDQIA988vJwhDSo3c4dhJBinj6uyEytmAw9fVxNajckJITfvn279PPNmzdZYGCg6D8GqVi5HUrLzS+XHM1Jo9djW9wls7X/8EON8PdPMzF2SDuz9VFXiUlZmPD8cnQd+QV2Hjhv7XAIsTvjn+unlyvLj8PkShnGP9fPpIXL3bt311+/fp1duHBBUlRUhPXr10tHjhwp+mJoSpB26JGvllm0PxcH85Z28/ZwxrNTe+Hgn3Oxc+Vz6N05HAq5bX1rvrdgK7qO/AJvz99o7VAIsRtRk7pqZ7w1TO/p61q8YbKvK2a8NczkWawKhQLz58/XDRw40KFp06aqYcOGGdq3by/qBB2AHrHapQK96N8HlXKUyzGxY1vL9eeowPtzhwEAOOc4G5+EWW/8DvOUGai9PYcvQaf7G5+8SrNRCamJqEldtaYmRGPGjh2rGzt2rE7sdsuyrV/Tic1wUsihkEkxoWME+rVoYpUYGGNo07wB/v1zLg6sm4MObUOsEse9Dhy/jPe+2oxbyVnWDoUQYkY0giRGqXU6jO8YgRf7dgFjli3bZgxjDF++NQYAcPZiEv73xmqrxrPz4EXsPFhcePyJ0R0xdVw3q8ZDCBEfjSDt0DM9Opq9DwMHVh49jVbvfo0PN+w0e3+10bp5EA7+ORdzn+xl7VAAAD+tjcYbn623dhiEEJFRgrRDs/o+DGel6Yv3a+q3mHN44K2vcPyqZWbO1tTwge1w8M+5mDWlu7VDwf7oK+C0Uwch9QolSDt1/M1ZmNHVsksjJi//C50++t6ifdbEuKEdcPDPufjrxychE6cYTp3QO0lC6hdKkHZsVt8uFu8zR63BA299hTXHYi3ed3V8vd2w74/iCT2d2tZsf0YxKUUoyUcIsR00SceOyaVSyCUMOsHyj/be3bQX727ai4aebuj9QBPMGdDNJibzAMUTer54azSA4qUiVxPT8M2KvYg5c9Os/fp6u5i1fUKIZVGCtHM7XpyOXvOtt3HKtcwcLD10AksPnUCf5o3w7cRhVovFGMYYGof5YsE7Y0uPHY65jA07TuPkuVso0tRsGVVVtW4B4MOXhpgWKCHE5lCCtHP+7q54e3AvvL9lr7VDwe6LV7Hy8ElMevgha4dSpYcjm+DhyOrXdnLOK4yKT527iTWbo3Hw+HUAgLuLEn/+OBNKC06aIoQAo0aNctyzZw/z8vJCQkJCoTn6oARZD4zv1BaHriZi94Wr1g4FH/2zHzlqLaZ2eUiUHUCsydgj4wdbBePBVsFWiIYQ+7TprxOKlcsPyDIz8uHp5YxJ07rphzzazuTKOtOnT9c9//zzfPr06WarhUmTdOqJb8YPRc+mlp+YYszif6MxbvHv0OpFrx1MCLEjm/46oVi0cIcsM6N4H9jMjHwsWrhDtumvEyb/9jxo0CC9j4+PWSdgiJIgGWPLGGOpjLFzZY55MsZ2MsYS7v7TQ4y+bEWh5griUsYi9lZPXEl/GTq9daf4M8bww2OP4tle5i8iUB2tQcCV1AwsO3TC2qEQQqxo5fIDMq22fO1ordaAlcsP2MXTS7FGkD8DGHjPsVcB7OachwPYffez3RK4FtfTP0Z0YkNEJzbE2ZS+y
Nccg8aQiPSCtTiZ1A4FmgvWDhNP934YFz54Ef4uTlaNgwP4etdhTPrpDxRoRK9TTAixAyUjx5oetzWiJEjO+b8AMu85PAzAirtfrwBgt9sf6A3ZOH6jGe4ULKniKo5LaTMsFlN19r48Azufn2LtMHAiMQlTlv4BwQpLUQgh1uXp5Vyr47bGnO8g/TjnyXe/TgHgZ+wixtgMxlgMYywmLS3NjOHUXfydZ2t0ndZwCxn5O6HWXa9R2bGs/BhEJ7YoHZWeT54Ivb7I1HBLNfD2xIUPXsS+uU/gud6dRWu3tuKS09D6nQVYdiCmxssqCCH2b9K0bnqFonx5K4VCiknTutnFBAUmVv1IxlgYgM2c81Z3P2dzzt3LnM/inFf5HjIyMpLHxMSIEo+YohMbmtyGm3w08nS7IKD6d5Ut/TfDUR6KxKyPkVGwAZzr4ebYHWGe70IpC6pzDI9+uxIX7lj/l5DR7VrhvWF9baawACH2jjF2gnMeaYm+Tp8+rYuIiKjxexNzzWKNiopyjI6OZllZWfDy8sIrr7yinz17dq3bPX36tCIiIsLoOi1zJsh4AD0558mMsQAA+zjnzapqoz4nSLEEub2IQLeZkLDaTwJT63R48P1vzRBV7fVq1hDfT7Lbp+6E2BRbTpC2rqoEac5HrBsBlLwEmwJggxn7MisHaStrh1AqKecrnLwRVad7HeRynHzrGZEjqpu98ddwLinF2mEQQkilxFrmsRrAEQDNGGO3GGOPA5gHoB9jLAFA37uf7VIL/9+tHUI5BiQgOrEhBKH27/McFQrEvfcCHgw2+krYosYssu6mx4QQUhVR1qJwzsdXcqqPGO1bm1xm3SUTlTl+sykAOcK9v4OnU78a3yeRMKyaMQEA8P6GHVgdE2emCKvGAWw9fQGDIh6wSv+EEFIVqqRj93RISJ9ROhM2OrE5rqW/A72QW6O73x7WH+fffwFNvK1Tx+GVv7ZbpV9CCKmOXVQzsAVKSTNohHhrh1EDGqQW/ILUgl/QNjAaSrlvtXcwxrDp+akAgOvpmXjk6xVV3yAiPa2PJITYKBpB1lDLwDXWDqHWYm93RHRia8TfeRLp+esh8OrfWYZ5e+LMO8/hAX9vC0QIi/VDCCG1RSPIGpJL3awdQh3lI1u9C9nqXUjO+RktA9dBwqremkkuk+KvZx4DANzMyMbwb39Bod5Q5T119cfMCWZplxBCTEUjSCsQaelprRXqz+D4jQdQpL1Z43uCvdyx/5Wn8PKAbghydxU9JplUWv1FhBBiBTSCrIUGLm/gVt5HpjfEi2dwVl9Iprp97OvCgDPJ3eGpHIdw/09qdIezgxLTukZiWtf/1iHrDQL0Bj0OXk7Eb0dicfT6rVpH8ljHiFrfQwghCQkJkkmTJjmkp6eDMYbJkycb3n77bY3Y/VCCrIUgzydMTpCcA4eXeUPlbcCDwysrOyeFi0Mn+LtMRUrOSuRp95vUpzGZmt+h1j4HB0VAne6XSSWQSRXo2yIcfVuElx43CAJ+PXIS83ccgF6ouo3Xo3rXqW9CiP1Yvz1W8fPaI7KMrAJ4eThh6ujO+hED2ppUiUcul/MvvvhC061bN0NWVhbatm2rGjx4sLRdu3aivguiBFlrUgB1/39g0DHs/y4I/WY0QwvfCcgo2gC9IRNOytZwkIXCQd4QjvLGpdd7qvpCrbuBjNyduJU/D4B4NX7PJQ9HZGi0aO0BgFQiwdQukZjaJRKCwHEtPRPTl61FasF/RdhlDDjxZs0KwBNC7Nf67bGKb5bvlWl1xT8zM7IK8M3yvTIAMCVJhoWF8bCwMAMAeHh4oEmTJjwxMVFCCdKKiuvW1uy/vyAAknve8Bp0QOx6TxRmy9B7xONwcWwKF8fqHzM6yEMQ5PU4grweLz2m1+txIim8iruqZ0CqSfdXRyJhaOzrhf2vzjRrP4QQ2/Tz2iOlybGEVmfAz2uPyEwdRZa4ePGi5Pz586xHjx6ibxVEk3QqIQj5EIr2QSg6DsGQfjc51ux9YLjnT8hPkUOnBrhQ/FhVMAAn/vDGprdCAQAPdDAtuclkMrQNPAnTfsehCTKEEPPJyCqo1fHays7OxsiRIx0++eQTnZeXlyhtlkUjyDK4kAuePgMQTlY8V8M2/CWt4F7wP3QP4vj5e38kXHJA1k0H3D6rgl5b/PsIk0GUrZ6Ucg90DE1ATtFZXEwdWuv7neUdTI6BEEIq4+XhZDQZenmYXr5TrVYjKirKcdSoUYapU6eaZaNZGkHexQ3J4KmRRpNjWZEKWaW/VTSTSRAsL662I1cwPPF8Bh4dkQvkymDQl8xI5Xj0ZXFrj7o5tkaHkCu1vs/fdbqocRBCSFlTR3fWK+T3bJgsl2Lq6M4mTaYwGAwYP368Y9OmTfl7770n+uzVEvf9CJJzDp67Hih6tUbXM8bQViHDFZ0e+RzwlDAESiWQMlZhVMgY0KlfHjr1iwfngE4LrPvBE4++/Kbo/x6MSfBgUAxOJdV8SzhPp16ix0EIISVK3jOKPYt1586dsr///puFh4ez5s2bqwDgvffe040dO1bUkaRoGyaLwdIbJgvqs0D2SIv1BxS/j2RMDuZ3FoyJP4DnnON29grcyv0cQGElVzE81OAE5FLrFCgnhIiLNkyuu6o2TL5vR5BC/kog/32L91s8yNSB3xkA5r/TDO0zBHlMRZDH1HLHBUENgEEiUYreJyGE1Ef3VYIUDDogrT+AJGuHAiARQkp3MN+/wSSeZu9NInEwex+EEFKfmD1BMsauA8hD8QJCvaUeA9xLSJkAwHKPb2smBTx1FJj/HmsHQggh5B6WGkH24pynW6ivCoSU1gDMNtHJRLfA9VfAZI2rv5QQQojF1PtlHkJKO9hucizGtResHQIhhJB7WCJBcgA7GGMnGGMzLNDffx1zjuKnu3XFALiX+SwHJH0AiPzOUBYsbnuEEEJMZokE2ZVz/hCARwA8wxjrXvYkY2wGYyyGMRaTlpYmasdcE2fC3UGQ+McDLk8BKJngogOE3QAyaxdHNStpmLxNXQIkhBBiRmZPkJzzpLv/TAWwHkCHe84v5pxHcs4jfXx8xO1c6l3HGyWA91JwoQDI+xqAuk6tcA7cuqJAYX4VZeUUj4pSdo4QQoi4zDpJhzHmBEDCOc+7+3V/ABZbfCiR+6OaLQnLkAOQAfJugNs7kMh8IKiPo67JESje0eOHt4Pw5uJEGKvmyjkgcRe/qg4hxDIWvvY7/lldccs4hYMMc+ZPQPfBDxq978ius0i7nYWoSV0huXfbH1KtgoICdOrUSaXVamEwGBAVFSUsWLCg7j+sK2HuWax+ANbfHSHJAKzinG8zc591oABc50GiioIg6ADNHgj5sYB6mUmtXj3ngHPRKmSmyhAQooWkTElCjZpB4J5wkjibGDshxBI2/XIASz/dBEEnAIxBp6m8nKhWrccnz/yCuGNX8b/3/qvW9ds3O7By/j+ln394ez0AoNlDIVjw14vmC96K1u4/rViyNVqWnlsAb1cnPDmoo350D9Mq8Tg4OODAgQOF7u7uUKvV6NChg2r37t3SPn36
2M9+kJzzqwCq3/DQnJzmAQXV1VnVArmzIeTOFq1bzoGEcw4QBAnenNQIn/5xBc6uBnAAcjnH7j890H746zC9pn39lV+oxie/74HOYMAb4/sgPbcQt9Jy4OfhjEYBXlDIq/725ZyjSKODg0IOiYQeY5OaycsuxOXzSXBQyZFw+iZ+/OAvCHUsrb1xxUE0bBEElYsSy+dtQsqNLKPXxZ+8gUfCXsSa2A/g6l5/fmleu/+0Yv66/TKtvjhvpecWYP66/TIAMCVJSqVSuLsXT6DUarXQ6/VmeVVV7yvpSFwehSAEAkWTLd731l+9oFVLcPuaEpM7PICWHQrg5qnHhRMq9B2TCb/GQywek7XEXknC3lOXEezrjpYN/eDv5goPVxUEgSMxJQN/HTyNXacuQyKRolCjRU5B+aU5u05eFiUOBmDagPbwdnNCoJcrOrUIhVwqpffA90cAKFcAACAASURBVBmDwYDPXliJfzfFmr2vr19ZU+Nrx7Z9C/9c/8qM0VjWkq3RpcmxhFZvwJKt0TJTR5E6nQ6tW7dW3bhxA5MnTxZ69+4t6ugRqMcJUkiZDuCgVfrmHLhzS4aEM05ljjGci3YGwOHpp0PfGUusEpu5cc5x6Ow1/Lb3JJLSs3Er3ZRlNuLjAJZtP17p+Vahfvjl1QmWC4hYjMFgwJiIN1GYL/qrKlElXkpGaNMAa4chivRc4xsjV3a8NuRyOS5evFiYlpbGoqKiHI8dOybt0KGD/TxitSTOOXRaPaR5s8D0e60UhRRgXsjQvIsXh60Bk2SDl84SKp6k4+LpiF8TV0GhUFgpxupxzqEXBEiZBPlFGqgcFJBJJdDo9Nh54hLOXE2Gs6MSeUVF8HBSoV2zYDQJ8MJbP2/D0Ys3rB2+Sc4l3kGnWQtw9NsXrB0KEckHTy3D4e1nrR1GjUXvias3CdLb1cloMvR2Fe/lko+PD+/atath06ZNMkqQ9/jhpeX4a/7Wu584PH11+PwvBRo0ssJuLG5fQOI4GL4AVlzuhv1/HMalE1eQn12A5p2bYehT/SCVSqttxlr0BgFPfLkGZ66m1Oq+n7YdM1NE1qE1cJy7loxWDevHD6n7UdLNFDzR7VNrh1Eny+dtwZiZfa0dhiieHNRRX/YdJAAoZFI8OaijSRsmJyUlMYVCAR8fH56Xl4d9+/ZJ586dK+pekICdJ8idK/aWSY4AwJCZKseMns2w+tR5uHmJ/ki6chJvMIdBpR8dVEoMmNoLA6baz6bEEz76FZeTa1cEob76fO0+rHh5vLXDIHUwtPlc6NQW/LtvBqcOxePBLs2sHYbJSt4zij2L9ebNm5Jp06YpDQYDOOcYOnSoYfz48ZQgy/ps2vdGjjIY9MDyef544XMxt7VigKwf4NgHKNoCGM4DPAuAFHAYBLh+ZPMTPfaeisfLP22FQQAc5FIsnTMGD4T6AwCSM3IpOZbhqLDrvxr3rR8/+NPukyMAHNgSWy8SJFCcJE1NiPfq1KmT4cKFC5XtCC+aevpToGRCTF2UPAJtAnj9CIk8sOIlTiPqGphV5BWp0WP2D+WOqXUGTJy3Gj6uKmybNwO/7T5ppehs07zHB1s7BFIHfy8Vf2JeQKgXcjLzoSnSQyaXQFMk+kClgl7DrLIrILlHPU2QHIFh1ezgwToC7u8A2n0A9IDTBEgkrpYIzuIeee2nSs+l5RbipcUbIZXU02+FOnBWSOHuorJ2GMTSJICntzM69WuFAeM7oWmr0NJTNy/fxv8GfgFNkWVGp6070vZ3tsCufyo2jAjFtdOJRs9NffV2+QOKd8E8xld8DKpsYqbobENekQaFmqp/490TexWzR3bHzpOXLBSV7XJXybFn/ixrh0EsjDEgrFkA/Bp4QqsxYN6slUi+bp0tbNec+tAq/ZKK7DpB/njycwxUjoOgL19x1dXLGY17HgGYExi7v+sc5hbUbM3XleQMM0di28IDPfHzyxPgqJRbOxRigs/+eBovjzE2N6FqnAPXLiTj2oVkM0RlXIPGPtCotUhLygEAvPDpOAwY29Fi/ZPq2XWCZIxhu3YNbl9PwWsDPoabtzMWHvrY2mHZFD8Plxpddz2lfk/QebBxAD59MgqOSgVkEgku3UqFp6sKQd7u1d9M7EbrDuF44vUo/PTxZmuHUqWJzw/A2Gf6Qk6TwWxavfi/ExjmjxXxC60dhk2SSSXo/EAojlww/ii6RKcHQtGxWQgW/1NxZwJbxgC4Oinh6aJCoJcLGnh7IKpTc6TlFEIqlaBNQ3+4OVV8n9i6kZHJV6ReGDmjDx59sjeimsyFYKj5fj7mwhgglUnh5uWEd5Y+ifCWDawdEqmhepEgSdW+e+5RTP50Fc5dv2P0vFwqwZgeEXB3dsSKXTHQ6GxzmnzrMH+oHOR4fEAHRDYPsXY4xIYV5musmhyZBHjp6wnoNaS91WIgpqMEeZ/45ZUJMAgCRn/wM66n5JQebxnqi3lPRMHj7qzNPZ//D11e+NYqMSplDEFeHgjzd8eA9s3g7eqMVmH+kFezawch91IoLfQ9wwBwwEGlwDvLHkfbTk0t0y+xCPrJcx+RSiT4653pVV7jqJTjo6n98cbPO8wSw9CHW2DGoE7498w1NAn0RGQzGgkS8ckVMjz8SBsc/ueM6G07uyrx7tIZaNm+kehtk9rR6XRo06aNyt/fn+/du7dI7PYpQZIKHunYEicSkvHXodoXeJZJJHBRKdAy1B+T+0ciwNMFDko5vFzKFyce16utWOESYtRLX07E/+JuIeWGOBPQpHIJNifMF6Wt+8nvx04rvt8bLUvPL4C3sxOe7tVRP66DOJV1PvzwQ2WTJk14fn6+GM1VQAmSGPXmpL54c1JfvLVsMw6cS0RuUeXfz73aNMIr43vD171mM2YJsQQHRwWW//sWUpMy8eaURbh5Oc2k9pbsrm7jdXKv34+dVsz7Z79Mc7dYeVp+Aeb9U7xhsqlJ8vLly5IdO3ZIX3vtNd1XX31lllxm9gTJGBsI4GsU13D7iXM+z9x9EvF8MD2q9OuCIjVe/H4jYi4X17h9e2IvDO9KI0Fi23yDPLF41+sAgKICNdb9uAcnDyVAqZTD1cMJ7l5OOHUwHreu/lcYwMvfDUX5aoAxDJ7YGdNfHWqt8O3a93ujS5NjCY3egO/3RstMTZCzZs1Sfvrpp5rc3FyzLXY3a4JkjEkBfAegH4BbAI4zxjZyzs+bs19iHk6ODlg8Z4y1wyCkzhydHPDY7EF4bLa1I7k/pOdXsmFyJcdravXq1XJvb2/evXt3w+bNm+0zQQLoAOAy5/wqADDGfgcwDAAlSEIIqee8nZ2QZiQZejubtmHygQMHpDt37pQEBgaqtFot8vPzMWzYMMcNGzaIOlHH3HXYggDcLPP51t1jhBBC6rmne3XUK2XlN4lXyqR4updpGyZ///336jt37hTevn278Oeff9Z17tyZi50cARuYpMMYmwFgBgCEhNCUf0IIqS9K3jOaaxaruZk7QSYBCC7zucHdY6U454sBLAaAyMhIbuZ4CCGEWNC4DhFacybEqKgoXVRUlFk26TT3I9b
jAMIZYw0ZYwoA4wBsNHOfhBBCiMnMOoLknOsZY7MAbEfxMo9lnPM4c/ZJCCGEiMHs7yA551sBbDV3P4QQQoiY7u/dhAkhhJBKWH0WK6mIc46/EqOx4NJWaAU9vGTOeKP1SHTxa1bnNgUu4EzGdcTn3ka2rghRQQ8iyNlbxKgJIaR+oQRpY7bdjsXbZ/4odyxdn48XT62AismxvtdL8FA4l57jnCM+JwnPx/yCfEMRwp388IB7A8RmXkdiYQb0ML6349JrewEAO3q/AXeFaYt2CSGkPqIEaUMOpcVXSI5lFXIdBuz5uMo2zuffxvn82zXus/+ej3BsYNVtEkLI/YgSZBmcc4w/sABXCytW/WcAfJSuCFV5QQIJ0nX5CFV5Y3zDrojwCDW571uFGfjo7F8mt1MX1/JT0dDZ1yp9E0KIraIEWUbfXR8gz6A2eo4DSNXkIlWTW3rsSv4d7EmNQ5iTD37r8izkkur/cxbpNbhZkAFfBze4K52gFfR47dQqHE1PgI4bfxxqbqcyryHI0QOH0xOQoc5FuGsAWruHgDFmlXgIIaQ6gYGBKicnJ0ilUkilUsTFxRWK3QclyLuu5d6pNDlW53pBGj4/vwmvtxpR4dyOm7H46Px6FHGzFHoQxcLzWzDv/Aaj5/p4PYBP2j9m4YgIIfXFb2diFd8cOypLKyyAj8oJz3bopJ/Ypq0olXX2799fFBgYaLYKbJQg71p746hJ92+7HYvnmj2Cby5uxd9JMbCnmnmFqLxu8O6MC+i47XVE03tKQkgt/XYmVvHhgX0yjaH46VhqYQE+PLBPBgBiJUlzonWQdzV3M22TEZ2gR+/d72O9nSXHmuAAjqTEWzsMQoid+ebY0dLkWEJjMOCbY0dNHpwxxtC3b1/Hli1bqj7//HOFqe0ZQwnyrqHBkSbdb6h3abG8ObG/WjsEQoidSSs0vjFyZcdr48CBA+rz588Xbt++veinn36Sbd26VfQnopQgy1jU7klrh2CzDBCsHQIhxM74qIyvsa7seG00atRIAIAGDRrwQYMGCUeOHJFWd09tUYIs4yGfhjg28GM8H/4IXKVKqCCHFDSTEwD6+bW2dgiEEDvzbIdOeqX0ng2TpVI826GTSRsm5+TkICsrq/Trffv2Sdq0aSP6b/E0SceIiY27oY17AzwTsxwGXr8fndZUqjbP2iEQQuxMyUQcsWexJiUlSUaMGOEAAAaDAY8++qhh9OjRoi8VoARZhk7QY+qh75BQcMfaodicVHWOtUMghNihiW3aasWesdqiRQshPj5e9HWP96JHrHdlawvQZcfb911ybKzyxeMNe1V7XXvPxhaIhhBCbAeNIO9678w6a4cguoH+ERgc9CAWXtyGhIKUcufCXfzxbpvRCHcJAPBf8fLKPN6kt2hxLUtYgf1Z/wIAGqsa4c2Wr0HC6Hc1QohtoQQJ4GreHRxOt711fgyosHiEAZAwCZq7BkElkeNCbhIEQYCbTAU9BOhgQJiTL15vNRxhd+ur/ubTtPT+fL0aBkGAm0JVrt3nmg7EwkvbjMbxa+dZ8Hd0r3X8OkGHy/lXwDlHE5fGuJx7BZ9e+qLcNVcKr2La8SfR06sbcg35KNAXwFPmCT9HXwwM6A9HmWOt+yWEEDGYLUEyxt4F8CSAksrfr3POt5qrv9oSuIAph75DfH6ytUMpRwKGxxp2wzPNBgIAbhZkYPnVvZAAeKpJf/g4uprUvrPMwejxSY26QycI+OHyjtJjSibDP71eg7Oi9klqZ8ourLqxBkINl4fsyzhQ/kA28HfyRgBAD5/uGB8yBo7S8nFczL2IO+pURHq0g5OctuwihIjL3CPIrzjnX1R/meX13PEe1DZWH1UhkSHMyQfTG//3ODPYyQtvtx5Vo/sNXIAErM5Fxqc16YlpTXrW6d6y4nLO47cbv4OLVDxhf9q/uFl4E2+3eAOMMVzOu4wPL8wrbX/Z9RWQQILe3r2wK313uXvHBozGoOCBosRBCLm/3JePWK/kJNtccvSRu2B2yyj08G0BmaTm613Ppl3H4ycWlzs2MKAt3m0zymrv9f5O2ihaciyRVHQb8bnxaOzcGB9c+KTCeQFCheQIAGuS12Jd8l9Y1mFxhXOEEFIVcyfIWYyxyQBiAMzhnGfdewFjbAaAGQAQEhJi5nCKvXfOtibk9PZuiXmRE2t1z/nMG5h6bJHRc9uSY6ERdPj0wdq1aYrYzDNYfOUnFHDTS0gZoxE0+CT+8zrda4ABU449jqF+gzEy9FGRIyOE1FcmJUjG2C4A/kZOvQHgBwAfoHieyQcA5gOYfu+FnPPFABYDQGRkpEVW5TvJlGbvw0OmwrPNHoG70gmNnP3w7PGluFmUWeE6BaQ1To57k8/hvXN/otCgqf7aO3HQGHRQSuXgnCMm/SpWJx5EgUGLMSGd0du/ZZ0exXLOkVR0G2ezzmHT7S1mS4jmsPHOFmy+8w+mhU2BjEkR7toEPkofa4dFCLFRjFugUgxjLAzAZs55q6qui4yM5DExMWaPJ1tTgP57PzJL264SR2zs9RJU8oqTYcYdWICrBamln11kDtjR+w1IK3mkqjbo8PSxJTiXc6tOsYSovHCjMMPoOTmTYm23FxGo8qy2nRxtLjYlbcbetP3QV7E1lj0KUPhjePBQtHWLgEMlE5gIsXWMsROcc9N2XKih06dP6yIiIqy+VVVaWhp77LHHHOLj4xljDIsXL9b27du31j+gTp8+rYiIiJAbO2fOWawBnPOSKaIjAJwzV1+15a50QqDcDbd14laH+a7ddLT3aVLp+d+7vQAASCvKxbobR5GmzcVf148iWZMNnWBAK49QnMq6hui0BGRo8qDmpiWjypIjAOi4AdOO/IDtfd6o9JrMokzMPvuy6O8TbUmyNgU/XDH+fnJSg4noE9CT1mgSYoKVl04qvjl7UJamLoCPgxOebd1VP6npQyYn2BkzZjj079/fsG3bNm1RURHy8/NFL5xtthEkY+xXAG1R/Ij1OoCnyiRMoyw1gizx08VdWHx9j2jtScAgZ1IoJFIMa9Ae08N7Qy8Y8NSRRbhWlAEFJNDa2K4YG3q8hABHjwrHXz31BpJ1KUbuuH95yb0wPGgIuvp0oaRJbIqtjiBXXjqp+PDELplG+G9PSKVEijfb9TUpSaanp7OIiAjHGzduFEqlpm3iYZURJOf8MXO1LZYnmvfFE837Qi8YMPv4ChzNumxSewI4NFwPjUGPlYkHsTLxYLnztpYcgeLCAWUJXMD6mxsoORqRocvA0us/Y+n1nwEAjswB/fz6YGjQEMilRv9+2bxMdRL+vDkHhYb0MkcZJJCjp88stPYaZLXYiP375uzBcskRADSCAd+cPSgzJUHGx8dLPD09MWbMGMfz58+z1q1b8yVLlhS5ubmZHHNZ9+Uyj3vJJFIs7Fg8f+h4xhUsvPAP4vNvWzkq85OAoaGTb+nnO+o7mHfhC2TqKk4mIhUVcTU2pmzBxpQtAAAJJJjX+iP4OfpWc6d1GLgeakMubuSfwq6UzyFU+j6ZQ4AWe9K+xJ60L9HP92W08Oxv0VhJ/Z
CmrmTD5EqO15TBYMCFCxfw9ddfa3v37m2YNm2aw1tvveWwcOFCdfV31xwlyHu092qMX7vOAgDcyLuD2SdX4kZR5e/y7BMHwPBss0dK11xyzvH+uY+RL+RbNzQ7JkDAy2dfQzePLni8ybQ6F2wQS5EhB0X6bDBBhl8SpwMwVHuPMTtTP8PO1M8Q4tgeI0IrrkElpDI+Dk5INZIMfRxMq3wVGhoq+Pr6onfv3gYAGDNmjH7evHmiP8ahBFmFEBc/rOsxBwCgMejQbec7Vo6orjhKqroycLgqiuDrWIgdacuxI205AEDFHFHIi6wYY/1xIOsQbp1PwqvNX4KD1HIzY/WCFgW6TKy/+Qpy9Emit3+j6DhWXJ6OKU2Wid42qZ+ebd1Vb+wd5LOtu5o0AzE0NJQHBAQgNjZW0rZtW2HHjh2y5s2biz6hhhJkDSmlcmzq/jKG/PuZtUOpVoDSDd91eAJpmlxcybuDs7n7kaS+Bl0V1YMoOYrrWsF1rL35Jx4LM1+xhiz1bWy88QayhZtm6+Ne2fobyNWkwlVpm4+RiW0pec9ojlmsCxcu1EycONFBp9MhJCSEr1q1StTHqwAlyFrxU7kjesBH+Pr8Zqy7dRwCFxCgdENT90DsvhNn8XgYAAeJHAGOHujsHY6xYV3K7brRwMkLDZ09sS11OXQmLhmxBVJI4aFwh6fCE529OqGrTxcI3IDzuRex6vrvSNOlVd+IBR3OOCJ6gizQZeJK9hHszfhK1HZr43zONnTynWy1/ol9mdT0Ia0YCfFeXbp0McTFxZl102RKkLXEGMMLLYfghZZDyh1PyE3G09E/Iccg7kjMUSpHiMobAwLbgnMOPTegnWcjtHYPqfIdl8AFqHVqHEw9YPfJsY1ra7zY9DlIJMaWVsjhp/SxueQIFBePr45OUONmwSkYuBZhTh0gv2fHEo2hAHHZ/yAh719kaW5Aw63/jjit8Jq1QyDEIihBiiTcNQA7+70FADh65xJePrUS6lpWnXGSKfFhm7FI1+QhIS8FWkGPHn4t0Nk7vNp1d5xzxGSeQHTmcdwqTEKyxra28TJFoVBYSXIstuTqcgtGU3Nt3SOqPH8xeyd2pHwBXmbyjANzhZrnAeCQwRF6qFFxV1Druqo+gK1JH2JQ0JvWDoUQs6IEaQad/Jri34HvAygeyS29vAdLrlRdkKCNWwg+enA8/Bxqv45HZ9DhlbOvI0NbP5dnXMu/XuX5O5o7lgmkFtzl7hgfMqbS85maG9iR8jn4PWtj1Ty39Gs9bPe9cELePmgNs6GQqqq/mBA7ReVAzEzCJHgyvC8O9HsPUxp2h6Ok/EzkZk4B+LfvO/ip88w6JUcAWJH4a71NjkDxbhy5ujyj59QGNTQ1KN5uSVImxUvNZsNDUbFCUYmzWVsqJEd7cyz9N2uHQIhZ0QjSQpRSOZ5pNhDPNBN/897ozOOit2krSkohzjr5PBhjGODbD6NCRkJx9xeNH68shaGO6/vMRcakyNMbT+gligwVdn6zO0VCbvUXEWLHKEHaqVR1GpZe+xmX8i5BsPORSGVKkmPZyUjbU3ciWZOCOc1eQIG+ACezT1orvEoZuIBAx4Aqr2nk3AXxeeLVAbaGVm5Uho7Ub5Qg7USeOg9zz7wKNURf6mNzuI4DUgAMRmfqxuWcx+2i25Ay04oUm0t3n65wk1f+uLxAnwGVzL3S8/bAReYHf8fm1g6DELOiBGnDNAYNdqfsw5qkP6wditlND5uCHr7d8dHqz3B8cyxcn3WodBmLAAG3ipLQRNXYwlFWL8wxFI+FGl/7mK9Lx983X0OG9jpsbWZqbYQ79cTABq9ZvZQeIeZGCdJGnUo/gwVXv7Z2GGblwBzQxLkRpoZNgY+jNwDA64QfClbr4DLTAUxp/D4JJDDoDXjxzEsWjLZ6CokCTzV5wuiSnJT8S1hz62krRCWuIIe2GBRMyzuIdcXGxkrGjRtXWsfx1q1beOmllwzvvPOOqDP2KEHaoKt51y2WHMMcw3C96LpF+rqXmqsRn5eAbXd24LGwCQCAdn3bYP3XW1HwuwbOk5UVRykc8FR6YtH1JVaIuHKhqhBMCXsMgY6BSCo8i/13vke65goYJFXsmmFfunrORDvfUdYOg9iZdYnRiqVX98gyNHnwUrrg8Ua99aNCO5pUWadt27bCxYsXCwFAp9MhICBANW7cuMpradYRJUgb9MGFj8zeh7vEHV9HzofABcw4/jR0EP17q0Z00GF/6n708O2GEFUw2vWPQMuHm+HsDxcgayyFQ5fy36KMMaRpbatqzkC//hgfOhYAkKa+jL9vvgo9L/5F1t6XcpSQwYGSI6m1dYnRigXxW2RaofiXxAxNHhbEb5EBgKlJssTGjRvlwcHBaNasmeh/2UxaB8kYG80Yi2OMCYyxyHvOvcYYu8wYi2eMDTAtzPuLJWal9g/og/jceHwe/yX0Vh7h6KDH6ewzAACJRILPdr2N5xfNgNMSDxSMARR/O6GjooNNvvP6X+MZGFemIEB0+krouehlJ61uRMjn1g6B2KGlV/eUJscSWkGPpVf3iDY4W716tWzUqFFm+SFmapDnADwK4MeyBxljLQCMA9ASQCCAXYyxppxz21qwdh9bm/QXuA1NFFFKFKVfy+QyRM3oh8FP9sXeO/vxy42VOKazrbWerlIXfNNuQYXj6ZqrEGsCDoMUEiZFhPswdPWdUfoLQnzOfuxM/hQSJgUYwLnBrEm5pWsUAlUPmK19Un9laIyvB67seG0VFRVh9+7d7KuvvjLLIzCTEiTn/AJgdCr+MAC/c841AK4xxi4D6ADgiCn93S+8ZJ7I0Ju3Mo6lkqMU0hot5O/g2R6F+iIcSj+MHH0OMorScTgr2gIR1l5TVVO80eoVo+e8lGHI0d02qX0veWN09p0MtSEPwaq2cFX4lzvfzK0HGjq3R2JBDDgEhDpFYlHCCJhjZmwnz2no6Gu+LbtI/ealdDGaDL2ULqK0v27dOnmLFi14cHCwWX6gmesdZBCAo2U+37p7jNTAZxGf4PETT1k7DFHUtMrN87FzzByJeNJ0qRC4AAmTQC9ocSXvIHYlfyVK7dQwp44Y2uDDah8nK6QqhLt2L/38WNhy/Hp9qsn9l/VU4/VwkIvzg4zcnx5v1Ftf9h0kACgkMjzeqLcoj0R///132ejRo832ZLLaBMkY2wXA38ipNzjnG0wNgDE2A8AMAAgJCTG1uXpBJpVhefslmHn8GWhQ/95n2bt8fQGuFlyDVn8FO1O+BER8hzssuG4TtDwdGuCxkJ/x642pJsfQx2cOWnk9YnI7hJRMxBF7FisA5OTk4NChQ2zFihVmm2FYbYLknPetQ7tJAILLfG5w95ix9hcDWAwAkZGRtvNSzMokTILFHX7AiYwTWHjle2uHQ8pgDLhdeAlns76FqMkx8BOT7vdUNcDzzXeBc47Uoss4nbUBCXn/Qo/q95RlkGF40McIcXnIpBgIudeo0I5asWasluXm5obs7Gy73DB5I4BVjLEvUTxJJxzAMTP1Va+182qHFV5LoTfoseDiNzhbcK7ObUkgK
TdDVg651ZZ32DPOgRztidKlHKZikGFS6BJ4OgZXf3FN2mMMfqpw9FfNRX/MBQAU6LKw+sozKEBquZ4nhi2Bt0OYKP0SUt+YlCAZYyMAfAPAB8AWxlgs53wA5zyOMfYHgPMo/hX7GZrBahqZVIa5LV8EUFzEO1ObiV3Ju3Ey5zQ4N8Bb4Q1vBx+0cW+Dtu6tIZP8979WL+ghcAEX8uKRo83B7aLbSNdmoLlrM0R6PIRP4j5Dis729lS0liBlIAQISNakVDgnYzJMDpuE1MI/RelrZvhGKC2wp6KT3ANPNF9l9n4IqU9YyY4JtiAyMpLHxMRYO4z7EuccW5K2YO3t9dYOxepmNfkf2ntGQuACrucnYtudHUhRpyBEFYz+/v0QogpGTMbvOJy2HLzOW20xPN98p6hxk/sXY+wE5zyy+itNd/r0aV1ERES9mRxx+vRpRUREhNzYOaqkQwAUP5aLahCFqAZRiM+Nx8cXP7N2SFbR1i0C7TyK38NJmASNXBriaZeKM4pbu0fhUNqyOvfT39/4MhFCiO0wqZIOqZ+auTbD6ICR1g7D4p5pNBMvNH3WaLHxeyUVngVMqHj0gHtd5r4RQiyJEiQxKip4EEYEDLN2GBYToAhAB+/2NS5nt/9O3WcWjwlaWOd7CSGWQ49YSaWGBw/F8OCh4JyXJo7YjFh8deUbK0cmvlcfmFur69WG/Dr108FzCgJcWtTpXkKIZVGCI++5AwAADixJREFUJNUqO6pq69UWK7yWgnOObF02/rm9A8ezYpCpM29pPHPylHrAXeleq3uCndriSv6BWvfVyWdSre8hhFgHPWIldcIYg4fCAxPCxuKrB+u204OnzBOD/AbCX+FXp/sbOFRfvdBH5oMpIZPgKjFeMi3S9SF81e6LWvddl0k2zRz72uSOJITYo3fffVfZpEkTVXh4uCoqKsqxoKBA9D5oBElE4SnzRGY1BdY/bfURPJQe4OBwkJZuBo6xoaMBAAZuAOccBi4gNiMW14quQQIpbhbdQr4hH55yTxQYCuAqd0V/v35o4tIIxzNi8O2VHyr09XazN9DQNax0wk1v/14AALVBDT3Xw1nmbNK/r0LmgDHB3+KPm7NqdH1vj1fQ2q+fSX0SYo9239mr2HB7kyxHlwM3uRuGBQ7R9/HrZdIykatXr0qWLFkivXjxYqGLiwsGDRrkuHTpUsVzzz0n6vITSpBEFK+3eBlzz7xa6fkXwp+Hv8pYSd//SO9u3yQD0NG3AzqiQ7X9tveKxAqvpTWOs2xiNlWAU3M812wnzqRvxf6Mb8ChhwJuaOLSFT38noJCZv4CAITYst139ipW31gj0/Hiil05uhysvrFGBgCmJkmDwYDCwkKmVCp5UVERgoKCRF/UTwmSiMLHwQevNX0Zn1wqv37yieCp6BbQzUpRmR9jDBE+gxHhM9jaoRBiczbc3lSaHEvouA4bbm+SmZIgGzVqJDz99NOGhg0bOiqVSnTr1o2PHDlS9LqZlCCJaJq7N8OKDjUfzRFC6rccXU6tjtdUamoq27p1qzQhIaHIy8uLR0VFOX7//feKp59+WtRHrDRJhxBCiFm4yd1qdbymNm3aJAsODuZBQUHcwcEBw4cPNxw+fFj0fEYJkhBCiFkMCxyil7PyZU7lTI5hgUNM2ieuYcOGwqlTp1hubi4MBgN2794tbd68ed1LW1WCHrESQggxi5L3jGLPYu3du7chKipKiIiIUMlkMrRs2ZLPnj1b9ALqlCAJIYSYTR+/XlpTE6IxCxYsUC9YsEDsZsuhR6yEEEKIEZQgCSGEECPoESuxe3l5eXjUbXq5Yx0HP4QPN71mpYgIIfWBSSNIxthoxlgcY0xgjEWWOR7GGCtijMXe/bPI9FAJKSYIAtRFGsxs9xL6SUZXSI4AEL3lJN4dWbcasYQQApg+gjwH4FEAPxo5d4Vz3tbE9kkN5OQUYuLwhdBo/ps53bJ1Ayz4car1ghIZ5xw/vLgM6xduq/E9h9YfM2NEhJD6zqQEyTm/AIB2KLCwxd/txNrfoqu8Ju7sLcx47Ecs/vUpC0VlXotfWlGr5EgIIaYy5zvIhoyxUwByAbzJOTe6eR5jbAaAGQAQEhJixnDsy+F/L2DVL4eQlpKLzMzCOrVx7UqayFFZhyAIWPflFmuHQQi5z1SbIBljuwAY24bhDc75hkpuSwYQwjnPYIy1A/A3Y6wl5zz33gs554sBLAaAyMhI0aux2wKNRo/srAK4uDgAjCH+fBKO7L+IzOxCZGXmoyBfjWtX0yAYxO9bEDgkEvse4WsKNXW6r0GzQJEjIYTcT6pNkJzzvrVtlHOuAaC5+/UJxtgVAE0BxNQ6QjumUeswpO+n4KIXQKo5e0+OAODgVLctqpZf+FrkSAghtuL9999X/vrr/9u7+5iqzjsO4N/ffdN7UaooYimis+I27HrR6FzHtE6ZVaphTCO1kZi4Zi6hnQuLjXQ2NM7NZo01S9OYzMy0wegyVgmVEh22mWZZIpOuV5mvjW8gUgRqbcDKfXn2B5cW8VyFe+855758P8nNPfc5h3N+T24Ov3ue85znqbIqpVBaWuqvrKwM75f0A+jyHKSIpIuINbg8HUAOgEt6HCuWFT/zhqnJEfGfGwH03+OeMXfasLcveaUIDYFq/QIiomE71X3IsefiGtefzhW49lxc4zrVfcgR6T4bGxutVVVV1qampt4zZ870Hj582Nrc3Bxbg5WLSLGItAJ4CsAHInIkuGohgFMi8gmAvwP4pVLqwdPNJ5iuztvwes3MjkBtw2ZTjx9NuxvfgH106AaP3PyZaAhUoyFQjRe2rzMwMiIK5VT3Icfxm7ttvf7+f/+9/m4cv7nbFmmSPH36tCUvL0+lpqbC4XAgPz/ff+DAAfvD/3JkIu3FWgOgRqP8PQDvRbLveNfaYvzvgQnpY+C968MLZYuxfOUcw4+vt/reA7h+qR2vPrsDvT1f4bW6zfjOkzPMDouIQjjRVWXzq3uHYfWrPpzoqrI9mbYy7PFZZ8+eHdi2bZvcuHFDUlJS1NGjR61ut5uzecSLx3O0+jXpY+OvC7B6zQ8MO56ZHps+GXt5b5EoLgxcOQ63fLjmzJnj37Rpk6+goMDpdDoxa9asgNVqjWifWpggdTJmzGhYrIhqz9TUcU5U/mEVZuQ8ClfKqOjtmIhIBy5rmmYydFnTIt53eXl538AUV2VlZaOzsrJ4BRlP6j7agsKnXx/R39hsFuTNmYKlz87G00tyYbFwPHkiik/zJ5T6jt/cfU8zq1UcmD+hNKIJkwGgtbVVsrKy1IULFyz19fWWxsbGqPdiZYLUkd1ugyvFgd6e4TW1L1/pRnnFSp2jIiIyxsB9xhNdVbZefzdc1jTMn1Dqi+T+44CioiLnrVu3YLPZsGvXLm96enrUn6MXpWLn2fy5c+eqkycT71HJn/xw+4i2/+3vfoZFS3J1ioaIEo2INCml5j58y8h5PB6v2+2O+gTIZvF4PA63263ZA5btdwY48q9XRrT97189qFMkREQ0XEyQBgjnPmJ93X91iISIiIaLCdIgG3+1ZETb
XzzTplMkREQ0HEyQBln93FNY/Mx3h7392vX5OkZDREQPwwRpoIrKVfjgny/D5XrwKEvjxjkxKWOcQVEREZEWJkiDORwO1B59GQ3/3oqlhd+7b/0T7ixU1//GhMiIiGgwPgdpos1bi7B5a5HZYRARkQZeQRIREWlggiQiorizevVqZ1pamisnJ8c1uLy6uto+bdo0V3Z2tquioiKiQauZIImISDftt/c5Pm6Z7zpxdbrr45b5rvbb+yKeMBkANmzY4K2trb1n/FWv14vy8nJ7fX39V+fPn+89ePCgtampKexpPpggiYhIF+239zmufb7d5g10AFDwBjpw7fPttmgkycLCQt/Q8VePHTtmmzp1qsrNzQ04nU4UFxf7q6urw+5rwwRJRES6aPviLZvCvZNsKNxF2xdv6dJB9Nq1a5KZmfn15ylTpqi2tjYJd38x1Yu1qampU0Sumh2HCSYC6DQ7CJOw7smJdY+uqVHeX1R4AzdHVB5rYipBKqXSzY7BDCJy0qiR+GMN6866J5tkqrvdko7+5tX7y/WQnZ2t2tq+GaazpaVFMjMzw56yik2sRESki8xHXvIJ7u1IKhiFzEdeinjCZC0LFy70XblyRc6ePWu5c+cOampqrKtWrQr7WEyQRESki8mp6/qyx2/12S2TAAjslknIHr/VNzl1XcTzSa5YscK5YMGC0ZcvX0ZGRobrzTffdDgcDuzcudO7bNmy0TNnznQVFRX5582b5w/3GDHVxJrE/mx2ACZi3ZMT654kJqeu64tGQhyqrq7ujlZ5SUmJt6SkxBuNY/AKMgYopZLqhBmMdU9OrDvFAyZIIiIiDUyQREREGpggY4SIvCYi10Xkk+Cr0OyY9CYiy0TkvIh8KiJbzI7HSCJyRUROB7/rk2bHoycR2SsiHSLSPKgsTUQaRORi8H28mTHqJUTdk+5cj1dMkLFll1IqL/iqNzsYPYmIFcDbAJYDyAWwVkRyzY3KcD8OfteJ/kzcOwCWDSnbAuBDpVQOgA+DnxPRO7i/7kASnevxjAmSzPJ9AJ8qpS4ppfoA/BUAJ8dMQEqp4wC6hxQXAXg3uPwugJ8aGpRBQtSd4gQTZGx5UUROBZtlErLJaZDHALQM+twaLEsWCsA/RKRJRH5hdjAmyFBK3QgutwPIMDMYEyTTuR63mCANJCJHRaRZ41UEYDeAxwHkAbgBYKepwZLefqSUmoP+JuYyEVlodkBmUUop9P9gSBY81+MEE6SBlFIFSqknNF61SqnPlFJ+pVQAwB70N0EmsusApgz6nBUsSwpKqevB9w4ANUj873uoz0TkUQAIvt8/YGeCSsJzXRehJkwOVR4OJsgYMfDPIqgYQHOobRPEfwDkiMi3RMQB4DkA75sckyFEJEVExg4sA1iKxP++h3ofwPrg8noAtSbGYqhkO9cDPfsdgY58V6D9265AR74r0LNftwmTH1QeDg41Fzv+KCJ56G9qugJgo7nh6Esp5RORFwEcAWAFsFcp9T+TwzJKBoAaEQH6z8H9SqnD5oakHxE5AGARgIki0gqgEsDrAP4mIj8HcBXAGvMi1E+Iui9KlnM90LPfgS932DAwJ2TgJvDlDlsAgCXl+YiGnyssLPSdO3fuvou8UOXhYIKMEUqpUrNjMFqwe3vSdXFXSl0C4DY7DqMopdaGWLXE0EBMEKLufzE8ELP0vP1Ncvza3f7yCBOkEdjESkRE+giEmBc6VHmMYYIkIiJ9WCaOrDzGMEESEZE+Usp8GDJhMjAqWB77mCCJiEgXlpTn+zC2wgdLOgABLOnA2ApfpB10AO0Jkx9UHg7pf0aXiIjo4Twej9ftdsd8B5vh8ng8DrfbbddaxytIIiIiDUyQREREGpggiYiINDBBEhHRiPj9frNDiIqH1YMJkoiIRqKrs7PTHu9J0u/3o7Oz0w6gK9Q2HGqOiIiGzefzLW5vb/+ovb19gtmxREGXz+dbHGolH/MgIiLSwCZWIiIiDUyQREREGpggiYiINDBBEhERaWCCJCIi0vB/cuZR2FE0MtsAAAAASUVORK5CYII=\n", 964 | "text/plain": [ 965 | "
" 966 | ] 967 | }, 968 | "metadata": { 969 | "needs_background": "light" 970 | }, 971 | "output_type": "display_data" 972 | } 973 | ], 974 | "source": [ 975 | "fig, ax = plt.subplots()\n", 976 | "\n", 977 | "scatter = ax.scatter(x=df_embedding['x'], \n", 978 | " y=df_embedding['y'],\n", 979 | " c=clustering)\n", 980 | "legend = ax.legend(*scatter.legend_elements(num=number_of_clusters), \n", 981 | " title='Clusters', \n", 982 | " loc='upper right', \n", 983 | " bbox_to_anchor=(1.25,1.0))\n", 984 | "ax.add_artist(legend);\n", 985 | "\n", 986 | "output_folder = '../output/figures'\n", 987 | "\n", 988 | "!if [ ! -d $output_folder ]; then mkdir -p $output_folder; fi\n", 989 | "\n", 990 | "plt.show()\n", 991 | "plt.savefig(f'{output_folder}/baseline.png')\n", 992 | "plt.close()" 993 | ] 994 | }, 995 | { 996 | "cell_type": "markdown", 997 | "metadata": {}, 998 | "source": [ 999 | "In order to see how these records have been clustered together, we'll inverse transform the dataset and select the records from one cluster." 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": 13, 1005 | "metadata": {}, 1006 | "outputs": [ 1007 | { 1008 | "data": { 1009 | "text/html": [ 1010 | "
\n", 1011 | "\n", 1024 | "\n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | "
origindestinationtrain_typepricetrain_classfaredurationtime_to_departurehourweekday
2500995VALENCIAMADRIDREGIONAL28.35TuristaAdulto ida7.633333891
7750123VALENCIAMADRIDREGIONAL28.35TuristaAdulto ida7.6333331790
8626820MADRIDVALENCIAREGIONAL28.35TuristaAdulto ida6.70000038163
6918060VALENCIAMADRIDREGIONAL28.35TuristaAdulto ida7.66666714142
7929214VALENCIAMADRIDREGIONAL28.35TuristaAdulto ida7.7000005363
.................................
5643889MADRIDVALENCIAREGIONAL28.35TuristaAdulto ida6.71666757122
1165780VALENCIAMADRIDREGIONAL28.35TuristaAdulto ida6.9833334066
8859946VALENCIAMADRIDREGIONAL28.35TuristaAdulto ida7.6333335390
7704935MADRIDVALENCIAREGIONAL28.35TuristaAdulto ida6.71666725121
4862490VALENCIAMADRIDREGIONAL28.35TuristaAdulto ida7.133333792
\n", 1186 | "

474 rows × 10 columns

\n", 1187 | "
" 1188 | ], 1189 | "text/plain": [ 1190 | " origin destination train_type price train_class fare \\\n", 1191 | "2500995 VALENCIA MADRID REGIONAL 28.35 Turista Adulto ida \n", 1192 | "7750123 VALENCIA MADRID REGIONAL 28.35 Turista Adulto ida \n", 1193 | "8626820 MADRID VALENCIA REGIONAL 28.35 Turista Adulto ida \n", 1194 | "6918060 VALENCIA MADRID REGIONAL 28.35 Turista Adulto ida \n", 1195 | "7929214 VALENCIA MADRID REGIONAL 28.35 Turista Adulto ida \n", 1196 | "... ... ... ... ... ... ... \n", 1197 | "5643889 MADRID VALENCIA REGIONAL 28.35 Turista Adulto ida \n", 1198 | "1165780 VALENCIA MADRID REGIONAL 28.35 Turista Adulto ida \n", 1199 | "8859946 VALENCIA MADRID REGIONAL 28.35 Turista Adulto ida \n", 1200 | "7704935 MADRID VALENCIA REGIONAL 28.35 Turista Adulto ida \n", 1201 | "4862490 VALENCIA MADRID REGIONAL 28.35 Turista Adulto ida \n", 1202 | "\n", 1203 | " duration time_to_departure hour weekday \n", 1204 | "2500995 7.633333 8 9 1 \n", 1205 | "7750123 7.633333 17 9 0 \n", 1206 | "8626820 6.700000 38 16 3 \n", 1207 | "6918060 7.666667 14 14 2 \n", 1208 | "7929214 7.700000 53 6 3 \n", 1209 | "... ... ... ... ... \n", 1210 | "5643889 6.716667 57 12 2 \n", 1211 | "1165780 6.983333 40 6 6 \n", 1212 | "8859946 7.633333 53 9 0 \n", 1213 | "7704935 6.716667 25 12 1 \n", 1214 | "4862490 7.133333 7 9 2 \n", 1215 | "\n", 1216 | "[474 rows x 10 columns]" 1217 | ] 1218 | }, 1219 | "execution_count": 13, 1220 | "metadata": {}, 1221 | "output_type": "execute_result" 1222 | } 1223 | ], 1224 | "source": [ 1225 | "df_results = df_umap.copy()\n", 1226 | "\n", 1227 | "df_results.loc[:,columns_to_encode] = encoder_m.inverse_transform(df_results[columns_to_encode])\n", 1228 | "\n", 1229 | "df_results[clustering == 0]" 1230 | ] 1231 | }, 1232 | { 1233 | "cell_type": "markdown", 1234 | "metadata": {}, 1235 | "source": [ 1236 | "## Save Models" 1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "markdown", 1241 | "metadata": {}, 1242 | "source": [ 1243 | "The last step will be saving all the models that have taken part in the pipeline for further use in following lessons." 1244 | ] 1245 | }, 1246 | { 1247 | "cell_type": "code", 1248 | "execution_count": 14, 1249 | "metadata": {}, 1250 | "outputs": [], 1251 | "source": [ 1252 | "import pickle\n", 1253 | "\n", 1254 | "output_folder = '../output/pickle_data'\n", 1255 | "\n", 1256 | "!if [ ! 
-d $output_folder ]; then mkdir -p $output_folder; fi\n", 1257 | "\n", 1258 | "with open(f'{output_folder}/encoder.pickle', 'wb') as f:\n", 1259 | "    pickle.dump(encoder_m, f)\n", 1260 | "\n", 1261 | "with open(f'{output_folder}/umap.pickle', 'wb') as f:\n", 1262 | "    pickle.dump(umap_m, f)\n", 1263 | "\n", 1264 | "with open(f'{output_folder}/hdbscan.pickle', 'wb') as f:\n", 1265 | "    pickle.dump(hdbscan_m, f)" 1266 | ] 1267 | }, 1268 | { 1269 | "cell_type": "code", 1270 | "execution_count": null, 1271 | "metadata": {}, 1272 | "outputs": [], 1273 | "source": [] 1274 | } 1275 | ], 1276 | "metadata": { 1277 | "kernelspec": { 1278 | "display_name": "Python 3", 1279 | "language": "python", 1280 | "name": "python3" 1281 | }, 1282 | "language_info": { 1283 | "codemirror_mode": { 1284 | "name": "ipython", 1285 | "version": 3 1286 | }, 1287 | "file_extension": ".py", 1288 | "mimetype": "text/x-python", 1289 | "name": "python", 1290 | "nbconvert_exporter": "python", 1291 | "pygments_lexer": "ipython3", 1292 | "version": "3.7.4" 1293 | } 1294 | }, 1295 | "nbformat": 4, 1296 | "nbformat_minor": 2 1297 | } 1298 | -------------------------------------------------------------------------------- /notebooks/mlflow_deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MACHINE LEARNING IN PRODUCTION MADRID - MLFLOW DEPLOYMENT" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In previous lessons we've seen how to put a simple Scikit-Learn model into production. However, real-world models tend to be more complicated, may not be of the Sklearn flavor, and usually involve substantial feature engineering of the input data.\n", 15 | "\n", 16 | "You can also handle that with MLFlow. We'll see how to do it in the following cells." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Custom Model to Production" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "The first thing we need to do is to define the paths to the pickle data we saved in previous lessons, so that the prediction pipeline can be reproduced." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "pickle_data_path = '../output/pickle_data'\n", 40 | "\n", 41 | "artifacts = {\n", 42 | "    'encoder_path': f'{pickle_data_path}/encoder.pickle',\n", 43 | "    'umap_path': f'{pickle_data_path}/umap.pickle',\n", 44 | "    'hdbscan_path': f'{pickle_data_path}/hdbscan.pickle',\n", 45 | "}" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "To put a model into production with MLFlow it is necessary to define a wrapper for it. The process is straightforward with a Scikit-Learn model (KMeans from previous lessons) since the Sklearn wrapper has already been defined by the MLFlow developers.\n", 53 | "\n", 54 | "Thus, the only thing we need to do is extend the mlflow.pyfunc.PythonModel class and override the predict method:\n", 55 | "\n", 56 | "```python\n", 57 | "class ModelWrapper(mlflow.pyfunc.PythonModel):\n", 58 | "    \n", 59 | "    def predict(self, context, model_input):\n", 60 | "        your_code_here\n", 61 | "    \n", 62 | "```\n", 63 | "\n", 64 | "In the cell below, a custom mlflow.pyfunc.PythonModel has been defined. 
However, it is more complex than the previous definition since the feature engineering of the input data is also included here." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stderr", 74 | "output_type": "stream", 75 | "text": [ 76 | "/home/ubuntu/miniconda3/envs/mlinproduction_env/lib/python3.7/site-packages/sklearn/externals/six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).\n", 77 | " \"(https://pypi.org/project/six/).\", DeprecationWarning)\n", 78 | "/home/ubuntu/miniconda3/envs/mlinproduction_env/lib/python3.7/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", 79 | " warnings.warn(msg, category=DeprecationWarning)\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "import mlflow.pyfunc\n", 85 | "\n", 86 | "import numpy as np\n", 87 | "import pandas as pd\n", 88 | "import pickle\n", 89 | "import hdbscan\n", 90 | "\n", 91 | "class ModelWrapper(mlflow.pyfunc.PythonModel):\n", 92 | "\n", 93 | " # define some useful list of columns\n", 94 | " def __init__(self):\n", 95 | "\n", 96 | " self.columns_to_encode = ['origin', 'destination', 'train_type', 'train_class', 'fare']\n", 97 | " self.columns_to_remove = ['insert_date', 'start_date', 'end_date']\n", 98 | "\n", 99 | " # at the time of loading the MLFlow model, the pickle data from the baseline\n", 100 | " # pipeline has to be loaded\n", 101 | " def load_context(self, context):\n", 102 | " \n", 103 | " with open(context.artifacts['encoder_path'], 'rb') as f:\n", 104 | " self.encoder_m = pickle.load(f)\n", 105 | " \n", 106 | " with open(context.artifacts['umap_path'], 'rb') as f:\n", 107 | " self.umap_m = pickle.load(f)\n", 108 | " \n", 109 | " with open(context.artifacts['hdbscan_path'], 'rb') as f:\n", 110 | " self.hdbscan_m = pickle.load(f)\n", 111 | " \n", 112 | " # the datetime columns could arrive in the integer form, in that case convert to\n", 113 | " # datetime type\n", 114 | " def check_dt_type(self, model_input):\n", 115 | " \n", 116 | " if model_input[self.columns_to_remove[0]].dtype == 'int64':\n", 117 | " for col in self.columns_to_remove:\n", 118 | " model_input[col] = pd.to_datetime(model_input[col])\n", 119 | " \n", 120 | " return model_input\n", 121 | "\n", 122 | " # the baseline transformations are done here\n", 123 | " def transform(self, model_input):\n", 124 | " \n", 125 | " model_input.dropna(inplace=True)\n", 126 | " \n", 127 | " model_input = self.check_dt_type(model_input)\n", 128 | " \n", 129 | " model_input.loc[:, self.columns_to_encode] = \\\n", 130 | " self.encoder_m.transform(model_input[self.columns_to_encode])\n", 131 | " \n", 132 | " model_input['duration'] = (model_input['end_date'] - model_input['start_date']).dt.seconds / 3600\n", 133 | "\n", 134 | " model_input['time_to_departure'] = (model_input['start_date'].dt.tz_localize('Europe/Madrid').dt.tz_convert('UTC') \\\n", 135 | " - model_input['insert_date'].dt.tz_localize('UTC')).dt.days\n", 136 | "\n", 137 | " model_input['hour'] = model_input['start_date'].dt.hour\n", 138 | 
"\n", 139 | " model_input['weekday'] = model_input['start_date'].dt.dayofweek\n", 140 | "\n", 141 | " model_input = model_input[[x for x in model_input.columns if x not in self.columns_to_remove]]\n", 142 | " \n", 143 | " return model_input\n", 144 | "\n", 145 | " # main method to override, the OrdinalEncoder and UMAP transformations are done along\n", 146 | " # with the HDBSCAN prediction over this embedding\n", 147 | " def predict(self, context, model_input):\n", 148 | " \n", 149 | " # allocate payload with return value for null\n", 150 | " payload = np.ones(len(model_input)) * -1\n", 151 | " \n", 152 | " preprocessed = self.transform(model_input.reset_index(drop=True))\n", 153 | " embedding = self.umap_m.transform(preprocessed)\n", 154 | " clusters, _ = hdbscan.approximate_predict(self.hdbscan_m, embedding)\n", 155 | " \n", 156 | " # fill not null records with their cluster\n", 157 | " payload[preprocessed.index] = clusters\n", 158 | " \n", 159 | " return payload\n", 160 | " \n" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "After the custom model has been defined, it is necessary to pack everything together, both the model and the conda environment." 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 3, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "mlflow_pyfunc_model_path = '../output/custom_model'\n", 177 | "\n", 178 | "# remove all models if already there\n", 179 | "!rm -rf $mlflow_pyfunc_model_path\n", 180 | "\n", 181 | "# conda environment definition\n", 182 | "conda_env = {\n", 183 | " 'channels': ['defaults'],\n", 184 | " 'dependencies': [\n", 185 | " 'python',\n", 186 | " {'pip': [\n", 187 | " 'mlflow',\n", 188 | " 'umap-learn',\n", 189 | " 'hdbscan',\n", 190 | " ]\n", 191 | " },\n", 192 | " ],\n", 193 | " 'name': 'custom_env',\n", 194 | "}\n", 195 | "\n", 196 | "# finally save the model as an MLFlow project into the output directory\n", 197 | "mlflow.pyfunc.save_model(path=mlflow_pyfunc_model_path, \n", 198 | " python_model=ModelWrapper(),\n", 199 | " conda_env=conda_env,\n", 200 | " artifacts=artifacts)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## Setup Endpoint" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "In previous lessons we saw how to create an endpoint with MLFlow and the command line:\n", 215 | "\n", 216 | "```bash\n", 217 | "mlflow models serve -m path_to_your_model -h host -p port\n", 218 | "```\n", 219 | "\n", 220 | "However, it is desirable that this endpoint could be always alive. 
This can be done with systemd and a unit file like the following:\n", 221 | "\n", 222 | "```\n", 223 | "[Unit]\n", 224 | "Description=MLFlow model in production\n", 225 | "After=network.target\n", 226 | "\n", 227 | "[Service]\n", 228 | "Restart=on-failure\n", 229 | "RestartSec=30\n", 230 | "StandardOutput=file:/path_to_your_logging_folder/stdout.log\n", 231 | "StandardError=file:/path_to_your_logging_folder/stderr.log\n", 232 | "Environment=MLFLOW_TRACKING_URI=http://host_ts:port_ts\n", 233 | "Environment=MLFLOW_CONDA_HOME=/path_to_your_conda_installation\n", 234 | "ExecStart=/bin/bash -c 'PATH=/path_to_your_conda_installation/envs/mlinproduction_env/bin/:$PATH exec mlflow models serve -m path_to_your_model -h host -p port'\n", 235 | "\n", 236 | "[Install]\n", 237 | "WantedBy=multi-user.target\n", 238 | "```\n", 239 | "\n" 240 | ] 241 | },
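{ "cell_type": "markdown", "metadata": {}, "source": [ "Once the unit file is saved (for instance as /etc/systemd/system/mlflow-model.service; the file name here is only an illustration), the service can be registered and kept alive across reboots. A minimal sketch on a systemd-based machine:\n", "\n", "```bash\n", "sudo systemctl daemon-reload                 # pick up the new unit file\n", "sudo systemctl enable mlflow-model.service   # start automatically on boot\n", "sudo systemctl start mlflow-model.service    # start serving right now\n", "sudo systemctl status mlflow-model.service   # verify the endpoint is alive\n", "```" ] },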
\n", 272 | "\n", 285 | "\n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | "
insert_dateorigindestinationstart_dateend_datetrain_typepricetrain_classfare
42663492019-04-19 11:02:07SEVILLAMADRID2019-05-15 07:40:002019-05-15 10:05:00AVE47.30TuristaPromo
62614962019-05-15 01:20:47BARCELONAMADRID2019-05-29 11:00:002019-05-29 13:45:00AVE90.50Turista PlusPromo
95833202019-06-20 17:25:44MADRIDVALENCIA2019-07-28 16:55:002019-07-28 21:58:00AVE-LD39.25Turista con enlacePromo
71972422019-05-25 07:48:22MADRIDSEVILLA2019-06-12 10:00:002019-06-12 12:32:00AVE53.40TuristaPromo
103402292019-08-15 03:08:13MADRIDVALENCIA2019-08-28 16:55:002019-08-28 19:14:00ALVIA46.15PreferentePromo
74365162019-05-27 17:40:21VALENCIAMADRID2019-07-08 12:40:002019-07-08 14:20:00AVE45.30TuristaPromo
37241172019-04-15 14:08:30BARCELONAMADRID2019-04-25 12:00:002019-04-25 15:10:00AVE58.15TuristaPromo
74814542019-05-28 05:43:10MADRIDSEVILLA2019-07-15 08:30:002019-07-15 11:14:00ALVIA87.40PreferenteFlexible
4888482019-08-26 11:29:18MADRIDVALENCIA2019-10-21 15:10:002019-10-21 16:52:00AVE21.95TuristaPromo
87806942019-06-11 14:03:46MADRIDBARCELONA2019-06-27 13:25:002019-06-27 16:24:00AVE-TGV107.70TuristaFlexible
\n", 423 | "
" 424 | ], 425 | "text/plain": [ 426 | " insert_date origin destination start_date \\\n", 427 | "4266349 2019-04-19 11:02:07 SEVILLA MADRID 2019-05-15 07:40:00 \n", 428 | "6261496 2019-05-15 01:20:47 BARCELONA MADRID 2019-05-29 11:00:00 \n", 429 | "9583320 2019-06-20 17:25:44 MADRID VALENCIA 2019-07-28 16:55:00 \n", 430 | "7197242 2019-05-25 07:48:22 MADRID SEVILLA 2019-06-12 10:00:00 \n", 431 | "10340229 2019-08-15 03:08:13 MADRID VALENCIA 2019-08-28 16:55:00 \n", 432 | "7436516 2019-05-27 17:40:21 VALENCIA MADRID 2019-07-08 12:40:00 \n", 433 | "3724117 2019-04-15 14:08:30 BARCELONA MADRID 2019-04-25 12:00:00 \n", 434 | "7481454 2019-05-28 05:43:10 MADRID SEVILLA 2019-07-15 08:30:00 \n", 435 | "488848 2019-08-26 11:29:18 MADRID VALENCIA 2019-10-21 15:10:00 \n", 436 | "8780694 2019-06-11 14:03:46 MADRID BARCELONA 2019-06-27 13:25:00 \n", 437 | "\n", 438 | " end_date train_type price train_class fare \n", 439 | "4266349 2019-05-15 10:05:00 AVE 47.30 Turista Promo \n", 440 | "6261496 2019-05-29 13:45:00 AVE 90.50 Turista Plus Promo \n", 441 | "9583320 2019-07-28 21:58:00 AVE-LD 39.25 Turista con enlace Promo \n", 442 | "7197242 2019-06-12 12:32:00 AVE 53.40 Turista Promo \n", 443 | "10340229 2019-08-28 19:14:00 ALVIA 46.15 Preferente Promo \n", 444 | "7436516 2019-07-08 14:20:00 AVE 45.30 Turista Promo \n", 445 | "3724117 2019-04-25 15:10:00 AVE 58.15 Turista Promo \n", 446 | "7481454 2019-07-15 11:14:00 ALVIA 87.40 Preferente Flexible \n", 447 | "488848 2019-10-21 16:52:00 AVE 21.95 Turista Promo \n", 448 | "8780694 2019-06-27 16:24:00 AVE-TGV 107.70 Turista Flexible " 449 | ] 450 | }, 451 | "execution_count": 4, 452 | "metadata": {}, 453 | "output_type": "execute_result" 454 | } 455 | ], 456 | "source": [ 457 | "import pandas as pd\n", 458 | "\n", 459 | "df = pd.read_parquet('../data/raw/renfe.parquet')\n", 460 | "\n", 461 | "test_data = df.sample(10)\n", 462 | "\n", 463 | "test_data" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "### Debug Model" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "In case the endpoint is not working as expected, the model can be loaded with the MLFlow API into the Jupyter notebook and start debugging it with the following cell." 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 5, 483 | "metadata": {}, 484 | "outputs": [ 485 | { 486 | "name": "stderr", 487 | "output_type": "stream", 488 | "text": [ 489 | "/home/ubuntu/miniconda3/envs/mlinproduction_env/lib/python3.7/site-packages/mlflow/pyfunc/__init__.py:281: DeprecationWarning: .. Warning:: ``mlflow.pyfunc.load_pyfunc`` is deprecated since 1.0. This method will be removed in a near future release. 
Use ``mlflow.pyfunc.load_model`` instead.\n", 490 | "  return load_pyfunc(model_uri, suppress_warnings)\n", 491 | "/home/ubuntu/miniconda3/envs/mlinproduction_env/lib/python3.7/site-packages/numba/compiler.py:602: NumbaPerformanceWarning: \n", 492 | "The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.\n", 493 | "\n", 494 | "To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.\n", 495 | "\n", 496 | "File \"../../../../../../miniconda3/envs/mlinproduction_env/lib/python3.7/site-packages/umap/nndescent.py\", line 124:\n", 497 | "    @numba.njit(parallel=True)\n", 498 | "    def init_from_random(n_neighbors, data, query_points, heap, rng_state):\n", 499 | "    ^\n", 500 | "\n", 501 | "  self.func_ir.loc))\n", 502 | "/home/ubuntu/miniconda3/envs/mlinproduction_env/lib/python3.7/site-packages/numba/compiler.py:602: NumbaPerformanceWarning: \n", 503 | "The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.\n", 504 | "\n", 505 | "To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.\n", 506 | "\n", 507 | "File \"../../../../../../miniconda3/envs/mlinproduction_env/lib/python3.7/site-packages/umap/nndescent.py\", line 135:\n", 508 | "    @numba.njit(parallel=True)\n", 509 | "    def init_from_tree(tree, data, query_points, heap, rng_state):\n", 510 | "    ^\n", 511 | "\n", 512 | "  self.func_ir.loc))\n" 513 | ] 514 | }, 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "Predictions: [8. 4. 8. 8. 8. 8. 7. 4. 9. 5.]\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "loaded_model = mlflow.pyfunc.load_model(mlflow_pyfunc_model_path)\n", 525 | "\n", 526 | "print(f'Predictions: {loaded_model.predict(test_data)}')" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "### Query Endpoint" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "Here, the request is sent with the Python requests library; however, it can also be done with cURL or any other HTTP tool.\n",
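"\n", "For reference, an equivalent query could be issued with cURL. A minimal sketch, assuming the same placeholder host and port and that the payload has first been written to a file, e.g. with test_data.to_json(orient='split') (the file name test_data.json is hypothetical):\n", "\n", "```bash\n", "curl -X POST http://host:port/invocations \\\n", "     -H 'Content-Type: application/json' \\\n", "     --data @test_data.json\n", "```"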
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": 6,
555 | "metadata": {},
556 | "outputs": [
557 | {
558 | "name": "stdout",
559 | "output_type": "stream",
560 | "text": [
561 | "Predictions: [8.0, 4.0, 8.0, 8.0, 8.0, 8.0, 7.0, 4.0, 9.0, 5.0]\n"
562 | ]
563 | }
564 | ],
565 | "source": [
566 | "import requests\n",
567 | "\n",
568 | "host = 'host'  # placeholder: address of the model server\n",
569 | "port = 'port'  # placeholder: port the model server listens on\n",
570 | "\n",
571 | "url = f'http://{host}:{port}/invocations'\n",
572 | "\n",
573 | "headers = {\n",
574 | "    'Content-Type': 'application/json',\n",
575 | "}\n",
576 | "\n",
577 | "r = requests.post(url=url, headers=headers, data=test_data.to_json(orient='split'))\n",
578 | "\n",
579 | "print(f'Predictions: {r.text}')\n"
580 | ]
581 | }
582 | ],
583 | "metadata": {
584 | "kernelspec": {
585 | "display_name": "Python 3",
586 | "language": "python",
587 | "name": "python3"
588 | },
589 | "language_info": {
590 | "codemirror_mode": {
591 | "name": "ipython",
592 | "version": 3
593 | },
594 | "file_extension": ".py",
595 | "mimetype": "text/x-python",
596 | "name": "python",
597 | "nbconvert_exporter": "python",
598 | "pygments_lexer": "ipython3",
599 | "version": "3.7.4"
600 | }
601 | },
602 | "nbformat": 4,
603 | "nbformat_minor": 2
604 | }
605 | 
--------------------------------------------------------------------------------
/scripts/dags/dynamic.py:
--------------------------------------------------------------------------------
1 | import random
2 | import airflow
3 | from airflow import DAG
4 | from airflow.operators.dummy_operator import DummyOperator
5 | from airflow.operators.email_operator import EmailOperator
6 | from datetime import timedelta
7 | 
8 | # settings
9 | 
10 | MAIL_LIST = [] # set your email here!
11 | 12 | N_TASKS_LEVEL_1 = 2 13 | N_LEVELS = 3 14 | N_TASKS = 4 15 | 16 | DEFAULT_ARGS = { 17 | 'owner': 'airflow', 18 | 'start_date': airflow.utils.dates.days_ago(1), 19 | 'email': MAIL_LIST, 20 | 'email_on_failure': True, 21 | 'email_on_retry': False, 22 | 'retries': 5, 23 | 'retry_delay': timedelta(minutes=15), 24 | } 25 | 26 | 27 | # pipeline functions and helpers 28 | 29 | def generate_levels(n_tasks, previous_level_tasks, level_start, n_levels): 30 | if n_levels == 0: 31 | return previous_level_tasks 32 | generated_tasks = [] 33 | 34 | for task in previous_level_tasks: 35 | task_id = task.task_id 36 | for n_task in range(n_tasks): 37 | task_ln = DummyOperator(task_id=task_id + '_' + f'task_level_{level_start}_{n_task}', dag=dag) 38 | task >> task_ln 39 | generated_tasks.append(task_ln) 40 | 41 | return generate_levels(n_tasks=n_tasks, 42 | previous_level_tasks=generated_tasks, 43 | level_start=level_start + 1, 44 | n_levels=n_levels - 1) 45 | 46 | 47 | # dag definition 48 | 49 | dag = DAG('crazy_maze', 50 | default_args=DEFAULT_ARGS, 51 | schedule_interval='0 8 * * MON') 52 | 53 | # task dynamic definition 54 | 55 | start = DummyOperator(task_id='start', dag=dag) 56 | join = DummyOperator(task_id='join', dag=dag) 57 | end = DummyOperator(task_id='end', dag=dag) 58 | 59 | task_execution_success_mail = EmailOperator( 60 | to=MAIL_LIST, 61 | task_id="mail_execution", 62 | subject="Airflow - Big Data Master", 63 | html_content="crazy maze executed successfully!", 64 | dag=dag) 65 | 66 | for n_task in range(N_TASKS_LEVEL_1): 67 | task_l1 = DummyOperator(task_id=f'task_level_1_{n_task}', dag=dag) 68 | start >> task_l1 69 | task_l2 = DummyOperator(task_id=f'task_level_1_{n_task}_level_2_1', dag=dag) 70 | task_l1 >> task_l2 71 | task_l2 >> join 72 | 73 | if random.random() > 0.5: 74 | task_l2_extra = DummyOperator(task_id=f'task_level_1_{n_task}_level_2_2', dag=dag) 75 | task_l1 >> task_l2_extra 76 | task_l2_extra >> join 77 | 78 | last_level_tasks = generate_levels(n_tasks=N_TASKS, 79 | previous_level_tasks=[join], 80 | level_start=2, 81 | n_levels=N_LEVELS) 82 | 83 | for task in last_level_tasks: 84 | task >> task_execution_success_mail 85 | 86 | task_execution_success_mail >> end 87 | -------------------------------------------------------------------------------- /scripts/dags/renfe_dag.py: -------------------------------------------------------------------------------- 1 | import airflow 2 | from airflow import DAG 3 | from airflow.operators.bash_operator import BashOperator 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from datetime import timedelta 6 | 7 | ############################################################################### 8 | # SETTINGS 9 | 10 | CENTERS = ['MADRID'] 11 | DESTINATIONS = ['BARCELONA', 'SEVILLA', 'VALENCIA', 'GRANADA', 'PONFERRADA'] 12 | 13 | PYTHON_INTERPRETER_PATH: str = '/path/to/python/interpreter/' 14 | RUN_SCRIPT: str = '/path/to/script.py' 15 | 16 | ############################################################################### 17 | # DAG 18 | 19 | default_args = { 20 | 'owner': 'airflow', 21 | 'depends_on_past': False, 22 | 'start_date': airflow.utils.dates.days_ago(1), 23 | 'email_on_failure': False, 24 | 'email_on_retry': False, 25 | 'retries': 1, 26 | 'retry_delay': timedelta(minutes=5), 27 | } 28 | 29 | dag = DAG(dag_id='renfe_production', 30 | schedule_interval=None, 31 | catchup=False, 32 | default_args=default_args) 33 | 34 | ############################################################################### 35 | # TASKS 36 | 
37 | start = DummyOperator(task_id='start', dag=dag)
38 | end = DummyOperator(task_id='end', dag=dag)
39 | 
40 | # Jinja-templated bash command; params are rendered per task at runtime, e.g.
41 | # "/path/to/python/interpreter /path/to/script.py -o MADRID -d SEVILLA"
42 | command = """
43 | {{ params.interpreter_path }} \
44 | {{ params.script_path }} \
45 | -o {{ params.origin }} \
46 | -d {{ params.destination }}
47 | """
48 | 
49 | for c in CENTERS:
50 |     for d in DESTINATIONS:
51 | 
52 |         task_c_d = BashOperator(
53 |             task_id='alarms' + '_' + c + '_' + d,
54 |             bash_command=command,
55 |             params={
56 |                 'interpreter_path': PYTHON_INTERPRETER_PATH,
57 |                 'script_path': RUN_SCRIPT,
58 |                 'origin': c,
59 |                 'destination': d
60 |             },
61 |             dag=dag,
62 |         )
63 | 
64 |         task_d_c = BashOperator(
65 |             task_id='alarms' + '_' + d + '_' + c,
66 |             bash_command=command,
67 |             params={
68 |                 'interpreter_path': PYTHON_INTERPRETER_PATH,
69 |                 'script_path': RUN_SCRIPT,
70 |                 'origin': d,
71 |                 'destination': c
72 |             },
73 |             dag=dag,
74 |         )
75 | 
76 |         start >> task_c_d >> task_d_c >> end
77 | 
--------------------------------------------------------------------------------
/scripts/dags/renfe_script.py:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # IMPORTS
3 | 
4 | import argparse
5 | import boto3
6 | from datetime import datetime, timedelta
7 | import joblib
8 | import json
9 | import logging
10 | import pandas as pd
11 | import requests
12 | 
13 | 
14 | ###############################################################################
15 | # ARGPARSER
16 | 
17 | 
18 | def get_cli():
19 |     parser = argparse.ArgumentParser()
20 |     parser.add_argument('-o', '--origin', type=str, required=True, help='Origin station')
21 |     parser.add_argument('-d', '--destination', type=str, required=True, help='Destination station')
22 |     return parser.parse_args()
23 | 
24 | 
25 | ###############################################################################
26 | # PARAMETERS
27 | 
28 | # cli
29 | 
30 | args = get_cli()
31 | 
32 | ORIGIN = args.origin
33 | DESTINATION = args.destination
34 | 
35 | # database
36 | 
37 | USER = 'workshop'
38 | PASSWORD = 'workshop'
39 | HOST = 'server-davidadrian.asuscomm.com'
40 | PORT = 5433
41 | DATABASE = 'renfe'
42 | CONN_STRING = f'postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}'
43 | 
44 | # model
45 | 
46 | ENCODER_PATH = '/home/david/Nextcloud-gurus/repos/freelance/teaching/ironhack/' \
47 |                'ml-in-production-madrid/output/pickle_data/encoder.joblib'
48 | ENCODER = joblib.load(ENCODER_PATH)
49 | ENCODE_COLS = ['train_type', 'train_class', 'fare', 'origin', 'destination']
50 | FEATURES = [
51 |     'train_type',
52 |     #'train_class',
53 |     #'fare',
54 |     'duration',
55 |     'time_to_departure',
56 |     'hour',
57 |     'weekday'
58 | ]
59 | 
60 | ENDPOINT_CLUSTERING = 'http://server-davidadrian.asuscomm.com:8926/invocations'
61 | 
62 | # AWS
63 | 
64 | ENDPOINT_XGB = 'ml-in-production-madrid-sagemaker-api-endpoint'
65 | ENDPOINT_RF = 'ml-in-prod-mad-mlf-api-ep'
66 | runtime = boto3.client('runtime.sagemaker')
67 | 
68 | 
69 | ###############################################################################
70 | # FUNCTIONS
71 | 
72 | 
73 | def get_logger():
74 |     logger = logging.getLogger()
75 |     logger.setLevel(logging.DEBUG)
76 |     ch = logging.StreamHandler()
77 |     ch.setLevel(logging.INFO)
78 |     formatter = logging.Formatter(
79 |         '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
80 |     ch.setFormatter(formatter)
81 |     logger.addHandler(ch)
82 | 
83 |     return logger
84 | 
85 | 
86 | logger = get_logger()
87 | 
88 | 
89 | def mock_alarms():
90 |     logger.info("getting (mocking) user alarms...")
91 |     sql_query = f"""
92 |         select * from trips tablesample system (10)
93 |         where origin = '{ORIGIN}'
94 |         and destination = '{DESTINATION}'
95 |         and start_date > '{(datetime.now() + timedelta(days=15)).strftime("%Y-%m-%d %H:%M:%S")}'
96 |         limit 100
97 |     """
98 | 
99 |     return pd.read_sql_query(sql=sql_query, con=CONN_STRING)
100 | 
101 | 
102 | def add_features(renfe_df):
103 |     logger.info("adding new features...")
104 |     renfe_df['duration'] = (renfe_df['end_date'] - renfe_df['start_date']).dt.seconds / 3600
105 |     renfe_df['time_to_departure'] = (renfe_df['start_date'].dt.tz_localize('Europe/Madrid').dt.tz_convert('UTC') \
106 |                                      - renfe_df['insert_date'].dt.tz_localize('UTC')).dt.days
107 |     renfe_df['hour'] = renfe_df['start_date'].dt.hour
108 |     renfe_df['weekday'] = renfe_df['start_date'].dt.dayofweek
109 | 
110 | 
111 | def preprocessing(renfe_df):
112 |     logger.info("preprocessing data...")
113 |     renfe_df.dropna(inplace=True)
114 |     add_features(renfe_df)
115 |     renfe_df.loc[:, ENCODE_COLS] = ENCODER.transform(renfe_df[ENCODE_COLS])
116 | 
117 | 
118 | def get_forecast(renfe_df, model):
119 |     logger.info('calling endpoint...')
120 | 
121 |     if model == 'xgb':
122 |         response = runtime.invoke_endpoint(EndpointName=ENDPOINT_XGB,
123 |                                            ContentType='text/csv',
124 |                                            Body=renfe_df.to_csv(header=False,
125 |                                                                 index=False))
126 |         y_pred = list(map(lambda x: float(x), response['Body'].read().decode().split(',')))
127 |         return y_pred
128 | 
129 |     elif model == 'rf':
130 |         response = runtime.invoke_endpoint(EndpointName=ENDPOINT_RF,
131 |                                            ContentType='application/json',
132 |                                            Body=renfe_df.to_json(orient='split'))
133 | 
134 |         y_pred = json.loads(response['Body'].read().decode())
135 |         return y_pred
136 | 
137 |     elif model == 'clustering':
138 | 
139 |         headers = {
140 |             'Content-Type': 'application/json',
141 |         }
142 | 
143 |         r = requests.post(url=ENDPOINT_CLUSTERING,
144 |                           headers=headers,
145 |                           data=renfe_df.to_json(orient='split'))
146 | 
147 |         y_pred = json.loads(r.text)  # the server returns a JSON list of cluster ids
148 |         return y_pred
149 | 
150 |     raise ValueError(f"unknown model: '{model}'")
151 | 
152 | 
153 | ###############################################################################
154 | # MAIN
155 | 
156 | if __name__ == '__main__':
157 |     alarms = mock_alarms()
158 |     preprocessing(alarms)
159 | 
160 |     for alarm in alarms.itertuples():
161 |         logger.info(f"forecasting price for ticket from {ORIGIN} to {DESTINATION} with "
162 |                     f"alarm price: {alarm.price}...")
163 | 
164 |         # one prediction row per day left until departure; scalar fields broadcast
165 |         predict_df = pd.DataFrame({
166 |             'train_type': alarm.train_type,
167 |             'duration': alarm.duration,
168 |             'time_to_departure': range(1, alarm.time_to_departure),
169 |             'hour': alarm.hour,
170 |             'weekday': alarm.weekday
171 |         })
172 | 
173 |         forecast_xgb = get_forecast(predict_df, model='xgb')
174 |         forecast_rf = get_forecast(predict_df, model='rf')
175 | 
176 |         logger.info('----------------------------------------------------------')
177 |         logger.info(f'current price is: {alarm.price}')
178 |         logger.info('----------------------------------------------------------')
179 |         logger.info(f'max price predicted by xgb model is: {max(forecast_xgb)}')
180 |         logger.info(f'min price predicted by xgb model is: {min(forecast_xgb)}')
181 |         logger.info('----------------------------------------------------------')
182 |         logger.info(f'max price predicted by rf model is: {max(forecast_rf)}')
183 |         logger.info(f'min price predicted by rf model is: {min(forecast_rf)}')
184 |         logger.info('----------------------------------------------------------')
185 | 
--------------------------------------------------------------------------------
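Note: each BashOperator in renfe_dag.py invokes this script through the Jinja command template shown above. A rendered call looks like the following (a sketch using the placeholder paths and one origin/destination pair from the DAG settings):

    /path/to/python/interpreter /path/to/script.py -o MADRID -d SEVILLA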
/scripts/dags/simple_dag.py:
--------------------------------------------------------------------------------
1 | import airflow
2 | from airflow import DAG
3 | from airflow.operators.bash_operator import BashOperator
4 | from airflow.operators.dummy_operator import DummyOperator
5 | from datetime import timedelta
6 | 
7 | ###############################################################################
8 | # DAG
9 | 
10 | default_args = {
11 |     'owner': 'airflow',
12 |     'depends_on_past': False,
13 |     'start_date': airflow.utils.dates.days_ago(1),
14 |     'email_on_failure': False,
15 |     'email_on_retry': False,
16 |     'retries': 1,
17 |     'retry_delay': timedelta(minutes=2),
18 | }
19 | 
20 | dag = DAG(dag_id='simple_dag',
21 |           schedule_interval='@hourly',
22 |           catchup=False,
23 |           default_args=default_args)
24 | 
25 | ###############################################################################
26 | # TASKS
27 | 
28 | start = DummyOperator(task_id='start', dag=dag)
29 | end = DummyOperator(task_id='end', dag=dag)
30 | 
31 | # example task 1
32 | 
33 | task_1 = BashOperator(
34 |     task_id='task_1',
35 |     bash_command="echo 'executing bash command: cd {{ params.dir }} && ./{{ params.script }}'",
36 |     params={'dir': '/path/to/your/directory/',
37 |             'script': 'script.sh'},
38 |     dag=dag)
39 | 
40 | # example task 2
41 | 
42 | task_2_command = """
43 | echo 'executing bash command: cd {{ params.dir }} \
44 | && sudo -u {{ params.user }} ./{{ params.script }} \
45 | --process={{ params.process }}'
46 | """
47 | 
48 | task_2 = BashOperator(
49 |     task_id='task_2',
50 |     bash_command=task_2_command,
51 |     params={'dir': '/path/to/your/directory/',
52 |             'user': 'user',
53 |             'script': 'script.sh',
54 |             'process': 'process_name'},
55 |     dag=dag)
56 | 
57 | start >> [task_1, task_2] >> end
58 | 
--------------------------------------------------------------------------------
/vm/ubuntu.ova:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ed70ef018aabaaba56fd4a5c496c037e4a784d11c3ce39586148814e4344a0fa
3 | size 2409983488
4 | 
--------------------------------------------------------------------------------
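Note: vm/ubuntu.ova is stored with Git LFS, so the file above is only a pointer; run `git lfs pull` to fetch the actual VM image. Per the LFS spec, the oid is the SHA-256 of the real file, so the download can be verified with a sketch like:

    sha256sum vm/ubuntu.ova
    # expected: ed70ef018aabaaba56fd4a5c496c037e4a784d11c3ce39586148814e4344a0fa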