├── .gitignore
├── LICENSE
├── README.md
├── project-boilerplates
│   ├── reinforcement-learning
│   │   ├── .envrc
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── requirements.txt
│   │   ├── rl_boilerplate
│   │   │   ├── __init__.py
│   │   │   ├── agent.py
│   │   │   ├── config.py
│   │   │   ├── environment.py
│   │   │   ├── main.py
│   │   │   └── network.py
│   │   └── setup.py
│   ├── sending-images-streamlit-fastapi
│   │   ├── README.md
│   │   ├── backend
│   │   │   ├── .dockerignore
│   │   │   ├── Dockerfile
│   │   │   ├── MANIFEST.in
│   │   │   ├── Makefile
│   │   │   ├── face_rec
│   │   │   │   ├── __init__.py
│   │   │   │   ├── face_detection.py
│   │   │   │   └── haarcascade_frontalface_default.xml
│   │   │   ├── fast_api
│   │   │   │   ├── __init__.py
│   │   │   │   └── api.py
│   │   │   ├── notebooks
│   │   │   │   ├── .ipynb_checkpoints
│   │   │   │   │   └── face_detection-checkpoint.ipynb
│   │   │   │   ├── face_detection.ipynb
│   │   │   │   └── haarcascade_frontalface_default.xml
│   │   │   ├── requirements.txt
│   │   │   └── setup.py
│   │   └── frontend
│   │       ├── Dockerfile
│   │       ├── app.py
│   │       └── requirements.txt
│   ├── time-series-cross-validator-challenge
│   │   ├── .challengifyignore
│   │   ├── .github
│   │   │   └── workflows
│   │   │       └── pythonpackage.yml
│   │   ├── .gitignore
│   │   ├── MANIFEST.in
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── notebooks
│   │   │   ├── test_package.ipynb
│   │   │   └── tutorial_ts_forecating.ipynb
│   │   ├── pytest.ini
│   │   ├── requirements.txt
│   │   ├── setup.py
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── conftest.py
│   │   │   ├── integrated
│   │   │   │   ├── test_main.py
│   │   │   │   └── test_model_performance.py
│   │   │   └── unittests
│   │   │       ├── test_data.py
│   │   │       └── test_model.py
│   │   └── ts_boilerplate
│   │       ├── __init__.py
│   │       ├── dataprep.py
│   │       ├── generate_dummy_data.py
│   │       ├── main.py
│   │       ├── metrics.py
│   │       ├── model.py
│   │       └── params.py
│   └── time-series-cross-validator
│       ├── .challengifyignore
│       ├── .github
│       │   └── workflows
│       │       └── pythonpackage.yml
│       ├── .gitignore
│       ├── MANIFEST.in
│       ├── Makefile
│       ├── README.md
│       ├── TODO.md
│       ├── notebooks
│       │   ├── WIP_tutorial_darts_library.ipynb
│       │   ├── test_package.ipynb
│       │   └── tutorial_ts_forecating.ipynb
│       ├── pytest.ini
│       ├── requirements.txt
│       ├── setup.py
│       ├── tests
│       │   ├── __init__.py
│       │   ├── conftest.py
│       │   ├── integrated
│       │   │   ├── test_main.py
│       │   │   └── test_model_performance.py
│       │   └── unittests
│       │       ├── test_data.py
│       │       └── test_model.py
│       └── ts_boilerplate
│           ├── __init__.py
│           ├── dataprep.py
│           ├── generate_dummy_data.py
│           ├── main.py
│           ├── metrics.py
│           ├── model.py
│           └── params.py
└── tutorials
    ├── .keep
    └── removing-bottlenecks
        ├── demo.ipynb
        ├── model.png
        ├── row_column_wise.png
        ├── slides.ipynb
        ├── slides.slides.html
        └── tensorflow.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | **/.DS_Store
2 | **/__pycache__
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Le Wagon
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## What is this repo about?
3 |
4 | Le Wagon curates a selected list of useful **open-source templates** for data analysis and data science.
5 |
6 | ✅ These templates should be:
7 |
8 | - self-explanatory
9 | - replicable (provided Le Wagon [data-setup](https://github.com/lewagon/data-setup) has been done on the machine)
10 | - focused on one specific theme
11 |
12 | A theme could be:
13 | - A **project boilerplate**: a typical two-week project for four students, in the spirit of what is done at [Le Wagon Data Science bootcamp](https://www.lewagon.com/data-science-course/full-time)
14 | - e.g. "Time-Series cross-validation boilerplate"
15 | - e.g. "Reinforcement Learning boilerplate"
16 | - e.g. "Image classification trainer + streamlit API boilerplate"
17 | - ...
18 |
19 |
20 | - A **tutorial** focused on one specific topic worth sharing
21 | - e.g. "Removing bottlenecks with Numba, Cython, and TensorFlow"
22 |
23 | ## How to contribute?
24 |
25 | Feel free to contribute by adding your suggestions.
26 |
27 | ### Submit a pull request with new templates or improve existing ones
28 | 👉 Fork this repository to your account, and submit pull-requests following [the standard open-source contribution](https://jarv.is/notes/how-to-pull-request-fork-github/) methodology
29 |
30 | 👉 The Le Wagon team will check your PR and integrate it into the list if it passes quality standards
31 |
32 | ### Discuss features/improvements/suggestions
33 | 👉 https://github.com/lewagon/data-templates/discussions
34 |
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/.envrc:
--------------------------------------------------------------------------------
1 | layout python3
2 |
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | .coverage
3 | .ipynb_checkpoints
4 | **/*.DS_Store
5 | data_raw/
6 | data_processed/
7 | *.csv
8 | __pycache__/
9 | .env
10 | .direnv/
11 | .vscode/
12 |
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/README.md:
--------------------------------------------------------------------------------
1 | This is a **boilerplate** repo for a reinforcement learning (RL) project.
2 |
3 | This directory provides an example repository structure for RL projects using PyTorch. This template provides a generic agent using the [deep Q-learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) algorithm, as well as an agent playing random actions for baseline performance. The DQN architecture is in its own class and is hot-swappable with other potential architectures. A sample environment using [OpenAI's Gym](https://github.com/openai/gym) and a generic control loop are also provided.
4 |
5 | Note that since RL projects are rarely data-centric, and data has to be generated on-the-fly, requirements are likely to differ from standard ML projects.
6 |
7 | # Detailed package workflow
8 |
9 | This boilerplate package contains multiple modules:
10 |
11 | - `main.py` is the entry point of the package. It defines the agent and environment to use.
12 | - `environment.py` defines environment-side setup and execution utilities. It uses the gym package for demonstration purposes.
13 | - `agent.py` defines multiple types of learning agents. We include a random agent and a deep Q-learning agent for demonstration purposes.
14 | - `config.py` defines a singleton class used for storing simulation parameters. This class is globally available in all modules (through the `CFG` variable). It has to be initialized once (see the module documentation).
15 | - `network.py` defines the neural network used by the DQN agent.
16 |
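17 | A minimal usage sketch, mirroring `main.py` (the hyperparameter values are just examples):
18 |
19 | ```python
20 | from rl_boilerplate import agent, environment
21 | from rl_boilerplate.config import CFG
22 |
23 | # Two-step config init: kwargs override the defaults defined in config.py
24 | CFG.init("dqn", rnd_seed=42, epsilon=0.1)
25 |
26 | agt = agent.DQNAgent_pt(8, 4)  # LunarLander: 8 observation dims, 4 actions
27 | env = environment.get_env()
28 | environment.run_env(env, agt, 0)
29 | ```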
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/requirements.txt:
--------------------------------------------------------------------------------
1 | # Update as needed
2 | torch
3 | #tensorflow
4 | gymnasium
5 | gymnasium[box2d]
6 | tqdm
7 |
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/rl_boilerplate/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/reinforcement-learning/rl_boilerplate/__init__.py
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/rl_boilerplate/agent.py:
--------------------------------------------------------------------------------
1 | """
2 | Agent module.
3 | """
4 |
5 | import random
6 | import torch
7 | import torch.nn
8 | import tensorflow as tf
9 |
10 | from rl_boilerplate import network
11 | from rl_boilerplate.config import CFG
12 |
13 |
14 | class Agent:
15 | """
16 | A learning agent parent class.
17 | """
18 |
19 | def __init__(self):
20 | pass
21 |
22 | def set(self):
23 | """
24 | Make the agent learn from a (s, a, r, s') tuple.
25 | """
26 | raise NotImplementedError
27 |
28 | def get(self):
29 | """
30 | Request a next action from the agent.
31 | """
32 | raise NotImplementedError
33 |
34 |
35 | class RandomAgent(Agent):
36 | """
37 | A random playing agent class.
38 | """
39 |
40 | def set(self, obs_old, act, rwd, obs_new):
41 | """
42 | A random agent doesn't learn.
43 | """
44 | return
45 |
46 | def get(self, obs_new, act_space):
47 | """
48 | Simply return a random action.
49 | """
50 | return act_space.sample()
51 |
52 |
53 | class DQNAgent_pt(Agent):
54 | """
55 | A basic pytorch Deep Q-learning agent.
56 | """
57 |
58 | def __init__(self, x_dim, y_dim):
59 | self.net = network.DQN_pt(x_dim, y_dim)
60 | self.opt = torch.optim.Adam(self.net.parameters(), lr=0.0001)
61 |
62 | def set(self, obs_old, act, rwd, obs_new):
63 | """
64 | Learn from a single observation sample.
65 | """
66 | obs_old, obs_new = torch.tensor(obs_old), torch.tensor(obs_new)
67 |
68 | # We get the network output for the action taken, using the old observation
69 | out = self.net(obs_old)[act]
70 |
71 | # We compute the target from the new observation
72 | with torch.no_grad():
73 | exp = rwd + CFG.gamma * self.net(obs_new).max()
74 |
75 | # Compute the loss
76 | loss = torch.square(exp - out)
77 |
78 | # Perform a backward propagation.
79 | self.opt.zero_grad()
80 | loss.sum().backward()
81 | self.opt.step()
82 |
83 | def get(self, obs_new, act_space):
84 | """
85 | Run an epsilon-greedy policy for next action selection.
86 | """
87 | # Return random action with probability epsilon
88 | if random.uniform(0, 1) < CFG.epsilon:
89 | return act_space.sample()
90 | # Else, return action with highest value
91 | with torch.no_grad():
92 | # Get the values of all possible actions
93 | val = self.net(torch.tensor(obs_new))
94 | # Choose the highest-valued action
95 | return torch.argmax(val).numpy()
96 |
97 | class DQNAgent_tf(Agent):
98 | """
99 | A basic tensorflow Deep Q-learning agent.
100 | """
101 |
102 | def __init__(self, x_dim, y_dim):
103 | self.net = network.DQN_tf(x_dim, y_dim)
104 | self.opt = tf.optimizers.Adam(learning_rate=0.0001)
105 |
106 | def set(self, obs_old, act, rwd, obs_new):
107 | """
108 | Learn from a single observation sample.
109 | """
110 |
111 | obs_old = obs_old.reshape(1, -1)
112 | obs_new = obs_new.reshape(1, -1)
113 |
114 | with tf.GradientTape() as tape:
115 |
116 | # We get the network output for the action taken, using the old observation
117 | out = self.net(obs_old)[0, act]
118 |
119 | # We compute the target; stop_gradient keeps it constant (like torch.no_grad above)
120 | exp = rwd + CFG.gamma * tf.stop_gradient(tf.reduce_max(self.net(obs_new)))
121 |
122 | # Compute the loss
123 | loss = tf.square(exp - out)
124 |
125 | grads = tape.gradient(loss, self.net.trainable_variables)
126 | self.opt.apply_gradients(zip(grads, self.net.trainable_variables))
127 |
128 | def get(self, obs_new, act_space):
129 | """
130 | Run an epsilon-greedy policy for next action selection.
131 | """
132 | # Return random action with probability epsilon
133 | if random.uniform(0, 1) < CFG.epsilon:
134 | return act_space.sample()
135 | # Else, return action with highest value
136 | # (no gradient tape is active here, so no gradient context is needed in TensorFlow)
137 | return tf.argmax(self.net(obs_new.reshape(1, -1)), axis=1).numpy()[0]
138 |
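139 | # The update implemented in `set` above is one step of Q-learning, regressing
140 | #     Q(s, a)  ->  r + gamma * max_a' Q(s', a')
141 | # e.g. with CFG.gamma = 0.98, rwd = 1.0 and max_a' Q(s', a') = 2.0, the target
142 | # for Q(s, a) is 1.0 + 0.98 * 2.0 = 2.96.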
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/rl_boilerplate/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Configuration Module.
3 |
4 | This module defines a singleton-type configuration class that can be used all across our project. This class can contain any parameter that one may want to change from one simulation run to the other.
5 | """
6 |
7 | import random
8 |
9 |
10 | class Configuration:
11 | """
12 | This configuration class is extremely flexible due to a two-step init process. We only instantiate a single instance of it (at the bottom of this file) so that all modules can import this singleton at load time. The second initialization (which happens in main.py) allows the user to input custom parameters of the config class at execution time.
13 | """
14 |
15 | def __init__(self):
16 | """
17 | Declare parameters and set their default values.
18 | """
19 | self.alpha = 0.2
20 | self.gamma = 0.98
21 | self.epsilon = 1.0
22 | self.rnd_seed = None
23 | self.agt_type = None
24 |
25 | def init(self, agt_type, **kwargs):
26 | """
27 | User-defined configuration init. Mandatory to properly set all configuration parameters.
28 | """
29 |
30 | # Mandatory arguments go here. In this example the agent type is merely stored.
31 | self.agt_type = agt_type
32 |
33 | # We set default values for arguments we have to define
34 | self.rnd_seed = random.randint(0, 1000)
35 | self.epsilon = 0.05
36 |
37 | # However, these arguments can be overridden by passing them as keyword arguments in the init method. Hence, passing for instance epsilon=0.1 as a kwarg to the init method will override the default value we just defined.
38 | self.__dict__.update(kwargs)
39 |
40 | # Once all values are properly set, use them.
41 | random.seed(self.rnd_seed)
42 |
43 |
44 | CFG = Configuration()
45 |
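46 | # Example two-step usage (typically done once, from main.py):
47 | #
48 | #     from rl_boilerplate.config import CFG
49 | #     CFG.init("dqn", epsilon=0.1)  # kwargs override the defaults set in init()
50 | #     CFG.epsilon                   # -> 0.1, readable from any module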
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/rl_boilerplate/environment.py:
--------------------------------------------------------------------------------
1 | """
2 | Environment module.
3 |
4 | This module contains the RL environment. We provide a gym setup by default, which can easily be replaced by other packages such as pettingzoo. Fundamentally, this module is used to simulate the environment and generate (s, a, r, s') tuples for the agent to learn from.
5 | """
6 |
7 | import gymnasium as gym
8 | from tqdm import tqdm
9 |
10 | from rl_boilerplate.config import CFG
11 |
12 | def get_env():
13 | """
14 | Returns a gym environment. Replace by a custom environment if needed.
15 | """
16 | # We use the LunarLander env. Other environments are available.
17 | return gym.make("LunarLander-v2", render_mode="human")
18 |
19 |
20 | def run_env(env, agt, run_number):
21 | """
22 | Run a given environment with a given agent.
23 | """
24 |
25 | obs_old, info = env.reset(seed=CFG.rnd_seed)
26 |
27 | # We get the action space.
28 | act_space = env.action_space
29 |
30 | print(f"Run number: {run_number + 1}")
31 | for _ in range(1000):
32 |
33 | # Visually render the learning environment. Remove render_mode="human" in get_env() for performance.
34 | env.render()
35 |
36 | # We request an action from the agent.
37 | act = agt.get(obs_old, act_space)
38 |
39 | # We apply the action on the environment.
40 | obs_new, rwd, terminated, truncated, _ = env.step(act)
41 |
42 | # We perform a learning step.
43 | agt.set(obs_old, act, rwd, obs_new)
44 |
45 | # Update latest observation
46 | obs_old = obs_new
47 |
48 | if terminated or truncated:
49 | obs_old, info = env.reset()
50 |
51 | env.close()
52 |
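53 | # To use another environment, change get_env() above, e.g. (CartPole is registered in gymnasium):
54 | #     return gym.make("CartPole-v1", render_mode="human")
55 | # (also adjust the hardcoded state/action dimensions in main.py: CartPole uses 4 and 2)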
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/rl_boilerplate/main.py:
--------------------------------------------------------------------------------
1 | from rl_boilerplate import agent, environment
2 |
3 | from rl_boilerplate.config import CFG
4 |
5 | # We initialize our configuration class
6 | CFG.init("", rnd_seed=22)
7 |
8 | # We create an agent. State and action spaces are hardcoded here.
9 | agt = agent.DQNAgent_tf(8, 4)
10 |
11 | # Run a learning process
12 | for i in range(1000):
13 | env = environment.get_env()
14 | environment.run_env(env, agt, i)
15 |
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/rl_boilerplate/network.py:
--------------------------------------------------------------------------------
1 | """
2 | Neural network module.
3 |
4 | This module defines architectures used by reinforcement learning agents.
5 | """
6 |
7 | import tensorflow as tf
8 | import torch
9 | import torch.nn
10 |
11 |
12 | class DQN_pt(torch.nn.Module):
13 | """
14 | PyTorch implementation of a Deep Q-Network with 3 linear layers.
15 | x_dim refers to the number of dimensions to pass as input
16 | y_dim refers to the action space of the agent
17 | """
18 |
19 | def __init__(self, x_dim, y_dim):
20 | super().__init__()
21 |
22 | self.net = torch.nn.Sequential(
23 | torch.nn.Linear(x_dim, 128),
24 | torch.nn.ReLU(inplace=True),
25 | torch.nn.Linear(128, 128),
26 | torch.nn.ReLU(inplace=True),
27 | # Linear output layer: no final activation, so Q-values can be negative
28 | torch.nn.Linear(128, y_dim),
29 | )
30 |
31 | def forward(self, obs):
32 | return self.net(obs)
33 |
34 | class DQN_tf(tf.keras.Model):
35 | """
36 | Tensorflow implementation of a Deep Q-Network with 3 linear layers.
37 | x_dim refers to the number of dimensions to pass as input
38 | y_dim refers to the action space of the agent
39 | """
40 |
41 | def __init__(self, x_dim, y_dim):
42 | super().__init__()
43 | self.layer1 = tf.keras.layers.Dense(128, activation="relu", input_shape=(x_dim,))
44 | self.layer2 = tf.keras.layers.Dense(128, activation="relu")
45 | self.layer3 = tf.keras.layers.Dense(y_dim)  # linear output, so Q-values can be negative
46 |
47 | def call(self, obs):
48 | x = self.layer1(obs)
49 | x = self.layer2(x)
50 | return self.layer3(x)
51 |
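52 | # Quick shape check, a sketch assuming LunarLander-like dimensions (8 observations, 4 actions):
53 | #     net = DQN_pt(8, 4)
54 | #     net(torch.zeros(1, 8)).shape  # -> torch.Size([1, 4]), one Q-value per action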
--------------------------------------------------------------------------------
/project-boilerplates/reinforcement-learning/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 |
4 | with open('requirements.txt') as f:
5 | content = f.readlines()
6 | requirements = [x.strip() for x in content if 'git+' not in x]
7 |
8 | setup(name='rl-boilerplate',
9 | version="1.0",
10 | description="Trainer Boilerplate for Reinforcement Learning Projects",
11 | packages=find_packages(),
12 | install_requires=requirements,
13 | include_package_data=True,
14 | zip_safe=False)
15 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/README.md:
--------------------------------------------------------------------------------
1 | # Simple Face Annotation with Streamlit, FastAPI and OpenCV
2 |
3 | This is a boilerplate for any project that involves sending an image from a web UI to an API, performing some manipulation on the image, and sending it back. The example here is a simple face recognition app using `OpenCV`'s built-in [Haar Cascade object detection algorithm](https://pyimagesearch.com/2021/04/12/opencv-haar-cascades/).
4 |
5 | ### What's here:
6 |
7 | * [Streamlit](https://docs.streamlit.io/) on the frontend
8 | * [FastAPI](https://fastapi.tiangolo.com/) on the backend
9 | * [PIL/pillow](https://pillow.readthedocs.io/en/stable/) and [opencv-python](https://github.com/opencv/opencv-python) for working with images
10 | * Backend and frontend can be deployed with Docker
11 |
12 | ### Using this template
13 |
14 | > From inside the `backend` folder:
15 |
16 | You can serve the API with `uvicorn fast_api.api:app --reload` (default port is `8000`)
17 |
18 | > From inside the `frontend` folder:
19 |
20 | You can serve the frontend with `streamlit run app.py` (default port is `8501`)
21 |
22 | ### Using this template with Docker
23 |
24 | Both the `frontend` and `backend` have corresponding `Dockerfile`s for the web UI and API.
25 |
26 | 1. To create a Docker image, inside the corresponding folders run `docker build -t NAME_FOR_THE_IMAGE .`
27 | 2. Run a container for either API or UI with `docker run -p MACHINE_PORT:CONTAINER_PORT NAME_FOR_THE_IMAGE`;
28 |
29 | Here, `MACHINE_PORT` is the `localhost` port you want to link to the container, while `CONTAINER_PORT` is the port which will be used by the running app in the container.
30 |
31 |
32 | 3. ❗ You won't be able to reach the API container through `localhost`; you'll need to [link](https://docs.docker.com/network/links/) the containers:
33 |
34 | * **API:** `docker run -p 8000:8000 --name api NAME_FOR_THE_API_IMAGE`
35 | * **UI:** `docker run -p 8501:8501 --link api:api NAME_FOR_THE_UI_IMAGE`
36 |
37 | This way you can use `api` instead of `localhost` to reach the API container from the frontend
38 |
39 | ❗ Note that Docker docs mention that `--link` might be removed in the future (as of 2022.06). Alternatives can be [user-defined bridges](https://docs.docker.com/network/bridge/#differences-between-user-defined-bridges-and-the-default-bridge) or [Docker Compose](https://docs.docker.com/compose/)
40 |
41 | Have fun!
42 |
43 |
44 |
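45 | ### Testing the API from Python
46 |
47 | A quick sanity check for the backend, e.g. with `requests` (the file names here are just examples):
48 |
49 | ```python
50 | import requests
51 |
52 | with open("some_photo.png", "rb") as f:
53 |     res = requests.post("http://localhost:8000/upload_image", files={"img": f})
54 |
55 | with open("annotated.png", "wb") as out:
56 |     out.write(res.content)  # the PNG returned by the API, faces boxed in red
57 | ```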
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/.dockerignore:
--------------------------------------------------------------------------------
1 | /notebooks
2 | /__pycache__
3 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.8.12-buster
2 |
3 | WORKDIR /app
4 |
5 | # libraries required by OpenCV
6 | RUN apt-get update
7 | RUN apt-get install \
8 | 'ffmpeg'\
9 | 'libsm6'\
10 | 'libxext6' -y
11 |
12 | COPY requirements.txt .
13 | RUN pip install -r requirements.txt
14 |
15 | COPY . .
16 |
17 | # You can add --port $PORT if you need to set PORT as a specific env variable
18 | CMD uvicorn fast_api.api:app --host 0.0.0.0 --port $PORT
19 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/Makefile:
--------------------------------------------------------------------------------
1 | ##### Prediction API - - - - - - - - - - - - - - - - - - - - - - - - -
2 |
3 | run_api:
4 | uvicorn fast_api.api:app --reload
5 |
6 | ##### Docker - - - - - - - - - - - - - - - - - - - - - - - - -
7 |
8 | docker_build:
9 | docker build -t template-image-api .
10 |
11 | docker_run:
12 | docker run -p 8000:8000 --name api template-image-api
13 |
14 | ##### GCP - - - - - - - - - - - - - - - - - - - - - - - - -
15 |
16 | GCP_PROJECT_ID=XXX
17 |
18 | DOCKER_IMAGE_NAME=XXX
19 |
20 | # https://cloud.google.com/storage/docs/locations#location-mr
21 | GCR_MULTI_REGION=XXX
22 |
23 | # https://cloud.google.com/compute/docs/regions-zones#available
24 | REGION=XXX
25 |
26 | build_gcr_image:
27 | docker build -t $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME) .
28 |
29 | build_gcr_image_m1:
30 | docker build --platform linux/amd64 -t $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME) .
31 |
32 | run_gcr_image:
33 | docker run -e PORT=8000 -p 8080:8000 $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME)
34 |
35 | push_gcr_image:
36 | docker push $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME)
37 |
38 | gcr_deploy:
39 | gcloud run deploy --image $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME) --platform managed --region $(REGION)
40 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/face_rec/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/sending-images-streamlit-fastapi/backend/face_rec/__init__.py
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/face_rec/face_detection.py:
--------------------------------------------------------------------------------
1 | import cv2
2 |
3 | def annotate_face(img_np_array):
4 | """
5 | Detect and annotate faces with a red square.
6 | `img_np_array` should be a (width, height, 3) shape np.array
7 | """
8 | # Load the default cascade from OpenCV
9 | face_cascade = cv2.CascadeClassifier('face_rec/haarcascade_frontalface_default.xml')
10 |
11 | # Detect faces
12 | faces = face_cascade.detectMultiScale(img_np_array, 1.1, 4)
13 |
14 | # Draw rectangle around the faces
15 | for (x, y, w, h) in faces:
16 | cv2.rectangle(img_np_array, (x, y), (x+w, y+h), (0, 0, 255), 2)
17 |
18 | # return image numpy array
19 | return img_np_array
20 |
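21 | # Example usage, assuming a local image file (names are placeholders):
22 | #     img = cv2.imread("some_photo.png")
23 | #     cv2.imwrite("annotated.png", annotate_face(img))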
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/fast_api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/sending-images-streamlit-fastapi/backend/fast_api/__init__.py
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/fast_api/api.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI, UploadFile, File
2 | from fastapi.middleware.cors import CORSMiddleware
3 | from starlette.responses import Response
4 |
5 | import numpy as np
6 | import cv2
7 | import io
8 | from face_rec.face_detection import annotate_face
9 |
10 | app = FastAPI()
11 |
12 | # # Allow all requests (optional, good for development purposes)
13 | # app.add_middleware(
14 | # CORSMiddleware,
15 | # allow_origins=["*"], # Allows all origins
16 | # allow_credentials=True,
17 | # allow_methods=["*"], # Allows all methods
18 | # allow_headers=["*"], # Allows all headers
19 | # )
20 |
21 | @app.get("/")
22 | def index():
23 | return {"status": "ok"}
24 |
25 | @app.post('/upload_image')
26 | async def receive_image(img: UploadFile=File(...)):
27 | ### Receiving and decoding the image
28 | contents = await img.read()
29 |
30 | nparr = np.frombuffer(contents, np.uint8)  # np.fromstring is deprecated
31 | cv2_img = cv2.imdecode(nparr, cv2.IMREAD_COLOR) # type(cv2_img) => numpy.ndarray
32 |
33 | ### Do cool stuff with your image.... For example face detection
34 | annotated_img = annotate_face(cv2_img)
35 |
36 | ### Encoding and responding with the image
37 | im = cv2.imencode('.png', annotated_img)[1] # extension depends on which format is sent from Streamlit
38 | return Response(content=im.tobytes(), media_type="image/png")
39 |
40 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/requirements.txt:
--------------------------------------------------------------------------------
1 | # packaging
2 | pip>=9
3 | setuptools>=26
4 | twine
5 | wheel>=0.29
6 |
7 | # data science
8 | six
9 | numpy
10 |
11 | # api
12 | fastapi
13 | uvicorn
14 | python-multipart
15 |
16 | # img
17 | opencv-python
18 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/backend/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 |
4 | with open('requirements.txt') as f:
5 | content = f.readlines()
6 | requirements = [x.strip() for x in content if 'git+' not in x]
7 |
8 | setup(name='FaceRecApi',
9 | version="1.0",
10 | description="Simple FastAPI with face recognition",
11 | packages=find_packages(),
12 | # include_package_data: to install data from MANIFEST.in
13 | include_package_data=True,
14 | install_requires=requirements)
15 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/frontend/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.8.12-buster
2 |
3 | WORKDIR /app
4 |
5 | COPY requirements.txt .
6 | RUN pip install -r requirements.txt
7 |
8 | COPY . .
9 |
10 | # You can add --server.port $PORT if you need to set PORT as a specific env variable
11 | CMD streamlit run app.py
12 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/frontend/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from PIL import Image
3 | import requests
4 | from dotenv import load_dotenv
5 | import os
6 |
7 | # Set page tab display
8 | st.set_page_config(
9 | page_title="Simple Image Uploader",
10 | page_icon= '🖼️',
11 | layout="wide",
12 | initial_sidebar_state="expanded",
13 | )
14 |
15 | # Example local Docker container URL
16 | # url = 'http://api:8000'
17 | # Example localhost development URL
18 | # url = 'http://localhost:8000'
19 | load_dotenv()
20 | url = os.getenv('API_URL')
21 |
22 |
23 | # App title and description
24 | st.header('Simple Image Uploader 📸')
25 | st.markdown('''
26 | > This is a Le Wagon boilerplate for any data science projects that involve exchanging images between a Python API and a simple web frontend.
27 |
28 | > **What's here:**
29 |
30 | > * [Streamlit](https://docs.streamlit.io/) on the frontend
31 | > * [FastAPI](https://fastapi.tiangolo.com/) on the backend
32 | > * [PIL/pillow](https://pillow.readthedocs.io/en/stable/) and [opencv-python](https://github.com/opencv/opencv-python) for working with images
33 | > * Backend and frontend can be deployed with Docker
34 | ''')
35 |
36 | st.markdown("---")
37 |
38 | ### Create a native Streamlit file upload input
39 | st.markdown("### Let's do a simple face recognition 👇")
40 | img_file_buffer = st.file_uploader('Upload an image')
41 |
42 | if img_file_buffer is not None:
43 |
44 | col1, col2 = st.columns(2)
45 |
46 | with col1:
47 | ### Display the image user uploaded
48 | st.image(Image.open(img_file_buffer), caption="Here's the image you uploaded ☝️")
49 |
50 | with col2:
51 | with st.spinner("Wait for it..."):
52 | ### Get bytes from the file buffer
53 | img_bytes = img_file_buffer.getvalue()
54 |
55 | ### Make a request to the API, sending the image bytes as multipart form data
56 | res = requests.post(url + "/upload_image", files={'img': img_bytes})
57 |
58 | if res.status_code == 200:
59 | ### Display the image returned by the API
60 | st.image(res.content, caption="Image returned from API ☝️")
61 | else:
62 | st.markdown("**Oops**, something went wrong 😓 Please try again.")
63 | print(res.status_code, res.content)
64 |
65 |
--------------------------------------------------------------------------------
/project-boilerplates/sending-images-streamlit-fastapi/frontend/requirements.txt:
--------------------------------------------------------------------------------
1 | # packaging
2 | pip>=9
3 | setuptools>=26
4 | twine
5 | wheel>=0.29
6 |
7 | # data science
8 | six
9 |
10 | # img
11 | pillow
12 |
13 | # web
14 | streamlit
15 | requests
16 | python-dotenv
17 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/.challengifyignore:
--------------------------------------------------------------------------------
1 | TODO.md
2 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/.github/workflows/pythonpackage.yml:
--------------------------------------------------------------------------------
1 |
2 | # 🤖 usage
3 | #
4 | # this file contains the conf for GitHub Continuous Integration
5 | # and Continuous Deployment to Heroku
6 | #
7 | # in order to activate the tests in GitHub CI:
8 | # - uncomment the content of the CI paragraph (lines 41-55)
9 | # - create some tests in the tests/ directory
10 | #
11 | # in order to activate CD to Heroku:
12 | # - activate the tests in GitHub CI
13 | # - uncomment the content of the CD paragraph (lines 57-75)
14 |
15 | name: Python package
16 |
17 | on:
18 | push:
19 | branches: [ master ]
20 | pull_request:
21 | branches: [ master ]
22 |
23 | jobs:
24 |
25 | # 🤖 CI paragraph
26 | #
27 | # uncomment the content of this paragraph to activate the tests in GitHub CI
28 | # - remove the 2 leading characters "# ", do not change the spaces
29 | # (the `name` keys should be at the same level as the `uses` key)
30 | # (the `strategy` key should be at the same level as the `steps` key)
31 |
32 | build:
33 |
34 | runs-on: ubuntu-latest
35 |
36 | steps:
37 | - uses: actions/checkout@v2
38 | - name: Say hello
39 | run: |
40 | echo "Hello, World!"
41 | # - name: Set up Python ${{ matrix.python-version }}
42 | # uses: actions/setup-python@v1
43 | # with:
44 | # python-version: ${{ matrix.python-version }}
45 | # - name: Install dependencies
46 | # run: |
47 | # python -m pip install --upgrade pip
48 | # pip install -r requirements.txt
49 | # - name: Install package and test
50 | # run: |
51 | # make install test clean
52 |
53 | # strategy:
54 | # matrix:
55 | # python-version: [3.8]
56 |
57 | # # 🤖 CD paragraph
58 | # #
59 | # # uncomment the following lines to activate CD to Heroku
60 | # # - remove the 2 leading characters "# ", do not change the spaces
61 | # # (there should be 2 spaces before the `deploy_heroku` key)
62 | # # - keep in mind you also need to configure Heroku HEROKU_API_KEY and HEROKU_EMAIL in GitHub secrets
63 | # # - and replace REPLACE_WITH_YOUR_HEROKU_APP_NAME in this file with the name of your Heroku app
64 |
65 | # deploy_heroku:
66 | # needs: build
67 | # runs-on: ubuntu-latest
68 |
69 | # steps:
70 | # - uses: actions/checkout@v2
71 | # - uses: akhileshns/heroku-deploy@v3.0.4 # This is the action
72 | # with:
73 | # heroku_api_key: ${{secrets.HEROKU_API_KEY}}
74 | # heroku_app_name: "REPLACE_WITH_YOUR_HEROKU_APP_NAME" # Must be unique in Heroku
75 | # heroku_email: ${{secrets.HEROKU_EMAIL}}
76 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | .coverage
3 | .ipynb_checkpoints
4 | **/*.DS_Store
5 | data_raw/
6 | data_processed/
7 | *.csv
8 | __pycache__/
9 | .env
10 | .vscode/
11 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/Makefile:
--------------------------------------------------------------------------------
1 | # TODO
2 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/README.md:
--------------------------------------------------------------------------------
1 | This is a **boilerplate** repo for a machine-learning project involving **Time Series forecasting**.
2 |
3 | In particular:
4 |
5 | - It provides a **cross-validation framework** to ensure models are tested thoroughly and without data leakage
6 | - It is agnostic of the type of model involved
7 | - It is well suited for short research projects, typical of a few-week coding bootcamp such as Le Wagon Data Science
8 |
9 | # Detailed package workflow
10 |
11 | ## Architecture
12 | - `ts_boilerplate` package
13 | - `main.py` comprises the main routes to be called from the CLI (`train`, `cross-validate`, `backtest`)
14 | - `params.py` contains project-level global variables to be set manually
15 |
16 |
17 | - `data` folder contains
18 | - `raw` and `clean` folders should contain the **2D array time series `data`**, with axis 0 representing integer timesteps and axis 1 containing target and covariate columns, as per this [picture](https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true)
19 | ```python
20 | data.shape = (length, n_targets+n_covariates)
21 | ```
22 | - `Xy` may persist your tuple (X, y) of **3D array** training sets to be fed to your models, so you can avoid re-running the preprocessing every time.
23 | ```python
24 | X.shape = (n_samples, input_length, n_covariates)
25 | y.shape = (n_samples, output_length, n_targets)
26 | ```
27 | - `notebooks`
28 | - `test_package.ipynb` will help you understand how the package and the tests have been built.
29 | - `tutorial_ts_forecasting.ipynb` is a recommended read before diving into this project. It contains visuals that will help you fill in the global project params and understand the naming conventions.
30 |
31 |
32 |
33 | - `tests` folder detailed below
34 |
35 | ## How to test your code?
36 | First of all, fill in `ts_boilerplate/params.py` to match your actual project's specificities
37 |
38 | Then, run this in your terminal from the root project folder to check your code
39 | - `pytest`
40 | - `pytest -m "not optional"` to only check mandatory tests
41 | - `pytest -m "not optional and not slow"` to also skip tests that may be slow (those involving fitting your model)
42 |
43 |
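44 | ## Quick start
45 |
46 | A minimal sketch of the expected shapes, using the provided dummy-data generator (it assumes `params.py` has been filled):
47 |
48 | ```python
49 | from ts_boilerplate.params import TRAIN
50 | from ts_boilerplate.dataprep import get_X_y, train_test_split
51 | from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase
52 |
53 | data = generate_data_monotonic_increase()        # 2D: (length, n_targets + n_covariates)
54 | data_train, data_test = train_test_split(data, **TRAIN)
55 | X_train, y_train = get_X_y(data_train, **TRAIN)  # 3D arrays, shapes as above
56 | ```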
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/notebooks/test_package.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Step by step guide to Unit Tests used in this project"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "ExecuteTime": {
15 | "end_time": "2022-03-14T14:24:55.279012Z",
16 | "start_time": "2022-03-14T14:24:53.949004Z"
17 | }
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import numpy as np\n",
22 | "import pandas as pd\n",
23 | "import matplotlib.pyplot as plt\n",
24 | "import os\n",
25 | "from ts_boilerplate.params import ROOT_DIR, DATA, TRAIN, CROSS_VAL\n",
26 | "from ts_boilerplate.dataprep import get_X_y, get_folds, train_test_split, get_Xi_yi\n",
27 | "from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase, generate_data_zeros_and_ones, generate_X_y_zeros_and_ones\n",
28 | "from ts_boilerplate.model import get_model, fit_model, predict_output\n",
29 | "from ts_boilerplate.metrics import mape\n",
30 | "\n",
31 | "%load_ext autoreload\n",
32 | "%autoreload 2"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## 1) `generate_dummy_data.py`"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "Let's create a dummy time series dataset whose value increment by 1 every day"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "data = generate_data_monotonic_increase()\n",
56 | "data"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 3,
62 | "metadata": {
63 | "ExecuteTime": {
64 | "end_time": "2022-03-14T14:25:19.973275Z",
65 | "start_time": "2022-03-14T14:25:19.950901Z"
66 | }
67 | },
68 | "outputs": [],
69 | "source": [
70 | "# Store as CSV\n",
71 | "data_df = pd.DataFrame(data)\n",
72 | "data_df.to_csv(os.path.join(ROOT_DIR, \"data\", \"dummy\", \"data_dummy.csv\"), index=False)\n",
73 | "pd.read_csv(os.path.join(ROOT_DIR, \"data\", \"dummy\", \"data_dummy.csv\"))"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "## 2) `dataprep.py`"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "### 2.1) `getX_y`"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 10,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "X, y = get_X_y(data, **TRAIN)\n",
97 | "print(X.shape)\n",
98 | "print(y.shape)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 11,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# Let's compute the shape arithmetically (for unittests)\n",
108 | "(len(data) \\\n",
109 | " - (TRAIN['input_length'] -1) \\\n",
110 | " - (TRAIN['output_length'] -1) \\\n",
111 | " - TRAIN['horizon']) \\\n",
112 | " / TRAIN[\"stride\"]"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "☝️ ceiling rounding function should be used for stride > 1"
120 | ]
121 | },
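{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A sketch of the same count with the ceiling applied, valid for any stride:\n",
"import math\n",
"math.ceil((len(data)\n",
"           - (TRAIN['input_length'] - 1)\n",
"           - (TRAIN['output_length'] - 1)\n",
"           - TRAIN['horizon'])\n",
"          / TRAIN['stride'])"
]
},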
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "### 2.2) `train_test_split`"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 12,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "train_test_ratio = TRAIN[\"train_test_ratio\"]\n",
136 | "input_length = TRAIN[\"input_length\"]\n",
137 | "output_length = TRAIN[\"output_length\"]\n",
138 | "data.shape"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 13,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "last_train_idx = round(train_test_ratio * len(data))\n",
148 | "data_train = data[0:last_train_idx, :]\n",
149 | "\n",
150 | "first_test_idx = last_train_idx - input_length\n",
151 | "data_test = data[first_test_idx:, :]"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 14,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "data_train"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "data_test"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n",
179 | "X_test, y_test = get_X_y(data_test, **TRAIN)\n",
180 | "\n",
181 | "print(\"####### Last train pair\")\n",
182 | "print(X_train[-1])\n",
183 | "print(y_train[-1])\n",
184 | "print(\"####### First test pair\")\n",
185 | "print(X_test[0])\n",
186 | "print(y_test[0])"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 17,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "gap = np.min(y_test) - np.max(y_train)\n",
196 | "gap"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 18,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "assert gap >= TRAIN[\"horizon\"], \"❗️❗️ Data leak detected between (X_train, y_train) and (X_test, y_test)❗️❗️ \""
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "### 2.3) `get_folds`"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "folds = get_folds(data, **CROSS_VAL)\n",
222 | "print('n_folds= ', len(folds))\n",
223 | "print(folds[-1])"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "## 3) `model.py`"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 27,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "data_train, data_test = train_test_split(data, **TRAIN)\n",
240 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n",
241 | "X_test, y_test = get_X_y(data_test, **TRAIN)"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 28,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "import tensorflow as tf\n",
251 | "from keras.models import Model\n",
252 | "from keras.layers import Dense, SimpleRNN, Reshape, Lambda, Input"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 18,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "# BASELINE: PREDICT LAST VALUE - ZERO TRAINABLE WEIGHTS\n",
262 | "input = Input(shape=X_train.shape[1:])\n",
263 | "# Take last temporal values of the targets, and duplicate it as many times as `output_length`\n",
264 | "x = Lambda(\n",
265 | " lambda x: tf.repeat(tf.expand_dims(tf.gather(x[:, -1, :], indices=DATA['target_column_idx'], axis=1), axis=1),\n",
266 | " repeats=TRAIN['output_length'],\n",
267 | " axis=1))(input)\n",
268 | "output = Reshape(y_train.shape[1:])(x)\n",
269 | "model = Model(input, output)\n",
270 | "model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), metrics=tf.keras.metrics.MAPE)\n",
271 | "model.summary()"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 19,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": [
280 | "es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',\n",
281 | " patience=2,\n",
282 | " verbose=0,\n",
283 | " mode='min',\n",
284 | " restore_best_weights=True)\n",
285 | "history = model.fit(X_train,\n",
286 | " y_train,\n",
287 | " epochs=50,\n",
288 | " batch_size=16,\n",
289 | " validation_split=0.3,\n",
290 | " callbacks=[es],\n",
291 | " verbose=0)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 29,
297 | "metadata": {},
298 | "outputs": [],
299 | "source": [
300 | "from ts_boilerplate.metrics import mape\n",
301 | "\n",
302 | "y_pred = model.predict(X_test)\n",
303 | "mape(y_test, y_pred)\n"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "## 4) `main.py`\n"
311 | ]
312 | },
313 | {
314 | "cell_type": "markdown",
315 | "metadata": {},
316 | "source": [
317 | "### 4.1) `train()`"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 10,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "data = generate_data_monotonic_increase()\n",
327 | "data_train, data_test = train_test_split(data, **TRAIN)\n",
328 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n",
329 | "X_test, y_test = get_X_y(data_test, **TRAIN)\n",
330 | "model = get_model(X_train, y_train)\n",
331 | "history = fit_model(model, X_train, y_train)\n",
332 | "y_pred = predict_output(model, X_test)\n",
333 | "metrics_test = mape(y_test, y_pred)\n",
334 | "\n",
335 | "print(\"### Test Metric: \", metrics_test)"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {},
341 | "source": [
342 | "### 4.2) cross_validate()"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {},
349 | "outputs": [],
350 | "source": []
351 | },
352 | {
353 | "cell_type": "markdown",
354 | "metadata": {},
355 | "source": [
356 | "### 4.1) `backtesting()`"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 20,
362 | "metadata": {},
363 | "outputs": [],
364 | "source": [
365 | "y_pred_backtest = []"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 21,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "data = generate_data_monotonic_increase()\n",
375 | "from ts_boilerplate.model import get_model, fit_model, predict_output\n",
376 | "from ts_boilerplate.dataprep import get_Xi_yi"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 25,
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "stride = 10\n",
386 | "start_ratio:float = 0.8\n",
387 | "retrain: bool = True\n",
388 | "retrain_every: int = 50"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 30,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "from tqdm.notebook import tqdm\n",
398 | "\n",
399 | "# Initialization\n",
400 | "start_timestep_0 = round(start_ratio * len(data))\n",
401 | "data_train_0 = data[:start_timestep_0, ...]\n",
402 | "X_train_tmp, y_train_tmp = get_X_y(data_train_0, **TRAIN)\n",
403 | "data_test_backtested = data[start_timestep_0:, ...]\n",
404 | "_, y_test = get_X_y(data_test_backtested, **TRAIN, shuffle=False)\n",
405 | "y_pred_backtested = []\n",
406 | "retrain_counter = 0\n",
407 | "timesteps_backtested_list = []\n",
408 | "\n",
409 | "for i in tqdm(range(0, len(data_test_backtested), stride)):\n",
410 | " start_timestep_i = start_timestep_0 + i\n",
411 | " data_train = data[:start_timestep_i, ...]\n",
412 | " data_test = data[start_timestep_i:, ...]\n",
413 | " X_train_tmp, y_train_tmp = get_X_y(data_train, **TRAIN)\n",
414 | " X_test_i, y_test_i = get_Xi_yi(first_index=0, data=data_test, **TRAIN)\n",
415 | "\n",
416 | " # At some point after sliding through time, we will reach the end of the test set\n",
417 | " if y_test_i.shape[0] < y_train_tmp.shape[1]:\n",
418 | " break\n",
419 | "\n",
420 | " model = get_model(X_train_tmp, y_train_tmp)\n",
421 | "\n",
422 | " # Retrain when required, with incremental learning (ie. starting from previous weights)\n",
423 | " if retrain and i % retrain_every == 0:\n",
424 | " retrain_counter += 1\n",
425 | " fit_model(model, X_train_tmp, y_train_tmp)\n",
426 | "\n",
427 | " y_pred_i = np.squeeze(predict_output(model, X_test_i[None, ...]))\n",
428 | " y_pred_backtested.append(y_pred_i)\n",
429 | " timesteps_backtested_list.append(i)\n",
430 | "\n",
431 | "y_pred_backtested = np.array(y_pred_backtested)\n",
432 | "y_test_backtested = y_test[timesteps_backtested_list]\n",
433 | "# Check that we compare apples to apples\n",
434 | "assert y_pred_backtested.shape == y_test_backtested.shape\n",
435 | "\n",
436 | "metrics_backtested = mape(y_pred_backtested, y_test_backtested)"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 31,
442 | "metadata": {},
443 | "outputs": [],
444 | "source": [
445 | "print(\n",
446 | " f'### BACKETESTED METRICS BASED ON THE LAST {y_pred_backtested.shape[0]} TIMESTEPS AND WITH {retrain_counter} retrain operations'\n",
447 | ")\n",
448 | "print(mape(y_pred_backtested, y_test_backtested))\n"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 32,
454 | "metadata": {},
455 | "outputs": [],
456 | "source": [
457 | "# TODO: make it work for any dimension of y\n",
458 | "plt.plot(y_pred_backtested[:,0,0], label='historical forecasts')\n",
459 | "plt.plot(y_test_backtested[:,0,0], label='truth')\n",
460 | "plt.xlabel('timesteps')\n",
461 | "plt.legend()\n",
462 | "plt.show()"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": null,
468 | "metadata": {},
469 | "outputs": [],
470 | "source": []
471 | }
472 | ],
473 | "metadata": {
474 | "interpreter": {
475 | "hash": "572b4e543617d03e90ecaf525e08695da1ff29b13594f787e33b342cf572f792"
476 | },
477 | "kernelspec": {
478 | "display_name": "Python 3 (ipykernel)",
479 | "language": "python",
480 | "name": "python3"
481 | },
482 | "language_info": {
483 | "codemirror_mode": {
484 | "name": "ipython",
485 | "version": 3
486 | },
487 | "file_extension": ".py",
488 | "mimetype": "text/x-python",
489 | "name": "python",
490 | "nbconvert_exporter": "python",
491 | "pygments_lexer": "ipython3",
492 | "version": "3.8.12"
493 | },
494 | "toc": {
495 | "base_numbering": 1,
496 | "nav_menu": {},
497 | "number_sections": false,
498 | "sideBar": true,
499 | "skip_h1_title": false,
500 | "title_cell": "Table of Contents",
501 | "title_sidebar": "Contents",
502 | "toc_cell": false,
503 | "toc_position": {},
504 | "toc_section_display": true,
505 | "toc_window_display": false
506 | }
507 | },
508 | "nbformat": 4,
509 | "nbformat_minor": 2
510 | }
511 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/notebooks/tutorial_ts_forecating.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Recap\n",
8 | "\n",
9 | "We will go through the main issues you will face when working with Recurrent Neural Networks that are designed to deal with time-series"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Part 1: How to make a proper Time Series Split ?"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "Let's imagine your `data` as 2D array structured as follows\n",
24 | "\n",
25 | "`data.shape = (n_timesteps, n_features)`\n",
26 | "\n",
27 | "`features` can be separated into 3 categories\n",
28 | "- targets\n",
29 | "- past-covariates\n",
30 | "- future-covariates"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "
"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### 1.1) First, create many **FOLDS** for your cross-validation"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "`fold_1.shape = (n_timesteps_per_fold, n_features)` as 2D arrays \n",
52 | "`fold_2.shape = (n_timesteps_per_fold, n_features)` as 2D arrays"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "Always split your training set *chronologically before* your test set\n",
60 | "\n",
61 | "👇 e.g. 4-time cross validation split"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "
"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "Create as many folds as needed to clearly test all type of past conditions \n",
76 | "(e.g crash markets periods 📉, bull-run markets 📈, flat markets 😴 etc...)\n",
77 | "\n",
78 | "It's very common to have **hundreds of folds** in Time Series forecasting!"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "### 1.2) in each FOLD, and for each train or test SET, split your time series into different SEQUENCES of (observations, target)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "
"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "Goal: create (`X_train`, `y_train`, `X_test`, `y_test`) containing all you need to train and test your model for this fold\n",
100 | " \n",
101 | "- `X_train.shape = (n_samples, input_chunk_length, n_covariate_features)`\n",
102 | "- `y_train.shape = (n_samples, output_chunk_length, n_targets)`\n",
103 | "\n",
104 | "Notice that we now have 3D-arrays"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "\n",
112 | "💡 You can randomly sample or create them all sliding from left to right, with selected stride"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "
"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "### 3) 🚨 Beware of the **GAP** of length (horizon - 1) between each train & test sets in each fold to avoid data-leakage"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "
"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "👇 Below is a zoom inside **ONE SINGLE FOLD**\n",
141 | "\n",
142 | "A gap of size `horizon - 1` is mandatory to reflect real situations:\n",
143 | "- Here the forecast horizon is `4` days\n",
144 | "- Let's say we want our train set to end by predicting day `10` based on days before `4, 5, 6`\n",
145 | "- In a real situation we would need to **wait** for day `10` to discover the true value of `y` on which to finalize training\n",
146 | "- Therefore, the test set can only start on day `10`, which is meant to predict `y_test = 10 + 4`"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "horizon $h = 4$\n",
154 | "\n",
155 | "$$ \\Large X^{t+\\color{green}4} = f(X^t, X^{t-1}, X^{t-2}) $$\n",
156 | "\n",
157 | "[figure: zoom inside one fold, with a gap of h - 1 = 3 timesteps between train and test]"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "metadata": {},
163 | "source": [
164 | "✅ Use [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html) `TimeSeriesSplit(n_splits = ..., gap=...)`"
165 | ]
166 | },
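167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "👇 A minimal illustrative sketch (the dummy array and `n_splits=4, gap=3`, i.e. `horizon - 1` for a 4-day horizon, are assumptions):"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "import numpy as np\n",
181 | "from sklearn.model_selection import TimeSeriesSplit\n",
182 | "\n",
183 | "tscv = TimeSeriesSplit(n_splits=4, gap=3)  # gap = horizon - 1\n",
184 | "for fold, (train_idx, test_idx) in enumerate(tscv.split(np.arange(100).reshape(-1, 1))):\n",
185 | "    print(f\"fold {fold}: train ends at t={train_idx[-1]}, test starts at t={test_idx[0]}\")"
186 | ]
187 | },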
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "---"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "# Part 2: Air Pollution Solution"
179 | ]
180 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {},
191 | "source": [
192 | "## 2.1 Data\n",
193 | "\n",
194 | "❓ **Question** ❓ We will load the data from the third and fourth exercises. Load the data, and keep only the following columns: `['pm2.5', 'TEMP', 'DEWP', 'PRES', 'Ir', 'Is', 'Iws']`"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 1,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "import pandas as pd\n",
204 | "\n",
205 | "df = pd.read_csv('https://wagon-public-datasets.s3.amazonaws.com/deep_learning_datasets/air%20pollution.txt', index_col=[0])\n",
206 | "df = df[['pm2.5', 'TEMP', 'DEWP', 'PRES', 'Ir', 'Is', 'Iws']]"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 2,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "df"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "❓ **Question** ❓ For the sake of simplicity, fill in the missing values with the mean over the entire dataset."
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 3,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "df = df.fillna(df.mean())"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 26,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "df.describe()"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "Usually, in classic settings, there are multiple independent sequences $X$, each with a corresponding $y$.\n",
248 | "However, it often happens that we don't have access to multiple sequences $X$, but only to one very long sequence, as is the case here. In that situation, practitioners usually split it into multiple sub-sequences.\n",
249 | "\n",
250 | "\n",
251 | "❓ **Question** ❓ Write a function that extracts a subsequence $X$ together with a target $y$ corresponding to the air pollution **5 days** after the last observation. The length of the subsequence should be an argument of the function."
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 4,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "import numpy as np\n",
261 | "\n",
262 | "def subsample_sequence(df, length):\n",
263 | " pass # YOUR CODE HERE\n",
264 | " return X, y\n",
265 | "\n",
266 | "subsample_sequence(df, 10)"
267 | ]
268 | },
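269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "👇 One possible sketch, meant as a hint rather than the official solution (`subsample_sequence_example` and its random-start strategy are assumptions):"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "def subsample_sequence_example(df, length, horizon=5):\n",
283 | "    # pick a random window of `length` rows, then target pm2.5 `horizon` days after its last row\n",
284 | "    first = np.random.randint(0, len(df) - length - horizon + 1)\n",
285 | "    X = df.iloc[first:first + length]\n",
286 | "    y = df.iloc[first + length + horizon - 1]['pm2.5']\n",
287 | "    return X, y\n",
288 | "\n",
289 | "subsample_sequence_example(df, 10)"
290 | ]
291 | },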
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "❓ **Question** ❓ Given a list of integers, write a function that samples the initial dataset as many times as there are integers in the list. The length of each subsequence is given by the corresponding integer in that list."
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 5,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "def get_X_y(df, length_of_observations):\n",
283 | " X, y = [], []\n",
284 | " pass # YOUR CODE HERE\n",
285 | " return X, y\n",
286 | "\n",
287 | "length_of_observations = np.random.randint(10, 15, 100)\n",
288 | "X, y = get_X_y(df, length_of_observations)"
289 | ]
290 | },
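291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "👇 Again a hedged sketch, simply looping over the sampler above (an assumption, not the graded solution):"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "def get_X_y_example(df, length_of_observations):\n",
305 | "    # one (X, y) pair per requested subsequence length\n",
306 | "    X, y = [], []\n",
307 | "    for length in length_of_observations:\n",
308 | "        Xi, yi = subsample_sequence_example(df, length)\n",
309 | "        X.append(Xi)\n",
310 | "        y.append(yi)\n",
311 | "    return X, y"
312 | ]
313 | },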
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "❓ **Question** ❓ If you split into a train and test set _after_ creating the shorter sequences, you risk having the same values in both the train and test sets, which corresponds to data leakage. Therefore, split into train and test sets first, and only then sample your training and test sequences - and the corresponding outputs.\n",
296 | "\n",
297 | "❗️ Beware of the gap required between train and test!"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 9,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "horizon = 5\n",
307 | "train_size = int(0.8 * len(df))  # 80/20 chronological split (the ratio is an assumption)\n",
308 | "df_train = df.iloc[:train_size]\n",
309 | "df_test = df.iloc[train_size + horizon - 1:]  # gap of horizon - 1 rows avoids leakage\n",
310 | "\n",
311 | "length_of_observations = np.random.randint(10, 15, 100)\n",
312 | "X_train, y_train = get_X_y(df_train, length_of_observations)\n",
313 | "\n",
314 | "length_of_observations = np.random.randint(10, 15, 100)\n",
315 | "X_test, y_test = get_X_y(df_test, length_of_observations)"
311 | ]
312 | },
313 | {
314 | "cell_type": "markdown",
315 | "metadata": {},
316 | "source": [
317 | "Each sequence has a certain number of observations, but this number differs across sequences. Because the Neural Network is trained with *batches* of data, you must ensure that, once concatenated, the sequences can be represented as a single tensor. This operation is called padding.\n",
318 | "\n",
319 | "❓ From the sequences above, return a padded tensor (with the dedicated Keras function) and check its shape."
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 10,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
329 | "pass # YOUR CODE HERE\n",
330 | "\n",
331 | "X_train_pad.shape"
332 | ]
333 | },
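334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "👇 A possible sketch; the sentinel `-1000.` is an assumption, chosen only because it can never occur in the real data:"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": null,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "X_train_pad = pad_sequences(X_train, dtype='float32', padding='post', value=-1000.)\n",
348 | "X_test_pad = pad_sequences(X_test, dtype='float32', padding='post', value=-1000.)\n",
349 | "X_train_pad.shape"
350 | ]
351 | },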
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "## Model\n",
339 | "\n",
340 | "As you added values to your input just for computational reasons, your model has to know which entries are real observations and which are padding. \n",
341 | "\n",
342 | "❓ Initialize a model and add a masking layer so that your model does not take the padded values into account. You have to tell it which value you used for the padding."
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 11,
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "from tensorflow.keras import Sequential, layers\n",
352 | "from tensorflow.keras.layers import Normalization"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 20,
358 | "metadata": {
359 | "tags": [
360 | "challengify"
361 | ]
362 | },
363 | "outputs": [],
364 | "source": [
365 | "# YOUR CODE HERE"
366 | ]
367 | },
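368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "👇 A minimal sketch of such a model (layer sizes are arbitrary assumptions; `mask_value` must match the padding value used above):"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "metadata": {},
379 | "outputs": [],
380 | "source": [
381 | "model = Sequential([\n",
382 | "    layers.Masking(mask_value=-1000., input_shape=X_train_pad.shape[1:]),\n",
383 | "    layers.SimpleRNN(20, activation='tanh'),\n",
384 | "    layers.Dense(10, activation='relu'),\n",
385 | "    layers.Dense(1, activation='linear')\n",
386 | "])"
387 | ]
388 | },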
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "❓ Compile your model"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": 22,
378 | "metadata": {
379 | "tags": [
380 | "challengify"
381 | ]
382 | },
383 | "outputs": [],
384 | "source": [
385 | "# YOUR CODE HERE"
386 | ]
387 | },
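388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "👇 For instance (the optimizer is an assumption; the MAPE metric matches the training curves plotted below):"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "metadata": {},
399 | "outputs": [],
400 | "source": [
401 | "model.compile(loss='mse',\n",
402 | "              optimizer='rmsprop',\n",
403 | "              metrics=['mean_absolute_percentage_error'])"
404 | ]
405 | },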
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {},
391 | "source": [
392 | "❓ Train your model on the data"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": 23,
398 | "metadata": {
399 | "tags": [
400 | "challengify"
401 | ]
402 | },
403 | "outputs": [],
404 | "source": [
405 | "# YOUR CODE HERE"
406 | ]
407 | },
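408 | {
409 | "cell_type": "markdown",
410 | "metadata": {},
411 | "source": [
412 | "👇 For instance (epochs, batch size and the 30% validation split are assumptions; the resulting `history` is reused in the plot below):"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {},
419 | "outputs": [],
420 | "source": [
421 | "history = model.fit(X_train_pad, np.array(y_train),\n",
422 | "                    validation_split=0.3,\n",
423 | "                    epochs=10,\n",
424 | "                    batch_size=32)"
425 | ]
426 | },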
408 | {
409 | "cell_type": "code",
410 | "execution_count": 25,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": [
414 | "import matplotlib.pyplot as plt\n",
415 | "plt.plot(history.history['mean_absolute_percentage_error'])\n",
416 | "plt.plot(history.history['val_mean_absolute_percentage_error'])\n",
417 | "plt.legend(['train', 'test'])"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": []
426 | }
427 | ],
428 | "metadata": {
429 | "kernelspec": {
430 | "display_name": "Python 3 (ipykernel)",
431 | "language": "python",
432 | "name": "python3"
433 | },
434 | "language_info": {
435 | "codemirror_mode": {
436 | "name": "ipython",
437 | "version": 3
438 | },
439 | "file_extension": ".py",
440 | "mimetype": "text/x-python",
441 | "name": "python",
442 | "nbconvert_exporter": "python",
443 | "pygments_lexer": "ipython3",
444 | "version": "3.8.12"
445 | },
446 | "toc": {
447 | "base_numbering": 1,
448 | "nav_menu": {},
449 | "number_sections": false,
450 | "sideBar": true,
451 | "skip_h1_title": false,
452 | "title_cell": "Table of Contents",
453 | "title_sidebar": "Contents",
454 | "toc_cell": false,
455 | "toc_position": {},
456 | "toc_section_display": true,
457 | "toc_window_display": false
458 | }
459 | },
460 | "nbformat": 4,
461 | "nbformat_minor": 2
462 | }
463 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 | slow: marks tests as slow (deselect with '-m "not slow"')
4 | optional: marks tests as optional (deselect with '-m "not optional"')
5 | addopts = -v -s --color=yes -W ignore::DeprecationWarning
6 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/time-series-cross-validator-challenge/requirements.txt
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 |
4 | with open('requirements.txt') as f:
5 | content = f.readlines()
6 | requirements = [x.strip() for x in content if 'git+' not in x]
7 |
8 | setup(name='ts-boilerplate',
9 | version="1.0",
10 | description="Trainer Boilerplate for Time Series Forecast Models with Cross Validation",
11 | packages=find_packages(),
12 | install_requires=requirements,
13 | test_suite='tests',
14 | # include_package_data: to install data from MANIFEST.in
15 | include_package_data=True,
16 | zip_safe=False)
17 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/time-series-cross-validator-challenge/tests/__init__.py
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase, generate_data_zeros_and_ones, generate_X_y_zeros_and_ones
4 | from typing import Tuple
5 |
6 | @pytest.fixture(scope="session")
7 | def data_monotonic_increase() -> np.ndarray:
8 | return generate_data_monotonic_increase()
9 |
10 | @pytest.fixture(scope="session")
11 | def data_zeros_and_ones() -> np.ndarray:
12 | return generate_data_zeros_and_ones()
13 |
14 |
15 | @pytest.fixture(scope="session")
16 | def X_y_zeros_and_ones() -> Tuple[np.ndarray]:
17 | return generate_X_y_zeros_and_ones()
18 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/tests/integrated/test_main.py:
--------------------------------------------------------------------------------
1 | """Tests that the main routes run without raising exceptions"""
2 |
3 | import pytest
4 | from ts_boilerplate.main import backtest, train, cross_validate
5 |
6 | @pytest.mark.slow
7 | def test_main_route_train(data_monotonic_increase):
8 | train(data_monotonic_increase)
9 |
10 | @pytest.mark.slow
11 | def test_main_route_cross_validate(data_monotonic_increase):
12 | cross_validate(data_monotonic_increase)
13 |
14 | @pytest.mark.slow
15 | def test_backtest(data_monotonic_increase):
16 | backtest(data_monotonic_increase, print_metrics=False, plot_metrics=False)
17 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/tests/integrated/test_model_performance.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from ts_boilerplate.main import train
3 |
4 |
5 | @pytest.mark.optional
6 | @pytest.mark.slow
7 | def test_model_can_fit_well_enough_on_dummy_dataset(data_zeros_and_ones):
8 |     """Check that the model can fit, with MAPE lower than some threshold, on a dummy dataset of zeros and ones"""
9 |
10 | metrics = train(data_zeros_and_ones)
11 | #print("#### metrics on dummy dataset ", metrics)
12 |
13 |     assert metrics < 5, "your model does not seem to be able to fit well enough even on a very easy dataset"
14 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/tests/unittests/test_data.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from ts_boilerplate.params import TRAIN, DATA
3 | from ts_boilerplate.main import get_X_y
4 | from ts_boilerplate.main import train_test_split
5 | import numpy as np
6 | import math
7 |
8 |
9 | # These tests make use of the fixture `data_monotonic_increase` stored in tests/conftest.py (pytest magic under the hood)
10 | def test_get_X_y_returns_correct_shapes(data_monotonic_increase):
11 |     """Test that X and y have the correct shapes (excluding sample size), as per the project setup defined in `params.py`
12 | """
13 | X, y = get_X_y(data_monotonic_increase, **TRAIN)
14 |
15 | # Check that X and y have the correct lengths (in time) and depth (in number of covariates)
16 | assert X.ndim == 3
17 | assert X.shape[1] == TRAIN['input_length']
18 | assert X.shape[2] == DATA['n_covariates'] + DATA[
19 |         'n_targets'], "Did you forget to include your past target values as features?"
20 |
21 | y_should_be_3D = TRAIN['output_length'] > 1 and DATA["n_targets"] > 1
22 | y_should_be_1D = TRAIN['output_length'] == 1 and DATA["n_targets"] == 1
23 | if y_should_be_3D:
24 | assert y.ndim == 3
25 | assert y.shape[1] == TRAIN['output_length']
26 | assert y.shape[2] == DATA['n_targets']
27 | elif y_should_be_1D:
28 | assert y.ndim == 1
29 | else:
30 | assert y.ndim == 2
31 | assert y.shape[1] == TRAIN['output_length'] if DATA['n_targets'] == 1 else DATA['n_targets']
32 |
33 |
34 | @pytest.mark.optional
35 | @pytest.mark.skipif(TRAIN['stride'] is None, reason="Optional test only applicable if a sliding method is used in get_X_y")
36 | def test_optional_get_X_y_returns_optimal_sample_size(data_monotonic_increase):
37 |     """If get_X_y uses a stride method, check that X and y each contain the optimal number of samples
38 | """
39 | X, y = get_X_y(data_monotonic_increase, **TRAIN)
40 |
41 | # Complex formula below retro-engineered from `create_dummy_tests.ipynb`
42 | expected_len = math.ceil(
43 | (len(data_monotonic_increase) \
44 | - (TRAIN['input_length'] -1) \
45 | - (TRAIN['output_length'] -1) \
46 | - TRAIN['horizon']
47 | ) / TRAIN["stride"]
48 | )
49 |     assert len(X) == expected_len, "you may not have generated the optimal number of samples, given the stride chosen"
50 |     assert len(y) == expected_len, "you may not have generated the optimal number of samples, given the stride chosen"
51 |
52 | def test_no_data_leak(data_monotonic_increase):
53 | """Test that the time gap between the last timestep of `y_train` and the first timestep of `y_test`
54 | is at least as big as the forecast horizon
55 | according to 'https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-3.png'
56 | """
57 |
58 | data_train, data_test = train_test_split(data_monotonic_increase, **TRAIN)
59 | X_train, y_train = get_X_y(data_train, shuffle=False, **TRAIN)
60 | X_test, y_test = get_X_y(data_test, shuffle=False, **TRAIN)
61 |
62 | y_train_last_seen_timestep = np.max(y_train) # OR y_train[-1].flat[-1]
63 | y_test_first_seen_timestep = np.min(y_test) # OR y_test[0].flat[0]
64 | gap = y_test_first_seen_timestep - y_train_last_seen_timestep
65 | # Note: for strides = 1, the inequality below must be an exact equality, but we don't need to test that to ensure no data leak.
66 | assert gap >= TRAIN["horizon"], "❗️❗️ Data leak detected between (X_train, y_train) and (X_test, y_test)❗️❗️ "
67 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/tests/unittests/test_model.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from ts_boilerplate.model import fit_model, get_model, predict_output
3 |
4 | def test_model_has_correct_output_shape(X_y_zeros_and_ones):
5 | X, y = X_y_zeros_and_ones
6 | model = get_model(X,y)
7 | y_pred = predict_output(model, X)
8 | assert y_pred.shape == y.shape
9 |
10 | @pytest.mark.slow
11 | def test_model_can_fit(X_y_zeros_and_ones):
12 | """Check that the model can fit without crashing"""
13 | X, y = X_y_zeros_and_ones
14 | model = get_model(X,y)
15 | fit_model(model, X, y, verbose=0)
16 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/__init__.py:
--------------------------------------------------------------------------------
1 | from os.path import isfile
2 | from os.path import dirname
3 |
4 | version_file = '{}/version.txt'.format(dirname(__file__))
5 |
6 | if isfile(version_file):
7 | with open(version_file) as version_file:
8 | __version__ = version_file.read().strip()
9 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/dataprep.py:
--------------------------------------------------------------------------------
1 | """Prepare Data so as to be used in a Pipelined ML model"""
2 |
3 | import numpy as np
4 | from ts_boilerplate.params import DATA
5 | from typing import Tuple, List
6 | import numpy as np
7 |
8 |
9 | def load_data(data_path: str) -> np.ndarray:
10 |     """Load data from `data_path` into memory
11 |     Returns a 2D array with axis 0 representing timesteps, and axis 1 columns containing targets and covariates
12 | ref: https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true
13 | """
14 | # YOUR_CODE_HERE
15 | pass
16 |
17 |
18 | def clean_data(data: np.ndarray) -> np.ndarray:
19 | """Clean data without creating data leakage:
20 | - make sure there is no NaN between any timestep
21 | - etc...
22 | """
23 | # YOUR_CODE_HERE
24 | pass
25 |
26 |
27 | def get_X_y(
28 | data: np.ndarray,
29 | input_length: int,
30 | output_length: int,
31 | horizon: int,
32 | stride: int,
33 | shuffle=True,
34 | **kwargs,
35 | ) -> Tuple[np.ndarray, np.ndarray]:
36 | """
37 |     Use `data`, a 2D-array with axis=0 as timesteps, and axis=1 as (targets + covariates columns)
38 |
39 | Returns a Tuple (X,y) of two ndarrays :
40 | X.shape = (n_samples, input_length, n_covariates)
41 | y.shape =
42 | (n_samples, output_length, n_targets) if all 3-dimensions are of size > 1
43 | (n_samples, output_length) if n_targets == 1
44 | (n_samples, n_targets) if output_length == 1
45 |         (n_samples, ) if both n_targets and lengths == 1
46 |
47 | ❗️ Raise error if data contains NaN
48 |     ❗️ Make sure to shuffle the pairs in unison if `shuffle=True`, for i.i.d. purposes
49 | ❗️ Don't ditch past values of your target time-series in your features - they are very useful features!
50 | 👉 illustration: https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-1.png
51 |
52 |     [💡 Hint] You can use a sliding method:
53 |     - reading `data` in ascending order
54 |     - taking one sample every `stride` timesteps
55 | Feel free to use another approach, for example random sampling without replacement
56 |
57 | """
58 | pass # YOUR CODE HERE
59 |
60 |
61 |
62 |
63 |
64 | def get_folds(data: np.ndarray,
65 | fold_length: int,
66 | fold_stride: int,
67 | **kwargs) -> List[np.ndarray]:
68 |     """Slide through the `data` time series (2D array) to create folds of equal `fold_length`, spaced `fold_stride` timesteps apart
69 | Returns a list of folds, each as a 2D-array time series
70 | """
71 | pass # YOUR CODE HERE
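72 |     # One possible sliding sketch (an assumption, not the required solution):
73 |     # return [data[i:i + fold_length] for i in range(0, len(data) - fold_length + 1, fold_stride)]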
72 |
73 |
74 | def train_test_split(data: np.ndarray,
75 | train_test_ratio: float,
76 | input_length: int,
77 | **kwargs) -> Tuple[np.ndarray, np.ndarray]:
78 |     """Returns train and test 2D arrays that will not create any data leakage when sampling (X, y) from them
79 | Inspired from "https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-3.png"
80 | """
81 | pass # YOUR CODE HERE
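82 |     # Sketch mirroring notebooks/test_package.ipynb, section 2.2 (one possible approach):
83 |     # last_train_idx = round(train_test_ratio * len(data))
84 |     # data_train = data[:last_train_idx, :]
85 |     # data_test = data[last_train_idx - input_length:, :]  # overlap keeps a full first input window
86 |     # return (data_train, data_test)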
82 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/generate_dummy_data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from ts_boilerplate.params import CROSS_VAL, DATA, TRAIN
3 | from typing import Tuple
4 |
5 | def generate_data_monotonic_increase() -> np.ndarray:
6 |     """Creates a monotonically increasing time series dataset for test purposes
7 |     - shape is (DATA['length'], DATA['n_covariates'] + DATA['n_targets']),
8 |     - values are all equal to their respective integer index!
9 |
10 | e.g:
11 | data = array(
12 | [[ 0., 0., 0., 0., 0.],
13 | [ 1., 1., 1., 1., 1.],
14 | ...,
15 | [998., 998., 998., 998., 998.],
16 | [999., 999., 999., 999., 999.]]
17 | )
18 |
19 | """
20 |
21 | indexes = np.arange(0, DATA['length'])
22 | data = np.zeros((DATA['length'], DATA['n_covariates'] + DATA['n_targets'])) \
23 | + np.expand_dims(indexes, axis=1)
24 | return data
25 |
26 | def generate_data_zeros_and_ones() -> np.ndarray:
27 |     """Create dummy data made of zeros for the covariates and ones for the targets
28 | e.g:
29 | data = array(
30 | [[1.,1.,0.,0.,0.],
31 | [1.,1.,0.,0.,0.],
32 | ...,
33 | [1.,1.,0.,0.,0.],
34 | [1.,1.,0.,0.,0.]]
35 | )
36 | """
37 | shape = (DATA['length'], DATA['n_covariates'] + DATA['n_targets'])
38 | data = np.zeros(shape)
39 | data[:, DATA["target_column_idx"]] = 1.
40 | return data
41 |
42 | def generate_X_y_zeros_and_ones() -> Tuple[np.ndarray]:
43 |     """Create a dummy (X,y) tuple made of zeros for covariates and ones for the targets, just to check whether the model fits well"""
44 | length = round(DATA["length"] / TRAIN['stride'])
45 |
46 | shape_X = (length, TRAIN['input_length'], DATA['n_covariates']+DATA['n_targets'])
47 | X = np.zeros(shape_X)
48 | X[:, :, DATA["target_column_idx"]] = 1.
49 |
50 | shape_y = (length, TRAIN['output_length'], DATA['n_targets'])
51 | y = np.ones(shape_y)
52 | y = np.squeeze(y)
53 |
54 | return (X,y)
55 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/main.py:
--------------------------------------------------------------------------------
1 | '''
2 | Top level orchestrator of the project. To be called from the CLI.
3 | It comprises all the "routes" you may want to call
4 | '''
6 | import numpy as np
7 | import pandas as pd
8 | import os
9 | from ts_boilerplate.dataprep import get_Xi_yi, get_X_y, get_folds, train_test_split
10 | from ts_boilerplate.model import get_model, fit_model, predict_output
11 | from ts_boilerplate.metrics import mape, mae
12 | from ts_boilerplate.params import CROSS_VAL, ROOT_DIR, TRAIN, DATA
13 | from typing import Tuple, List
14 | import matplotlib.pyplot as plt
15 |
16 |
17 | def train(data: np.ndarray, print_metrics: bool = False):
18 | """
19 |     Train the model in this package on one fold `data`, containing the 2D array of time series for your problem
20 | Returns `metrics_test` associated with the training
21 | """
22 | pass # YOUR CODE HERE
23 |
24 |
25 | def cross_validate(data: np.ndarray, print_metrics: bool = False):
26 | """
27 |     Cross-validate the model in this package on `data`
28 | Returns `metrics_cv`: the list of test metrics at each fold
29 | """
30 | pass # YOUR CODE HERE
31 |
32 |
33 | def backtest(data: np.ndarray,
34 | stride: int = 1,
35 | start_ratio: float = 0.9,
36 | retrain: bool = True,
37 | retrain_every: int = 1,
38 | print_metrics=False,
39 | plot_metrics=False):
40 | """Returns historical forecasts for the entire dataset
41 | - by training model up to `start_ratio` of the dataset
42 | - then predicting next values using the model in this package (only predict the last time-steps if `predict_only_last_value` is True)
43 | - then moving `stride` timesteps ahead
44 | - then retraining the model if `retrain` is True and if we moved `retrain_every` timesteps since last training
45 | - then predicting next values again
46 |
47 | Return:
48 |     - all historical predictions as a 2D-array time series of roughly shape ((1 - start_ratio) * len(data) / stride, n_targets)
49 | - Compute the 'mean-MAPE' per forecast horizon
50 | - Print historical predictions if you want a visual check
51 |
52 | see https://unit8co.github.io/darts/generated_api/darts.models.forecasting.rnn_model.html#darts.models.forecasting.rnn_model.RNNModel.historical_forecasts
53 | """
54 | pass # YOUR CODE HERE
55 |
56 | if __name__ == '__main__':
57 | data = pd.read_csv(os.path.join(ROOT_DIR, 'data','raw','data.csv')).to_numpy()
58 | try:
59 | train(data=data, print_metrics=True)
60 | # cross_validate(data=data, print_metrics=True)
61 | # backtest(data=data,
62 | # stride = 1,
63 | # start_ratio = 0.9,
64 | # retrain = True,
65 | # retrain_every=1,
66 | # print_metrics=True,
67 | # plot_metrics=True)
68 | except:
69 | import ipdb, traceback, sys
70 | extype, value, tb = sys.exc_info()
71 | traceback.print_exc()
72 | ipdb.post_mortem(tb)
73 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/metrics.py:
--------------------------------------------------------------------------------
1 | '''
2 | Computes useful time-series metrics from (y_true, y_pred)
3 | '''
4 |
5 | import numpy as np
6 | from tensorflow import reduce_mean
7 | from tensorflow.keras.metrics import mean_absolute_error, mean_absolute_percentage_error
8 |
9 |
10 | def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
11 | """Returns Mean Absolute Error"""
12 | pass # YOUR CODE HERE
13 |
14 | def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
15 | """Returns Mean Absolute Percentage Error"""
16 | pass # YOUR CODE HERE
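17 |     # One possible one-liner, sketched as a hint (assumes y_true and y_pred share a shape,
18 |     # and reuses the TF helpers imported above):
19 |     # return reduce_mean(mean_absolute_percentage_error(y_true, y_pred)).numpy()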
17 |
18 | def mase(y_true: np.ndarray, y_pred: np.ndarray) -> float:
19 | """Returns Mean Absolute Scaled Error (https://en.wikipedia.org/wiki/Mean_absolute_scaled_error)
20 | """
21 | pass
22 |
23 |
24 | def play_trading_strategy(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
25 | """Returns the array of relative portfolio values over the test period"""
26 | pass
27 |
28 |
29 | def return_on_investment(played_trading_strategy: np.ndarray) -> float:
30 | """Returns the ROI of an investment strategy"""
31 | pass
32 |
33 |
34 | def sharpe_ratio(played_trading_strategy: np.ndarray) -> float:
35 | """Returns the Sharpe Ratio (Return on Investment / Volatility) of an investment strategy"""
36 | pass
37 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/model.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.keras.layers import Dense, SimpleRNN, Reshape, Lambda, Input
3 | from tensorflow.keras import Model
4 | from ts_boilerplate.params import DATA, TRAIN
5 |
6 | # TODO: Should we add here the preprocessing? into a class called "pipeline"?
7 | # TODO: Should we refacto in a class ? Probably!
8 |
9 |
10 | def get_model(X_train, y_train):
11 |     """Instantiate, compile, and return the model of your choice"""
12 | pass # YOUR CODE HERE
13 |
14 |
15 | def fit_model(model, X_train, y_train, **kwargs):
16 |     """Fit the `model` object, including preprocessing if need be"""
17 | pass # YOUR CODE HERE
18 |
19 |
20 | def predict_output(model, X_test):
21 |     """Return y_pred, the model's predictions for X_test. Include preprocessing if need be"""
22 | pass # YOUR CODE HERE
23 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/params.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dotenv import load_dotenv
3 |
4 | ## CREDENTIALS AND PATHS
5 | load_dotenv()
6 | API_KEY = os.getenv('API_KEY')
7 |
8 | ## DIR PARAMS
9 | ROOT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
10 | DATA_RAW_CSV_PATH = os.path.join(ROOT_DIR, 'data', 'raw', 'data.csv')
11 |
12 | # 👇 Please fill these global variable below very carefully, in order to create tests related to your problem👇
13 | # cf: https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true
14 | DATA = dict(
15 |     length = 500, # How many timesteps does your dataset contain?
16 | n_covariates = 3, # number of past covariates, excluding target time series. Our tests do not support future_covariate yet.
17 | target_column_idx = [0,1] # List of index(es) of target column(s) in your dataset. e.g [0] for Mono-target problem, e.g. [0,1,4] for multi-variate targets problem. Note that past targets values will also be used as features X.
18 | )
19 | DATA['n_targets'] = len(DATA['target_column_idx']) # number of target time series to predict.
20 |
21 | TRAIN = dict(
22 |     horizon = 4, # start predicting this many timesteps ahead
23 |     input_length = 10, # Length (in time) of each sequence that will be seen by the model (X.shape[1])
24 |     output_length = 7, # Length (in time) of the prediction (y.shape[1])
25 |     stride = 1, # Integer used to create all pairs of samples (Xi, yi) by sliding in each data fold. Use `None` if you don't plan to use any sliding method in dataprep.get_X_y
26 | train_test_ratio = 0.7, # ratio of train / (train+test) length in each fold
27 | )
28 |
29 | CROSS_VAL = dict(
30 | fold_length = 200,
31 | fold_stride = 100,
32 | )
33 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/.challengifyignore:
--------------------------------------------------------------------------------
1 | TODO.md
2 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/.github/workflows/pythonpackage.yml:
--------------------------------------------------------------------------------
1 |
2 | # 🤖 usage
3 | #
4 | # this file contains the conf for GitHub Continuous Integration
5 | # and Continuous Deployment to Heroku
6 | #
7 | # in order to activate the tests in GitHub CI:
8 | # - uncomment the content of the CI paragraph (lines 41-55)
9 | # - create some tests in the tests/ directory
10 | #
11 | # in order to activate CD to Heroku:
12 | # - activate the tests in GitHub CI
13 | # - uncomment the content of the CD paragraph (lines 57-75)
14 |
15 | name: Python package
16 |
17 | on:
18 | push:
19 | branches: [ master ]
20 | pull_request:
21 | branches: [ master ]
22 |
23 | jobs:
24 |
25 | # 🤖 CI paragraph
26 | #
27 | # uncomment the content of this paragraph to activate the tests in GitHub CI
28 | # - remove the 2 trailing characters "# ", do not change the spaces
29 | # (the `name` keys should be at the same level as the `uses` key)
30 | # (the `strategy` key should be at the same level as the `steps` key)
31 |
32 | build:
33 |
34 | runs-on: ubuntu-latest
35 |
36 | steps:
37 | - uses: actions/checkout@v2
38 | - name: Say hello
39 | run: |
40 | echo "Hello, World!"
41 | # - name: Set up Python ${{ matrix.python-version }}
42 | # uses: actions/setup-python@v1
43 | # with:
44 | # python-version: ${{ matrix.python-version }}
45 | # - name: Install dependencies
46 | # run: |
47 | # python -m pip install --upgrade pip
48 | # pip install -r requirements.txt
49 | # - name: Install package and test
50 | # run: |
51 | # make install test clean
52 |
53 | # strategy:
54 | # matrix:
55 | # python-version: [3.8]
56 |
57 | # # 🤖 CD paragraph
58 | # #
59 | # # uncomment the following lines to activate CD to Heroku
60 | # # - remove the 2 trailing characters "# ", do not change the spaces
61 | # # (there should be 2 spaces before the `deploy_heroku` key)
62 | # # - keep in mind you also need to configure Heroku HEROKU_API_KEY and HEROKU_EMAIL in GitHub secrets
63 | # # - and replace REPLACE_WITH_YOUR_HEROKU_APP_NAME in this file with the name of your Heroku app
64 |
65 | # deploy_heroku:
66 | # needs: build
67 | # runs-on: ubuntu-latest
68 |
69 | # steps:
70 | # - uses: actions/checkout@v2
71 | # - uses: akhileshns/heroku-deploy@v3.0.4 # This is the action
72 | # with:
73 | # heroku_api_key: ${{secrets.HEROKU_API_KEY}}
74 | # heroku_app_name: "REPLACE_WITH_YOUR_HEROKU_APP_NAME" # Must be unique in Heroku
75 | # heroku_email: ${{secrets.HEROKU_EMAIL}}
76 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | .coverage
3 | .ipynb_checkpoints
4 | **/*.DS_Store
5 | data_raw/
6 | data_processed/
7 | *.csv
8 | __pycache__/
9 | .env
10 | .vscode/
11 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/Makefile:
--------------------------------------------------------------------------------
1 | # TODO
2 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/README.md:
--------------------------------------------------------------------------------
1 | This is a **boilerplate** repo for a machine-learning project involving **Time Series forecasting**.
2 |
3 | In particular
4 |
5 | - It provides a **cross-validation framework** to ensure models are tested thoroughly and without data leakage
6 | - It is agnostic of the type of model involved
7 | - It is well suited for short research projects, typical of few-week coding bootcamps such as Le Wagon DataScience
8 |
9 | # Detailed package workflow
10 |
11 | ## Architecture
12 | - `ts_boilerplate` package
13 | - `main.py` comprises the main routes to be called from the CLI (`train`, `cross-validate`, `backtest`)
14 |   - `params.py` contains project-level global variables to be set manually
15 |
16 |
17 | - `data` folder contains
18 |   - `raw` and `clean` folders should contain **2D-array `data` time series**, with axis 0 representing integer timesteps and axis 1 containing target and covariate columns, as per [picture](https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true)
19 | ```python
20 | data.shape = (length, n_targets+n_covariates)
21 | ```
22 |   - `Xy` may persist your (X, y) tuple of **3D-array** training sets to be fed to your models, if you want to store them and avoid re-running the preprocessing multiple times.
23 | ```python
24 | X.shape = (n_samples, input_length, n_covariates)
25 | y.shape = (n_samples, output_length, n_targets)
26 | ```
27 | - `notebooks`
28 | - `test_package.ipynb` will help you understand how the package and the tests have been built.
29 | - `tutorial_ts_forecasting.ipynb` is a recommended read before diving into this project. It contains visuals that will help you fill global project params and understand naming conventions
30 |
31 |
32 |
33 | - `tests` folder detailed below
34 |
35 | ## How to test your code?
36 | First of all, fill `ts_boilerplate/params.py` according to your true project specifics
37 |
38 | Then, run this in your terminal from the root project folder to check your code
39 | - `pytest`
40 | - `pytest -m "not optional"` to only check mandatory tests
41 | - `pytest -m "not optional and not slow"` to also avoid tests that may be slow (involving fitting your model)
42 |
43 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/TODO.md:
--------------------------------------------------------------------------------
1 | - [ ] Refacto `model.py`
2 | - [ ] Rename it to `pipeline.py`, as it may comprise the pre-processing (scaling, etc.)
3 | - [ ] Turn into a class `TsPipeline()` instead of pure functions
4 |
5 | - [ ] Add requirements.txt
6 | - [ ] Integrate package as part of the ML Ops lifecycle
7 | - [ ] track & save experiment results
8 | - [ ] ...
9 |
10 | - [ ] Add tests for future-covariates
11 | - [ ] Create Makefile
12 | - [ ] Include DAG of the project
13 | - [ ] publish to lewagon community
14 | - [ ] cache the fixtures of conftest.py
15 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/notebooks/WIP_tutorial_darts_library.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## This is a tutorial for Darts python package"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": []
14 | }
15 | ],
16 | "metadata": {
17 | "language_info": {
18 | "name": "python"
19 | },
20 | "orig_nbformat": 4
21 | },
22 | "nbformat": 4,
23 | "nbformat_minor": 2
24 | }
25 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/notebooks/test_package.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Step-by-step guide to the unit tests used in this project"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "ExecuteTime": {
15 | "end_time": "2022-03-14T14:24:55.279012Z",
16 | "start_time": "2022-03-14T14:24:53.949004Z"
17 | }
18 | },
19 | "outputs": [
20 | {
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "The autoreload extension is already loaded. To reload it, use:\n",
25 | " %reload_ext autoreload\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "import numpy as np\n",
31 | "import pandas as pd\n",
32 | "import matplotlib.pyplot as plt\n",
33 | "import os\n",
34 | "from ts_boilerplate.params import ROOT_DIR, DATA, TRAIN, CROSS_VAL\n",
35 | "from ts_boilerplate.dataprep import get_X_y, get_folds, train_test_split, get_Xi_yi\n",
36 | "from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase, generate_data_zeros_and_ones, generate_X_y_zeros_and_ones\n",
37 | "from ts_boilerplate.model import get_model, fit_model, predict_output\n",
38 | "from ts_boilerplate.metrics import mape\n",
39 | "\n",
40 | "%load_ext autoreload\n",
41 | "%autoreload 2"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "## 1) `generate_dummy_data.py`"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "Let's create a dummy time series dataset whose values increment by 1 every day"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 2,
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "data": {
65 | "text/plain": [
66 | "array([[ 0., 0., 0., 0., 0.],\n",
67 | " [ 1., 1., 1., 1., 1.],\n",
68 | " [ 2., 2., 2., 2., 2.],\n",
69 | " ...,\n",
70 | " [497., 497., 497., 497., 497.],\n",
71 | " [498., 498., 498., 498., 498.],\n",
72 | " [499., 499., 499., 499., 499.]])"
73 | ]
74 | },
75 | "execution_count": 2,
76 | "metadata": {},
77 | "output_type": "execute_result"
78 | }
79 | ],
80 | "source": [
81 | "data = generate_data_monotonic_increase()\n",
82 | "data"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 3,
88 | "metadata": {
89 | "ExecuteTime": {
90 | "end_time": "2022-03-14T14:25:19.973275Z",
91 | "start_time": "2022-03-14T14:25:19.950901Z"
92 | }
93 | },
94 | "outputs": [
95 | {
96 | "data": {
217 | "text/plain": [
218 | " 0 1 2 3 4\n",
219 | "0 0.0 0.0 0.0 0.0 0.0\n",
220 | "1 1.0 1.0 1.0 1.0 1.0\n",
221 | "2 2.0 2.0 2.0 2.0 2.0\n",
222 | "3 3.0 3.0 3.0 3.0 3.0\n",
223 | "4 4.0 4.0 4.0 4.0 4.0\n",
224 | ".. ... ... ... ... ...\n",
225 | "495 495.0 495.0 495.0 495.0 495.0\n",
226 | "496 496.0 496.0 496.0 496.0 496.0\n",
227 | "497 497.0 497.0 497.0 497.0 497.0\n",
228 | "498 498.0 498.0 498.0 498.0 498.0\n",
229 | "499 499.0 499.0 499.0 499.0 499.0\n",
230 | "\n",
231 | "[500 rows x 5 columns]"
232 | ]
233 | },
234 | "execution_count": 3,
235 | "metadata": {},
236 | "output_type": "execute_result"
237 | }
238 | ],
239 | "source": [
240 | "# Store as CSV\n",
241 | "data_df = pd.DataFrame(data)\n",
242 | "data_df.to_csv(os.path.join(ROOT_DIR, \"data\", \"dummy\", \"data_dummy.csv\"), index=False)\n",
243 | "pd.read_csv(os.path.join(ROOT_DIR, \"data\", \"dummy\", \"data_dummy.csv\"))"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "## 2) `dataprep.py`"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "### 2.1) `get_X_y`"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 10,
263 | "metadata": {},
264 | "outputs": [
265 | {
266 | "name": "stdout",
267 | "output_type": "stream",
268 | "text": [
269 | "(475, 10, 5)\n",
270 | "(475, 7, 2)\n"
271 | ]
272 | }
273 | ],
274 | "source": [
275 | "X, y = get_X_y(data, **TRAIN)\n",
276 | "print(X.shape)\n",
277 | "print(y.shape)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 11,
283 | "metadata": {},
284 | "outputs": [
285 | {
286 | "data": {
287 | "text/plain": [
288 | "475.0"
289 | ]
290 | },
291 | "execution_count": 11,
292 | "metadata": {},
293 | "output_type": "execute_result"
294 | }
295 | ],
296 | "source": [
297 | "# Let's compute the shape arithmetically (for unittests)\n",
298 | "(len(data) \\\n",
299 | " - (TRAIN['input_length'] -1) \\\n",
300 | " - (TRAIN['output_length'] -1) \\\n",
301 | " - TRAIN['horizon']) \\\n",
302 | " / TRAIN[\"stride\"]"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {},
308 | "source": [
309 | "☝️ a ceiling function should be used when stride > 1"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "### 2.2) `train_test_split`"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 12,
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "data": {
326 | "text/plain": [
327 | "(500, 5)"
328 | ]
329 | },
330 | "execution_count": 12,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": [
336 | "train_test_ratio = TRAIN[\"train_test_ratio\"]\n",
337 | "input_length = TRAIN[\"input_length\"]\n",
338 | "output_length = TRAIN[\"output_length\"]\n",
339 | "data.shape"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 13,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "last_train_idx = round(train_test_ratio * len(data))\n",
349 | "data_train = data[0:last_train_idx, :]\n",
350 | "\n",
351 | "first_test_idx = last_train_idx - input_length\n",
352 | "data_test = data[first_test_idx:, :]"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 14,
358 | "metadata": {},
359 | "outputs": [
360 | {
361 | "data": {
362 | "text/plain": [
363 | "array([[ 0., 0., 0., 0., 0.],\n",
364 | " [ 1., 1., 1., 1., 1.],\n",
365 | " [ 2., 2., 2., 2., 2.],\n",
366 | " ...,\n",
367 | " [347., 347., 347., 347., 347.],\n",
368 | " [348., 348., 348., 348., 348.],\n",
369 | " [349., 349., 349., 349., 349.]])"
370 | ]
371 | },
372 | "execution_count": 14,
373 | "metadata": {},
374 | "output_type": "execute_result"
375 | }
376 | ],
377 | "source": [
378 | "data_train"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "data_test"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n",
397 | "X_test, y_test = get_X_y(data_test, **TRAIN)\n",
398 | "\n",
399 | "print(\"####### Last train pair\")\n",
400 | "print(X_train[-1])\n",
401 | "print(y_train[-1])\n",
402 | "print(\"####### First test pair\")\n",
403 | "print(X_test[0])\n",
404 | "print(y_test[0])"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 17,
410 | "metadata": {},
411 | "outputs": [
412 | {
413 | "data": {
414 | "text/plain": [
415 | "10.0"
416 | ]
417 | },
418 | "execution_count": 17,
419 | "metadata": {},
420 | "output_type": "execute_result"
421 | }
422 | ],
423 | "source": [
424 | "gap = np.min(y_test) - np.max(y_train)\n",
425 | "gap"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 18,
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "assert gap >= TRAIN[\"horizon\"], \"❗️❗️ Data leak detected between (X_train, y_train) and (X_test, y_test)❗️❗️ \""
435 | ]
436 | },
437 | {
438 | "cell_type": "markdown",
439 | "metadata": {},
440 | "source": [
441 | "### 2.3) `get_folds`"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": null,
447 | "metadata": {},
448 | "outputs": [],
449 | "source": [
450 | "folds = get_folds(data, **CROSS_VAL)\n",
451 | "print('n_folds= ', len(folds))\n",
452 | "print(folds[-1])"
453 | ]
454 | },
455 | {
456 | "cell_type": "markdown",
457 | "metadata": {},
458 | "source": [
459 | "## 3) `model.py`"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 27,
465 | "metadata": {},
466 | "outputs": [],
467 | "source": [
468 | "data_train, data_test = train_test_split(data, **TRAIN)\n",
469 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n",
470 | "X_test, y_test = get_X_y(data_test, **TRAIN)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 28,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "import tensorflow as tf\n",
480 | "from keras.models import Model\n",
481 | "from keras.layers import Dense, SimpleRNN, Reshape, Lambda, Input"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 18,
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "name": "stdout",
491 | "output_type": "stream",
492 | "text": [
493 | "Model: \"model\"\n",
494 | "_________________________________________________________________\n",
495 | " Layer (type) Output Shape Param # \n",
496 | "=================================================================\n",
497 | " input_1 (InputLayer) [(None, 10, 5)] 0 \n",
498 | " \n",
499 | " lambda (Lambda) (None, 7, 2) 0 \n",
500 | " \n",
501 | " reshape (Reshape) (None, 7, 2) 0 \n",
502 | " \n",
503 | "=================================================================\n",
504 | "Total params: 0\n",
505 | "Trainable params: 0\n",
506 | "Non-trainable params: 0\n",
507 | "_________________________________________________________________\n"
508 | ]
509 | }
510 | ],
511 | "source": [
512 | "# BASELINE: PREDICT LAST VALUE - ZERO TRAINABLE WEIGHTS\n",
513 | "input = Input(shape=X_train.shape[1:])\n",
514 | "# Take last temporal values of the targets, and duplicate it as many times as `output_length`\n",
515 | "x = Lambda(\n",
516 | " lambda x: tf.repeat(tf.expand_dims(tf.gather(x[:, -1, :], indices=DATA['target_column_idx'], axis=1), axis=1),\n",
517 | " repeats=TRAIN['output_length'],\n",
518 | " axis=1))(input)\n",
519 | "output = Reshape(y_train.shape[1:])(x)\n",
520 | "model = Model(input, output)\n",
521 | "model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), metrics=tf.keras.metrics.MAPE)\n",
522 | "model.summary()"
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": 19,
528 | "metadata": {},
529 | "outputs": [],
530 | "source": [
531 | "es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',\n",
532 | " patience=2,\n",
533 | " verbose=0,\n",
534 | " mode='min',\n",
535 | " restore_best_weights=True)\n",
536 | "history = model.fit(X_train,\n",
537 | " y_train,\n",
538 | " epochs=50,\n",
539 | " batch_size=16,\n",
540 | " validation_split=0.3,\n",
541 | " callbacks=[es],\n",
542 | " verbose=0)"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": 29,
548 | "metadata": {},
549 | "outputs": [
550 | {
551 | "data": {
552 | "text/plain": [
553 | "3.0535266"
554 | ]
555 | },
556 | "execution_count": 29,
557 | "metadata": {},
558 | "output_type": "execute_result"
559 | }
560 | ],
561 | "source": [
562 | "from ts_boilerplate.metrics import mape\n",
563 | "\n",
564 | "y_pred = model.predict(X_test)\n",
565 | "mape(y_test, y_pred)\n"
566 | ]
567 | },
568 | {
569 | "cell_type": "markdown",
570 | "metadata": {},
571 | "source": [
572 | "## 4) `main.py`\n"
573 | ]
574 | },
575 | {
576 | "cell_type": "markdown",
577 | "metadata": {},
578 | "source": [
579 | "### 4.1) `train()`"
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": 10,
585 | "metadata": {},
586 | "outputs": [
587 | {
588 | "name": "stdout",
589 | "output_type": "stream",
590 | "text": [
591 | "### Test Metric: 3.0535274\n"
592 | ]
593 | }
594 | ],
595 | "source": [
596 | "data = generate_data_monotonic_increase()\n",
597 | "data_train, data_test = train_test_split(data, **TRAIN)\n",
598 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n",
599 | "X_test, y_test = get_X_y(data_test, **TRAIN)\n",
600 | "model = get_model(X_train, y_train)\n",
601 | "history = fit_model(model, X_train, y_train)\n",
602 | "y_pred = predict_output(model, X_test)\n",
603 | "metrics_test = mape(y_test, y_pred)\n",
604 | "\n",
605 | "print(\"### Test Metric: \", metrics_test)"
606 | ]
607 | },
608 | {
609 | "cell_type": "markdown",
610 | "metadata": {},
611 | "source": [
612 | "### 4.2) `cross_validate()`"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": null,
618 | "metadata": {},
619 | "outputs": [],
620 | "source": []
621 | },
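622 | {
623 | "cell_type": "markdown",
624 | "metadata": {},
625 | "source": [
626 | "👇 An illustrative sketch of a cross-validation loop over folds, simply chaining the helpers imported above (not necessarily the package's actual implementation):"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": null,
632 | "metadata": {},
633 | "outputs": [],
634 | "source": [
635 | "metrics_cv = []\n",
636 | "for fold in get_folds(data, **CROSS_VAL):\n",
637 | "    fold_train, fold_test = train_test_split(fold, **TRAIN)\n",
638 | "    X_train_f, y_train_f = get_X_y(fold_train, **TRAIN)\n",
639 | "    X_test_f, y_test_f = get_X_y(fold_test, **TRAIN)\n",
640 | "    model_f = get_model(X_train_f, y_train_f)\n",
641 | "    fit_model(model_f, X_train_f, y_train_f)\n",
642 | "    metrics_cv.append(mape(y_test_f, predict_output(model_f, X_test_f)))\n",
643 | "metrics_cv"
644 | ]
645 | },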
622 | {
623 | "cell_type": "markdown",
624 | "metadata": {},
625 | "source": [
626 | "### 4.3) `backtest()`"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 20,
632 | "metadata": {},
633 | "outputs": [],
634 | "source": [
635 | "y_pred_backtest = []"
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 21,
641 | "metadata": {},
642 | "outputs": [],
643 | "source": [
644 | "data = generate_data_monotonic_increase()\n",
645 | "from ts_boilerplate.model import get_model, fit_model, predict_output\n",
646 | "from ts_boilerplate.dataprep import get_Xi_yi"
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": 25,
652 | "metadata": {},
653 | "outputs": [],
654 | "source": [
655 | "stride = 10\n",
656 | "start_ratio:float = 0.8\n",
657 | "retrain: bool = True\n",
658 | "retrain_every: int = 50"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": 30,
664 | "metadata": {},
665 | "outputs": [
666 | {
667 | "data": {
668 | "application/vnd.jupyter.widget-view+json": {
669 | "model_id": "f1761b545b3340529fd4939039a0cb46",
670 | "version_major": 2,
671 | "version_minor": 0
672 | },
673 | "text/plain": [
674 | " 0%| | 0/10 [00:00, ?it/s]"
675 | ]
676 | },
677 | "metadata": {},
678 | "output_type": "display_data"
679 | }
680 | ],
681 | "source": [
682 | "from tqdm.notebook import tqdm\n",
683 | "\n",
684 | "# Initialization\n",
685 | "start_timestep_0 = round(start_ratio * len(data))\n",
686 | "data_train_0 = data[:start_timestep_0, ...]\n",
687 | "X_train_tmp, y_train_tmp = get_X_y(data_train_0, **TRAIN)\n",
688 | "data_test_backtested = data[start_timestep_0:, ...]\n",
689 | "_, y_test = get_X_y(data_test_backtested, **TRAIN, shuffle=False)\n",
690 | "y_pred_backtested = []\n",
691 | "retrain_counter = 0\n",
692 | "timesteps_backtested_list = []\n",
693 | "\n",
694 | "for i in tqdm(range(0, len(data_test_backtested), stride)):\n",
695 | " start_timestep_i = start_timestep_0 + i\n",
696 | " data_train = data[:start_timestep_i, ...]\n",
697 | " data_test = data[start_timestep_i:, ...]\n",
698 | " X_train_tmp, y_train_tmp = get_X_y(data_train, **TRAIN)\n",
699 | " X_test_i, y_test_i = get_Xi_yi(first_index=0, data=data_test, **TRAIN)\n",
700 | "\n",
701 | " # At some point after sliding through time, we will reach the end of the test set\n",
702 | " if y_test_i.shape[0] < y_train_tmp.shape[1]:\n",
703 | " break\n",
704 | "\n",
705 | " model = get_model(X_train_tmp, y_train_tmp)\n",
706 | "\n",
707 | "    # Retrain from scratch when required (the model is re-instantiated at each iteration)\n",
708 | " if retrain and i % retrain_every == 0:\n",
709 | " retrain_counter += 1\n",
710 | " fit_model(model, X_train_tmp, y_train_tmp)\n",
711 | "\n",
712 | " y_pred_i = np.squeeze(predict_output(model, X_test_i[None, ...]))\n",
713 | " y_pred_backtested.append(y_pred_i)\n",
714 | " timesteps_backtested_list.append(i)\n",
715 | "\n",
716 | "y_pred_backtested = np.array(y_pred_backtested)\n",
717 | "y_test_backtested = y_test[timesteps_backtested_list]\n",
718 | "# Check that we compare apples to apples\n",
719 | "assert y_pred_backtested.shape == y_test_backtested.shape\n",
720 | "\n",
721 | "metrics_backtested = mape(y_pred_backtested, y_test_backtested)"
722 | ]
723 | },
724 | {
725 | "cell_type": "code",
726 | "execution_count": 31,
727 | "metadata": {},
728 | "outputs": [
729 | {
730 | "name": "stdout",
731 | "output_type": "stream",
732 | "text": [
733 | "### BACKTESTED METRICS BASED ON THE LAST 8 TIMESTEPS AND WITH 2 retrain operations\n",
734 | "2.9357621869726818\n"
735 | ]
736 | }
737 | ],
738 | "source": [
739 | "print(\n",
740 | "    f'### BACKTESTED METRICS BASED ON THE LAST {y_pred_backtested.shape[0]} TIMESTEPS AND WITH {retrain_counter} retrain operations'\n",
741 | ")\n",
742 | "print(mape(y_pred_backtested, y_test_backtested))\n"
743 | ]
744 | },
745 | {
746 | "cell_type": "code",
747 | "execution_count": 32,
748 | "metadata": {},
749 | "outputs": [
750 | {
751 | "data": {
752 | "image/png": "<base64-encoded PNG omitted: line plot comparing 'historical forecasts' and 'truth' over the backtested timesteps>",
753 | "text/plain": [
754 | ""
755 | ]
756 | },
757 | "metadata": {
758 | "needs_background": "light"
759 | },
760 | "output_type": "display_data"
761 | }
762 | ],
763 | "source": [
764 | "# TODO: make it work for any dimension of y\n",
765 | "plt.plot(y_pred_backtested[:,0,0], label='historical forecasts')\n",
766 | "plt.plot(y_test_backtested[:,0,0], label='truth')\n",
767 | "plt.xlabel('timesteps')\n",
768 | "plt.legend()\n",
769 | "plt.show()"
770 | ]
771 | },
772 | {
773 | "cell_type": "code",
774 | "execution_count": null,
775 | "metadata": {},
776 | "outputs": [],
777 | "source": []
778 | }
779 | ],
780 | "metadata": {
781 | "interpreter": {
782 | "hash": "572b4e543617d03e90ecaf525e08695da1ff29b13594f787e33b342cf572f792"
783 | },
784 | "kernelspec": {
785 | "display_name": "Python 3 (ipykernel)",
786 | "language": "python",
787 | "name": "python3"
788 | },
789 | "language_info": {
790 | "codemirror_mode": {
791 | "name": "ipython",
792 | "version": 3
793 | },
794 | "file_extension": ".py",
795 | "mimetype": "text/x-python",
796 | "name": "python",
797 | "nbconvert_exporter": "python",
798 | "pygments_lexer": "ipython3",
799 | "version": "3.8.12"
800 | },
801 | "toc": {
802 | "base_numbering": 1,
803 | "nav_menu": {},
804 | "number_sections": false,
805 | "sideBar": true,
806 | "skip_h1_title": false,
807 | "title_cell": "Table of Contents",
808 | "title_sidebar": "Contents",
809 | "toc_cell": false,
810 | "toc_position": {},
811 | "toc_section_display": true,
812 | "toc_window_display": false
813 | }
814 | },
815 | "nbformat": 4,
816 | "nbformat_minor": 2
817 | }
818 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 | slow: marks tests as slow (deselect with '-m "not slow"')
4 | optional: marks tests as optional (deselect with '-m "not optional"')
5 | addopts = -v -s --color=yes -W ignore::DeprecationWarning
6 |
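7 | # Illustrative invocations (an added note, not part of the original config):
8 | #   pytest -m "not slow"      -> deselect the slow tests
9 | #   pytest -m "not optional"  -> deselect the optional tests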
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/time-series-cross-validator/requirements.txt
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 |
4 | with open('requirements.txt') as f:
5 | content = f.readlines()
6 | requirements = [x.strip() for x in content if 'git+' not in x]
7 |
8 | setup(name='ts-boilerplate',
9 | version="1.0",
10 | description="Trainer Boilerplate for Time Series Forecast Models with Cross Validation",
11 | packages=find_packages(),
12 | install_requires=requirements,
13 | test_suite='tests',
14 | # include_package_data: to install data from MANIFEST.in
15 | include_package_data=True,
16 | zip_safe=False)
17 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/time-series-cross-validator/tests/__init__.py
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import numpy as np
3 | from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase, generate_data_zeros_and_ones, generate_X_y_zeros_and_ones
4 | from typing import Tuple
5 |
6 | @pytest.fixture(scope="session")
7 | def data_monotonic_increase() -> np.ndarray:
8 | return generate_data_monotonic_increase()
9 |
10 | @pytest.fixture(scope="session")
11 | def data_zeros_and_ones() -> np.ndarray:
12 | return generate_data_zeros_and_ones()
13 |
14 |
15 | @pytest.fixture(scope="session")
16 | def X_y_zeros_and_ones() -> Tuple[np.ndarray, np.ndarray]:
17 | return generate_X_y_zeros_and_ones()
18 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/tests/integrated/test_main.py:
--------------------------------------------------------------------------------
1 | """Tests that the main routes run without raising exceptions"""
2 |
3 | import pytest
4 | from ts_boilerplate.main import backtest, train, cross_validate
5 |
6 | @pytest.mark.slow
7 | def test_main_route_train(data_monotonic_increase):
8 | train(data_monotonic_increase)
9 |
10 | @pytest.mark.slow
11 | def test_main_route_cross_validate(data_monotonic_increase):
12 | cross_validate(data_monotonic_increase)
13 |
14 | @pytest.mark.slow
15 | def test_backtest(data_monotonic_increase):
16 | backtest(data_monotonic_increase, print_metrics=False, plot_metrics=False)
17 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/tests/integrated/test_model_performance.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from ts_boilerplate.main import train
3 |
4 |
5 | @pytest.mark.optional
6 | @pytest.mark.slow
7 | def test_model_can_fit_well_enough_on_dummy_dataset(data_zeros_and_ones):
8 | """Check that the model can fit, with MAPE lower than some threshold, on a dummy dataset of zeros and ones"""
9 |
10 | metrics = train(data_zeros_and_ones)
11 | #print("#### metrics on dummy dataset ", metrics)
12 |
13 | assert metrics < 5, "your model does not seem to be able to fit well enough, even on a very easy dataset"
14 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/tests/unittests/test_data.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from ts_boilerplate.params import TRAIN, DATA
3 | from ts_boilerplate.main import get_X_y
4 | from ts_boilerplate.main import train_test_split
5 | import numpy as np
6 | import math
7 |
8 |
9 | # These tests make use of the fixture `data_monotonic_increase` stored in tests/conftest.py (pytest magic under the hood)
10 | def test_get_X_y_returns_correct_shapes(data_monotonic_increase):
11 | """Test that X and y have the correct shapes (excluding sample size), as per the project setup defined in `params.py`
12 | """
13 | X, y = get_X_y(data_monotonic_increase, **TRAIN)
14 |
15 | # Check that X and y have the correct lengths (in time) and depth (in number of covariates)
16 | assert X.ndim == 3
17 | assert X.shape[1] == TRAIN['input_length']
18 | assert X.shape[2] == DATA['n_covariates'] + DATA[
19 | 'n_targets'], "Did you forget to include your past target values as features?"
20 |
21 | y_should_be_3D = TRAIN['output_length'] > 1 and DATA["n_targets"] > 1
22 | y_should_be_1D = TRAIN['output_length'] == 1 and DATA["n_targets"] == 1
23 | if y_should_be_3D:
24 | assert y.ndim == 3
25 | assert y.shape[1] == TRAIN['output_length']
26 | assert y.shape[2] == DATA['n_targets']
27 | elif y_should_be_1D:
28 | assert y.ndim == 1
29 | else:
30 | assert y.ndim == 2
31 | assert y.shape[1] == (TRAIN['output_length'] if DATA['n_targets'] == 1 else DATA['n_targets'])
32 |
33 |
34 | @pytest.mark.optional
35 | @pytest.mark.skipif(TRAIN['stride'] is None, reason="Optional test, only applicable if a sliding method is used in get_X_y")
36 | def test_optional_get_X_y_returns_optimal_sample_size(data_monotonic_increase):
37 | """If get_X_y uses a stride method, check that X and y each contain the optimal number of samples
38 | """
39 | X, y = get_X_y(data_monotonic_increase, **TRAIN)
40 |
41 | # Complex formula below reverse-engineered from `create_dummy_tests.ipynb`
42 | expected_len = math.ceil(
43 | (len(data_monotonic_increase) \
44 | - (TRAIN['input_length'] -1) \
45 | - (TRAIN['output_length'] -1) \
46 | - TRAIN['horizon']
47 | ) / TRAIN["stride"]
48 | )
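49 | # Worked example (illustrative, assuming the default params.py values:
50 | # length=500, input_length=10, output_length=7, horizon=4, stride=1):
51 | # expected_len = ceil((500 - 9 - 6 - 4) / 1) = 481 samples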
49 | assert len(X) == expected_len, "you may not have generated the optimal number of samples, given the stride chosen"
50 | assert len(y) == expected_len, "you may not have generated the optimal number of samples, given the stride chosen"
51 |
52 | def test_no_data_leak(data_monotonic_increase):
53 | """Test that the time gap between the last timestep of `y_train` and the first timestep of `y_test`
54 | is at least as big as the forecast horizon
55 | according to 'https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-3.png'
56 | """
57 |
58 | data_train, data_test = train_test_split(data_monotonic_increase, **TRAIN)
59 | X_train, y_train = get_X_y(data_train, shuffle=False, **TRAIN)
60 | X_test, y_test = get_X_y(data_test, shuffle=False, **TRAIN)
61 |
62 | y_train_last_seen_timestep = np.max(y_train) # OR y_train[-1].flat[-1]
63 | y_test_first_seen_timestep = np.min(y_test) # OR y_test[0].flat[0]
64 | gap = y_test_first_seen_timestep - y_train_last_seen_timestep
65 | # Note: for strides = 1, the inequality below must be an exact equality, but we don't need to test that to ensure no data leak.
66 | assert gap >= TRAIN["horizon"], "❗️❗️ Data leak detected between (X_train, y_train) and (X_test, y_test)❗️❗️ "
67 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/tests/unittests/test_model.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from ts_boilerplate.model import fit_model, get_model, predict_output
3 |
4 | def test_model_has_correct_output_shape(X_y_zeros_and_ones):
5 | X, y = X_y_zeros_and_ones
6 | model = get_model(X,y)
7 | y_pred = predict_output(model, X)
8 | assert y_pred.shape == y.shape
9 |
10 | @pytest.mark.slow
11 | def test_model_can_fit(X_y_zeros_and_ones):
12 | """Check that the model can fit without crashing"""
13 | X, y = X_y_zeros_and_ones
14 | model = get_model(X,y)
15 | fit_model(model, X, y, verbose=0)
16 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/ts_boilerplate/__init__.py:
--------------------------------------------------------------------------------
1 | from os.path import isfile
2 | from os.path import dirname
3 |
4 | version_file = '{}/version.txt'.format(dirname(__file__))
5 |
6 | if isfile(version_file):
7 | with open(version_file) as version_file:
8 | __version__ = version_file.read().strip()
9 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/ts_boilerplate/dataprep.py:
--------------------------------------------------------------------------------
1 | """Prepare data for use in a pipelined ML model"""
2 |
3 | import numpy as np
4 | from ts_boilerplate.params import DATA
5 | from typing import Tuple, List
7 |
8 |
9 | def load_data(data_path: str) -> np.ndarray:
10 | """Load data from `data_path` into memory
11 | Returns a 2D array with rows (axis 0) representing timesteps, and columns (axis 1) containing targets and covariates
12 | ref: https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true
13 | """
14 | # YOUR_CODE_HERE
15 | pass
16 |
17 |
18 | def clean_data(data: np.ndarray) -> np.ndarray:
19 | """Clean data without creating data leakage:
20 | - make sure there are no NaNs at any timestep
21 | - etc...
22 | """
23 | # YOUR_CODE_HERE
24 | pass
25 |
26 |
27 | def get_X_y(
28 | data: np.ndarray,
29 | input_length: int,
30 | output_length: int,
31 | horizon: int,
32 | stride: int,
33 | shuffle=True,
34 | **kwargs,
35 | ) -> Tuple[np.ndarray, np.ndarray]:
36 | """
37 | Use `data`, a 2D-array with axis=0 as timesteps, and axis=1 as (targets+covariates columns)
38 |
39 | Returns a Tuple (X,y) of two ndarrays :
40 | X.shape = (n_samples, input_length, n_covariates)
41 | y.shape =
42 | (n_samples, output_length, n_targets) if all three dimensions have size > 1
43 | (n_samples, output_length) if n_targets == 1
44 | (n_samples, n_targets) if output_length == 1
45 | (n_samples, ) if both n_targets and output_length == 1
46 |
47 | ❗️ Raise an error if data contains NaN
48 | ❗️ Make sure to shuffle the pairs in unison if `shuffle=True`, so that samples are i.i.d.
49 | ❗️ Don't ditch past values of your target time-series in your features - they are very useful features!
50 | 👉 illustration: https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-1.png
51 |
52 | [💡 Hints ] You can use a sliding method:
53 | - reading `data` in ascending order
54 | - moving forward `stride` timesteps at a time
55 | Feel free to use another approach, for example random sampling without replacement
56 |
57 | """
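58 | # Illustrative shapes, assuming the default params.py values (length=500, input_length=10,
59 | # output_length=7, horizon=4, stride=1, n_covariates=3, n_targets=2):
60 | # X.shape == (481, 10, 5) and y.shape == (481, 7, 2)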
58 | # $CHALLENGIFY_BEGIN
59 | assert np.isnan(data).sum() == 0
60 |
61 | X = []
62 | y = []
63 |
64 | for i in range(0, len(data), stride):
65 | Xi, yi = get_Xi_yi(first_index=i,
66 | data=data,
67 | horizon=horizon,
68 | input_length=input_length,
69 | output_length=output_length)
70 | # Exit loop as soon as we reach the end of the dataset
71 | if len(yi) < output_length:
72 | break
73 | X.append(Xi)
74 | y.append(yi)
75 |
76 | X = np.array(X)
77 | y = np.array(y)
78 | y = np.squeeze(y)
79 | if shuffle:
80 | idx = np.arange(len(X))
81 | np.random.shuffle(idx)
82 | X = X[idx]
83 | y = y[idx]
84 |
85 | return X, y
86 | # $CHALLENGIFY_END
87 |
88 |
89 | # $DELETE_BEGIN
90 | def get_Xi_yi(first_index,
91 | data,
92 | horizon,
93 | input_length,
94 | output_length,
95 | **kwargs):
96 | X_start = first_index
97 | X_last = X_start + input_length
98 | y_start = X_last + horizon - 1
99 | y_last = y_start + output_length
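100 | # e.g. with first_index=0, input_length=10, horizon=4, output_length=7 (illustrative defaults):
101 | # Xi covers timesteps [0, 10) and yi covers [13, 20), so yi starts `horizon` steps after Xi's last timestep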
100 |
101 | Xi = data[X_start:X_last]
102 | yi = data[y_start:y_last, DATA['target_column_idx']]
103 | return (Xi, yi)
104 | # $DELETE_END
105 |
106 |
107 | def get_folds(data: np.ndarray,
108 | fold_length: int,
109 | fold_stride: int,
110 | **kwargs) -> List[np.ndarray]:
111 | """Slide through `data` time-series (2D array) to create folds of equal `fold_length`, using `fold_stride` between each fold
112 | Returns a list of folds, each as a 2D-array time series
113 | """
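114 | # e.g. with fold_length=200 and fold_stride=100 on 500 timesteps (illustrative):
115 | # folds start at indexes 0, 100, 200 and 300 -> 4 folds, each of shape (200, n_columns)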
114 | # $CHALLENGIFY_BEGIN
115 | folds = []
116 | for i in range(0, len(data), fold_stride):
117 | # Exit loop as soon as last fold value would exceed last data value
118 | if (i + fold_length) > len(data):
119 | break
120 | fold = data[i:i + fold_length, :]
121 | folds.append(fold)
122 | return folds
123 | # $CHALLENGIFY_END
124 |
125 |
126 | def train_test_split(data: np.ndarray,
127 | train_test_ratio: float,
128 | input_length: int,
129 | **kwargs) -> Tuple[np.ndarray, np.ndarray]:
130 | """Returns train and test 2D-arrays that will not create any data leak when sampling (X, y) from them
131 | Inspired from "https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-3.png"
132 | """
133 | # $CHALLENGIFY_BEGIN
134 | last_train_idx = round(train_test_ratio * len(data))
135 | data_train = data[0:last_train_idx, :]
136 |
137 | # [here is the key to no data leak]
138 | # The last idx of the first X_test must be equal to the last idx of the last y_train.
139 | # It corresponds to day n°10 in the picture rnn-3.png
140 | first_test_idx = last_train_idx - input_length
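141 | # e.g. with the default params.py values (illustrative): train_test_ratio=0.7 and length=500
142 | # give last_train_idx=350, so data_test starts at index 340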
141 | data_test = data[first_test_idx:, :]
142 |
143 | return (data_train, data_test)
144 | # $CHALLENGIFY_END
145 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/ts_boilerplate/generate_dummy_data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from ts_boilerplate.params import CROSS_VAL, DATA, TRAIN
3 | from typing import Tuple
4 |
5 | def generate_data_monotonic_increase() -> np.ndarray:
6 | """Creates a monotonically increasing time series dataset for test purposes
7 | - shape is (DATA['length'], DATA['n_covariates'] + DATA['n_targets']),
8 | - values are all equal to their respective integer index!
9 |
10 | e.g:
11 | data = array(
12 | [[ 0., 0., 0., 0., 0.],
13 | [ 1., 1., 1., 1., 1.],
14 | ...,
15 | [998., 998., 998., 998., 998.],
16 | [999., 999., 999., 999., 999.]]
17 | )
18 |
19 | """
20 |
21 | indexes = np.arange(0, DATA['length'])
22 | data = np.zeros((DATA['length'], DATA['n_covariates'] + DATA['n_targets'])) \
23 | + np.expand_dims(indexes, axis=1)
24 | return data
25 |
26 | def generate_data_zeros_and_ones() -> np.ndarray:
27 | """Create dummy data made of zeros for the covariates, and ones for the targets
28 | e.g:
29 | data = array(
30 | [[1.,1.,0.,0.,0.],
31 | [1.,1.,0.,0.,0.],
32 | ...,
33 | [1.,1.,0.,0.,0.],
34 | [1.,1.,0.,0.,0.]]
35 | )
36 | """
37 | shape = (DATA['length'], DATA['n_covariates'] + DATA['n_targets'])
38 | data = np.zeros(shape)
39 | data[:, DATA["target_column_idx"]] = 1.
40 | return data
41 |
42 | def generate_X_y_zeros_and_ones() -> Tuple[np.ndarray, np.ndarray]:
43 | """Create a dummy (X,y) tuple made of zeros for the covariates, and ones for the targets, just to check that the model fits well"""
44 | length = round(DATA["length"] / TRAIN['stride'])
45 |
46 | shape_X = (length, TRAIN['input_length'], DATA['n_covariates']+DATA['n_targets'])
47 | X = np.zeros(shape_X)
48 | X[:, :, DATA["target_column_idx"]] = 1.
49 |
50 | shape_y = (length, TRAIN['output_length'], DATA['n_targets'])
51 | y = np.ones(shape_y)
52 | y = np.squeeze(y)
53 |
54 | return (X,y)
55 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/ts_boilerplate/main.py:
--------------------------------------------------------------------------------
1 | '''
2 | Top level orchestrator of the project. To be called from the CLI.
3 | It comprises all the "routes" you may want to call
4 | '''
6 | import numpy as np
7 | import pandas as pd
8 | import os
9 | from ts_boilerplate.dataprep import get_Xi_yi, get_X_y, get_folds, train_test_split
10 | from ts_boilerplate.model import get_model, fit_model, predict_output
11 | from ts_boilerplate.metrics import mape, mae
12 | from ts_boilerplate.params import CROSS_VAL, ROOT_DIR, TRAIN, DATA
13 | from typing import Tuple, List
14 | import matplotlib.pyplot as plt
15 |
16 |
17 | def train(data: np.ndarray, print_metrics: bool = False):
18 | """
19 | Train the model in this package on one fold `data`, a 2D-array of time series for your problem
20 | Returns `metrics_test` associated with the training
21 | """
22 | # $CHALLENGIFY_BEGIN
23 | data_train, data_test = train_test_split(data, **TRAIN)
24 | X_train, y_train = get_X_y(data_train, **TRAIN)
25 | X_test, y_test = get_X_y(data_test, **TRAIN)
26 | model = get_model(X_train, y_train)
27 | history = fit_model(model, X_train, y_train)
28 | y_pred = predict_output(model, X_test)
29 | metrics_test = mae(y_test, y_pred)
30 | if print_metrics:
31 | print("### Test Metric: ", metrics_test)
32 | return metrics_test
33 | # $CHALLENGIFY_END
34 |
35 |
36 | def cross_validate(data: np.ndarray, print_metrics: bool = False):
37 | """
38 | Cross-validate the model in this package on `data`
39 | Returns `metrics_cv`: the list of test metrics at each fold
40 | """
41 | # $CHALLENGIFY_BEGIN
42 | folds = get_folds(data, **CROSS_VAL)
43 | metrics_cv = []
44 | for fold in folds:
45 | metrics_fold = train(fold, print_metrics=print_metrics)
46 | metrics_cv.append(metrics_fold)
47 |
48 | if print_metrics:
49 | print(f"### CV metrics after {len(folds)} folds ### ")
50 | print(metrics_cv)
51 | return metrics_cv
52 | # $CHALLENGIFY_END
53 |
54 |
55 | def backtest(data: np.ndarray,
56 | stride: int = 1,
57 | start_ratio: float = 0.9,
58 | retrain: bool = True,
59 | retrain_every: int = 1,
60 | print_metrics=False,
61 | plot_metrics=False):
62 | """Returns historical forecasts for the entire dataset
63 | - by training the model on the first `start_ratio` of the dataset
64 | - then predicting the next values using the model in this package
65 | - then moving `stride` timesteps ahead
66 | - then retraining the model if `retrain` is True and we have moved `retrain_every` timesteps since the last training
67 | - then predicting the next values again
68 |
69 | Return:
70 | - the backtested metric, computed between all historical predictions and their corresponding true values
71 | - roughly ((1 - start_ratio) * len(data)) / stride predictions are made in total
72 | - print and/or plot the historical predictions if you want a visual check
73 |
74 | see https://unit8co.github.io/darts/generated_api/darts.models.forecasting.rnn_model.html#darts.models.forecasting.rnn_model.RNNModel.historical_forecasts
75 | """
76 | # $CHALLENGIFY_BEGIN
77 |
78 | # Initialization
79 | start_timestep_0 = round(start_ratio * len(data))
80 | data_train_0 = data[:start_timestep_0, ...]
81 | X_train_tmp, y_train_tmp = get_X_y(data_train_0, **TRAIN)
82 | data_test_backtested = data[start_timestep_0:, ...]
83 | _, y_test = get_X_y(data_test_backtested, **TRAIN, shuffle=False)
84 | y_pred_backtested = []
85 | retrain_counter = 0
86 | timesteps_backtested_list = []
87 | for i in range(0, len(data_test_backtested), stride):
88 | start_timestep_i = start_timestep_0 + i
89 | data_train = data[:start_timestep_i, ...]
90 | data_test = data[start_timestep_i:, ...]
91 | X_train_tmp, y_train_tmp = get_X_y(data_train, **TRAIN)
92 | X_test_i, y_test_i = get_Xi_yi(first_index=0, data=data_test, **TRAIN)
93 |
94 | # At some point after sliding through time, we will reach the end of the test set
95 | if y_test_i.shape[0] < y_train_tmp.shape[1]:
96 | break
97 |
98 | model = get_model(X_train_tmp, y_train_tmp)
99 |
100 | # Retrain when required. NB: get_model() above re-instantiates the model each iteration, so fitting restarts from fresh weights rather than truly continuing from the previous ones
101 | if retrain and i % retrain_every == 0:
102 | retrain_counter += 1
103 | fit_model(model, X_train_tmp, y_train_tmp)
104 |
105 | y_pred_i = np.squeeze(predict_output(model, X_test_i[None, ...]))
106 | y_pred_backtested.append(y_pred_i)
107 | timesteps_backtested_list.append(i)
108 |
109 | y_pred_backtested = np.array(y_pred_backtested)
110 | y_test_backtested = y_test[timesteps_backtested_list]
111 | # Check that we compare apples to apples
112 | assert y_pred_backtested.shape == y_test_backtested.shape
113 |
114 | metrics_backtested = mae(y_pred_backtested, y_test_backtested)
115 |
116 | if print_metrics:
117 | print(
118 | f'### BACKTESTED METRICS BASED ON THE LAST {y_pred_backtested.shape[0]} TIMESTEPS AND WITH {retrain_counter} retrain operations'
119 | )
120 | print(mae(y_pred_backtested, y_test_backtested))
121 | if plot_metrics:
122 | # TODO: make it work for any dimension of y
123 | plt.plot(y_pred_backtested[:,0,0], label='historical forecasts')
124 | plt.plot(y_test_backtested[:,0,0], label='truth')
125 | plt.xlabel('timestep number (0 = beginning of backtest)')
126 | plt.legend()
127 | plt.show()
128 |
129 | return metrics_backtested
130 | # $CHALLENGIFY_END
131 |
132 | if __name__ == '__main__':
133 | data = pd.read_csv(os.path.join(ROOT_DIR, 'data','raw','data.csv')).to_numpy()
134 | try:
135 | train(data=data, print_metrics=True)
136 | cross_validate(data=data, print_metrics=True)
137 | backtest(data=data,
138 | stride = 1,
139 | start_ratio = 0.9,
140 | retrain = True,
141 | retrain_every=1,
142 | print_metrics=True,
143 | plot_metrics=True)
144 | except Exception:
145 | import ipdb, traceback, sys
146 | extype, value, tb = sys.exc_info()
147 | traceback.print_exc()
148 | ipdb.post_mortem(tb)
149 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/ts_boilerplate/metrics.py:
--------------------------------------------------------------------------------
1 | '''
2 | Computes useful time-series metrics from (y_true, y_pred)
3 | '''
4 |
5 | import numpy as np
6 | from tensorflow import reduce_mean
7 | from tensorflow.keras.metrics import mean_absolute_error, mean_absolute_percentage_error
8 |
9 |
10 | def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
11 | """Returns Mean Absolute Error"""
12 | # $CHALLENGIFY_BEGIN
13 | return reduce_mean(mean_absolute_error(y_true, y_pred)).numpy()
14 | # $CHALLENGIFY_END
15 |
16 | def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
17 | """Returns Mean Absolute Percentage Error"""
18 | # $CHALLENGIFY_BEGIN
19 | return reduce_mean(mean_absolute_percentage_error(y_true, y_pred)).numpy()
20 | # $CHALLENGIFY_END
21 |
22 | def mase(y_true: np.ndarray, y_pred: np.ndarray) -> float:
23 | """Returns Mean Absolute Scaled Error (https://en.wikipedia.org/wiki/Mean_absolute_scaled_error)
24 | """
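25 | # A possible sketch (an assumption: scale by the in-sample one-step naive forecast error):
26 | # return mae(y_true, y_pred) / mae(y_true[1:], y_true[:-1])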
25 | pass
26 |
27 |
28 | def play_trading_strategy(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
29 | """Returns the array of relative portfolio values over the test period"""
30 | pass
31 |
32 |
33 | def return_on_investment(played_trading_strategy: np.ndarray) -> float:
34 | """Returns the ROI of an investment strategy"""
35 | pass
36 |
37 |
38 | def sharpe_ratio(played_trading_strategy: np.ndarray) -> float:
39 | """Returns the Sharpe Ratio (Return on Investment / Volatility) of an investment strategy"""
40 | pass
41 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/ts_boilerplate/model.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.keras.layers import Dense, SimpleRNN, Reshape, Lambda, Input
3 | from tensorflow.keras import Model
4 | from ts_boilerplate.params import DATA, TRAIN
5 |
6 | # TODO: Should we add here the preprocessing? into a class called "pipeline"?
7 | # TODO: Should we refacto in a class ? Probably!
8 |
9 |
10 | def get_model(X_train, y_train):
11 | """Instantiate, compile, and return the model of your choice"""
12 | # $CHALLENGIFY_BEGIN
13 |
14 | # BASELINE: PREDICT LAST VALUE - ZERO TRAINABLE WEIGHTS
15 | input = Input(shape=X_train.shape[1:])
16 | # Take the last temporal values of the targets, and duplicate them as many times as `output_length`
17 | x = Lambda(
18 | lambda x: tf.repeat(
19 | tf.expand_dims(tf.gather(x[:, -1, :], indices=DATA['target_column_idx'], axis=1), axis=1),
20 | repeats=TRAIN['output_length'],
21 | axis=1)
22 | )(input)
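23 | # Shape walkthrough (illustrative, with input_length=10, 5 columns, 2 targets, output_length=7):
24 | # x[:, -1, :] -> (batch, 5); gather targets -> (batch, 2); expand_dims -> (batch, 1, 2); repeat -> (batch, 7, 2)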
23 | output = Reshape(y_train.shape[1:])(x)
24 | model = Model(input, output)
25 |
26 | # # THE SIMPLEST OF ALL POSSIBLE RNN
27 | # model = tf.keras.Sequential()
28 | # model.add(SimpleRNN(1, activation='tanh', input_shape=X_train.shape[1:]))
29 | # model.add(Dense(TRAIN['output_length'] * DATA["n_targets"], activation='linear'))
30 | # model.add(Reshape(y_train.shape[1:]))
31 | # model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), metrics=tf.keras.metrics.MAPE)
32 |
33 | model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), metrics=tf.keras.metrics.MAPE)
34 | return model
35 | # $CHALLENGIFY_END
36 |
37 |
38 | def fit_model(model, X_train, y_train, **kwargs):
39 | """Fit the `model` object, including preprocessing if need be"""
40 | # $CHALLENGIFY_BEGIN
41 | verbose = kwargs.get("verbose", 0)
42 | es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
43 | patience=2,
44 | verbose=verbose,
45 | mode='min',
46 | restore_best_weights=True)
47 | history = model.fit(X_train,
48 | y_train,
49 | epochs=50,
50 | batch_size=16,
51 | validation_split=0.3,
52 | callbacks=[es],
53 | verbose=verbose)
54 | return history
55 | # $CHALLENGIFY_END
56 |
57 |
58 | def predict_output(model, X_test):
59 | """Return y_pred. Include preprocessing if need be"""
60 | # $CHALLENGIFY_BEGIN
61 | y_pred = model.predict(X_test)
62 | return y_pred
63 | # $CHALLENGIFY_END
64 |
--------------------------------------------------------------------------------
/project-boilerplates/time-series-cross-validator/ts_boilerplate/params.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dotenv import load_dotenv
3 |
4 | ## CREDENTIALS AND PATHS
5 | load_dotenv()
6 | API_KEY = os.getenv('API_KEY')
7 |
8 | ## DIR PARAMS
9 | ROOT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
10 | DATA_RAW_CSV_PATH = os.path.join(ROOT_DIR, 'data', 'raw', 'data.csv')
11 |
12 | # 👇 Please fill these global variable below very carefully, in order to create tests related to your problem👇
13 | # cf: https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true
14 | DATA = dict(
15 | length = 500, # How many timesteps does your dataset contain?
16 | n_covariates = 3, # number of past covariates, excluding the target time series. Our tests do not support future covariates yet.
17 | target_column_idx = [0,1] # List of index(es) of target column(s) in your dataset. e.g [0] for Mono-target problem, e.g. [0,1,4] for multi-variate targets problem. Note that past targets values will also be used as features X.
18 | )
19 | DATA['n_targets'] = len(DATA['target_column_idx']) # number of target time series to predict.
20 |
21 | TRAIN = dict(
22 | horizon = 4, # start the prediction this many timesteps ahead
23 | input_length = 10, # Length (in time) of each sequence that will be seen by the model (X.shape[1])
24 | output_length = 7, # Length (in time) of the prediction (y.shape[1])
25 | stride = 1, # Integer used to create all pairs of samples (Xi, yi) by sliding in each data fold. Use `None` if you don't plan to use any sliding method in dataprep.get_X_y
26 | train_test_ratio = 0.7, # ratio of train / (train+test) length in each fold
27 | )
28 |
29 | CROSS_VAL = dict(
30 | fold_length = 200,
31 | fold_stride = 100,
32 | )
33 |
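34 | # Illustrative consequences of the values above (an added note, not used by the code):
35 | # get_X_y returns X of shape (n_samples, 10, 5) and y of shape (n_samples, 7, 2),
36 | # and get_folds yields 4 folds of 200 timesteps each.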
--------------------------------------------------------------------------------
/tutorials/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/tutorials/.keep
--------------------------------------------------------------------------------
/tutorials/removing-bottlenecks/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/tutorials/removing-bottlenecks/model.png
--------------------------------------------------------------------------------
/tutorials/removing-bottlenecks/row_column_wise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/tutorials/removing-bottlenecks/row_column_wise.png
--------------------------------------------------------------------------------
/tutorials/removing-bottlenecks/slides.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "f5e4ca46-23e7-4bc6-a49f-8ed36e611c43",
7 | "metadata": {
8 | "slideshow": {
9 | "slide_type": "skip"
10 | },
11 | "tags": []
12 | },
13 | "outputs": [],
14 | "source": [
15 | "import numpy as np\n",
16 | "import pandas as pd\n",
17 | "from numba import jit, vectorize, float64\n",
18 | "from time import perf_counter as counter"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "70135331-91db-4ae6-b7ee-9ac0b1901e97",
24 | "metadata": {
25 | "slideshow": {
26 | "slide_type": "slide"
27 | },
28 | "tags": []
29 | },
30 | "source": [
31 | "# Removing bottlenecks with Numba, Cython, and TensorFlow"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "id": "5c9f0890-bb38-4760-a74b-091f2eb8ea67",
37 | "metadata": {
38 | "slideshow": {
39 | "slide_type": "subslide"
40 | },
41 | "tags": []
42 | },
43 | "source": [
44 | "## Topics"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "id": "af960d3e-7c57-4211-9baf-1a5d817e7dd8",
50 | "metadata": {
51 | "slideshow": {
52 | "slide_type": "fragment"
53 | },
54 | "tags": []
55 | },
56 | "source": [
57 | "1. Numba to speed up simple operations and create vectorising functions.\n",
58 | "2. When and how to use Cython in its simplest form.\n",
59 | "3. How to include these functions in your packages.\n",
60 | "4. TensorFlow feature engineering.\n",
61 | "5. A quick win for TensorFlow prediction speed."
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "id": "021cbdcd-b78b-46eb-8646-e4ee1779e7a8",
67 | "metadata": {
68 | "slideshow": {
69 | "slide_type": "slide"
70 | },
71 | "tags": []
72 | },
73 | "source": [
74 | "## 1. Numba"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "id": "12b25089-e0d6-4252-a8ec-0a795b86ed81",
80 | "metadata": {
81 | "slideshow": {
82 | "slide_type": "subslide"
83 | },
84 | "tags": []
85 | },
86 | "source": [
87 | "### Introduction"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "id": "3e9c5391-e3d7-4b31-ab50-c2faef528ae5",
93 | "metadata": {
94 | "slideshow": {
95 | "slide_type": "fragment"
96 | },
97 | "tags": []
98 | },
99 | "source": [
100 | "- Library to translate Python code into fast machine code.\n",
101 | "- Designed specifically for compatibility with NumPy.\n",
102 | "- Provides just-in-time compilation."
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "id": "3a6054ca-30b9-4737-a3df-080e2b4e4582",
108 | "metadata": {
109 | "slideshow": {
110 | "slide_type": "subslide"
111 | },
112 | "tags": []
113 | },
114 | "source": [
115 | "### Demo"
116 | ]
117 | },
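118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "id": "added-numba-jit-demo",
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "# A minimal illustrative @jit sketch (an added example, not the original live demo);\n",
126 | "# it uses the `jit` import from the first cell:\n",
127 | "@jit(nopython=True)\n",
128 | "def sum_of_squares(n):\n",
129 | "    total = 0\n",
130 | "    for i in range(n):\n",
131 | "        total += i * i\n",
132 | "    return total\n",
133 | "\n",
134 | "sum_of_squares(10)  # the first call compiles; subsequent calls run as machine code"
135 | ]
136 | },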
118 | {
119 | "cell_type": "markdown",
120 | "id": "c6a21db2-b420-4fa9-b188-b5935ae8250c",
121 | "metadata": {
122 | "slideshow": {
123 | "slide_type": "subslide"
124 | },
125 | "tags": []
126 | },
127 | "source": [
128 | "### Pros v Cons"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "id": "2b7a4395-3ff0-45ca-b205-c8d257fb8af0",
134 | "metadata": {
135 | "slideshow": {
136 | "slide_type": "fragment"
137 | },
138 | "tags": []
139 | },
140 | "source": [
141 | "Pros\n",
142 | "- Easy to implement in many cases.\n",
143 | "- Significant speed boosts.\n",
144 | "- Suited to a lot of data processing needs in data science.\n",
145 | "\n",
146 | "Cons\n",
147 | "- Limited scope regarding Python libraries.\n",
148 | "- When certain functions do not work or are not accelerated, it is difficult to work out why.\n",
149 | "- In more complex use cases, making all of the functions compatible is a hassle."
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "id": "8372b391-2719-4600-98eb-4ae7bf53895d",
155 | "metadata": {
156 | "slideshow": {
157 | "slide_type": "skip"
158 | },
159 | "tags": []
160 | },
161 | "source": [
162 | "### Speed comparisons"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "id": "e23150e8-7c19-4e49-9836-c23afd9b0bac",
168 | "metadata": {
169 | "slideshow": {
170 | "slide_type": "skip"
171 | },
172 | "tags": []
173 | },
174 | "source": [
175 | "With loops"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 4,
181 | "id": "e5a4cf2b-8efa-4743-bf04-62ac29375ae7",
182 | "metadata": {
183 | "slideshow": {
184 | "slide_type": "skip"
185 | },
186 | "tags": []
187 | },
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/html": [
192 | "\n",
193 | "\n",
206 | "
\n",
207 | " \n",
208 | " \n",
209 | " | \n",
210 | " d | \n",
211 | "
\n",
212 | " \n",
213 | " \n",
214 | " \n",
215 | " 0 | \n",
216 | " 0.638929 | \n",
217 | "
\n",
218 | " \n",
219 | " 1 | \n",
220 | " 0.649349 | \n",
221 | "
\n",
222 | " \n",
223 | " 2 | \n",
224 | " 0.496717 | \n",
225 | "
\n",
226 | " \n",
227 | " 3 | \n",
228 | " 0.614220 | \n",
229 | "
\n",
230 | " \n",
231 | " 4 | \n",
232 | " 0.260934 | \n",
233 | "
\n",
234 | " \n",
235 | "
\n",
236 | "
"
237 | ],
238 | "text/plain": [
239 | " d\n",
240 | "0 0.638929\n",
241 | "1 0.649349\n",
242 | "2 0.496717\n",
243 | "3 0.614220\n",
244 | "4 0.260934"
245 | ]
246 | },
247 | "execution_count": 4,
248 | "metadata": {},
249 | "output_type": "execute_result"
250 | }
251 | ],
252 | "source": [
253 | "data = pd.DataFrame(np.random.uniform(0, 1, 1_000_000).reshape(-1,1))\n",
254 | "data.columns = [\"d\"]\n",
255 | "data.head()"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 5,
261 | "id": "f455bf1f-b25c-4d81-890c-8e59d55196c7",
262 | "metadata": {
263 | "slideshow": {
264 | "slide_type": "skip"
265 | },
266 | "tags": []
267 | },
268 | "outputs": [],
269 | "source": [
270 | "test_values = [\"1\", \"10\", \"100\", \"1_000\", \"10_000\", \"100_000\", \"1_000_000\"]"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 6,
276 | "id": "267c9df5-11e3-4657-bd40-0b7c4113aa90",
277 | "metadata": {
278 | "slideshow": {
279 | "slide_type": "skip"
280 | },
281 | "tags": []
282 | },
283 | "outputs": [],
284 | "source": [
285 | "empty_results = [np.nan for i in range(7)]"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 7,
291 | "id": "c412aca3-4715-45cf-91eb-724032b4ff87",
292 | "metadata": {
293 | "slideshow": {
294 | "slide_type": "skip"
295 | },
296 | "tags": []
297 | },
298 | "outputs": [],
299 | "source": [
300 | "results = {\n",
301 | " \"Pure Python\": empty_results.copy(),\n",
302 | " \"Jit Operation\": empty_results.copy(),\n",
303 | " \"Jit Apply\": empty_results.copy(),\n",
304 | " \"Jit Loop\": empty_results.copy(),\n",
305 | " \"Vectorize\": empty_results.copy(),\n",
306 | " \"Jit Vectorize\": empty_results.copy()\n",
307 | "}"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "id": "36b81f68-9434-44c4-8aae-fb00b6a9604a",
313 | "metadata": {
314 | "slideshow": {
315 | "slide_type": "skip"
316 | },
317 | "tags": []
318 | },
319 | "source": [
320 | "Pure python test"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 8,
326 | "id": "27a0b14c-cb01-43ce-a58c-0b7c90f32f74",
327 | "metadata": {
328 | "slideshow": {
329 | "slide_type": "skip"
330 | },
331 | "tags": []
332 | },
333 | "outputs": [
334 | {
335 | "name": "stdout",
336 | "output_type": "stream",
337 | "text": [
338 | "Pure Python\n",
339 | "Testing 1\n",
340 | "Testing 10\n"
341 | ]
342 | }
343 | ],
344 | "source": [
345 | "def operation(x):\n",
346 | " val = 0\n",
347 | " for i in range(1_000):\n",
348 | " for j in range(1_000):\n",
349 | " val += (x * i) - (x * j) \n",
350 | " return val\n",
351 | "\n",
352 | "def pure_python_test(n):\n",
353 | " new = np.zeros(n).reshape(-1,1)\n",
354 | " for i in range(n):\n",
355 | " new[i,:] = operation(data.iloc[i,:][0])\n",
356 | " return new\n",
357 | "\n",
358 | "test = \"Pure Python\"\n",
359 | "print(test)\n",
360 | "for i, val in enumerate(test_values):\n",
361 | " if i >= 2:\n",
362 | " break\n",
363 | " print(f\"Testing {val}\")\n",
364 | " baseline_begin = counter()\n",
365 | " pure_python_test(int(val))\n",
366 | " baseline_end = counter()\n",
367 | " results[test][i] = baseline_end-baseline_begin"
368 | ]
369 | },
370 | {
371 | "cell_type": "markdown",
372 | "id": "37ca0ee6-a9e3-476a-af91-23423c857f7e",
373 | "metadata": {
374 | "slideshow": {
375 | "slide_type": "skip"
376 | },
377 | "tags": []
378 | },
379 | "source": [
380 | "Jit operation test"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 9,
386 | "id": "a3ec308d-70e3-4faf-910f-8b75295eaeeb",
387 | "metadata": {
388 | "slideshow": {
389 | "slide_type": "skip"
390 | },
391 | "tags": []
392 | },
393 | "outputs": [
394 | {
395 | "name": "stdout",
396 | "output_type": "stream",
397 | "text": [
398 | "Jit Operation\n",
399 | "Testing 1\n",
400 | "Testing 10\n",
401 | "Testing 100\n",
402 | "Testing 1_000\n"
403 | ]
404 | }
405 | ],
406 | "source": [
407 | "@jit\n",
408 | "def jit_operation(x):\n",
409 | " val = 0\n",
410 | " for i in range(1_000):\n",
411 | " for j in range(1_000):\n",
412 | " val += (x * i) - (x * j) \n",
413 | " return val\n",
414 | "\n",
415 | "def jit_operation_test(n):\n",
416 | " new = np.zeros(n).reshape(-1,1)\n",
417 | " d = data.head(n)\n",
418 | " for i in range(n):\n",
419 | " x = d.iloc[i,:][0]\n",
420 | " val = jit_operation(x) \n",
421 | " new[i,:] = val\n",
422 | " return new\n",
423 | "\n",
424 | "test = \"Jit Operation\"\n",
425 | "print(test)\n",
426 | "for i, val in enumerate(test_values):\n",
427 | " if i >= 4:\n",
428 | " break\n",
429 | " print(f\"Testing {val}\")\n",
430 | " baseline_begin = counter()\n",
431 | " jit_operation_test(int(val))\n",
432 | " baseline_end = counter()\n",
433 | " results[test][i] = baseline_end-baseline_begin"
434 | ]
435 | },
436 | {
437 | "cell_type": "markdown",
438 | "id": "483d7e62-5ba2-47e3-aee2-4ce2e0a85103",
439 | "metadata": {
440 | "slideshow": {
441 | "slide_type": "skip"
442 | },
443 | "tags": []
444 | },
445 | "source": [
446 | "Jit loop test"
447 | ]
448 | },
449 | {
450 | "cell_type": "code",
451 | "execution_count": 10,
452 | "id": "a6b9b251-63e5-4cac-acd7-c661f447703c",
453 | "metadata": {
454 | "slideshow": {
455 | "slide_type": "skip"
456 | },
457 | "tags": []
458 | },
459 | "outputs": [
460 | {
461 | "name": "stdout",
462 | "output_type": "stream",
463 | "text": [
464 | "Jit Loop\n",
465 | "Testing 1\n",
466 | "Testing 10\n",
467 | "Testing 100\n",
468 | "Testing 1_000\n",
469 | "Testing 10_000\n",
470 | "Testing 100_000\n",
471 | "Testing 1_000_000\n"
472 | ]
473 | }
474 | ],
475 | "source": [
476 | "@jit\n",
477 | "def jit_operation(x):\n",
478 | " val = 0\n",
479 | " for i in range(1_000):\n",
480 | " for j in range(1_000):\n",
481 | " val += (x * i) - (x * j) \n",
482 | " return val\n",
483 | "\n",
484 | "@jit\n",
485 | "def jit_loop_test(data):\n",
486 | " new = np.zeros(len(data)).reshape(-1,1)\n",
487 | " for i, val in enumerate(data):\n",
488 | " new[i,:] = val\n",
489 | " return new\n",
490 | "\n",
491 | "test = \"Jit Loop\"\n",
492 | "print(test)\n",
493 | "for i, val in enumerate(test_values):\n",
494 | " print(f\"Testing {val}\")\n",
495 | " baseline_begin = counter()\n",
496 | " jit_loop_test(np.array(data.head(int(val))[\"d\"]))\n",
497 | " baseline_end = counter()\n",
498 | " results[test][i] = baseline_end-baseline_begin"
499 | ]
500 | },
501 | {
502 | "cell_type": "markdown",
503 | "id": "66838bdc-a519-4899-8a46-61ca73d10f99",
504 | "metadata": {
505 | "jp-MarkdownHeadingCollapsed": true,
506 | "slideshow": {
507 | "slide_type": "skip"
508 | },
509 | "tags": []
510 | },
511 | "source": [
512 | "Jit apply test"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": 11,
518 | "id": "62c520b1-fad5-43bb-b6d0-b2c79c6b8549",
519 | "metadata": {
520 | "slideshow": {
521 | "slide_type": "skip"
522 | },
523 | "tags": []
524 | },
525 | "outputs": [
526 | {
527 | "name": "stdout",
528 | "output_type": "stream",
529 | "text": [
530 | "Jit Apply\n",
531 | "Testing 1\n",
532 | "Testing 10\n",
533 | "Testing 100\n",
534 | "Testing 1_000\n",
535 | "Testing 10_000\n"
536 | ]
537 | }
538 | ],
539 | "source": [
540 | "@jit\n",
541 | "def jit_operation(x):\n",
542 | " val = 0\n",
543 | " for i in range(1_000):\n",
544 | " for j in range(1_000):\n",
545 | " val += (x * i) - (x * j) \n",
546 | " return val\n",
547 | "\n",
548 | "def jit_apply_test(n):\n",
549 | " t_d = data.head(n)\n",
550 | " return t_d[\"d\"].apply(jit_operation)\n",
551 | "\n",
552 | "test = \"Jit Apply\"\n",
553 | "print(test)\n",
554 | "for i, val in enumerate(test_values):\n",
555 | " if i >= 5:\n",
556 | " break\n",
557 | " print(f\"Testing {val}\")\n",
558 | " baseline_begin = counter()\n",
559 | " jit_apply_test(int(val))\n",
560 | " baseline_end = counter()\n",
561 | " results[test][i] = baseline_end-baseline_begin"
562 | ]
563 | },
564 | {
565 | "cell_type": "markdown",
566 | "id": "35006bc5-6a8e-41df-9b64-3839e74ed232",
567 | "metadata": {
568 | "slideshow": {
569 | "slide_type": "skip"
570 | },
571 | "tags": []
572 | },
573 | "source": [
574 | "Vectorize test"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 12,
580 | "id": "b9638155-021d-47ea-a061-eb6333bc5afa",
581 | "metadata": {
582 | "slideshow": {
583 | "slide_type": "skip"
584 | },
585 | "tags": []
586 | },
587 | "outputs": [
588 | {
589 | "name": "stdout",
590 | "output_type": "stream",
591 | "text": [
592 | "Vectorize\n",
593 | "Testing 1\n",
594 | "Testing 10\n",
595 | "Testing 100\n",
596 | "Testing 1_000\n",
597 | "Testing 10_000\n"
598 | ]
599 | }
600 | ],
601 | "source": [
602 | "@vectorize([float64(float64)])\n",
603 | "def vectorize_operation(x):\n",
604 | " val = 0\n",
605 | " for i in range(1_000):\n",
606 | " for j in range(1_000):\n",
607 | " val += (x * i) - (x * j) \n",
608 | " return val\n",
609 | "\n",
610 | "def vectorize_test(n):\n",
611 | " t_d = data.head(n)\n",
612 | " new = vectorize_operation(t_d[\"d\"])\n",
613 | " return new\n",
614 | "\n",
615 | "test = \"Vectorize\"\n",
616 | "print(test)\n",
617 | "for i, val in enumerate(test_values):\n",
618 | " if i >= 5:\n",
619 | " break\n",
620 | " print(f\"Testing {val}\")\n",
621 | " baseline_begin = counter()\n",
622 | " vectorize_test(int(val))\n",
623 | " baseline_end = counter()\n",
624 | " results[test][i] = baseline_end-baseline_begin"
625 | ]
626 | },
627 | {
628 | "cell_type": "markdown",
629 | "id": "45ee8732-2985-42a4-9884-a1c24243fe80",
630 | "metadata": {
631 | "slideshow": {
632 | "slide_type": "skip"
633 | },
634 | "tags": []
635 | },
636 | "source": [
637 | "Jit Operation with Vectorize"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": 13,
643 | "id": "51aa495a-1a70-4c8f-a1d5-858f83dae4cc",
644 | "metadata": {
645 | "slideshow": {
646 | "slide_type": "skip"
647 | },
648 | "tags": []
649 | },
650 | "outputs": [
651 | {
652 | "name": "stdout",
653 | "output_type": "stream",
654 | "text": [
655 | "Jit Vectorize\n",
656 | "Testing 1\n",
657 | "Testing 10\n",
658 | "Testing 100\n",
659 | "Testing 1_000\n",
660 | "Testing 10_000\n"
661 | ]
662 | }
663 | ],
664 | "source": [
665 | "@jit\n",
666 | "def jit_operation(x):\n",
667 | " val = 0\n",
668 | " for i in range(1_000):\n",
669 | " for j in range(1_000):\n",
670 | " val += (x * i) - (x * j) \n",
671 | " return val\n",
672 | "\n",
673 | "@vectorize([float64(float64)])\n",
674 | "def jit_vectorize_operation(x):\n",
675 | " return jit_operation(x)\n",
676 | "\n",
677 | "def jit_vectorize_test(n):\n",
678 | " t_d = data.head(n)\n",
679 | " return jit_vectorize_operation(t_d[\"d\"])\n",
680 | "\n",
681 | "test = \"Jit Vectorize\"\n",
682 | "print(test)\n",
683 | "for i, val in enumerate(test_values):\n",
684 | " if i >= 5:\n",
685 | " break\n",
686 | " print(f\"Testing {val}\")\n",
687 | " baseline_begin = counter()\n",
688 | " jit_vectorize_test(int(val))\n",
689 | " baseline_end = counter()\n",
690 | " results[test][i] = baseline_end-baseline_begin"
691 | ]
692 | },
693 | {
694 | "cell_type": "code",
695 | "execution_count": 14,
696 | "id": "33bd0e20-7824-4339-aba5-20ce7c746526",
697 | "metadata": {
698 | "slideshow": {
699 | "slide_type": "skip"
700 | },
701 | "tags": []
702 | },
703 | "outputs": [
704 | {
705 | "data": {
706 | "text/html": [
707 | "\n",
709 | "\n",
710 | " Speed of function in s (given number of operations)\n",
711 | " \n",
712 | " \n",
713 | " | \n",
714 | " 1 | \n",
715 | " 10 | \n",
716 | " 100 | \n",
717 | " 1_000 | \n",
718 | " 10_000 | \n",
719 | " 100_000 | \n",
720 | " 1_000_000 | \n",
721 | "
\n",
722 | " \n",
723 | " \n",
724 | " \n",
725 | " Pure Python | \n",
726 | " 0.490379 | \n",
727 | " 4.222536 | \n",
728 | " nan | \n",
729 | " nan | \n",
730 | " nan | \n",
731 | " nan | \n",
732 | " nan | \n",
733 | "
\n",
734 | " \n",
735 | " Jit Operation | \n",
736 | " 0.208173 | \n",
737 | " 0.015439 | \n",
738 | " 0.128556 | \n",
739 | " 1.074879 | \n",
740 | " nan | \n",
741 | " nan | \n",
742 | " nan | \n",
743 | "
\n",
744 | " \n",
745 | " Jit Apply | \n",
746 | " 0.085609 | \n",
747 | " 0.012046 | \n",
748 | " 0.110998 | \n",
749 | " 1.156075 | \n",
750 | " 10.003750 | \n",
751 | " nan | \n",
752 | " nan | \n",
753 | "
\n",
754 | " \n",
755 | " Jit Loop | \n",
756 | " 0.372415 | \n",
757 | " 0.000184 | \n",
758 | " 0.000087 | \n",
759 | " 0.000083 | \n",
760 | " 0.000138 | \n",
761 | " 0.000656 | \n",
762 | " 0.005486 | \n",
763 | "
\n",
764 | " \n",
765 | " Vectorize | \n",
766 | " 0.002910 | \n",
767 | " 0.012810 | \n",
768 | " 0.107374 | \n",
769 | " 0.993053 | \n",
770 | " 9.964653 | \n",
771 | " nan | \n",
772 | " nan | \n",
773 | "
\n",
774 | " \n",
775 | " Jit Vectorize | \n",
776 | " 0.001397 | \n",
777 | " 0.011649 | \n",
778 | " 0.100765 | \n",
779 | " 0.984344 | \n",
780 | " 10.869915 | \n",
781 | " nan | \n",
782 | " nan | \n",
783 | "
\n",
784 | " \n",
785 | "
\n"
786 | ],
787 | "text/plain": [
788 | ""
789 | ]
790 | },
791 | "execution_count": 14,
792 | "metadata": {},
793 | "output_type": "execute_result"
794 | }
795 | ],
796 | "source": [
797 | "results_df = pd.DataFrame.from_dict(results, orient=\"index\")\n",
798 | "results_df.columns = test_values\n",
799 | "results_df.style.set_caption(\"Speed of function in s (given number of operations)\")"
800 | ]
801 | },
802 | {
803 | "cell_type": "markdown",
804 | "id": "d42e1af0-1a87-467e-95a8-1ec6f8f3a8c8",
805 | "metadata": {
806 | "slideshow": {
807 | "slide_type": "slide"
808 | },
809 | "tags": []
810 | },
811 | "source": [
812 | "## 2. Cython"
813 | ]
814 | },
815 | {
816 | "cell_type": "markdown",
817 | "id": "df171326-8b9e-4a52-9893-76d8dca68e56",
818 | "metadata": {
819 | "slideshow": {
820 | "slide_type": "subslide"
821 | },
822 | "tags": []
823 | },
824 | "source": [
825 | "### Introduction"
826 | ]
827 | },
828 | {
829 | "cell_type": "markdown",
830 | "id": "65984cc1-da91-40c6-bcee-3d98c6c37878",
831 | "metadata": {
832 | "slideshow": {
833 | "slide_type": "fragment"
834 | },
835 | "tags": []
836 | },
837 | "source": [
838 | "- The Cython language is a superset of the Python language that additionally supports calling C functions and declaring C types on variables and class attributes. This allows the compiler to generate very efficient C code from Cython code. \n",
839 | "- Write Python code that calls back and forth from and to C or C++ code natively at any point.\n",
840 | "- Easily tune readable Python code into plain C performance by adding static type declarations, also in Python syntax."
841 | ]
842 | },
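{
"cell_type": "markdown",
"id": "cython-intro-example-md",
"metadata": {
"slideshow": {
"slide_type": "fragment"
},
"tags": []
},
"source": [
"A minimal sketch of the notebook workflow (assumes the `cython` package is installed and `%load_ext Cython` has been run first): the static type declarations are what let Cython emit a tight C loop."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cython-intro-example-code",
"metadata": {
"slideshow": {
"slide_type": "fragment"
},
"tags": []
},
"outputs": [],
"source": [
"%%cython\n",
"# Static C type declarations (cdef) let Cython compile this to a plain C loop.\n",
"def csum(int n):\n",
"    cdef long long total = 0\n",
"    cdef int i\n",
"    for i in range(n):\n",
"        total += i\n",
"    return total"
]
},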
843 | {
844 | "cell_type": "markdown",
845 | "id": "8a17851e-c814-4fe9-9db6-1dcc3050368b",
846 | "metadata": {
847 | "slideshow": {
848 | "slide_type": "subslide"
849 | },
850 | "tags": []
851 | },
852 | "source": [
853 | "### Demo "
854 | ]
855 | },
856 | {
857 | "cell_type": "markdown",
858 | "id": "c6d25112-8aca-4b97-802a-fc6c78769cbb",
859 | "metadata": {
860 | "slideshow": {
861 | "slide_type": "subslide"
862 | },
863 | "tags": []
864 | },
865 | "source": [
866 | "### Pros v Cons"
867 | ]
868 | },
869 | {
870 | "cell_type": "markdown",
871 | "id": "79e5d6cf-8133-43e1-9037-a1ccc566b3f0",
872 | "metadata": {
873 | "slideshow": {
874 | "slide_type": "fragment"
875 | },
876 | "tags": []
877 | },
878 | "source": [
879 | "Pros\n",
880 | "- Very fast.\n",
881 | "- Extensively supported.\n",
882 | "- Utilise C libaries.\n",
883 | "\n",
884 | "Cons\n",
885 | "- Need to learn how to write.\n",
886 | "- Difficult to optimise.\n",
887 | "- Difficulty also rises quickly with complexity."
888 | ]
889 | },
890 | {
891 | "cell_type": "markdown",
892 | "id": "10a03e96-389b-47e4-89ca-c83a652b39f3",
893 | "metadata": {
894 | "slideshow": {
895 | "slide_type": "slide"
896 | },
897 | "tags": []
898 | },
899 | "source": [
900 | "## 3. Packaging"
901 | ]
902 | },
903 | {
904 | "cell_type": "markdown",
905 | "id": "334e51be-a27a-4e65-af83-46e10abf29d8",
906 | "metadata": {
907 | "slideshow": {
908 | "slide_type": "subslide"
909 | },
910 | "tags": []
911 | },
912 | "source": [
913 | "### Boilerplate"
914 | ]
915 | },
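{
"cell_type": "markdown",
"id": "packaging-example-md",
"metadata": {
"slideshow": {
"slide_type": "fragment"
},
"tags": []
},
"source": [
"A sketch of the packaging side (hypothetical package and file names, mirroring the `setup.py` convention used by the boilerplates in this repo): Numba functions ship as ordinary Python source, while Cython modules are compiled to C extensions at build time."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "packaging-example-code",
"metadata": {
"slideshow": {
"slide_type": "fragment"
},
"tags": []
},
"outputs": [],
"source": [
"# setup.py -- illustrative sketch, not a drop-in file\n",
"from setuptools import setup, find_packages\n",
"from Cython.Build import cythonize\n",
"\n",
"setup(\n",
"    name=\"speedups\",  # hypothetical package name\n",
"    packages=find_packages(),\n",
"    # Numba-decorated functions need no build step: just depend on numba.\n",
"    install_requires=[\"numba\", \"cython\"],\n",
"    # Cython modules are compiled to C extensions when the package is built.\n",
"    ext_modules=cythonize(\"speedups/fast_ops.pyx\"),  # hypothetical .pyx module\n",
")"
]
},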
916 | {
917 | "cell_type": "markdown",
918 | "id": "a025d97d-2c50-4fa0-b8f8-277bf9b82037",
919 | "metadata": {
920 | "slideshow": {
921 | "slide_type": "slide"
922 | },
923 | "tags": []
924 | },
925 | "source": [
926 | "## 4. TensorFlow "
927 | ]
928 | },
929 | {
930 | "cell_type": "markdown",
931 | "id": "f15ab448-d130-42a4-abf9-f92c17c3eff4",
932 | "metadata": {
933 | "slideshow": {
934 | "slide_type": "subslide"
935 | },
936 | "tags": []
937 | },
938 | "source": [
939 | "### Demo"
940 | ]
941 | },
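{
"cell_type": "markdown",
"id": "tf-predict-example-md",
"metadata": {
"slideshow": {
"slide_type": "fragment"
},
"tags": []
},
"source": [
"The quick win from the topics list, as a minimal sketch (illustrative toy model; any Keras model behaves the same way): for small or repeated inputs, calling the model directly skips the per-call pipeline setup that `model.predict` performs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "tf-predict-example-code",
"metadata": {
"slideshow": {
"slide_type": "fragment"
},
"tags": []
},
"outputs": [],
"source": [
"import numpy as np\n",
"import tensorflow as tf\n",
"\n",
"# Illustrative toy model; the comparison holds for any compiled Keras model.\n",
"model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(10,))])\n",
"x = np.random.rand(1, 10).astype(\"float32\")\n",
"\n",
"model.predict(x)  # sets up a batched prediction loop each call: slow for single samples\n",
"model(x)          # direct call: much faster for small, repeated predictions"
]
},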
942 | {
943 | "cell_type": "markdown",
944 | "id": "60cec30e-fc95-40a2-be17-fb5caf498b4f",
945 | "metadata": {
946 | "slideshow": {
947 | "slide_type": "slide"
948 | },
949 | "tags": []
950 | },
951 | "source": [
952 | "# 5. Final questions"
953 | ]
954 | }
955 | ],
956 | "metadata": {
957 | "kernelspec": {
958 | "display_name": "Python 3 (ipykernel)",
959 | "language": "python",
960 | "name": "python3"
961 | },
962 | "language_info": {
963 | "codemirror_mode": {
964 | "name": "ipython",
965 | "version": 3
966 | },
967 | "file_extension": ".py",
968 | "mimetype": "text/x-python",
969 | "name": "python",
970 | "nbconvert_exporter": "python",
971 | "pygments_lexer": "ipython3",
972 | "version": "3.8.12"
973 | }
974 | },
975 | "nbformat": 4,
976 | "nbformat_minor": 5
977 | }
978 |
--------------------------------------------------------------------------------