├── .gitignore
├── LICENSE
├── README.md
├── project-boilerplates
│   ├── reinforcement-learning
│   │   ├── .envrc
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── requirements.txt
│   │   ├── rl_boilerplate
│   │   │   ├── __init__.py
│   │   │   ├── agent.py
│   │   │   ├── config.py
│   │   │   ├── environment.py
│   │   │   ├── main.py
│   │   │   └── network.py
│   │   └── setup.py
│   ├── sending-images-streamlit-fastapi
│   │   ├── README.md
│   │   ├── backend
│   │   │   ├── .dockerignore
│   │   │   ├── Dockerfile
│   │   │   ├── MANIFEST.in
│   │   │   ├── Makefile
│   │   │   ├── face_rec
│   │   │   │   ├── __init__.py
│   │   │   │   ├── face_detection.py
│   │   │   │   └── haarcascade_frontalface_default.xml
│   │   │   ├── fast_api
│   │   │   │   ├── __init__.py
│   │   │   │   └── api.py
│   │   │   ├── notebooks
│   │   │   │   ├── .ipynb_checkpoints
│   │   │   │   │   └── face_detection-checkpoint.ipynb
│   │   │   │   ├── face_detection.ipynb
│   │   │   │   └── haarcascade_frontalface_default.xml
│   │   │   ├── requirements.txt
│   │   │   └── setup.py
│   │   └── frontend
│   │       ├── Dockerfile
│   │       ├── app.py
│   │       └── requirements.txt
│   ├── time-series-cross-validator-challenge
│   │   ├── .challengifyignore
│   │   ├── .github
│   │   │   └── workflows
│   │   │       └── pythonpackage.yml
│   │   ├── .gitignore
│   │   ├── MANIFEST.in
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── notebooks
│   │   │   ├── test_package.ipynb
│   │   │   └── tutorial_ts_forecating.ipynb
│   │   ├── pytest.ini
│   │   ├── requirements.txt
│   │   ├── setup.py
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── conftest.py
│   │   │   ├── integrated
│   │   │   │   ├── test_main.py
│   │   │   │   └── test_model_performance.py
│   │   │   └── unittests
│   │   │       ├── test_data.py
│   │   │       └── test_model.py
│   │   └── ts_boilerplate
│   │       ├── __init__.py
│   │       ├── dataprep.py
│   │       ├── generate_dummy_data.py
│   │       ├── main.py
│   │       ├── metrics.py
│   │       ├── model.py
│   │       └── params.py
│   └── time-series-cross-validator
│       ├── .challengifyignore
│       ├── .github
│       │   └── workflows
│       │       └── pythonpackage.yml
│       ├── .gitignore
│       ├── MANIFEST.in
│       ├── Makefile
│       ├── README.md
│       ├── TODO.md
│       ├── notebooks
│       │   ├── WIP_tutorial_darts_library.ipynb
│       │   ├── test_package.ipynb
│       │   └── tutorial_ts_forecating.ipynb
│       ├── pytest.ini
│       ├── requirements.txt
│       ├── setup.py
│       ├── tests
│       │   ├── __init__.py
│       │   ├── conftest.py
│       │   ├── integrated
│       │   │   ├── test_main.py
│       │   │   └── test_model_performance.py
│       │   └── unittests
│       │       ├── test_data.py
│       │       └── test_model.py
│       └── ts_boilerplate
│           ├── __init__.py
│           ├── dataprep.py
│           ├── generate_dummy_data.py
│           ├── main.py
│           ├── metrics.py
│           ├── model.py
│           └── params.py
└── tutorials
    ├── .keep
    └── removing-bottlenecks
        ├── demo.ipynb
        ├── model.png
        ├── row_column_wise.png
        ├── slides.ipynb
        ├── slides.slides.html
        └── tensorflow.ipynb

-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | **/.DS_Store
2 | **/__pycache__
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Le Wagon
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 |
2 | ## What is this repo about?
3 |
4 | Le Wagon curates a selected list of useful **open-source templates** for data analysis / data science.
5 |
6 | ✅ These templates should be:
7 |
8 | - self-explanatory
9 | - replicable (provided Le Wagon [data-setup](https://github.com/lewagon/data-setup) has been done on the machine)
10 | - focused on one specific theme
11 |
12 | A theme could be:
13 | - A **project boilerplate**: a typical two-week project for four students, in the spirit of what is done at [Le Wagon Data Science bootcamp](https://www.lewagon.com/data-science-course/full-time)
14 |   - e.g. "Time-Series cross-validation boilerplate"
15 |   - e.g. "Reinforcement Learning boilerplate"
16 |   - e.g. "Image classification trainer + streamlit API boilerplate"
17 |   - ...
18 |
19 |
20 | - A **tutorial**: a guide focused on one specific topic worth sharing
21 |   - e.g. "Removing bottlenecks with Numba, Cython, and TensorFlow"
22 |
23 | ## How to contribute?
24 |
25 | Feel free to contribute by adding your suggestions.
26 |
27 | ### Submit a pull request with new templates or improve existing ones
28 | 👉 Fork this repository to your account, and submit pull requests following [the standard open-source contribution](https://jarv.is/notes/how-to-pull-request-fork-github/) methodology
29 |
30 | 👉 The Le Wagon team will check your PR and integrate it into the list if it passes quality standards
31 |
32 | ### Discuss features/improvements/suggestions
33 | 👉 https://github.com/lewagon/data-templates/discussions
34 |
-------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/.envrc: --------------------------------------------------------------------------------
1 | layout python3
2 |
-------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/.gitignore: --------------------------------------------------------------------------------
1 | *.egg-info/
2 | .coverage
3 | .ipynb_checkpoints
4 | **/*.DS_Store
5 | data_raw/
6 | data_processed/
7 | *.csv
8 | __pycache__/
9 | .env
10 | .direnv/
11 | .vscode/
12 |
-------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/README.md: --------------------------------------------------------------------------------
1 | This is a **boilerplate** repo for a reinforcement learning (RL) project.
2 |
3 | This directory provides an example repository structure for RL projects using PyTorch (a TensorFlow version of the agent and network is also included). This template provides a generic agent using the [deep Q-learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) algorithm, as well as an agent playing random actions for baseline performance. The DQN architecture is in its own class and is hot-swappable with other potential architectures.
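For instance, swapping in your own architecture could look like this (a minimal sketch; `MyCustomNet` is a hypothetical `torch.nn.Module` of your own, not part of this boilerplate):

```python
import torch

from rl_boilerplate import agent

# Build the DQN agent (8 observation dimensions, 4 actions, as in main.py)
agt = agent.DQNAgent_pt(8, 4)

# Hot-swap the Q-network: any torch.nn.Module mapping (batch, 8) -> (batch, 4) works
agt.net = MyCustomNet(8, 4)
# Rebind the optimizer so it tracks the new network's weights
agt.opt = torch.optim.Adam(agt.net.parameters(), lr=0.0001)
```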
A sample environment using [OpenAI's gym](https://github.com/openai/gym) and a generic control loop is also provided.
4 |
5 | Note that since RL projects are rarely data-centric, and data has to be generated on the fly, requirements are likely to differ from those of standard ML projects.
6 |
7 | # Detailed package workflow
8 |
9 | This boilerplate package contains multiple modules:
10 |
11 | - `main.py` is the entry point of the package. It defines the agent and environment to use.
12 | - `environment.py` defines environment-side setup and execution utilities. It uses the gym package for demonstration purposes.
13 | - `agent.py` defines multiple types of learning agents. We have included a random agent and a deep Q-learning agent for demonstration purposes.
14 | - `config.py` defines a singleton class used for storing simulation parameters. This class is globally available in all packages (through the `CFG` variable). It has to be initialized once (see module documentation).
15 | - `network.py` defines the neural network used by the DQN agent.
16 |
-------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/requirements.txt: --------------------------------------------------------------------------------
1 | # Update as needed
2 | torch
3 | tensorflow  # imported unconditionally by agent.py
4 | gymnasium
5 | gymnasium[box2d]
6 | tqdm
7 |
-------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/rl_boilerplate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/reinforcement-learning/rl_boilerplate/__init__.py -------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/rl_boilerplate/agent.py: --------------------------------------------------------------------------------
1 | """
2 | Agent module.
3 | """
4 |
5 | import random
6 | import torch
7 | import torch.nn
8 | import tensorflow as tf
9 |
10 | from rl_boilerplate import network
11 | from rl_boilerplate.config import CFG
12 |
13 |
14 | class Agent:
15 |     """
16 |     A learning agent parent class.
17 |     """
18 |
19 |     def __init__(self):
20 |         pass
21 |
22 |     def set(self):
23 |         """
24 |         Make the agent learn from a (s, a, r, s') tuple.
25 |         """
26 |         raise NotImplementedError
27 |
28 |     def get(self):
29 |         """
30 |         Request a next action from the agent.
31 |         """
32 |         raise NotImplementedError
33 |
34 |
35 | class RandomAgent(Agent):
36 |     """
37 |     A random playing agent class.
38 |     """
39 |
40 |     def set(self, obs_old, act, rwd, obs_new):
41 |         """
42 |         A random agent doesn't learn.
43 |         """
44 |         return
45 |
46 |     def get(self, obs_new, act_space):
47 |         """
48 |         Simply return a random action.
49 |         """
50 |         return act_space.sample()
51 |
52 |
53 | class DQNAgent_pt(Agent):
54 |     """
55 |     A basic PyTorch Deep Q-learning agent.
56 |     """
57 |
58 |     def __init__(self, x_dim, y_dim):
59 |         self.net = network.DQN_pt(x_dim, y_dim)
60 |         self.opt = torch.optim.Adam(self.net.parameters(), lr=0.0001)
61 |
62 |     def set(self, obs_old, act, rwd, obs_new):
63 |         """
64 |         Learn from a single observation sample.
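        The update minimizes the squared temporal-difference error:
            loss = (rwd + CFG.gamma * max_a' Q(obs_new, a') - Q(obs_old, act)) ** 2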
65 | """ 66 | obs_new = torch.tensor(obs_new) 67 | 68 | # We get the network output 69 | out = self.net(torch.tensor(obs_new))[act] 70 | 71 | # We compute the target 72 | with torch.no_grad(): 73 | exp = rwd + CFG.gamma * self.net(obs_new).max() 74 | 75 | # Compute the loss 76 | loss = torch.square(exp - out) 77 | 78 | # Perform a backward propagation. 79 | self.opt.zero_grad() 80 | loss.sum().backward() 81 | self.opt.step() 82 | 83 | def get(self, obs_new, act_space): 84 | """ 85 | Run an epsilon-greedy policy for next actino selection. 86 | """ 87 | # Return random action with probability epsilon 88 | if random.uniform(0, 1) < CFG.epsilon: 89 | return act_space.sample() 90 | # Else, return action with highest value 91 | with torch.no_grad(): 92 | # Get the values of all possible actions 93 | val = self.net(torch.tensor(obs_new)) 94 | # Choose the highest-values action 95 | return torch.argmax(val).numpy() 96 | 97 | class DQNAgent_tf(Agent): 98 | """ 99 | A basic tensorflow Deep Q-learning agent. 100 | """ 101 | 102 | def __init__(self, x_dim, y_dim): 103 | self.net = network.DQN_tf(x_dim, y_dim) 104 | self.opt = tf.optimizers.Adam(learning_rate=0.0001) 105 | 106 | def set(self, obs_old, act, rwd, obs_new): 107 | """ 108 | Learn from a single observation sample. 109 | """ 110 | 111 | obs_new = obs_new.reshape(1, -1) 112 | 113 | with tf.GradientTape() as tape: 114 | 115 | # We get the network output 116 | out = self.net(obs_new)[0, act] 117 | 118 | # We compute the target 119 | exp = rwd + CFG.gamma * tf.reduce_max(self.net(obs_new)) 120 | 121 | # Compute the loss 122 | loss = tf.square(exp - out) 123 | print(loss) 124 | 125 | grads = tape.gradient(loss, self.net.trainable_variables) 126 | self.opt.apply_gradients(zip(grads, self.net.trainable_variables)) 127 | 128 | def get(self, obs_new, act_space): 129 | """ 130 | Run an epsilon-greedy policy for next actino selection. 131 | """ 132 | # Return random action with probability epsilon 133 | if random.uniform(0, 1) < CFG.epsilon: 134 | return act_space.sample() 135 | # Else, return action with highest value 136 | with torch.no_grad(): 137 | return tf.argmax(self.net(obs_new.reshape(1, -1)), axis=1).numpy()[0] 138 | -------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/rl_boilerplate/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration Module. 3 | 4 | This module defines a singleton-type configuration class that can be used all across our project. This class can contain any parameter that one may want to change from one simulation run to the other. 5 | """ 6 | 7 | import random 8 | 9 | 10 | class Configuration: 11 | """ 12 | This configuration class is extremely flexible due to a two-step init process. We only instantiate a single instance of it (at the bottom if this file) so that all modules can import this singleton at load time. The second initialization (which happens in main.py) allows the user to input custom parameters of the config class at execution time. 13 | """ 14 | 15 | def __init__(self): 16 | """ 17 | Declare types but do not instantiate anything 18 | """ 19 | self.alpha = 0.2 20 | self.gamma = 0.98 21 | self.epsilon = 1.0 22 | self.rnd_seed = None 23 | self.agt_type = None 24 | 25 | def init(self, agt_type, **kwargs): 26 | """ 27 | User-defined configuration init. Mandatory to properly set all configuration parameters. 28 | """ 29 | 30 | # Mandatory arguments go here. 
31 |         self.agt_type = agt_type
32 |
33 |         # We set default values for arguments we have to define
34 |         self.rnd_seed = random.randint(0, 1000)
35 |         self.epsilon = 0.05
36 |
37 |         # However, these arguments can be overridden by passing them as keyword arguments to the init method. For instance, passing epsilon=0.1 as a kwarg to init will override the default value we just defined.
38 |         self.__dict__.update(kwargs)
39 |
40 |         # Once all values are properly set, use them.
41 |         random.seed(self.rnd_seed)
42 |
43 |
44 | CFG = Configuration()
45 |
-------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/rl_boilerplate/environment.py: --------------------------------------------------------------------------------
1 | """
2 | Environment module.
3 |
4 | This module contains the RL environment. We provide a gym setup by default, which can easily be replaced by other packages such as pettingzoo. Fundamentally, this module is used to simulate the environment and generate (s, a, r, s') tuples for the agent to learn from.
5 | """
6 |
7 | import gymnasium as gym
8 | from tqdm import tqdm
9 |
10 | from rl_boilerplate.config import CFG
11 |
12 | def get_env():
13 |     """
14 |     Returns a gym environment. Replace by a custom environment if needed.
15 |     """
16 |     # We use the LunarLander env. Other environments are available.
17 |     return gym.make("LunarLander-v2", render_mode="human")
18 |
19 |
20 | def run_env(env, agt, run_number):
21 |     """
22 |     Run a given environment with a given agent.
23 |     """
24 |
25 |     obs_old, info = env.reset(seed=CFG.rnd_seed)
26 |
27 |     # We get the action space.
28 |     act_space = env.action_space
29 |
30 |     print(f"Run number: {run_number + 1}")
31 |     for _ in range(1000):
32 |
33 |         # We visually render the learning environment. Disable rendering (and use render_mode=None in get_env) for better performance.
34 |         env.render()
35 |
36 |         # We request an action from the agent.
37 |         act = agt.get(obs_old, act_space)
38 |
39 |         # We apply the action on the environment.
40 |         obs_new, rwd, terminated, truncated, _ = env.step(act)
41 |
42 |         # We perform a learning step.
43 |         agt.set(obs_old, act, rwd, obs_new)
44 |
45 |         # Update latest observation
46 |         obs_old = obs_new
47 |
48 |         if terminated or truncated:
49 |             # Reset and keep the fresh observation as the new starting point.
50 |             obs_old, info = env.reset()
51 |
52 |     env.close()
53 |
-------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/rl_boilerplate/main.py: --------------------------------------------------------------------------------
1 | from rl_boilerplate import agent, environment
2 |
3 | from rl_boilerplate.config import CFG
4 |
5 | # We initialize our configuration class
6 | CFG.init("", rnd_seed=22)
7 |
8 | # We create an agent. State and action space sizes are hardcoded here (8 observation dimensions and 4 actions for LunarLander).
9 | agt = agent.DQNAgent_tf(8, 4)
10 |
11 | # Run a learning process
12 | for i in range(1000):
13 |     env = environment.get_env()
14 |     environment.run_env(env, agt, i)
15 |
-------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/rl_boilerplate/network.py: --------------------------------------------------------------------------------
1 | """
2 | Neural network module.
3 |
4 | This module defines architectures used by reinforcement learning agents.
5 | """
6 |
7 | import tensorflow as tf
8 | import torch
9 | import torch.nn
10 |
11 |
12 | class DQN_pt(torch.nn.Module):
13 |     """
14 |     PyTorch implementation of a Deep Q-Network with 3 linear layers.
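    The network maps an observation vector of size x_dim to one Q-value per action (y_dim outputs).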
15 |     x_dim refers to the number of dimensions to pass as input
16 |     y_dim refers to the action space of the agent
17 |     """
18 |
19 |     def __init__(self, x_dim, y_dim):
20 |         super().__init__()
21 |
22 |         self.net = torch.nn.Sequential(
23 |             torch.nn.Linear(x_dim, 128),
24 |             torch.nn.ReLU(inplace=True),
25 |             torch.nn.Linear(128, 128),
26 |             torch.nn.ReLU(inplace=True),
27 |             # No activation on the output layer: Q-values can be negative.
28 |             torch.nn.Linear(128, y_dim),
29 |         )
30 |
31 |     def forward(self, obs):
32 |         return self.net(obs)
33 |
34 | class DQN_tf(tf.keras.Model):
35 |     """
36 |     TensorFlow implementation of a Deep Q-Network with 3 linear layers.
37 |     x_dim refers to the number of dimensions to pass as input
38 |     y_dim refers to the action space of the agent
39 |     """
40 |
41 |     def __init__(self, x_dim, y_dim):
42 |         super().__init__()
43 |         self.layer1 = tf.keras.layers.Dense(128, activation="relu", input_shape=(x_dim,))
44 |         self.layer2 = tf.keras.layers.Dense(128, activation="relu")
45 |         # Linear output layer, since Q-values are unbounded.
46 |         self.layer3 = tf.keras.layers.Dense(y_dim)
47 |
48 |     def call(self, obs):
49 |         x = self.layer1(obs)
50 |         x = self.layer2(x)
51 |         return self.layer3(x)
52 |
-------------------------------------------------------------------------------- /project-boilerplates/reinforcement-learning/setup.py: --------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 |
4 | with open('requirements.txt') as f:
5 |     content = f.readlines()
6 | requirements = [x.strip() for x in content if 'git+' not in x]
7 |
8 | setup(name='rl-boilerplate',
9 |       version="1.0",
10 |       description="Trainer Boilerplate for Reinforcement Learning Projects",
11 |       packages=find_packages(),
12 |       install_requires=requirements,
13 |       include_package_data=True,
14 |       zip_safe=False)
15 |
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/README.md: --------------------------------------------------------------------------------
1 | # Simple Face Annotation with Streamlit, FastAPI and OpenCV
2 |
3 | This is a boilerplate for any project that involves sending an image from a web UI to an API, performing some manipulation on the image, and sending it back. The example taken here is a simple face recognition app using `OpenCV`'s built-in [Haar Cascade object detection algorithm](https://pyimagesearch.com/2021/04/12/opencv-haar-cascades/).
4 |
5 | ### What's here:
6 |
7 | * [Streamlit](https://docs.streamlit.io/) on the frontend
8 | * [FastAPI](https://fastapi.tiangolo.com/) on the backend
9 | * [PIL/pillow](https://pillow.readthedocs.io/en/stable/) and [opencv-python](https://github.com/opencv/opencv-python) for working with images
10 | * Backend and frontend can be deployed with Docker
11 |
12 | ### Using this template
13 |
14 | > From inside the `backend` folder:
15 |
16 | You can serve the API with `uvicorn fast_api.api:app --reload` (default port is `8000`)
17 |
18 | > From inside the `frontend` folder:
19 |
20 | You can serve the frontend with `streamlit run app.py` (default port is `8501`)
21 |
22 | ### Using this template with Docker
23 |
24 | Both the `frontend` and `backend` have corresponding `Dockerfile`s for the web UI and API.
25 |
26 | 1. To create a Docker image, inside the corresponding folders run `docker build -t NAME_FOR_THE_IMAGE .`
27 | 2. Run a container for either the API or the UI with `docker run -p MACHINE_PORT:CONTAINER_PORT NAME_FOR_THE_IMAGE`
28 |
29 | Here, `MACHINE_PORT` is the `localhost` port you want to link to the container, while `CONTAINER_PORT` is the port that will be used by the running app in the container.
30 |
31 |
32 | 3. ❗ You won't be able to reach the API container through `localhost`; you'll need to [link](https://docs.docker.com/network/links/) the containers:
33 |
34 |    * **API:** `docker run -p 8000:8000 --name api NAME_FOR_THE_API_IMAGE` (note that `--name` must come before the image name)
35 |    * **UI:** `docker run -p 8501:8501 --link api:api NAME_FOR_THE_UI_IMAGE`
36 |
37 | This way you can use `api` instead of `localhost` to reach the API container from the frontend.
38 |
39 | ❗ Note that Docker docs mention that `--link` might be removed in the future (as of 2022.06). Alternatives can be [user-defined bridges](https://docs.docker.com/network/bridge/#differences-between-user-defined-bridges-and-the-default-bridge) or [Docker Compose](https://docs.docker.com/compose/)
40 |
41 | Have fun!
42 |
43 |
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/.dockerignore: --------------------------------------------------------------------------------
1 | /notebooks
2 | /__pycache__
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/Dockerfile: --------------------------------------------------------------------------------
1 | FROM python:3.8.12-buster
2 |
3 | WORKDIR /app
4 |
5 | # libraries required by OpenCV
6 | RUN apt-get update
7 | RUN apt-get install \
8 |     'ffmpeg'\
9 |     'libsm6'\
10 |     'libxext6' -y
11 |
12 | COPY requirements.txt .
13 | RUN pip install -r requirements.txt
14 |
15 | COPY . .
16 |
17 | # PORT must be provided as an env variable at runtime (e.g. Cloud Run sets it automatically)
18 | CMD uvicorn fast_api.api:app --host 0.0.0.0 --port $PORT
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/MANIFEST.in: --------------------------------------------------------------------------------
1 | include requirements.txt
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/Makefile: --------------------------------------------------------------------------------
1 | ##### Prediction API - - - - - - - - - - - - - - - - - - - - - - - - -
2 |
3 | run_api:
4 | 	uvicorn fast_api.api:app --reload
5 |
6 | ##### Docker - - - - - - - - - - - - - - - - - - - - - - - - -
7 |
8 | docker_build:
9 | 	docker build -t template-image-api .
10 |
11 | docker_run:
12 | 	docker run -p 8000:8000 --name api template-image-api
13 |
14 | ##### GCP - - - - - - - - - - - - - - - - - - - - - - - - -
15 |
16 | GCP_PROJECT_ID=XXX
17 |
18 | DOCKER_IMAGE_NAME=XXX
19 |
20 | # https://cloud.google.com/storage/docs/locations#location-mr
21 | GCR_MULTI_REGION=XXX
22 |
23 | # https://cloud.google.com/compute/docs/regions-zones#available
24 | REGION=XXX
25 |
26 | build_gcr_image:
27 | 	docker build -t $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME) .
28 |
29 | build_gcr_image_m1:
30 | 	docker build --platform linux/amd64 -t $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME) .
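# For reference, the variables above might be filled in like this (hypothetical values, replace with your own GCP settings):
#   GCP_PROJECT_ID=my-gcp-project
#   DOCKER_IMAGE_NAME=face-rec-api
#   GCR_MULTI_REGION=eu.gcr.io
#   REGION=europe-west1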
31 |
32 | run_gcr_image:
33 | 	docker run -e PORT=8000 -p 8080:8000 $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME)
34 |
35 | push_gcr_image:
36 | 	docker push $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME)
37 |
38 | gcr_deploy:
39 | 	gcloud run deploy --image $(GCR_MULTI_REGION)/$(GCP_PROJECT_ID)/$(DOCKER_IMAGE_NAME) --platform managed --region $(REGION)
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/face_rec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/sending-images-streamlit-fastapi/backend/face_rec/__init__.py -------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/face_rec/face_detection.py: --------------------------------------------------------------------------------
1 | import cv2
2 |
3 | def annotate_face(img_np_array):
4 |     """
5 |     Detect and annotate faces with a red square.
6 |     `img_np_array` should be a (height, width, 3) shape np.array
7 |     """
8 |     # Load the default cascade from OpenCV
9 |     face_cascade = cv2.CascadeClassifier('face_rec/haarcascade_frontalface_default.xml')
10 |
11 |     # Detect faces
12 |     faces = face_cascade.detectMultiScale(img_np_array, 1.1, 4)
13 |
14 |     # Draw a rectangle around the faces
15 |     for (x, y, w, h) in faces:
16 |         cv2.rectangle(img_np_array, (x, y), (x+w, y+h), (0, 0, 255), 2)
17 |
18 |     # return image numpy array
19 |     return img_np_array
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/fast_api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/sending-images-streamlit-fastapi/backend/fast_api/__init__.py -------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/fast_api/api.py: --------------------------------------------------------------------------------
1 | from fastapi import FastAPI, UploadFile, File
2 | from fastapi.middleware.cors import CORSMiddleware
3 | from starlette.responses import Response
4 |
5 | import numpy as np
6 | import cv2
7 | import io
8 | from face_rec.face_detection import annotate_face
9 |
10 | app = FastAPI()
11 |
12 | # # Allow all requests (optional, good for development purposes)
13 | # app.add_middleware(
14 | #     CORSMiddleware,
15 | #     allow_origins=["*"],  # Allows all origins
16 | #     allow_credentials=True,
17 | #     allow_methods=["*"],  # Allows all methods
18 | #     allow_headers=["*"],  # Allows all headers
19 | # )
20 |
21 | @app.get("/")
22 | def index():
23 |     return {"status": "ok"}
24 |
25 | @app.post('/upload_image')
26 | async def receive_image(img: UploadFile=File(...)):
27 |     ### Receiving and decoding the image
28 |     contents = await img.read()
29 |
30 |     nparr = np.frombuffer(contents, np.uint8)  # np.fromstring is deprecated
31 |     cv2_img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)  # type(cv2_img) => numpy.ndarray
32 |
33 |     ### Do cool stuff with your image... for example, face detection:
34 |     annotated_img = annotate_face(cv2_img)
35 |
36 |     ### Encoding and responding with the image
37 |     im = cv2.imencode('.png', annotated_img)[1]  # extension depends on which format is sent from Streamlit
38 |     return Response(content=im.tobytes(), media_type="image/png")
39 |
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/requirements.txt: --------------------------------------------------------------------------------
1 | # packaging
2 | pip>=9
3 | setuptools>=26
4 | twine
5 | wheel>=0.29
6 |
7 | # data science
8 | six
9 | numpy
10 |
11 | # api
12 | fastapi
13 | uvicorn
14 | python-multipart
15 |
16 | # img
17 | opencv-python
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/backend/setup.py: --------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 |
4 | with open('requirements.txt') as f:
5 |     content = f.readlines()
6 | requirements = [x.strip() for x in content if 'git+' not in x]
7 |
8 | setup(name='FaceRecApi',
9 |       version="1.0",
10 |       description="Simple FastAPI with face recognition",
11 |       packages=find_packages(),
12 |       # include_package_data: to install data from MANIFEST.in
13 |       include_package_data=True,
14 |       install_requires=requirements)
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/frontend/Dockerfile: --------------------------------------------------------------------------------
1 | FROM python:3.8.12-buster
2 |
3 | WORKDIR /app
4 |
5 | COPY requirements.txt .
6 | RUN pip install -r requirements.txt
7 |
8 | COPY . .
9 |
10 | # You can add --server.port $PORT if you need to set PORT as a specific env variable
11 | CMD streamlit run app.py
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/frontend/app.py: --------------------------------------------------------------------------------
1 | import streamlit as st
2 | from PIL import Image
3 | import requests
4 | from dotenv import load_dotenv
5 | import os
6 |
7 | # Set page tab display
8 | st.set_page_config(
9 |     page_title="Simple Image Uploader",
10 |     page_icon='🖼️',
11 |     layout="wide",
12 |     initial_sidebar_state="expanded",
13 | )
14 |
15 | # Example local Docker container URL
16 | # url = 'http://api:8000'
17 | # Example localhost development URL
18 | # url = 'http://localhost:8000'
19 | load_dotenv()
20 | url = os.getenv('API_URL')
21 |
22 |
23 | # App title and description
24 | st.header('Simple Image Uploader 📸')
25 | st.markdown('''
26 |             > This is a Le Wagon boilerplate for any data science project that involves exchanging images between a Python API and a simple web frontend.
27 |
28 |             > **What's here:**
29 |
30 |             > * [Streamlit](https://docs.streamlit.io/) on the frontend
31 |             > * [FastAPI](https://fastapi.tiangolo.com/) on the backend
32 |             > * [PIL/pillow](https://pillow.readthedocs.io/en/stable/) and [opencv-python](https://github.com/opencv/opencv-python) for working with images
33 |             > * Backend and frontend can be deployed with Docker
34 |             ''')
35 |
36 | st.markdown("---")
37 |
38 | ### Create a native Streamlit file upload input
39 | st.markdown("### Let's do a simple face recognition 👇")
40 | img_file_buffer = st.file_uploader('Upload an image')
41 |
42 | if img_file_buffer is not None:
43 |
44 |     col1, col2 = st.columns(2)
45 |
46 |     with col1:
47 |         ### Display the image user uploaded
48 |         st.image(Image.open(img_file_buffer), caption="Here's the image you uploaded ☝️")
49 |
50 |     with col2:
51 |         with st.spinner("Wait for it..."):
52 |             ### Get bytes from the file buffer
53 |             img_bytes = img_file_buffer.getvalue()
54 |
55 |             ### Make request to API
56 |             res = requests.post(url + "/upload_image", files={'img': img_bytes})
57 |
58 |             if res.status_code == 200:
59 |                 ### Display the image returned by the API
60 |                 st.image(res.content, caption="Image returned from API ☝️")
61 |             else:
62 |                 st.markdown("**Oops**, something went wrong 😓 Please try again.")
63 |                 print(res.status_code, res.content)
64 |
-------------------------------------------------------------------------------- /project-boilerplates/sending-images-streamlit-fastapi/frontend/requirements.txt: --------------------------------------------------------------------------------
1 | # packaging
2 | pip>=9
3 | setuptools>=26
4 | twine
5 | wheel>=0.29
6 |
7 | # data science
8 | six
9 |
10 | # img
11 | pillow
12 |
13 | # web
14 | streamlit
15 | requests
16 | python-dotenv
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/.challengifyignore: --------------------------------------------------------------------------------
1 | TODO.md
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/.github/workflows/pythonpackage.yml: --------------------------------------------------------------------------------
1 |
2 | # 🤖 usage
3 | #
4 | # this file contains the conf for GitHub Continuous Integration
5 | # and Continuous Deployment to Heroku
6 | #
7 | # in order to activate the tests in GitHub CI:
8 | # - uncomment the content of the CI paragraph (lines 41-55)
9 | # - create some tests in the tests/ directory
10 | #
11 | # in order to activate CD to Heroku:
12 | # - activate the tests in GitHub CI
13 | # - uncomment the content of the CD paragraph (lines 57-75)
14 |
15 | name: Python package
16 |
17 | on:
18 |   push:
19 |     branches: [ master ]
20 |   pull_request:
21 |     branches: [ master ]
22 |
23 | jobs:
24 |
25 |   # 🤖 CI paragraph
26 |   #
27 |   # uncomment the content of this paragraph to activate the tests in GitHub CI
28 |   # - remove the 2 leading characters "# ", do not change the spaces
29 |   #   (the `name` keys should be at the same level as the `uses` key)
30 |   #   (the `strategy` key should be at the same level as the `steps` key)
31 |
32 |   build:
33 |
34 |     runs-on: ubuntu-latest
35 |
36 |     steps:
37 |     - uses: actions/checkout@v2
38 |     - name: Say hello
39 |       run: |
40 |         echo "Hello, World!"
41 |     # - name: Set up Python ${{ matrix.python-version }}
42 |     #   uses: actions/setup-python@v1
43 |     #   with:
44 |     #     python-version: ${{ matrix.python-version }}
45 |     # - name: Install dependencies
46 |     #   run: |
47 |     #     python -m pip install --upgrade pip
48 |     #     pip install -r requirements.txt
49 |     # - name: Install package and test
50 |     #   run: |
51 |     #     make install test clean
52 |
53 |     # strategy:
54 |     #   matrix:
55 |     #     python-version: [3.8]
56 |
57 |   # # 🤖 CD paragraph
58 |   # #
59 |   # # uncomment the following lines to activate CD to Heroku
60 |   # # - remove the 2 leading characters "# ", do not change the spaces
61 |   # #   (there should be 2 spaces before the `deploy_heroku` key)
62 |   # # - keep in mind you also need to configure Heroku HEROKU_API_KEY and HEROKU_EMAIL in GitHub secrets
63 |   # # - and replace REPLACE_WITH_YOUR_HEROKU_APP_NAME in this file with the name of your Heroku app
64 |
65 |   # deploy_heroku:
66 |   #   needs: build
67 |   #   runs-on: ubuntu-latest
68 |
69 |   #   steps:
70 |   #   - uses: actions/checkout@v2
71 |   #   - uses: akhileshns/heroku-deploy@v3.0.4 # This is the action
72 |   #     with:
73 |   #       heroku_api_key: ${{secrets.HEROKU_API_KEY}}
74 |   #       heroku_app_name: "REPLACE_WITH_YOUR_HEROKU_APP_NAME" # Must be unique in Heroku
75 |   #       heroku_email: ${{secrets.HEROKU_EMAIL}}
76 |
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/.gitignore: --------------------------------------------------------------------------------
1 | *.egg-info/
2 | .coverage
3 | .ipynb_checkpoints
4 | **/*.DS_Store
5 | data_raw/
6 | data_processed/
7 | *.csv
8 | __pycache__/
9 | .env
10 | .vscode/
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/MANIFEST.in: --------------------------------------------------------------------------------
1 | include requirements.txt
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/Makefile: --------------------------------------------------------------------------------
1 | # TODO
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/README.md: --------------------------------------------------------------------------------
1 | This is a **boilerplate** repo for a machine-learning project involving **Time Series forecasting**.
2 |
3 | In particular:
4 |
5 | - It provides a **cross-validation framework** to ensure models are tested thoroughly and without data leakage
6 | - It is agnostic of the type of model involved
7 | - It is well suited for short research projects, typical of few-week coding bootcamps such as Le Wagon Data Science
8 |
9 | # Detailed package workflow
10 |
11 | ## Architecture
12 | - `ts_boilerplate` package
13 |   - `main.py` comprises the main routes to be called from the CLI (`train`, `cross-validate`, `backtest`)
14 |   - `params.py` contains project-level global variables to be set manually
15 |
16 |
17 | - `data` folder contains
18 |   - `raw` and `clean` folders, which should contain **2D time-series arrays `data`**, with axis 0 representing integer timesteps, and axis 1 holding the target and covariate columns, as per [picture](https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true)
19 | ```python
20 | data.shape = (length, n_targets+n_covariates)
21 | ```
22 |   - `Xy`, which may persist your tuple (X, y) of **3D array** training sets to be fed to your models, if you want to store them and avoid preprocessing multiple times.
23 | ```python
24 | X.shape = (n_samples, input_length, n_covariates)
25 | y.shape = (n_samples, output_length, n_targets)
26 | ```
27 | - `notebooks`
28 |   - `test_package.ipynb` will help you understand how the package and the tests have been built.
29 |   - `tutorial_ts_forecasting.ipynb` is a recommended read before diving into this project. It contains visuals that will help you fill in global project params and understand naming conventions
30 |
31 |
32 |
33 | - `tests` folder detailed below
34 |
35 | ## How to test your code?
36 | First of all, fill in `ts_boilerplate/params.py` to match your true project specificities
37 |
38 | Then, run this in your terminal from the root project folder to check your code
39 | - `pytest`
40 | - `pytest -m "not optional"` to only check mandatory tests
41 | - `pytest -m "not optional" -m "not slow"` to also avoid tests that may be slow (involving fitting your model)
42 |
43 |
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/notebooks/test_package.ipynb: --------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Step by step guide to Unit Tests used in this project"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": 1,
13 |    "metadata": {
14 |     "ExecuteTime": {
15 |      "end_time": "2022-03-14T14:24:55.279012Z",
16 |      "start_time": "2022-03-14T14:24:53.949004Z"
17 |     }
18 |    },
19 |    "outputs": [],
20 |    "source": [
21 |     "import numpy as np\n",
22 |     "import pandas as pd\n",
23 |     "import matplotlib.pyplot as plt\n",
24 |     "import os\n",
25 |     "from ts_boilerplate.params import ROOT_DIR, DATA, TRAIN, CROSS_VAL\n",
26 |     "from ts_boilerplate.dataprep import get_X_y, get_folds, train_test_split, get_Xi_yi\n",
27 |     "from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase, generate_data_zeros_and_ones, generate_X_y_zeros_and_ones\n",
28 |     "from ts_boilerplate.model import get_model, fit_model, predict_output\n",
29 |     "from ts_boilerplate.metrics import mape\n",
30 |     "\n",
31 |     "%load_ext autoreload\n",
32 |     "%autoreload 2"
33 |    ]
34 |   },
35 |   {
36 |    "cell_type": "markdown",
37 |    "metadata": {},
38 |    "source": [
39 |     "## 1) `generate_dummy_data.py`"
40 |    ]
41 |   },
42 |   {
43 |    "cell_type": "markdown",
44 |    "metadata": {},
45 |    "source": [
46 |     "Let's create a dummy time series dataset whose value increments by 1 every day"
47 |    ]
48 |   },
49 |   {
50 |    "cell_type": "code",
51 |    "execution_count": 2,
52 |    "metadata": {},
53 |    "outputs": [],
54 |    "source": [
55 |     "data = generate_data_monotonic_increase()\n",
56 |     "data"
57 |    ]
58 |   },
59 |   {
60 |    "cell_type": "code",
61 |    "execution_count": 3,
62 |    "metadata": {
63 |     "ExecuteTime": {
64 |      "end_time": "2022-03-14T14:25:19.973275Z",
65 |      "start_time": "2022-03-14T14:25:19.950901Z"
66 |     }
67 |    },
68 |    "outputs": [],
69 |    "source": [
70 |     "# Store as CSV\n",
71 |     "data_df = pd.DataFrame(data)\n",
72 |     "data_df.to_csv(os.path.join(ROOT_DIR, \"data\", \"dummy\", \"data_dummy.csv\"), index=False)\n",
73 |     "pd.read_csv(os.path.join(ROOT_DIR, \"data\", \"dummy\", \"data_dummy.csv\"))"
74 |    ]
75 |   },
76 |   {
77 |    "cell_type": "markdown",
78 |    "metadata": {},
79 |    "source": [
80 |     "## 2) `dataprep.py`"
81 |    ]
82 |   },
83 |   {
84 |    "cell_type": "markdown",
85 |    "metadata": {},
86 |    "source": [
87 |     "### 2.1) `get_X_y`"
88 |    ]
89 |   },
90 |   {
91 |    "cell_type": "code",
92 |    "execution_count": 10,
93 |    "metadata": {},
94 |    "outputs": [],
95 |    "source": [
96 |     "X, y = get_X_y(data, **TRAIN)\n",
97 |     "print(X.shape)\n",
98 |     "print(y.shape)"
99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 11,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# Let's compute the shape arithmetically (for unittests)\n",
108 |     "(len(data) \\\n",
109 |     " - (TRAIN['input_length'] -1) \\\n",
110 |     " - 
(TRAIN['output_length'] -1) \\\n", 111 | " - TRAIN['horizon']) \\\n", 112 | " / TRAIN[\"stride\"]" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "☝️ ceiling rounding function should be used for stride > 1" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### 2.2) `train_test_split`" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 12, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "train_test_ratio = TRAIN[\"train_test_ratio\"]\n", 136 | "input_length = TRAIN[\"input_length\"]\n", 137 | "output_length = TRAIN[\"output_length\"]\n", 138 | "data.shape" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 13, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "last_train_idx = round(train_test_ratio * len(data))\n", 148 | "data_train = data[0:last_train_idx, :]\n", 149 | "\n", 150 | "first_test_idx = last_train_idx - input_length\n", 151 | "data_test = data[first_test_idx:, :]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 14, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "data_train" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "data_test" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n", 179 | "X_test, y_test = get_X_y(data_test, **TRAIN)\n", 180 | "\n", 181 | "print(\"####### Last train pair\")\n", 182 | "print(X_train[-1])\n", 183 | "print(y_train[-1])\n", 184 | "print(\"####### First test pair\")\n", 185 | "print(X_test[0])\n", 186 | "print(y_test[0])" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 17, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "gap = np.min(y_test) - np.max(y_train)\n", 196 | "gap" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 18, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "assert gap >= TRAIN[\"horizon\"], \"❗️❗️ Data leak detected between (X_train, y_train) and (X_test, y_test)❗️❗️ \"" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "### 2.3) `get_folds`" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "folds = get_folds(data, **CROSS_VAL)\n", 222 | "print('n_folds= ', len(folds))\n", 223 | "print(folds[-1])" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "## 3) `model.py`" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 27, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "data_train, data_test = train_test_split(data, **TRAIN)\n", 240 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n", 241 | "X_test, y_test = get_X_y(data_test, **TRAIN)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 28, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "import tensorflow as tf\n", 251 | "from keras.models import Model\n", 252 | "from keras.layers import Dense, SimpleRNN, Reshape, Lambda, Input" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 18, 
258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# BASELINE: PREDICT LAST VALUE - ZERO TRAINABLE WEIGHTS\n", 262 | "input = Input(shape=X_train.shape[1:])\n", 263 | "# Take last temporal values of the targets, and duplicate it as many times as `output_length`\n", 264 | "x = Lambda(\n", 265 | " lambda x: tf.repeat(tf.expand_dims(tf.gather(x[:, -1, :], indices=DATA['target_column_idx'], axis=1), axis=1),\n", 266 | " repeats=TRAIN['output_length'],\n", 267 | " axis=1))(input)\n", 268 | "output = Reshape(y_train.shape[1:])(x)\n", 269 | "model = Model(input, output)\n", 270 | "model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), metrics=tf.keras.metrics.MAPE)\n", 271 | "model.summary()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 19, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',\n", 281 | " patience=2,\n", 282 | " verbose=0,\n", 283 | " mode='min',\n", 284 | " restore_best_weights=True)\n", 285 | "history = model.fit(X_train,\n", 286 | " y_train,\n", 287 | " epochs=50,\n", 288 | " batch_size=16,\n", 289 | " validation_split=0.3,\n", 290 | " callbacks=[es],\n", 291 | " verbose=0)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 29, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "from ts_boilerplate.metrics import mape\n", 301 | "\n", 302 | "y_pred = model.predict(X_test)\n", 303 | "mape(y_test, y_pred)\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## 4) `main.py`\n" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "### 4.1) `train()`" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 10, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "data = generate_data_monotonic_increase()\n", 327 | "data_train, data_test = train_test_split(data, **TRAIN)\n", 328 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n", 329 | "X_test, y_test = get_X_y(data_test, **TRAIN)\n", 330 | "model = get_model(X_train, y_train)\n", 331 | "history = fit_model(model, X_train, y_train)\n", 332 | "y_pred = predict_output(model, X_test)\n", 333 | "metrics_test = mape(y_test, y_pred)\n", 334 | "\n", 335 | "print(\"### Test Metric: \", metrics_test)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "### 4.2) cross_validate()" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "### 4.1) `backtesting()`" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 20, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "y_pred_backtest = []" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 21, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "data = generate_data_monotonic_increase()\n", 375 | "from ts_boilerplate.model import get_model, fit_model, predict_output\n", 376 | "from ts_boilerplate.dataprep import get_Xi_yi" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 25, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "stride = 10\n", 386 | "start_ratio:float = 0.8\n", 387 | "retrain: bool = True\n", 388 | 
"retrain_every: int = 50" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 30, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "from tqdm.notebook import tqdm\n", 398 | "\n", 399 | "# Initialization\n", 400 | "start_timestep_0 = round(start_ratio * len(data))\n", 401 | "data_train_0 = data[:start_timestep_0, ...]\n", 402 | "X_train_tmp, y_train_tmp = get_X_y(data_train_0, **TRAIN)\n", 403 | "data_test_backtested = data[start_timestep_0:, ...]\n", 404 | "_, y_test = get_X_y(data_test_backtested, **TRAIN, shuffle=False)\n", 405 | "y_pred_backtested = []\n", 406 | "retrain_counter = 0\n", 407 | "timesteps_backtested_list = []\n", 408 | "\n", 409 | "for i in tqdm(range(0, len(data_test_backtested), stride)):\n", 410 | " start_timestep_i = start_timestep_0 + i\n", 411 | " data_train = data[:start_timestep_i, ...]\n", 412 | " data_test = data[start_timestep_i:, ...]\n", 413 | " X_train_tmp, y_train_tmp = get_X_y(data_train, **TRAIN)\n", 414 | " X_test_i, y_test_i = get_Xi_yi(first_index=0, data=data_test, **TRAIN)\n", 415 | "\n", 416 | " # At some point after sliding through time, we will reach the end of the test set\n", 417 | " if y_test_i.shape[0] < y_train_tmp.shape[1]:\n", 418 | " break\n", 419 | "\n", 420 | " model = get_model(X_train_tmp, y_train_tmp)\n", 421 | "\n", 422 | " # Retrain when required, with incremental learning (ie. starting from previous weights)\n", 423 | " if retrain and i % retrain_every == 0:\n", 424 | " retrain_counter += 1\n", 425 | " fit_model(model, X_train_tmp, y_train_tmp)\n", 426 | "\n", 427 | " y_pred_i = np.squeeze(predict_output(model, X_test_i[None, ...]))\n", 428 | " y_pred_backtested.append(y_pred_i)\n", 429 | " timesteps_backtested_list.append(i)\n", 430 | "\n", 431 | "y_pred_backtested = np.array(y_pred_backtested)\n", 432 | "y_test_backtested = y_test[timesteps_backtested_list]\n", 433 | "# Check that we compare apples to apples\n", 434 | "assert y_pred_backtested.shape == y_test_backtested.shape\n", 435 | "\n", 436 | "metrics_backtested = mape(y_pred_backtested, y_test_backtested)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 31, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "print(\n", 446 | " f'### BACKETESTED METRICS BASED ON THE LAST {y_pred_backtested.shape[0]} TIMESTEPS AND WITH {retrain_counter} retrain operations'\n", 447 | ")\n", 448 | "print(mape(y_pred_backtested, y_test_backtested))\n" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 32, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# TODO: make it work for any dimension of y\n", 458 | "plt.plot(y_pred_backtested[:,0,0], label='historical forecasts')\n", 459 | "plt.plot(y_test_backtested[:,0,0], label='truth')\n", 460 | "plt.xlabel('timesteps')\n", 461 | "plt.legend()\n", 462 | "plt.show()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [] 471 | } 472 | ], 473 | "metadata": { 474 | "interpreter": { 475 | "hash": "572b4e543617d03e90ecaf525e08695da1ff29b13594f787e33b342cf572f792" 476 | }, 477 | "kernelspec": { 478 | "display_name": "Python 3 (ipykernel)", 479 | "language": "python", 480 | "name": "python3" 481 | }, 482 | "language_info": { 483 | "codemirror_mode": { 484 | "name": "ipython", 485 | "version": 3 486 | }, 487 | "file_extension": ".py", 488 | "mimetype": "text/x-python", 489 | "name": "python", 490 | "nbconvert_exporter": "python", 491 | 
"pygments_lexer": "ipython3", 492 | "version": "3.8.12" 493 | }, 494 | "toc": { 495 | "base_numbering": 1, 496 | "nav_menu": {}, 497 | "number_sections": false, 498 | "sideBar": true, 499 | "skip_h1_title": false, 500 | "title_cell": "Table of Contents", 501 | "title_sidebar": "Contents", 502 | "toc_cell": false, 503 | "toc_position": {}, 504 | "toc_section_display": true, 505 | "toc_window_display": false 506 | } 507 | }, 508 | "nbformat": 4, 509 | "nbformat_minor": 2 510 | } 511 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/notebooks/tutorial_ts_forecating.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Recap\n", 8 | "\n", 9 | "We will go through the main issues you will face when working with Recurrent Neural Networks that are designed to deal with time-series" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Part 1: How to make a proper Time Series Split ?" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Let's imagine your `data` as 2D array structured as follows\n", 24 | "\n", 25 | "`data.shape = (n_timesteps, n_features)`\n", 26 | "\n", 27 | "`features` can be separated into 3 categories\n", 28 | "- targets\n", 29 | "- past-covariates\n", 30 | "- future-covariates" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### 1.1) First, create many **FOLDS** for your cross-validation" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "`fold_1.shape = (n_timesteps_per_fold, n_features)` as 2D arrays \n", 52 | "`fold_2.shape = (n_timesteps_per_fold, n_features)` as 2D arrays" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Always split your training set *chronologically before* your test set\n", 60 | "\n", 61 | "👇 e.g. 4-time cross validation split" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Create as many folds as needed to clearly test all type of past conditions \n", 76 | "(e.g crash markets periods 📉, bull-run markets 📈, flat markets 😴 etc...)\n", 77 | "\n", 78 | "It's very common to have **hundreds of folds** in Time Series forecasting!" 
79 |    ]
80 |   },
81 |   {
82 |    "cell_type": "markdown",
83 |    "metadata": {},
84 |    "source": [
85 |     "### 1.2) In each FOLD, and for each train or test SET, split your time series into different SEQUENCES of (observations, target)"
86 |    ]
87 |   },
88 |   {
89 |    "cell_type": "markdown",
90 |    "metadata": {},
91 |    "source": [
92 |     ""
93 |    ]
94 |   },
95 |   {
96 |    "cell_type": "markdown",
97 |    "metadata": {},
98 |    "source": [
99 |     "Goal: create (`X_train`, `y_train`, `X_test`, `y_test`) containing all you need to train and test your model for this fold\n",
100 |     " \n",
101 |     "- `X_train.shape = (n_samples, input_chunk_length, n_covariate_features)`\n",
102 |     "- `y_train.shape = (n_samples, output_chunk_length, n_targets)`\n",
103 |     "\n",
104 |     "Notice that we now have 3D-arrays"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "\n",
112 |     "💡 You can randomly sample or create them all sliding from left to right, with a selected stride"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "markdown",
117 |    "metadata": {},
118 |    "source": [
119 |     ""
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "metadata": {},
125 |    "source": [
126 |     "### 1.3) 🚨 Beware of the **GAP** of length (horizon - 1) between the train & test sets in each fold to avoid data leakage"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     ""
" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "👇 Below is a zoom inside **ONE SINGLE FOLD**\n", 141 | "\n", 142 | "A gap of size `horizon - 1` is mandatory to reflect real situations:\n", 143 | "- Here the forecast horizon is `4` days\n", 144 | "- Let's say we want our train set to end by predicting day `10` based on days before `4, 5, 6`\n", 145 | "- In a real situation we would need to **wait** for day `10` to discover the true value of `y` on which to finalize training\n", 146 | "- Therefore, the test set can only start on day `10`, which is meant to predict `y_test = 10 + 4`" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "horizon $h = 4$\n", 154 | "\n", 155 | "$$ \\Large X^{t+\\color{green}4} = f(X^t, X^{t-1}, X^{t-2}) $$\n", 156 | "\n", 157 | "" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "✅ Use [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html) `TimeSeriesSplit(n_splits = ..., gap=...)`" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "---" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "# Part 2: Air Pollution Solution" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## 2.1 Data\n", 193 | "\n", 194 | "❓ **Question** ❓ We will load the data from the third and fourth exercise. Load the data, and keep only the following columns : `['pm2.5', 'TEMP', 'DEWP', 'PRES', 'Ir', 'Is', 'Iws']`" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 1, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "import pandas as pd\n", 204 | "\n", 205 | "df = pd.read_csv('https://wagon-public-datasets.s3.amazonaws.com/deep_learning_datasets/air%20pollution.txt', index_col=[0])\n", 206 | "df = df[['pm2.5', 'TEMP', 'DEWP', 'PRES', 'Ir', 'Is', 'Iws']]" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 2, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "df" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "❓ **Question** ❓ For the sake of simplicity, fill in the missing values with mean over the entire dataset." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 3, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "df = df.fillna(df.mean())" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 26, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "df.describe()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "Usually, in classic settings, there is multiple independent sequences $X$, each with a corresponding $y$.\n", 248 | "However, if often happens that we don't have access to multiple sequences $X$, but to only one very long sequence as it is the case here. From that, experts usually split them into multiple sub-sequences.\n", 249 | "\n", 250 | "\n", 251 | "❓ **Question** ❓ Write a function that is able to get a subsequence $X$ and a corresponding $y$ which corresponds to the air pollution **5 days** after the last observation. 
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {},
272 | "source": [
273 | "❓ **Question** ❓ Given a list of integers, write a function that splits the initial dataset as many times as there are integers in the list; the length of each subsequence is given by the corresponding integer in that list."
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 5,
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "def get_X_y(df, length_of_observations):\n",
283 | "    X, y = [], []\n",
284 | "    pass # YOUR CODE HERE\n",
285 | "    return X, y\n",
286 | "\n",
287 | "length_of_observations = np.random.randint(10, 15, 100)\n",
288 | "X, y = get_X_y(df, length_of_observations)"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "❓ **Question** ❓ If you split into a train and test set _after_ creating the shorter sequences, you risk having the same values in both the train and test sets, which is data leakage. Therefore, split your data into train and test sets first, and only then sample your training and test sequences, together with the corresponding outputs.\n",
296 | "\n",
297 | "❗️ Beware of the gap required between train and test!"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 9,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "length_of_observations = np.random.randint(10, 15, 100)\n",
307 | "X_train, y_train = get_X_y(df, length_of_observations)\n",
308 | "\n",
309 | "length_of_observations = np.random.randint(10, 15, 100)\n",
310 | "X_test, y_test = get_X_y(df, length_of_observations)"
311 | ]
312 | },
313 | {
314 | "cell_type": "markdown",
315 | "metadata": {},
316 | "source": [
317 | "Each sequence has a certain number of observations, but this number is not the same across sequences. Because the Neural Network is trained with *batches* of data, you must ensure that, once the sequences are stacked together, they can be represented as a single tensor. This operation is called padding.\n",
318 | "\n",
319 | "❓ From the sequences above, return a padded tensor (with the dedicated Keras function) and check its shape."
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 10,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
329 | "pass # YOUR CODE HERE\n",
330 | "\n",
331 | "X_train_pad.shape"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "## Model\n",
339 | "\n",
340 | "As you added values to your input purely for computational reasons, your model has to know which entries are real observations and which are padding. \n",
341 | "\n",
342 | "❓ Initialize a model and add a masking layer so that your model does not take the padded values into account. 
You have to tell which value you used for the padding" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 11, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "from tensorflow.keras import Sequential, layers\n", 352 | "from tensorflow.keras.layers import Normalization" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 20, 358 | "metadata": { 359 | "tags": [ 360 | "challengify" 361 | ] 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "# YOUR CODE HERE" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "❓ Compile your model" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 22, 378 | "metadata": { 379 | "tags": [ 380 | "challengify" 381 | ] 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "# YOUR CODE HERE" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "❓ Train your model on the data" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 23, 398 | "metadata": { 399 | "tags": [ 400 | "challengify" 401 | ] 402 | }, 403 | "outputs": [], 404 | "source": [ 405 | "# YOUR CODE HERE" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 25, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "import matplotlib.pyplot as plt\n", 415 | "plt.plot(history.history['mean_absolute_percentage_error'])\n", 416 | "plt.plot(history.history['val_mean_absolute_percentage_error'])\n", 417 | "plt.legend(['train', 'test'])" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [] 426 | } 427 | ], 428 | "metadata": { 429 | "kernelspec": { 430 | "display_name": "Python 3 (ipykernel)", 431 | "language": "python", 432 | "name": "python3" 433 | }, 434 | "language_info": { 435 | "codemirror_mode": { 436 | "name": "ipython", 437 | "version": 3 438 | }, 439 | "file_extension": ".py", 440 | "mimetype": "text/x-python", 441 | "name": "python", 442 | "nbconvert_exporter": "python", 443 | "pygments_lexer": "ipython3", 444 | "version": "3.8.12" 445 | }, 446 | "toc": { 447 | "base_numbering": 1, 448 | "nav_menu": {}, 449 | "number_sections": false, 450 | "sideBar": true, 451 | "skip_h1_title": false, 452 | "title_cell": "Table of Contents", 453 | "title_sidebar": "Contents", 454 | "toc_cell": false, 455 | "toc_position": {}, 456 | "toc_section_display": true, 457 | "toc_window_display": false 458 | } 459 | }, 460 | "nbformat": 4, 461 | "nbformat_minor": 2 462 | } 463 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | slow: marks tests as slow (deselect with '-m "not slow"') 4 | optional: marks tests as optional (deselect with '-m "not optional"') 5 | addopts = -v -s --color=yes -W ignore::DeprecationWarning 6 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/time-series-cross-validator-challenge/requirements.txt 
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | with open('requirements.txt') as f: 5 | content = f.readlines() 6 | requirements = [x.strip() for x in content if 'git+' not in x] 7 | 8 | setup(name='ts-boilerplate', 9 | version="1.0", 10 | description="Trainer Boilerplate for Time Series Forecast Models with Cross Validation", 11 | packages=find_packages(), 12 | install_requires=requirements, 13 | test_suite='tests', 14 | # include_package_data: to install data from MANIFEST.in 15 | include_package_data=True, 16 | zip_safe=False) 17 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/time-series-cross-validator-challenge/tests/__init__.py -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase, generate_data_zeros_and_ones, generate_X_y_zeros_and_ones 4 | from typing import Tuple 5 | 6 | @pytest.fixture(scope="session") 7 | def data_monotonic_increase() -> np.ndarray: 8 | return generate_data_monotonic_increase() 9 | 10 | @pytest.fixture(scope="session") 11 | def data_zeros_and_ones() -> np.ndarray: 12 | return generate_data_zeros_and_ones() 13 | 14 | 15 | @pytest.fixture(scope="session") 16 | def X_y_zeros_and_ones() -> Tuple[np.ndarray]: 17 | return generate_X_y_zeros_and_ones() 18 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/tests/integrated/test_main.py: -------------------------------------------------------------------------------- 1 | """Tests that main route run without raising exceptions""" 2 | 3 | import pytest 4 | from ts_boilerplate.main import backtest, train, cross_validate 5 | 6 | @pytest.mark.slow 7 | def test_main_route_train(data_monotonic_increase): 8 | train(data_monotonic_increase) 9 | 10 | @pytest.mark.slow 11 | def test_main_route_cross_validate(data_monotonic_increase): 12 | cross_validate(data_monotonic_increase) 13 | 14 | @pytest.mark.slow 15 | def test_backtest(data_monotonic_increase): 16 | backtest(data_monotonic_increase, print_metrics=False, plot_metrics=False) 17 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/tests/integrated/test_model_performance.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ts_boilerplate.main import train 3 | 4 | 5 | @pytest.mark.optional 6 | @pytest.mark.slow 7 | def test_model_can_fit_well_enough_on_dummy_dataset(data_zeros_and_ones): 8 | """Check that the model can fit, with MAPE lower than some threshold on dummy dataset of zeros and ones""" 9 | 10 | metrics = train(data_zeros_and_ones) 11 | #print("#### 
metrics on dummy dataset ", metrics)
12 |
13 |     assert metrics < 5, "your model does not seem to be able to fit well enough, even on a very easy dataset"
14 |
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/tests/unittests/test_data.py: --------------------------------------------------------------------------------
1 | import pytest
2 | from ts_boilerplate.params import TRAIN, DATA
3 | from ts_boilerplate.main import get_X_y
4 | from ts_boilerplate.main import train_test_split
5 | import numpy as np
6 | import math
7 |
8 |
9 | # These tests make use of the fixture `data_monotonic_increase` stored in tests/conftest.py (pytest magic under the hood)
10 | def test_get_X_y_returns_correct_shapes(data_monotonic_increase):
11 |     """Test that X and y have the correct shapes (excluding sample size), as per the project setup defined in `params.py`
12 |     """
13 |     X, y = get_X_y(data_monotonic_increase, **TRAIN)
14 |
15 |     # Check that X and y have the correct lengths (in time) and depth (in number of covariates)
16 |     assert X.ndim == 3
17 |     assert X.shape[1] == TRAIN['input_length']
18 |     assert X.shape[2] == DATA['n_covariates'] + DATA[
19 |         'n_targets'], "Did you forget to include your past target values as features?"
20 |
21 |     y_should_be_3D = TRAIN['output_length'] > 1 and DATA["n_targets"] > 1
22 |     y_should_be_1D = TRAIN['output_length'] == 1 and DATA["n_targets"] == 1
23 |     if y_should_be_3D:
24 |         assert y.ndim == 3
25 |         assert y.shape[1] == TRAIN['output_length']
26 |         assert y.shape[2] == DATA['n_targets']
27 |     elif y_should_be_1D:
28 |         assert y.ndim == 1
29 |     else:
30 |         assert y.ndim == 2
31 |         assert y.shape[1] == (TRAIN['output_length'] if DATA['n_targets'] == 1 else DATA['n_targets'])
32 |
33 |
34 | @pytest.mark.optional
35 | @pytest.mark.skipif(TRAIN['stride'] is None, reason="Optional test only applicable if a sliding method is used in get_X_y")
36 | def test_optional_get_X_y_returns_optimal_sample_size(data_monotonic_increase):
37 |     """If get_X_y uses a stride method, check that X and y each contain the optimal number of samples
38 |     """
39 |     X, y = get_X_y(data_monotonic_increase, **TRAIN)
40 |
41 |     # Formula below reverse-engineered from `create_dummy_tests.ipynb`
42 |     expected_len = math.ceil(
43 |         (len(data_monotonic_increase) \
44 |         - (TRAIN['input_length'] -1) \
45 |         - (TRAIN['output_length'] -1) \
46 |         - TRAIN['horizon']
47 |         ) / TRAIN["stride"]
48 |     )
49 |     assert len(X) == expected_len, "you may not have generated the optimal number of samples, given the chosen stride"
50 |     assert len(y) == expected_len, "you may not have generated the optimal number of samples, given the chosen stride"
51 |
52 | def test_no_data_leak(data_monotonic_increase):
53 |     """Test that the time gap between the last timestep of `y_train` and the first timestep of `y_test`
54 |     is at least as big as the forecast horizon,
55 |     according to 'https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-3.png'
56 |     """
57 |
58 |     data_train, data_test = train_test_split(data_monotonic_increase, **TRAIN)
59 |     X_train, y_train = get_X_y(data_train, shuffle=False, **TRAIN)
60 |     X_test, y_test = get_X_y(data_test, shuffle=False, **TRAIN)
61 |
62 |     y_train_last_seen_timestep = np.max(y_train)  # OR y_train[-1].flat[-1]
63 |     y_test_first_seen_timestep = np.min(y_test)  # OR y_test[0].flat[0]
64 |     gap = y_test_first_seen_timestep - y_train_last_seen_timestep
65 |     # Note: for stride = 1, the inequality below must be an exact 
equality, but we don't need to test that to ensure no data leak.
66 |     assert gap >= TRAIN["horizon"], "❗️❗️ Data leak detected between (X_train, y_train) and (X_test, y_test)❗️❗️ "
67 |
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/tests/unittests/test_model.py: --------------------------------------------------------------------------------
1 | import pytest
2 | from ts_boilerplate.model import fit_model, get_model, predict_output
3 |
4 | def test_model_has_correct_output_shape(X_y_zeros_and_ones):
5 |     X, y = X_y_zeros_and_ones
6 |     model = get_model(X, y)
7 |     y_pred = predict_output(model, X)
8 |     assert y_pred.shape == y.shape
9 |
10 | @pytest.mark.slow
11 | def test_model_can_fit(X_y_zeros_and_ones):
12 |     """Check that the model can fit without crashing"""
13 |     X, y = X_y_zeros_and_ones
14 |     model = get_model(X, y)
15 |     fit_model(model, X, y, verbose=0)
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/__init__.py: --------------------------------------------------------------------------------
1 | from os.path import isfile
2 | from os.path import dirname
3 |
4 | version_file = '{}/version.txt'.format(dirname(__file__))
5 |
6 | if isfile(version_file):
7 |     with open(version_file) as version_file:
8 |         __version__ = version_file.read().strip()
9 |
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/dataprep.py: --------------------------------------------------------------------------------
1 | """Prepare data so it can be used in a pipelined ML model"""
2 |
3 | import numpy as np
4 | from ts_boilerplate.params import DATA
5 | from typing import Tuple, List
6 |
7 |
8 |
9 | def load_data(data_path: str) -> np.ndarray:
10 |     """Load data from `data_path` into memory.
11 |     Returns a 2D array with (axis 0) representing timesteps, and (axis 1) columns containing targets and covariates
12 |     ref: https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true
13 |     """
14 |     # YOUR_CODE_HERE
15 |     pass
16 |
17 |
18 | def clean_data(data: np.ndarray) -> np.ndarray:
19 |     """Clean data without creating data leakage:
20 |     - make sure there are no NaNs at any timestep
21 |     - etc...
22 |     """
23 |     # YOUR_CODE_HERE
24 |     pass
25 |
26 |
27 | def get_X_y(
28 |     data: np.ndarray,
29 |     input_length: int,
30 |     output_length: int,
31 |     horizon: int,
32 |     stride: int,
33 |     shuffle=True,
34 |     **kwargs,
35 | ) -> Tuple[np.ndarray, np.ndarray]:
36 |     """
37 |     Use `data`, a 2D-array with axis=0 as timesteps, and axis=1 as (targets+covariates columns)
38 |
39 |     Returns a Tuple (X, y) of two ndarrays:
40 |         X.shape = (n_samples, input_length, n_covariates + n_targets)
41 |         y.shape =
42 |             (n_samples, output_length, n_targets) if all 3 dimensions are of size > 1
43 |             (n_samples, output_length) if n_targets == 1
44 |             (n_samples, n_targets) if output_length == 1
45 |             (n_samples, ) if both n_targets and output_length == 1
46 |
47 |     ❗️ Raise an error if data contains NaN
48 |     ❗️ Make sure to shuffle the pairs in unison if `shuffle=True`, for i.i.d. purposes
49 |     ❗️ Don't ditch past values of your target time-series in your features - they are very useful features! 
50 | 👉 illustration: https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-1.png 51 | 52 | [💡 Hints ] You can use a sliding method 53 | - Reading `data` in ascending order 54 | - `stride` timestamps after another 55 | Feel free to use another approach, for example random sampling without replacement 56 | 57 | """ 58 | pass # YOUR CODE HERE 59 | 60 | 61 | 62 | 63 | 64 | def get_folds(data: np.ndarray, 65 | fold_length: int, 66 | fold_stride: int, 67 | **kwargs) -> List[np.ndarray]: 68 | """Slide through `data` time-series (2D array) to create folds of equal `fold_length`, using `fold_stride` between each fold 69 | Returns a list of folds, each as a 2D-array time series 70 | """ 71 | pass # YOUR CODE HERE 72 | 73 | 74 | def train_test_split(data: np.ndarray, 75 | train_test_ratio: float, 76 | input_length: int, 77 | **kwargs) -> Tuple[np.ndarray, np.ndarray]: 78 | """Returns a train and test 2D-arrays, that will not create any data leaks when sampling (X, y) from them 79 | Inspired from "https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-3.png" 80 | """ 81 | pass # YOUR CODE HERE 82 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/generate_dummy_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ts_boilerplate.params import CROSS_VAL, DATA, TRAIN 3 | from typing import Tuple 4 | 5 | def generate_data_monotonic_increase() -> np.ndarray: 6 | """Creates a monotonicly increasing time serie dataset for test purposes 7 | - shape is (DATA['length'], DATA['n_covariates] + DATA['n_targets']), 8 | - values are all equals to their respective integer index! 9 | 10 | e.g: 11 | data = array( 12 | [[ 0., 0., 0., 0., 0.], 13 | [ 1., 1., 1., 1., 1.], 14 | ..., 15 | [998., 998., 998., 998., 998.], 16 | [999., 999., 999., 999., 999.]] 17 | ) 18 | 19 | """ 20 | 21 | indexes = np.arange(0, DATA['length']) 22 | data = np.zeros((DATA['length'], DATA['n_covariates'] + DATA['n_targets'])) \ 23 | + np.expand_dims(indexes, axis=1) 24 | return data 25 | 26 | def generate_data_zeros_and_ones() -> np.ndarray: 27 | """Create a dummy data made of zeros for covariates, and ones for the targets 28 | e.g: 29 | data = array( 30 | [[1.,1.,0.,0.,0.], 31 | [1.,1.,0.,0.,0.], 32 | ..., 33 | [1.,1.,0.,0.,0.], 34 | [1.,1.,0.,0.,0.]] 35 | ) 36 | """ 37 | shape = (DATA['length'], DATA['n_covariates'] + DATA['n_targets']) 38 | data = np.zeros(shape) 39 | data[:, DATA["target_column_idx"]] = 1. 40 | return data 41 | 42 | def generate_X_y_zeros_and_ones() -> Tuple[np.ndarray]: 43 | """Create a dummy (X,y) tuple made of zeros for covariates, and ones for the targets, just to check if model fit well""" 44 | length = round(DATA["length"] / TRAIN['stride']) 45 | 46 | shape_X = (length, TRAIN['input_length'], DATA['n_covariates']+DATA['n_targets']) 47 | X = np.zeros(shape_X) 48 | X[:, :, DATA["target_column_idx"]] = 1. 49 | 50 | shape_y = (length, TRAIN['output_length'], DATA['n_targets']) 51 | y = np.ones(shape_y) 52 | y = np.squeeze(y) 53 | 54 | return (X,y) 55 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Top level orchestrator of the project. To be called from the CLI. 
3 | It comprises all the "routes" you may want to call.
4 | '''
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import os
9 | from ts_boilerplate.dataprep import get_Xi_yi, get_X_y, get_folds, train_test_split
10 | from ts_boilerplate.model import get_model, fit_model, predict_output
11 | from ts_boilerplate.metrics import mape, mae
12 | from ts_boilerplate.params import CROSS_VAL, ROOT_DIR, TRAIN, DATA
13 | from typing import Tuple, List
14 | import matplotlib.pyplot as plt
15 |
16 |
17 | def train(data: np.ndarray, print_metrics: bool = False):
18 |     """
19 |     Train the model in this package on one fold `data` containing the 2D-array of time-series for your problem
20 |     Returns `metrics_test` associated with the training
21 |     """
22 |     pass # YOUR CODE HERE
23 |
24 |
25 | def cross_validate(data: np.ndarray, print_metrics: bool = False):
26 |     """
27 |     Cross-validate the model in this package on `data`
28 |     Returns `metrics_cv`: the list of test metrics at each fold
29 |     """
30 |     pass # YOUR CODE HERE
31 |
32 |
33 | def backtest(data: np.ndarray,
34 |              stride: int = 1,
35 |              start_ratio: float = 0.9,
36 |              retrain: bool = True,
37 |              retrain_every: int = 1,
38 |              print_metrics=False,
39 |              plot_metrics=False):
40 |     """Returns historical forecasts for the entire dataset
41 |     - by training the model up to `start_ratio` of the dataset
42 |     - then predicting the next values using the model in this package
43 |     - then moving `stride` timesteps ahead
44 |     - then retraining the model if `retrain` is True and if we have moved `retrain_every` timesteps since the last training
45 |     - then predicting the next values again
46 |
47 |     Returns:
48 |     - all historical predictions as a 2D-array time-series of shape (approx. (1 - start_ratio) * len(data) / stride, n_targets)
49 |     - the 'mean-MAPE' per forecast horizon
50 |     - prints historical predictions if you want a visual check
51 |
52 |     see https://unit8co.github.io/darts/generated_api/darts.models.forecasting.rnn_model.html#darts.models.forecasting.rnn_model.RNNModel.historical_forecasts
53 |     """
54 |     pass # YOUR CODE HERE
55 |
56 | if __name__ == '__main__':
57 |     data = pd.read_csv(os.path.join(ROOT_DIR, 'data', 'raw', 'data.csv')).to_numpy()
58 |     try:
59 |         train(data=data, print_metrics=True)
60 |         # cross_validate(data=data, print_metrics=True)
61 |         # backtest(data=data,
62 |         #          stride = 1,
63 |         #          start_ratio = 0.9,
64 |         #          retrain = True,
65 |         #          retrain_every=1,
66 |         #          print_metrics=True,
67 |         #          plot_metrics=True)
68 |     except:
69 |         import ipdb, traceback, sys
70 |         extype, value, tb = sys.exc_info()
71 |         traceback.print_exc()
72 |         ipdb.post_mortem(tb)
73 |
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/metrics.py: --------------------------------------------------------------------------------
1 | '''
2 | Computes useful Time Series metrics from (y_true, y_pred)
3 | '''
4 |
5 | import numpy as np
6 | from tensorflow import reduce_mean
7 | from tensorflow.keras.metrics import mean_absolute_error, mean_absolute_percentage_error
8 |
9 |
10 | def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
11 |     """Returns Mean Absolute Error"""
12 |     pass # YOUR CODE HERE
13 |
14 | def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
15 |     """Returns Mean Absolute Percentage Error"""
16 |     pass # YOUR CODE HERE
17 |
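# ---------------------------------------------------------------------------
# Reference sketch (illustrative only, not part of the challenge API): one
# possible way to implement the two metrics above, using the TensorFlow
# helpers already imported at the top of this file.
def _mae_reference_sketch(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    # mean_absolute_error reduces over the last axis; reduce_mean finishes the job
    return float(reduce_mean(mean_absolute_error(y_true, y_pred)))

def _mape_reference_sketch(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    return float(reduce_mean(mean_absolute_percentage_error(y_true, y_pred)))
# ---------------------------------------------------------------------------
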
"""Returns Mean Absolute Scaled Error (https://en.wikipedia.org/wiki/Mean_absolute_scaled_error) 20 | """ 21 | pass 22 | 23 | 24 | def play_trading_strategy(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray: 25 | """Returns the array of relative portfolio values over the test period""" 26 | pass 27 | 28 | 29 | def return_on_investment(played_trading_strategy: np.ndarray) -> float: 30 | """Returns the ROI of an investment strategy""" 31 | pass 32 | 33 | 34 | def sharpe_ratio(played_trading_strategy: np.ndarray) -> float: 35 | """Returns the Sharpe Ratio (Return on Investment / Volatility) of an investment strategy""" 36 | pass 37 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Dense, SimpleRNN, Reshape, Lambda, Input 3 | from tensorflow.keras import Model 4 | from ts_boilerplate.params import DATA, TRAIN 5 | 6 | # TODO: Should we add here the preprocessing? into a class called "pipeline"? 7 | # TODO: Should we refacto in a class ? Probably! 8 | 9 | 10 | def get_model(X_train, y_train): 11 | """Instanciate, compile and and return the model of your choice""" 12 | pass # YOUR CODE HERE 13 | 14 | 15 | def fit_model(model, X_train, y_train, **kwargs): 16 | """Fit the `model` object, including preprocessing if needs be""" 17 | pass # YOUR CODE HERE 18 | 19 | 20 | def predict_output(model, X_test): 21 | """Return y_test. Include preprocessing if needs be""" 22 | pass # YOUR CODE HERE 23 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator-challenge/ts_boilerplate/params.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | ## CREDENTIALS AND PATHS 5 | load_dotenv() 6 | API_KEY = os.getenv('API_KEY') 7 | 8 | ## DIR PARAMS 9 | ROOT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) 10 | DATA_RAW_CSV_PATH = os.path.join(ROOT_DIR, 'data', 'raw', 'data.csv') 11 | 12 | # 👇 Please fill these global variable below very carefully, in order to create tests related to your problem👇 13 | # cf: https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true 14 | DATA = dict( 15 | length = 500, # How many timesteps does your dataset contains? 16 | n_covariates = 3, # number of past covariates, excluding target time series. Our tests do not support future_covariate yet. 17 | target_column_idx = [0,1] # List of index(es) of target column(s) in your dataset. e.g [0] for Mono-target problem, e.g. [0,1,4] for multi-variate targets problem. Note that past targets values will also be used as features X. 18 | ) 19 | DATA['n_targets'] = len(DATA['target_column_idx']) # number of target time series to predict. 20 | 21 | TRAIN = dict( 22 | horizon = 4, # start prediction xxx timestep ahead 23 | input_length = 10, # Length (in time) of each sequences that will be seen by the model (X.shape[1]) 24 | output_length = 7, # Length (in time) of prediction (y.shape[1]) 25 | stride = 1, # Integer used to create all pairs of sample (Xi, yi) by sliding in each data fold. 
Use `None` if you don't plan to use any sliding method in data.get_X_y 26 | train_test_ratio = 0.7, # ratio of train / (train+test) length in each fold 27 | ) 28 | 29 | CROSS_VAL = dict( 30 | fold_length = 200, 31 | fold_stride = 100, 32 | ) 33 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/.challengifyignore: -------------------------------------------------------------------------------- 1 | TODO.md 2 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | 2 | # 🤖 usage 3 | # 4 | # this file contains the conf for GitHub Continuous Integration 5 | # and Continuous Deployment to Heroku 6 | # 7 | # in order to activate the tests in GitHub CI: 8 | # - uncomment the content of the CI paragraph (lines 41-55) 9 | # - create some tests in the tests/ directory 10 | # 11 | # in order to activate CD to Heroku: 12 | # - activate the tests in GitHub CI 13 | # - uncomment the content of the CD paragraph (lines 57-75) 14 | 15 | name: Python package 16 | 17 | on: 18 | push: 19 | branches: [ master ] 20 | pull_request: 21 | branches: [ master ] 22 | 23 | jobs: 24 | 25 | # 🤖 CI paragraph 26 | # 27 | # uncomment the content of this paragraph to activate the tests in GitHub CI 28 | # - remove the 2 trailing characters "# ", do not change the spaces 29 | # (the `name` keys should be at the same level as the `uses` key) 30 | # (the `strategy` key should be at the same level as the `steps` key) 31 | 32 | build: 33 | 34 | runs-on: ubuntu-latest 35 | 36 | steps: 37 | - uses: actions/checkout@v2 38 | - name: Say hello 39 | run: | 40 | echo "Hello, World!" 
41 | # - name: Set up Python ${{ matrix.python-version }} 42 | # uses: actions/setup-python@v1 43 | # with: 44 | # python-version: ${{ matrix.python-version }} 45 | # - name: Install dependencies 46 | # run: | 47 | # python -m pip install --upgrade pip 48 | # pip install -r requirements.txt 49 | # - name: Install package and test 50 | # run: | 51 | # make install test clean 52 | 53 | # strategy: 54 | # matrix: 55 | # python-version: [3.8] 56 | 57 | # # 🤖 CD paragraph 58 | # # 59 | # # uncomment the following lines to activate CD to Heroku 60 | # # - remove the 2 trailing characters "# ", do not change the spaces 61 | # # (there should be 2 spaces before the `deploy_heroku` key) 62 | # # - keep in mind you also need to configure Heroku HEROKU_API_KEY and HEROKU_EMAIL in GitHub secrets 63 | # # - and replace REPLACE_WITH_YOUR_HEROKU_APP_NAME in this file with the name of your Heroku app 64 | 65 | # deploy_heroku: 66 | # needs: build 67 | # runs-on: ubuntu-latest 68 | 69 | # steps: 70 | # - uses: actions/checkout@v2 71 | # - uses: akhileshns/heroku-deploy@v3.0.4 # This is the action 72 | # with: 73 | # heroku_api_key: ${{secrets.HEROKU_API_KEY}} 74 | # heroku_app_name: "REPLACE_WITH_YOUR_HEROKU_APP_NAME" # Must be unique in Heroku 75 | # heroku_email: ${{secrets.HEROKU_EMAIL}} 76 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | .coverage 3 | .ipynb_checkpoints 4 | **/*.DS_Store 5 | data_raw/ 6 | data_processed/ 7 | *.csv 8 | __pycache__/ 9 | .env 10 | .vscode/ 11 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/Makefile: -------------------------------------------------------------------------------- 1 | # TODO 2 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/README.md: -------------------------------------------------------------------------------- 1 | This is a **boilerplate** repo for a machine-learning project involving **Time Series forecasting**. 2 | 3 | In particular 4 | 5 | - It provides a **cross-validation framework** to ensure model are tested thoroughly and without data leakage 6 | - It is agnostic of the type of model involved 7 | - It is well suited for short research projects, typical of few-weeks coding bootcamps such as Le Wagon DataScience 8 | 9 | # Detailed package workflow 10 | 11 | ## Architecture 12 | - `ts_boilerplate` package 13 | - `main.py` comprises the main routes to be called from the CLI (`train`, `cross-validate`, `backtest`) 14 | - `params.py` contains project-level global variable to be set manually 15 |
16 |
17 | - `data` folder contains:
18 |     - `raw` and `clean` folders should contain **2D-array `data` time-series**, with (axis 0) representing integer timesteps, and (axis 1) columns containing targets and covariates, as per [picture](https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true)
19 |     ```python
20 |     data.shape = (length, n_targets + n_covariates)
21 |     ```
22 |     - `Xy` may persist your tuple (X, y) of **3D-array** training sets to be fed to your models, if you want to store them and avoid preprocessing multiple times.
23 |     ```python
24 |     X.shape = (n_samples, input_length, n_covariates + n_targets)
25 |     y.shape = (n_samples, output_length, n_targets)
26 |     ```
27 | - `notebooks`
28 |     - `test_package.ipynb` will help you understand how the package and the tests have been built.
29 |     - `tutorial_ts_forecasting.ipynb` is a recommended read before diving into this project. It contains visuals that will help you fill in the global project params and understand the naming conventions.
30 |
31 |
32 |
33 | - `tests` folder detailed below
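
For instance, with the defaults currently set in `ts_boilerplate/params.py` (`length=500`, `n_covariates=3`, `target_column_idx=[0,1]`, `input_length=10`, `output_length=7`), the conventions above translate into the following shapes (an illustrative sketch only; past target values are kept as features, which is why the last axis of `X` is `n_targets + n_covariates`):

```python
# Illustrative sketch, based on the current defaults in ts_boilerplate/params.py
data.shape = (500, 2 + 3)         # (length, n_targets + n_covariates)
X.shape = (n_samples, 10, 2 + 3)  # (n_samples, input_length, n_targets + n_covariates)
y.shape = (n_samples, 7, 2)       # (n_samples, output_length, n_targets)
```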
34 |
35 | ## How to test your code?
36 | First of all, fill in `ts_boilerplate/params.py` according to the true specifics of your project.
37 |
38 | Then, run this in your terminal from the project root folder to check your code:
39 | - `pytest`
40 | - `pytest -m "not optional"` to only check the mandatory tests
41 | - `pytest -m "not optional and not slow"` to also skip tests that may be slow (those involving fitting your model)
42 |
43 |
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/TODO.md: --------------------------------------------------------------------------------
1 | - [ ] Refactor `model.py`
2 |     - [ ] Rename it to `pipeline.py` because it may comprise the pre-processing, such as scaling, etc.
3 |     - [ ] Turn it into a class `TsPipeline()` instead of pure functions
4 |
5 | - [ ] Add requirements.txt
6 | - [ ] Integrate the package as part of the ML Ops lifecycle
7 |     - [ ] track & save experiment results
8 |     - [ ] ...
9 |
10 | - [ ] Add tests for future covariates
11 | - [ ] Create Makefile
12 | - [ ] Include a DAG of the project
13 | - [ ] Publish to the Le Wagon community
14 | - [ ] Cache the fixtures of conftest.py
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/notebooks/WIP_tutorial_darts_library.ipynb: --------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## This is a tutorial for the Darts python package"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": []
14 |   }
15 |  ],
16 |  "metadata": {
17 |   "language_info": {
18 |    "name": "python"
19 |   },
20 |   "orig_nbformat": 4
21 |  },
22 |  "nbformat": 4,
23 |  "nbformat_minor": 2
24 | }
25 |
-------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/notebooks/test_package.ipynb: --------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Step by step guide to Unit Tests used in this project"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": 1,
13 |    "metadata": {
14 |     "ExecuteTime": {
15 |      "end_time": "2022-03-14T14:24:55.279012Z",
16 |      "start_time": "2022-03-14T14:24:53.949004Z"
17 |     }
18 |    },
19 |    "outputs": [
20 |     {
21 |      "name": "stdout",
22 |      "output_type": "stream",
23 |      "text": [
24 |       "The autoreload extension is already loaded. 
To reload it, use:\n", 25 | " %reload_ext autoreload\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "import matplotlib.pyplot as plt\n", 33 | "import os\n", 34 | "from ts_boilerplate.params import ROOT_DIR, DATA, TRAIN, CROSS_VAL\n", 35 | "from ts_boilerplate.dataprep import get_X_y, get_folds, train_test_split, get_Xi_yi\n", 36 | "from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase, generate_data_zeros_and_ones, generate_X_y_zeros_and_ones\n", 37 | "from ts_boilerplate.model import get_model, fit_model, predict_output\n", 38 | "from ts_boilerplate.metrics import mape\n", 39 | "\n", 40 | "%load_ext autoreload\n", 41 | "%autoreload 2" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## 1) `generate_dummy_data.py`" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Let's create a dummy time series dataset whose value increment by 1 every day" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "array([[ 0., 0., 0., 0., 0.],\n", 67 | " [ 1., 1., 1., 1., 1.],\n", 68 | " [ 2., 2., 2., 2., 2.],\n", 69 | " ...,\n", 70 | " [497., 497., 497., 497., 497.],\n", 71 | " [498., 498., 498., 498., 498.],\n", 72 | " [499., 499., 499., 499., 499.]])" 73 | ] 74 | }, 75 | "execution_count": 2, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "data = generate_data_monotonic_increase()\n", 82 | "data" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": { 89 | "ExecuteTime": { 90 | "end_time": "2022-03-14T14:25:19.973275Z", 91 | "start_time": "2022-03-14T14:25:19.950901Z" 92 | } 93 | }, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/html": [ 98 | "
\n", 99 | "\n", 112 | "\n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | "
01234
00.00.00.00.00.0
11.01.01.01.01.0
22.02.02.02.02.0
33.03.03.03.03.0
44.04.04.04.04.0
..................
495495.0495.0495.0495.0495.0
496496.0496.0496.0496.0496.0
497497.0497.0497.0497.0497.0
498498.0498.0498.0498.0498.0
499499.0499.0499.0499.0499.0
\n", 214 | "

500 rows × 5 columns

\n", 215 | "
" 216 | ], 217 | "text/plain": [ 218 | " 0 1 2 3 4\n", 219 | "0 0.0 0.0 0.0 0.0 0.0\n", 220 | "1 1.0 1.0 1.0 1.0 1.0\n", 221 | "2 2.0 2.0 2.0 2.0 2.0\n", 222 | "3 3.0 3.0 3.0 3.0 3.0\n", 223 | "4 4.0 4.0 4.0 4.0 4.0\n", 224 | ".. ... ... ... ... ...\n", 225 | "495 495.0 495.0 495.0 495.0 495.0\n", 226 | "496 496.0 496.0 496.0 496.0 496.0\n", 227 | "497 497.0 497.0 497.0 497.0 497.0\n", 228 | "498 498.0 498.0 498.0 498.0 498.0\n", 229 | "499 499.0 499.0 499.0 499.0 499.0\n", 230 | "\n", 231 | "[500 rows x 5 columns]" 232 | ] 233 | }, 234 | "execution_count": 3, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "# Store as CSV\n", 241 | "data_df = pd.DataFrame(data)\n", 242 | "data_df.to_csv(os.path.join(ROOT_DIR, \"data\", \"dummy\", \"data_dummy.csv\"), index=False)\n", 243 | "pd.read_csv(os.path.join(ROOT_DIR, \"data\", \"dummy\", \"data_dummy.csv\"))" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## 2) `dataprep.py`" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "### 2.1) `getX_y`" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 10, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "name": "stdout", 267 | "output_type": "stream", 268 | "text": [ 269 | "(475, 10, 5)\n", 270 | "(475, 7, 2)\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "X, y = get_X_y(data, **TRAIN)\n", 276 | "print(X.shape)\n", 277 | "print(y.shape)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 11, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "475.0" 289 | ] 290 | }, 291 | "execution_count": 11, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "# Let's compute the shape arithmetically (for unittests)\n", 298 | "(len(data) \\\n", 299 | " - (TRAIN['input_length'] -1) \\\n", 300 | " - (TRAIN['output_length'] -1) \\\n", 301 | " - TRAIN['horizon']) \\\n", 302 | " / TRAIN[\"stride\"]" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "☝️ ceiling rounding function should be used for stride > 1" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "### 2.2) `train_test_split`" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 12, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "(500, 5)" 328 | ] 329 | }, 330 | "execution_count": 12, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "train_test_ratio = TRAIN[\"train_test_ratio\"]\n", 337 | "input_length = TRAIN[\"input_length\"]\n", 338 | "output_length = TRAIN[\"output_length\"]\n", 339 | "data.shape" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 13, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "last_train_idx = round(train_test_ratio * len(data))\n", 349 | "data_train = data[0:last_train_idx, :]\n", 350 | "\n", 351 | "first_test_idx = last_train_idx - input_length\n", 352 | "data_test = data[first_test_idx:, :]" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 14, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": [ 363 | "array([[ 0., 0., 0., 0., 0.],\n", 364 | " [ 1., 1., 1., 1., 1.],\n", 365 | " [ 2., 2., 2., 2., 
2.],\n", 366 | " ...,\n", 367 | " [347., 347., 347., 347., 347.],\n", 368 | " [348., 348., 348., 348., 348.],\n", 369 | " [349., 349., 349., 349., 349.]])" 370 | ] 371 | }, 372 | "execution_count": 14, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "data_train" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "data_test" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n", 397 | "X_test, y_test = get_X_y(data_test, **TRAIN)\n", 398 | "\n", 399 | "print(\"####### Last train pair\")\n", 400 | "print(X_train[-1])\n", 401 | "print(y_train[-1])\n", 402 | "print(\"####### First test pair\")\n", 403 | "print(X_test[0])\n", 404 | "print(y_test[0])" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 17, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "data": { 414 | "text/plain": [ 415 | "10.0" 416 | ] 417 | }, 418 | "execution_count": 17, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "gap = np.min(y_test) - np.max(y_train)\n", 425 | "gap" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 18, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "assert gap >= TRAIN[\"horizon\"], \"❗️❗️ Data leak detected between (X_train, y_train) and (X_test, y_test)❗️❗️ \"" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "### 2.3) `get_folds`" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "folds = get_folds(data, **CROSS_VAL)\n", 451 | "print('n_folds= ', len(folds))\n", 452 | "print(folds[-1])" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "## 3) `model.py`" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 27, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "data_train, data_test = train_test_split(data, **TRAIN)\n", 469 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n", 470 | "X_test, y_test = get_X_y(data_test, **TRAIN)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 28, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "import tensorflow as tf\n", 480 | "from keras.models import Model\n", 481 | "from keras.layers import Dense, SimpleRNN, Reshape, Lambda, Input" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 18, 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "Model: \"model\"\n", 494 | "_________________________________________________________________\n", 495 | " Layer (type) Output Shape Param # \n", 496 | "=================================================================\n", 497 | " input_1 (InputLayer) [(None, 10, 5)] 0 \n", 498 | " \n", 499 | " lambda (Lambda) (None, 7, 2) 0 \n", 500 | " \n", 501 | " reshape (Reshape) (None, 7, 2) 0 \n", 502 | " \n", 503 | "=================================================================\n", 504 | "Total params: 0\n", 505 | "Trainable params: 0\n", 506 | "Non-trainable params: 0\n", 507 | 
"_________________________________________________________________\n" 508 | ] 509 | } 510 | ], 511 | "source": [ 512 | "# BASELINE: PREDICT LAST VALUE - ZERO TRAINABLE WEIGHTS\n", 513 | "input = Input(shape=X_train.shape[1:])\n", 514 | "# Take last temporal values of the targets, and duplicate it as many times as `output_length`\n", 515 | "x = Lambda(\n", 516 | " lambda x: tf.repeat(tf.expand_dims(tf.gather(x[:, -1, :], indices=DATA['target_column_idx'], axis=1), axis=1),\n", 517 | " repeats=TRAIN['output_length'],\n", 518 | " axis=1))(input)\n", 519 | "output = Reshape(y_train.shape[1:])(x)\n", 520 | "model = Model(input, output)\n", 521 | "model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), metrics=tf.keras.metrics.MAPE)\n", 522 | "model.summary()" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 19, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',\n", 532 | " patience=2,\n", 533 | " verbose=0,\n", 534 | " mode='min',\n", 535 | " restore_best_weights=True)\n", 536 | "history = model.fit(X_train,\n", 537 | " y_train,\n", 538 | " epochs=50,\n", 539 | " batch_size=16,\n", 540 | " validation_split=0.3,\n", 541 | " callbacks=[es],\n", 542 | " verbose=0)" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 29, 548 | "metadata": {}, 549 | "outputs": [ 550 | { 551 | "data": { 552 | "text/plain": [ 553 | "3.0535266" 554 | ] 555 | }, 556 | "execution_count": 29, 557 | "metadata": {}, 558 | "output_type": "execute_result" 559 | } 560 | ], 561 | "source": [ 562 | "from ts_boilerplate.metrics import mape\n", 563 | "\n", 564 | "y_pred = model.predict(X_test)\n", 565 | "mape(y_test, y_pred)\n" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "## 4) `main.py`\n" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "### 4.1) `train()`" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 10, 585 | "metadata": {}, 586 | "outputs": [ 587 | { 588 | "name": "stdout", 589 | "output_type": "stream", 590 | "text": [ 591 | "### Test Metric: 3.0535274\n" 592 | ] 593 | } 594 | ], 595 | "source": [ 596 | "data = generate_data_monotonic_increase()\n", 597 | "data_train, data_test = train_test_split(data, **TRAIN)\n", 598 | "X_train, y_train = get_X_y(data_train, **TRAIN)\n", 599 | "X_test, y_test = get_X_y(data_test, **TRAIN)\n", 600 | "model = get_model(X_train, y_train)\n", 601 | "history = fit_model(model, X_train, y_train)\n", 602 | "y_pred = predict_output(model, X_test)\n", 603 | "metrics_test = mape(y_test, y_pred)\n", 604 | "\n", 605 | "print(\"### Test Metric: \", metrics_test)" 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": {}, 611 | "source": [ 612 | "### 4.2) cross_validate()" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "metadata": {}, 625 | "source": [ 626 | "### 4.1) `backtesting()`" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 20, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "y_pred_backtest = []" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 21, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "data = generate_data_monotonic_increase()\n", 
645 | "from ts_boilerplate.model import get_model, fit_model, predict_output\n", 646 | "from ts_boilerplate.dataprep import get_Xi_yi" 647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": 25, 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [ 655 | "stride = 10\n", 656 | "start_ratio:float = 0.8\n", 657 | "retrain: bool = True\n", 658 | "retrain_every: int = 50" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 30, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "application/vnd.jupyter.widget-view+json": { 669 | "model_id": "f1761b545b3340529fd4939039a0cb46", 670 | "version_major": 2, 671 | "version_minor": 0 672 | }, 673 | "text/plain": [ 674 | " 0%| | 0/10 [00:00" 755 | ] 756 | }, 757 | "metadata": { 758 | "needs_background": "light" 759 | }, 760 | "output_type": "display_data" 761 | } 762 | ], 763 | "source": [ 764 | "# TODO: make it work for any dimension of y\n", 765 | "plt.plot(y_pred_backtested[:,0,0], label='historical forecasts')\n", 766 | "plt.plot(y_test_backtested[:,0,0], label='truth')\n", 767 | "plt.xlabel('timesteps')\n", 768 | "plt.legend()\n", 769 | "plt.show()" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "metadata": {}, 776 | "outputs": [], 777 | "source": [] 778 | } 779 | ], 780 | "metadata": { 781 | "interpreter": { 782 | "hash": "572b4e543617d03e90ecaf525e08695da1ff29b13594f787e33b342cf572f792" 783 | }, 784 | "kernelspec": { 785 | "display_name": "Python 3 (ipykernel)", 786 | "language": "python", 787 | "name": "python3" 788 | }, 789 | "language_info": { 790 | "codemirror_mode": { 791 | "name": "ipython", 792 | "version": 3 793 | }, 794 | "file_extension": ".py", 795 | "mimetype": "text/x-python", 796 | "name": "python", 797 | "nbconvert_exporter": "python", 798 | "pygments_lexer": "ipython3", 799 | "version": "3.8.12" 800 | }, 801 | "toc": { 802 | "base_numbering": 1, 803 | "nav_menu": {}, 804 | "number_sections": false, 805 | "sideBar": true, 806 | "skip_h1_title": false, 807 | "title_cell": "Table of Contents", 808 | "title_sidebar": "Contents", 809 | "toc_cell": false, 810 | "toc_position": {}, 811 | "toc_section_display": true, 812 | "toc_window_display": false 813 | } 814 | }, 815 | "nbformat": 4, 816 | "nbformat_minor": 2 817 | } 818 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | slow: marks tests as slow (deselect with '-m "not slow"') 4 | optional: marks tests as optional (deselect with '-m "not optional"') 5 | addopts = -v -s --color=yes -W ignore::DeprecationWarning 6 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/time-series-cross-validator/requirements.txt -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | with open('requirements.txt') as f: 5 | content = f.readlines() 6 | 
requirements = [x.strip() for x in content if 'git+' not in x] 7 | 8 | setup(name='ts-boilerplate', 9 | version="1.0", 10 | description="Trainer Boilerplate for Time Series Forecast Models with Cross Validation", 11 | packages=find_packages(), 12 | install_requires=requirements, 13 | test_suite='tests', 14 | # include_package_data: to install data from MANIFEST.in 15 | include_package_data=True, 16 | zip_safe=False) 17 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/project-boilerplates/time-series-cross-validator/tests/__init__.py -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from ts_boilerplate.generate_dummy_data import generate_data_monotonic_increase, generate_data_zeros_and_ones, generate_X_y_zeros_and_ones 4 | from typing import Tuple 5 | 6 | @pytest.fixture(scope="session") 7 | def data_monotonic_increase() -> np.ndarray: 8 | return generate_data_monotonic_increase() 9 | 10 | @pytest.fixture(scope="session") 11 | def data_zeros_and_ones() -> np.ndarray: 12 | return generate_data_zeros_and_ones() 13 | 14 | 15 | @pytest.fixture(scope="session") 16 | def X_y_zeros_and_ones() -> Tuple[np.ndarray]: 17 | return generate_X_y_zeros_and_ones() 18 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/tests/integrated/test_main.py: -------------------------------------------------------------------------------- 1 | """Tests that main route run without raising exceptions""" 2 | 3 | import pytest 4 | from ts_boilerplate.main import backtest, train, cross_validate 5 | 6 | @pytest.mark.slow 7 | def test_main_route_train(data_monotonic_increase): 8 | train(data_monotonic_increase) 9 | 10 | @pytest.mark.slow 11 | def test_main_route_cross_validate(data_monotonic_increase): 12 | cross_validate(data_monotonic_increase) 13 | 14 | @pytest.mark.slow 15 | def test_backtest(data_monotonic_increase): 16 | backtest(data_monotonic_increase, print_metrics=False, plot_metrics=False) 17 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/tests/integrated/test_model_performance.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ts_boilerplate.main import train 3 | 4 | 5 | @pytest.mark.optional 6 | @pytest.mark.slow 7 | def test_model_can_fit_well_enough_on_dummy_dataset(data_zeros_and_ones): 8 | """Check that the model can fit, with MAPE lower than some threshold on dummy dataset of zeros and ones""" 9 | 10 | metrics = train(data_zeros_and_ones) 11 | #print("#### metrics on dummy dataset ", metrics) 12 | 13 | assert metrics < 5, "your model does not seem to be able to fit well enough even a very easy dataset" 14 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/tests/unittests/test_data.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from 
ts_boilerplate.params import TRAIN, DATA 3 | from ts_boilerplate.main import get_X_y 4 | from ts_boilerplate.main import train_test_split 5 | import numpy as np 6 | import math 7 | 8 | 9 | # These tests make use of the fixture `data_monotonic_increase` stored in tests/conftest.py (pytest magic under the hood) 10 | def test_get_X_y_returns_correct_shapes(data_monotonic_increase): 11 | """Test that X and y have the correct shapes (excluding sample size), as per the project setup defined in `params.py` 12 | """ 13 | X, y = get_X_y(data_monotonic_increase, **TRAIN) 14 | 15 | # Check that X and y have the correct lengths (in time) and depth (in number of covariates) 16 | assert X.ndim == 3 17 | assert X.shape[1] == TRAIN['input_length'] 18 | assert X.shape[2] == DATA['n_covariates'] + DATA[ 19 | 'n_targets'], "Did you forget to include your past target values as features?" 20 | 21 | y_should_be_3D = TRAIN['output_length'] > 1 and DATA["n_targets"] > 1 22 | y_should_be_1D = TRAIN['output_length'] == 1 and DATA["n_targets"] == 1 23 | if y_should_be_3D: 24 | assert y.ndim == 3 25 | assert y.shape[1] == TRAIN['output_length'] 26 | assert y.shape[2] == DATA['n_targets'] 27 | elif y_should_be_1D: 28 | assert y.ndim == 1 29 | else: 30 | assert y.ndim == 2 31 | assert y.shape[1] == TRAIN['output_length'] if DATA['n_targets'] == 1 else DATA['n_targets'] 32 | 33 | 34 | @pytest.mark.optional 35 | @pytest.mark.skipif(TRAIN['stride'] is None, reason="Optional test only applicable if a sliding method is used in get_X_y") 36 | def test_optional_get_X_y_returns_optimal_sample_size(data_monotonic_increase): 37 | """If get_X_y uses a stride method, check that X and y each contain the optimal number of samples 38 | """ 39 | X, y = get_X_y(data_monotonic_increase, **TRAIN) 40 | 41 | # Complex formula below reverse-engineered from `create_dummy_tests.ipynb` 42 | expected_len = math.ceil( 43 | (len(data_monotonic_increase) \ 44 | - (TRAIN['input_length'] -1) \ 45 | - (TRAIN['output_length'] -1) \ 46 | - TRAIN['horizon'] 47 | ) / TRAIN["stride"] 48 | ) 49 | assert len(X) == expected_len, "you may not have generated the optimal number of samples, given the chosen stride" 50 | assert len(y) == expected_len, "you may not have generated the optimal number of samples, given the chosen stride" 51 | 52 | def test_no_data_leak(data_monotonic_increase): 53 | """Test that the time gap between the last timestep of `y_train` and the first timestep of `y_test` 54 | is at least as large as the forecast horizon 55 | according to 'https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-3.png' 56 | """ 57 | 58 | data_train, data_test = train_test_split(data_monotonic_increase, **TRAIN) 59 | X_train, y_train = get_X_y(data_train, shuffle=False, **TRAIN) 60 | X_test, y_test = get_X_y(data_test, shuffle=False, **TRAIN) 61 | 62 | y_train_last_seen_timestep = np.max(y_train) # OR y_train[-1].flat[-1] 63 | y_test_first_seen_timestep = np.min(y_test) # OR y_test[0].flat[0] 64 | gap = y_test_first_seen_timestep - y_train_last_seen_timestep 65 | # Note: for stride = 1, the inequality below must be an exact equality, but we don't need to test that to ensure no data leak.
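    # e.g. with the default params.py values (horizon=4) and the monotonic dummy data (whose values
    # equal their timestep index), the first y_test value must be at least 4 timesteps greater
    # than the last y_train value: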
66 | assert gap >= TRAIN["horizon"], "❗️❗️ Data leak detected between (X_train, y_train) and (X_test, y_test)❗️❗️ " 67 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/tests/unittests/test_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ts_boilerplate.model import fit_model, get_model, predict_output 3 | 4 | def test_model_has_correct_output_shape(X_y_zeros_and_ones): 5 | X, y = X_y_zeros_and_ones 6 | model = get_model(X,y) 7 | y_pred = predict_output(model, X) 8 | assert y_pred.shape == y.shape 9 | 10 | @pytest.mark.slow 11 | def test_model_can_fit(X_y_zeros_and_ones): 12 | """Check that the model can fit without crashing""" 13 | X, y = X_y_zeros_and_ones 14 | model = get_model(X,y) 15 | fit_model(model, X, y, verbose=0) 16 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/ts_boilerplate/__init__.py: -------------------------------------------------------------------------------- 1 | from os.path import isfile 2 | from os.path import dirname 3 | 4 | version_file = '{}/version.txt'.format(dirname(__file__)) 5 | 6 | if isfile(version_file): 7 | with open(version_file) as version_file: 8 | __version__ = version_file.read().strip() 9 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/ts_boilerplate/dataprep.py: -------------------------------------------------------------------------------- 1 | """Prepare data so that it can be used in a pipelined ML model""" 2 | 3 | import numpy as np 4 | from ts_boilerplate.params import DATA 5 | from typing import Tuple, List 6 | 7 | 8 | 9 | def load_data(data_path: str) -> np.ndarray: 10 | """Load data from `data_path` into memory 11 | Returns a 2D array with (axis 0) representing timesteps, and (axis 1) columns containing targets and covariates 12 | ref: https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true 13 | """ 14 | # YOUR_CODE_HERE 15 | pass 16 | 17 | 18 | def clean_data(data: np.ndarray) -> np.ndarray: 19 | """Clean data without creating data leakage: 20 | - make sure there are no NaNs left at any timestep 21 | - etc... 22 | """ 23 | # YOUR_CODE_HERE 24 | pass 25 | 26 | 27 | def get_X_y( 28 | data: np.ndarray, 29 | input_length: int, 30 | output_length: int, 31 | horizon: int, 32 | stride: int, 33 | shuffle=True, 34 | **kwargs, 35 | ) -> Tuple[np.ndarray, np.ndarray]: 36 | """ 37 | Use `data`, a 2D-array with axis=0 as timesteps, and axis=1 as (targets+covariates columns) 38 | 39 | Returns a Tuple (X,y) of two ndarrays : 40 | X.shape = (n_samples, input_length, n_covariates + n_targets) 41 | y.shape = 42 | (n_samples, output_length, n_targets) if all 3 dimensions are of size > 1 43 | (n_samples, output_length) if n_targets == 1 44 | (n_samples, n_targets) if output_length == 1 45 | (n_samples, ) if both n_targets and output_length == 1 46 | 47 | ❗️ Raise error if data contains NaN 48 | ❗️ Make sure to shuffle the pairs in unison if `shuffle=True` for i.i.d. purposes 49 | ❗️ Don't ditch past values of your target time-series from your features - they are very useful features!
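    e.g. with the default params.py values (data of shape (500, 5), input_length=10, output_length=7, horizon=4, stride=1, n_targets=2), this yields X.shape == (481, 10, 5) and y.shape == (481, 7, 2)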
50 | 👉 illustration: https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-1.png 51 | 52 | [💡 Hints ] You can use a sliding method 53 | - Reading `data` in ascending order 54 | - moving `stride` timesteps at a time 55 | Feel free to use another approach, for example random sampling without replacement 56 | 57 | """ 58 | # $CHALLENGIFY_BEGIN 59 | assert np.isnan(data).sum() == 0 60 | 61 | X = [] 62 | y = [] 63 | 64 | for i in range(0, len(data), stride): 65 | Xi, yi = get_Xi_yi(first_index=i, 66 | data=data, 67 | horizon=horizon, 68 | input_length=input_length, 69 | output_length=output_length) 70 | # Exit loop as soon as we reach the end of the dataset 71 | if len(yi) < output_length: 72 | break 73 | X.append(Xi) 74 | y.append(yi) 75 | 76 | X = np.array(X) 77 | y = np.array(y) 78 | y = np.squeeze(y) 79 | if shuffle: 80 | idx = np.arange(len(X)) 81 | np.random.shuffle(idx) 82 | X = X[idx] 83 | y = y[idx] 84 | 85 | return X, y 86 | # $CHALLENGIFY_END 87 | 88 | 89 | # $DELETE_BEGIN 90 | def get_Xi_yi(first_index, 91 | data, 92 | horizon, 93 | input_length, 94 | output_length, 95 | **kwargs): 96 | X_start = first_index 97 | X_last = X_start + input_length 98 | y_start = X_last + horizon - 1 99 | y_last = y_start + output_length 100 | 101 | Xi = data[X_start:X_last] 102 | yi = data[y_start:y_last, DATA['target_column_idx']] 103 | return (Xi, yi) 104 | # $DELETE_END 105 | 106 | 107 | def get_folds(data: np.ndarray, 108 | fold_length: int, 109 | fold_stride: int, 110 | **kwargs) -> List[np.ndarray]: 111 | """Slide through the `data` time series (2D array) to create folds of equal `fold_length`, using `fold_stride` between each fold 112 | Returns a list of folds, each as a 2D-array time series 113 | """ 114 | # $CHALLENGIFY_BEGIN 115 | folds = [] 116 | for i in range(0, len(data), fold_stride): 117 | # Exit loop as soon as last fold value would exceed last data value 118 | if (i + fold_length) > len(data): 119 | break 120 | fold = data[i:i + fold_length, :] 121 | folds.append(fold) 122 | return folds 123 | # $CHALLENGIFY_END 124 | 125 | 126 | def train_test_split(data: np.ndarray, 127 | train_test_ratio: float, 128 | input_length: int, 129 | **kwargs) -> Tuple[np.ndarray, np.ndarray]: 130 | """Returns train and test 2D-arrays that will not create any data leakage when sampling (X, y) from them 131 | Inspired by "https://raw.githubusercontent.com/lewagon/data-images/master/DL/rnn-3.png" 132 | """ 133 | # $CHALLENGIFY_BEGIN 134 | last_train_idx = round(train_test_ratio * len(data)) 135 | data_train = data[0:last_train_idx, :] 136 | 137 | # [here is the key to no data leak] 138 | # The last idx of the first X_test must be equal to the last idx of the last y_train.
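    # e.g. with the default params.py values (len(data)=500, train_test_ratio=0.7, input_length=10):
    # last_train_idx = 350, so data_test starts at index 340 and the first X_test spans indices
    # [340:350), keeping every y_test strictly after the last y_train.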
139 | # This corresponds to day n°10 in the picture rnn-3.png 140 | first_test_idx = last_train_idx - input_length 141 | data_test = data[first_test_idx:, :] 142 | 143 | return (data_train, data_test) 144 | # $CHALLENGIFY_END 145 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/ts_boilerplate/generate_dummy_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ts_boilerplate.params import CROSS_VAL, DATA, TRAIN 3 | from typing import Tuple 4 | 5 | def generate_data_monotonic_increase() -> np.ndarray: 6 | """Creates a monotonically increasing time series dataset for test purposes 7 | - shape is (DATA['length'], DATA['n_covariates'] + DATA['n_targets']), 8 | - values are all equal to their respective integer index! 9 | 10 | e.g.: 11 | data = array( 12 | [[ 0., 0., 0., 0., 0.], 13 | [ 1., 1., 1., 1., 1.], 14 | ..., 15 | [998., 998., 998., 998., 998.], 16 | [999., 999., 999., 999., 999.]] 17 | ) 18 | 19 | """ 20 | 21 | indexes = np.arange(0, DATA['length']) 22 | data = np.zeros((DATA['length'], DATA['n_covariates'] + DATA['n_targets'])) \ 23 | + np.expand_dims(indexes, axis=1) 24 | return data 25 | 26 | def generate_data_zeros_and_ones() -> np.ndarray: 27 | """Create a dummy dataset made of zeros for the covariates, and ones for the targets 28 | e.g.: 29 | data = array( 30 | [[1.,1.,0.,0.,0.], 31 | [1.,1.,0.,0.,0.], 32 | ..., 33 | [1.,1.,0.,0.,0.], 34 | [1.,1.,0.,0.,0.]] 35 | ) 36 | """ 37 | shape = (DATA['length'], DATA['n_covariates'] + DATA['n_targets']) 38 | data = np.zeros(shape) 39 | data[:, DATA["target_column_idx"]] = 1. 40 | return data 41 | 42 | def generate_X_y_zeros_and_ones() -> Tuple[np.ndarray]: 43 | """Create a dummy (X,y) tuple made of zeros for the covariates, and ones for the targets, just to check that the model can fit well""" 44 | length = round(DATA["length"] / TRAIN['stride']) 45 | 46 | shape_X = (length, TRAIN['input_length'], DATA['n_covariates']+DATA['n_targets']) 47 | X = np.zeros(shape_X) 48 | X[:, :, DATA["target_column_idx"]] = 1. 49 | 50 | shape_y = (length, TRAIN['output_length'], DATA['n_targets']) 51 | y = np.ones(shape_y) 52 | y = np.squeeze(y) 53 | 54 | return (X,y) 55 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/ts_boilerplate/main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Top-level orchestrator of the project. To be called from the CLI.
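e.g. `python ts_boilerplate/main.py` (assuming a dataset has been saved under data/raw/data.csv, as loaded in the __main__ block below)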
3 | It comprises all the "routes" you may want to call 4 | ''' 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import os 9 | from ts_boilerplate.dataprep import get_Xi_yi, get_X_y, get_folds, train_test_split 10 | from ts_boilerplate.model import get_model, fit_model, predict_output 11 | from ts_boilerplate.metrics import mape, mae 12 | from ts_boilerplate.params import CROSS_VAL, ROOT_DIR, TRAIN, DATA 13 | from typing import Tuple, List 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def train(data: np.ndarray, print_metrics: bool = False): 18 | """ 19 | Train the model in this package on one fold `data`, a 2D-array containing the time series of your problem 20 | Returns the `metrics_test` associated with the training 21 | """ 22 | # $CHALLENGIFY_BEGIN 23 | data_train, data_test = train_test_split(data, **TRAIN) 24 | X_train, y_train = get_X_y(data_train, **TRAIN) 25 | X_test, y_test = get_X_y(data_test, **TRAIN) 26 | model = get_model(X_train, y_train) 27 | history = fit_model(model, X_train, y_train) 28 | y_pred = predict_output(model, X_test) 29 | metrics_test = mae(y_test, y_pred) 30 | if print_metrics: 31 | print("### Test Metric: ", metrics_test) 32 | return metrics_test 33 | # $CHALLENGIFY_END 34 | 35 | 36 | def cross_validate(data: np.ndarray, print_metrics: bool = False): 37 | """ 38 | Cross-validate the model in this package on `data` 39 | Returns `metrics_cv`: the list of test metrics for each fold 40 | """ 41 | # $CHALLENGIFY_BEGIN 42 | folds = get_folds(data, **CROSS_VAL) 43 | metrics_cv = [] 44 | for fold in folds: 45 | metrics_fold = train(fold, print_metrics=print_metrics) 46 | metrics_cv.append(metrics_fold) 47 | 48 | if print_metrics: 49 | print(f"### CV metrics after {len(folds)} folds ### ") 50 | print(metrics_cv) 51 | return metrics_cv 52 | # $CHALLENGIFY_END 53 | 54 | 55 | def backtest(data: np.ndarray, 56 | stride: int = 1, 57 | start_ratio: float = 0.9, 58 | retrain: bool = True, 59 | retrain_every: int = 1, 60 | print_metrics=False, 61 | plot_metrics=False): 62 | """Returns historical forecasts for the entire dataset 63 | - by training the model up to `start_ratio` of the dataset 64 | - then predicting the next values using the model in this package 65 | - then moving `stride` timesteps ahead 66 | - then retraining the model if `retrain` is True and `retrain_every` timesteps have passed since the last training 67 | - then predicting the next values again 68 | 69 | Return: 70 | - all historical predictions, a 2D-array time series of roughly (1 - start_ratio) * len(data) / stride timesteps, with n_targets columns 71 | - the mean error metric (MAE) computed over these historical predictions 72 | - prints/plots the historical predictions if you want a visual check 73 | 74 | see https://unit8co.github.io/darts/generated_api/darts.models.forecasting.rnn_model.html#darts.models.forecasting.rnn_model.RNNModel.historical_forecasts 75 | """ 76 | # $CHALLENGIFY_BEGIN 77 | 78 | # Initialization 79 | start_timestep_0 = round(start_ratio * len(data)) 80 | data_train_0 = data[:start_timestep_0, ...] 81 | X_train_tmp, y_train_tmp = get_X_y(data_train_0, **TRAIN) 82 | data_test_backtested = data[start_timestep_0:, ...] 83 | _, y_test = get_X_y(data_test_backtested, **TRAIN, shuffle=False) 84 | y_pred_backtested = [] 85 | retrain_counter = 0 86 | timesteps_backtested_list = [] 87 | for i in range(0, len(data_test_backtested), stride): 88 | start_timestep_i = start_timestep_0 + i 89 | data_train = data[:start_timestep_i, ...]
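        # Walk-forward split: everything before `start_timestep_i` is (re)training history;
        # everything from `start_timestep_i` onwards is the still-unseen future for this iteration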
90 | data_test = data[start_timestep_i:, ...] 91 | X_train_tmp, y_train_tmp = get_X_y(data_train, **TRAIN) 92 | X_test_i, y_test_i = get_Xi_yi(first_index=0, data=data_test, **TRAIN) 93 | 94 | # At some point after sliding through time, we will reach the end of the test set 95 | if y_test_i.shape[0] < y_train_tmp.shape[1]: 96 | break 97 | 98 | model = get_model(X_train_tmp, y_train_tmp) 99 | 100 | # Retrain when required (note: the model is re-instantiated above, so each retrain re-fits from scratch on all the data seen so far) 101 | if retrain and i % retrain_every == 0: 102 | retrain_counter += 1 103 | fit_model(model, X_train_tmp, y_train_tmp) 104 | 105 | y_pred_i = np.squeeze(predict_output(model, X_test_i[None, ...])) 106 | y_pred_backtested.append(y_pred_i) 107 | timesteps_backtested_list.append(i) 108 | 109 | y_pred_backtested = np.array(y_pred_backtested) 110 | y_test_backtested = y_test[timesteps_backtested_list] 111 | # Check that we compare apples to apples 112 | assert y_pred_backtested.shape == y_test_backtested.shape 113 | 114 | metrics_backtested = mae(y_test_backtested, y_pred_backtested) 115 | 116 | if print_metrics: 117 | print( 118 | f'### BACKTESTED METRICS BASED ON THE LAST {y_pred_backtested.shape[0]} TIMESTEPS AND WITH {retrain_counter} retrain operations' 119 | ) 120 | print(metrics_backtested) 121 | if plot_metrics: 122 | # TODO: make it work for any dimension of y 123 | plt.plot(y_pred_backtested[:,0,0], label='historical forecasts') 124 | plt.plot(y_test_backtested[:,0,0], label='truth') 125 | plt.xlabel('timestep number (0 = beginning of backtest)') 126 | plt.legend() 127 | plt.show() 128 | 129 | return metrics_backtested 130 | # $CHALLENGIFY_END 131 | 132 | if __name__ == '__main__': 133 | data = pd.read_csv(os.path.join(ROOT_DIR, 'data','raw','data.csv')).to_numpy() 134 | try: 135 | train(data=data, print_metrics=True) 136 | cross_validate(data=data, print_metrics=True) 137 | backtest(data=data, 138 | stride = 1, 139 | start_ratio = 0.9, 140 | retrain = True, 141 | retrain_every=1, 142 | print_metrics=True, 143 | plot_metrics=True) 144 | except: 145 | import ipdb, traceback, sys 146 | extype, value, tb = sys.exc_info() 147 | traceback.print_exc() 148 | ipdb.post_mortem(tb) 149 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/ts_boilerplate/metrics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Computes useful time series metrics from (y_true, y_pred) 3 | ''' 4 | 5 | import numpy as np 6 | from tensorflow import reduce_mean 7 | from tensorflow.keras.metrics import mean_absolute_error, mean_absolute_percentage_error 8 | 9 | 10 | def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float: 11 | """Returns Mean Absolute Error""" 12 | # $CHALLENGIFY_BEGIN 13 | return reduce_mean(mean_absolute_error(y_true, y_pred)).numpy() 14 | # $CHALLENGIFY_END 15 | 16 | def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float: 17 | """Returns Mean Absolute Percentage Error""" 18 | # $CHALLENGIFY_BEGIN 19 | return reduce_mean(mean_absolute_percentage_error(y_true, y_pred)).numpy() 20 | # $CHALLENGIFY_END 21 | 22 | def mase(y_true: np.ndarray, y_pred: np.ndarray) -> float: 23 | """Returns Mean Absolute Scaled Error (https://en.wikipedia.org/wiki/Mean_absolute_scaled_error) 24 | """ 25 | pass 26 | 27 | 28 | def play_trading_strategy(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray: 29 | """Returns the array of relative portfolio values over the test period""" 30 |
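    # A minimal sketch, not part of the boilerplate (assumes 1D y_true/y_pred of aligned prices):
    # go long when the prediction exceeds the last observed value, then compound the realised moves
    #   positions = np.sign(y_pred[1:] - y_true[:-1])
    #   step_returns = positions * (y_true[1:] - y_true[:-1]) / y_true[:-1]
    #   relative_portfolio_values = np.cumprod(1 + step_returns)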
pass 31 | 32 | 33 | def return_on_investment(played_trading_strategy: np.ndarray) -> float: 34 | """Returns the ROI of an investment strategy""" 35 | pass 36 | 37 | 38 | def sharpe_ratio(played_trading_strategy: np.ndarray) -> float: 39 | """Returns the Sharpe Ratio (Return on Investment / Volatility) of an investment strategy""" 40 | pass 41 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/ts_boilerplate/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Dense, SimpleRNN, Reshape, Lambda, Input 3 | from tensorflow.keras import Model 4 | from ts_boilerplate.params import DATA, TRAIN 5 | 6 | # TODO: Should we add the preprocessing here, into a class called "pipeline"? 7 | # TODO: Should we refactor this into a class? Probably! 8 | 9 | 10 | def get_model(X_train, y_train): 11 | """Instantiate, compile and return the model of your choice""" 12 | # $CHALLENGIFY_BEGIN 13 | 14 | # BASELINE: PREDICT LAST VALUE - ZERO TRAINABLE WEIGHTS 15 | input = Input(shape=X_train.shape[1:]) 16 | # Take the last temporal values of the targets, and duplicate them as many times as `output_length` 17 | x = Lambda( 18 | lambda x: tf.repeat( 19 | tf.expand_dims(tf.gather(x[:, -1, :], indices=DATA['target_column_idx'], axis=1), axis=1), 20 | repeats=TRAIN['output_length'], 21 | axis=1) 22 | )(input) 23 | output = Reshape(y_train.shape[1:])(x) 24 | model = Model(input, output) 25 | 26 | # # THE SIMPLEST OF ALL POSSIBLE RNNs 27 | # model = tf.keras.Sequential() 28 | # model.add(SimpleRNN(1, activation='tanh', input_shape=X_train.shape[1:])) 29 | # model.add(Dense(TRAIN['output_length'] * DATA["n_targets"], activation='linear')) 30 | # model.add(Reshape(y_train.shape[1:])) 31 | # model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), metrics=tf.keras.metrics.MAPE) 32 | 33 | model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.1), metrics=tf.keras.metrics.MAPE) 34 | return model 35 | # $CHALLENGIFY_END 36 | 37 | 38 | def fit_model(model, X_train, y_train, **kwargs): 39 | """Fit the `model` object, including preprocessing if need be""" 40 | # $CHALLENGIFY_BEGIN 41 | verbose = kwargs.get("verbose", 0) 42 | es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 43 | patience=2, 44 | verbose=verbose, 45 | mode='min', 46 | restore_best_weights=True) 47 | history = model.fit(X_train, 48 | y_train, 49 | epochs=50, 50 | batch_size=16, 51 | validation_split=0.3, 52 | callbacks=[es], 53 | verbose=verbose) 54 | return history 55 | # $CHALLENGIFY_END 56 | 57 | 58 | def predict_output(model, X_test): 59 | """Return y_pred.
Include preprocessing if need be""" 60 | # $CHALLENGIFY_BEGIN 61 | y_pred = model.predict(X_test) 62 | return y_pred 63 | # $CHALLENGIFY_END 64 | -------------------------------------------------------------------------------- /project-boilerplates/time-series-cross-validator/ts_boilerplate/params.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | ## CREDENTIALS AND PATHS 5 | load_dotenv() 6 | API_KEY = os.getenv('API_KEY') 7 | 8 | ## DIR PARAMS 9 | ROOT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) 10 | DATA_RAW_CSV_PATH = os.path.join(ROOT_DIR, 'data', 'raw', 'data.csv') 11 | 12 | # 👇 Please fill in these global variables below very carefully, in order to create tests related to your problem👇 13 | # cf: https://github.com/lewagon/data-images/blob/master/DL/time-series-covariates.png?raw=true 14 | DATA = dict( 15 | length = 500, # How many timesteps does your dataset contain? 16 | n_covariates = 3, # number of past covariates, excluding target time series. Our tests do not support future_covariate yet. 17 | target_column_idx = [0,1] # List of index(es) of target column(s) in your dataset. e.g. [0] for a mono-target problem, [0,1,4] for a multivariate-target problem. Note that past target values will also be used as features X. 18 | ) 19 | DATA['n_targets'] = len(DATA['target_column_idx']) # number of target time series to predict. 20 | 21 | TRAIN = dict( 22 | horizon = 4, # start predicting this many timesteps ahead 23 | input_length = 10, # Length (in time) of each sequence that will be seen by the model (X.shape[1]) 24 | output_length = 7, # Length (in time) of prediction (y.shape[1]) 25 | stride = 1, # Integer used to create all the (Xi, yi) sample pairs by sliding through each data fold.
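(e.g. stride=1 creates maximally overlapping samples, while stride=input_length creates disjoint ones.)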
Use `None` if you don't plan to use any sliding method in dataprep.get_X_y 26 | train_test_ratio = 0.7, # ratio of train / (train+test) length in each fold 27 | ) 28 | 29 | CROSS_VAL = dict( 30 | fold_length = 200, 31 | fold_stride = 100, 32 | ) 33 | -------------------------------------------------------------------------------- /tutorials/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/tutorials/.keep -------------------------------------------------------------------------------- /tutorials/removing-bottlenecks/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/tutorials/removing-bottlenecks/model.png -------------------------------------------------------------------------------- /tutorials/removing-bottlenecks/row_column_wise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lewagon/data-templates/bd824a2a6f83f5cd7da6931189abefdedac843c2/tutorials/removing-bottlenecks/row_column_wise.png -------------------------------------------------------------------------------- /tutorials/removing-bottlenecks/slides.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f5e4ca46-23e7-4bc6-a49f-8ed36e611c43", 7 | "metadata": { 8 | "slideshow": { 9 | "slide_type": "skip" 10 | }, 11 | "tags": [] 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "from numba import jit, vectorize, float64\n", 18 | "from time import perf_counter as counter" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "70135331-91db-4ae6-b7ee-9ac0b1901e97", 24 | "metadata": { 25 | "slideshow": { 26 | "slide_type": "slide" 27 | }, 28 | "tags": [] 29 | }, 30 | "source": [ 31 | "# Removing bottlenecks with Numba, Cython, and TensorFlow" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "5c9f0890-bb38-4760-a74b-091f2eb8ea67", 37 | "metadata": { 38 | "slideshow": { 39 | "slide_type": "subslide" 40 | }, 41 | "tags": [] 42 | }, 43 | "source": [ 44 | "## Topics" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "af960d3e-7c57-4211-9baf-1a5d817e7dd8", 50 | "metadata": { 51 | "slideshow": { 52 | "slide_type": "fragment" 53 | }, 54 | "tags": [] 55 | }, 56 | "source": [ 57 | "1. Numba to speed up simple operations and create vectorising functions.\n", 58 | "2. When and how to use Cython in its simplest form.\n", 59 | "3. How to include these functions in your packages.\n", 60 | "4. TensorFlow feature engineering.\n", 61 | "5. Quick win for TensorFlow speed in prediction." 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "021cbdcd-b78b-46eb-8646-e4ee1779e7a8", 67 | "metadata": { 68 | "slideshow": { 69 | "slide_type": "slide" 70 | }, 71 | "tags": [] 72 | }, 73 | "source": [ 74 | "## 1.
Numba" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "12b25089-e0d6-4252-a8ec-0a795b86ed81", 80 | "metadata": { 81 | "slideshow": { 82 | "slide_type": "subslide" 83 | }, 84 | "tags": [] 85 | }, 86 | "source": [ 87 | "### Introduction" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "id": "3e9c5391-e3d7-4b31-ab50-c2faef528ae5", 93 | "metadata": { 94 | "slideshow": { 95 | "slide_type": "fragment" 96 | }, 97 | "tags": [] 98 | }, 99 | "source": [ 100 | "- Library to translate python code into fast machine code.\n", 101 | "- Designed specifically for compatibility with numpy.\n", 102 | "- Provides just in time compilation." 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "3a6054ca-30b9-4737-a3df-080e2b4e4582", 108 | "metadata": { 109 | "slideshow": { 110 | "slide_type": "subslide" 111 | }, 112 | "tags": [] 113 | }, 114 | "source": [ 115 | "### Demo" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "c6a21db2-b420-4fa9-b188-b5935ae8250c", 121 | "metadata": { 122 | "slideshow": { 123 | "slide_type": "subslide" 124 | }, 125 | "tags": [] 126 | }, 127 | "source": [ 128 | "### Pros v Cons" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "id": "2b7a4395-3ff0-45ca-b205-c8d257fb8af0", 134 | "metadata": { 135 | "slideshow": { 136 | "slide_type": "fragment" 137 | }, 138 | "tags": [] 139 | }, 140 | "source": [ 141 | "Pros\n", 142 | "- Easy to implement in many cases.\n", 143 | "- Significant speed boosts.\n", 144 | "- Suited to a lot of data processing needs in data science.\n", 145 | "\n", 146 | "Cons\n", 147 | "- Limited scope regarding python libaries.\n", 148 | "- When certain functions either do not work or are not accelerated it is difficult to work out why not?\n", 149 | "- In more complex use cases needing to make all of the functions compatible is a hassle." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "8372b391-2719-4600-98eb-4ae7bf53895d", 155 | "metadata": { 156 | "slideshow": { 157 | "slide_type": "skip" 158 | }, 159 | "tags": [] 160 | }, 161 | "source": [ 162 | "### Speed comparisions" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "e23150e8-7c19-4e49-9836-c23afd9b0bac", 168 | "metadata": { 169 | "slideshow": { 170 | "slide_type": "skip" 171 | }, 172 | "tags": [] 173 | }, 174 | "source": [ 175 | "With loops" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 4, 181 | "id": "e5a4cf2b-8efa-4743-bf04-62ac29375ae7", 182 | "metadata": { 183 | "slideshow": { 184 | "slide_type": "skip" 185 | }, 186 | "tags": [] 187 | }, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/html": [ 192 | "
\n", 193 | "\n", 206 | "\n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | "
d
00.638929
10.649349
20.496717
30.614220
40.260934
\n", 236 | "
" 237 | ], 238 | "text/plain": [ 239 | " d\n", 240 | "0 0.638929\n", 241 | "1 0.649349\n", 242 | "2 0.496717\n", 243 | "3 0.614220\n", 244 | "4 0.260934" 245 | ] 246 | }, 247 | "execution_count": 4, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "data = pd.DataFrame(np.random.uniform(0, 1, 1_000_000).reshape(-1,1))\n", 254 | "data.columns = [\"d\"]\n", 255 | "data.head()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 5, 261 | "id": "f455bf1f-b25c-4d81-890c-8e59d55196c7", 262 | "metadata": { 263 | "slideshow": { 264 | "slide_type": "skip" 265 | }, 266 | "tags": [] 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "test_values = [\"1\", \"10\", \"100\", \"1_000\", \"10_000\", \"100_000\", \"1_000_000\"]" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 6, 276 | "id": "267c9df5-11e3-4657-bd40-0b7c4113aa90", 277 | "metadata": { 278 | "slideshow": { 279 | "slide_type": "skip" 280 | }, 281 | "tags": [] 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "empty_results = [np.nan for i in range(7)]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 7, 291 | "id": "c412aca3-4715-45cf-91eb-724032b4ff87", 292 | "metadata": { 293 | "slideshow": { 294 | "slide_type": "skip" 295 | }, 296 | "tags": [] 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "results = {\n", 301 | " \"Pure Python\": empty_results.copy(),\n", 302 | " \"Jit Operation\": empty_results.copy(),\n", 303 | " \"Jit Apply\": empty_results.copy(),\n", 304 | " \"Jit Loop\": empty_results.copy(),\n", 305 | " \"Vectorize\": empty_results.copy(),\n", 306 | " \"Jit Vectorize\": empty_results.copy()\n", 307 | "}" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "id": "36b81f68-9434-44c4-8aae-fb00b6a9604a", 313 | "metadata": { 314 | "slideshow": { 315 | "slide_type": "skip" 316 | }, 317 | "tags": [] 318 | }, 319 | "source": [ 320 | "Pure python test" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 8, 326 | "id": "27a0b14c-cb01-43ce-a58c-0b7c90f32f74", 327 | "metadata": { 328 | "slideshow": { 329 | "slide_type": "skip" 330 | }, 331 | "tags": [] 332 | }, 333 | "outputs": [ 334 | { 335 | "name": "stdout", 336 | "output_type": "stream", 337 | "text": [ 338 | "Pure Python\n", 339 | "Testing 1\n", 340 | "Testing 10\n" 341 | ] 342 | } 343 | ], 344 | "source": [ 345 | "def operation(x):\n", 346 | " val = 0\n", 347 | " for i in range(1_000):\n", 348 | " for j in range(1_000):\n", 349 | " val += (x * i) - (x * j) \n", 350 | " return val\n", 351 | "\n", 352 | "def pure_python_test(n):\n", 353 | " new = np.zeros(n).reshape(-1,1)\n", 354 | " for i in range(n):\n", 355 | " new[i,:] = operation(data.iloc[i,:][0])\n", 356 | " return new\n", 357 | "\n", 358 | "test = \"Pure Python\"\n", 359 | "print(test)\n", 360 | "for i, val in enumerate(test_values):\n", 361 | " if i >= 2:\n", 362 | " break\n", 363 | " print(f\"Testing {val}\")\n", 364 | " baseline_begin = counter()\n", 365 | " pure_python_test(int(val))\n", 366 | " baseline_end = counter()\n", 367 | " results[test][i] = baseline_end-baseline_begin" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "id": "37ca0ee6-a9e3-476a-af91-23423c857f7e", 373 | "metadata": { 374 | "slideshow": { 375 | "slide_type": "skip" 376 | }, 377 | "tags": [] 378 | }, 379 | "source": [ 380 | "Jit operation test" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 9, 386 | "id": 
"a3ec308d-70e3-4faf-910f-8b75295eaeeb", 387 | "metadata": { 388 | "slideshow": { 389 | "slide_type": "skip" 390 | }, 391 | "tags": [] 392 | }, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "Jit Operation\n", 399 | "Testing 1\n", 400 | "Testing 10\n", 401 | "Testing 100\n", 402 | "Testing 1_000\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "@jit\n", 408 | "def jit_operation(x):\n", 409 | " val = 0\n", 410 | " for i in range(1_000):\n", 411 | " for j in range(1_000):\n", 412 | " val += (x * i) - (x * j) \n", 413 | " return val\n", 414 | "\n", 415 | "def jit_operation_test(n):\n", 416 | " new = np.zeros(n).reshape(-1,1)\n", 417 | " d = data.head(n)\n", 418 | " for i in range(n):\n", 419 | " x = d.iloc[i,:][0]\n", 420 | " val = jit_operation(x) \n", 421 | " new[i,:] = val\n", 422 | " return new\n", 423 | "\n", 424 | "test = \"Jit Operation\"\n", 425 | "print(test)\n", 426 | "for i, val in enumerate(test_values):\n", 427 | " if i >= 4:\n", 428 | " break\n", 429 | " print(f\"Testing {val}\")\n", 430 | " baseline_begin = counter()\n", 431 | " jit_operation_test(int(val))\n", 432 | " baseline_end = counter()\n", 433 | " results[test][i] = baseline_end-baseline_begin" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "id": "483d7e62-5ba2-47e3-aee2-4ce2e0a85103", 439 | "metadata": { 440 | "slideshow": { 441 | "slide_type": "skip" 442 | }, 443 | "tags": [] 444 | }, 445 | "source": [ 446 | "Jit loop test" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 10, 452 | "id": "a6b9b251-63e5-4cac-acd7-c661f447703c", 453 | "metadata": { 454 | "slideshow": { 455 | "slide_type": "skip" 456 | }, 457 | "tags": [] 458 | }, 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "Jit Loop\n", 465 | "Testing 1\n", 466 | "Testing 10\n", 467 | "Testing 100\n", 468 | "Testing 1_000\n", 469 | "Testing 10_000\n", 470 | "Testing 100_000\n", 471 | "Testing 1_000_000\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "@jit\n", 477 | "def jit_operation(x):\n", 478 | " val = 0\n", 479 | " for i in range(1_000):\n", 480 | " for j in range(1_000):\n", 481 | " val += (x * i) - (x * j) \n", 482 | " return val\n", 483 | "\n", 484 | "@jit\n", 485 | "def jit_loop_test(data):\n", 486 | " new = np.zeros(len(data)).reshape(-1,1)\n", 487 | " for i, val in enumerate(data):\n", 488 | " new[i,:] = val\n", 489 | " return new\n", 490 | "\n", 491 | "test = \"Jit Loop\"\n", 492 | "print(test)\n", 493 | "for i, val in enumerate(test_values):\n", 494 | " print(f\"Testing {val}\")\n", 495 | " baseline_begin = counter()\n", 496 | " jit_loop_test(np.array(data.head(int(val))[\"d\"]))\n", 497 | " baseline_end = counter()\n", 498 | " results[test][i] = baseline_end-baseline_begin" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "id": "66838bdc-a519-4899-8a46-61ca73d10f99", 504 | "metadata": { 505 | "jp-MarkdownHeadingCollapsed": true, 506 | "slideshow": { 507 | "slide_type": "skip" 508 | }, 509 | "tags": [] 510 | }, 511 | "source": [ 512 | "Jit apply test" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 11, 518 | "id": "62c520b1-fad5-43bb-b6d0-b2c79c6b8549", 519 | "metadata": { 520 | "slideshow": { 521 | "slide_type": "skip" 522 | }, 523 | "tags": [] 524 | }, 525 | "outputs": [ 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "Jit Apply\n", 531 | "Testing 1\n", 532 | "Testing 10\n", 533 | "Testing 100\n", 534 | "Testing 
1_000\n", 535 | "Testing 10_000\n" 536 | ] 537 | } 538 | ], 539 | "source": [ 540 | "@jit\n", 541 | "def jit_operation(x):\n", 542 | " val = 0\n", 543 | " for i in range(1_000):\n", 544 | " for j in range(1_000):\n", 545 | " val += (x * i) - (x * j) \n", 546 | " return val\n", 547 | "\n", 548 | "def jit_apply_test(n):\n", 549 | " t_d = data.head(n)\n", 550 | " return t_d[\"d\"].apply(jit_operation)\n", 551 | "\n", 552 | "test = \"Jit Apply\"\n", 553 | "print(test)\n", 554 | "for i, val in enumerate(test_values):\n", 555 | " if i >= 5:\n", 556 | " break\n", 557 | " print(f\"Testing {val}\")\n", 558 | " baseline_begin = counter()\n", 559 | " jit_apply_test(int(val))\n", 560 | " baseline_end = counter()\n", 561 | " results[test][i] = baseline_end-baseline_begin" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "id": "35006bc5-6a8e-41df-9b64-3839e74ed232", 567 | "metadata": { 568 | "slideshow": { 569 | "slide_type": "skip" 570 | }, 571 | "tags": [] 572 | }, 573 | "source": [ 574 | "Vectorize test" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 12, 580 | "id": "b9638155-021d-47ea-a061-eb6333bc5afa", 581 | "metadata": { 582 | "slideshow": { 583 | "slide_type": "skip" 584 | }, 585 | "tags": [] 586 | }, 587 | "outputs": [ 588 | { 589 | "name": "stdout", 590 | "output_type": "stream", 591 | "text": [ 592 | "Vectorize\n", 593 | "Testing 1\n", 594 | "Testing 10\n", 595 | "Testing 100\n", 596 | "Testing 1_000\n", 597 | "Testing 10_000\n" 598 | ] 599 | } 600 | ], 601 | "source": [ 602 | "@vectorize([float64(float64)])\n", 603 | "def vectorize_operation(x):\n", 604 | " val = 0\n", 605 | " for i in range(1_000):\n", 606 | " for j in range(1_000):\n", 607 | " val += (x * i) - (x * j) \n", 608 | " return val\n", 609 | "\n", 610 | "def vectorize_test(n):\n", 611 | " t_d = data.head(n)\n", 612 | " new = vectorize_operation(t_d[\"d\"])\n", 613 | " return new\n", 614 | "\n", 615 | "test = \"Vectorize\"\n", 616 | "print(test)\n", 617 | "for i, val in enumerate(test_values):\n", 618 | " if i >= 5:\n", 619 | " break\n", 620 | " print(f\"Testing {val}\")\n", 621 | " baseline_begin = counter()\n", 622 | " vectorize_test(int(val))\n", 623 | " baseline_end = counter()\n", 624 | " results[test][i] = baseline_end-baseline_begin" 625 | ] 626 | }, 627 | { 628 | "cell_type": "markdown", 629 | "id": "45ee8732-2985-42a4-9884-a1c24243fe80", 630 | "metadata": { 631 | "slideshow": { 632 | "slide_type": "skip" 633 | }, 634 | "tags": [] 635 | }, 636 | "source": [ 637 | "Jit Operation with Vectorize" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 13, 643 | "id": "51aa495a-1a70-4c8f-a1d5-858f83dae4cc", 644 | "metadata": { 645 | "slideshow": { 646 | "slide_type": "skip" 647 | }, 648 | "tags": [] 649 | }, 650 | "outputs": [ 651 | { 652 | "name": "stdout", 653 | "output_type": "stream", 654 | "text": [ 655 | "Jit Vectorize\n", 656 | "Testing 1\n", 657 | "Testing 10\n", 658 | "Testing 100\n", 659 | "Testing 1_000\n", 660 | "Testing 10_000\n" 661 | ] 662 | } 663 | ], 664 | "source": [ 665 | "@jit\n", 666 | "def jit_operation(x):\n", 667 | " val = 0\n", 668 | " for i in range(1_000):\n", 669 | " for j in range(1_000):\n", 670 | " val += (x * i) - (x * j) \n", 671 | " return val\n", 672 | "\n", 673 | "@vectorize([float64(float64)])\n", 674 | "def jit_vectorize_operation(x):\n", 675 | " return jit_operation(x)\n", 676 | "\n", 677 | "def jit_vectorize_test(n):\n", 678 | " t_d = data.head(n)\n", 679 | " return jit_vectorize_operation(t_d[\"d\"])\n", 680 | "\n", 681 | 
"test = \"Jit Vectorize\"\n", 682 | "print(test)\n", 683 | "for i, val in enumerate(test_values):\n", 684 | " if i >= 5:\n", 685 | " break\n", 686 | " print(f\"Testing {val}\")\n", 687 | " baseline_begin = counter()\n", 688 | " jit_vectorize_test(int(val))\n", 689 | " baseline_end = counter()\n", 690 | " results[test][i] = baseline_end-baseline_begin" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 14, 696 | "id": "33bd0e20-7824-4339-aba5-20ce7c746526", 697 | "metadata": { 698 | "slideshow": { 699 | "slide_type": "skip" 700 | }, 701 | "tags": [] 702 | }, 703 | "outputs": [ 704 | { 705 | "data": { 706 | "text/html": [ 707 | "\n", 709 | "\n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | "
Speed of function in s (given number of operations)
 1101001_00010_000100_0001_000_000
Pure Python0.4903794.222536nannannannannan
Jit Operation0.2081730.0154390.1285561.074879nannannan
Jit Apply0.0856090.0120460.1109981.15607510.003750nannan
Jit Loop0.3724150.0001840.0000870.0000830.0001380.0006560.005486
Vectorize0.0029100.0128100.1073740.9930539.964653nannan
Jit Vectorize0.0013970.0116490.1007650.98434410.869915nannan
\n" 786 | ], 787 | "text/plain": [ 788 | "" 789 | ] 790 | }, 791 | "execution_count": 14, 792 | "metadata": {}, 793 | "output_type": "execute_result" 794 | } 795 | ], 796 | "source": [ 797 | "results_df = pd.DataFrame.from_dict(results, orient=\"index\")\n", 798 | "results_df.columns = test_values\n", 799 | "results_df.style.set_caption(\"Speed of function in s (given number of operations)\")" 800 | ] 801 | }, 802 | { 803 | "cell_type": "markdown", 804 | "id": "d42e1af0-1a87-467e-95a8-1ec6f8f3a8c8", 805 | "metadata": { 806 | "slideshow": { 807 | "slide_type": "slide" 808 | }, 809 | "tags": [] 810 | }, 811 | "source": [ 812 | "## 2. Cython" 813 | ] 814 | }, 815 | { 816 | "cell_type": "markdown", 817 | "id": "df171326-8b9e-4a52-9893-76d8dca68e56", 818 | "metadata": { 819 | "slideshow": { 820 | "slide_type": "subslide" 821 | }, 822 | "tags": [] 823 | }, 824 | "source": [ 825 | "### Introduction" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "id": "65984cc1-da91-40c6-bcee-3d98c6c37878", 831 | "metadata": { 832 | "slideshow": { 833 | "slide_type": "fragment" 834 | }, 835 | "tags": [] 836 | }, 837 | "source": [ 838 | "- The Cython language is a superset of the Python language that additionally supports calling C functions and declaring C types on variables and class attributes. This allows the compiler to generate very efficient C code from Cython code. \n", 839 | "- Write Python code that calls back and forth from and to C or C++ code natively at any point.\n", 840 | "- Easily tune readable Python code into plain C performance by adding static type declarations, also in Python syntax." 841 | ] 842 | }, 843 | { 844 | "cell_type": "markdown", 845 | "id": "8a17851e-c814-4fe9-9db6-1dcc3050368b", 846 | "metadata": { 847 | "slideshow": { 848 | "slide_type": "subslide" 849 | }, 850 | "tags": [] 851 | }, 852 | "source": [ 853 | "### Demo " 854 | ] 855 | }, 856 | { 857 | "cell_type": "markdown", 858 | "id": "c6d25112-8aca-4b97-802a-fc6c78769cbb", 859 | "metadata": { 860 | "slideshow": { 861 | "slide_type": "subslide" 862 | }, 863 | "tags": [] 864 | }, 865 | "source": [ 866 | "### Pros v Cons" 867 | ] 868 | }, 869 | { 870 | "cell_type": "markdown", 871 | "id": "79e5d6cf-8133-43e1-9037-a1ccc566b3f0", 872 | "metadata": { 873 | "slideshow": { 874 | "slide_type": "fragment" 875 | }, 876 | "tags": [] 877 | }, 878 | "source": [ 879 | "Pros\n", 880 | "- Very fast.\n", 881 | "- Extensively supported.\n", 882 | "- Utilise C libaries.\n", 883 | "\n", 884 | "Cons\n", 885 | "- Need to learn how to write.\n", 886 | "- Difficult to optimise.\n", 887 | "- Difficulty also rises quickly with complexity." 888 | ] 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "id": "10a03e96-389b-47e4-89ca-c83a652b39f3", 893 | "metadata": { 894 | "slideshow": { 895 | "slide_type": "slide" 896 | }, 897 | "tags": [] 898 | }, 899 | "source": [ 900 | "## 3. Packaging" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "id": "334e51be-a27a-4e65-af83-46e10abf29d8", 906 | "metadata": { 907 | "slideshow": { 908 | "slide_type": "subslide" 909 | }, 910 | "tags": [] 911 | }, 912 | "source": [ 913 | "### Boilerplate" 914 | ] 915 | }, 916 | { 917 | "cell_type": "markdown", 918 | "id": "a025d97d-2c50-4fa0-b8f8-277bf9b82037", 919 | "metadata": { 920 | "slideshow": { 921 | "slide_type": "slide" 922 | }, 923 | "tags": [] 924 | }, 925 | "source": [ 926 | "## 4. 
TensorFlow " 927 | ] 928 | }, 929 | { 930 | "cell_type": "markdown", 931 | "id": "f15ab448-d130-42a4-abf9-f92c17c3eff4", 932 | "metadata": { 933 | "slideshow": { 934 | "slide_type": "subslide" 935 | }, 936 | "tags": [] 937 | }, 938 | "source": [ 939 | "### Demo" 940 | ] 941 | }, 942 | { 943 | "cell_type": "markdown", 944 | "id": "60cec30e-fc95-40a2-be17-fb5caf498b4f", 945 | "metadata": { 946 | "slideshow": { 947 | "slide_type": "slide" 948 | }, 949 | "tags": [] 950 | }, 951 | "source": [ 952 | "# 5. Final questions" 953 | ] 954 | } 955 | ], 956 | "metadata": { 957 | "kernelspec": { 958 | "display_name": "Python 3 (ipykernel)", 959 | "language": "python", 960 | "name": "python3" 961 | }, 962 | "language_info": { 963 | "codemirror_mode": { 964 | "name": "ipython", 965 | "version": 3 966 | }, 967 | "file_extension": ".py", 968 | "mimetype": "text/x-python", 969 | "name": "python", 970 | "nbconvert_exporter": "python", 971 | "pygments_lexer": "ipython3", 972 | "version": "3.8.12" 973 | } 974 | }, 975 | "nbformat": 4, 976 | "nbformat_minor": 5 977 | } 978 | --------------------------------------------------------------------------------