├── .devcontainer └── devcontainer.json ├── .github └── workflows │ ├── feature_pipeline.yml │ └── inference_pipeline.yml ├── .gitignore ├── .python-version ├── README.md ├── notebooks ├── 01_load_and_validate_raw_data.ipynb ├── 02_transform_raw_data_into_ts_data.ipynb ├── 03_transform_ts_data_into_features_and_targets.ipynb ├── 04_transform_raw_data_into_features_and_targets.ipynb ├── 05_visualize_training_data.ipynb ├── 06_baseline_model.ipynb ├── 07_xgboost_model.ipynb ├── 08_lightgbm_model.ipynb ├── 09_lightgbm_model_with_feature_engineering.ipynb ├── 10_lightgbm_model_with_hyperparameter_tuning.ipynb ├── 11_backfill_feature_store.ipynb ├── 12_feature_pipeline.ipynb ├── 13_model_training_pipeline.ipynb └── 14_inference_pipeline.ipynb ├── poetry.lock ├── pyproject.toml └── src ├── __init__.py ├── __pycache__ ├── __init__.cpython-39.pyc ├── data.cpython-39.pyc ├── data_split.cpython-39.pyc ├── model.cpython-39.pyc ├── paths.cpython-39.pyc └── plot.cpython-39.pyc ├── config.py ├── data.py ├── data_split.py ├── feature_store_api.py ├── frontend.py ├── frontend_monitoring.py ├── inference.py ├── logger.py ├── model.py ├── model_registry_api.py ├── monitoring.py ├── paths.py └── plot.py /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.md", 9 | "src/frontend_monitoring.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y > $GITHUB_PATH 47 | 48 | - name: Install Dependencies 49 | run: poetry install 50 | 51 | 52 | - name: exceute python workflows from bash script 53 | env: 54 | HOPSWORKS_PROJECT_NAME: ${{ secrets.HOPSWORKS_PROJECT_NAME }} #se agrega esto? 55 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 56 | run: poetry run jupyter nbconvert --to notebook --execute notebooks/12_feature_pipeline.ipynb 57 | 58 | # - name: Run feature generation script 59 | # env: 60 | # HOPSWORKS_API_KEY: $#{{ secrets.HOPSWORKS_API_KEY }} 61 | # run: make features 62 | 63 | 64 | -------------------------------------------------------------------------------- /.github/workflows/inference_pipeline.yml: -------------------------------------------------------------------------------- 1 | name: hourly-bike-demand-inference-pipeline #inference-pipeline 2 | 3 | # on: 4 | # workflow_run: 5 | # workflows: ["hourly-bike-demand-feature-pipeline"] 6 | # types: 7 | # - completed 8 | 9 | # workflow_dispatch: 10 | 11 | env: 12 | PYTHON_VERSION: 3.9 13 | POETRY_VERSION: 1.8.2 14 | POETRY_URL: https://install.python-poetry.org 15 | 16 | jobs: 17 | 18 | inference_pipeline: 19 | runs-on: ubuntu-latest 20 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v3 24 | 25 | # Poetry cache depends on OS, Python version and Poetry version. 
26 | - name: Cache Poetry cache 27 | uses: actions/cache@v3 28 | 29 | with: 30 | path: ~/.cache/pypoetry 31 | key: poetry-cache-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.POETRY_VERSION }} 32 | 33 | # virtualenv cache should depends on OS, Python version and `poetry.lock` (and optionally workflow files). 34 | - name: Cache Packages 35 | uses: actions/cache@v3 36 | with: 37 | path: ~/.local 38 | key: poetry-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/*.yml') }} 39 | 40 | - name: Set up Python ${{ env.PYTHON_VERSION }} 41 | uses: actions/setup-python@v3 42 | with: 43 | python-version: ${{ env.PYTHON_VERSION }} 44 | 45 | - name: Install Poetry 46 | run: | 47 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 48 | echo "$HOME/.local/bin" >> $GITHUB_PATH 49 | 50 | - name: Install Dependencies 51 | run: poetry install 52 | 53 | 54 | - name: exceute python workflows from bash script 55 | env: 56 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 57 | COMET_ML_API_KEY: ${{ secrets.COMET_ML_API_KEY }} 58 | COMET_ML_WORKSPACE: ${{ secrets.COMET_ML_WORKSPACE }} 59 | COMET_ML_PROJECT_NAME: ${{ secrets.COMET_ML_PROJECT_NAME }} 60 | run: poetry run jupyter nbconvert --to notebook --execute notebooks/14_inference_pipeline.ipynb 61 | 62 | # - name: Generating new batch of predictions 63 | # env: 64 | # HOPSWORKS_API_KEY: $#{{ secrets.HOPSWORKS_API_KEY }} 65 | # run: make inference 66 | 67 | 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.9.13 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

Bike sharing demand predictor service🚲🚛📊

3 | Final project🚀 4 |
5 | 6 |
7 | 8 | 9 | 10 | 11 | 12 | ## Demand predictor 13 | 1. This project is a demand predictor for a bike-sharing system. It was built as the final project of the Nodd3r Data Science Master's programme. Here is their [website](https://nodd3r.com/), and special thanks to Christian Donaire, who helped me a great deal throughout this learning process. 14 | 15 | It is also worth noting that this project was inspired by the [course](https://bit.ly/MLcourse_plb) by [Pau Labarta Bajo](https://github.com/Paulescu), in which a demand predictor for the New York taxi service is built. 16 | 17 | 2. What is a bike-sharing system? It is a system of shared bicycles: any citizen can use them and then leave them at dedicated stations. These systems exist in many cities. This project builds a demand-prediction prototype to plan where more bicycles should be available at certain hours. 18 | 19 | The problem to solve is therefore bicycle rebalancing🚲➡️🚛 in bike-sharing systems. What is rebalancing? Essentially, moving bicycles from one station to another so that when you arrive you find a bicycle available for your trip. 20 | 21 | To that end, the goal is to predict bicycle demand for the next 36 hours. Why 36 hours? So that the company doing the rebalancing has enough lead time to anticipate demand peaks. 22 | 23 | 3. How? Based on the demand data from previous hours, the model predicts the demand for the next 36 hours. 24 | 25 | For this I used the [dataset from the Government of the City of Buenos Aires](https://data.buenosaires.gob.ar/dataset/bicicletas-publicas), which is updated monthly. 26 | 27 | Note that I used "poetry" to create a virtual environment and manage libraries more comfortably. I also used a feature store called "hopsworks", where I store the historical data, the trained model and the predictions. 28 | 29 | I also used GitHub Actions to automate the script that downloads the features from the Buenos Aires government website and uploads them to Hopsworks. The same was done for the predictions: a script that runs every hour, predicts, and uploads that prediction to Hopsworks. This was done so the dashboard only has to consume the stored data and loads faster. 30 | 31 | 4. Which model🤖 was used? Models based on XGBoost are very useful for predicting time series (and far less complex than a neural network), but for them to work properly the data must be arranged in a specific way that makes learning easier. 32 | 33 | 34 | ## Code summary 35 | 1. In notebooks 1, 2, 3, 4 and 5 the work was basically to: 36 | - Download the data and unzip it. 37 | - Clean it and convert it to parquet format, which suits our purpose and has several advantages. 38 | - Drop the minutes and seconds, rounding each timestamp down to the previous hour. 39 | - Add the hours with no trips, filling them with the value "zero", and plot the series. 40 | - Create a function that computes the row indices used later to reshape the dataset into the form that is easiest for the model to learn from. 41 | - Create the dataset that the model will use for training.
(The way we transform the dataset is that it goes from 3 columns (hour, trip and station) to one column per hour, together with the station information and the reference hour. That is, from the original dataset we take a block of rows (previous and next hours) and transpose it, then move down one row and repeat the process. Here we use 672 previous hours, i.e. 28 days, and the next 36 hours; a code sketch of this windowing idea is shown further below.) 42 | - Finally, a function was written to plot the previous and the next records. 43 | 44 | 2. In notebooks 6, 7, 8, 9 and 10: 45 | - The data was split into train and test sets. 46 | - Baseline models (without Machine Learning) were created, against which the more complex models are later compared. 47 | - XGBoost and LightGBM were then tried, with LightGBM giving the better results. 48 | - The next step was to continue with LightGBM and apply feature engineering to improve the model, adding: the average of the last 4 weeks, latitude and longitude, hour and day of the week. 49 | - Optuna was used to run hyperparameter tuning on the model (a compact sketch of such a training and tuning loop appears further below). 50 | 51 | 3. In notebooks 11, 12, 13 and 14: 52 | - The project was created in Hopsworks (feature store), which lets us keep storing the records as they are downloaded. A feature group is created to hold the data, and a feature view is the most convenient way to consume it afterwards; these objects are created as needed to store the data (the calls are sketched near the end of this README). 53 | - Notebook 12 basically downloads the data from the Buenos Aires government website, cleans it and uploads it to the feature store. This is automated with a GitHub Action that runs every hour. 54 | - Notebook 13 trains the model, saves it and uploads it to CometML (from where it is later used to make the predictions). 55 | - Notebook 14 reads the data from the feature store, loads the model, creates the predictions and saves them back to the feature store. To automate it, another GitHub Action was created that runs immediately after the previous one finishes. 56 | 57 | 4. There are also other files in the src folder. They contain functions used across the notebooks so the same code does not have to be repeated; importing them is enough. That folder also holds the two dashboards described next: 58 | - The first dashboard is the frontend one, which queries the feature store and loads the past data and the corresponding predictions. It also plots a map showing the station with the highest demand over the next 36 hours (the marker description includes the expected demand and the hour). Below that are the charts for the top 10 stations with the highest demand. 59 | - The second dashboard is the frontend monitoring one, where the global error and the error for the highest-demand stations can be observed. 60 | 61 | ## Dashboards 62 | - [Dashboard with the model's predictions📈](https://bike-sharing-demand-predictor-ecobici.streamlit.app/) 63 | 64 |
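To make the windowing transformation described in point 1 of the code summary more concrete, here is a minimal sketch of how one station's hourly series could be cut into 672-hour feature windows and 36-hour target windows with a 24-hour step. The helper name `make_windows` and the exact column handling are illustrative assumptions rather than the project's actual `src/data.py` code; it only assumes an input frame with `pickup_hour` and `rides` columns, already filled so that every hour appears exactly once.

```python
from typing import Tuple

import pandas as pd


def make_windows(ts_one_station: pd.DataFrame,
                 input_seq_len: int = 24 * 28,  # 672 previous hours (28 days)
                 output_seq_len: int = 36,      # 36 hours to predict
                 step_size: int = 24) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Slice one station's hourly series into (features, targets) rows."""
    values = ts_one_station['rides'].to_numpy()
    hours = ts_one_station['pickup_hour'].to_numpy()

    feature_rows, target_rows, reference_hours = [], [], []
    first = 0
    while first + input_seq_len + output_seq_len <= len(values):
        mid = first + input_seq_len
        feature_rows.append(values[first:mid])                # the 672 past hours
        target_rows.append(values[mid:mid + output_seq_len])  # the next 36 hours
        reference_hours.append(hours[mid])                    # reference hour of this row
        first += step_size                                    # slide down one day

    features = pd.DataFrame(
        feature_rows,
        columns=[f'rides_previous_{input_seq_len - i}_hour' for i in range(input_seq_len)],
    )
    features['pickup_hour'] = reference_hours
    targets = pd.DataFrame(
        target_rows,
        columns=[f'rides_next_{i + 1}_hour' for i in range(output_seq_len)],
    )
    return features, targets
```

Running this per station and adding a `pickup_location_id` column before concatenating would yield tables shaped like the `features.shape=(111216, 674)` and `targets.shape=(111216, 36)` outputs shown in notebook 04.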

65 | 66 |

67 |
68 | 69 | - [Dashboard with model error monitoring🔍](https://bike-sharing-mae-error-monitoring.streamlit.app/) 70 | 71 |
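The monitoring dashboard linked above tracks mean absolute error. As a rough illustration (this is not the code in `src/monitoring.py`, and the `predicted_demand` column name is an assumption), the hourly MAE can be computed by joining predictions and actuals on station and hour:

```python
import pandas as pd


def hourly_mae(predictions: pd.DataFrame, actuals: pd.DataFrame) -> pd.DataFrame:
    """Join predicted and actual rides on station/hour and average the absolute error per hour.

    Both frames are assumed to carry `pickup_location_id` and `pickup_hour`,
    plus `predicted_demand` and `rides` respectively.
    """
    merged = predictions.merge(actuals, on=['pickup_location_id', 'pickup_hour'])
    merged['absolute_error'] = (merged['predicted_demand'] - merged['rides']).abs()
    return (
        merged.groupby('pickup_hour', as_index=False)['absolute_error']
        .mean()
        .rename(columns={'absolute_error': 'mae'})
    )
```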

72 | 73 |

74 | 75 |
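The code summary above mentions training LightGBM on the engineered features and tuning it with Optuna. The sketch below shows what such a loop could look like for a single prediction horizon; it is a hedged outline under stated assumptions, not the notebooks' exact code. The 4-week average reuses the lag columns produced by the windowing step, and the search space is purely illustrative.

```python
import lightgbm as lgb
import optuna
import pandas as pd
from sklearn.metrics import mean_absolute_error


def add_average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """Average of the same hour over the previous 4 weeks (168-hour lags)."""
    X = X.copy()
    X['average_rides_last_4_weeks'] = 0.25 * (
        X['rides_previous_168_hour'] + X['rides_previous_336_hour'] +
        X['rides_previous_504_hour'] + X['rides_previous_672_hour']
    )
    return X


def objective(trial, X_train, y_train, X_val, y_val) -> float:
    """One Optuna trial: fit LightGBM with sampled hyperparameters, return validation MAE."""
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    model = lgb.LGBMRegressor(**params)
    model.fit(add_average_rides_last_4_weeks(X_train), y_train)
    preds = model.predict(add_average_rides_last_4_weeks(X_val))
    return mean_absolute_error(y_val, preds)


# Example usage (with train/validation splits already prepared):
# study = optuna.create_study(direction='minimize')
# study.optimize(lambda t: objective(t, X_train, y_train, X_val, y_val), n_trials=25)
```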
76 | PS1: Note that the real data for the most recent hour is not available. To work around this, a simulated fetch is performed: data from another year is retrieved to stand in for the latest hour and is then added to the database. 77 | 78 | PS2: If an error appears when opening the dashboards, reload the page to fix it. 79 | 80 |
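A rough sketch of the workaround described in PS1: rides from a past period are fetched and their timestamps shifted forward so they mimic the most recent hours. The helper name and the 52-week shift are illustrative assumptions, not the project's actual code.

```python
from datetime import datetime, timedelta

import pandas as pd


def fetch_simulated_recent_rides(rides_past: pd.DataFrame,
                                 now: datetime,
                                 weeks_back: int = 52) -> pd.DataFrame:
    """Shift old rides forward by `weeks_back` weeks and keep only the last 28 days."""
    shifted = rides_past.copy()
    shifted['pickup_datetime'] = shifted['pickup_datetime'] + timedelta(weeks=weeks_back)
    mask = shifted['pickup_datetime'].between(now - timedelta(days=28), now)
    return shifted.loc[mask].reset_index(drop=True)
```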
81 |
82 | Thanks for reading. Let's keep in touch🙌🏻 83 |
84 | Twitter • 85 | LinkedIn 86 |
87 |
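For reference, point 3 of the code summary describes storing the hourly series in Hopsworks. The calls below mirror the pattern used in 11_backfill_feature_store.ipynb and add a feature-view creation step; the feature view name/version, the parquet path and the use of `get_or_create_feature_view` are assumptions rather than code taken from this repository.

```python
import os

import hopsworks
import pandas as pd

HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

project = hopsworks.login(project='bike_sharing_demand', api_key_value=HOPSWORKS_API_KEY)
feature_store = project.get_feature_store()

feature_group = feature_store.get_or_create_feature_group(
    name='time_series_hourly_feature_group',
    version=1,
    description='Time-series data at hourly frequency',
    primary_key=['pickup_location_id', 'pickup_hour'],
    event_time='pickup_hour',
)

# ts_data: one row per (pickup_hour, pickup_location_id) with a `rides` count,
# as produced by src.data.transform_raw_data_into_ts_data in the notebooks.
ts_data = pd.read_parquet('../data/transformed/ts_data.parquet')  # hypothetical path
feature_group.insert(ts_data)

# A feature view makes reads convenient for the training and inference pipelines.
feature_view = feature_store.get_or_create_feature_view(
    name='time_series_hourly_feature_view',  # assumed name
    version=1,
    query=feature_group.select_all(),
)
```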
88 | 89 | 90 | -------------------------------------------------------------------------------- /notebooks/01_load_and_validate_raw_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pathlib import Path\n", 10 | "import requests\n", 11 | "import zipfile\n", 12 | "import pandas as pd\n", 13 | "import pyarrow.parquet as pq\n", 14 | "\n", 15 | "def download_one_file_of_raw_data(year: int) -> Path:\n", 16 | " \"\"\"\"\"\"\n", 17 | " URL = f'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/transporte-y-obras-publicas/bicicletas-publicas/recorridos-realizados-{year}.zip'\n", 18 | " response= requests.get(URL) #, stream=True)\n", 19 | "\n", 20 | " \n", 21 | " if response.status_code == 200:\n", 22 | " # with open(nombre_archivo_zip, 'wb') as file:\n", 23 | " # for chunk in response.iter_content(chunk_size=8192): # Tamaño del búfer ajustado a 8192 bytes\n", 24 | " # file.write(chunk)\n", 25 | " path = f'../data/raw/recorridos-realizados-{year}.zip'\n", 26 | " open(path, \"wb\").write(response.content) \n", 27 | " print(f'descargado año {year}')\n", 28 | "\n", 29 | " return path\n", 30 | " else:\n", 31 | " raise Exception(f'{URL} is not available')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import pyarrow as pa\n", 41 | "\n", 42 | "def unzip_and_convert_csv_to_parquet(year: int) -> Path:\n", 43 | " nombre_archivo_zip = f\"../data/raw/recorridos-realizados-{year}.zip\"\n", 44 | " # Descomprimir el archivo zip\n", 45 | " with zipfile.ZipFile(nombre_archivo_zip, 'r') as archivo_zip:\n", 46 | "\n", 47 | " # Extraer el archivo CSV del zip\n", 48 | " nombre_archivo_csv = archivo_zip.namelist()[0] # Suponiendo que el archivo CSV es el primer archivo en el zip\n", 49 | " archivo_zip.extractall(f\"../data/raw/\")\n", 50 | "\n", 51 | " # Leer el archivo CSV con pandas\n", 52 | " df = pd.read_csv(f\"../data/raw/{nombre_archivo_csv}\", delimiter=',', decimal=\".\")\n", 53 | "\n", 54 | " # Convertir el DataFrame a formato parquet\n", 55 | " nombre_archivo_parquet = f\"rides_{year}.parquet\"\n", 56 | " table = pa.Table.from_pandas(df)\n", 57 | " pq.write_table(table, f\"../data/raw/{nombre_archivo_parquet}\")\n", 58 | "\n", 59 | "\n", 60 | "\n", 61 | " # table = pq.Table.from_pandas(df)\n", 62 | " # pq.write_table(table, nombre_archivo_parquet)\n", 63 | " #pq.write_table(pq.Table.from_pandas(df), nombre_archivo_parquet)\n", 64 | " path = f'../data/raw/rides_{year}.parquet'\n", 65 | " return path\n", 66 | " " 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 1, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "download_one_file_of_raw_data(year=2022)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 9, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "'../data/raw/rides_2022.parquet'" 87 | ] 88 | }, 89 | "execution_count": 9, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "unzip_and_convert_csv_to_parquet(year=2022)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 10, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/html": [ 106 | "
\n", 107 | "\n", 120 | "\n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | "
Unnamed: 0XId_recorridoduracion_recorridofecha_origen_recorridoid_estacion_origennombre_estacion_origendireccion_estacion_origenlong_estacion_origenlat_estacion_origenfecha_destino_recorridoid_estacion_destinonombre_estacion_destinodireccion_estacion_destinolong_estacion_destinolat_estacion_destinoid_usuariomodelo_bicicletaGénero
01113267975BAEcobici2,6102022-01-16 14:58:425BAEcobici005 - Plaza ItaliaAv. Sarmiento 2601-58.420954-34.5805502022-01-16 15:42:12210BAEcobici335 - General UrquizaFigueroa Alcorta & Sarmiento-58.411278-34.572165776361BAEcobiciICONICFEMALE
12213268526BAEcobici5452022-01-16 17:26:2751BAEcobici051 - TUCUMANTucuman & 9 De Julio Av.-58.382126-34.6014782022-01-16 17:35:32174BAEcobici174 - MINISTERIO DE EDUCACIONMarcelo T. de Alvear & Rodriguez Peña-58.391768-34.597225776407BAEcobiciICONICMALE
23313268400BAEcobici2,0612022-01-16 16:51:12161BAEcobici161 - Humahuaca3912 Humahuaca-58.419676-34.6020782022-01-16 17:25:33117BAEcobici117 - HUMBERTO 1°Peru 1016-58.374176-34.620101671762BAEcobiciICONICFEMALE
34413268164BAEcobici12,7482022-01-16 15:58:01210BAEcobici335 - General UrquizaFigueroa Alcorta & Sarmiento-58.411278-34.5721652022-01-16 19:30:29382BAEcobici204 - BiarritzBiarritz 2403-58.477390-34.605470776361BAEcobiciICONICFEMALE
45513270010BAEcobici4,3372022-01-16 23:40:09215BAEcobici113 - GuatemalaGuatemala 4773-58.424996-34.5858782022-01-17 00:52:26205BAEcobici125 - F.J.Santamaria de OroF.J.Santamaria de Oro & Guatemala-58.428016-34.583323454615BAEcobiciICONICOTHER
56613269548BAEcobici4,2432022-01-16 20:42:30268BAEcobici399 - GARCIA DEL RIOAv. García del Río 3182-58.477000-34.5503002022-01-16 21:53:13268BAEcobici399 - GARCIA DEL RIOAv. García del Río 3182-58.477000-34.550300200959BAEcobiciICONICOTHER
67713268959BAEcobici9322022-01-16 18:47:17278BAEcobici233 - MONROE2519 Superi-58.469813-34.5641222022-01-16 19:02:49236BAEcobici254 - Plaza Rafael HernandezVuelta de Obligado 2004-58.455166-34.562161823366BAEcobiciICONICFEMALE
78813267669BAEcobici4,3372022-01-16 12:55:26368BAEcobici378 - AGRONOMIA4351 San Martin-58.482079-34.5980702022-01-16 14:07:43368BAEcobici378 - AGRONOMIA4351 San Martin-58.482079-34.598070826754BAEcobiciICONICMALE
89913267592BAEcobici3,7352022-01-16 12:25:03382BAEcobici204 - BiarritzBiarritz 2403-58.477390-34.6054702022-01-16 13:27:185BAEcobici005 - Plaza ItaliaAv. Sarmiento 2601-58.420954-34.580550776361BAEcobiciICONICFEMALE
9101013376812BAEcobici3662022-01-29 15:13:01433BAEcobici273 - Plazoleta Colombia1619 Brandsen-58.373726-34.6376972022-01-29 15:19:076BAEcobici006 - Parque LezamaAvenida Martin Garcia, 295-58.369758-34.628526772370BAEcobiciICONICMALE
\n", 368 | "
" 369 | ], 370 | "text/plain": [ 371 | " Unnamed: 0 X Id_recorrido duracion_recorrido \\\n", 372 | "0 1 1 13267975BAEcobici 2,610 \n", 373 | "1 2 2 13268526BAEcobici 545 \n", 374 | "2 3 3 13268400BAEcobici 2,061 \n", 375 | "3 4 4 13268164BAEcobici 12,748 \n", 376 | "4 5 5 13270010BAEcobici 4,337 \n", 377 | "5 6 6 13269548BAEcobici 4,243 \n", 378 | "6 7 7 13268959BAEcobici 932 \n", 379 | "7 8 8 13267669BAEcobici 4,337 \n", 380 | "8 9 9 13267592BAEcobici 3,735 \n", 381 | "9 10 10 13376812BAEcobici 366 \n", 382 | "\n", 383 | " fecha_origen_recorrido id_estacion_origen nombre_estacion_origen \\\n", 384 | "0 2022-01-16 14:58:42 5BAEcobici 005 - Plaza Italia \n", 385 | "1 2022-01-16 17:26:27 51BAEcobici 051 - TUCUMAN \n", 386 | "2 2022-01-16 16:51:12 161BAEcobici 161 - Humahuaca \n", 387 | "3 2022-01-16 15:58:01 210BAEcobici 335 - General Urquiza \n", 388 | "4 2022-01-16 23:40:09 215BAEcobici 113 - Guatemala \n", 389 | "5 2022-01-16 20:42:30 268BAEcobici 399 - GARCIA DEL RIO \n", 390 | "6 2022-01-16 18:47:17 278BAEcobici 233 - MONROE \n", 391 | "7 2022-01-16 12:55:26 368BAEcobici 378 - AGRONOMIA \n", 392 | "8 2022-01-16 12:25:03 382BAEcobici 204 - Biarritz \n", 393 | "9 2022-01-29 15:13:01 433BAEcobici 273 - Plazoleta Colombia \n", 394 | "\n", 395 | " direccion_estacion_origen long_estacion_origen lat_estacion_origen \\\n", 396 | "0 Av. Sarmiento 2601 -58.420954 -34.580550 \n", 397 | "1 Tucuman & 9 De Julio Av. -58.382126 -34.601478 \n", 398 | "2 3912 Humahuaca -58.419676 -34.602078 \n", 399 | "3 Figueroa Alcorta & Sarmiento -58.411278 -34.572165 \n", 400 | "4 Guatemala 4773 -58.424996 -34.585878 \n", 401 | "5 Av. García del Río 3182 -58.477000 -34.550300 \n", 402 | "6 2519 Superi -58.469813 -34.564122 \n", 403 | "7 4351 San Martin -58.482079 -34.598070 \n", 404 | "8 Biarritz 2403 -58.477390 -34.605470 \n", 405 | "9 1619 Brandsen -58.373726 -34.637697 \n", 406 | "\n", 407 | " fecha_destino_recorrido id_estacion_destino nombre_estacion_destino \\\n", 408 | "0 2022-01-16 15:42:12 210BAEcobici 335 - General Urquiza \n", 409 | "1 2022-01-16 17:35:32 174BAEcobici 174 - MINISTERIO DE EDUCACION \n", 410 | "2 2022-01-16 17:25:33 117BAEcobici 117 - HUMBERTO 1° \n", 411 | "3 2022-01-16 19:30:29 382BAEcobici 204 - Biarritz \n", 412 | "4 2022-01-17 00:52:26 205BAEcobici 125 - F.J.Santamaria de Oro \n", 413 | "5 2022-01-16 21:53:13 268BAEcobici 399 - GARCIA DEL RIO \n", 414 | "6 2022-01-16 19:02:49 236BAEcobici 254 - Plaza Rafael Hernandez \n", 415 | "7 2022-01-16 14:07:43 368BAEcobici 378 - AGRONOMIA \n", 416 | "8 2022-01-16 13:27:18 5BAEcobici 005 - Plaza Italia \n", 417 | "9 2022-01-29 15:19:07 6BAEcobici 006 - Parque Lezama \n", 418 | "\n", 419 | " direccion_estacion_destino long_estacion_destino \\\n", 420 | "0 Figueroa Alcorta & Sarmiento -58.411278 \n", 421 | "1 Marcelo T. de Alvear & Rodriguez Peña -58.391768 \n", 422 | "2 Peru 1016 -58.374176 \n", 423 | "3 Biarritz 2403 -58.477390 \n", 424 | "4 F.J.Santamaria de Oro & Guatemala -58.428016 \n", 425 | "5 Av. García del Río 3182 -58.477000 \n", 426 | "6 Vuelta de Obligado 2004 -58.455166 \n", 427 | "7 4351 San Martin -58.482079 \n", 428 | "8 Av. 
Sarmiento 2601 -58.420954 \n", 429 | "9 Avenida Martin Garcia, 295 -58.369758 \n", 430 | "\n", 431 | " lat_estacion_destino id_usuario modelo_bicicleta Género \n", 432 | "0 -34.572165 776361BAEcobici ICONIC FEMALE \n", 433 | "1 -34.597225 776407BAEcobici ICONIC MALE \n", 434 | "2 -34.620101 671762BAEcobici ICONIC FEMALE \n", 435 | "3 -34.605470 776361BAEcobici ICONIC FEMALE \n", 436 | "4 -34.583323 454615BAEcobici ICONIC OTHER \n", 437 | "5 -34.550300 200959BAEcobici ICONIC OTHER \n", 438 | "6 -34.562161 823366BAEcobici ICONIC FEMALE \n", 439 | "7 -34.598070 826754BAEcobici ICONIC MALE \n", 440 | "8 -34.580550 776361BAEcobici ICONIC FEMALE \n", 441 | "9 -34.628526 772370BAEcobici ICONIC MALE " 442 | ] 443 | }, 444 | "execution_count": 10, 445 | "metadata": {}, 446 | "output_type": "execute_result" 447 | } 448 | ], 449 | "source": [ 450 | "import pandas as pd\n", 451 | "\n", 452 | "rides = pd.read_parquet('../data/raw/rides_2022.parquet')\n", 453 | "\n", 454 | "rides.head(10)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 11, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "data": { 464 | "text/plain": [ 465 | "(2922805, 19)" 466 | ] 467 | }, 468 | "execution_count": 11, 469 | "metadata": {}, 470 | "output_type": "execute_result" 471 | } 472 | ], 473 | "source": [ 474 | "rides.shape" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 12, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "331" 486 | ] 487 | }, 488 | "execution_count": 12, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "rides.id_estacion_origen.nunique()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 13, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/html": [ 505 | "
\n", 506 | "\n", 519 | "\n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | "
pickup_datetimepickup_location_id
02022-01-16 14:58:425
12022-01-16 17:26:2751
22022-01-16 16:51:12161
32022-01-16 15:58:01210
42022-01-16 23:40:09215
52022-01-16 20:42:30268
62022-01-16 18:47:17278
72022-01-16 12:55:26368
82022-01-16 12:25:03382
92022-01-29 15:13:01433
\n", 580 | "
" 581 | ], 582 | "text/plain": [ 583 | " pickup_datetime pickup_location_id\n", 584 | "0 2022-01-16 14:58:42 5\n", 585 | "1 2022-01-16 17:26:27 51\n", 586 | "2 2022-01-16 16:51:12 161\n", 587 | "3 2022-01-16 15:58:01 210\n", 588 | "4 2022-01-16 23:40:09 215\n", 589 | "5 2022-01-16 20:42:30 268\n", 590 | "6 2022-01-16 18:47:17 278\n", 591 | "7 2022-01-16 12:55:26 368\n", 592 | "8 2022-01-16 12:25:03 382\n", 593 | "9 2022-01-29 15:13:01 433" 594 | ] 595 | }, 596 | "execution_count": 13, 597 | "metadata": {}, 598 | "output_type": "execute_result" 599 | } 600 | ], 601 | "source": [ 602 | "#Nos quedamos sólo con las columnas que nos interesan y las renombramos\n", 603 | "rides = rides[['fecha_origen_recorrido', 'id_estacion_origen']]\n", 604 | "# Eliminar la parte \"BAEcobici\" y convertir a tipo int\n", 605 | "rides['id_estacion_origen'] = rides['id_estacion_origen'].str.replace('BAEcobici', '').astype(int)\n", 606 | "\n", 607 | "rides.rename(columns={\n", 608 | " 'fecha_origen_recorrido': 'pickup_datetime',\n", 609 | " 'id_estacion_origen': 'pickup_location_id',\n", 610 | "}, inplace=True)\n", 611 | "\n", 612 | "rides['pickup_datetime'] = pd.to_datetime(rides['pickup_datetime'],format='%Y-%m-%d %H:%M:%S')\n", 613 | "rides.head(10)" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 14, 619 | "metadata": {}, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "text/plain": [ 624 | "331" 625 | ] 626 | }, 627 | "execution_count": 14, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "rides.pickup_location_id.nunique()" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 15, 639 | "metadata": {}, 640 | "outputs": [ 641 | { 642 | "name": "stderr", 643 | "output_type": "stream", 644 | "text": [ 645 | "C:\\Users\\jayan\\AppData\\Local\\Temp\\ipykernel_13464\\3389344848.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n", 646 | " rides['pickup_datetime'].describe()\n" 647 | ] 648 | }, 649 | { 650 | "data": { 651 | "text/plain": [ 652 | "count 2922805\n", 653 | "unique 2689886\n", 654 | "top 2022-08-12 14:45:13\n", 655 | "freq 7\n", 656 | "first 2022-01-01 00:11:07\n", 657 | "last 2022-12-31 23:55:39\n", 658 | "Name: pickup_datetime, dtype: object" 659 | ] 660 | }, 661 | "execution_count": 15, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "rides['pickup_datetime'].describe()" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 16, 673 | "metadata": {}, 674 | "outputs": [ 675 | { 676 | "name": "stderr", 677 | "output_type": "stream", 678 | "text": [ 679 | "C:\\Users\\jayan\\AppData\\Local\\Temp\\ipykernel_13464\\213125758.py:3: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. 
Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n", 680 | " rides['pickup_datetime'].describe()\n" 681 | ] 682 | }, 683 | { 684 | "data": { 685 | "text/plain": [ 686 | "count 2922805\n", 687 | "unique 2689886\n", 688 | "top 2022-08-12 14:45:13\n", 689 | "freq 7\n", 690 | "first 2022-01-01 00:11:07\n", 691 | "last 2022-12-31 23:55:39\n", 692 | "Name: pickup_datetime, dtype: object" 693 | ] 694 | }, 695 | "execution_count": 16, 696 | "metadata": {}, 697 | "output_type": "execute_result" 698 | } 699 | ], 700 | "source": [ 701 | "rides = rides[rides.pickup_datetime >= '2022-01-01']\n", 702 | "rides = rides[rides.pickup_datetime < '2023-01-01']\n", 703 | "rides['pickup_datetime'].describe()" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 17, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "rides.to_parquet('../data/transformed/validated_rides_2022.parquet')" 713 | ] 714 | } 715 | ], 716 | "metadata": { 717 | "kernelspec": { 718 | "display_name": ".venv", 719 | "language": "python", 720 | "name": "python3" 721 | }, 722 | "language_info": { 723 | "codemirror_mode": { 724 | "name": "ipython", 725 | "version": 3 726 | }, 727 | "file_extension": ".py", 728 | "mimetype": "text/x-python", 729 | "name": "python", 730 | "nbconvert_exporter": "python", 731 | "pygments_lexer": "ipython3", 732 | "version": "3.9.13" 733 | }, 734 | "orig_nbformat": 4, 735 | "vscode": { 736 | "interpreter": { 737 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 738 | } 739 | } 740 | }, 741 | "nbformat": 4, 742 | "nbformat_minor": 2 743 | } 744 | -------------------------------------------------------------------------------- /notebooks/04_transform_raw_data_into_features_and_targets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "File 2022 was already in local storage\n" 23 | ] 24 | }, 25 | { 26 | "data": { 27 | "text/html": [ 28 | "
\n", 29 | "\n", 42 | "\n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | "
pickup_datetimepickup_location_id
02022-01-16 14:58:425
12022-01-16 17:26:2751
22022-01-16 16:51:12161
32022-01-16 15:58:01210
42022-01-16 23:40:09215
.........
29228002022-12-20 22:34:33336
29228012022-12-20 20:03:24379
29228022022-12-20 12:13:32169
29228032022-12-20 17:26:49469
29228042022-12-20 19:30:59273
\n", 108 | "

2922805 rows × 2 columns

\n", 109 | "
" 110 | ], 111 | "text/plain": [ 112 | " pickup_datetime pickup_location_id\n", 113 | "0 2022-01-16 14:58:42 5\n", 114 | "1 2022-01-16 17:26:27 51\n", 115 | "2 2022-01-16 16:51:12 161\n", 116 | "3 2022-01-16 15:58:01 210\n", 117 | "4 2022-01-16 23:40:09 215\n", 118 | "... ... ...\n", 119 | "2922800 2022-12-20 22:34:33 336\n", 120 | "2922801 2022-12-20 20:03:24 379\n", 121 | "2922802 2022-12-20 12:13:32 169\n", 122 | "2922803 2022-12-20 17:26:49 469\n", 123 | "2922804 2022-12-20 19:30:59 273\n", 124 | "\n", 125 | "[2922805 rows x 2 columns]" 126 | ] 127 | }, 128 | "execution_count": 2, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "from src.data import load_raw_data\n", 135 | "\n", 136 | "rides = load_raw_data(year=2022)\n", 137 | "rides" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 3, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "\n", 150 | "Int64Index: 2922805 entries, 0 to 2922804\n", 151 | "Data columns (total 2 columns):\n", 152 | " # Column Dtype \n", 153 | "--- ------ ----- \n", 154 | " 0 pickup_datetime datetime64[ns]\n", 155 | " 1 pickup_location_id int32 \n", 156 | "dtypes: datetime64[ns](1), int32(1)\n", 157 | "memory usage: 55.7 MB\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "rides.info()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 4, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "array([0], dtype=int64)" 174 | ] 175 | }, 176 | "execution_count": 4, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "import pandas as pd\n", 183 | "nulos = pd.DataFrame(rides.isnull().sum(), columns=['Nulos'])\n", 184 | "nulos.Nulos.unique()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 5, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "100%|██████████| 331/331 [00:05<00:00, 65.03it/s]\n" 197 | ] 198 | }, 199 | { 200 | "data": { 201 | "text/html": [ 202 | "
\n", 203 | "\n", 216 | "\n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | "
pickup_hourridespickup_location_id
02022-01-01 00:00:0019
12022-01-01 01:00:0019
22022-01-01 02:00:0019
32022-01-01 03:00:0009
42022-01-01 04:00:0019
............
28995552022-12-31 19:00:00057
28995562022-12-31 20:00:00057
28995572022-12-31 21:00:00057
28995582022-12-31 22:00:00057
28995592022-12-31 23:00:00057
\n", 294 | "

2899560 rows × 3 columns

\n", 295 | "
" 296 | ], 297 | "text/plain": [ 298 | " pickup_hour rides pickup_location_id\n", 299 | "0 2022-01-01 00:00:00 1 9\n", 300 | "1 2022-01-01 01:00:00 1 9\n", 301 | "2 2022-01-01 02:00:00 1 9\n", 302 | "3 2022-01-01 03:00:00 0 9\n", 303 | "4 2022-01-01 04:00:00 1 9\n", 304 | "... ... ... ...\n", 305 | "2899555 2022-12-31 19:00:00 0 57\n", 306 | "2899556 2022-12-31 20:00:00 0 57\n", 307 | "2899557 2022-12-31 21:00:00 0 57\n", 308 | "2899558 2022-12-31 22:00:00 0 57\n", 309 | "2899559 2022-12-31 23:00:00 0 57\n", 310 | "\n", 311 | "[2899560 rows x 3 columns]" 312 | ] 313 | }, 314 | "execution_count": 5, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "from src.data import transform_raw_data_into_ts_data\n", 321 | "\n", 322 | "ts_data = transform_raw_data_into_ts_data(rides)\n", 323 | "ts_data\n" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 6, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "331" 335 | ] 336 | }, 337 | "execution_count": 6, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | } 341 | ], 342 | "source": [ 343 | "ts_data.pickup_location_id.unique().size" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 7, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "array([0], dtype=int64)" 355 | ] 356 | }, 357 | "execution_count": 7, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "import pandas as pd\n", 364 | "nulos = pd.DataFrame(ts_data.isnull().sum(), columns=['Nulos'])\n", 365 | "nulos.Nulos.unique()" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 8, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stderr", 375 | "output_type": "stream", 376 | "text": [ 377 | "100%|██████████| 331/331 [00:44<00:00, 7.38it/s]" 378 | ] 379 | }, 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "features.shape=(111216, 674)\n", 385 | "targets.shape=(111216, 36)\n" 386 | ] 387 | }, 388 | { 389 | "name": "stderr", 390 | "output_type": "stream", 391 | "text": [ 392 | "\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "from src.data import transform_ts_data_into_features_and_target\n", 398 | "\n", 399 | "features, targets = transform_ts_data_into_features_and_target(\n", 400 | " ts_data,\n", 401 | " input_seq_len=24*28*1, # one month\n", 402 | " step_size=24,\n", 403 | " output_seq_len=36\n", 404 | ")\n", 405 | "\n", 406 | "print(f'{features.shape=}')\n", 407 | "print(f'{targets.shape=}')" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 9, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "array([0], dtype=int64)" 419 | ] 420 | }, 421 | "execution_count": 9, 422 | "metadata": {}, 423 | "output_type": "execute_result" 424 | } 425 | ], 426 | "source": [ 427 | "import pandas as pd\n", 428 | "nulos = pd.DataFrame(features.isnull().sum(), columns=['Nulos'])\n", 429 | "nulos.Nulos.unique()" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 10, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/html": [ 440 | "
\n", 441 | "\n", 454 | "\n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | "
rides_next_1_hourrides_next_2_hourrides_next_3_hourrides_next_4_hourrides_next_5_hourrides_next_6_hourrides_next_7_hourrides_next_8_hourrides_next_9_hourrides_next_10_hour...rides_next_27_hourrides_next_28_hourrides_next_29_hourrides_next_30_hourrides_next_31_hourrides_next_32_hourrides_next_33_hourrides_next_34_hourrides_next_35_hourrides_next_36_hour
1112110.00.00.00.00.01.00.00.01.00.0...2.00.00.00.00.00.01.00.02.00.0
1112121.00.02.00.00.00.00.00.01.00.0...1.00.00.00.00.02.03.00.03.00.0
1112131.00.01.00.00.00.00.02.03.00.0...0.00.00.01.00.00.02.00.01.01.0
1112141.00.00.00.00.01.00.00.02.00.0...1.00.00.01.00.00.01.02.00.00.0
1112151.00.01.00.00.01.00.00.01.02.0...0.00.00.00.00.01.00.00.00.01.0
\n", 604 | "

5 rows × 36 columns

\n", 605 | "
" 606 | ], 607 | "text/plain": [ 608 | " rides_next_1_hour rides_next_2_hour rides_next_3_hour \\\n", 609 | "111211 0.0 0.0 0.0 \n", 610 | "111212 1.0 0.0 2.0 \n", 611 | "111213 1.0 0.0 1.0 \n", 612 | "111214 1.0 0.0 0.0 \n", 613 | "111215 1.0 0.0 1.0 \n", 614 | "\n", 615 | " rides_next_4_hour rides_next_5_hour rides_next_6_hour \\\n", 616 | "111211 0.0 0.0 1.0 \n", 617 | "111212 0.0 0.0 0.0 \n", 618 | "111213 0.0 0.0 0.0 \n", 619 | "111214 0.0 0.0 1.0 \n", 620 | "111215 0.0 0.0 1.0 \n", 621 | "\n", 622 | " rides_next_7_hour rides_next_8_hour rides_next_9_hour \\\n", 623 | "111211 0.0 0.0 1.0 \n", 624 | "111212 0.0 0.0 1.0 \n", 625 | "111213 0.0 2.0 3.0 \n", 626 | "111214 0.0 0.0 2.0 \n", 627 | "111215 0.0 0.0 1.0 \n", 628 | "\n", 629 | " rides_next_10_hour ... rides_next_27_hour rides_next_28_hour \\\n", 630 | "111211 0.0 ... 2.0 0.0 \n", 631 | "111212 0.0 ... 1.0 0.0 \n", 632 | "111213 0.0 ... 0.0 0.0 \n", 633 | "111214 0.0 ... 1.0 0.0 \n", 634 | "111215 2.0 ... 0.0 0.0 \n", 635 | "\n", 636 | " rides_next_29_hour rides_next_30_hour rides_next_31_hour \\\n", 637 | "111211 0.0 0.0 0.0 \n", 638 | "111212 0.0 0.0 0.0 \n", 639 | "111213 0.0 1.0 0.0 \n", 640 | "111214 0.0 1.0 0.0 \n", 641 | "111215 0.0 0.0 0.0 \n", 642 | "\n", 643 | " rides_next_32_hour rides_next_33_hour rides_next_34_hour \\\n", 644 | "111211 0.0 1.0 0.0 \n", 645 | "111212 2.0 3.0 0.0 \n", 646 | "111213 0.0 2.0 0.0 \n", 647 | "111214 0.0 1.0 2.0 \n", 648 | "111215 1.0 0.0 0.0 \n", 649 | "\n", 650 | " rides_next_35_hour rides_next_36_hour \n", 651 | "111211 2.0 0.0 \n", 652 | "111212 3.0 0.0 \n", 653 | "111213 1.0 1.0 \n", 654 | "111214 0.0 0.0 \n", 655 | "111215 0.0 1.0 \n", 656 | "\n", 657 | "[5 rows x 36 columns]" 658 | ] 659 | }, 660 | "execution_count": 10, 661 | "metadata": {}, 662 | "output_type": "execute_result" 663 | } 664 | ], 665 | "source": [ 666 | "targets.tail()" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 11, 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "import pandas as pd\n", 676 | "#tabular_data = features\n", 677 | "#tabular_data = targets\n", 678 | "tabular_data = pd.concat([features, targets], axis=1)\n", 679 | "\n", 680 | "from src.paths import TRANSFORMED_DATA_DIR\n", 681 | "tabular_data.to_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 12, 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "data": { 691 | "text/html": [ 692 | "
\n", 693 | "\n", 706 | "\n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | "
rides_previous_672_hourrides_previous_671_hourrides_previous_670_hourrides_previous_669_hourrides_previous_668_hourrides_previous_667_hourrides_previous_666_hourrides_previous_665_hourrides_previous_664_hourrides_previous_663_hour...rides_next_27_hourrides_next_28_hourrides_next_29_hourrides_next_30_hourrides_next_31_hourrides_next_32_hourrides_next_33_hourrides_next_34_hourrides_next_35_hourrides_next_36_hour
01.01.01.00.01.00.01.00.00.00.0...1.00.00.01.00.00.02.00.00.010.0
11.00.01.00.00.00.00.00.00.01.0...4.00.00.00.01.02.05.02.02.02.0
20.01.02.00.00.02.01.01.01.02.0...0.00.00.00.02.00.05.04.01.08.0
34.02.02.01.00.00.00.02.01.04.0...2.01.01.00.01.01.07.02.02.03.0
40.00.01.00.00.00.03.01.03.02.0...0.00.00.00.00.00.02.00.01.04.0
\n", 856 | "

5 rows × 710 columns

\n", 857 | "
" 858 | ], 859 | "text/plain": [ 860 | " rides_previous_672_hour rides_previous_671_hour rides_previous_670_hour \\\n", 861 | "0 1.0 1.0 1.0 \n", 862 | "1 1.0 0.0 1.0 \n", 863 | "2 0.0 1.0 2.0 \n", 864 | "3 4.0 2.0 2.0 \n", 865 | "4 0.0 0.0 1.0 \n", 866 | "\n", 867 | " rides_previous_669_hour rides_previous_668_hour rides_previous_667_hour \\\n", 868 | "0 0.0 1.0 0.0 \n", 869 | "1 0.0 0.0 0.0 \n", 870 | "2 0.0 0.0 2.0 \n", 871 | "3 1.0 0.0 0.0 \n", 872 | "4 0.0 0.0 0.0 \n", 873 | "\n", 874 | " rides_previous_666_hour rides_previous_665_hour rides_previous_664_hour \\\n", 875 | "0 1.0 0.0 0.0 \n", 876 | "1 0.0 0.0 0.0 \n", 877 | "2 1.0 1.0 1.0 \n", 878 | "3 0.0 2.0 1.0 \n", 879 | "4 3.0 1.0 3.0 \n", 880 | "\n", 881 | " rides_previous_663_hour ... rides_next_27_hour rides_next_28_hour \\\n", 882 | "0 0.0 ... 1.0 0.0 \n", 883 | "1 1.0 ... 4.0 0.0 \n", 884 | "2 2.0 ... 0.0 0.0 \n", 885 | "3 4.0 ... 2.0 1.0 \n", 886 | "4 2.0 ... 0.0 0.0 \n", 887 | "\n", 888 | " rides_next_29_hour rides_next_30_hour rides_next_31_hour \\\n", 889 | "0 0.0 1.0 0.0 \n", 890 | "1 0.0 0.0 1.0 \n", 891 | "2 0.0 0.0 2.0 \n", 892 | "3 1.0 0.0 1.0 \n", 893 | "4 0.0 0.0 0.0 \n", 894 | "\n", 895 | " rides_next_32_hour rides_next_33_hour rides_next_34_hour \\\n", 896 | "0 0.0 2.0 0.0 \n", 897 | "1 2.0 5.0 2.0 \n", 898 | "2 0.0 5.0 4.0 \n", 899 | "3 1.0 7.0 2.0 \n", 900 | "4 0.0 2.0 0.0 \n", 901 | "\n", 902 | " rides_next_35_hour rides_next_36_hour \n", 903 | "0 0.0 10.0 \n", 904 | "1 2.0 2.0 \n", 905 | "2 1.0 8.0 \n", 906 | "3 2.0 3.0 \n", 907 | "4 1.0 4.0 \n", 908 | "\n", 909 | "[5 rows x 710 columns]" 910 | ] 911 | }, 912 | "execution_count": 12, 913 | "metadata": {}, 914 | "output_type": "execute_result" 915 | } 916 | ], 917 | "source": [ 918 | "tabular_data.head()" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": 13, 924 | "metadata": {}, 925 | "outputs": [ 926 | { 927 | "data": { 928 | "text/plain": [ 929 | "array([0], dtype=int64)" 930 | ] 931 | }, 932 | "execution_count": 13, 933 | "metadata": {}, 934 | "output_type": "execute_result" 935 | } 936 | ], 937 | "source": [ 938 | "import pandas as pd\n", 939 | "nulos = pd.DataFrame(tabular_data.isnull().sum(), columns=['Nulos'])\n", 940 | "nulos.Nulos.unique()" 941 | ] 942 | } 943 | ], 944 | "metadata": { 945 | "kernelspec": { 946 | "display_name": ".venv", 947 | "language": "python", 948 | "name": "python3" 949 | }, 950 | "language_info": { 951 | "codemirror_mode": { 952 | "name": "ipython", 953 | "version": 3 954 | }, 955 | "file_extension": ".py", 956 | "mimetype": "text/x-python", 957 | "name": "python", 958 | "nbconvert_exporter": "python", 959 | "pygments_lexer": "ipython3", 960 | "version": "3.9.13" 961 | }, 962 | "orig_nbformat": 4, 963 | "vscode": { 964 | "interpreter": { 965 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 966 | } 967 | } 968 | }, 969 | "nbformat": 4, 970 | "nbformat_minor": 2 971 | } 972 | -------------------------------------------------------------------------------- /notebooks/11_backfill_feature_store.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "HOPSWORKS_PROJECT_NAME = 'bike_sharing_demand'" 20 | ] 21 | }, 22 | { 23 | 
"cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import os\n", 29 | "from dotenv import load_dotenv\n", 30 | "from src.paths import PARENT_DIR\n", 31 | "\n", 32 | "# load key-value pairs from .env file located in the parent directory\n", 33 | "load_dotenv(PARENT_DIR / '.env')\n", 34 | "\n", 35 | "HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Downloading raw data from 2022 to 2023\n", 48 | "File 2022 was already in local storage\n", 49 | "File 2023 was already in local storage\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "from datetime import datetime\n", 55 | "import pandas as pd\n", 56 | "from src.data import load_raw_data\n", 57 | "\n", 58 | "from_year = 2022\n", 59 | "to_year = datetime.now().year\n", 60 | "print(f'Downloading raw data from {from_year} to {to_year}')\n", 61 | "\n", 62 | "rides = pd.DataFrame()\n", 63 | "for year in range(from_year, to_year+1):\n", 64 | " \n", 65 | " # download data for the whole year\n", 66 | " rides_one_year = load_raw_data(year)\n", 67 | " \n", 68 | " # append rows\n", 69 | " rides = pd.concat([rides, rides_one_year])" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "len(rides)=4829258\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "print(f'{len(rides)=}')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "Timestamp('2023-09-30 23:58:56')" 98 | ] 99 | }, 100 | "execution_count": 6, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "rides.pickup_datetime.max()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 7, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "100%|██████████| 362/362 [00:09<00:00, 39.72it/s]\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "from src.data import transform_raw_data_into_ts_data\n", 124 | "\n", 125 | "ts_data = transform_raw_data_into_ts_data(rides)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 8, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/html": [ 136 | "
\n", 137 | "\n", 150 | "\n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | "
pickup_hourridespickup_location_id
50675422023-08-31 14:00:00157
50675432023-08-31 15:00:00357
50675442023-08-31 16:00:00157
50675452023-08-31 17:00:00157
50675462023-08-31 18:00:00057
............
50682672023-09-30 19:00:00057
50682682023-09-30 20:00:00057
50682692023-09-30 21:00:00057
50682702023-09-30 22:00:00357
50682712023-09-30 23:00:00057
\n", 228 | "

730 rows × 3 columns

\n", 229 | "
" 230 | ], 231 | "text/plain": [ 232 | " pickup_hour rides pickup_location_id\n", 233 | "5067542 2023-08-31 14:00:00 1 57\n", 234 | "5067543 2023-08-31 15:00:00 3 57\n", 235 | "5067544 2023-08-31 16:00:00 1 57\n", 236 | "5067545 2023-08-31 17:00:00 1 57\n", 237 | "5067546 2023-08-31 18:00:00 0 57\n", 238 | "... ... ... ...\n", 239 | "5068267 2023-09-30 19:00:00 0 57\n", 240 | "5068268 2023-09-30 20:00:00 0 57\n", 241 | "5068269 2023-09-30 21:00:00 0 57\n", 242 | "5068270 2023-09-30 22:00:00 3 57\n", 243 | "5068271 2023-09-30 23:00:00 0 57\n", 244 | "\n", 245 | "[730 rows x 3 columns]" 246 | ] 247 | }, 248 | "execution_count": 8, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "ts_data.tail(730)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 9, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stdout", 264 | "output_type": "stream", 265 | "text": [ 266 | "3.4.2\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "import hopsworks\n", 272 | "\n", 273 | "print(hopsworks.__version__)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 10, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "Connected. Call `.close()` to terminate connection gracefully.\n", 286 | "\n", 287 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "project = hopsworks.login(\n", 293 | " project=HOPSWORKS_PROJECT_NAME,\n", 294 | " api_key_value=HOPSWORKS_API_KEY\n", 295 | ")" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 11, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "Connected. Call `.close()` to terminate connection gracefully.\n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "feature_store = project.get_feature_store()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 12, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'\n", 322 | "FEATURE_GROUP_VERSION = 1" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 13, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "feature_group = feature_store.get_or_create_feature_group(\n", 332 | " name=FEATURE_GROUP_NAME,\n", 333 | " version=FEATURE_GROUP_VERSION,\n", 334 | " description=\"Time-series data at hourly frequency\",\n", 335 | " primary_key = ['pickup_location_id', 'pickup_hour'],\n", 336 | " event_time='pickup_hour',\n", 337 | ")" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 14, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "Feature Group created successfully, explore it at \n", 350 | "https://c.app.hopsworks.ai:443/p/100501/fs/100420/fg/280937\n" 351 | ] 352 | }, 353 | { 354 | "data": { 355 | "application/vnd.jupyter.widget-view+json": { 356 | "model_id": "56be964fb30c4be0be2c4de6982554c1", 357 | "version_major": 2, 358 | "version_minor": 0 359 | }, 360 | "text/plain": [ 361 | "Uploading Dataframe: 0.00% | | Rows 0/5068272 | Elapsed Time: 00:00 | Remaining Time: ?" 
362 | ] 363 | }, 364 | "metadata": {}, 365 | "output_type": "display_data" 366 | }, 367 | { 368 | "name": "stdout", 369 | "output_type": "stream", 370 | "text": [ 371 | "Launching job: time_series_hourly_feature_group_1_offline_fg_materialization\n", 372 | "Job started successfully, you can follow the progress at \n", 373 | "https://c.app.hopsworks.ai/p/100501/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions\n" 374 | ] 375 | }, 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "(, None)" 380 | ] 381 | }, 382 | "execution_count": 14, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "feature_group.insert(ts_data, write_options={\"wait_for_job\": False})" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [] 397 | } 398 | ], 399 | "metadata": { 400 | "kernelspec": { 401 | "display_name": ".venv", 402 | "language": "python", 403 | "name": "python3" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 3 409 | }, 410 | "file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython3", 415 | "version": "3.9.13" 416 | }, 417 | "orig_nbformat": 4, 418 | "vscode": { 419 | "interpreter": { 420 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 421 | } 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 2 426 | } 427 | -------------------------------------------------------------------------------- /notebooks/12_feature_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import src.config as config" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "current_date=Timestamp('2025-03-06 09:00:00')\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "from datetime import datetime, timedelta\n", 37 | "\n", 38 | "import pandas as pd\n", 39 | "\n", 40 | "current_date = pd.to_datetime(datetime.utcnow()).floor('H')\n", 41 | "print(f'{current_date=}')\n", 42 | "\n", 43 | "# we fetch raw data for the last 28 days, to add redundancy to our data pipeline\n", 44 | "fetch_data_to = current_date\n", 45 | "fetch_data_from = current_date - timedelta(days=70) #28" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "from src.data import load_raw_data\n", 55 | "\n", 56 | "def fetch_batch_raw_data(from_date: datetime, to_date: datetime) -> pd.DataFrame:\n", 57 | " \"\"\"\n", 58 | " Simulate production data by sampling historical data from 52 weeks ago (i.e. 
1 year)\n", 59 | " \"\"\"\n", 60 | " from_date_ = from_date - timedelta(days=7*52)\n", 61 | " to_date_ = to_date - timedelta(days=7*52)\n", 62 | " print(f'{from_date=}, {to_date_=}')\n", 63 | "\n", 64 | " # # download 2 files from website\n", 65 | " # rides = load_raw_data(year=from_date_.year) #, months=from_date_.month)\n", 66 | " # rides = rides[(rides.pickup_datetime >= from_date_) & (rides.pickup_datetime < to_date_)]\n", 67 | " \n", 68 | " # rides_2 = load_raw_data(year=to_date_.year) #, months=to_date_.month)\n", 69 | " # rides_2 = rides_2[(rides_2.pickup_datetime < to_date_) & (rides_2.pickup_datetime < to_date_)]\n", 70 | "\n", 71 | " # rides = pd.concat([rides, rides_2]) \n", 72 | "\n", 73 | " # Intenta cargar los datos del primer año\n", 74 | " rides = load_raw_data(year=from_date_.year)\n", 75 | " rides = rides[(rides.pickup_datetime >= from_date_) & (rides.pickup_datetime < to_date_)]\n", 76 | "\n", 77 | "\n", 78 | " ### FIX 1 ----- Esto hago que no se ejecute para probar si es el error\n", 79 | " # # Verifica si los años son diferentes\n", 80 | " # if from_date_.year != to_date_.year:\n", 81 | " # # Carga los datos del segundo año\n", 82 | " # rides_2 = load_raw_data(year=to_date_.year)\n", 83 | " # rides_2 = rides_2[(rides_2.pickup_datetime >= from_date_) & (rides_2.pickup_datetime < to_date_)]\n", 84 | " # rides = pd.concat([rides, rides_2]) \n", 85 | "\n", 86 | "\n", 87 | " # shift the data to pretend this is recent data\n", 88 | " rides['pickup_datetime'] += timedelta(days=7*52)\n", 89 | "\n", 90 | " rides.sort_values(by=['pickup_location_id', 'pickup_datetime'], inplace=True)\n", 91 | "\n", 92 | " return rides" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "from_date=Timestamp('2024-12-26 09:00:00'), to_date_=Timestamp('2024-03-07 09:00:00')\n", 105 | "File 2023 was already in local storage\n", 106 | "File 2024 was already in local storage\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "rides = fetch_batch_raw_data(from_date=fetch_data_from, to_date=fetch_data_to)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "356" 123 | ] 124 | }, 125 | "execution_count": 7, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "len(rides.pickup_location_id.unique())" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 8, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stderr", 141 | "output_type": "stream", 142 | "text": [ 143 | "100%|██████████| 356/356 [00:02<00:00, 176.17it/s]\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "from src.data import transform_raw_data_into_ts_data\n", 149 | "ts_data = transform_raw_data_into_ts_data(rides)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/html": [ 160 | "
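
fetch_batch_raw_data above simulates a live feed: it shifts the requested window back by 7*52 = 364 days (exactly 52 weeks, so weekdays stay aligned, which is why the docstring's "1 year" is really 364 days), filters last year's rides, and then adds the same offset back to pickup_datetime so the sampled rides look current. A small sketch of just the date arithmetic; to_date_ matches the value printed above, while the notebook's print statement shows the unshifted from_date:

from datetime import timedelta
import pandas as pd

current_date = pd.Timestamp('2025-03-06 09:00:00')   # the floored "now" printed earlier
fetch_data_from = current_date - timedelta(days=70)  # 2024-12-26 09:00:00
fetch_data_to = current_date

shift = timedelta(days=7 * 52)                       # 364 days = 52 whole weeks
from_date_ = fetch_data_from - shift                 # 2023-12-28 09:00:00
to_date_ = fetch_data_to - shift                     # 2024-03-07 09:00:00

# after filtering the historical rides between from_date_ and to_date_,
# the notebook shifts them forward again to pretend they are fresh:
# rides['pickup_datetime'] += shift
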
\n", 161 | "\n", 174 | "\n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | "
pickup_datetimepickup_location_idpickup_hour
17305832024-10-22 08:10:1322024-10-22 08:00:00
18972912024-10-22 08:26:5422024-10-22 08:00:00
18360142024-10-22 08:29:1822024-10-22 08:00:00
19161652024-10-22 08:35:4322024-10-22 08:00:00
19214882024-10-22 08:38:1122024-10-22 08:00:00
\n", 216 | "
" 217 | ], 218 | "text/plain": [ 219 | " pickup_datetime pickup_location_id pickup_hour\n", 220 | "1730583 2024-10-22 08:10:13 2 2024-10-22 08:00:00\n", 221 | "1897291 2024-10-22 08:26:54 2 2024-10-22 08:00:00\n", 222 | "1836014 2024-10-22 08:29:18 2 2024-10-22 08:00:00\n", 223 | "1916165 2024-10-22 08:35:43 2 2024-10-22 08:00:00\n", 224 | "1921488 2024-10-22 08:38:11 2 2024-10-22 08:00:00" 225 | ] 226 | }, 227 | "execution_count": 9, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "rides.head()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 10, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/html": [ 244 | "
\n", 245 | "\n", 258 | "\n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | "
pickup_hourridespickup_location_id
5980752024-12-31 02:00:000362
5980762024-12-31 03:00:000362
5980772024-12-31 04:00:000362
5980782024-12-31 05:00:000362
5980792024-12-31 06:00:000362
\n", 300 | "
" 301 | ], 302 | "text/plain": [ 303 | " pickup_hour rides pickup_location_id\n", 304 | "598075 2024-12-31 02:00:00 0 362\n", 305 | "598076 2024-12-31 03:00:00 0 362\n", 306 | "598077 2024-12-31 04:00:00 0 362\n", 307 | "598078 2024-12-31 05:00:00 0 362\n", 308 | "598079 2024-12-31 06:00:00 0 362" 309 | ] 310 | }, 311 | "execution_count": 10, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "ts_data.tail()" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 11, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "2024-12-31 08:19:29,320 INFO: Initializing external client\n", 330 | "2024-12-31 08:19:29,323 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 331 | "2024-12-31 08:19:34,011 INFO: Python Engine initialized.\n", 332 | "\n", 333 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n" 334 | ] 335 | } 336 | ], 337 | "source": [ 338 | "import hopsworks\n", 339 | "\n", 340 | "# connect to the project\n", 341 | "project = hopsworks.login(\n", 342 | " project=config.HOPSWORKS_PROJECT_NAME,\n", 343 | " api_key_value=config.HOPSWORKS_API_KEY\n", 344 | ")\n", 345 | "\n", 346 | "# connect to the feature store\n", 347 | "feature_store = project.get_feature_store()\n", 348 | "\n", 349 | "# connect to the feature group\n", 350 | "feature_group = feature_store.get_or_create_feature_group(\n", 351 | " name=config.FEATURE_GROUP_NAME,\n", 352 | " version=config.FEATURE_GROUP_VERSION,\n", 353 | " description=\"Time-series data at hourly frequency\",\n", 354 | " primary_key = ['pickup_location_id', 'pickup_hour'],\n", 355 | " event_time='pickup_hour',\n", 356 | ")" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 12, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stderr", 366 | "output_type": "stream", 367 | "text": [ 368 | "Uploading Dataframe: 100.00% |██████████| Rows 530880/530880 | Elapsed Time: 03:18 | Remaining Time: 00:00\n" 369 | ] 370 | }, 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "Launching job: time_series_hourly_feature_group_1_offline_fg_materialization\n", 376 | "Job started successfully, you can follow the progress at \n", 377 | "https://c.app.hopsworks.ai:443/p/100501/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions\n", 378 | "2024-12-31 08:23:09,205 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED\n", 379 | "2024-12-31 08:23:15,859 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED\n", 380 | "2024-12-31 08:26:32,549 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. 
Final status: SUCCEEDED\n", 381 | "2024-12-31 08:26:32,685 INFO: Waiting for log aggregation to finish.\n", 382 | "2024-12-31 08:27:16,560 INFO: Execution finished successfully.\n" 383 | ] 384 | }, 385 | { 386 | "data": { 387 | "text/plain": [ 388 | "(Job('time_series_hourly_feature_group_1_offline_fg_materialization', 'SPARK'),\n", 389 | " None)" 390 | ] 391 | }, 392 | "execution_count": 12, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "feature_group.insert(ts_data, write_options={\"wait_for_job\": True})" 399 | ] 400 | } 401 | ], 402 | "metadata": { 403 | "kernelspec": { 404 | "display_name": ".venv", 405 | "language": "python", 406 | "name": "python3" 407 | }, 408 | "language_info": { 409 | "codemirror_mode": { 410 | "name": "ipython", 411 | "version": 3 412 | }, 413 | "file_extension": ".py", 414 | "mimetype": "text/x-python", 415 | "name": "python", 416 | "nbconvert_exporter": "python", 417 | "pygments_lexer": "ipython3", 418 | "version": "3.9.13" 419 | }, 420 | "orig_nbformat": 4, 421 | "vscode": { 422 | "interpreter": { 423 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 424 | } 425 | } 426 | }, 427 | "nbformat": 4, 428 | "nbformat_minor": 2 429 | } 430 | -------------------------------------------------------------------------------- /notebooks/14_inference_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "current_date=Timestamp('2024-03-25 20:00:00+0000', tz='UTC')\n" 23 | ] 24 | }, 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "Timestamp('2024-03-25 20:00:00+0000', tz='UTC')" 29 | ] 30 | }, 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "from datetime import datetime, timedelta\n", 38 | "import pandas as pd\n", 39 | "\n", 40 | "current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H') # - timedelta(hours=1)\n", 41 | "print(f'{current_date=}')\n", 42 | "#current_date = pd.Timestamp('2023-11-10 10:00:00+0000', tz='UTC')\n", 43 | "current_date" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Connected. Call `.close()` to terminate connection gracefully.\n", 56 | "\n", 57 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n", 58 | "Connected. 
Call `.close()` to terminate connection gracefully.\n", 59 | "Fetching data from 2024-02-26 20:00:00+00:00 to 2024-03-25 19:00:00+00:00\n", 60 | "Finished: Reading data from Hopsworks, using ArrowFlight (13.50s) \n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "from src.inference import load_batch_of_features_from_store\n", 66 | "\n", 67 | "features = load_batch_of_features_from_store(current_date)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stderr", 77 | "output_type": "stream", 78 | "text": [ 79 | "DeprecationWarning: Using 'method_whitelist' with Retry is deprecated and will be removed in v2.0. Use 'allowed_methods' instead\n", 80 | "\u001b[1;38;5;214mCOMET WARNING:\u001b[0m This method has been deprecated, we recommend using the api.get_model(...) method to get the Model Object and then using model.download(...)\n", 81 | "\u001b[1;38;5;39mCOMET INFO:\u001b[0m Downloading registry model 'bike_demand_predictor_next_hour', version '1.1.0', stage None from workspace 'javieryanzon'...\n", 82 | "\u001b[1;38;5;39mCOMET INFO:\u001b[0m Unzipping model to 'C:\\\\Users\\\\jayan\\\\Desktop\\\\Python, SQL, Power Bi, cursos\\\\Proyectos\\\\bike_sharing_demand_predictor\\\\models' ...\n", 83 | "\u001b[1;38;5;39mCOMET INFO:\u001b[0m done!\n" 84 | ] 85 | }, 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Connection closed.\n", 91 | "Connected. Call `.close()` to terminate connection gracefully.\n", 92 | "\n", 93 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n", 94 | "Connected. Call `.close()` to terminate connection gracefully.\n" 95 | ] 96 | }, 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "VersionWarning: No version provided for getting feature view `latitud_y_longitud_view`, defaulting to `1`.\n" 102 | ] 103 | }, 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "Finished: Reading data from Hopsworks, using ArrowFlight (0.69s) \n", 109 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 110 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 111 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 112 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 113 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 114 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 115 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 116 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 117 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. 
Current value: feature_fraction=0.39920038588970796\n", 118 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 119 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 120 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 121 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 122 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 123 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 124 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 125 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 126 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 127 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 128 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 129 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 130 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 131 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 132 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n" 133 | ] 134 | }, 135 | { 136 | "name": "stderr", 137 | "output_type": "stream", 138 | "text": [ 139 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 140 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 141 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 142 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 143 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 144 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 145 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 146 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. 
(Deprecated NumPy 1.25)\n", 147 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 148 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 149 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 150 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 151 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 152 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 153 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 154 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 155 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 156 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 157 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 158 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 159 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 160 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 161 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 162 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 163 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 164 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 165 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 166 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 167 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 168 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 169 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 170 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 171 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 172 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 173 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 174 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 175 | "DeprecationWarning: np.find_common_type is deprecated. 
Please use `np.result_type` or `np.promote_types`.\n", 176 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 177 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 178 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 179 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 180 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 181 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 182 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 183 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 184 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n" 185 | ] 186 | }, 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 192 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 193 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 194 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 195 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 196 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 197 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 198 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 199 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 200 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 201 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 202 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 203 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 204 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.7983723942570424\n", 205 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 206 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 207 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 208 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 209 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 210 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 211 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 212 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 213 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 214 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 215 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 216 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 217 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 218 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 219 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 220 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 221 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 222 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 223 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 224 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 225 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. 
Current value: feature_fraction=0.39920038588970796\n", 226 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 227 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 228 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 229 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 230 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 231 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 232 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 233 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 234 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 235 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 236 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 237 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 238 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n" 239 | ] 240 | }, 241 | { 242 | "name": "stderr", 243 | "output_type": "stream", 244 | "text": [ 245 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 246 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 247 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 248 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 249 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 250 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 251 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 252 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 253 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 254 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 255 | "DeprecationWarning: np.find_common_type is deprecated. 
Please use `np.result_type` or `np.promote_types`.\n", 256 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 257 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 258 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 259 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 260 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 261 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 262 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 263 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 264 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 265 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 266 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 267 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 268 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 269 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 270 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "##NUEVO\n", 276 | "##SEGUIR PROBANDO esta guardando el modelo como un array y creo que esta mal\n", 277 | "from src.model_registry_api import get_latest_model_from_registry\n", 278 | "from src.inference import get_model_predictions\n", 279 | "\n", 280 | "model = get_latest_model_from_registry(model_name='bike_demand_predictor_next_hour', status= 'Production')\n", 281 | "predictions = get_model_predictions(model, features)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 5, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# from src.inference import (\n", 291 | "# load_model_from_registry,\n", 292 | "# get_model_predictions\n", 293 | "# )\n", 294 | "\n", 295 | "# model = load_model_from_registry()\n", 296 | "# predictions = get_model_predictions(model, features)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 5, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/html": [ 307 | "
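
The predictions dataframe rendered below has one row per station and 38 columns: 36 forecast horizons (rides_next_1_hour … rides_next_36_hour) plus pickup_location_id and the pickup_hour stamped by the predictions['pickup_hour'] = current_date line further down. A tiny, illustrative snippet for pulling a single horizon out of it (column and frame names as used in this notebook):

# e.g. next-hour forecast for station 5 (7.0 in the run captured below)
next_hour = predictions[['pickup_location_id', 'rides_next_1_hour']]
station_5 = next_hour.loc[next_hour['pickup_location_id'] == 5, 'rides_next_1_hour'].iloc[0]
print(station_5)
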
\n", 308 | "\n", 321 | "\n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 
593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | "
rides_next_1_hourrides_next_2_hourrides_next_3_hourrides_next_4_hourrides_next_5_hourrides_next_6_hourrides_next_7_hourrides_next_8_hourrides_next_9_hourrides_next_10_hour...rides_next_29_hourrides_next_30_hourrides_next_31_hourrides_next_32_hourrides_next_33_hourrides_next_34_hourrides_next_35_hourrides_next_36_hourpickup_location_idpickup_hour
01.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.01.01.022024-03-25 20:00:00+00:00
12.01.02.01.01.00.00.00.00.00.0...1.01.01.00.00.00.01.04.032024-03-25 20:00:00+00:00
21.01.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.01.042024-03-25 20:00:00+00:00
37.04.04.03.03.02.01.01.02.02.0...2.02.01.01.01.02.03.03.052024-03-25 20:00:00+00:00
42.01.01.01.01.00.00.00.00.00.0...1.01.01.00.00.00.01.01.062024-03-25 20:00:00+00:00
..................................................................
3151.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.01.01.04932024-03-25 20:00:00+00:00
3161.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.04942024-03-25 20:00:00+00:00
3170.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.01.04962024-03-25 20:00:00+00:00
3181.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.01.04972024-03-25 20:00:00+00:00
3191.01.00.00.00.00.00.00.00.01.0...0.00.00.00.00.00.01.01.04982024-03-25 20:00:00+00:00
\n", 615 | "

320 rows × 38 columns

\n", 616 | "
" 617 | ], 618 | "text/plain": [ 619 | " rides_next_1_hour rides_next_2_hour rides_next_3_hour \\\n", 620 | "0 1.0 0.0 0.0 \n", 621 | "1 2.0 1.0 2.0 \n", 622 | "2 1.0 1.0 0.0 \n", 623 | "3 7.0 4.0 4.0 \n", 624 | "4 2.0 1.0 1.0 \n", 625 | ".. ... ... ... \n", 626 | "315 1.0 0.0 0.0 \n", 627 | "316 1.0 0.0 0.0 \n", 628 | "317 0.0 0.0 0.0 \n", 629 | "318 1.0 0.0 0.0 \n", 630 | "319 1.0 1.0 0.0 \n", 631 | "\n", 632 | " rides_next_4_hour rides_next_5_hour rides_next_6_hour \\\n", 633 | "0 0.0 0.0 0.0 \n", 634 | "1 1.0 1.0 0.0 \n", 635 | "2 0.0 0.0 0.0 \n", 636 | "3 3.0 3.0 2.0 \n", 637 | "4 1.0 1.0 0.0 \n", 638 | ".. ... ... ... \n", 639 | "315 0.0 0.0 0.0 \n", 640 | "316 0.0 0.0 0.0 \n", 641 | "317 0.0 0.0 0.0 \n", 642 | "318 0.0 0.0 0.0 \n", 643 | "319 0.0 0.0 0.0 \n", 644 | "\n", 645 | " rides_next_7_hour rides_next_8_hour rides_next_9_hour \\\n", 646 | "0 0.0 0.0 0.0 \n", 647 | "1 0.0 0.0 0.0 \n", 648 | "2 0.0 0.0 0.0 \n", 649 | "3 1.0 1.0 2.0 \n", 650 | "4 0.0 0.0 0.0 \n", 651 | ".. ... ... ... \n", 652 | "315 0.0 0.0 0.0 \n", 653 | "316 0.0 0.0 0.0 \n", 654 | "317 0.0 0.0 0.0 \n", 655 | "318 0.0 0.0 0.0 \n", 656 | "319 0.0 0.0 0.0 \n", 657 | "\n", 658 | " rides_next_10_hour ... rides_next_29_hour rides_next_30_hour \\\n", 659 | "0 0.0 ... 0.0 0.0 \n", 660 | "1 0.0 ... 1.0 1.0 \n", 661 | "2 0.0 ... 0.0 0.0 \n", 662 | "3 2.0 ... 2.0 2.0 \n", 663 | "4 0.0 ... 1.0 1.0 \n", 664 | ".. ... ... ... ... \n", 665 | "315 0.0 ... 0.0 0.0 \n", 666 | "316 0.0 ... 0.0 0.0 \n", 667 | "317 0.0 ... 0.0 0.0 \n", 668 | "318 0.0 ... 0.0 0.0 \n", 669 | "319 1.0 ... 0.0 0.0 \n", 670 | "\n", 671 | " rides_next_31_hour rides_next_32_hour rides_next_33_hour \\\n", 672 | "0 0.0 0.0 0.0 \n", 673 | "1 1.0 0.0 0.0 \n", 674 | "2 0.0 0.0 0.0 \n", 675 | "3 1.0 1.0 1.0 \n", 676 | "4 1.0 0.0 0.0 \n", 677 | ".. ... ... ... \n", 678 | "315 0.0 0.0 0.0 \n", 679 | "316 0.0 0.0 0.0 \n", 680 | "317 0.0 0.0 0.0 \n", 681 | "318 0.0 0.0 0.0 \n", 682 | "319 0.0 0.0 0.0 \n", 683 | "\n", 684 | " rides_next_34_hour rides_next_35_hour rides_next_36_hour \\\n", 685 | "0 0.0 1.0 1.0 \n", 686 | "1 0.0 1.0 4.0 \n", 687 | "2 0.0 0.0 1.0 \n", 688 | "3 2.0 3.0 3.0 \n", 689 | "4 0.0 1.0 1.0 \n", 690 | ".. ... ... ... \n", 691 | "315 0.0 1.0 1.0 \n", 692 | "316 0.0 0.0 0.0 \n", 693 | "317 0.0 0.0 1.0 \n", 694 | "318 0.0 0.0 1.0 \n", 695 | "319 0.0 1.0 1.0 \n", 696 | "\n", 697 | " pickup_location_id pickup_hour \n", 698 | "0 2 2024-03-25 20:00:00+00:00 \n", 699 | "1 3 2024-03-25 20:00:00+00:00 \n", 700 | "2 4 2024-03-25 20:00:00+00:00 \n", 701 | "3 5 2024-03-25 20:00:00+00:00 \n", 702 | "4 6 2024-03-25 20:00:00+00:00 \n", 703 | ".. ... ... 
\n", 704 | "315 493 2024-03-25 20:00:00+00:00 \n", 705 | "316 494 2024-03-25 20:00:00+00:00 \n", 706 | "317 496 2024-03-25 20:00:00+00:00 \n", 707 | "318 497 2024-03-25 20:00:00+00:00 \n", 708 | "319 498 2024-03-25 20:00:00+00:00 \n", 709 | "\n", 710 | "[320 rows x 38 columns]" 711 | ] 712 | }, 713 | "execution_count": 5, 714 | "metadata": {}, 715 | "output_type": "execute_result" 716 | } 717 | ], 718 | "source": [ 719 | "predictions['pickup_hour'] = current_date\n", 720 | "predictions" 721 | ] 722 | }, 723 | { 724 | "attachments": {}, 725 | "cell_type": "markdown", 726 | "metadata": {}, 727 | "source": [ 728 | "### Save these predictions in the feature store, so they can be later consumed by our Streamlit app" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 6, 734 | "metadata": {}, 735 | "outputs": [ 736 | { 737 | "name": "stdout", 738 | "output_type": "stream", 739 | "text": [ 740 | "Connection closed.\n", 741 | "Connected. Call `.close()` to terminate connection gracefully.\n", 742 | "\n", 743 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n", 744 | "Connected. Call `.close()` to terminate connection gracefully.\n" 745 | ] 746 | } 747 | ], 748 | "source": [ 749 | "from src.feature_store_api import get_feature_store\n", 750 | "import src.config as config\n", 751 | "\n", 752 | "# connect to the feature group\n", 753 | "feature_group = get_feature_store().get_or_create_feature_group(\n", 754 | " name=config.FEATURE_GROUP_MODEL_PREDICTIONS,\n", 755 | " version=1,\n", 756 | " description=\"Predictions generate by our production model\",\n", 757 | " primary_key = ['pickup_location_id', 'pickup_hour'],\n", 758 | " event_time='pickup_hour',\n", 759 | ")" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": 7, 765 | "metadata": {}, 766 | "outputs": [ 767 | { 768 | "data": { 769 | "application/vnd.jupyter.widget-view+json": { 770 | "model_id": "bc454aa3ed6c40fb88a8461cbd3e22c0", 771 | "version_major": 2, 772 | "version_minor": 0 773 | }, 774 | "text/plain": [ 775 | "Uploading Dataframe: 0.00% | | Rows 0/320 | Elapsed Time: 00:00 | Remaining Time: ?" 
776 | ] 777 | }, 778 | "metadata": {}, 779 | "output_type": "display_data" 780 | }, 781 | { 782 | "name": "stdout", 783 | "output_type": "stream", 784 | "text": [ 785 | "Launching job: model_predictions_feature_group_1_offline_fg_materialization\n", 786 | "Job started successfully, you can follow the progress at \n", 787 | "https://c.app.hopsworks.ai/p/100501/jobs/named/model_predictions_feature_group_1_offline_fg_materialization/executions\n" 788 | ] 789 | }, 790 | { 791 | "data": { 792 | "text/plain": [ 793 | "(, None)" 794 | ] 795 | }, 796 | "execution_count": 7, 797 | "metadata": {}, 798 | "output_type": "execute_result" 799 | } 800 | ], 801 | "source": [ 802 | "feature_group.insert(predictions, write_options={\"wait_for_job\": False})" 803 | ] 804 | } 805 | ], 806 | "metadata": { 807 | "kernelspec": { 808 | "display_name": ".venv", 809 | "language": "python", 810 | "name": "python3" 811 | }, 812 | "language_info": { 813 | "codemirror_mode": { 814 | "name": "ipython", 815 | "version": 3 816 | }, 817 | "file_extension": ".py", 818 | "mimetype": "text/x-python", 819 | "name": "python", 820 | "nbconvert_exporter": "python", 821 | "pygments_lexer": "ipython3", 822 | "version": "3.9.13" 823 | }, 824 | "orig_nbformat": 4, 825 | "vscode": { 826 | "interpreter": { 827 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 828 | } 829 | } 830 | }, 831 | "nbformat": 4, 832 | "nbformat_minor": 2 833 | } 834 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "src" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["jayanra "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.9" 10 | python-dotenv = "^1.0.0" 11 | jupyter = "^1.0.0" 12 | requests = "^2.31.0" 13 | tqdm = "^4.66.1" 14 | plotly = "^5.16.1" 15 | scikit-learn = "^1.3.0" 16 | xgboost = "^1.7.6" 17 | lightgbm = "^4.0.0" 18 | optuna = "^3.3.0" 19 | wget = "^3.2" 20 | geopandas = "^0.14.0" 21 | streamlit = {version = "^1.28.0", python = ">=3.9,<3.9.7 || >3.9.7,<4.0"} 22 | pydeck = "^0.8.0" 23 | comet-ml = "^3.38.0" 24 | hopsworks = {version = "4.1.0", python = ">=3.9,<3.11"} 25 | confluent-kafka = "^2.6.1" 26 | 27 | 28 | [tool.poetry.group.dev.dependencies] 29 | ipykernel = "^6.25.1" 30 | 31 | [build-system] 32 | requires = ["poetry-core"] 33 | build-backend = "poetry.core.masonry.api" 34 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__init__.py -------------------------------------------------------------------------------- /src/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /src/__pycache__/data.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/data.cpython-39.pyc 
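
Before the remaining src/ modules, a compact script-style recap of the inference pipeline that notebook 14 above executes. This is a sketch only: it assumes the same helper functions and signatures used in that notebook (src.inference, src.model_registry_api, src.feature_store_api) and the constants from src/config.py shown below.

from datetime import datetime

import pandas as pd

import src.config as config
from src.feature_store_api import get_feature_store
from src.inference import load_batch_of_features_from_store, get_model_predictions
from src.model_registry_api import get_latest_model_from_registry

# 1. floor "now" to the hour, exactly as the notebook does
current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H')

# 2. fetch the last 28 days of hourly features per station from the feature store
features = load_batch_of_features_from_store(current_date)

# 3. pull the production model from the registry and predict the next hours
model = get_latest_model_from_registry(
    model_name='bike_demand_predictor_next_hour', status='Production'
)
predictions = get_model_predictions(model, features)
predictions['pickup_hour'] = current_date

# 4. persist the predictions so the monitoring / Streamlit apps can read them later
feature_group = get_feature_store().get_or_create_feature_group(
    name=config.FEATURE_GROUP_MODEL_PREDICTIONS,
    version=1,
    description="Predictions generated by our production model",
    primary_key=['pickup_location_id', 'pickup_hour'],
    event_time='pickup_hour',
)
feature_group.insert(predictions, write_options={"wait_for_job": False})
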
-------------------------------------------------------------------------------- /src/__pycache__/data_split.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/data_split.cpython-39.pyc -------------------------------------------------------------------------------- /src/__pycache__/model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/model.cpython-39.pyc -------------------------------------------------------------------------------- /src/__pycache__/paths.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/paths.cpython-39.pyc -------------------------------------------------------------------------------- /src/__pycache__/plot.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/plot.cpython-39.pyc -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from src.paths import PARENT_DIR 5 | 6 | # load key-value pairs from .env file located in the parent directory 7 | load_dotenv(PARENT_DIR / '.env') 8 | 9 | HOPSWORKS_PROJECT_NAME = 'bike_sharing_demand' 10 | try: 11 | HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY'] 12 | except: 13 | raise Exception('Create an .env file on the project root with the HOPSWORKS_API_KEY') 14 | 15 | FEATURE_GROUP_NAME = 'time_series_hourly_feature_group' 16 | FEATURE_GROUP_VERSION = 1 17 | FEATURE_VIEW_NAME = 'time_series_hourly_feature_view' 18 | FEATURE_VIEW_VERSION = 1 19 | MODEL_NAME = "bike_demand_predictor_next_hour" 20 | MODEL_VERSION = 1 21 | 22 | #Agrego esto para que se consulte al feature store la latitud y longitud 23 | FEATURE_GROUP_LAT_LONG = 'latitud_y_longitud_group' 24 | FEATURE_VIEW_LAT_LONG = 'latitud_y_longitud_view' 25 | #FEATURE_VIEW_LAT_LONG_VERSION = 1 26 | 27 | 28 | # added for monitoring purposes 29 | FEATURE_GROUP_MODEL_PREDICTIONS = 'model_predictions_feature_group_' 30 | FEATURE_VIEW_MODEL_PREDICTIONS = 'model_predictions_feature_view_' 31 | FEATURE_VIEW_MONITORING = 'predictions_vs_actuals_for_monitoring_feature_view' 32 | 33 | # number of historical values our model needs to generate predictions 34 | N_FEATURES = 24 * 28 35 | 36 | # maximum Mean Absolute Error we allow our production model to have 37 | MAX_MAE = 4.0 -------------------------------------------------------------------------------- /src/data.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from datetime import datetime, timedelta 3 | from typing import Optional, List, Tuple 4 | from pdb import set_trace as stop 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import requests 9 | from tqdm import tqdm 10 | import pyarrow as pa 11 | import zipfile 12 | import pyarrow.parquet as pq 13 | import subprocess 14 | 
15 | from src.paths import RAW_DATA_DIR, TRANSFORMED_DATA_DIR 16 | 17 | 18 | def download_one_file_of_raw_data(year: int) -> Path: #, month: int) -> Path: 19 | """ 20 | Downloads Parquet file with historical bike rides for the given `year` and 21 | `month` 22 | """ 23 | URL = f'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/transporte-y-obras-publicas/bicicletas-publicas/recorridos-realizados-{year}.zip' 24 | 25 | # Ruta de destino para guardar el archivo descargado 26 | destination_path = RAW_DATA_DIR / f'recorridos-realizados-{year}.zip' 27 | 28 | try: 29 | # Utiliza wget para descargar el archivo en la ubicación deseada 30 | subprocess.run(['wget', URL, '-O', destination_path]) 31 | 32 | # Verifica si el archivo se descargó correctamente 33 | if destination_path.is_file(): 34 | print(f'Descargado año {year}') 35 | return destination_path 36 | else: 37 | raise Exception(f'Error al descargar {URL}: El archivo no se descargó correctamente.') 38 | 39 | except Exception as e: 40 | raise Exception(f'Error al descargar {URL}: {str(e)}') 41 | 42 | 43 | # response = requests.get(URL) 44 | 45 | # if response.status_code == 200: 46 | # path = RAW_DATA_DIR / f'recorridos-realizados-{year}.zip' 47 | # open(path, "wb").write(response.content) 48 | # print(f'descargado año {year}') 49 | # # time.sleep(2) 50 | # return path 51 | # else: 52 | # raise Exception(f'{URL} is not available') 53 | 54 | def unzip_and_convert_csv_to_parquet(year: int) -> Path: 55 | nombre_archivo_zip = RAW_DATA_DIR / f"recorridos-realizados-{year}.zip" 56 | # Descomprimir el archivo zip 57 | with zipfile.ZipFile(nombre_archivo_zip, 'r') as archivo_zip: 58 | 59 | # Extraer el archivo CSV del zip 60 | nombre_archivo_csv = archivo_zip.namelist()[0] # Suponiendo que el archivo CSV es el primer archivo en el zip 61 | archivo_zip.extractall(RAW_DATA_DIR) #(f"../data/raw/") 62 | 63 | # Leer el archivo CSV con pandas 64 | df = pd.read_csv(RAW_DATA_DIR / nombre_archivo_csv, delimiter=',', decimal=".") #RAW_DATA_DIR / 65 | 66 | # Convertir el DataFrame a formato parquet 67 | nombre_archivo_parquet = f"rides_{year}.parquet" 68 | table = pa.Table.from_pandas(df) 69 | pq.write_table(table, RAW_DATA_DIR / nombre_archivo_parquet) 70 | 71 | path = RAW_DATA_DIR / f'rides_{year}.parquet' 72 | return path 73 | 74 | 75 | def validate_raw_data( 76 | rides: pd.DataFrame, 77 | year: int, 78 | #month: int, 79 | ) -> pd.DataFrame: 80 | """ 81 | Removes rows with pickup_datetimes outside their valid range 82 | """ 83 | # keep only rides for this month 84 | # this_month_start = f'{year}-{month:02d}-01' 85 | # next_month_start = f'{year}-{month+1:02d}-01' if month < 12 else f'{year+1}-01-01' 86 | this_year_start = f'{year}-01-01' 87 | next_year_start = f'{year+1}-01-01' 88 | rides = rides[rides.pickup_datetime >= this_year_start] 89 | rides = rides[rides.pickup_datetime < next_year_start] 90 | 91 | return rides 92 | 93 | 94 | def fetch_ride_events_from_data_warehouse( 95 | from_date: datetime, 96 | to_date: datetime 97 | ) -> pd.DataFrame: 98 | """ 99 | This function is used to simulate production data by sampling historical data 100 | from 52 weeks ago (i.e. 
1 year) 101 | """ 102 | from_date_ = from_date - timedelta(days=7*52) 103 | to_date_ = to_date - timedelta(days=7*52) 104 | print(f'Fetching ride events from {from_date} to {to_date}') 105 | 106 | if (from_date_.year == to_date_.year) and (from_date_.month == to_date_.month): 107 | # download 1 file of data only 108 | rides = load_raw_data(year=from_date_.year, months=from_date_.month) 109 | rides = rides[rides.pickup_datetime >= from_date_] 110 | rides = rides[rides.pickup_datetime < to_date_] 111 | 112 | else: 113 | # download 2 files from website 114 | rides = load_raw_data(year=from_date_.year, months=from_date_.month) 115 | rides = rides[rides.pickup_datetime >= from_date_] 116 | rides_2 = load_raw_data(year=to_date_.year, months=to_date_.month) 117 | rides_2 = rides_2[rides_2.pickup_datetime < to_date_] 118 | rides = pd.concat([rides, rides_2]) 119 | 120 | # shift the pickup_datetime back 1 year ahead, to simulate production data 121 | # using its 7*52-days-ago value 122 | rides['pickup_datetime'] += timedelta(days=7*52) 123 | 124 | rides.sort_values(by=['pickup_location_id', 'pickup_datetime'], inplace=True) 125 | 126 | return rides 127 | 128 | 129 | def load_raw_data( 130 | year: int 131 | #months: Optional[List[int]] = None 132 | ) -> pd.DataFrame: 133 | """ 134 | Loads raw data from local storage or downloads it from the BsAs website, and 135 | then loads it into a Pandas DataFrame 136 | 137 | Args: 138 | year: year of the data to download 139 | #months: months of the data to download. If `None`, download all months 140 | 141 | Returns: 142 | pd.DataFrame: DataFrame with the following columns: 143 | - pickup_datetime: datetime of the pickup 144 | - pickup_location_id: ID of the pickup location 145 | """ 146 | rides = pd.DataFrame() 147 | 148 | # if months is None: 149 | # # download data for the entire year (all months) 150 | # months = list(range(1, 13)) 151 | # elif isinstance(months, int): 152 | # # download data only for the month specified by the int `month` 153 | # months = [months] 154 | 155 | #for month in months: 156 | 157 | local_file = RAW_DATA_DIR / f'rides_{year}.parquet' #-{month:02d}.parquet' 158 | if not local_file.exists(): 159 | try: 160 | # download the file from the BsAs website 161 | print(f'Downloading file {year}') #-{month:02d} 162 | download_one_file_of_raw_data(year) 163 | unzip_and_convert_csv_to_parquet(year) 164 | except: 165 | print(f'{year} file is not available') 166 | #continue 167 | else: 168 | print(f'File {year} was already in local storage') 169 | 170 | # load the file into Pandas 171 | rides_one_year = pd.read_parquet(local_file) 172 | 173 | # rename columns 174 | rides_one_year = rides_one_year[['fecha_origen_recorrido', 'id_estacion_origen']] 175 | rides_one_year.rename(columns={ 176 | 'fecha_origen_recorrido': 'pickup_datetime', 177 | 'id_estacion_origen': 'pickup_location_id', 178 | }, inplace=True) 179 | 180 | # eliminate "BAEcobici" and convert it to int type 181 | rides_one_year['pickup_location_id'] = rides_one_year['pickup_location_id'].str.replace('BAEcobici', '').astype(int) 182 | # transform "pickup_datetime" to datetime 183 | rides_one_year['pickup_datetime'] = pd.to_datetime(rides_one_year['pickup_datetime'],format='%Y-%m-%d %H:%M:%S') 184 | 185 | # validate the file 186 | rides_one_year = validate_raw_data(rides_one_year, year) 187 | 188 | # append to existing data 189 | rides = pd.concat([rides, rides_one_year]) 190 | 191 | if rides.empty: 192 | # no data, so we return an empty dataframe 193 | return pd.DataFrame() 194 | else: 
195 | # keep only time and origin of the ride
196 | rides = rides[['pickup_datetime', 'pickup_location_id']]
197 | return rides
198 |
199 |
200 | def add_missing_slots(ts_data: pd.DataFrame) -> pd.DataFrame:
201 | """
202 | Add necessary rows to the input 'ts_data' to make sure the output
203 | has a complete list of
204 | - pickup_hours
205 | - pickup_location_ids
206 | """
207 | # The old version generated location ids that do not exist, so it was replaced. Old version:
208 | #location_ids = range(1, ts_data['pickup_location_id'].max() + 1)
209 |
210 | # This is the modified line
211 | location_ids = ts_data['pickup_location_id'].unique()
212 |
213 | full_range = pd.date_range(ts_data['pickup_hour'].min(),
214 | ts_data['pickup_hour'].max(),
215 | freq='H')
216 | output = pd.DataFrame()
217 | for location_id in tqdm(location_ids):
218 |
219 | # keep only rides for this 'location_id'
220 | ts_data_i = ts_data.loc[ts_data.pickup_location_id == location_id, ['pickup_hour', 'rides']]
221 |
222 | if ts_data_i.empty:
223 | # add a dummy entry with a 0
224 | ts_data_i = pd.DataFrame.from_dict([
225 | {'pickup_hour': ts_data['pickup_hour'].max(), 'rides': 0}
226 | ])
227 |
228 | # quick way to add missing dates with 0 in a Series
229 | # taken from https://stackoverflow.com/a/19324591
230 | ts_data_i.set_index('pickup_hour', inplace=True)
231 | ts_data_i.index = pd.DatetimeIndex(ts_data_i.index)
232 | ts_data_i = ts_data_i.reindex(full_range, fill_value=0)
233 |
234 | # add back the `location_id` column
235 | ts_data_i['pickup_location_id'] = location_id
236 |
237 | output = pd.concat([output, ts_data_i])
238 |
239 | # move the pickup_hour from the index to a dataframe column
240 | output = output.reset_index().rename(columns={'index': 'pickup_hour'})
241 |
242 | return output
243 |
244 |
245 | def transform_raw_data_into_ts_data(
246 | rides: pd.DataFrame
247 | ) -> pd.DataFrame:
248 | """Aggregates raw ride events into an hourly time series of rides per pickup location."""
249 | # sum rides per location and pickup_hour
250 | rides['pickup_hour'] = rides['pickup_datetime'].dt.floor('H')
251 | agg_rides = rides.groupby(['pickup_hour', 'pickup_location_id']).size().reset_index()
252 | agg_rides.rename(columns={0: 'rides'}, inplace=True)
253 | # add rows for (locations, pickup_hours)s with 0 rides
254 | agg_rides_all_slots = add_missing_slots(agg_rides)
255 |
256 | # keep only the rows for stations that existed in 2022
257 | # This is done because the query simulation uses the 2022 data; if stations that are not present there appeared later, the model would learn incorrectly.
258 | estaciones_2022=[ 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 17, 20, 259 | 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 35, 260 | 36, 38, 41, 43, 44, 45, 46, 48, 49, 50, 51, 54, 56, 261 | 57, 58, 59, 60, 61, 63, 64, 65, 66, 68, 69, 70, 71, 262 | 73, 74, 75, 76, 77, 79, 80, 82, 83, 84, 85, 86, 87, 263 | 89, 91, 92, 93, 94, 95, 96, 98, 99, 101, 102, 104, 107, 264 | 111, 112, 114, 116, 117, 118, 120, 121, 122, 124, 126, 128, 130, 265 | 131, 132, 134, 135, 137, 138, 144, 146, 149, 150, 151, 152, 153, 266 | 155, 156, 158, 161, 162, 163, 164, 165, 166, 167, 168, 169, 171, 267 | 172, 174, 175, 176, 177, 179, 181, 182, 183, 184, 186, 187, 188, 268 | 189, 190, 191, 193, 194, 196, 197, 199, 200, 202, 203, 204, 205, 269 | 206, 207, 208, 210, 212, 213, 214, 215, 216, 219, 220, 222, 223, 270 | 227, 228, 229, 230, 231, 232, 234, 235, 236, 237, 239, 241, 242, 271 | 245, 247, 248, 249, 251, 252, 253, 254, 255, 257, 258, 259, 260, 272 | 261, 262, 263, 265, 267, 268, 269, 270, 271, 273, 275, 277, 278, 273 | 280, 281, 284, 289, 291, 294, 299, 301, 302, 304, 307, 308, 309, 274 | 310, 311, 316, 318, 322, 323, 324, 327, 329, 330, 333, 335, 336, 275 | 340, 342, 348, 349, 353, 355, 358, 359, 361, 362, 363, 364, 366, 276 | 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 378, 379, 381, 277 | 382, 383, 384, 385, 386, 387, 392, 393, 395, 400, 403, 407, 408, 278 | 412, 413, 416, 417, 418, 420, 422, 423, 424, 425, 426, 427, 428, 279 | 429, 431, 432, 433, 434, 435, 436, 440, 441, 442, 443, 444, 447, 280 | 448, 449, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 464, 281 | 465, 466, 467, 468, 469, 471, 472, 473, 474, 475, 476, 477, 478, 282 | 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 283 | 492, 493, 494, 496, 497, 498] 284 | 285 | agg_rides_all_slots = agg_rides_all_slots[agg_rides_all_slots['pickup_location_id'].isin(estaciones_2022)] 286 | 287 | return agg_rides_all_slots 288 | 289 | 290 | def transform_ts_data_into_features_and_target( 291 | ts_data: pd.DataFrame, 292 | input_seq_len: int, 293 | step_size: int, 294 | output_seq_len: int #Lo que agregué nuevo 295 | ) -> pd.DataFrame: 296 | """ 297 | Slices and transposes data from time-series format into a (features, target) 298 | format that we can use to train Supervised ML models 299 | """ 300 | assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'} 301 | 302 | location_ids = ts_data['pickup_location_id'].unique() 303 | features = pd.DataFrame() 304 | targets = pd.DataFrame() 305 | 306 | for location_id in tqdm(location_ids): 307 | 308 | # keep only ts data for this `location_id` 309 | ts_data_one_location = ts_data.loc[ 310 | ts_data.pickup_location_id == location_id, 311 | ['pickup_hour', 'rides'] 312 | ].sort_values(by=['pickup_hour']) 313 | 314 | # pre-compute cutoff indices to split dataframe rows 315 | indices = get_cutoff_indices_features_and_target( 316 | ts_data_one_location, 317 | input_seq_len, 318 | step_size, 319 | output_seq_len #Lo que agregué nuevo 320 | ) 321 | 322 | # slice and transpose data into numpy arrays for features and targets 323 | n_examples = len(indices) 324 | x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32) 325 | y = np.ndarray(shape=(n_examples, output_seq_len), dtype=np.float32) #Agregué el (output_seq_len) porque quiero esa cantidad de horas 326 | pickup_hours = [] 327 | for i, idx in enumerate(indices): 328 | x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values 329 | y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values 330 | 
pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour']) 331 | 332 | # numpy -> pandas 333 | features_one_location = pd.DataFrame( 334 | x, 335 | columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))] 336 | ) 337 | features_one_location['pickup_hour'] = pickup_hours 338 | features_one_location['pickup_location_id'] = location_id 339 | 340 | # numpy -> pandas 341 | targets_one_location = pd.DataFrame(y, columns=[f'rides_next_{i+1}_hour' for i in range(output_seq_len)]) 342 | 343 | # concatenate results 344 | features = pd.concat([features, features_one_location]) 345 | targets = pd.concat([targets, targets_one_location]) 346 | 347 | features.reset_index(inplace=True, drop=True) 348 | targets.reset_index(inplace=True, drop=True) 349 | 350 | return features, targets #['target_rides_next_hour'] 351 | 352 | 353 | def get_cutoff_indices_features_and_target( 354 | data: pd.DataFrame, 355 | input_seq_len: int, 356 | step_size: int, 357 | output_seq_len: int #Lo que agregué nuevo 358 | ) -> list: 359 | 360 | stop_position = len(data) - 1 361 | 362 | # Start the first sub-sequence at index position 0 363 | subseq_first_idx = 0 364 | subseq_mid_idx = input_seq_len 365 | subseq_last_idx = input_seq_len + output_seq_len #le agrego "output_seq_len" para introducirlo como variable 366 | indices = [] 367 | 368 | while subseq_last_idx <= stop_position: 369 | indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx)) 370 | subseq_first_idx += step_size 371 | subseq_mid_idx += step_size 372 | subseq_last_idx += step_size 373 | 374 | return indices 375 | 376 | #Agrego esto para transformar cualquier dataset a algo comparable con las predicciones 377 | def transform_ts_data_into_dataset_comparable_with_predictions( 378 | ts_data: pd.DataFrame, 379 | input_seq_len: int, 380 | step_size: int, 381 | output_seq_len: int #Lo que agregué nuevo 382 | ) -> pd.DataFrame: 383 | """ 384 | Slices and transposes data from time-series format into a (features, target) 385 | format that we can use to train Supervised ML models 386 | """ 387 | assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'} 388 | 389 | location_ids = ts_data['pickup_location_id'].unique() 390 | #features = pd.DataFrame() 391 | targets = pd.DataFrame() 392 | 393 | for location_id in tqdm(location_ids): 394 | 395 | # keep only ts data for this `location_id` 396 | ts_data_one_location = ts_data.loc[ 397 | ts_data.pickup_location_id == location_id, 398 | ['pickup_hour', 'rides'] 399 | ].sort_values(by=['pickup_hour']) 400 | 401 | # pre-compute cutoff indices to split dataframe rows 402 | indices = get_cutoff_indices_features_and_target( 403 | ts_data_one_location, 404 | input_seq_len, 405 | step_size, 406 | output_seq_len #Lo que agregué nuevo 407 | ) 408 | 409 | # slice and transpose data into numpy arrays for features and targets 410 | n_examples = len(indices) 411 | #x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32) 412 | y = np.ndarray(shape=(n_examples, output_seq_len), dtype=np.float32) #Agregué el (output_seq_len) porque quiero esa cantidad de horas 413 | pickup_hours = [] 414 | for i, idx in enumerate(indices): 415 | #x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values 416 | y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values 417 | pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour']) 418 | 419 | # numpy -> pandas 420 | # features_one_location = pd.DataFrame( 421 | # x, 422 | # columns=[f'rides_previous_{i+1}_hour' for i 
in reversed(range(input_seq_len))] 423 | # ) 424 | # features_one_location['pickup_hour'] = pickup_hours 425 | # features_one_location['pickup_location_id'] = location_id 426 | 427 | # numpy -> pandas 428 | targets_one_location = pd.DataFrame(y, columns=[f'real_rides_next_{i+1}_hour' for i in range(output_seq_len)]) 429 | targets_one_location['pickup_hour'] = pickup_hours 430 | targets_one_location['pickup_location_id'] = location_id 431 | 432 | # concatenate results 433 | #features = pd.concat([features, features_one_location]) 434 | targets = pd.concat([targets, targets_one_location]) 435 | 436 | #features.reset_index(inplace=True, drop=True) 437 | targets.reset_index(inplace=True, drop=True) 438 | 439 | return targets #['target_rides_next_hour'] #features, -------------------------------------------------------------------------------- /src/data_split.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Tuple 3 | 4 | import pandas as pd 5 | 6 | def train_test_split( 7 | df: pd.DataFrame, 8 | cutoff_date: datetime, 9 | targets_columns_names: list, 10 | ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: 11 | """ 12 | """ 13 | train_data = df[df.pickup_hour < cutoff_date].reset_index(drop=True) 14 | test_data = df[df.pickup_hour >= cutoff_date].reset_index(drop=True) 15 | 16 | X_train = train_data.drop(targets_columns_names, axis=1) 17 | y_train = train_data[targets_columns_names] 18 | X_test = test_data.drop(targets_columns_names, axis=1) 19 | y_test = test_data[targets_columns_names] 20 | 21 | return X_train, y_train, X_test, y_test -------------------------------------------------------------------------------- /src/feature_store_api.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import hsfs 3 | import hopsworks 4 | 5 | import src.config as config 6 | 7 | def get_feature_store() -> hsfs.feature_store.FeatureStore: 8 | """Connects to Hopsworks and returns a pointer to the feature store 9 | 10 | Returns: 11 | hsfs.feature_store.FeatureStore: pointer to the feature store 12 | """ 13 | #project = get_hopsworks_project() 14 | project = hopsworks.login( 15 | project=config.HOPSWORKS_PROJECT_NAME, 16 | api_key_value=config.HOPSWORKS_API_KEY 17 | ) 18 | return project.get_feature_store() 19 | 20 | def get_feature_group( 21 | name: str, 22 | version: Optional[int] = 1 23 | ) -> hsfs.feature_group.FeatureGroup: 24 | """Connects to the feature store and returns a pointer to the given 25 | feature group `name` 26 | 27 | Args: 28 | name (str): name of the feature group 29 | version (Optional[int], optional): _description_. Defaults to 1. 
30 | 31 | Returns: 32 | hsfs.feature_group.FeatureGroup: pointer to the feature group 33 | """ 34 | return get_feature_store().get_feature_group( 35 | name=name, 36 | version=version, 37 | ) -------------------------------------------------------------------------------- /src/frontend.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | from datetime import datetime, timedelta 3 | 4 | import requests 5 | import numpy as np 6 | import pandas as pd 7 | import streamlit as st 8 | import geopandas as gpd 9 | import pydeck as pdk 10 | import numpy as np 11 | 12 | from src.inference import ( 13 | load_predictions_from_store, 14 | load_batch_of_features_from_store 15 | ) 16 | from src.paths import DATA_DIR 17 | from src.plot import plot_one_sample 18 | 19 | st.set_page_config(layout="wide") 20 | 21 | # title 22 | # current_date = datetime.strptime('2023-01-05 12:00:00', '%Y-%m-%d %H:%M:%S') 23 | current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H') # - timedelta(hours=1) 24 | current_date_str = str(current_date.strftime('%Y-%m-%d %H:%M')) 25 | st.title(f'Bike demand prediction 🚲') 26 | # Crear el encabezado con HTML 27 | mensaje_personalizado = "Made by Javier Yanzón. Let's connect🙌🏻" 28 | 29 | # Enlaces a tus redes sociales 30 | twitter_link = "https://twitter.com/javieryanzon" 31 | linkedin_link = "https://www.linkedin.com/in/javieryanzon" 32 | st.markdown( 33 | f"{mensaje_personalizado}" 34 | #f"
" 35 | f" • LinkedIn • " 36 | f"Twitter", 37 | unsafe_allow_html=True 38 | ) 39 | st.header(f'{current_date_str} UTC') 40 | 41 | progress_bar = st.sidebar.header('⚙️ Working Progress') 42 | progress_bar = st.sidebar.progress(0) 43 | N_STEPS = 6 44 | 45 | def load_shape_data_file() -> gpd.geodataframe.GeoDataFrame: 46 | """ 47 | Fetches remote file with shape data, that we later use to plot the 48 | different pickup_location_ids on the map of NYC. 49 | 50 | Raises: 51 | Exception: when we cannot connect to the external server where 52 | the file is. 53 | 54 | Returns: 55 | GeoDataFrame: columns -> (OBJECTID Shape_Leng Shape_Area zone LocationID borough geometry) 56 | """ 57 | # download zip file 58 | URL = 'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/transporte-y-obras-publicas/estaciones-bicicletas-publicas/estaciones-de-bicicletas-zip.zip' 59 | response = requests.get(URL) 60 | path = DATA_DIR / f'IE-Estaciones.zip' 61 | if response.status_code == 200: 62 | open(path, "wb").write(response.content) 63 | else: 64 | raise Exception(f'{URL} is not available') 65 | 66 | # unzip file 67 | with zipfile.ZipFile(path, 'r') as zip_ref: 68 | zip_ref.extractall(DATA_DIR / 'IE-Estaciones') 69 | 70 | # load and return shape file 71 | return gpd.read_file(DATA_DIR / 'IE-Estaciones/IE-Estaciones.shp').to_crs('epsg:4326') # 3857 72 | 73 | @st.cache_data 74 | def _load_batch_of_features_from_store(current_date: datetime) -> pd.DataFrame: 75 | """Wrapped version of src.inference.load_batch_of_features_from_store, so 76 | we can add Streamlit caching 77 | 78 | Args: 79 | current_date (datetime): _description_ 80 | 81 | Returns: 82 | pd.DataFrame: n_features + 2 columns: 83 | - `rides_previous_N_hour` 84 | - `rides_previous_{N-1}_hour` 85 | - ... 86 | - `rides_previous_1_hour` 87 | - `pickup_hour` 88 | - `pickup_location_id` 89 | """ 90 | return load_batch_of_features_from_store(current_date) 91 | 92 | #Quité esto a ver si se soluciona error de cache data inicial 93 | @st.cache_data 94 | def _load_predictions_from_store( 95 | from_pickup_hour: datetime, 96 | to_pickup_hour: datetime 97 | ) -> pd.DataFrame: 98 | """ 99 | Wrapped version of src.inference.load_predictions_from_store, so we 100 | can add Streamlit caching 101 | 102 | Args: 103 | from_pickup_hour (datetime): min datetime (rounded hour) for which we want to get 104 | predictions 105 | 106 | to_pickup_hour (datetime): max datetime (rounded hour) for which we want to get 107 | predictions 108 | 109 | Returns: 110 | pd.DataFrame: 2 columns: pickup_location_id, predicted_demand 111 | """ 112 | return load_predictions_from_store(from_pickup_hour, to_pickup_hour) 113 | 114 | with st.spinner(text="Downloading shape file to plot bike stations"): 115 | geo_df = load_shape_data_file() 116 | st.sidebar.write('✅ Shape file was downloaded ') 117 | progress_bar.progress(1/N_STEPS) 118 | 119 | # with st.spinner(text="Fetching model predictions from the store"): 120 | # predictions_df = _load_predictions_from_store( 121 | # from_pickup_hour=current_date - timedelta(hours=3), 122 | # to_pickup_hour=current_date 123 | # ) 124 | # predictions_df = predictions_df.reset_index(drop=True) 125 | # #predictions_df=predictions_df.set_index("pickup_location_id") 126 | # #predictions_df.index.name = None 127 | # st.sidebar.write('✅ Model predictions arrived') 128 | # progress_bar.progress(2/N_STEPS) 129 | 130 | try: 131 | with st.spinner(text="Fetching model predictions from the store"): 132 | predictions_df = _load_predictions_from_store( 133 | 
from_pickup_hour=current_date - timedelta(hours=3), 134 | to_pickup_hour=current_date 135 | ) 136 | predictions_df = predictions_df.reset_index(drop=True) 137 | st.sidebar.write('✅ Model predictions arrived') 138 | progress_bar.progress(2/N_STEPS) 139 | 140 | except Exception as e: 141 | # Captura el error 142 | st.error(f"An error occurred: {str(e)}") 143 | # Intenta nuevamente 144 | st.warning(f"Retrying...") 145 | with st.spinner(text="Fetching model predictions from the store"): 146 | predictions_df = _load_predictions_from_store( 147 | from_pickup_hour=current_date - timedelta(hours=3), 148 | to_pickup_hour=current_date 149 | ) 150 | predictions_df = predictions_df.reset_index(drop=True) 151 | st.sidebar.write('✅ Model predictions arrived') 152 | progress_bar.progress(2/N_STEPS) 153 | 154 | 155 | # Here we are checking the predictions for the current hour have already been computed 156 | # and are available 157 | 158 | # next_hour_predictions_ready = \ 159 | # False if predictions_df[predictions_df.pickup_hour == current_date].empty else True 160 | prev_1_hour_predictions_ready = \ 161 | False if predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=1))].empty else True 162 | prev_2_hour_predictions_ready = \ 163 | False if predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=2))].empty else True 164 | prev_3_hour_predictions_ready = \ 165 | False if predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=3))].empty else True 166 | 167 | # if next_hour_predictions_ready: 168 | # # predictions for the current hour are available 169 | # predictions_df = predictions_df[predictions_df.pickup_hour == current_date] 170 | # st.subheader('The most recent data is not yet available. Using last hour predictions') 171 | 172 | if prev_1_hour_predictions_ready: 173 | # predictions for current hour sometimes makes a mistake, so we use previous hour predictions -1 174 | predictions_df = predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=1))] 175 | current_date = current_date - timedelta(hours=1) 176 | st.subheader('The most recent data is not available. Using last 1 hour predictions') 177 | 178 | elif prev_2_hour_predictions_ready: 179 | # predictions for hour -1 are not available, so we use previous hour predictions -2 180 | predictions_df = predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=2))] 181 | current_date = current_date - timedelta(hours=2) 182 | st.subheader('⚠️ The most recent data is not yet available. Using last 2 hour predictions') 183 | 184 | elif prev_3_hour_predictions_ready: 185 | # predictions for hour -2 are not available, so we use previous hour predictions -3 186 | predictions_df = predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=3))] 187 | current_date = current_date - timedelta(hours=3) 188 | st.subheader('⚠️ The most recent data is not yet available. Using last 3 hour predictions') 189 | else: 190 | raise Exception('Features are not available for the last 4 hours. Is your feature \ 191 | pipeline up and running? 🤔') 192 | 193 | 194 | with st.spinner(text="Preparing data to plot"): 195 | 196 | def pseudocolor(val, minval, maxval, startcolor, stopcolor): 197 | """ 198 | Convert value in the range minval...maxval to a color in the range 199 | startcolor to stopcolor. The colors passed and the the one returned are 200 | composed of a sequence of N component values. 
201 | 202 | Credits to https://stackoverflow.com/a/10907855 203 | """ 204 | f = float(val-minval) / (maxval-minval) 205 | return tuple(f*(b-a)+a for (a, b) in zip(startcolor, stopcolor)) 206 | 207 | df = pd.merge(geo_df, predictions_df, 208 | right_on='pickup_location_id', 209 | left_on='ID', 210 | how='inner') 211 | 212 | BLACK, ORANGE = (0, 0, 0), (255, 128, 0) 213 | selected_columns = [c for c in df.columns if c.startswith('rides_next_')] 214 | df['max_hour'] = df[selected_columns].idxmax(axis=1) 215 | df['color_scaling'] = df[selected_columns].max(axis=1) 216 | max_pred, min_pred = df['color_scaling'].max(), df['color_scaling'].min() 217 | df['fill_color'] = df['color_scaling'].apply(lambda x: pseudocolor(x, min_pred, max_pred, BLACK, ORANGE)) 218 | 219 | progress_bar.progress(3/N_STEPS) 220 | 221 | with st.spinner(text="Generating BsAs Map"): 222 | 223 | INITIAL_VIEW_STATE = pdk.ViewState( 224 | latitude=-34.60280869220721, 225 | longitude=-58.42827362585887, 226 | zoom=11, 227 | max_zoom=16, 228 | pitch=45, 229 | bearing=0 230 | ) 231 | layer = pdk.Layer("ColumnLayer", 232 | data=df, 233 | get_position=["Lon", "Lat"], 234 | get_elevation=['color_scaling'], 235 | auto_highlight=True, 236 | radius=50, 237 | elevation_scale=300, 238 | get_fill_color="fill_color", 239 | get_line_color=[255, 255, 255], 240 | pickable=True, 241 | extruded=True, 242 | coverage=1) 243 | 244 | 245 | tooltip = {"html": "Zone ID: {ID}
<br /> Direction: {DIRECCION} <br />
Max: {color_scaling} rides - {max_hour}"}
246 |
247 | r = pdk.Deck(
248 | layers=[layer],
249 | initial_view_state=INITIAL_VIEW_STATE,
250 | tooltip=tooltip
251 | )
252 |
253 | st.pydeck_chart(r)
254 | progress_bar.progress(4/N_STEPS)
255 |
256 | with st.spinner(text="Fetching batch of features used in the last run"):
257 | features_df = _load_batch_of_features_from_store(current_date)
258 | features_df=features_df.reset_index(drop=True)
259 | #features_df=features_df.set_index("pickup_location_id")
260 | #features_df.index.name = None
261 | st.sidebar.write('✅ Inference features fetched from the store')
262 | progress_bar.progress(5/N_STEPS)
263 |
264 | with st.spinner(text="Plotting time-series data"):
265 |
266 | predictions_df = np.clip(predictions_df[selected_columns], 0, None) # clip values at zero so that no prediction is negative
267 |
268 |
269 | predictions_df['max'] = predictions_df[selected_columns].max(axis=1)
270 | predictions_df = predictions_df.reset_index(drop=True)
271 | sorted_indices = predictions_df['max'].sort_values(ascending=False).index
272 | predictions_max = predictions_df.copy()
273 | predictions_max['max_hour'] = predictions_max[selected_columns].idxmax(axis=1)
274 | predictions_df = predictions_df.drop('max', axis=1)
275 |
276 | # select the top 10 rows by predicted demand
277 | top_10_indices = sorted_indices[:10]
278 | #st.sidebar.write(top_10_indices)
279 | #st.sidebar.write(len(predictions_df))
280 |
281 | # add a download button in the top-right corner
282 | df_to_download = df.copy().drop(['QUEDA_ABIE','EMPLAZAMIE','ANCLAJES','max_hour','color_scaling','fill_color'], axis=1) #pd.merge(features_df, predictions_df, on=['pickup_hour', 'pickup_location_id'], how='left')
283 | button = st.download_button(
284 | label="Download predictions CSV",
285 | data=df_to_download.to_csv(index=False).encode('utf-8'),
286 | file_name='predictions.csv',
287 | key='download_button'
288 | )
289 |
290 | st.markdown("
Note: Do not use this data for operational purposes. Because the source data is only updated monthly, rides for the most recent hours are not available; a ride simulation fills that gap and is treated as actual demand when generating the forecasts.
", unsafe_allow_html=True) 291 | 292 | # plot each time-series with the prediction 293 | for row_id in top_10_indices: 294 | #if row_id < len(predictions_df): 295 | # title 296 | location_id = features_df['pickup_location_id'].iloc[row_id] 297 | location_name = df[df['pickup_location_id'] == location_id]['DIRECCION'].iloc[0] 298 | 299 | 300 | # location_id = df['pickup_location_id'].iloc[row_id] 301 | # location_name = df['DIRECCION'].iloc[row_id] 302 | #location_name = df['DIRECCION'].iloc[df['pickup_location_id'] == location_id] 303 | #location_name = df['DIRECCION'].iloc[row_id] 304 | #st.header(f'Direction: {location_id} - {location_name}') 305 | 306 | st.header(f'Direction: {location_name} [Zone ID: {location_id}]') 307 | 308 | # plot predictions 309 | prediction = predictions_max['max'].iloc[row_id] #df['color_scaling'].iloc[row_id] 310 | max_hour_prediction = predictions_max['max_hour'].iloc[row_id] 311 | max_hour_prediction_int = int(max_hour_prediction.replace('rides_next_', '').replace('_hour', '')) 312 | max_hour_prediction_str =str(pd.to_datetime(current_date + timedelta(hours=max_hour_prediction_int-1), utc=True).strftime('%Y-%m-%d %H:%M'))+ " UTC " + " - " + str(pd.to_datetime(current_date + timedelta(hours=max_hour_prediction_int), utc=True).strftime('%Y-%m-%d %H:%M') + " UTC") 313 | st.metric(label="Max rides predicted in 36 hours", value=int(prediction)) 314 | st.metric(label="Approximate Hour of max prediction", value=max_hour_prediction_str) 315 | 316 | fig = plot_one_sample( 317 | example_id=row_id, 318 | features=features_df, 319 | targets=predictions_df, 320 | predictions=predictions_df 321 | #directions=geo_df[['ID', 'DIRECCION']] 322 | ) 323 | st.plotly_chart(fig, theme="streamlit", use_container_width=True, width=1000) 324 | 325 | progress_bar.progress(6/N_STEPS) -------------------------------------------------------------------------------- /src/frontend_monitoring.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import streamlit as st 6 | from sklearn.metrics import mean_absolute_error 7 | import plotly.express as px 8 | 9 | from src.monitoring import load_predictions_and_actual_values_from_store 10 | from src.data import transform_ts_data_into_dataset_comparable_with_predictions 11 | 12 | st.set_page_config(layout="wide") 13 | 14 | # title 15 | current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H') 16 | st.title(f'Monitoring dashboard 🔎') 17 | 18 | progress_bar = st.sidebar.header('⚙️ Working Progress') 19 | progress_bar = st.sidebar.progress(0) 20 | N_STEPS = 3 21 | 22 | 23 | @st.cache_data 24 | def _load_predictions_and_actuals_from_store( 25 | from_date: datetime, 26 | to_date: datetime 27 | ) -> pd.DataFrame: 28 | """Wrapped version of src.monitoring.load_predictions_and_actual_values_from_store, so 29 | we can add Streamlit caching 30 | 31 | Args: 32 | from_date (datetime): min datetime for which we want predictions and 33 | actual values 34 | 35 | to_date (datetime): max datetime for which we want predictions and 36 | actual values 37 | 38 | Returns: 39 | pd.DataFrame: 4 columns 40 | - `pickup_location_id` 41 | - `predicted_demand` 42 | - `pickup_hour` 43 | - `rides` 44 | """ 45 | return load_predictions_and_actual_values_from_store(from_date, to_date) 46 | 47 | # with st.spinner(text="Fetching model predictions and actual values from the store"): 48 | 49 | # ts_data_1, ts_data_2 = 
_load_predictions_and_actuals_from_store( 50 | # from_date=current_date - timedelta(days=14), 51 | # to_date=current_date 52 | # ) 53 | # real_rides = transform_ts_data_into_dataset_comparable_with_predictions( 54 | # ts_data_2, 55 | # input_seq_len=0, # one month 56 | # step_size=24, 57 | # output_seq_len=36 58 | # ) 59 | # st.sidebar.write('✅ Model predictions and actual values arrived') 60 | # progress_bar.progress(1/N_STEPS) 61 | 62 | 63 | try: 64 | with st.spinner(text="Fetching model predictions and actual values from the store"): 65 | ts_data_1, ts_data_2 = _load_predictions_and_actuals_from_store( 66 | from_date=current_date - timedelta(days=14), 67 | to_date=current_date 68 | ) 69 | real_rides = transform_ts_data_into_dataset_comparable_with_predictions( 70 | ts_data_2, 71 | input_seq_len=0, # one month 72 | step_size=24, 73 | output_seq_len=36 74 | ) 75 | st.sidebar.write('✅ Model predictions and actual values arrived') 76 | progress_bar.progress(1/N_STEPS) 77 | 78 | except Exception as e: 79 | # Captura el error 80 | st.error(f"An error occurred: {str(e)}") 81 | # Intenta nuevamente 82 | st.warning(f"Retrying...") 83 | with st.spinner(text="Fetching model predictions and actual values from the store"): 84 | ts_data_1, ts_data_2 = _load_predictions_and_actuals_from_store( 85 | from_date=current_date - timedelta(days=14), 86 | to_date=current_date 87 | ) 88 | real_rides = transform_ts_data_into_dataset_comparable_with_predictions( 89 | ts_data_2, 90 | input_seq_len=0, # one month 91 | step_size=24, 92 | output_seq_len=36 93 | ) 94 | st.sidebar.write('✅ Model predictions and actual values arrived') 95 | progress_bar.progress(1/N_STEPS) 96 | 97 | 98 | with st.spinner(text="Plotting aggregate MAE hour-by-hour"): 99 | 100 | monitoring_df = pd.merge(ts_data_1, real_rides, on=['pickup_hour', 'pickup_location_id'], how='inner') 101 | st.header('Mean Absolute Error (MAE) hour-by-hour') 102 | selected_columns_pred = [c for c in monitoring_df.columns if c.startswith('rides_next_')] ##### 103 | selected_columns_real = [c for c in monitoring_df.columns if c.startswith('real_rides_next_')] 104 | 105 | # MAE per pickup_hour 106 | # https://stackoverflow.com/a/47914634 107 | mae_per_hour = ( 108 | monitoring_df 109 | .groupby('pickup_hour') 110 | .apply(lambda g: mean_absolute_error(g[selected_columns_real], g[selected_columns_pred])) #### 111 | .reset_index() 112 | .rename(columns={0: 'mae'}) 113 | .sort_values(by='pickup_hour') 114 | ) 115 | 116 | fig = px.bar( 117 | mae_per_hour, 118 | x='pickup_hour', y='mae', 119 | template='plotly_dark', 120 | ) 121 | st.plotly_chart(fig, theme="streamlit", use_container_width=True, width=1000) 122 | 123 | progress_bar.progress(2/N_STEPS) 124 | 125 | 126 | with st.spinner(text="Plotting MAE hour-by-hour for top locations"): 127 | 128 | st.header('Mean Absolute Error (MAE) per location and hour') 129 | 130 | top_locations_by_demand = ( 131 | monitoring_df 132 | .groupby('pickup_location_id')[selected_columns_real].sum() 133 | .sum(axis=1) 134 | .sort_values(ascending=False) 135 | .reset_index() 136 | .head(10)['pickup_location_id'] 137 | ) 138 | 139 | for location_id in top_locations_by_demand: 140 | 141 | mae_per_hour = ( 142 | monitoring_df[monitoring_df.pickup_location_id == location_id] 143 | .groupby('pickup_hour') 144 | .apply(lambda g: mean_absolute_error(g[selected_columns_real], g[selected_columns_pred])) 145 | .reset_index() 146 | .rename(columns={0: 'mae'}) 147 | .sort_values(by='pickup_hour') 148 | ) 149 | 150 | fig = px.bar( 151 | mae_per_hour, 
152 | x='pickup_hour', y='mae', 153 | template='plotly_dark', 154 | ) 155 | st.subheader(f'{location_id=}') 156 | st.plotly_chart(fig, theme="streamlit", use_container_width=True, width=1000) 157 | 158 | progress_bar.progress(3/N_STEPS) -------------------------------------------------------------------------------- /src/inference.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import hopsworks 4 | #from hsfs.feature_store import FeatureStore 5 | import pandas as pd 6 | import numpy as np 7 | 8 | import src.config as config 9 | from src.feature_store_api import get_feature_store 10 | #, get_or_create_feature_view 11 | #from src.config import FEATURE_VIEW_METADATA 12 | 13 | def get_hopsworks_project() -> hopsworks.project.Project: 14 | 15 | return hopsworks.login( 16 | project=config.HOPSWORKS_PROJECT_NAME, 17 | api_key_value=config.HOPSWORKS_API_KEY 18 | ) 19 | 20 | # def get_feature_store() -> FeatureStore: 21 | 22 | # project = get_hopsworks_project() 23 | # return project.get_feature_store() 24 | 25 | 26 | def get_model_predictions(model, features: pd.DataFrame) -> pd.DataFrame: 27 | """""" 28 | # past_rides_columns = [c for c in features.columns if c.startswith('rides_')] 29 | predictions = model.predict(features) 30 | predictions = predictions.round(0) 31 | 32 | results = pd.DataFrame(predictions, 33 | columns=[f'rides_next_{i+1}_hour' for i in range(36)] 34 | ) #son 36 horas de prediccion 35 | results['pickup_location_id'] = features['pickup_location_id'].values 36 | #results['predicted_demand'] = predictions.round(0) #esto estaba antes 37 | 38 | return results 39 | 40 | 41 | def load_batch_of_features_from_store( 42 | current_date: datetime, 43 | ) -> pd.DataFrame: 44 | """Fetches the batch of features used by the ML system at `current_date` 45 | 46 | Args: 47 | current_date (datetime): datetime of the prediction for which we want 48 | to get the batch of features 49 | 50 | Returns: 51 | pd.DataFrame: 3 columns: 52 | - `pickup_hour` 53 | - `rides` 54 | - `pickup_location_id` 55 | """ 56 | n_features = config.N_FEATURES 57 | 58 | feature_store = get_feature_store() 59 | 60 | # read time-series data from the feature store 61 | fetch_data_to = pd.to_datetime(current_date - timedelta(hours=1), utc=True) 62 | fetch_data_from = pd.to_datetime(current_date - timedelta(days=28), utc=True) 63 | print(f'Fetching data from {fetch_data_from} to {fetch_data_to}') 64 | feature_view = feature_store.get_feature_view( 65 | name=config.FEATURE_VIEW_NAME, 66 | version=config.FEATURE_VIEW_VERSION 67 | ) 68 | ts_data = feature_view.get_batch_data( 69 | start_time=pd.to_datetime(fetch_data_from - timedelta(days=1), utc=True), 70 | end_time=pd.to_datetime(fetch_data_to + timedelta(days=1), utc=True) 71 | ) 72 | 73 | # Convert to UTC aware datetime 74 | ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True) 75 | 76 | # filter data to the time period we are interested in 77 | ts_data = ts_data[ts_data.pickup_hour.between(fetch_data_from, fetch_data_to)] 78 | 79 | # validate we are not missing data in the feature store 80 | location_ids = ts_data['pickup_location_id'].unique() 81 | assert len(ts_data) == n_features*len(location_ids), \ 82 | "Time-series data is not complete. Make sure your feature pipeline is up and runnning." 
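# --- Editor's note: hedged illustration, not part of the original src/inference.py ---
# The assert above checks that the feature view returned a complete grid of
# hourly rows: exactly config.N_FEATURES rows per station. With
# N_FEATURES = 24 * 28 = 672 and, hypothetically, 300 active stations:
#
#     expected_rows = 672 * 300   # 201,600 rows in `ts_data`
#
# Anything less means some (pickup_hour, pickup_location_id) slots are missing
# from the feature group, i.e. the feature pipeline has not kept up.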
83 | 84 | # sort data by location and time 85 | ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True) 86 | # print(f'{ts_data=}') 87 | 88 | # transpose time-series data as a feature vector, for each `pickup_location_id` 89 | x = np.ndarray(shape=(len(location_ids), n_features), dtype=np.float32) 90 | for i, location_id in enumerate(location_ids): 91 | ts_data_i = ts_data.loc[ts_data.pickup_location_id == location_id, :] 92 | ts_data_i = ts_data_i.sort_values(by=['pickup_hour']) 93 | x[i, :] = ts_data_i['rides'].values 94 | 95 | # numpy arrays to Pandas dataframes 96 | features = pd.DataFrame( 97 | x, 98 | columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))] 99 | ) 100 | 101 | features['pickup_hour'] = pd.to_datetime(current_date, utc=True) 102 | features['pickup_location_id'] = location_ids 103 | features.sort_values(by=['pickup_location_id'], inplace=True) 104 | 105 | return features 106 | 107 | 108 | def load_model_from_registry(): 109 | 110 | import joblib 111 | from pathlib import Path 112 | 113 | project = get_hopsworks_project() 114 | model_registry = project.get_model_registry() 115 | 116 | model = model_registry.get_model( 117 | name=config.MODEL_NAME, 118 | version=config.MODEL_VERSION, 119 | ) 120 | 121 | model_dir = model.download() 122 | model = joblib.load(Path(model_dir) / 'model.pkl') 123 | 124 | return model 125 | 126 | def load_predictions_from_store( 127 | from_pickup_hour: datetime, 128 | to_pickup_hour: datetime) -> pd.DataFrame: 129 | """ 130 | Connects to the feature store and retrieves model predictions for all 131 | `pickup_location_id`s and for the time period from `from_pickup_hour` 132 | to `to_pickup_hour` 133 | 134 | Args: 135 | from_pickup_hour (datetime): min datetime (rounded hour) for which we want to get 136 | predictions 137 | 138 | to_pickup_hour (datetime): max datetime (rounded hour) for which we want to get 139 | predictions 140 | 141 | Returns: 142 | pd.DataFrame: 3 columns: 143 | - `pickup_location_id` 144 | - `predicted_demand` 145 | - `pickup_hour` 146 | """ 147 | from src.feature_store_api import get_feature_store 148 | import src.config as config 149 | 150 | feature_store = get_feature_store() 151 | 152 | predictiong_fg = feature_store.get_feature_group( 153 | name=config.FEATURE_GROUP_MODEL_PREDICTIONS, 154 | version=1, 155 | ) 156 | 157 | try: 158 | # create feature view as it does not exist yet 159 | feature_store.create_feature_view( 160 | name=config.FEATURE_VIEW_MODEL_PREDICTIONS, 161 | version=1, 162 | query=predictiong_fg.select_all() 163 | ) 164 | except: 165 | print(f'Feature view {config.FEATURE_VIEW_MODEL_PREDICTIONS} \ 166 | already existed. 
Skipped creation.') 167 | 168 | predictions_fv = feature_store.get_feature_view( 169 | name=config.FEATURE_VIEW_MODEL_PREDICTIONS, 170 | version=1 171 | ) 172 | 173 | print(f'Fetching predictions for `pickup_hours` between {from_pickup_hour} and {to_pickup_hour}') 174 | predictions = predictions_fv.get_batch_data( 175 | start_time=from_pickup_hour - timedelta(days=1), 176 | end_time=to_pickup_hour + timedelta(days=1) 177 | ) 178 | 179 | # make sure datetimes are UTC aware 180 | predictions['pickup_hour'] = pd.to_datetime(predictions['pickup_hour'], utc=True) 181 | from_pickup_hour = pd.to_datetime(from_pickup_hour, utc=True) 182 | to_pickup_hour = pd.to_datetime(to_pickup_hour, utc=True) 183 | 184 | predictions = predictions[predictions.pickup_hour.between( 185 | from_pickup_hour, to_pickup_hour)] 186 | 187 | # sort by `pick_up_hour` and `pickup_location_id` 188 | predictions.sort_values(by=['pickup_hour', 'pickup_location_id'], inplace=True) 189 | 190 | return predictions -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | def get_logger() -> logging.Logger: 4 | """Returns a logger 5 | 6 | Returns: 7 | logging.Logger: _description_ 8 | """ 9 | logger = logging.getLogger('dataflow') 10 | logger.setLevel(logging.INFO) 11 | return logger 12 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import FunctionTransformer 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.pipeline import make_pipeline, Pipeline 5 | from sklearn.multioutput import MultiOutputRegressor 6 | from src.paths import RAW_DATA_DIR 7 | import hopsworks 8 | import src.config as config 9 | from src.feature_store_api import get_feature_store 10 | 11 | import lightgbm as lgb 12 | 13 | def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame: 14 | """ 15 | Adds one column with the average rides from 16 | - 7 days ago 17 | - 14 days ago 18 | - 21 days ago 19 | - 28 days ago 20 | """ 21 | X['average_rides_last_4_weeks'] = 0.25*( 22 | X[f'rides_previous_{7*24}_hour'] + \ 23 | X[f'rides_previous_{2*7*24}_hour'] + \ 24 | X[f'rides_previous_{3*7*24}_hour'] + \ 25 | X[f'rides_previous_{4*7*24}_hour'] 26 | ) 27 | return X 28 | 29 | def latitude_and_longitude_anterior(X: pd.DataFrame) -> pd.DataFrame: #version anterior de la funcion, la modifique por la de abajo para que solo sea consulta a feature store 30 | """ 31 | Adds two columns with the latitude and longitude from pickup_location_id 32 | 33 | """ 34 | raw_data_rides = pd.read_parquet(RAW_DATA_DIR / 'rides_2022.parquet') 35 | 36 | #Nos quedamos sólo con las columnas que nos interesan y las renombramos 37 | raw_data_rides = raw_data_rides[['id_estacion_origen', 'lat_estacion_origen', 'long_estacion_origen']] 38 | raw_data_rides['id_estacion_origen'] = raw_data_rides['id_estacion_origen'].str.replace('BAEcobici', '').astype(int) 39 | raw_data_rides = raw_data_rides.drop_duplicates().reset_index(drop=True) 40 | raw_data_rides.rename(columns={ 41 | 'id_estacion_origen': 'pickup_location_id', 42 | 'lat_estacion_origen': 'latitude', 43 | 'long_estacion_origen': 'longitude' 44 | }, inplace=True) 45 | 46 | # Combinar la información de latitud y longitud en X 47 | X = X.merge(raw_data_rides, on='pickup_location_id', 
how='left') 48 | 49 | # Eliminar la columna 'pickup_location_id' 50 | #X.drop('pickup_location_id', axis=1, inplace=True) 51 | 52 | return X 53 | 54 | def latitude_and_longitude(X: pd.DataFrame) -> pd.DataFrame: 55 | """ 56 | Adds two columns with the latitude and longitude from pickup_location_id 57 | 58 | """ 59 | 60 | #primero me conecto al feature store para obtenerla y luego la uno al dataset 61 | 62 | feature_store = get_feature_store() 63 | feature_view = feature_store.get_feature_view( 64 | name=config.FEATURE_VIEW_LAT_LONG 65 | ) 66 | raw_data_rides= feature_view.get_batch_data() 67 | 68 | # Combinar la información de latitud y longitud en X 69 | X = X.merge(raw_data_rides, on='pickup_location_id', how='left') 70 | 71 | # Eliminar la columna 'pickup_location_id' 72 | #X.drop('pickup_location_id', axis=1, inplace=True) 73 | 74 | return X 75 | 76 | 77 | 78 | class TemporalFeaturesEngineer(BaseEstimator, TransformerMixin): 79 | """ 80 | Scikit-learn data transformation that adds 2 columns 81 | - hour 82 | - day_of_week 83 | and removes the `pickup_hour` datetime column. 84 | """ 85 | def fit(self, X, y=None): 86 | return self 87 | 88 | def transform(self, X, y=None): 89 | 90 | X_ = X.copy() 91 | 92 | # Generate numeric columns from datetime 93 | X_["hour"] = X_['pickup_hour'].dt.hour 94 | X_["day_of_week"] = X_['pickup_hour'].dt.dayofweek 95 | 96 | return X_.drop(columns=['pickup_hour']) 97 | 98 | def get_pipeline(**hyperparams) -> Pipeline: 99 | 100 | # sklearn transform 101 | add_feature_average_rides_last_4_weeks = FunctionTransformer( 102 | average_rides_last_4_weeks, validate=False) 103 | 104 | # sklearn transform 105 | add_feature_latitude_and_longitude = FunctionTransformer( 106 | latitude_and_longitude, validate=False) 107 | 108 | # sklearn transform 109 | add_temporal_features = TemporalFeaturesEngineer() 110 | 111 | # sklearn pipeline 112 | return make_pipeline( 113 | add_feature_average_rides_last_4_weeks, 114 | add_feature_latitude_and_longitude, 115 | add_temporal_features, 116 | MultiOutputRegressor(lgb.LGBMRegressor(**hyperparams, force_col_wise=True)) 117 | ) -------------------------------------------------------------------------------- /src/model_registry_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import pickle 4 | 5 | import comet_ml 6 | from comet_ml import API 7 | from dotenv import load_dotenv 8 | import hopsworks 9 | from sklearn.pipeline import Pipeline 10 | import pandas as pd 11 | import joblib 12 | 13 | import src.config as config 14 | from src.paths import MODELS_DIR, PARENT_DIR 15 | from src.logger import get_logger 16 | 17 | logger = get_logger() 18 | 19 | # load variables from .env file as environment variables 20 | load_dotenv(PARENT_DIR / '.env') 21 | 22 | COMET_ML_API_KEY = os.environ["COMET_ML_API_KEY"] 23 | COMET_ML_WORKSPACE = os.environ["COMET_ML_WORKSPACE"] 24 | COMET_ML_PROJECT_NAME = os.environ['COMET_ML_PROJECT_NAME'] 25 | 26 | 27 | def get_model_registry() -> None: 28 | """Connects to Hopsworks and returns a pointer to the feature store 29 | 30 | Returns: 31 | hsfs.feature_store.FeatureStore: pointer to the feature store 32 | """ 33 | project = hopsworks.login( 34 | project=config.HOPSWORKS_PROJECT_NAME, 35 | api_key_value=config.HOPSWORKS_API_KEY 36 | ) 37 | return project.get_model_registry() 38 | 39 | def push_model_to_registry( 40 | model: Pipeline, 41 | model_name: str, 42 | ) -> int: 43 | """""" 44 | # save the model to disk 45 | model_file = 
MODELS_DIR / 'model.pkl' 46 | with open(model_file, "wb") as f: 47 | pickle.dump(model, f) 48 | 49 | # Get the stale experiment from the global context to grab the API key and experiment ID. 50 | stale_experiment = comet_ml.get_global_experiment() 51 | 52 | # Resume the expriment using its API key and experiment ID. 53 | experiment = comet_ml.ExistingExperiment( 54 | api_key=stale_experiment.api_key, experiment_key=stale_experiment.id 55 | ) 56 | 57 | # log model as an experiment artifact 58 | logger.info(f"Starting logging model to Comet ML") 59 | experiment.log_model(model_name, str(model_file)) 60 | logger.info(f"Finished logging model {model_name}") 61 | 62 | # push model to the registry 63 | logger.info('Pushing model to the registry as "Production"') 64 | experiment.register_model(model_name, status='Production') 65 | 66 | # end the experiment 67 | experiment.end() 68 | 69 | # get model version of the latest production model 70 | return get_latest_model_version(model_name, status='Production') 71 | 72 | 73 | def get_latest_model_version(model_name: str, status: str) -> str: 74 | """ 75 | Returns the latest model version from the registry with the given `status` 76 | """ 77 | # find all model versions from the given `model_name` registry and `status` 78 | api = API(COMET_ML_API_KEY) 79 | model_details = api.get_registry_model_details(COMET_ML_WORKSPACE, model_name)['versions'] 80 | model_versions = [md['version'] for md in model_details if md['status'] == status] 81 | 82 | # return the latest model version 83 | return max(model_versions) 84 | 85 | 86 | def get_latest_model_from_registry(model_name: str, status: str) -> Pipeline: 87 | """Returns the latest model from the registry""" 88 | 89 | # get model version to download 90 | model_version = get_latest_model_version(model_name, status) 91 | 92 | # download model from registry 93 | api = API(COMET_ML_API_KEY) 94 | api.download_registry_model( 95 | COMET_ML_WORKSPACE, 96 | registry_name=model_name, 97 | version=model_version, 98 | output_path=MODELS_DIR, 99 | expand=True 100 | ) 101 | 102 | # load model from local file to memory 103 | with open(MODELS_DIR / 'model.pkl', "rb") as f: 104 | model = pickle.load(f) 105 | 106 | return model -------------------------------------------------------------------------------- /src/monitoring.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import src.config as config 8 | from src.feature_store_api import get_feature_store, get_feature_group 9 | from src.data import get_cutoff_indices_features_and_target 10 | 11 | from datetime import datetime, timedelta 12 | import pandas as pd 13 | from src.data import transform_raw_data_into_ts_data 14 | from src.data import transform_ts_data_into_features_and_target 15 | from src.data import transform_ts_data_into_dataset_comparable_with_predictions 16 | 17 | 18 | def load_predictions_and_actual_values_from_store( 19 | from_date: datetime, 20 | to_date: datetime, 21 | ) -> pd.DataFrame: 22 | """Fetches model predictions and actuals values from 23 | `from_date` to `to_date` from the Feature Store and returns a dataframe 24 | 25 | Args: 26 | from_date (datetime): min datetime for which we want predictions and 27 | actual values 28 | 29 | to_date (datetime): max datetime for which we want predictions and 30 | actual values 31 | 32 | Returns: 33 | pd.DataFrame: 4 columns 34 | - `pickup_location_id` 35 
| - `predicted_demand` 36 | - `pickup_hour` 37 | - `rides` 38 | """ 39 | current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H') 40 | 41 | fetch_data_from = pd.Timestamp('2023-01-01 0:00:00+0000', tz='UTC') #quizas cambiarlo y que sea solo el año en curso 42 | fetch_data_to = pd.to_datetime(current_date - timedelta(hours=1), utc=True) 43 | 44 | feature_store_1 = get_feature_store() 45 | predictions_fg = feature_store_1.get_feature_view(name=config.FEATURE_VIEW_MODEL_PREDICTIONS) 46 | ts_data_1 = predictions_fg.get_batch_data( 47 | start_time=pd.to_datetime(fetch_data_from, utc=True), 48 | end_time=pd.to_datetime(fetch_data_to, utc=True) 49 | ) 50 | 51 | feature_store_2 = get_feature_store() 52 | actuals_fg = feature_store_2.get_feature_view(name=config.FEATURE_VIEW_NAME) 53 | ts_data_2 = actuals_fg.get_batch_data( 54 | start_time=pd.to_datetime(fetch_data_from, utc=True), 55 | end_time=pd.to_datetime(fetch_data_to, utc=True) 56 | ) 57 | 58 | 59 | 60 | # # 2 feature groups we need to merge 61 | # predictions_fg = get_feature_group(name=config.FEATURE_GROUP_MODEL_PREDICTIONS) 62 | # actuals_fg = get_feature_group(name=config.FEATURE_GROUP_NAME) 63 | 64 | # # query to join the 2 features groups by `pickup_hour` and `pickup_location_id` 65 | # query = predictions_fg.select_all() \ 66 | # .join(actuals_fg.select_all(), on=['pickup_hour', 'pickup_location_id']) \ 67 | # .filter(predictions_fg.pickup_hour >= from_date) \ 68 | # .filter(predictions_fg.pickup_hour <= to_date) 69 | 70 | # # create the feature view `config.FEATURE_VIEW_MONITORING` if it does not 71 | # # exist yet 72 | # feature_store = get_feature_store() 73 | # try: 74 | # # create feature view as it does not exist yet 75 | # feature_store.create_feature_view( 76 | # name=config.FEATURE_VIEW_MONITORING, 77 | # version=1, 78 | # query=query 79 | # ) 80 | # except: 81 | # print('Feature view already existed. 
Skip creation.') 82 | 83 | # # feature view 84 | # monitoring_fv = feature_store.get_feature_view( 85 | # name=config.FEATURE_VIEW_MONITORING, 86 | # version=1 87 | # ) 88 | 89 | # # fetch data from the feature view 90 | # # fetch predicted and actual values for the last 30 days 91 | # monitoring_df = monitoring_fv.get_batch_data( 92 | # start_time=pd.to_datetime(from_date - timedelta(days=7), utc=True), 93 | # end_time=pd.to_datetime(to_date + timedelta(days=7), utc=True) 94 | # ) 95 | # monitoring_df = monitoring_df[monitoring_df.pickup_hour.between(from_date, to_date)] 96 | 97 | return ts_data_1, ts_data_2 98 | 99 | 100 | # def transform_ts_data_hopsworks_into_df_comparable_with_predictions( 101 | # ts_data: pd.DataFrame, 102 | # input_seq_len: int, 103 | # step_size: int, 104 | # output_seq_len: int # newly added parameter 105 | # ) -> pd.DataFrame: 106 | # """ 107 | # Slices and transposes data from time-series format into a (features, target) 108 | # format that we can use to train Supervised ML models 109 | # """ 110 | # assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'} 111 | 112 | # location_ids = ts_data['pickup_location_id'].unique() 113 | # #features = pd.DataFrame() 114 | # real_rides = pd.DataFrame() 115 | 116 | # for location_id in tqdm(location_ids): 117 | 118 | # # keep only ts data for this `location_id` 119 | # ts_data_one_location = ts_data.loc[ 120 | # ts_data.pickup_location_id == location_id, 121 | # ['pickup_hour', 'rides'] 122 | # ].sort_values(by=['pickup_hour']) 123 | 124 | # # pre-compute cutoff indices to split dataframe rows 125 | # indices = get_cutoff_indices_features_and_target( 126 | # ts_data_one_location, 127 | # input_seq_len, 128 | # step_size, 129 | # output_seq_len # newly added argument 130 | # ) 131 | 132 | # # slice and transpose data into numpy arrays for features and targets 133 | # n_examples = len(indices) 134 | # #x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32) 135 | # y = np.ndarray(shape=(n_examples, output_seq_len), dtype=np.float32) # use output_seq_len because we want that many hours 136 | # pickup_hours = [] 137 | # for i, idx in enumerate(indices): 138 | # #x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values 139 | # y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values 140 | # pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour']) 141 | 142 | # # numpy -> pandas 143 | # # features_one_location = pd.DataFrame( 144 | # # x, 145 | # # columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))] 146 | # # ) 147 | # # features_one_location['pickup_hour'] = pickup_hours 148 | # # features_one_location['pickup_location_id'] = location_id 149 | 150 | # # numpy -> pandas 151 | # real_rides_one_location = pd.DataFrame(y, columns=[f'real_rides_next_{i+1}_hour' for i in range(output_seq_len)]) 152 | # real_rides_one_location['pickup_hour'] = pickup_hours 153 | # real_rides_one_location['pickup_location_id'] = location_id 154 | 155 | # # concatenate results 156 | # #features = pd.concat([features, features_one_location]) 157 | # real_rides = pd.concat([real_rides, real_rides_one_location]) 158 | 159 | # #features.reset_index(inplace=True, drop=True) 160 | # real_rides.reset_index(inplace=True, drop=True) 161 | 162 | # return real_rides #,features #['target_rides_next_hour'] -------------------------------------------------------------------------------- /src/paths.py:
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | PARENT_DIR = Path(__file__).parent.resolve().parent 5 | DATA_DIR = PARENT_DIR / 'data' 6 | RAW_DATA_DIR = PARENT_DIR / 'data' / 'raw' 7 | TRANSFORMED_DATA_DIR = PARENT_DIR / 'data' / 'transformed' 8 | 9 | MODELS_DIR = PARENT_DIR / 'models' 10 | 11 | if not Path(DATA_DIR).exists(): 12 | os.mkdir(DATA_DIR) 13 | 14 | if not Path(RAW_DATA_DIR).exists(): 15 | os.mkdir(RAW_DATA_DIR) 16 | 17 | if not Path(TRANSFORMED_DATA_DIR).exists(): 18 | os.mkdir(TRANSFORMED_DATA_DIR) 19 | 20 | if not Path(MODELS_DIR).exists(): 21 | os.mkdir(MODELS_DIR) -------------------------------------------------------------------------------- /src/plot.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | from datetime import timedelta 3 | 4 | import pandas as pd 5 | import plotly.express as px 6 | import plotly.graph_objects as go 7 | 8 | # def plot_one_sample( 9 | # example_id: int, 10 | # features: pd.DataFrame, 11 | # targets: Optional[pd.DataFrame] = None, #pd.Series, 12 | # predictions: Optional[pd.DataFrame] = None, 13 | # directions: Optional[pd.DataFrame] = None 14 | # ): 15 | # """""" 16 | # if directions is not None: 17 | # features_ = pd.merge(features, directions, left_on='pickup_location_id', right_on='ID', how='left') 18 | # features_[features_.index == example_id].iloc[-1] # take the last row of the dataset in case the feature store query returns more than one hour 19 | # #features_ = features_.iloc[example_id] # changed because in other cases it was not filtering 20 | # else: 21 | # #features_ = features.iloc[example_id] # changed because in other cases it was not filtering 22 | # features_[features_.index == example_id].iloc[-1] # take the last row of the dataset in case the feature store query returns more than one hour 23 | 24 | 25 | # if targets is not None: 26 | # target_ = targets[targets.index == example_id].iloc[-1] # take the last row of the dataset in case the feature store query returns more than one hour 27 | # #target_ = targets.iloc[example_id] # changed because in other cases it was not filtering 28 | # ts_columns_targets = [c for c in targets.columns if c.startswith('rides_next_')] 29 | # ts_values_targets = [target_[c] for c in ts_columns_targets] 30 | # ts_dates_targets = pd.date_range( 31 | # features_['pickup_hour'].iloc[0], 32 | # features_['pickup_hour'].iloc[0] + timedelta(hours=len(ts_columns_targets)-1), 33 | # freq='H' 34 | # ) 35 | # else: 36 | # target_ = None 37 | 38 | # #features_df[features_df.index == 327].iloc[-1] # take the last row of the dataset in case the feature store query returns more than one hour 39 | # # features_ = features[features['pickup_location_id'] == example_id] 40 | # # target_ = targets[targets['pickup_location_id'] == example_id] 41 | 42 | 43 | 44 | # ts_columns_features = [c for c in features.columns if c.startswith('rides_previous_')] 45 | 46 | # ts_values_features = [features_[c] for c in ts_columns_features] 47 | 48 | 49 | # ts_dates_features = pd.date_range( 50 | # features_['pickup_hour'].iloc[0] - timedelta(hours=len(ts_columns_features)), # added .iloc[0] because we want the single value of that series 51 | # features_['pickup_hour'].iloc[0] - timedelta(hours=1), 52 | # freq='H' 53 | # ) 54 | 55 | 56 | # fig = go.Figure() 57 | # if directions is not None: 58 | # title = f'Pick up hour=
{features_["pickup_hour"]}, location_id= {features_.index}, direction= {features_["DIRECCION"]}' 59 | # else: 60 | # title = f'Pick up hour= {features_["pickup_hour"]}, location_id= {features_.index}' 61 | # fig = px.line( x=ts_dates_features, y=ts_values_features, 62 | # template='plotly_dark', 63 | # markers=True, title=title) 64 | 65 | # if targets is not None: 66 | # targets_fig = px.line(x=ts_dates_targets, y=ts_values_targets, 67 | # template='plotly_dark', 68 | # markers=True, title='actual values') 69 | # targets_fig.update_traces(line_color='green') 70 | # fig.add_traces(targets_fig.data) 71 | 72 | 73 | # if predictions is not None: 74 | 75 | # prediction_ = predictions.iloc[example_id] 76 | # #prediction_ = predictions[predictions['pickup_location_id'] == example_id] 77 | # ts_columns_predictions = [c for c in predictions.columns if c.startswith('rides_next_')] 78 | # ts_values_predictions = [prediction_[c] for c in ts_columns_predictions] 79 | # ts_dates_predictions = pd.date_range( 80 | # features_['pickup_hour'].iloc[0], 81 | # features_['pickup_hour'].iloc[0] + timedelta(hours=len(ts_columns_predictions)-1), 82 | # freq='H' 83 | # ) 84 | 85 | # prediction_fig = px.line(x=ts_dates_predictions, y=ts_values_predictions, 86 | # template='plotly_dark', 87 | # markers=True, title='predicted values') 88 | # prediction_fig.update_traces(line_color='darkorange') 89 | # fig.add_traces(prediction_fig.data) 90 | 91 | # return fig 92 | 93 | def plot_one_sample( 94 | features: pd.DataFrame, 95 | targets: pd.DataFrame, #pd.Series, 96 | example_id: int, 97 | predictions: Optional[pd.DataFrame] = None, 98 | ): 99 | """Plots the past rides, the actual future values and, optionally, the predicted values for one example""" 100 | features_ = features.iloc[example_id] 101 | target_ = targets.iloc[example_id] 102 | 103 | # ts_columns = [c for c in features.columns if c.startswith('rides_previous_')] 104 | # ts_values = [features_[c] for c in ts_columns] + [target_] 105 | ts_columns_features = [c for c in features.columns if c.startswith('rides_previous_')] 106 | ts_columns_targets = [c for c in targets.columns if c.startswith('rides_next_')] 107 | ts_values_features = [features_[c] for c in ts_columns_features] 108 | ts_values_targets = [target_[c] for c in ts_columns_targets] 109 | # ts_dates = pd.date_range( 110 | # features_['pickup_hour'] - timedelta(hours=len(ts_columns)), 111 | # features_['pickup_hour'], 112 | # freq='H' 113 | # ) 114 | ts_dates_features = pd.date_range( 115 | features_['pickup_hour'] - timedelta(hours=len(ts_columns_features)), 116 | features_['pickup_hour'] - timedelta(hours=1), 117 | freq='H' 118 | ) 119 | ts_dates_targets = pd.date_range( 120 | features_['pickup_hour'], 121 | features_['pickup_hour'] + timedelta(hours=len(ts_columns_targets)-1), 122 | freq='H' 123 | ) 124 | 125 | # line plot with past values 126 | # title = f'Pick up hour={features_["pickup_hour"]}, location_id={features_["pickup_location_id"]}' 127 | # fig = px.line( 128 | # x=ts_dates, y=ts_values, 129 | # template='plotly_dark', 130 | # markers=True, title=title 131 | # ) 132 | fig = go.Figure() 133 | title = f'Pick up hour={features_["pickup_hour"]}, location_id={features_["pickup_location_id"]}' 134 | fig = px.line( x=ts_dates_features, y=ts_values_features, 135 | template='plotly_dark', 136 | markers=True, title=title) 137 | #features_fig.update_traces(line_color='blue') 138 | #fig.add_traces(features_fig.data) 139 | 140 | # green line for the values we want to predict 141 | # fig.add_scatter(x=ts_dates[-1:], y=[target_], 142 | # line_color='green', 143 | # mode='markers', marker_size=10,
name='actual value') 144 | targets_fig = px.line(x=ts_dates_targets, y=ts_values_targets, 145 | template='plotly_dark', 146 | markers=True, title='actual values') 147 | targets_fig.update_traces(line_color='green') 148 | fig.add_traces(targets_fig.data) 149 | #fig.show() 150 | 151 | if predictions is not None: 152 | # orange line for the predicted values, if passed 153 | # prediction_ = predictions.iloc[example_id] 154 | # fig.add_scatter(x=ts_dates[-1:], y=[prediction_], 155 | # line_color='red', 156 | # mode='markers', marker_symbol='x', marker_size=15, 157 | # name='prediction') 158 | prediction_ = predictions.iloc[example_id] 159 | prediction_fig = px.line(x=ts_dates_targets, y=prediction_.values.tolist(), 160 | template='plotly_dark', 161 | markers=True, title='predicted values') 162 | prediction_fig.update_traces(line_color='darkorange') 163 | fig.add_traces(prediction_fig.data) 164 | 165 | return fig 166 | 167 | def plot_ts( 168 | ts_data: pd.DataFrame, 169 | locations: Optional[List[int]] = None 170 | ): 171 | """ 172 | Plot time-series data 173 | """ 174 | ts_data_to_plot = ts_data[ts_data.pickup_location_id.isin(locations)] if locations else ts_data 175 | 176 | fig = px.line( 177 | ts_data_to_plot, 178 | x="pickup_hour", 179 | y="rides", 180 | color='pickup_location_id', 181 | template='none', 182 | ) 183 | 184 | fig.show() 185 | --------------------------------------------------------------------------------
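
A minimal usage sketch (not part of the repository) of how the `plot_one_sample` helper from src/plot.py could be exercised with toy data; the dataframes, column values and the commented registry call are illustrative assumptions, while the function signatures match the code above.

# Hypothetical example -- toy data and the registry model name are assumptions, not repo code.
import pandas as pd

from src.plot import plot_one_sample
# To pull a real model first (requires Comet ML credentials in src/config.py):
# from src.model_registry_api import get_latest_model_from_registry
# model = get_latest_model_from_registry(model_name='<registry_model_name>', status='Production')

# One example with two past hours of rides and one target hour, using the
# `rides_previous_*_hour` / `rides_next_*_hour` naming that plot_one_sample expects.
features = pd.DataFrame({
    'rides_previous_2_hour': [3.0],
    'rides_previous_1_hour': [5.0],
    'pickup_hour': [pd.Timestamp('2023-01-01 10:00')],
    'pickup_location_id': [42],
})
targets = pd.DataFrame({'rides_next_1_hour': [4.0]})
predictions = pd.DataFrame({'rides_next_1_hour': [4.6]})  # stand-in for real model output

fig = plot_one_sample(features=features, targets=targets, example_id=0, predictions=predictions)
fig.show()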