├── .devcontainer └── devcontainer.json ├── .github └── workflows │ ├── feature_pipeline.yml │ └── inference_pipeline.yml ├── .gitignore ├── .python-version ├── README.md ├── notebooks ├── 01_load_and_validate_raw_data.ipynb ├── 02_transform_raw_data_into_ts_data.ipynb ├── 03_transform_ts_data_into_features_and_targets.ipynb ├── 04_transform_raw_data_into_features_and_targets.ipynb ├── 05_visualize_training_data.ipynb ├── 06_baseline_model.ipynb ├── 07_xgboost_model.ipynb ├── 08_lightgbm_model.ipynb ├── 09_lightgbm_model_with_feature_engineering.ipynb ├── 10_lightgbm_model_with_hyperparameter_tuning.ipynb ├── 11_backfill_feature_store.ipynb ├── 12_feature_pipeline.ipynb ├── 13_model_training_pipeline.ipynb └── 14_inference_pipeline.ipynb ├── poetry.lock ├── pyproject.toml └── src ├── __init__.py ├── __pycache__ ├── __init__.cpython-39.pyc ├── data.cpython-39.pyc ├── data_split.cpython-39.pyc ├── model.cpython-39.pyc ├── paths.cpython-39.pyc └── plot.cpython-39.pyc ├── config.py ├── data.py ├── data_split.py ├── feature_store_api.py ├── frontend.py ├── frontend_monitoring.py ├── inference.py ├── logger.py ├── model.py ├── model_registry_api.py ├── monitoring.py ├── paths.py └── plot.py /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.md", 9 | "src/frontend_monitoring.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y > $GITHUB_PATH 47 | 48 | - name: Install Dependencies 49 | run: poetry install 50 | 51 | 52 | - name: exceute python workflows from bash script 53 | env: 54 | HOPSWORKS_PROJECT_NAME: ${{ secrets.HOPSWORKS_PROJECT_NAME }} #se agrega esto? 55 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 56 | run: poetry run jupyter nbconvert --to notebook --execute notebooks/12_feature_pipeline.ipynb 57 | 58 | # - name: Run feature generation script 59 | # env: 60 | # HOPSWORKS_API_KEY: $#{{ secrets.HOPSWORKS_API_KEY }} 61 | # run: make features 62 | 63 | 64 | -------------------------------------------------------------------------------- /.github/workflows/inference_pipeline.yml: -------------------------------------------------------------------------------- 1 | name: hourly-bike-demand-inference-pipeline #inference-pipeline 2 | 3 | # on: 4 | # workflow_run: 5 | # workflows: ["hourly-bike-demand-feature-pipeline"] 6 | # types: 7 | # - completed 8 | 9 | # workflow_dispatch: 10 | 11 | env: 12 | PYTHON_VERSION: 3.9 13 | POETRY_VERSION: 1.8.2 14 | POETRY_URL: https://install.python-poetry.org 15 | 16 | jobs: 17 | 18 | inference_pipeline: 19 | runs-on: ubuntu-latest 20 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v3 24 | 25 | # Poetry cache depends on OS, Python version and Poetry version. 
26 | - name: Cache Poetry cache 27 | uses: actions/cache@v3 28 | 29 | with: 30 | path: ~/.cache/pypoetry 31 | key: poetry-cache-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.POETRY_VERSION }} 32 | 33 | # virtualenv cache should depends on OS, Python version and `poetry.lock` (and optionally workflow files). 34 | - name: Cache Packages 35 | uses: actions/cache@v3 36 | with: 37 | path: ~/.local 38 | key: poetry-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/*.yml') }} 39 | 40 | - name: Set up Python ${{ env.PYTHON_VERSION }} 41 | uses: actions/setup-python@v3 42 | with: 43 | python-version: ${{ env.PYTHON_VERSION }} 44 | 45 | - name: Install Poetry 46 | run: | 47 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 48 | echo "$HOME/.local/bin" >> $GITHUB_PATH 49 | 50 | - name: Install Dependencies 51 | run: poetry install 52 | 53 | 54 | - name: exceute python workflows from bash script 55 | env: 56 | HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }} 57 | COMET_ML_API_KEY: ${{ secrets.COMET_ML_API_KEY }} 58 | COMET_ML_WORKSPACE: ${{ secrets.COMET_ML_WORKSPACE }} 59 | COMET_ML_PROJECT_NAME: ${{ secrets.COMET_ML_PROJECT_NAME }} 60 | run: poetry run jupyter nbconvert --to notebook --execute notebooks/14_inference_pipeline.ipynb 61 | 62 | # - name: Generating new batch of predictions 63 | # env: 64 | # HOPSWORKS_API_KEY: $#{{ secrets.HOPSWORKS_API_KEY }} 65 | # run: make inference 66 | 67 | 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.9.13 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

Bike sharing demand predictor service🚲🚛📊

3 | Final project🚀 4 |
5 | 6 |
7 | 8 | 9 | 10 | 11 | 12 | ## Demand predictor 13 | 1. This project is a demand predictor for a bike-sharing system. It was built as the final project of the Nodd3r Data Science Master's programme. Here is their [website](https://nodd3r.com/), and special thanks to Christian Donaire, who helped me a great deal throughout this learning process. 14 | 15 | It is also worth noting that this project was inspired by the [course](https://bit.ly/MLcourse_plb) by [Pau Labarta Bajo](https://github.com/Paulescu), in which a demand predictor for the New York taxi service is built. 16 | 17 | 2. What is a bike-sharing system? It is a system of shared bicycles: any citizen can use them and then leave them at dedicated stations. These systems exist in many cities. This project builds a demand-prediction prototype to plan where more bicycles should be available at certain hours. 18 | 19 | The problem to solve is therefore bicycle rebalancing🚲➡️🚛 in bike-sharing systems. What is rebalancing? Essentially, moving bicycles from one station to another so that when you arrive you find a bicycle available for your trip. 20 | 21 | To that end, the goal is to predict bicycle demand for the next 36 hours. Why 36 hours? So that the company doing the rebalancing has enough lead time to anticipate demand peaks. 22 | 23 | 3. How? Based on the demand data from previous hours, the model predicts the demand for the next 36 hours. 24 | 25 | For this I used the [dataset from the Government of the City of Buenos Aires](https://data.buenosaires.gob.ar/dataset/bicicletas-publicas), which is updated monthly. 26 | 27 | Note that I used "poetry" to create a virtual environment and manage libraries more comfortably. I also used a feature store called "hopsworks", where I store the historical data, the trained model and the predictions. 28 | 29 | I also used GitHub Actions to automate the script that downloads the features from the Buenos Aires government website and uploads them to Hopsworks. The same was done for the predictions: a script that runs every hour, predicts, and uploads that prediction to Hopsworks. This was done so the dashboard only has to consume the stored data and loads faster. 30 | 31 | 4. Which model🤖 was used? Models based on XGBoost are very useful for predicting time series (and far less complex than a neural network), but for them to work properly the data must be arranged in a specific way that makes learning easier. 32 | 33 | 34 | ## Code summary 35 | 1. In notebooks 1, 2, 3, 4 and 5 the work was basically to: 36 | - Download the data and unzip it. 37 | - Clean it and convert it to parquet format, which suits our purpose and has several advantages. 38 | - Drop the minutes and seconds, rounding each timestamp down to the previous hour. 39 | - Add the hours with no trips, filling them with the value "zero", and plot the series. 40 | - Create a function that computes the row indices used later to reshape the dataset into the form that is easiest for the model to learn from. 41 | - Create the dataset that the model will use for training.
(The way we transform the dataset is that it goes from 3 columns (hour, trip and station) to one column per hour, together with the station information and the reference hour. That is, from the original dataset we take a block of rows (previous and next hours) and transpose it, then move down one row and repeat the process. Here we use 672 previous hours, i.e. 28 days, and the next 36 hours; a code sketch of this windowing idea is shown further below.) 42 | - Finally, a function was written to plot the previous and the next records. 43 | 44 | 2. In notebooks 6, 7, 8, 9 and 10: 45 | - The data was split into train and test sets. 46 | - Baseline models (without Machine Learning) were created, against which the more complex models are later compared. 47 | - XGBoost and LightGBM were then tried, with LightGBM giving the better results. 48 | - The next step was to continue with LightGBM and apply feature engineering to improve the model, adding: the average of the last 4 weeks, latitude and longitude, hour and day of the week. 49 | - Optuna was used to run hyperparameter tuning on the model (a compact sketch of such a training and tuning loop appears further below). 50 | 51 | 3. In notebooks 11, 12, 13 and 14: 52 | - The project was created in Hopsworks (feature store), which lets us keep storing the records as they are downloaded. A feature group is created to hold the data, and a feature view is the most convenient way to consume it afterwards; these objects are created as needed to store the data (the calls are sketched near the end of this README). 53 | - Notebook 12 basically downloads the data from the Buenos Aires government website, cleans it and uploads it to the feature store. This is automated with a GitHub Action that runs every hour. 54 | - Notebook 13 trains the model, saves it and uploads it to CometML (from where it is later used to make the predictions). 55 | - Notebook 14 reads the data from the feature store, loads the model, creates the predictions and saves them back to the feature store. To automate it, another GitHub Action was created that runs immediately after the previous one finishes. 56 | 57 | 4. There are also other files in the src folder. They contain functions used across the notebooks so the same code does not have to be repeated; importing them is enough. That folder also holds the two dashboards described next: 58 | - The first dashboard is the frontend one, which queries the feature store and loads the past data and the corresponding predictions. It also plots a map showing the station with the highest demand over the next 36 hours (the marker description includes the expected demand and the hour). Below that are the charts for the top 10 stations with the highest demand. 59 | - The second dashboard is the frontend monitoring one, where the global error and the error for the highest-demand stations can be observed. 60 | 61 | ## Dashboards 62 | - [Dashboard with the model's predictions📈](https://bike-sharing-demand-predictor-ecobici.streamlit.app/) 63 | 64 |
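To make the windowing transformation described in point 1 of the code summary more concrete, here is a minimal sketch of how one station's hourly series could be cut into 672-hour feature windows and 36-hour target windows with a 24-hour step. The helper name `make_windows` and the exact column handling are illustrative assumptions rather than the project's actual `src/data.py` code; it only assumes an input frame with `pickup_hour` and `rides` columns, already filled so that every hour appears exactly once.

```python
from typing import Tuple

import pandas as pd


def make_windows(ts_one_station: pd.DataFrame,
                 input_seq_len: int = 24 * 28,  # 672 previous hours (28 days)
                 output_seq_len: int = 36,      # 36 hours to predict
                 step_size: int = 24) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Slice one station's hourly series into (features, targets) rows."""
    values = ts_one_station['rides'].to_numpy()
    hours = ts_one_station['pickup_hour'].to_numpy()

    feature_rows, target_rows, reference_hours = [], [], []
    first = 0
    while first + input_seq_len + output_seq_len <= len(values):
        mid = first + input_seq_len
        feature_rows.append(values[first:mid])                # the 672 past hours
        target_rows.append(values[mid:mid + output_seq_len])  # the next 36 hours
        reference_hours.append(hours[mid])                    # reference hour of this row
        first += step_size                                    # slide down one day

    features = pd.DataFrame(
        feature_rows,
        columns=[f'rides_previous_{input_seq_len - i}_hour' for i in range(input_seq_len)],
    )
    features['pickup_hour'] = reference_hours
    targets = pd.DataFrame(
        target_rows,
        columns=[f'rides_next_{i + 1}_hour' for i in range(output_seq_len)],
    )
    return features, targets
```

Running this per station and adding a `pickup_location_id` column before concatenating would yield tables shaped like the `features.shape=(111216, 674)` and `targets.shape=(111216, 36)` outputs shown in notebook 04.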

65 | 66 |

67 |
68 | 69 | - [Dashboard with model error monitoring🔍](https://bike-sharing-mae-error-monitoring.streamlit.app/) 70 | 71 |
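The monitoring dashboard linked above tracks mean absolute error. As a rough illustration (this is not the code in `src/monitoring.py`, and the `predicted_demand` column name is an assumption), the hourly MAE can be computed by joining predictions and actuals on station and hour:

```python
import pandas as pd


def hourly_mae(predictions: pd.DataFrame, actuals: pd.DataFrame) -> pd.DataFrame:
    """Join predicted and actual rides on station/hour and average the absolute error per hour.

    Both frames are assumed to carry `pickup_location_id` and `pickup_hour`,
    plus `predicted_demand` and `rides` respectively.
    """
    merged = predictions.merge(actuals, on=['pickup_location_id', 'pickup_hour'])
    merged['absolute_error'] = (merged['predicted_demand'] - merged['rides']).abs()
    return (
        merged.groupby('pickup_hour', as_index=False)['absolute_error']
        .mean()
        .rename(columns={'absolute_error': 'mae'})
    )
```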

72 | 73 |

74 | 75 |
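The code summary above mentions training LightGBM on the engineered features and tuning it with Optuna. The sketch below shows what such a loop could look like for a single prediction horizon; it is a hedged outline under stated assumptions, not the notebooks' exact code. The 4-week average reuses the lag columns produced by the windowing step, and the search space is purely illustrative.

```python
import lightgbm as lgb
import optuna
import pandas as pd
from sklearn.metrics import mean_absolute_error


def add_average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """Average of the same hour over the previous 4 weeks (168-hour lags)."""
    X = X.copy()
    X['average_rides_last_4_weeks'] = 0.25 * (
        X['rides_previous_168_hour'] + X['rides_previous_336_hour'] +
        X['rides_previous_504_hour'] + X['rides_previous_672_hour']
    )
    return X


def objective(trial, X_train, y_train, X_val, y_val) -> float:
    """One Optuna trial: fit LightGBM with sampled hyperparameters, return validation MAE."""
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    model = lgb.LGBMRegressor(**params)
    model.fit(add_average_rides_last_4_weeks(X_train), y_train)
    preds = model.predict(add_average_rides_last_4_weeks(X_val))
    return mean_absolute_error(y_val, preds)


# Example usage (with train/validation splits already prepared):
# study = optuna.create_study(direction='minimize')
# study.optimize(lambda t: objective(t, X_train, y_train, X_val, y_val), n_trials=25)
```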
76 | PS1: Note that the real data for the most recent hour is not available. To work around this, a simulated fetch is performed: data from another year is retrieved to stand in for the latest hour and is then added to the database. 77 | 78 | PS2: If an error appears when opening the dashboards, reload the page to fix it. 79 | 80 |
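A rough sketch of the workaround described in PS1: rides from a past period are fetched and their timestamps shifted forward so they mimic the most recent hours. The helper name and the 52-week shift are illustrative assumptions, not the project's actual code.

```python
from datetime import datetime, timedelta

import pandas as pd


def fetch_simulated_recent_rides(rides_past: pd.DataFrame,
                                 now: datetime,
                                 weeks_back: int = 52) -> pd.DataFrame:
    """Shift old rides forward by `weeks_back` weeks and keep only the last 28 days."""
    shifted = rides_past.copy()
    shifted['pickup_datetime'] = shifted['pickup_datetime'] + timedelta(weeks=weeks_back)
    mask = shifted['pickup_datetime'].between(now - timedelta(days=28), now)
    return shifted.loc[mask].reset_index(drop=True)
```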
81 |
82 | Thanks for reading. Let's keep in touch🙌🏻 83 |
84 | Twitter • 85 | LinkedIn 86 |
87 |
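For reference, point 3 of the code summary describes storing the hourly series in Hopsworks. The calls below mirror the pattern used in 11_backfill_feature_store.ipynb and add a feature-view creation step; the feature view name/version, the parquet path and the use of `get_or_create_feature_view` are assumptions rather than code taken from this repository.

```python
import os

import hopsworks
import pandas as pd

HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

project = hopsworks.login(project='bike_sharing_demand', api_key_value=HOPSWORKS_API_KEY)
feature_store = project.get_feature_store()

feature_group = feature_store.get_or_create_feature_group(
    name='time_series_hourly_feature_group',
    version=1,
    description='Time-series data at hourly frequency',
    primary_key=['pickup_location_id', 'pickup_hour'],
    event_time='pickup_hour',
)

# ts_data: one row per (pickup_hour, pickup_location_id) with a `rides` count,
# as produced by src.data.transform_raw_data_into_ts_data in the notebooks.
ts_data = pd.read_parquet('../data/transformed/ts_data.parquet')  # hypothetical path
feature_group.insert(ts_data)

# A feature view makes reads convenient for the training and inference pipelines.
feature_view = feature_store.get_or_create_feature_view(
    name='time_series_hourly_feature_view',  # assumed name
    version=1,
    query=feature_group.select_all(),
)
```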
88 | 89 | 90 | -------------------------------------------------------------------------------- /notebooks/01_load_and_validate_raw_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pathlib import Path\n", 10 | "import requests\n", 11 | "import zipfile\n", 12 | "import pandas as pd\n", 13 | "import pyarrow.parquet as pq\n", 14 | "\n", 15 | "def download_one_file_of_raw_data(year: int) -> Path:\n", 16 | " \"\"\"\"\"\"\n", 17 | " URL = f'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/transporte-y-obras-publicas/bicicletas-publicas/recorridos-realizados-{year}.zip'\n", 18 | " response= requests.get(URL) #, stream=True)\n", 19 | "\n", 20 | " \n", 21 | " if response.status_code == 200:\n", 22 | " # with open(nombre_archivo_zip, 'wb') as file:\n", 23 | " # for chunk in response.iter_content(chunk_size=8192): # Tamaño del búfer ajustado a 8192 bytes\n", 24 | " # file.write(chunk)\n", 25 | " path = f'../data/raw/recorridos-realizados-{year}.zip'\n", 26 | " open(path, \"wb\").write(response.content) \n", 27 | " print(f'descargado año {year}')\n", 28 | "\n", 29 | " return path\n", 30 | " else:\n", 31 | " raise Exception(f'{URL} is not available')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import pyarrow as pa\n", 41 | "\n", 42 | "def unzip_and_convert_csv_to_parquet(year: int) -> Path:\n", 43 | " nombre_archivo_zip = f\"../data/raw/recorridos-realizados-{year}.zip\"\n", 44 | " # Descomprimir el archivo zip\n", 45 | " with zipfile.ZipFile(nombre_archivo_zip, 'r') as archivo_zip:\n", 46 | "\n", 47 | " # Extraer el archivo CSV del zip\n", 48 | " nombre_archivo_csv = archivo_zip.namelist()[0] # Suponiendo que el archivo CSV es el primer archivo en el zip\n", 49 | " archivo_zip.extractall(f\"../data/raw/\")\n", 50 | "\n", 51 | " # Leer el archivo CSV con pandas\n", 52 | " df = pd.read_csv(f\"../data/raw/{nombre_archivo_csv}\", delimiter=',', decimal=\".\")\n", 53 | "\n", 54 | " # Convertir el DataFrame a formato parquet\n", 55 | " nombre_archivo_parquet = f\"rides_{year}.parquet\"\n", 56 | " table = pa.Table.from_pandas(df)\n", 57 | " pq.write_table(table, f\"../data/raw/{nombre_archivo_parquet}\")\n", 58 | "\n", 59 | "\n", 60 | "\n", 61 | " # table = pq.Table.from_pandas(df)\n", 62 | " # pq.write_table(table, nombre_archivo_parquet)\n", 63 | " #pq.write_table(pq.Table.from_pandas(df), nombre_archivo_parquet)\n", 64 | " path = f'../data/raw/rides_{year}.parquet'\n", 65 | " return path\n", 66 | " " 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 1, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "download_one_file_of_raw_data(year=2022)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 9, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "'../data/raw/rides_2022.parquet'" 87 | ] 88 | }, 89 | "execution_count": 9, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "unzip_and_convert_csv_to_parquet(year=2022)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 10, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/html": [ 106 | "
\n", 107 | "\n", 120 | "\n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | "
Unnamed: 0XId_recorridoduracion_recorridofecha_origen_recorridoid_estacion_origennombre_estacion_origendireccion_estacion_origenlong_estacion_origenlat_estacion_origenfecha_destino_recorridoid_estacion_destinonombre_estacion_destinodireccion_estacion_destinolong_estacion_destinolat_estacion_destinoid_usuariomodelo_bicicletaGénero
01113267975BAEcobici2,6102022-01-16 14:58:425BAEcobici005 - Plaza ItaliaAv. Sarmiento 2601-58.420954-34.5805502022-01-16 15:42:12210BAEcobici335 - General UrquizaFigueroa Alcorta & Sarmiento-58.411278-34.572165776361BAEcobiciICONICFEMALE
12213268526BAEcobici5452022-01-16 17:26:2751BAEcobici051 - TUCUMANTucuman & 9 De Julio Av.-58.382126-34.6014782022-01-16 17:35:32174BAEcobici174 - MINISTERIO DE EDUCACIONMarcelo T. de Alvear & Rodriguez Peña-58.391768-34.597225776407BAEcobiciICONICMALE
23313268400BAEcobici2,0612022-01-16 16:51:12161BAEcobici161 - Humahuaca3912 Humahuaca-58.419676-34.6020782022-01-16 17:25:33117BAEcobici117 - HUMBERTO 1°Peru 1016-58.374176-34.620101671762BAEcobiciICONICFEMALE
34413268164BAEcobici12,7482022-01-16 15:58:01210BAEcobici335 - General UrquizaFigueroa Alcorta & Sarmiento-58.411278-34.5721652022-01-16 19:30:29382BAEcobici204 - BiarritzBiarritz 2403-58.477390-34.605470776361BAEcobiciICONICFEMALE
45513270010BAEcobici4,3372022-01-16 23:40:09215BAEcobici113 - GuatemalaGuatemala 4773-58.424996-34.5858782022-01-17 00:52:26205BAEcobici125 - F.J.Santamaria de OroF.J.Santamaria de Oro & Guatemala-58.428016-34.583323454615BAEcobiciICONICOTHER
56613269548BAEcobici4,2432022-01-16 20:42:30268BAEcobici399 - GARCIA DEL RIOAv. García del Río 3182-58.477000-34.5503002022-01-16 21:53:13268BAEcobici399 - GARCIA DEL RIOAv. García del Río 3182-58.477000-34.550300200959BAEcobiciICONICOTHER
67713268959BAEcobici9322022-01-16 18:47:17278BAEcobici233 - MONROE2519 Superi-58.469813-34.5641222022-01-16 19:02:49236BAEcobici254 - Plaza Rafael HernandezVuelta de Obligado 2004-58.455166-34.562161823366BAEcobiciICONICFEMALE
78813267669BAEcobici4,3372022-01-16 12:55:26368BAEcobici378 - AGRONOMIA4351 San Martin-58.482079-34.5980702022-01-16 14:07:43368BAEcobici378 - AGRONOMIA4351 San Martin-58.482079-34.598070826754BAEcobiciICONICMALE
89913267592BAEcobici3,7352022-01-16 12:25:03382BAEcobici204 - BiarritzBiarritz 2403-58.477390-34.6054702022-01-16 13:27:185BAEcobici005 - Plaza ItaliaAv. Sarmiento 2601-58.420954-34.580550776361BAEcobiciICONICFEMALE
9101013376812BAEcobici3662022-01-29 15:13:01433BAEcobici273 - Plazoleta Colombia1619 Brandsen-58.373726-34.6376972022-01-29 15:19:076BAEcobici006 - Parque LezamaAvenida Martin Garcia, 295-58.369758-34.628526772370BAEcobiciICONICMALE
\n", 368 | "
" 369 | ], 370 | "text/plain": [ 371 | " Unnamed: 0 X Id_recorrido duracion_recorrido \\\n", 372 | "0 1 1 13267975BAEcobici 2,610 \n", 373 | "1 2 2 13268526BAEcobici 545 \n", 374 | "2 3 3 13268400BAEcobici 2,061 \n", 375 | "3 4 4 13268164BAEcobici 12,748 \n", 376 | "4 5 5 13270010BAEcobici 4,337 \n", 377 | "5 6 6 13269548BAEcobici 4,243 \n", 378 | "6 7 7 13268959BAEcobici 932 \n", 379 | "7 8 8 13267669BAEcobici 4,337 \n", 380 | "8 9 9 13267592BAEcobici 3,735 \n", 381 | "9 10 10 13376812BAEcobici 366 \n", 382 | "\n", 383 | " fecha_origen_recorrido id_estacion_origen nombre_estacion_origen \\\n", 384 | "0 2022-01-16 14:58:42 5BAEcobici 005 - Plaza Italia \n", 385 | "1 2022-01-16 17:26:27 51BAEcobici 051 - TUCUMAN \n", 386 | "2 2022-01-16 16:51:12 161BAEcobici 161 - Humahuaca \n", 387 | "3 2022-01-16 15:58:01 210BAEcobici 335 - General Urquiza \n", 388 | "4 2022-01-16 23:40:09 215BAEcobici 113 - Guatemala \n", 389 | "5 2022-01-16 20:42:30 268BAEcobici 399 - GARCIA DEL RIO \n", 390 | "6 2022-01-16 18:47:17 278BAEcobici 233 - MONROE \n", 391 | "7 2022-01-16 12:55:26 368BAEcobici 378 - AGRONOMIA \n", 392 | "8 2022-01-16 12:25:03 382BAEcobici 204 - Biarritz \n", 393 | "9 2022-01-29 15:13:01 433BAEcobici 273 - Plazoleta Colombia \n", 394 | "\n", 395 | " direccion_estacion_origen long_estacion_origen lat_estacion_origen \\\n", 396 | "0 Av. Sarmiento 2601 -58.420954 -34.580550 \n", 397 | "1 Tucuman & 9 De Julio Av. -58.382126 -34.601478 \n", 398 | "2 3912 Humahuaca -58.419676 -34.602078 \n", 399 | "3 Figueroa Alcorta & Sarmiento -58.411278 -34.572165 \n", 400 | "4 Guatemala 4773 -58.424996 -34.585878 \n", 401 | "5 Av. García del Río 3182 -58.477000 -34.550300 \n", 402 | "6 2519 Superi -58.469813 -34.564122 \n", 403 | "7 4351 San Martin -58.482079 -34.598070 \n", 404 | "8 Biarritz 2403 -58.477390 -34.605470 \n", 405 | "9 1619 Brandsen -58.373726 -34.637697 \n", 406 | "\n", 407 | " fecha_destino_recorrido id_estacion_destino nombre_estacion_destino \\\n", 408 | "0 2022-01-16 15:42:12 210BAEcobici 335 - General Urquiza \n", 409 | "1 2022-01-16 17:35:32 174BAEcobici 174 - MINISTERIO DE EDUCACION \n", 410 | "2 2022-01-16 17:25:33 117BAEcobici 117 - HUMBERTO 1° \n", 411 | "3 2022-01-16 19:30:29 382BAEcobici 204 - Biarritz \n", 412 | "4 2022-01-17 00:52:26 205BAEcobici 125 - F.J.Santamaria de Oro \n", 413 | "5 2022-01-16 21:53:13 268BAEcobici 399 - GARCIA DEL RIO \n", 414 | "6 2022-01-16 19:02:49 236BAEcobici 254 - Plaza Rafael Hernandez \n", 415 | "7 2022-01-16 14:07:43 368BAEcobici 378 - AGRONOMIA \n", 416 | "8 2022-01-16 13:27:18 5BAEcobici 005 - Plaza Italia \n", 417 | "9 2022-01-29 15:19:07 6BAEcobici 006 - Parque Lezama \n", 418 | "\n", 419 | " direccion_estacion_destino long_estacion_destino \\\n", 420 | "0 Figueroa Alcorta & Sarmiento -58.411278 \n", 421 | "1 Marcelo T. de Alvear & Rodriguez Peña -58.391768 \n", 422 | "2 Peru 1016 -58.374176 \n", 423 | "3 Biarritz 2403 -58.477390 \n", 424 | "4 F.J.Santamaria de Oro & Guatemala -58.428016 \n", 425 | "5 Av. García del Río 3182 -58.477000 \n", 426 | "6 Vuelta de Obligado 2004 -58.455166 \n", 427 | "7 4351 San Martin -58.482079 \n", 428 | "8 Av. 
Sarmiento 2601 -58.420954 \n", 429 | "9 Avenida Martin Garcia, 295 -58.369758 \n", 430 | "\n", 431 | " lat_estacion_destino id_usuario modelo_bicicleta Género \n", 432 | "0 -34.572165 776361BAEcobici ICONIC FEMALE \n", 433 | "1 -34.597225 776407BAEcobici ICONIC MALE \n", 434 | "2 -34.620101 671762BAEcobici ICONIC FEMALE \n", 435 | "3 -34.605470 776361BAEcobici ICONIC FEMALE \n", 436 | "4 -34.583323 454615BAEcobici ICONIC OTHER \n", 437 | "5 -34.550300 200959BAEcobici ICONIC OTHER \n", 438 | "6 -34.562161 823366BAEcobici ICONIC FEMALE \n", 439 | "7 -34.598070 826754BAEcobici ICONIC MALE \n", 440 | "8 -34.580550 776361BAEcobici ICONIC FEMALE \n", 441 | "9 -34.628526 772370BAEcobici ICONIC MALE " 442 | ] 443 | }, 444 | "execution_count": 10, 445 | "metadata": {}, 446 | "output_type": "execute_result" 447 | } 448 | ], 449 | "source": [ 450 | "import pandas as pd\n", 451 | "\n", 452 | "rides = pd.read_parquet('../data/raw/rides_2022.parquet')\n", 453 | "\n", 454 | "rides.head(10)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 11, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "data": { 464 | "text/plain": [ 465 | "(2922805, 19)" 466 | ] 467 | }, 468 | "execution_count": 11, 469 | "metadata": {}, 470 | "output_type": "execute_result" 471 | } 472 | ], 473 | "source": [ 474 | "rides.shape" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 12, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "331" 486 | ] 487 | }, 488 | "execution_count": 12, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "rides.id_estacion_origen.nunique()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 13, 500 | "metadata": {}, 501 | "outputs": [ 502 | { 503 | "data": { 504 | "text/html": [ 505 | "
\n", 506 | "\n", 519 | "\n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | "
pickup_datetimepickup_location_id
02022-01-16 14:58:425
12022-01-16 17:26:2751
22022-01-16 16:51:12161
32022-01-16 15:58:01210
42022-01-16 23:40:09215
52022-01-16 20:42:30268
62022-01-16 18:47:17278
72022-01-16 12:55:26368
82022-01-16 12:25:03382
92022-01-29 15:13:01433
\n", 580 | "
" 581 | ], 582 | "text/plain": [ 583 | " pickup_datetime pickup_location_id\n", 584 | "0 2022-01-16 14:58:42 5\n", 585 | "1 2022-01-16 17:26:27 51\n", 586 | "2 2022-01-16 16:51:12 161\n", 587 | "3 2022-01-16 15:58:01 210\n", 588 | "4 2022-01-16 23:40:09 215\n", 589 | "5 2022-01-16 20:42:30 268\n", 590 | "6 2022-01-16 18:47:17 278\n", 591 | "7 2022-01-16 12:55:26 368\n", 592 | "8 2022-01-16 12:25:03 382\n", 593 | "9 2022-01-29 15:13:01 433" 594 | ] 595 | }, 596 | "execution_count": 13, 597 | "metadata": {}, 598 | "output_type": "execute_result" 599 | } 600 | ], 601 | "source": [ 602 | "#Nos quedamos sólo con las columnas que nos interesan y las renombramos\n", 603 | "rides = rides[['fecha_origen_recorrido', 'id_estacion_origen']]\n", 604 | "# Eliminar la parte \"BAEcobici\" y convertir a tipo int\n", 605 | "rides['id_estacion_origen'] = rides['id_estacion_origen'].str.replace('BAEcobici', '').astype(int)\n", 606 | "\n", 607 | "rides.rename(columns={\n", 608 | " 'fecha_origen_recorrido': 'pickup_datetime',\n", 609 | " 'id_estacion_origen': 'pickup_location_id',\n", 610 | "}, inplace=True)\n", 611 | "\n", 612 | "rides['pickup_datetime'] = pd.to_datetime(rides['pickup_datetime'],format='%Y-%m-%d %H:%M:%S')\n", 613 | "rides.head(10)" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 14, 619 | "metadata": {}, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "text/plain": [ 624 | "331" 625 | ] 626 | }, 627 | "execution_count": 14, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "rides.pickup_location_id.nunique()" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 15, 639 | "metadata": {}, 640 | "outputs": [ 641 | { 642 | "name": "stderr", 643 | "output_type": "stream", 644 | "text": [ 645 | "C:\\Users\\jayan\\AppData\\Local\\Temp\\ipykernel_13464\\3389344848.py:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n", 646 | " rides['pickup_datetime'].describe()\n" 647 | ] 648 | }, 649 | { 650 | "data": { 651 | "text/plain": [ 652 | "count 2922805\n", 653 | "unique 2689886\n", 654 | "top 2022-08-12 14:45:13\n", 655 | "freq 7\n", 656 | "first 2022-01-01 00:11:07\n", 657 | "last 2022-12-31 23:55:39\n", 658 | "Name: pickup_datetime, dtype: object" 659 | ] 660 | }, 661 | "execution_count": 15, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "rides['pickup_datetime'].describe()" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 16, 673 | "metadata": {}, 674 | "outputs": [ 675 | { 676 | "name": "stderr", 677 | "output_type": "stream", 678 | "text": [ 679 | "C:\\Users\\jayan\\AppData\\Local\\Temp\\ipykernel_13464\\213125758.py:3: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. 
Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n", 680 | " rides['pickup_datetime'].describe()\n" 681 | ] 682 | }, 683 | { 684 | "data": { 685 | "text/plain": [ 686 | "count 2922805\n", 687 | "unique 2689886\n", 688 | "top 2022-08-12 14:45:13\n", 689 | "freq 7\n", 690 | "first 2022-01-01 00:11:07\n", 691 | "last 2022-12-31 23:55:39\n", 692 | "Name: pickup_datetime, dtype: object" 693 | ] 694 | }, 695 | "execution_count": 16, 696 | "metadata": {}, 697 | "output_type": "execute_result" 698 | } 699 | ], 700 | "source": [ 701 | "rides = rides[rides.pickup_datetime >= '2022-01-01']\n", 702 | "rides = rides[rides.pickup_datetime < '2023-01-01']\n", 703 | "rides['pickup_datetime'].describe()" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 17, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "rides.to_parquet('../data/transformed/validated_rides_2022.parquet')" 713 | ] 714 | } 715 | ], 716 | "metadata": { 717 | "kernelspec": { 718 | "display_name": ".venv", 719 | "language": "python", 720 | "name": "python3" 721 | }, 722 | "language_info": { 723 | "codemirror_mode": { 724 | "name": "ipython", 725 | "version": 3 726 | }, 727 | "file_extension": ".py", 728 | "mimetype": "text/x-python", 729 | "name": "python", 730 | "nbconvert_exporter": "python", 731 | "pygments_lexer": "ipython3", 732 | "version": "3.9.13" 733 | }, 734 | "orig_nbformat": 4, 735 | "vscode": { 736 | "interpreter": { 737 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 738 | } 739 | } 740 | }, 741 | "nbformat": 4, 742 | "nbformat_minor": 2 743 | } 744 | -------------------------------------------------------------------------------- /notebooks/04_transform_raw_data_into_features_and_targets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "File 2022 was already in local storage\n" 23 | ] 24 | }, 25 | { 26 | "data": { 27 | "text/html": [ 28 | "
\n", 29 | "\n", 42 | "\n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | "
pickup_datetimepickup_location_id
02022-01-16 14:58:425
12022-01-16 17:26:2751
22022-01-16 16:51:12161
32022-01-16 15:58:01210
42022-01-16 23:40:09215
.........
29228002022-12-20 22:34:33336
29228012022-12-20 20:03:24379
29228022022-12-20 12:13:32169
29228032022-12-20 17:26:49469
29228042022-12-20 19:30:59273
\n", 108 | "

2922805 rows × 2 columns

\n", 109 | "
" 110 | ], 111 | "text/plain": [ 112 | " pickup_datetime pickup_location_id\n", 113 | "0 2022-01-16 14:58:42 5\n", 114 | "1 2022-01-16 17:26:27 51\n", 115 | "2 2022-01-16 16:51:12 161\n", 116 | "3 2022-01-16 15:58:01 210\n", 117 | "4 2022-01-16 23:40:09 215\n", 118 | "... ... ...\n", 119 | "2922800 2022-12-20 22:34:33 336\n", 120 | "2922801 2022-12-20 20:03:24 379\n", 121 | "2922802 2022-12-20 12:13:32 169\n", 122 | "2922803 2022-12-20 17:26:49 469\n", 123 | "2922804 2022-12-20 19:30:59 273\n", 124 | "\n", 125 | "[2922805 rows x 2 columns]" 126 | ] 127 | }, 128 | "execution_count": 2, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "from src.data import load_raw_data\n", 135 | "\n", 136 | "rides = load_raw_data(year=2022)\n", 137 | "rides" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 3, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "\n", 150 | "Int64Index: 2922805 entries, 0 to 2922804\n", 151 | "Data columns (total 2 columns):\n", 152 | " # Column Dtype \n", 153 | "--- ------ ----- \n", 154 | " 0 pickup_datetime datetime64[ns]\n", 155 | " 1 pickup_location_id int32 \n", 156 | "dtypes: datetime64[ns](1), int32(1)\n", 157 | "memory usage: 55.7 MB\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "rides.info()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 4, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "array([0], dtype=int64)" 174 | ] 175 | }, 176 | "execution_count": 4, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "import pandas as pd\n", 183 | "nulos = pd.DataFrame(rides.isnull().sum(), columns=['Nulos'])\n", 184 | "nulos.Nulos.unique()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 5, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "100%|██████████| 331/331 [00:05<00:00, 65.03it/s]\n" 197 | ] 198 | }, 199 | { 200 | "data": { 201 | "text/html": [ 202 | "
\n", 203 | "\n", 216 | "\n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | "
pickup_hourridespickup_location_id
02022-01-01 00:00:0019
12022-01-01 01:00:0019
22022-01-01 02:00:0019
32022-01-01 03:00:0009
42022-01-01 04:00:0019
............
28995552022-12-31 19:00:00057
28995562022-12-31 20:00:00057
28995572022-12-31 21:00:00057
28995582022-12-31 22:00:00057
28995592022-12-31 23:00:00057
\n", 294 | "

2899560 rows × 3 columns

\n", 295 | "
" 296 | ], 297 | "text/plain": [ 298 | " pickup_hour rides pickup_location_id\n", 299 | "0 2022-01-01 00:00:00 1 9\n", 300 | "1 2022-01-01 01:00:00 1 9\n", 301 | "2 2022-01-01 02:00:00 1 9\n", 302 | "3 2022-01-01 03:00:00 0 9\n", 303 | "4 2022-01-01 04:00:00 1 9\n", 304 | "... ... ... ...\n", 305 | "2899555 2022-12-31 19:00:00 0 57\n", 306 | "2899556 2022-12-31 20:00:00 0 57\n", 307 | "2899557 2022-12-31 21:00:00 0 57\n", 308 | "2899558 2022-12-31 22:00:00 0 57\n", 309 | "2899559 2022-12-31 23:00:00 0 57\n", 310 | "\n", 311 | "[2899560 rows x 3 columns]" 312 | ] 313 | }, 314 | "execution_count": 5, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "from src.data import transform_raw_data_into_ts_data\n", 321 | "\n", 322 | "ts_data = transform_raw_data_into_ts_data(rides)\n", 323 | "ts_data\n" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 6, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "331" 335 | ] 336 | }, 337 | "execution_count": 6, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | } 341 | ], 342 | "source": [ 343 | "ts_data.pickup_location_id.unique().size" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 7, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "array([0], dtype=int64)" 355 | ] 356 | }, 357 | "execution_count": 7, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "import pandas as pd\n", 364 | "nulos = pd.DataFrame(ts_data.isnull().sum(), columns=['Nulos'])\n", 365 | "nulos.Nulos.unique()" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 8, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stderr", 375 | "output_type": "stream", 376 | "text": [ 377 | "100%|██████████| 331/331 [00:44<00:00, 7.38it/s]" 378 | ] 379 | }, 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "features.shape=(111216, 674)\n", 385 | "targets.shape=(111216, 36)\n" 386 | ] 387 | }, 388 | { 389 | "name": "stderr", 390 | "output_type": "stream", 391 | "text": [ 392 | "\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "from src.data import transform_ts_data_into_features_and_target\n", 398 | "\n", 399 | "features, targets = transform_ts_data_into_features_and_target(\n", 400 | " ts_data,\n", 401 | " input_seq_len=24*28*1, # one month\n", 402 | " step_size=24,\n", 403 | " output_seq_len=36\n", 404 | ")\n", 405 | "\n", 406 | "print(f'{features.shape=}')\n", 407 | "print(f'{targets.shape=}')" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 9, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/plain": [ 418 | "array([0], dtype=int64)" 419 | ] 420 | }, 421 | "execution_count": 9, 422 | "metadata": {}, 423 | "output_type": "execute_result" 424 | } 425 | ], 426 | "source": [ 427 | "import pandas as pd\n", 428 | "nulos = pd.DataFrame(features.isnull().sum(), columns=['Nulos'])\n", 429 | "nulos.Nulos.unique()" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 10, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/html": [ 440 | "
\n", 441 | "\n", 454 | "\n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | "
rides_next_1_hourrides_next_2_hourrides_next_3_hourrides_next_4_hourrides_next_5_hourrides_next_6_hourrides_next_7_hourrides_next_8_hourrides_next_9_hourrides_next_10_hour...rides_next_27_hourrides_next_28_hourrides_next_29_hourrides_next_30_hourrides_next_31_hourrides_next_32_hourrides_next_33_hourrides_next_34_hourrides_next_35_hourrides_next_36_hour
1112110.00.00.00.00.01.00.00.01.00.0...2.00.00.00.00.00.01.00.02.00.0
1112121.00.02.00.00.00.00.00.01.00.0...1.00.00.00.00.02.03.00.03.00.0
1112131.00.01.00.00.00.00.02.03.00.0...0.00.00.01.00.00.02.00.01.01.0
1112141.00.00.00.00.01.00.00.02.00.0...1.00.00.01.00.00.01.02.00.00.0
1112151.00.01.00.00.01.00.00.01.02.0...0.00.00.00.00.01.00.00.00.01.0
\n", 604 | "

5 rows × 36 columns

\n", 605 | "
" 606 | ], 607 | "text/plain": [ 608 | " rides_next_1_hour rides_next_2_hour rides_next_3_hour \\\n", 609 | "111211 0.0 0.0 0.0 \n", 610 | "111212 1.0 0.0 2.0 \n", 611 | "111213 1.0 0.0 1.0 \n", 612 | "111214 1.0 0.0 0.0 \n", 613 | "111215 1.0 0.0 1.0 \n", 614 | "\n", 615 | " rides_next_4_hour rides_next_5_hour rides_next_6_hour \\\n", 616 | "111211 0.0 0.0 1.0 \n", 617 | "111212 0.0 0.0 0.0 \n", 618 | "111213 0.0 0.0 0.0 \n", 619 | "111214 0.0 0.0 1.0 \n", 620 | "111215 0.0 0.0 1.0 \n", 621 | "\n", 622 | " rides_next_7_hour rides_next_8_hour rides_next_9_hour \\\n", 623 | "111211 0.0 0.0 1.0 \n", 624 | "111212 0.0 0.0 1.0 \n", 625 | "111213 0.0 2.0 3.0 \n", 626 | "111214 0.0 0.0 2.0 \n", 627 | "111215 0.0 0.0 1.0 \n", 628 | "\n", 629 | " rides_next_10_hour ... rides_next_27_hour rides_next_28_hour \\\n", 630 | "111211 0.0 ... 2.0 0.0 \n", 631 | "111212 0.0 ... 1.0 0.0 \n", 632 | "111213 0.0 ... 0.0 0.0 \n", 633 | "111214 0.0 ... 1.0 0.0 \n", 634 | "111215 2.0 ... 0.0 0.0 \n", 635 | "\n", 636 | " rides_next_29_hour rides_next_30_hour rides_next_31_hour \\\n", 637 | "111211 0.0 0.0 0.0 \n", 638 | "111212 0.0 0.0 0.0 \n", 639 | "111213 0.0 1.0 0.0 \n", 640 | "111214 0.0 1.0 0.0 \n", 641 | "111215 0.0 0.0 0.0 \n", 642 | "\n", 643 | " rides_next_32_hour rides_next_33_hour rides_next_34_hour \\\n", 644 | "111211 0.0 1.0 0.0 \n", 645 | "111212 2.0 3.0 0.0 \n", 646 | "111213 0.0 2.0 0.0 \n", 647 | "111214 0.0 1.0 2.0 \n", 648 | "111215 1.0 0.0 0.0 \n", 649 | "\n", 650 | " rides_next_35_hour rides_next_36_hour \n", 651 | "111211 2.0 0.0 \n", 652 | "111212 3.0 0.0 \n", 653 | "111213 1.0 1.0 \n", 654 | "111214 0.0 0.0 \n", 655 | "111215 0.0 1.0 \n", 656 | "\n", 657 | "[5 rows x 36 columns]" 658 | ] 659 | }, 660 | "execution_count": 10, 661 | "metadata": {}, 662 | "output_type": "execute_result" 663 | } 664 | ], 665 | "source": [ 666 | "targets.tail()" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 11, 672 | "metadata": {}, 673 | "outputs": [], 674 | "source": [ 675 | "import pandas as pd\n", 676 | "#tabular_data = features\n", 677 | "#tabular_data = targets\n", 678 | "tabular_data = pd.concat([features, targets], axis=1)\n", 679 | "\n", 680 | "from src.paths import TRANSFORMED_DATA_DIR\n", 681 | "tabular_data.to_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 12, 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "data": { 691 | "text/html": [ 692 | "
\n", 693 | "\n", 706 | "\n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | "
rides_previous_672_hourrides_previous_671_hourrides_previous_670_hourrides_previous_669_hourrides_previous_668_hourrides_previous_667_hourrides_previous_666_hourrides_previous_665_hourrides_previous_664_hourrides_previous_663_hour...rides_next_27_hourrides_next_28_hourrides_next_29_hourrides_next_30_hourrides_next_31_hourrides_next_32_hourrides_next_33_hourrides_next_34_hourrides_next_35_hourrides_next_36_hour
01.01.01.00.01.00.01.00.00.00.0...1.00.00.01.00.00.02.00.00.010.0
11.00.01.00.00.00.00.00.00.01.0...4.00.00.00.01.02.05.02.02.02.0
20.01.02.00.00.02.01.01.01.02.0...0.00.00.00.02.00.05.04.01.08.0
34.02.02.01.00.00.00.02.01.04.0...2.01.01.00.01.01.07.02.02.03.0
40.00.01.00.00.00.03.01.03.02.0...0.00.00.00.00.00.02.00.01.04.0
\n", 856 | "

5 rows × 710 columns

\n", 857 | "
" 858 | ], 859 | "text/plain": [ 860 | " rides_previous_672_hour rides_previous_671_hour rides_previous_670_hour \\\n", 861 | "0 1.0 1.0 1.0 \n", 862 | "1 1.0 0.0 1.0 \n", 863 | "2 0.0 1.0 2.0 \n", 864 | "3 4.0 2.0 2.0 \n", 865 | "4 0.0 0.0 1.0 \n", 866 | "\n", 867 | " rides_previous_669_hour rides_previous_668_hour rides_previous_667_hour \\\n", 868 | "0 0.0 1.0 0.0 \n", 869 | "1 0.0 0.0 0.0 \n", 870 | "2 0.0 0.0 2.0 \n", 871 | "3 1.0 0.0 0.0 \n", 872 | "4 0.0 0.0 0.0 \n", 873 | "\n", 874 | " rides_previous_666_hour rides_previous_665_hour rides_previous_664_hour \\\n", 875 | "0 1.0 0.0 0.0 \n", 876 | "1 0.0 0.0 0.0 \n", 877 | "2 1.0 1.0 1.0 \n", 878 | "3 0.0 2.0 1.0 \n", 879 | "4 3.0 1.0 3.0 \n", 880 | "\n", 881 | " rides_previous_663_hour ... rides_next_27_hour rides_next_28_hour \\\n", 882 | "0 0.0 ... 1.0 0.0 \n", 883 | "1 1.0 ... 4.0 0.0 \n", 884 | "2 2.0 ... 0.0 0.0 \n", 885 | "3 4.0 ... 2.0 1.0 \n", 886 | "4 2.0 ... 0.0 0.0 \n", 887 | "\n", 888 | " rides_next_29_hour rides_next_30_hour rides_next_31_hour \\\n", 889 | "0 0.0 1.0 0.0 \n", 890 | "1 0.0 0.0 1.0 \n", 891 | "2 0.0 0.0 2.0 \n", 892 | "3 1.0 0.0 1.0 \n", 893 | "4 0.0 0.0 0.0 \n", 894 | "\n", 895 | " rides_next_32_hour rides_next_33_hour rides_next_34_hour \\\n", 896 | "0 0.0 2.0 0.0 \n", 897 | "1 2.0 5.0 2.0 \n", 898 | "2 0.0 5.0 4.0 \n", 899 | "3 1.0 7.0 2.0 \n", 900 | "4 0.0 2.0 0.0 \n", 901 | "\n", 902 | " rides_next_35_hour rides_next_36_hour \n", 903 | "0 0.0 10.0 \n", 904 | "1 2.0 2.0 \n", 905 | "2 1.0 8.0 \n", 906 | "3 2.0 3.0 \n", 907 | "4 1.0 4.0 \n", 908 | "\n", 909 | "[5 rows x 710 columns]" 910 | ] 911 | }, 912 | "execution_count": 12, 913 | "metadata": {}, 914 | "output_type": "execute_result" 915 | } 916 | ], 917 | "source": [ 918 | "tabular_data.head()" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": 13, 924 | "metadata": {}, 925 | "outputs": [ 926 | { 927 | "data": { 928 | "text/plain": [ 929 | "array([0], dtype=int64)" 930 | ] 931 | }, 932 | "execution_count": 13, 933 | "metadata": {}, 934 | "output_type": "execute_result" 935 | } 936 | ], 937 | "source": [ 938 | "import pandas as pd\n", 939 | "nulos = pd.DataFrame(tabular_data.isnull().sum(), columns=['Nulos'])\n", 940 | "nulos.Nulos.unique()" 941 | ] 942 | } 943 | ], 944 | "metadata": { 945 | "kernelspec": { 946 | "display_name": ".venv", 947 | "language": "python", 948 | "name": "python3" 949 | }, 950 | "language_info": { 951 | "codemirror_mode": { 952 | "name": "ipython", 953 | "version": 3 954 | }, 955 | "file_extension": ".py", 956 | "mimetype": "text/x-python", 957 | "name": "python", 958 | "nbconvert_exporter": "python", 959 | "pygments_lexer": "ipython3", 960 | "version": "3.9.13" 961 | }, 962 | "orig_nbformat": 4, 963 | "vscode": { 964 | "interpreter": { 965 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 966 | } 967 | } 968 | }, 969 | "nbformat": 4, 970 | "nbformat_minor": 2 971 | } 972 | -------------------------------------------------------------------------------- /notebooks/11_backfill_feature_store.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "HOPSWORKS_PROJECT_NAME = 'bike_sharing_demand'" 20 | ] 21 | }, 22 | { 23 | 
"cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import os\n", 29 | "from dotenv import load_dotenv\n", 30 | "from src.paths import PARENT_DIR\n", 31 | "\n", 32 | "# load key-value pairs from .env file located in the parent directory\n", 33 | "load_dotenv(PARENT_DIR / '.env')\n", 34 | "\n", 35 | "HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Downloading raw data from 2022 to 2023\n", 48 | "File 2022 was already in local storage\n", 49 | "File 2023 was already in local storage\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "from datetime import datetime\n", 55 | "import pandas as pd\n", 56 | "from src.data import load_raw_data\n", 57 | "\n", 58 | "from_year = 2022\n", 59 | "to_year = datetime.now().year\n", 60 | "print(f'Downloading raw data from {from_year} to {to_year}')\n", 61 | "\n", 62 | "rides = pd.DataFrame()\n", 63 | "for year in range(from_year, to_year+1):\n", 64 | " \n", 65 | " # download data for the whole year\n", 66 | " rides_one_year = load_raw_data(year)\n", 67 | " \n", 68 | " # append rows\n", 69 | " rides = pd.concat([rides, rides_one_year])" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "len(rides)=4829258\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "print(f'{len(rides)=}')" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "Timestamp('2023-09-30 23:58:56')" 98 | ] 99 | }, 100 | "execution_count": 6, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "rides.pickup_datetime.max()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 7, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "100%|██████████| 362/362 [00:09<00:00, 39.72it/s]\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "from src.data import transform_raw_data_into_ts_data\n", 124 | "\n", 125 | "ts_data = transform_raw_data_into_ts_data(rides)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 8, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/html": [ 136 | "
\n", 137 | "\n", 150 | "\n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | "
pickup_hourridespickup_location_id
50675422023-08-31 14:00:00157
50675432023-08-31 15:00:00357
50675442023-08-31 16:00:00157
50675452023-08-31 17:00:00157
50675462023-08-31 18:00:00057
............
50682672023-09-30 19:00:00057
50682682023-09-30 20:00:00057
50682692023-09-30 21:00:00057
50682702023-09-30 22:00:00357
50682712023-09-30 23:00:00057
\n", 228 | "

730 rows × 3 columns

\n", 229 | "
" 230 | ], 231 | "text/plain": [ 232 | " pickup_hour rides pickup_location_id\n", 233 | "5067542 2023-08-31 14:00:00 1 57\n", 234 | "5067543 2023-08-31 15:00:00 3 57\n", 235 | "5067544 2023-08-31 16:00:00 1 57\n", 236 | "5067545 2023-08-31 17:00:00 1 57\n", 237 | "5067546 2023-08-31 18:00:00 0 57\n", 238 | "... ... ... ...\n", 239 | "5068267 2023-09-30 19:00:00 0 57\n", 240 | "5068268 2023-09-30 20:00:00 0 57\n", 241 | "5068269 2023-09-30 21:00:00 0 57\n", 242 | "5068270 2023-09-30 22:00:00 3 57\n", 243 | "5068271 2023-09-30 23:00:00 0 57\n", 244 | "\n", 245 | "[730 rows x 3 columns]" 246 | ] 247 | }, 248 | "execution_count": 8, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "ts_data.tail(730)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 9, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "name": "stdout", 264 | "output_type": "stream", 265 | "text": [ 266 | "3.4.2\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "import hopsworks\n", 272 | "\n", 273 | "print(hopsworks.__version__)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 10, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "Connected. Call `.close()` to terminate connection gracefully.\n", 286 | "\n", 287 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "project = hopsworks.login(\n", 293 | " project=HOPSWORKS_PROJECT_NAME,\n", 294 | " api_key_value=HOPSWORKS_API_KEY\n", 295 | ")" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 11, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "Connected. Call `.close()` to terminate connection gracefully.\n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "feature_store = project.get_feature_store()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 12, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'\n", 322 | "FEATURE_GROUP_VERSION = 1" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 13, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "feature_group = feature_store.get_or_create_feature_group(\n", 332 | " name=FEATURE_GROUP_NAME,\n", 333 | " version=FEATURE_GROUP_VERSION,\n", 334 | " description=\"Time-series data at hourly frequency\",\n", 335 | " primary_key = ['pickup_location_id', 'pickup_hour'],\n", 336 | " event_time='pickup_hour',\n", 337 | ")" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 14, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "Feature Group created successfully, explore it at \n", 350 | "https://c.app.hopsworks.ai:443/p/100501/fs/100420/fg/280937\n" 351 | ] 352 | }, 353 | { 354 | "data": { 355 | "application/vnd.jupyter.widget-view+json": { 356 | "model_id": "56be964fb30c4be0be2c4de6982554c1", 357 | "version_major": 2, 358 | "version_minor": 0 359 | }, 360 | "text/plain": [ 361 | "Uploading Dataframe: 0.00% | | Rows 0/5068272 | Elapsed Time: 00:00 | Remaining Time: ?" 
362 | ] 363 | }, 364 | "metadata": {}, 365 | "output_type": "display_data" 366 | }, 367 | { 368 | "name": "stdout", 369 | "output_type": "stream", 370 | "text": [ 371 | "Launching job: time_series_hourly_feature_group_1_offline_fg_materialization\n", 372 | "Job started successfully, you can follow the progress at \n", 373 | "https://c.app.hopsworks.ai/p/100501/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions\n" 374 | ] 375 | }, 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "(, None)" 380 | ] 381 | }, 382 | "execution_count": 14, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "feature_group.insert(ts_data, write_options={\"wait_for_job\": False})" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [] 397 | } 398 | ], 399 | "metadata": { 400 | "kernelspec": { 401 | "display_name": ".venv", 402 | "language": "python", 403 | "name": "python3" 404 | }, 405 | "language_info": { 406 | "codemirror_mode": { 407 | "name": "ipython", 408 | "version": 3 409 | }, 410 | "file_extension": ".py", 411 | "mimetype": "text/x-python", 412 | "name": "python", 413 | "nbconvert_exporter": "python", 414 | "pygments_lexer": "ipython3", 415 | "version": "3.9.13" 416 | }, 417 | "orig_nbformat": 4, 418 | "vscode": { 419 | "interpreter": { 420 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 421 | } 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 2 426 | } 427 | -------------------------------------------------------------------------------- /notebooks/12_feature_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import src.config as config" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "current_date=Timestamp('2025-03-06 09:00:00')\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "from datetime import datetime, timedelta\n", 37 | "\n", 38 | "import pandas as pd\n", 39 | "\n", 40 | "current_date = pd.to_datetime(datetime.utcnow()).floor('H')\n", 41 | "print(f'{current_date=}')\n", 42 | "\n", 43 | "# we fetch raw data for the last 28 days, to add redundancy to our data pipeline\n", 44 | "fetch_data_to = current_date\n", 45 | "fetch_data_from = current_date - timedelta(days=70) #28" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "from src.data import load_raw_data\n", 55 | "\n", 56 | "def fetch_batch_raw_data(from_date: datetime, to_date: datetime) -> pd.DataFrame:\n", 57 | " \"\"\"\n", 58 | " Simulate production data by sampling historical data from 52 weeks ago (i.e. 
1 year)\n", 59 | " \"\"\"\n", 60 | " from_date_ = from_date - timedelta(days=7*52)\n", 61 | " to_date_ = to_date - timedelta(days=7*52)\n", 62 | " print(f'{from_date=}, {to_date_=}')\n", 63 | "\n", 64 | " # # download 2 files from website\n", 65 | " # rides = load_raw_data(year=from_date_.year) #, months=from_date_.month)\n", 66 | " # rides = rides[(rides.pickup_datetime >= from_date_) & (rides.pickup_datetime < to_date_)]\n", 67 | " \n", 68 | " # rides_2 = load_raw_data(year=to_date_.year) #, months=to_date_.month)\n", 69 | " # rides_2 = rides_2[(rides_2.pickup_datetime < to_date_) & (rides_2.pickup_datetime < to_date_)]\n", 70 | "\n", 71 | " # rides = pd.concat([rides, rides_2]) \n", 72 | "\n", 73 | " # Intenta cargar los datos del primer año\n", 74 | " rides = load_raw_data(year=from_date_.year)\n", 75 | " rides = rides[(rides.pickup_datetime >= from_date_) & (rides.pickup_datetime < to_date_)]\n", 76 | "\n", 77 | "\n", 78 | " ### FIX 1 ----- Esto hago que no se ejecute para probar si es el error\n", 79 | " # # Verifica si los años son diferentes\n", 80 | " # if from_date_.year != to_date_.year:\n", 81 | " # # Carga los datos del segundo año\n", 82 | " # rides_2 = load_raw_data(year=to_date_.year)\n", 83 | " # rides_2 = rides_2[(rides_2.pickup_datetime >= from_date_) & (rides_2.pickup_datetime < to_date_)]\n", 84 | " # rides = pd.concat([rides, rides_2]) \n", 85 | "\n", 86 | "\n", 87 | " # shift the data to pretend this is recent data\n", 88 | " rides['pickup_datetime'] += timedelta(days=7*52)\n", 89 | "\n", 90 | " rides.sort_values(by=['pickup_location_id', 'pickup_datetime'], inplace=True)\n", 91 | "\n", 92 | " return rides" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "from_date=Timestamp('2024-12-26 09:00:00'), to_date_=Timestamp('2024-03-07 09:00:00')\n", 105 | "File 2023 was already in local storage\n", 106 | "File 2024 was already in local storage\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "rides = fetch_batch_raw_data(from_date=fetch_data_from, to_date=fetch_data_to)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "356" 123 | ] 124 | }, 125 | "execution_count": 7, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "len(rides.pickup_location_id.unique())" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 8, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stderr", 141 | "output_type": "stream", 142 | "text": [ 143 | "100%|██████████| 356/356 [00:02<00:00, 176.17it/s]\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "from src.data import transform_raw_data_into_ts_data\n", 149 | "ts_data = transform_raw_data_into_ts_data(rides)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/html": [ 160 | "
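
fetch_batch_raw_data above simulates a live feed: it shifts the requested window back by 7*52 = 364 days (exactly 52 weeks, so weekdays stay aligned, which is why the docstring's "1 year" is really 364 days), filters last year's rides, and then adds the same offset back to pickup_datetime so the sampled rides look current. A small sketch of just the date arithmetic; to_date_ matches the value printed above, while the notebook's print statement shows the unshifted from_date:

from datetime import timedelta
import pandas as pd

current_date = pd.Timestamp('2025-03-06 09:00:00')   # the floored "now" printed earlier
fetch_data_from = current_date - timedelta(days=70)  # 2024-12-26 09:00:00
fetch_data_to = current_date

shift = timedelta(days=7 * 52)                       # 364 days = 52 whole weeks
from_date_ = fetch_data_from - shift                 # 2023-12-28 09:00:00
to_date_ = fetch_data_to - shift                     # 2024-03-07 09:00:00

# after filtering the historical rides between from_date_ and to_date_,
# the notebook shifts them forward again to pretend they are fresh:
# rides['pickup_datetime'] += shift
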
\n", 161 | "\n", 174 | "\n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | "
pickup_datetimepickup_location_idpickup_hour
17305832024-10-22 08:10:1322024-10-22 08:00:00
18972912024-10-22 08:26:5422024-10-22 08:00:00
18360142024-10-22 08:29:1822024-10-22 08:00:00
19161652024-10-22 08:35:4322024-10-22 08:00:00
19214882024-10-22 08:38:1122024-10-22 08:00:00
\n", 216 | "
" 217 | ], 218 | "text/plain": [ 219 | " pickup_datetime pickup_location_id pickup_hour\n", 220 | "1730583 2024-10-22 08:10:13 2 2024-10-22 08:00:00\n", 221 | "1897291 2024-10-22 08:26:54 2 2024-10-22 08:00:00\n", 222 | "1836014 2024-10-22 08:29:18 2 2024-10-22 08:00:00\n", 223 | "1916165 2024-10-22 08:35:43 2 2024-10-22 08:00:00\n", 224 | "1921488 2024-10-22 08:38:11 2 2024-10-22 08:00:00" 225 | ] 226 | }, 227 | "execution_count": 9, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "rides.head()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 10, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/html": [ 244 | "
\n", 245 | "\n", 258 | "\n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | "
pickup_hourridespickup_location_id
5980752024-12-31 02:00:000362
5980762024-12-31 03:00:000362
5980772024-12-31 04:00:000362
5980782024-12-31 05:00:000362
5980792024-12-31 06:00:000362
\n", 300 | "
" 301 | ], 302 | "text/plain": [ 303 | " pickup_hour rides pickup_location_id\n", 304 | "598075 2024-12-31 02:00:00 0 362\n", 305 | "598076 2024-12-31 03:00:00 0 362\n", 306 | "598077 2024-12-31 04:00:00 0 362\n", 307 | "598078 2024-12-31 05:00:00 0 362\n", 308 | "598079 2024-12-31 06:00:00 0 362" 309 | ] 310 | }, 311 | "execution_count": 10, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "ts_data.tail()" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 11, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "2024-12-31 08:19:29,320 INFO: Initializing external client\n", 330 | "2024-12-31 08:19:29,323 INFO: Base URL: https://c.app.hopsworks.ai:443\n", 331 | "2024-12-31 08:19:34,011 INFO: Python Engine initialized.\n", 332 | "\n", 333 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n" 334 | ] 335 | } 336 | ], 337 | "source": [ 338 | "import hopsworks\n", 339 | "\n", 340 | "# connect to the project\n", 341 | "project = hopsworks.login(\n", 342 | " project=config.HOPSWORKS_PROJECT_NAME,\n", 343 | " api_key_value=config.HOPSWORKS_API_KEY\n", 344 | ")\n", 345 | "\n", 346 | "# connect to the feature store\n", 347 | "feature_store = project.get_feature_store()\n", 348 | "\n", 349 | "# connect to the feature group\n", 350 | "feature_group = feature_store.get_or_create_feature_group(\n", 351 | " name=config.FEATURE_GROUP_NAME,\n", 352 | " version=config.FEATURE_GROUP_VERSION,\n", 353 | " description=\"Time-series data at hourly frequency\",\n", 354 | " primary_key = ['pickup_location_id', 'pickup_hour'],\n", 355 | " event_time='pickup_hour',\n", 356 | ")" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 12, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stderr", 366 | "output_type": "stream", 367 | "text": [ 368 | "Uploading Dataframe: 100.00% |██████████| Rows 530880/530880 | Elapsed Time: 03:18 | Remaining Time: 00:00\n" 369 | ] 370 | }, 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "Launching job: time_series_hourly_feature_group_1_offline_fg_materialization\n", 376 | "Job started successfully, you can follow the progress at \n", 377 | "https://c.app.hopsworks.ai:443/p/100501/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions\n", 378 | "2024-12-31 08:23:09,205 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED\n", 379 | "2024-12-31 08:23:15,859 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED\n", 380 | "2024-12-31 08:26:32,549 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. 
Final status: SUCCEEDED\n", 381 | "2024-12-31 08:26:32,685 INFO: Waiting for log aggregation to finish.\n", 382 | "2024-12-31 08:27:16,560 INFO: Execution finished successfully.\n" 383 | ] 384 | }, 385 | { 386 | "data": { 387 | "text/plain": [ 388 | "(Job('time_series_hourly_feature_group_1_offline_fg_materialization', 'SPARK'),\n", 389 | " None)" 390 | ] 391 | }, 392 | "execution_count": 12, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "feature_group.insert(ts_data, write_options={\"wait_for_job\": True})" 399 | ] 400 | } 401 | ], 402 | "metadata": { 403 | "kernelspec": { 404 | "display_name": ".venv", 405 | "language": "python", 406 | "name": "python3" 407 | }, 408 | "language_info": { 409 | "codemirror_mode": { 410 | "name": "ipython", 411 | "version": 3 412 | }, 413 | "file_extension": ".py", 414 | "mimetype": "text/x-python", 415 | "name": "python", 416 | "nbconvert_exporter": "python", 417 | "pygments_lexer": "ipython3", 418 | "version": "3.9.13" 419 | }, 420 | "orig_nbformat": 4, 421 | "vscode": { 422 | "interpreter": { 423 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 424 | } 425 | } 426 | }, 427 | "nbformat": 4, 428 | "nbformat_minor": 2 429 | } 430 | -------------------------------------------------------------------------------- /notebooks/14_inference_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "current_date=Timestamp('2024-03-25 20:00:00+0000', tz='UTC')\n" 23 | ] 24 | }, 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "Timestamp('2024-03-25 20:00:00+0000', tz='UTC')" 29 | ] 30 | }, 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "from datetime import datetime, timedelta\n", 38 | "import pandas as pd\n", 39 | "\n", 40 | "current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H') # - timedelta(hours=1)\n", 41 | "print(f'{current_date=}')\n", 42 | "#current_date = pd.Timestamp('2023-11-10 10:00:00+0000', tz='UTC')\n", 43 | "current_date" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Connected. Call `.close()` to terminate connection gracefully.\n", 56 | "\n", 57 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n", 58 | "Connected. 
Call `.close()` to terminate connection gracefully.\n", 59 | "Fetching data from 2024-02-26 20:00:00+00:00 to 2024-03-25 19:00:00+00:00\n", 60 | "Finished: Reading data from Hopsworks, using ArrowFlight (13.50s) \n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "from src.inference import load_batch_of_features_from_store\n", 66 | "\n", 67 | "features = load_batch_of_features_from_store(current_date)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stderr", 77 | "output_type": "stream", 78 | "text": [ 79 | "DeprecationWarning: Using 'method_whitelist' with Retry is deprecated and will be removed in v2.0. Use 'allowed_methods' instead\n", 80 | "\u001b[1;38;5;214mCOMET WARNING:\u001b[0m This method has been deprecated, we recommend using the api.get_model(...) method to get the Model Object and then using model.download(...)\n", 81 | "\u001b[1;38;5;39mCOMET INFO:\u001b[0m Downloading registry model 'bike_demand_predictor_next_hour', version '1.1.0', stage None from workspace 'javieryanzon'...\n", 82 | "\u001b[1;38;5;39mCOMET INFO:\u001b[0m Unzipping model to 'C:\\\\Users\\\\jayan\\\\Desktop\\\\Python, SQL, Power Bi, cursos\\\\Proyectos\\\\bike_sharing_demand_predictor\\\\models' ...\n", 83 | "\u001b[1;38;5;39mCOMET INFO:\u001b[0m done!\n" 84 | ] 85 | }, 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Connection closed.\n", 91 | "Connected. Call `.close()` to terminate connection gracefully.\n", 92 | "\n", 93 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n", 94 | "Connected. Call `.close()` to terminate connection gracefully.\n" 95 | ] 96 | }, 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "VersionWarning: No version provided for getting feature view `latitud_y_longitud_view`, defaulting to `1`.\n" 102 | ] 103 | }, 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "Finished: Reading data from Hopsworks, using ArrowFlight (0.69s) \n", 109 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 110 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 111 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 112 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 113 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 114 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 115 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 116 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 117 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. 
Current value: feature_fraction=0.39920038588970796\n", 118 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 119 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 120 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 121 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 122 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 123 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 124 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 125 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 126 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 127 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 128 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 129 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 130 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 131 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 132 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n" 133 | ] 134 | }, 135 | { 136 | "name": "stderr", 137 | "output_type": "stream", 138 | "text": [ 139 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 140 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 141 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 142 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 143 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 144 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 145 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 146 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. 
(Deprecated NumPy 1.25)\n", 147 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 148 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 149 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 150 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 151 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 152 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 153 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 154 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 155 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 156 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 157 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 158 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 159 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 160 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 161 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 162 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 163 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 164 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 165 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 166 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 167 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 168 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 169 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 170 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 171 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 172 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 173 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 174 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 175 | "DeprecationWarning: np.find_common_type is deprecated. 
Please use `np.result_type` or `np.promote_types`.\n", 176 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 177 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 178 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 179 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 180 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 181 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 182 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 183 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 184 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n" 185 | ] 186 | }, 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 192 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 193 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 194 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 195 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 196 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 197 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 198 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 199 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 200 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 201 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 202 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 203 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 204 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. 
Current value: bagging_fraction=0.7983723942570424\n", 205 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 206 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 207 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 208 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 209 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 210 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 211 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 212 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 213 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 214 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 215 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 216 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 217 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 218 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 219 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 220 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 221 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 222 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 223 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 224 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 225 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. 
Current value: feature_fraction=0.39920038588970796\n", 226 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 227 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 228 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 229 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 230 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 231 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 232 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 233 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 234 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 235 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 236 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n", 237 | "[LightGBM] [Warning] feature_fraction is set=0.39920038588970796, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.39920038588970796\n", 238 | "[LightGBM] [Warning] bagging_fraction is set=0.7983723942570424, subsample=1.0 will be ignored. Current value: bagging_fraction=0.7983723942570424\n" 239 | ] 240 | }, 241 | { 242 | "name": "stderr", 243 | "output_type": "stream", 244 | "text": [ 245 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 246 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 247 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 248 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 249 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 250 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 251 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 252 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 253 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 254 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 255 | "DeprecationWarning: np.find_common_type is deprecated. 
Please use `np.result_type` or `np.promote_types`.\n", 256 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 257 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 258 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 259 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 260 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 261 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 262 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 263 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 264 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 265 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 266 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 267 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 268 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n", 269 | "DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.\n", 270 | "See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "##NUEVO\n", 276 | "##SEGUIR PROBANDO esta guardando el modelo como un array y creo que esta mal\n", 277 | "from src.model_registry_api import get_latest_model_from_registry\n", 278 | "from src.inference import get_model_predictions\n", 279 | "\n", 280 | "model = get_latest_model_from_registry(model_name='bike_demand_predictor_next_hour', status= 'Production')\n", 281 | "predictions = get_model_predictions(model, features)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 5, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# from src.inference import (\n", 291 | "# load_model_from_registry,\n", 292 | "# get_model_predictions\n", 293 | "# )\n", 294 | "\n", 295 | "# model = load_model_from_registry()\n", 296 | "# predictions = get_model_predictions(model, features)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 5, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/html": [ 307 | "
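
The predictions dataframe rendered below has one row per station and 38 columns: 36 forecast horizons (rides_next_1_hour … rides_next_36_hour) plus pickup_location_id and the pickup_hour stamped by the predictions['pickup_hour'] = current_date line further down. A tiny, illustrative snippet for pulling a single horizon out of it (column and frame names as used in this notebook):

# e.g. next-hour forecast for station 5 (7.0 in the run captured below)
next_hour = predictions[['pickup_location_id', 'rides_next_1_hour']]
station_5 = next_hour.loc[next_hour['pickup_location_id'] == 5, 'rides_next_1_hour'].iloc[0]
print(station_5)
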
\n", 308 | "\n", 321 | "\n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 
593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | "
rides_next_1_hourrides_next_2_hourrides_next_3_hourrides_next_4_hourrides_next_5_hourrides_next_6_hourrides_next_7_hourrides_next_8_hourrides_next_9_hourrides_next_10_hour...rides_next_29_hourrides_next_30_hourrides_next_31_hourrides_next_32_hourrides_next_33_hourrides_next_34_hourrides_next_35_hourrides_next_36_hourpickup_location_idpickup_hour
01.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.01.01.022024-03-25 20:00:00+00:00
12.01.02.01.01.00.00.00.00.00.0...1.01.01.00.00.00.01.04.032024-03-25 20:00:00+00:00
21.01.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.01.042024-03-25 20:00:00+00:00
37.04.04.03.03.02.01.01.02.02.0...2.02.01.01.01.02.03.03.052024-03-25 20:00:00+00:00
42.01.01.01.01.00.00.00.00.00.0...1.01.01.00.00.00.01.01.062024-03-25 20:00:00+00:00
..................................................................
3151.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.01.01.04932024-03-25 20:00:00+00:00
3161.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.04942024-03-25 20:00:00+00:00
3170.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.01.04962024-03-25 20:00:00+00:00
3181.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.01.04972024-03-25 20:00:00+00:00
3191.01.00.00.00.00.00.00.00.01.0...0.00.00.00.00.00.01.01.04982024-03-25 20:00:00+00:00
\n", 615 | "

320 rows × 38 columns

\n", 616 | "
" 617 | ], 618 | "text/plain": [ 619 | " rides_next_1_hour rides_next_2_hour rides_next_3_hour \\\n", 620 | "0 1.0 0.0 0.0 \n", 621 | "1 2.0 1.0 2.0 \n", 622 | "2 1.0 1.0 0.0 \n", 623 | "3 7.0 4.0 4.0 \n", 624 | "4 2.0 1.0 1.0 \n", 625 | ".. ... ... ... \n", 626 | "315 1.0 0.0 0.0 \n", 627 | "316 1.0 0.0 0.0 \n", 628 | "317 0.0 0.0 0.0 \n", 629 | "318 1.0 0.0 0.0 \n", 630 | "319 1.0 1.0 0.0 \n", 631 | "\n", 632 | " rides_next_4_hour rides_next_5_hour rides_next_6_hour \\\n", 633 | "0 0.0 0.0 0.0 \n", 634 | "1 1.0 1.0 0.0 \n", 635 | "2 0.0 0.0 0.0 \n", 636 | "3 3.0 3.0 2.0 \n", 637 | "4 1.0 1.0 0.0 \n", 638 | ".. ... ... ... \n", 639 | "315 0.0 0.0 0.0 \n", 640 | "316 0.0 0.0 0.0 \n", 641 | "317 0.0 0.0 0.0 \n", 642 | "318 0.0 0.0 0.0 \n", 643 | "319 0.0 0.0 0.0 \n", 644 | "\n", 645 | " rides_next_7_hour rides_next_8_hour rides_next_9_hour \\\n", 646 | "0 0.0 0.0 0.0 \n", 647 | "1 0.0 0.0 0.0 \n", 648 | "2 0.0 0.0 0.0 \n", 649 | "3 1.0 1.0 2.0 \n", 650 | "4 0.0 0.0 0.0 \n", 651 | ".. ... ... ... \n", 652 | "315 0.0 0.0 0.0 \n", 653 | "316 0.0 0.0 0.0 \n", 654 | "317 0.0 0.0 0.0 \n", 655 | "318 0.0 0.0 0.0 \n", 656 | "319 0.0 0.0 0.0 \n", 657 | "\n", 658 | " rides_next_10_hour ... rides_next_29_hour rides_next_30_hour \\\n", 659 | "0 0.0 ... 0.0 0.0 \n", 660 | "1 0.0 ... 1.0 1.0 \n", 661 | "2 0.0 ... 0.0 0.0 \n", 662 | "3 2.0 ... 2.0 2.0 \n", 663 | "4 0.0 ... 1.0 1.0 \n", 664 | ".. ... ... ... ... \n", 665 | "315 0.0 ... 0.0 0.0 \n", 666 | "316 0.0 ... 0.0 0.0 \n", 667 | "317 0.0 ... 0.0 0.0 \n", 668 | "318 0.0 ... 0.0 0.0 \n", 669 | "319 1.0 ... 0.0 0.0 \n", 670 | "\n", 671 | " rides_next_31_hour rides_next_32_hour rides_next_33_hour \\\n", 672 | "0 0.0 0.0 0.0 \n", 673 | "1 1.0 0.0 0.0 \n", 674 | "2 0.0 0.0 0.0 \n", 675 | "3 1.0 1.0 1.0 \n", 676 | "4 1.0 0.0 0.0 \n", 677 | ".. ... ... ... \n", 678 | "315 0.0 0.0 0.0 \n", 679 | "316 0.0 0.0 0.0 \n", 680 | "317 0.0 0.0 0.0 \n", 681 | "318 0.0 0.0 0.0 \n", 682 | "319 0.0 0.0 0.0 \n", 683 | "\n", 684 | " rides_next_34_hour rides_next_35_hour rides_next_36_hour \\\n", 685 | "0 0.0 1.0 1.0 \n", 686 | "1 0.0 1.0 4.0 \n", 687 | "2 0.0 0.0 1.0 \n", 688 | "3 2.0 3.0 3.0 \n", 689 | "4 0.0 1.0 1.0 \n", 690 | ".. ... ... ... \n", 691 | "315 0.0 1.0 1.0 \n", 692 | "316 0.0 0.0 0.0 \n", 693 | "317 0.0 0.0 1.0 \n", 694 | "318 0.0 0.0 1.0 \n", 695 | "319 0.0 1.0 1.0 \n", 696 | "\n", 697 | " pickup_location_id pickup_hour \n", 698 | "0 2 2024-03-25 20:00:00+00:00 \n", 699 | "1 3 2024-03-25 20:00:00+00:00 \n", 700 | "2 4 2024-03-25 20:00:00+00:00 \n", 701 | "3 5 2024-03-25 20:00:00+00:00 \n", 702 | "4 6 2024-03-25 20:00:00+00:00 \n", 703 | ".. ... ... 
\n", 704 | "315 493 2024-03-25 20:00:00+00:00 \n", 705 | "316 494 2024-03-25 20:00:00+00:00 \n", 706 | "317 496 2024-03-25 20:00:00+00:00 \n", 707 | "318 497 2024-03-25 20:00:00+00:00 \n", 708 | "319 498 2024-03-25 20:00:00+00:00 \n", 709 | "\n", 710 | "[320 rows x 38 columns]" 711 | ] 712 | }, 713 | "execution_count": 5, 714 | "metadata": {}, 715 | "output_type": "execute_result" 716 | } 717 | ], 718 | "source": [ 719 | "predictions['pickup_hour'] = current_date\n", 720 | "predictions" 721 | ] 722 | }, 723 | { 724 | "attachments": {}, 725 | "cell_type": "markdown", 726 | "metadata": {}, 727 | "source": [ 728 | "### Save these predictions in the feature store, so they can be later consumed by our Streamlit app" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": 6, 734 | "metadata": {}, 735 | "outputs": [ 736 | { 737 | "name": "stdout", 738 | "output_type": "stream", 739 | "text": [ 740 | "Connection closed.\n", 741 | "Connected. Call `.close()` to terminate connection gracefully.\n", 742 | "\n", 743 | "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/100501\n", 744 | "Connected. Call `.close()` to terminate connection gracefully.\n" 745 | ] 746 | } 747 | ], 748 | "source": [ 749 | "from src.feature_store_api import get_feature_store\n", 750 | "import src.config as config\n", 751 | "\n", 752 | "# connect to the feature group\n", 753 | "feature_group = get_feature_store().get_or_create_feature_group(\n", 754 | " name=config.FEATURE_GROUP_MODEL_PREDICTIONS,\n", 755 | " version=1,\n", 756 | " description=\"Predictions generate by our production model\",\n", 757 | " primary_key = ['pickup_location_id', 'pickup_hour'],\n", 758 | " event_time='pickup_hour',\n", 759 | ")" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": 7, 765 | "metadata": {}, 766 | "outputs": [ 767 | { 768 | "data": { 769 | "application/vnd.jupyter.widget-view+json": { 770 | "model_id": "bc454aa3ed6c40fb88a8461cbd3e22c0", 771 | "version_major": 2, 772 | "version_minor": 0 773 | }, 774 | "text/plain": [ 775 | "Uploading Dataframe: 0.00% | | Rows 0/320 | Elapsed Time: 00:00 | Remaining Time: ?" 
776 | ] 777 | }, 778 | "metadata": {}, 779 | "output_type": "display_data" 780 | }, 781 | { 782 | "name": "stdout", 783 | "output_type": "stream", 784 | "text": [ 785 | "Launching job: model_predictions_feature_group_1_offline_fg_materialization\n", 786 | "Job started successfully, you can follow the progress at \n", 787 | "https://c.app.hopsworks.ai/p/100501/jobs/named/model_predictions_feature_group_1_offline_fg_materialization/executions\n" 788 | ] 789 | }, 790 | { 791 | "data": { 792 | "text/plain": [ 793 | "(, None)" 794 | ] 795 | }, 796 | "execution_count": 7, 797 | "metadata": {}, 798 | "output_type": "execute_result" 799 | } 800 | ], 801 | "source": [ 802 | "feature_group.insert(predictions, write_options={\"wait_for_job\": False})" 803 | ] 804 | } 805 | ], 806 | "metadata": { 807 | "kernelspec": { 808 | "display_name": ".venv", 809 | "language": "python", 810 | "name": "python3" 811 | }, 812 | "language_info": { 813 | "codemirror_mode": { 814 | "name": "ipython", 815 | "version": 3 816 | }, 817 | "file_extension": ".py", 818 | "mimetype": "text/x-python", 819 | "name": "python", 820 | "nbconvert_exporter": "python", 821 | "pygments_lexer": "ipython3", 822 | "version": "3.9.13" 823 | }, 824 | "orig_nbformat": 4, 825 | "vscode": { 826 | "interpreter": { 827 | "hash": "b98d97558a062384a76b0309256306c9ce5dd4e2074fe66c33532239207fc923" 828 | } 829 | } 830 | }, 831 | "nbformat": 4, 832 | "nbformat_minor": 2 833 | } 834 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "src" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["jayanra "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.9" 10 | python-dotenv = "^1.0.0" 11 | jupyter = "^1.0.0" 12 | requests = "^2.31.0" 13 | tqdm = "^4.66.1" 14 | plotly = "^5.16.1" 15 | scikit-learn = "^1.3.0" 16 | xgboost = "^1.7.6" 17 | lightgbm = "^4.0.0" 18 | optuna = "^3.3.0" 19 | wget = "^3.2" 20 | geopandas = "^0.14.0" 21 | streamlit = {version = "^1.28.0", python = ">=3.9,<3.9.7 || >3.9.7,<4.0"} 22 | pydeck = "^0.8.0" 23 | comet-ml = "^3.38.0" 24 | hopsworks = {version = "4.1.0", python = ">=3.9,<3.11"} 25 | confluent-kafka = "^2.6.1" 26 | 27 | 28 | [tool.poetry.group.dev.dependencies] 29 | ipykernel = "^6.25.1" 30 | 31 | [build-system] 32 | requires = ["poetry-core"] 33 | build-backend = "poetry.core.masonry.api" 34 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__init__.py -------------------------------------------------------------------------------- /src/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /src/__pycache__/data.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/data.cpython-39.pyc 
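
Before the remaining src/ modules, a compact script-style recap of the inference pipeline that notebook 14 above executes. This is a sketch only: it assumes the same helper functions and signatures used in that notebook (src.inference, src.model_registry_api, src.feature_store_api) and the constants from src/config.py shown below.

from datetime import datetime

import pandas as pd

import src.config as config
from src.feature_store_api import get_feature_store
from src.inference import load_batch_of_features_from_store, get_model_predictions
from src.model_registry_api import get_latest_model_from_registry

# 1. floor "now" to the hour, exactly as the notebook does
current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H')

# 2. fetch the last 28 days of hourly features per station from the feature store
features = load_batch_of_features_from_store(current_date)

# 3. pull the production model from the registry and predict the next hours
model = get_latest_model_from_registry(
    model_name='bike_demand_predictor_next_hour', status='Production'
)
predictions = get_model_predictions(model, features)
predictions['pickup_hour'] = current_date

# 4. persist the predictions so the monitoring / Streamlit apps can read them later
feature_group = get_feature_store().get_or_create_feature_group(
    name=config.FEATURE_GROUP_MODEL_PREDICTIONS,
    version=1,
    description="Predictions generated by our production model",
    primary_key=['pickup_location_id', 'pickup_hour'],
    event_time='pickup_hour',
)
feature_group.insert(predictions, write_options={"wait_for_job": False})
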
-------------------------------------------------------------------------------- /src/__pycache__/data_split.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/data_split.cpython-39.pyc -------------------------------------------------------------------------------- /src/__pycache__/model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/model.cpython-39.pyc -------------------------------------------------------------------------------- /src/__pycache__/paths.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/paths.cpython-39.pyc -------------------------------------------------------------------------------- /src/__pycache__/plot.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/javieryanzon/bike_sharing_demand_predictor/5c1b3f01a5da13e588127d7cc1f68580bd2868ec/src/__pycache__/plot.cpython-39.pyc -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | from src.paths import PARENT_DIR 5 | 6 | # load key-value pairs from .env file located in the parent directory 7 | load_dotenv(PARENT_DIR / '.env') 8 | 9 | HOPSWORKS_PROJECT_NAME = 'bike_sharing_demand' 10 | try: 11 | HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY'] 12 | except: 13 | raise Exception('Create an .env file on the project root with the HOPSWORKS_API_KEY') 14 | 15 | FEATURE_GROUP_NAME = 'time_series_hourly_feature_group' 16 | FEATURE_GROUP_VERSION = 1 17 | FEATURE_VIEW_NAME = 'time_series_hourly_feature_view' 18 | FEATURE_VIEW_VERSION = 1 19 | MODEL_NAME = "bike_demand_predictor_next_hour" 20 | MODEL_VERSION = 1 21 | 22 | #Agrego esto para que se consulte al feature store la latitud y longitud 23 | FEATURE_GROUP_LAT_LONG = 'latitud_y_longitud_group' 24 | FEATURE_VIEW_LAT_LONG = 'latitud_y_longitud_view' 25 | #FEATURE_VIEW_LAT_LONG_VERSION = 1 26 | 27 | 28 | # added for monitoring purposes 29 | FEATURE_GROUP_MODEL_PREDICTIONS = 'model_predictions_feature_group_' 30 | FEATURE_VIEW_MODEL_PREDICTIONS = 'model_predictions_feature_view_' 31 | FEATURE_VIEW_MONITORING = 'predictions_vs_actuals_for_monitoring_feature_view' 32 | 33 | # number of historical values our model needs to generate predictions 34 | N_FEATURES = 24 * 28 35 | 36 | # maximum Mean Absolute Error we allow our production model to have 37 | MAX_MAE = 4.0 -------------------------------------------------------------------------------- /src/data.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from datetime import datetime, timedelta 3 | from typing import Optional, List, Tuple 4 | from pdb import set_trace as stop 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import requests 9 | from tqdm import tqdm 10 | import pyarrow as pa 11 | import zipfile 12 | import pyarrow.parquet as pq 13 | import subprocess 14 | 
15 | from src.paths import RAW_DATA_DIR, TRANSFORMED_DATA_DIR 16 | 17 | 18 | def download_one_file_of_raw_data(year: int) -> Path: #, month: int) -> Path: 19 | """ 20 | Downloads Parquet file with historical bike rides for the given `year` and 21 | `month` 22 | """ 23 | URL = f'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/transporte-y-obras-publicas/bicicletas-publicas/recorridos-realizados-{year}.zip' 24 | 25 | # Ruta de destino para guardar el archivo descargado 26 | destination_path = RAW_DATA_DIR / f'recorridos-realizados-{year}.zip' 27 | 28 | try: 29 | # Utiliza wget para descargar el archivo en la ubicación deseada 30 | subprocess.run(['wget', URL, '-O', destination_path]) 31 | 32 | # Verifica si el archivo se descargó correctamente 33 | if destination_path.is_file(): 34 | print(f'Descargado año {year}') 35 | return destination_path 36 | else: 37 | raise Exception(f'Error al descargar {URL}: El archivo no se descargó correctamente.') 38 | 39 | except Exception as e: 40 | raise Exception(f'Error al descargar {URL}: {str(e)}') 41 | 42 | 43 | # response = requests.get(URL) 44 | 45 | # if response.status_code == 200: 46 | # path = RAW_DATA_DIR / f'recorridos-realizados-{year}.zip' 47 | # open(path, "wb").write(response.content) 48 | # print(f'descargado año {year}') 49 | # # time.sleep(2) 50 | # return path 51 | # else: 52 | # raise Exception(f'{URL} is not available') 53 | 54 | def unzip_and_convert_csv_to_parquet(year: int) -> Path: 55 | nombre_archivo_zip = RAW_DATA_DIR / f"recorridos-realizados-{year}.zip" 56 | # Descomprimir el archivo zip 57 | with zipfile.ZipFile(nombre_archivo_zip, 'r') as archivo_zip: 58 | 59 | # Extraer el archivo CSV del zip 60 | nombre_archivo_csv = archivo_zip.namelist()[0] # Suponiendo que el archivo CSV es el primer archivo en el zip 61 | archivo_zip.extractall(RAW_DATA_DIR) #(f"../data/raw/") 62 | 63 | # Leer el archivo CSV con pandas 64 | df = pd.read_csv(RAW_DATA_DIR / nombre_archivo_csv, delimiter=',', decimal=".") #RAW_DATA_DIR / 65 | 66 | # Convertir el DataFrame a formato parquet 67 | nombre_archivo_parquet = f"rides_{year}.parquet" 68 | table = pa.Table.from_pandas(df) 69 | pq.write_table(table, RAW_DATA_DIR / nombre_archivo_parquet) 70 | 71 | path = RAW_DATA_DIR / f'rides_{year}.parquet' 72 | return path 73 | 74 | 75 | def validate_raw_data( 76 | rides: pd.DataFrame, 77 | year: int, 78 | #month: int, 79 | ) -> pd.DataFrame: 80 | """ 81 | Removes rows with pickup_datetimes outside their valid range 82 | """ 83 | # keep only rides for this month 84 | # this_month_start = f'{year}-{month:02d}-01' 85 | # next_month_start = f'{year}-{month+1:02d}-01' if month < 12 else f'{year+1}-01-01' 86 | this_year_start = f'{year}-01-01' 87 | next_year_start = f'{year+1}-01-01' 88 | rides = rides[rides.pickup_datetime >= this_year_start] 89 | rides = rides[rides.pickup_datetime < next_year_start] 90 | 91 | return rides 92 | 93 | 94 | def fetch_ride_events_from_data_warehouse( 95 | from_date: datetime, 96 | to_date: datetime 97 | ) -> pd.DataFrame: 98 | """ 99 | This function is used to simulate production data by sampling historical data 100 | from 52 weeks ago (i.e. 
1 year) 101 | """ 102 | from_date_ = from_date - timedelta(days=7*52) 103 | to_date_ = to_date - timedelta(days=7*52) 104 | print(f'Fetching ride events from {from_date} to {to_date}') 105 | 106 | if (from_date_.year == to_date_.year) and (from_date_.month == to_date_.month): 107 | # download 1 file of data only 108 | rides = load_raw_data(year=from_date_.year, months=from_date_.month) 109 | rides = rides[rides.pickup_datetime >= from_date_] 110 | rides = rides[rides.pickup_datetime < to_date_] 111 | 112 | else: 113 | # download 2 files from website 114 | rides = load_raw_data(year=from_date_.year, months=from_date_.month) 115 | rides = rides[rides.pickup_datetime >= from_date_] 116 | rides_2 = load_raw_data(year=to_date_.year, months=to_date_.month) 117 | rides_2 = rides_2[rides_2.pickup_datetime < to_date_] 118 | rides = pd.concat([rides, rides_2]) 119 | 120 | # shift the pickup_datetime back 1 year ahead, to simulate production data 121 | # using its 7*52-days-ago value 122 | rides['pickup_datetime'] += timedelta(days=7*52) 123 | 124 | rides.sort_values(by=['pickup_location_id', 'pickup_datetime'], inplace=True) 125 | 126 | return rides 127 | 128 | 129 | def load_raw_data( 130 | year: int 131 | #months: Optional[List[int]] = None 132 | ) -> pd.DataFrame: 133 | """ 134 | Loads raw data from local storage or downloads it from the BsAs website, and 135 | then loads it into a Pandas DataFrame 136 | 137 | Args: 138 | year: year of the data to download 139 | #months: months of the data to download. If `None`, download all months 140 | 141 | Returns: 142 | pd.DataFrame: DataFrame with the following columns: 143 | - pickup_datetime: datetime of the pickup 144 | - pickup_location_id: ID of the pickup location 145 | """ 146 | rides = pd.DataFrame() 147 | 148 | # if months is None: 149 | # # download data for the entire year (all months) 150 | # months = list(range(1, 13)) 151 | # elif isinstance(months, int): 152 | # # download data only for the month specified by the int `month` 153 | # months = [months] 154 | 155 | #for month in months: 156 | 157 | local_file = RAW_DATA_DIR / f'rides_{year}.parquet' #-{month:02d}.parquet' 158 | if not local_file.exists(): 159 | try: 160 | # download the file from the BsAs website 161 | print(f'Downloading file {year}') #-{month:02d} 162 | download_one_file_of_raw_data(year) 163 | unzip_and_convert_csv_to_parquet(year) 164 | except: 165 | print(f'{year} file is not available') 166 | #continue 167 | else: 168 | print(f'File {year} was already in local storage') 169 | 170 | # load the file into Pandas 171 | rides_one_year = pd.read_parquet(local_file) 172 | 173 | # rename columns 174 | rides_one_year = rides_one_year[['fecha_origen_recorrido', 'id_estacion_origen']] 175 | rides_one_year.rename(columns={ 176 | 'fecha_origen_recorrido': 'pickup_datetime', 177 | 'id_estacion_origen': 'pickup_location_id', 178 | }, inplace=True) 179 | 180 | # eliminate "BAEcobici" and convert it to int type 181 | rides_one_year['pickup_location_id'] = rides_one_year['pickup_location_id'].str.replace('BAEcobici', '').astype(int) 182 | # transform "pickup_datetime" to datetime 183 | rides_one_year['pickup_datetime'] = pd.to_datetime(rides_one_year['pickup_datetime'],format='%Y-%m-%d %H:%M:%S') 184 | 185 | # validate the file 186 | rides_one_year = validate_raw_data(rides_one_year, year) 187 | 188 | # append to existing data 189 | rides = pd.concat([rides, rides_one_year]) 190 | 191 | if rides.empty: 192 | # no data, so we return an empty dataframe 193 | return pd.DataFrame() 194 | else: 
195 | # keep only time and origin of the ride
196 | rides = rides[['pickup_datetime', 'pickup_location_id']]
197 | return rides
198 |
199 |
200 | def add_missing_slots(ts_data: pd.DataFrame) -> pd.DataFrame:
201 | """
202 | Add necessary rows to the input 'ts_data' to make sure the output
203 | has a complete list of
204 | - pickup_hours
205 | - pickup_location_ids
206 | """
207 | # The old version generated location ids that do not exist, so it was replaced. Old version:
208 | #location_ids = range(1, ts_data['pickup_location_id'].max() + 1)
209 |
210 | # This is the modified line
211 | location_ids = ts_data['pickup_location_id'].unique()
212 |
213 | full_range = pd.date_range(ts_data['pickup_hour'].min(),
214 | ts_data['pickup_hour'].max(),
215 | freq='H')
216 | output = pd.DataFrame()
217 | for location_id in tqdm(location_ids):
218 |
219 | # keep only rides for this 'location_id'
220 | ts_data_i = ts_data.loc[ts_data.pickup_location_id == location_id, ['pickup_hour', 'rides']]
221 |
222 | if ts_data_i.empty:
223 | # add a dummy entry with a 0
224 | ts_data_i = pd.DataFrame.from_dict([
225 | {'pickup_hour': ts_data['pickup_hour'].max(), 'rides': 0}
226 | ])
227 |
228 | # quick way to add missing dates with 0 in a Series
229 | # taken from https://stackoverflow.com/a/19324591
230 | ts_data_i.set_index('pickup_hour', inplace=True)
231 | ts_data_i.index = pd.DatetimeIndex(ts_data_i.index)
232 | ts_data_i = ts_data_i.reindex(full_range, fill_value=0)
233 |
234 | # add back the `location_id` column
235 | ts_data_i['pickup_location_id'] = location_id
236 |
237 | output = pd.concat([output, ts_data_i])
238 |
239 | # move the pickup_hour from the index to a dataframe column
240 | output = output.reset_index().rename(columns={'index': 'pickup_hour'})
241 |
242 | return output
243 |
244 |
245 | def transform_raw_data_into_ts_data(
246 | rides: pd.DataFrame
247 | ) -> pd.DataFrame:
248 | """Aggregates raw ride events into an hourly time series of rides per pickup location."""
249 | # sum rides per location and pickup_hour
250 | rides['pickup_hour'] = rides['pickup_datetime'].dt.floor('H')
251 | agg_rides = rides.groupby(['pickup_hour', 'pickup_location_id']).size().reset_index()
252 | agg_rides.rename(columns={0: 'rides'}, inplace=True)
253 | # add rows for (locations, pickup_hours)s with 0 rides
254 | agg_rides_all_slots = add_missing_slots(agg_rides)
255 |
256 | # keep only the rows for stations that existed in 2022
257 | # This is done because the query simulation uses the 2022 data; if stations that are not present there appeared later, the model would learn incorrectly.
258 | estaciones_2022=[ 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 17, 20, 259 | 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 35, 260 | 36, 38, 41, 43, 44, 45, 46, 48, 49, 50, 51, 54, 56, 261 | 57, 58, 59, 60, 61, 63, 64, 65, 66, 68, 69, 70, 71, 262 | 73, 74, 75, 76, 77, 79, 80, 82, 83, 84, 85, 86, 87, 263 | 89, 91, 92, 93, 94, 95, 96, 98, 99, 101, 102, 104, 107, 264 | 111, 112, 114, 116, 117, 118, 120, 121, 122, 124, 126, 128, 130, 265 | 131, 132, 134, 135, 137, 138, 144, 146, 149, 150, 151, 152, 153, 266 | 155, 156, 158, 161, 162, 163, 164, 165, 166, 167, 168, 169, 171, 267 | 172, 174, 175, 176, 177, 179, 181, 182, 183, 184, 186, 187, 188, 268 | 189, 190, 191, 193, 194, 196, 197, 199, 200, 202, 203, 204, 205, 269 | 206, 207, 208, 210, 212, 213, 214, 215, 216, 219, 220, 222, 223, 270 | 227, 228, 229, 230, 231, 232, 234, 235, 236, 237, 239, 241, 242, 271 | 245, 247, 248, 249, 251, 252, 253, 254, 255, 257, 258, 259, 260, 272 | 261, 262, 263, 265, 267, 268, 269, 270, 271, 273, 275, 277, 278, 273 | 280, 281, 284, 289, 291, 294, 299, 301, 302, 304, 307, 308, 309, 274 | 310, 311, 316, 318, 322, 323, 324, 327, 329, 330, 333, 335, 336, 275 | 340, 342, 348, 349, 353, 355, 358, 359, 361, 362, 363, 364, 366, 276 | 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 378, 379, 381, 277 | 382, 383, 384, 385, 386, 387, 392, 393, 395, 400, 403, 407, 408, 278 | 412, 413, 416, 417, 418, 420, 422, 423, 424, 425, 426, 427, 428, 279 | 429, 431, 432, 433, 434, 435, 436, 440, 441, 442, 443, 444, 447, 280 | 448, 449, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 464, 281 | 465, 466, 467, 468, 469, 471, 472, 473, 474, 475, 476, 477, 478, 282 | 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 283 | 492, 493, 494, 496, 497, 498] 284 | 285 | agg_rides_all_slots = agg_rides_all_slots[agg_rides_all_slots['pickup_location_id'].isin(estaciones_2022)] 286 | 287 | return agg_rides_all_slots 288 | 289 | 290 | def transform_ts_data_into_features_and_target( 291 | ts_data: pd.DataFrame, 292 | input_seq_len: int, 293 | step_size: int, 294 | output_seq_len: int #Lo que agregué nuevo 295 | ) -> pd.DataFrame: 296 | """ 297 | Slices and transposes data from time-series format into a (features, target) 298 | format that we can use to train Supervised ML models 299 | """ 300 | assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'} 301 | 302 | location_ids = ts_data['pickup_location_id'].unique() 303 | features = pd.DataFrame() 304 | targets = pd.DataFrame() 305 | 306 | for location_id in tqdm(location_ids): 307 | 308 | # keep only ts data for this `location_id` 309 | ts_data_one_location = ts_data.loc[ 310 | ts_data.pickup_location_id == location_id, 311 | ['pickup_hour', 'rides'] 312 | ].sort_values(by=['pickup_hour']) 313 | 314 | # pre-compute cutoff indices to split dataframe rows 315 | indices = get_cutoff_indices_features_and_target( 316 | ts_data_one_location, 317 | input_seq_len, 318 | step_size, 319 | output_seq_len #Lo que agregué nuevo 320 | ) 321 | 322 | # slice and transpose data into numpy arrays for features and targets 323 | n_examples = len(indices) 324 | x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32) 325 | y = np.ndarray(shape=(n_examples, output_seq_len), dtype=np.float32) #Agregué el (output_seq_len) porque quiero esa cantidad de horas 326 | pickup_hours = [] 327 | for i, idx in enumerate(indices): 328 | x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values 329 | y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values 330 | 
pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour']) 331 | 332 | # numpy -> pandas 333 | features_one_location = pd.DataFrame( 334 | x, 335 | columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))] 336 | ) 337 | features_one_location['pickup_hour'] = pickup_hours 338 | features_one_location['pickup_location_id'] = location_id 339 | 340 | # numpy -> pandas 341 | targets_one_location = pd.DataFrame(y, columns=[f'rides_next_{i+1}_hour' for i in range(output_seq_len)]) 342 | 343 | # concatenate results 344 | features = pd.concat([features, features_one_location]) 345 | targets = pd.concat([targets, targets_one_location]) 346 | 347 | features.reset_index(inplace=True, drop=True) 348 | targets.reset_index(inplace=True, drop=True) 349 | 350 | return features, targets #['target_rides_next_hour'] 351 | 352 | 353 | def get_cutoff_indices_features_and_target( 354 | data: pd.DataFrame, 355 | input_seq_len: int, 356 | step_size: int, 357 | output_seq_len: int #Lo que agregué nuevo 358 | ) -> list: 359 | 360 | stop_position = len(data) - 1 361 | 362 | # Start the first sub-sequence at index position 0 363 | subseq_first_idx = 0 364 | subseq_mid_idx = input_seq_len 365 | subseq_last_idx = input_seq_len + output_seq_len #le agrego "output_seq_len" para introducirlo como variable 366 | indices = [] 367 | 368 | while subseq_last_idx <= stop_position: 369 | indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx)) 370 | subseq_first_idx += step_size 371 | subseq_mid_idx += step_size 372 | subseq_last_idx += step_size 373 | 374 | return indices 375 | 376 | #Agrego esto para transformar cualquier dataset a algo comparable con las predicciones 377 | def transform_ts_data_into_dataset_comparable_with_predictions( 378 | ts_data: pd.DataFrame, 379 | input_seq_len: int, 380 | step_size: int, 381 | output_seq_len: int #Lo que agregué nuevo 382 | ) -> pd.DataFrame: 383 | """ 384 | Slices and transposes data from time-series format into a (features, target) 385 | format that we can use to train Supervised ML models 386 | """ 387 | assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'} 388 | 389 | location_ids = ts_data['pickup_location_id'].unique() 390 | #features = pd.DataFrame() 391 | targets = pd.DataFrame() 392 | 393 | for location_id in tqdm(location_ids): 394 | 395 | # keep only ts data for this `location_id` 396 | ts_data_one_location = ts_data.loc[ 397 | ts_data.pickup_location_id == location_id, 398 | ['pickup_hour', 'rides'] 399 | ].sort_values(by=['pickup_hour']) 400 | 401 | # pre-compute cutoff indices to split dataframe rows 402 | indices = get_cutoff_indices_features_and_target( 403 | ts_data_one_location, 404 | input_seq_len, 405 | step_size, 406 | output_seq_len #Lo que agregué nuevo 407 | ) 408 | 409 | # slice and transpose data into numpy arrays for features and targets 410 | n_examples = len(indices) 411 | #x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32) 412 | y = np.ndarray(shape=(n_examples, output_seq_len), dtype=np.float32) #Agregué el (output_seq_len) porque quiero esa cantidad de horas 413 | pickup_hours = [] 414 | for i, idx in enumerate(indices): 415 | #x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values 416 | y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values 417 | pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour']) 418 | 419 | # numpy -> pandas 420 | # features_one_location = pd.DataFrame( 421 | # x, 422 | # columns=[f'rides_previous_{i+1}_hour' for i 
in reversed(range(input_seq_len))] 423 | # ) 424 | # features_one_location['pickup_hour'] = pickup_hours 425 | # features_one_location['pickup_location_id'] = location_id 426 | 427 | # numpy -> pandas 428 | targets_one_location = pd.DataFrame(y, columns=[f'real_rides_next_{i+1}_hour' for i in range(output_seq_len)]) 429 | targets_one_location['pickup_hour'] = pickup_hours 430 | targets_one_location['pickup_location_id'] = location_id 431 | 432 | # concatenate results 433 | #features = pd.concat([features, features_one_location]) 434 | targets = pd.concat([targets, targets_one_location]) 435 | 436 | #features.reset_index(inplace=True, drop=True) 437 | targets.reset_index(inplace=True, drop=True) 438 | 439 | return targets #['target_rides_next_hour'] #features, -------------------------------------------------------------------------------- /src/data_split.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Tuple 3 | 4 | import pandas as pd 5 | 6 | def train_test_split( 7 | df: pd.DataFrame, 8 | cutoff_date: datetime, 9 | targets_columns_names: list, 10 | ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: 11 | """ 12 | """ 13 | train_data = df[df.pickup_hour < cutoff_date].reset_index(drop=True) 14 | test_data = df[df.pickup_hour >= cutoff_date].reset_index(drop=True) 15 | 16 | X_train = train_data.drop(targets_columns_names, axis=1) 17 | y_train = train_data[targets_columns_names] 18 | X_test = test_data.drop(targets_columns_names, axis=1) 19 | y_test = test_data[targets_columns_names] 20 | 21 | return X_train, y_train, X_test, y_test -------------------------------------------------------------------------------- /src/feature_store_api.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import hsfs 3 | import hopsworks 4 | 5 | import src.config as config 6 | 7 | def get_feature_store() -> hsfs.feature_store.FeatureStore: 8 | """Connects to Hopsworks and returns a pointer to the feature store 9 | 10 | Returns: 11 | hsfs.feature_store.FeatureStore: pointer to the feature store 12 | """ 13 | #project = get_hopsworks_project() 14 | project = hopsworks.login( 15 | project=config.HOPSWORKS_PROJECT_NAME, 16 | api_key_value=config.HOPSWORKS_API_KEY 17 | ) 18 | return project.get_feature_store() 19 | 20 | def get_feature_group( 21 | name: str, 22 | version: Optional[int] = 1 23 | ) -> hsfs.feature_group.FeatureGroup: 24 | """Connects to the feature store and returns a pointer to the given 25 | feature group `name` 26 | 27 | Args: 28 | name (str): name of the feature group 29 | version (Optional[int], optional): _description_. Defaults to 1. 
30 | 31 | Returns: 32 | hsfs.feature_group.FeatureGroup: pointer to the feature group 33 | """ 34 | return get_feature_store().get_feature_group( 35 | name=name, 36 | version=version, 37 | ) -------------------------------------------------------------------------------- /src/frontend.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | from datetime import datetime, timedelta 3 | 4 | import requests 5 | import numpy as np 6 | import pandas as pd 7 | import streamlit as st 8 | import geopandas as gpd 9 | import pydeck as pdk 10 | import numpy as np 11 | 12 | from src.inference import ( 13 | load_predictions_from_store, 14 | load_batch_of_features_from_store 15 | ) 16 | from src.paths import DATA_DIR 17 | from src.plot import plot_one_sample 18 | 19 | st.set_page_config(layout="wide") 20 | 21 | # title 22 | # current_date = datetime.strptime('2023-01-05 12:00:00', '%Y-%m-%d %H:%M:%S') 23 | current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H') # - timedelta(hours=1) 24 | current_date_str = str(current_date.strftime('%Y-%m-%d %H:%M')) 25 | st.title(f'Bike demand prediction 🚲') 26 | # Crear el encabezado con HTML 27 | mensaje_personalizado = "Made by Javier Yanzón. Let's connect🙌🏻" 28 | 29 | # Enlaces a tus redes sociales 30 | twitter_link = "https://twitter.com/javieryanzon" 31 | linkedin_link = "https://www.linkedin.com/in/javieryanzon" 32 | st.markdown( 33 | f"{mensaje_personalizado}" 34 | #f"
" 35 | f" • LinkedIn • " 36 | f"Twitter", 37 | unsafe_allow_html=True 38 | ) 39 | st.header(f'{current_date_str} UTC') 40 | 41 | progress_bar = st.sidebar.header('⚙️ Working Progress') 42 | progress_bar = st.sidebar.progress(0) 43 | N_STEPS = 6 44 | 45 | def load_shape_data_file() -> gpd.geodataframe.GeoDataFrame: 46 | """ 47 | Fetches remote file with shape data, that we later use to plot the 48 | different pickup_location_ids on the map of NYC. 49 | 50 | Raises: 51 | Exception: when we cannot connect to the external server where 52 | the file is. 53 | 54 | Returns: 55 | GeoDataFrame: columns -> (OBJECTID Shape_Leng Shape_Area zone LocationID borough geometry) 56 | """ 57 | # download zip file 58 | URL = 'https://cdn.buenosaires.gob.ar/datosabiertos/datasets/transporte-y-obras-publicas/estaciones-bicicletas-publicas/estaciones-de-bicicletas-zip.zip' 59 | response = requests.get(URL) 60 | path = DATA_DIR / f'IE-Estaciones.zip' 61 | if response.status_code == 200: 62 | open(path, "wb").write(response.content) 63 | else: 64 | raise Exception(f'{URL} is not available') 65 | 66 | # unzip file 67 | with zipfile.ZipFile(path, 'r') as zip_ref: 68 | zip_ref.extractall(DATA_DIR / 'IE-Estaciones') 69 | 70 | # load and return shape file 71 | return gpd.read_file(DATA_DIR / 'IE-Estaciones/IE-Estaciones.shp').to_crs('epsg:4326') # 3857 72 | 73 | @st.cache_data 74 | def _load_batch_of_features_from_store(current_date: datetime) -> pd.DataFrame: 75 | """Wrapped version of src.inference.load_batch_of_features_from_store, so 76 | we can add Streamlit caching 77 | 78 | Args: 79 | current_date (datetime): _description_ 80 | 81 | Returns: 82 | pd.DataFrame: n_features + 2 columns: 83 | - `rides_previous_N_hour` 84 | - `rides_previous_{N-1}_hour` 85 | - ... 86 | - `rides_previous_1_hour` 87 | - `pickup_hour` 88 | - `pickup_location_id` 89 | """ 90 | return load_batch_of_features_from_store(current_date) 91 | 92 | #Quité esto a ver si se soluciona error de cache data inicial 93 | @st.cache_data 94 | def _load_predictions_from_store( 95 | from_pickup_hour: datetime, 96 | to_pickup_hour: datetime 97 | ) -> pd.DataFrame: 98 | """ 99 | Wrapped version of src.inference.load_predictions_from_store, so we 100 | can add Streamlit caching 101 | 102 | Args: 103 | from_pickup_hour (datetime): min datetime (rounded hour) for which we want to get 104 | predictions 105 | 106 | to_pickup_hour (datetime): max datetime (rounded hour) for which we want to get 107 | predictions 108 | 109 | Returns: 110 | pd.DataFrame: 2 columns: pickup_location_id, predicted_demand 111 | """ 112 | return load_predictions_from_store(from_pickup_hour, to_pickup_hour) 113 | 114 | with st.spinner(text="Downloading shape file to plot bike stations"): 115 | geo_df = load_shape_data_file() 116 | st.sidebar.write('✅ Shape file was downloaded ') 117 | progress_bar.progress(1/N_STEPS) 118 | 119 | # with st.spinner(text="Fetching model predictions from the store"): 120 | # predictions_df = _load_predictions_from_store( 121 | # from_pickup_hour=current_date - timedelta(hours=3), 122 | # to_pickup_hour=current_date 123 | # ) 124 | # predictions_df = predictions_df.reset_index(drop=True) 125 | # #predictions_df=predictions_df.set_index("pickup_location_id") 126 | # #predictions_df.index.name = None 127 | # st.sidebar.write('✅ Model predictions arrived') 128 | # progress_bar.progress(2/N_STEPS) 129 | 130 | try: 131 | with st.spinner(text="Fetching model predictions from the store"): 132 | predictions_df = _load_predictions_from_store( 133 | 
from_pickup_hour=current_date - timedelta(hours=3), 134 | to_pickup_hour=current_date 135 | ) 136 | predictions_df = predictions_df.reset_index(drop=True) 137 | st.sidebar.write('✅ Model predictions arrived') 138 | progress_bar.progress(2/N_STEPS) 139 | 140 | except Exception as e: 141 | # Captura el error 142 | st.error(f"An error occurred: {str(e)}") 143 | # Intenta nuevamente 144 | st.warning(f"Retrying...") 145 | with st.spinner(text="Fetching model predictions from the store"): 146 | predictions_df = _load_predictions_from_store( 147 | from_pickup_hour=current_date - timedelta(hours=3), 148 | to_pickup_hour=current_date 149 | ) 150 | predictions_df = predictions_df.reset_index(drop=True) 151 | st.sidebar.write('✅ Model predictions arrived') 152 | progress_bar.progress(2/N_STEPS) 153 | 154 | 155 | # Here we are checking the predictions for the current hour have already been computed 156 | # and are available 157 | 158 | # next_hour_predictions_ready = \ 159 | # False if predictions_df[predictions_df.pickup_hour == current_date].empty else True 160 | prev_1_hour_predictions_ready = \ 161 | False if predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=1))].empty else True 162 | prev_2_hour_predictions_ready = \ 163 | False if predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=2))].empty else True 164 | prev_3_hour_predictions_ready = \ 165 | False if predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=3))].empty else True 166 | 167 | # if next_hour_predictions_ready: 168 | # # predictions for the current hour are available 169 | # predictions_df = predictions_df[predictions_df.pickup_hour == current_date] 170 | # st.subheader('The most recent data is not yet available. Using last hour predictions') 171 | 172 | if prev_1_hour_predictions_ready: 173 | # predictions for current hour sometimes makes a mistake, so we use previous hour predictions -1 174 | predictions_df = predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=1))] 175 | current_date = current_date - timedelta(hours=1) 176 | st.subheader('The most recent data is not available. Using last 1 hour predictions') 177 | 178 | elif prev_2_hour_predictions_ready: 179 | # predictions for hour -1 are not available, so we use previous hour predictions -2 180 | predictions_df = predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=2))] 181 | current_date = current_date - timedelta(hours=2) 182 | st.subheader('⚠️ The most recent data is not yet available. Using last 2 hour predictions') 183 | 184 | elif prev_3_hour_predictions_ready: 185 | # predictions for hour -2 are not available, so we use previous hour predictions -3 186 | predictions_df = predictions_df[predictions_df.pickup_hour == (current_date - timedelta(hours=3))] 187 | current_date = current_date - timedelta(hours=3) 188 | st.subheader('⚠️ The most recent data is not yet available. Using last 3 hour predictions') 189 | else: 190 | raise Exception('Features are not available for the last 4 hours. Is your feature \ 191 | pipeline up and running? 🤔') 192 | 193 | 194 | with st.spinner(text="Preparing data to plot"): 195 | 196 | def pseudocolor(val, minval, maxval, startcolor, stopcolor): 197 | """ 198 | Convert value in the range minval...maxval to a color in the range 199 | startcolor to stopcolor. The colors passed and the the one returned are 200 | composed of a sequence of N component values. 
201 | 202 | Credits to https://stackoverflow.com/a/10907855 203 | """ 204 | f = float(val-minval) / (maxval-minval) 205 | return tuple(f*(b-a)+a for (a, b) in zip(startcolor, stopcolor)) 206 | 207 | df = pd.merge(geo_df, predictions_df, 208 | right_on='pickup_location_id', 209 | left_on='ID', 210 | how='inner') 211 | 212 | BLACK, ORANGE = (0, 0, 0), (255, 128, 0) 213 | selected_columns = [c for c in df.columns if c.startswith('rides_next_')] 214 | df['max_hour'] = df[selected_columns].idxmax(axis=1) 215 | df['color_scaling'] = df[selected_columns].max(axis=1) 216 | max_pred, min_pred = df['color_scaling'].max(), df['color_scaling'].min() 217 | df['fill_color'] = df['color_scaling'].apply(lambda x: pseudocolor(x, min_pred, max_pred, BLACK, ORANGE)) 218 | 219 | progress_bar.progress(3/N_STEPS) 220 | 221 | with st.spinner(text="Generating BsAs Map"): 222 | 223 | INITIAL_VIEW_STATE = pdk.ViewState( 224 | latitude=-34.60280869220721, 225 | longitude=-58.42827362585887, 226 | zoom=11, 227 | max_zoom=16, 228 | pitch=45, 229 | bearing=0 230 | ) 231 | layer = pdk.Layer("ColumnLayer", 232 | data=df, 233 | get_position=["Lon", "Lat"], 234 | get_elevation=['color_scaling'], 235 | auto_highlight=True, 236 | radius=50, 237 | elevation_scale=300, 238 | get_fill_color="fill_color", 239 | get_line_color=[255, 255, 255], 240 | pickable=True, 241 | extruded=True, 242 | coverage=1) 243 | 244 | 245 | tooltip = {"html": "Zone ID: {ID}
<br /> Direction: {DIRECCION} <br />
Max: {color_scaling} rides - {max_hour}"}
246 |
247 | r = pdk.Deck(
248 | layers=[layer],
249 | initial_view_state=INITIAL_VIEW_STATE,
250 | tooltip=tooltip
251 | )
252 |
253 | st.pydeck_chart(r)
254 | progress_bar.progress(4/N_STEPS)
255 |
256 | with st.spinner(text="Fetching batch of features used in the last run"):
257 | features_df = _load_batch_of_features_from_store(current_date)
258 | features_df=features_df.reset_index(drop=True)
259 | #features_df=features_df.set_index("pickup_location_id")
260 | #features_df.index.name = None
261 | st.sidebar.write('✅ Inference features fetched from the store')
262 | progress_bar.progress(5/N_STEPS)
263 |
264 | with st.spinner(text="Plotting time-series data"):
265 |
266 | predictions_df = np.clip(predictions_df[selected_columns], 0, None) # clip values at zero so that no prediction is negative
267 |
268 |
269 | predictions_df['max'] = predictions_df[selected_columns].max(axis=1)
270 | predictions_df = predictions_df.reset_index(drop=True)
271 | sorted_indices = predictions_df['max'].sort_values(ascending=False).index
272 | predictions_max = predictions_df.copy()
273 | predictions_max['max_hour'] = predictions_max[selected_columns].idxmax(axis=1)
274 | predictions_df = predictions_df.drop('max', axis=1)
275 |
276 | # select the top 10 rows by predicted demand
277 | top_10_indices = sorted_indices[:10]
278 | #st.sidebar.write(top_10_indices)
279 | #st.sidebar.write(len(predictions_df))
280 |
281 | # add a download button in the top-right corner
282 | df_to_download = df.copy().drop(['QUEDA_ABIE','EMPLAZAMIE','ANCLAJES','max_hour','color_scaling','fill_color'], axis=1) #pd.merge(features_df, predictions_df, on=['pickup_hour', 'pickup_location_id'], how='left')
283 | button = st.download_button(
284 | label="Download predictions CSV",
285 | data=df_to_download.to_csv(index=False).encode('utf-8'),
286 | file_name='predictions.csv',
287 | key='download_button'
288 | )
289 |
290 | st.markdown("
Note: Do not use this data for operational purposes. Because the source data is only updated monthly, rides for the most recent hours are not available; a ride simulation fills that gap and is treated as actual demand when generating the forecasts.
", unsafe_allow_html=True) 291 | 292 | # plot each time-series with the prediction 293 | for row_id in top_10_indices: 294 | #if row_id < len(predictions_df): 295 | # title 296 | location_id = features_df['pickup_location_id'].iloc[row_id] 297 | location_name = df[df['pickup_location_id'] == location_id]['DIRECCION'].iloc[0] 298 | 299 | 300 | # location_id = df['pickup_location_id'].iloc[row_id] 301 | # location_name = df['DIRECCION'].iloc[row_id] 302 | #location_name = df['DIRECCION'].iloc[df['pickup_location_id'] == location_id] 303 | #location_name = df['DIRECCION'].iloc[row_id] 304 | #st.header(f'Direction: {location_id} - {location_name}') 305 | 306 | st.header(f'Direction: {location_name} [Zone ID: {location_id}]') 307 | 308 | # plot predictions 309 | prediction = predictions_max['max'].iloc[row_id] #df['color_scaling'].iloc[row_id] 310 | max_hour_prediction = predictions_max['max_hour'].iloc[row_id] 311 | max_hour_prediction_int = int(max_hour_prediction.replace('rides_next_', '').replace('_hour', '')) 312 | max_hour_prediction_str =str(pd.to_datetime(current_date + timedelta(hours=max_hour_prediction_int-1), utc=True).strftime('%Y-%m-%d %H:%M'))+ " UTC " + " - " + str(pd.to_datetime(current_date + timedelta(hours=max_hour_prediction_int), utc=True).strftime('%Y-%m-%d %H:%M') + " UTC") 313 | st.metric(label="Max rides predicted in 36 hours", value=int(prediction)) 314 | st.metric(label="Approximate Hour of max prediction", value=max_hour_prediction_str) 315 | 316 | fig = plot_one_sample( 317 | example_id=row_id, 318 | features=features_df, 319 | targets=predictions_df, 320 | predictions=predictions_df 321 | #directions=geo_df[['ID', 'DIRECCION']] 322 | ) 323 | st.plotly_chart(fig, theme="streamlit", use_container_width=True, width=1000) 324 | 325 | progress_bar.progress(6/N_STEPS) -------------------------------------------------------------------------------- /src/frontend_monitoring.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import streamlit as st 6 | from sklearn.metrics import mean_absolute_error 7 | import plotly.express as px 8 | 9 | from src.monitoring import load_predictions_and_actual_values_from_store 10 | from src.data import transform_ts_data_into_dataset_comparable_with_predictions 11 | 12 | st.set_page_config(layout="wide") 13 | 14 | # title 15 | current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H') 16 | st.title(f'Monitoring dashboard 🔎') 17 | 18 | progress_bar = st.sidebar.header('⚙️ Working Progress') 19 | progress_bar = st.sidebar.progress(0) 20 | N_STEPS = 3 21 | 22 | 23 | @st.cache_data 24 | def _load_predictions_and_actuals_from_store( 25 | from_date: datetime, 26 | to_date: datetime 27 | ) -> pd.DataFrame: 28 | """Wrapped version of src.monitoring.load_predictions_and_actual_values_from_store, so 29 | we can add Streamlit caching 30 | 31 | Args: 32 | from_date (datetime): min datetime for which we want predictions and 33 | actual values 34 | 35 | to_date (datetime): max datetime for which we want predictions and 36 | actual values 37 | 38 | Returns: 39 | pd.DataFrame: 4 columns 40 | - `pickup_location_id` 41 | - `predicted_demand` 42 | - `pickup_hour` 43 | - `rides` 44 | """ 45 | return load_predictions_and_actual_values_from_store(from_date, to_date) 46 | 47 | # with st.spinner(text="Fetching model predictions and actual values from the store"): 48 | 49 | # ts_data_1, ts_data_2 = 
_load_predictions_and_actuals_from_store( 50 | # from_date=current_date - timedelta(days=14), 51 | # to_date=current_date 52 | # ) 53 | # real_rides = transform_ts_data_into_dataset_comparable_with_predictions( 54 | # ts_data_2, 55 | # input_seq_len=0, # one month 56 | # step_size=24, 57 | # output_seq_len=36 58 | # ) 59 | # st.sidebar.write('✅ Model predictions and actual values arrived') 60 | # progress_bar.progress(1/N_STEPS) 61 | 62 | 63 | try: 64 | with st.spinner(text="Fetching model predictions and actual values from the store"): 65 | ts_data_1, ts_data_2 = _load_predictions_and_actuals_from_store( 66 | from_date=current_date - timedelta(days=14), 67 | to_date=current_date 68 | ) 69 | real_rides = transform_ts_data_into_dataset_comparable_with_predictions( 70 | ts_data_2, 71 | input_seq_len=0, # one month 72 | step_size=24, 73 | output_seq_len=36 74 | ) 75 | st.sidebar.write('✅ Model predictions and actual values arrived') 76 | progress_bar.progress(1/N_STEPS) 77 | 78 | except Exception as e: 79 | # Captura el error 80 | st.error(f"An error occurred: {str(e)}") 81 | # Intenta nuevamente 82 | st.warning(f"Retrying...") 83 | with st.spinner(text="Fetching model predictions and actual values from the store"): 84 | ts_data_1, ts_data_2 = _load_predictions_and_actuals_from_store( 85 | from_date=current_date - timedelta(days=14), 86 | to_date=current_date 87 | ) 88 | real_rides = transform_ts_data_into_dataset_comparable_with_predictions( 89 | ts_data_2, 90 | input_seq_len=0, # one month 91 | step_size=24, 92 | output_seq_len=36 93 | ) 94 | st.sidebar.write('✅ Model predictions and actual values arrived') 95 | progress_bar.progress(1/N_STEPS) 96 | 97 | 98 | with st.spinner(text="Plotting aggregate MAE hour-by-hour"): 99 | 100 | monitoring_df = pd.merge(ts_data_1, real_rides, on=['pickup_hour', 'pickup_location_id'], how='inner') 101 | st.header('Mean Absolute Error (MAE) hour-by-hour') 102 | selected_columns_pred = [c for c in monitoring_df.columns if c.startswith('rides_next_')] ##### 103 | selected_columns_real = [c for c in monitoring_df.columns if c.startswith('real_rides_next_')] 104 | 105 | # MAE per pickup_hour 106 | # https://stackoverflow.com/a/47914634 107 | mae_per_hour = ( 108 | monitoring_df 109 | .groupby('pickup_hour') 110 | .apply(lambda g: mean_absolute_error(g[selected_columns_real], g[selected_columns_pred])) #### 111 | .reset_index() 112 | .rename(columns={0: 'mae'}) 113 | .sort_values(by='pickup_hour') 114 | ) 115 | 116 | fig = px.bar( 117 | mae_per_hour, 118 | x='pickup_hour', y='mae', 119 | template='plotly_dark', 120 | ) 121 | st.plotly_chart(fig, theme="streamlit", use_container_width=True, width=1000) 122 | 123 | progress_bar.progress(2/N_STEPS) 124 | 125 | 126 | with st.spinner(text="Plotting MAE hour-by-hour for top locations"): 127 | 128 | st.header('Mean Absolute Error (MAE) per location and hour') 129 | 130 | top_locations_by_demand = ( 131 | monitoring_df 132 | .groupby('pickup_location_id')[selected_columns_real].sum() 133 | .sum(axis=1) 134 | .sort_values(ascending=False) 135 | .reset_index() 136 | .head(10)['pickup_location_id'] 137 | ) 138 | 139 | for location_id in top_locations_by_demand: 140 | 141 | mae_per_hour = ( 142 | monitoring_df[monitoring_df.pickup_location_id == location_id] 143 | .groupby('pickup_hour') 144 | .apply(lambda g: mean_absolute_error(g[selected_columns_real], g[selected_columns_pred])) 145 | .reset_index() 146 | .rename(columns={0: 'mae'}) 147 | .sort_values(by='pickup_hour') 148 | ) 149 | 150 | fig = px.bar( 151 | mae_per_hour, 
152 | x='pickup_hour', y='mae', 153 | template='plotly_dark', 154 | ) 155 | st.subheader(f'{location_id=}') 156 | st.plotly_chart(fig, theme="streamlit", use_container_width=True, width=1000) 157 | 158 | progress_bar.progress(3/N_STEPS) -------------------------------------------------------------------------------- /src/inference.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import hopsworks 4 | #from hsfs.feature_store import FeatureStore 5 | import pandas as pd 6 | import numpy as np 7 | 8 | import src.config as config 9 | from src.feature_store_api import get_feature_store 10 | #, get_or_create_feature_view 11 | #from src.config import FEATURE_VIEW_METADATA 12 | 13 | def get_hopsworks_project() -> hopsworks.project.Project: 14 | 15 | return hopsworks.login( 16 | project=config.HOPSWORKS_PROJECT_NAME, 17 | api_key_value=config.HOPSWORKS_API_KEY 18 | ) 19 | 20 | # def get_feature_store() -> FeatureStore: 21 | 22 | # project = get_hopsworks_project() 23 | # return project.get_feature_store() 24 | 25 | 26 | def get_model_predictions(model, features: pd.DataFrame) -> pd.DataFrame: 27 | """""" 28 | # past_rides_columns = [c for c in features.columns if c.startswith('rides_')] 29 | predictions = model.predict(features) 30 | predictions = predictions.round(0) 31 | 32 | results = pd.DataFrame(predictions, 33 | columns=[f'rides_next_{i+1}_hour' for i in range(36)] 34 | ) #son 36 horas de prediccion 35 | results['pickup_location_id'] = features['pickup_location_id'].values 36 | #results['predicted_demand'] = predictions.round(0) #esto estaba antes 37 | 38 | return results 39 | 40 | 41 | def load_batch_of_features_from_store( 42 | current_date: datetime, 43 | ) -> pd.DataFrame: 44 | """Fetches the batch of features used by the ML system at `current_date` 45 | 46 | Args: 47 | current_date (datetime): datetime of the prediction for which we want 48 | to get the batch of features 49 | 50 | Returns: 51 | pd.DataFrame: 3 columns: 52 | - `pickup_hour` 53 | - `rides` 54 | - `pickup_location_id` 55 | """ 56 | n_features = config.N_FEATURES 57 | 58 | feature_store = get_feature_store() 59 | 60 | # read time-series data from the feature store 61 | fetch_data_to = pd.to_datetime(current_date - timedelta(hours=1), utc=True) 62 | fetch_data_from = pd.to_datetime(current_date - timedelta(days=28), utc=True) 63 | print(f'Fetching data from {fetch_data_from} to {fetch_data_to}') 64 | feature_view = feature_store.get_feature_view( 65 | name=config.FEATURE_VIEW_NAME, 66 | version=config.FEATURE_VIEW_VERSION 67 | ) 68 | ts_data = feature_view.get_batch_data( 69 | start_time=pd.to_datetime(fetch_data_from - timedelta(days=1), utc=True), 70 | end_time=pd.to_datetime(fetch_data_to + timedelta(days=1), utc=True) 71 | ) 72 | 73 | # Convert to UTC aware datetime 74 | ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True) 75 | 76 | # filter data to the time period we are interested in 77 | ts_data = ts_data[ts_data.pickup_hour.between(fetch_data_from, fetch_data_to)] 78 | 79 | # validate we are not missing data in the feature store 80 | location_ids = ts_data['pickup_location_id'].unique() 81 | assert len(ts_data) == n_features*len(location_ids), \ 82 | "Time-series data is not complete. Make sure your feature pipeline is up and runnning." 
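# --- Editor's note: hedged illustration, not part of the original src/inference.py ---
# The assert above checks that the feature view returned a complete grid of
# hourly rows: exactly config.N_FEATURES rows per station. With
# N_FEATURES = 24 * 28 = 672 and, hypothetically, 300 active stations:
#
#     expected_rows = 672 * 300   # 201,600 rows in `ts_data`
#
# Anything less means some (pickup_hour, pickup_location_id) slots are missing
# from the feature group, i.e. the feature pipeline has not kept up.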
83 | 84 | # sort data by location and time 85 | ts_data.sort_values(by=['pickup_location_id', 'pickup_hour'], inplace=True) 86 | # print(f'{ts_data=}') 87 | 88 | # transpose time-series data as a feature vector, for each `pickup_location_id` 89 | x = np.ndarray(shape=(len(location_ids), n_features), dtype=np.float32) 90 | for i, location_id in enumerate(location_ids): 91 | ts_data_i = ts_data.loc[ts_data.pickup_location_id == location_id, :] 92 | ts_data_i = ts_data_i.sort_values(by=['pickup_hour']) 93 | x[i, :] = ts_data_i['rides'].values 94 | 95 | # numpy arrays to Pandas dataframes 96 | features = pd.DataFrame( 97 | x, 98 | columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))] 99 | ) 100 | 101 | features['pickup_hour'] = pd.to_datetime(current_date, utc=True) 102 | features['pickup_location_id'] = location_ids 103 | features.sort_values(by=['pickup_location_id'], inplace=True) 104 | 105 | return features 106 | 107 | 108 | def load_model_from_registry(): 109 | 110 | import joblib 111 | from pathlib import Path 112 | 113 | project = get_hopsworks_project() 114 | model_registry = project.get_model_registry() 115 | 116 | model = model_registry.get_model( 117 | name=config.MODEL_NAME, 118 | version=config.MODEL_VERSION, 119 | ) 120 | 121 | model_dir = model.download() 122 | model = joblib.load(Path(model_dir) / 'model.pkl') 123 | 124 | return model 125 | 126 | def load_predictions_from_store( 127 | from_pickup_hour: datetime, 128 | to_pickup_hour: datetime) -> pd.DataFrame: 129 | """ 130 | Connects to the feature store and retrieves model predictions for all 131 | `pickup_location_id`s and for the time period from `from_pickup_hour` 132 | to `to_pickup_hour` 133 | 134 | Args: 135 | from_pickup_hour (datetime): min datetime (rounded hour) for which we want to get 136 | predictions 137 | 138 | to_pickup_hour (datetime): max datetime (rounded hour) for which we want to get 139 | predictions 140 | 141 | Returns: 142 | pd.DataFrame: 3 columns: 143 | - `pickup_location_id` 144 | - `predicted_demand` 145 | - `pickup_hour` 146 | """ 147 | from src.feature_store_api import get_feature_store 148 | import src.config as config 149 | 150 | feature_store = get_feature_store() 151 | 152 | predictiong_fg = feature_store.get_feature_group( 153 | name=config.FEATURE_GROUP_MODEL_PREDICTIONS, 154 | version=1, 155 | ) 156 | 157 | try: 158 | # create feature view as it does not exist yet 159 | feature_store.create_feature_view( 160 | name=config.FEATURE_VIEW_MODEL_PREDICTIONS, 161 | version=1, 162 | query=predictiong_fg.select_all() 163 | ) 164 | except: 165 | print(f'Feature view {config.FEATURE_VIEW_MODEL_PREDICTIONS} \ 166 | already existed. 
Skipped creation.') 167 | 168 | predictions_fv = feature_store.get_feature_view( 169 | name=config.FEATURE_VIEW_MODEL_PREDICTIONS, 170 | version=1 171 | ) 172 | 173 | print(f'Fetching predictions for `pickup_hours` between {from_pickup_hour} and {to_pickup_hour}') 174 | predictions = predictions_fv.get_batch_data( 175 | start_time=from_pickup_hour - timedelta(days=1), 176 | end_time=to_pickup_hour + timedelta(days=1) 177 | ) 178 | 179 | # make sure datetimes are UTC aware 180 | predictions['pickup_hour'] = pd.to_datetime(predictions['pickup_hour'], utc=True) 181 | from_pickup_hour = pd.to_datetime(from_pickup_hour, utc=True) 182 | to_pickup_hour = pd.to_datetime(to_pickup_hour, utc=True) 183 | 184 | predictions = predictions[predictions.pickup_hour.between( 185 | from_pickup_hour, to_pickup_hour)] 186 | 187 | # sort by `pick_up_hour` and `pickup_location_id` 188 | predictions.sort_values(by=['pickup_hour', 'pickup_location_id'], inplace=True) 189 | 190 | return predictions -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | def get_logger() -> logging.Logger: 4 | """Returns a logger 5 | 6 | Returns: 7 | logging.Logger: _description_ 8 | """ 9 | logger = logging.getLogger('dataflow') 10 | logger.setLevel(logging.INFO) 11 | return logger 12 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import FunctionTransformer 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.pipeline import make_pipeline, Pipeline 5 | from sklearn.multioutput import MultiOutputRegressor 6 | from src.paths import RAW_DATA_DIR 7 | import hopsworks 8 | import src.config as config 9 | from src.feature_store_api import get_feature_store 10 | 11 | import lightgbm as lgb 12 | 13 | def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame: 14 | """ 15 | Adds one column with the average rides from 16 | - 7 days ago 17 | - 14 days ago 18 | - 21 days ago 19 | - 28 days ago 20 | """ 21 | X['average_rides_last_4_weeks'] = 0.25*( 22 | X[f'rides_previous_{7*24}_hour'] + \ 23 | X[f'rides_previous_{2*7*24}_hour'] + \ 24 | X[f'rides_previous_{3*7*24}_hour'] + \ 25 | X[f'rides_previous_{4*7*24}_hour'] 26 | ) 27 | return X 28 | 29 | def latitude_and_longitude_anterior(X: pd.DataFrame) -> pd.DataFrame: #version anterior de la funcion, la modifique por la de abajo para que solo sea consulta a feature store 30 | """ 31 | Adds two columns with the latitude and longitude from pickup_location_id 32 | 33 | """ 34 | raw_data_rides = pd.read_parquet(RAW_DATA_DIR / 'rides_2022.parquet') 35 | 36 | #Nos quedamos sólo con las columnas que nos interesan y las renombramos 37 | raw_data_rides = raw_data_rides[['id_estacion_origen', 'lat_estacion_origen', 'long_estacion_origen']] 38 | raw_data_rides['id_estacion_origen'] = raw_data_rides['id_estacion_origen'].str.replace('BAEcobici', '').astype(int) 39 | raw_data_rides = raw_data_rides.drop_duplicates().reset_index(drop=True) 40 | raw_data_rides.rename(columns={ 41 | 'id_estacion_origen': 'pickup_location_id', 42 | 'lat_estacion_origen': 'latitude', 43 | 'long_estacion_origen': 'longitude' 44 | }, inplace=True) 45 | 46 | # Combinar la información de latitud y longitud en X 47 | X = X.merge(raw_data_rides, on='pickup_location_id', 
how='left') 48 | 49 | # Eliminar la columna 'pickup_location_id' 50 | #X.drop('pickup_location_id', axis=1, inplace=True) 51 | 52 | return X 53 | 54 | def latitude_and_longitude(X: pd.DataFrame) -> pd.DataFrame: 55 | """ 56 | Adds two columns with the latitude and longitude from pickup_location_id 57 | 58 | """ 59 | 60 | #primero me conecto al feature store para obtenerla y luego la uno al dataset 61 | 62 | feature_store = get_feature_store() 63 | feature_view = feature_store.get_feature_view( 64 | name=config.FEATURE_VIEW_LAT_LONG 65 | ) 66 | raw_data_rides= feature_view.get_batch_data() 67 | 68 | # Combinar la información de latitud y longitud en X 69 | X = X.merge(raw_data_rides, on='pickup_location_id', how='left') 70 | 71 | # Eliminar la columna 'pickup_location_id' 72 | #X.drop('pickup_location_id', axis=1, inplace=True) 73 | 74 | return X 75 | 76 | 77 | 78 | class TemporalFeaturesEngineer(BaseEstimator, TransformerMixin): 79 | """ 80 | Scikit-learn data transformation that adds 2 columns 81 | - hour 82 | - day_of_week 83 | and removes the `pickup_hour` datetime column. 84 | """ 85 | def fit(self, X, y=None): 86 | return self 87 | 88 | def transform(self, X, y=None): 89 | 90 | X_ = X.copy() 91 | 92 | # Generate numeric columns from datetime 93 | X_["hour"] = X_['pickup_hour'].dt.hour 94 | X_["day_of_week"] = X_['pickup_hour'].dt.dayofweek 95 | 96 | return X_.drop(columns=['pickup_hour']) 97 | 98 | def get_pipeline(**hyperparams) -> Pipeline: 99 | 100 | # sklearn transform 101 | add_feature_average_rides_last_4_weeks = FunctionTransformer( 102 | average_rides_last_4_weeks, validate=False) 103 | 104 | # sklearn transform 105 | add_feature_latitude_and_longitude = FunctionTransformer( 106 | latitude_and_longitude, validate=False) 107 | 108 | # sklearn transform 109 | add_temporal_features = TemporalFeaturesEngineer() 110 | 111 | # sklearn pipeline 112 | return make_pipeline( 113 | add_feature_average_rides_last_4_weeks, 114 | add_feature_latitude_and_longitude, 115 | add_temporal_features, 116 | MultiOutputRegressor(lgb.LGBMRegressor(**hyperparams, force_col_wise=True)) 117 | ) -------------------------------------------------------------------------------- /src/model_registry_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import pickle 4 | 5 | import comet_ml 6 | from comet_ml import API 7 | from dotenv import load_dotenv 8 | import hopsworks 9 | from sklearn.pipeline import Pipeline 10 | import pandas as pd 11 | import joblib 12 | 13 | import src.config as config 14 | from src.paths import MODELS_DIR, PARENT_DIR 15 | from src.logger import get_logger 16 | 17 | logger = get_logger() 18 | 19 | # load variables from .env file as environment variables 20 | load_dotenv(PARENT_DIR / '.env') 21 | 22 | COMET_ML_API_KEY = os.environ["COMET_ML_API_KEY"] 23 | COMET_ML_WORKSPACE = os.environ["COMET_ML_WORKSPACE"] 24 | COMET_ML_PROJECT_NAME = os.environ['COMET_ML_PROJECT_NAME'] 25 | 26 | 27 | def get_model_registry() -> None: 28 | """Connects to Hopsworks and returns a pointer to the feature store 29 | 30 | Returns: 31 | hsfs.feature_store.FeatureStore: pointer to the feature store 32 | """ 33 | project = hopsworks.login( 34 | project=config.HOPSWORKS_PROJECT_NAME, 35 | api_key_value=config.HOPSWORKS_API_KEY 36 | ) 37 | return project.get_model_registry() 38 | 39 | def push_model_to_registry( 40 | model: Pipeline, 41 | model_name: str, 42 | ) -> int: 43 | """""" 44 | # save the model to disk 45 | model_file = 
MODELS_DIR / 'model.pkl' 46 | with open(model_file, "wb") as f: 47 | pickle.dump(model, f) 48 | 49 | # Get the stale experiment from the global context to grab the API key and experiment ID. 50 | stale_experiment = comet_ml.get_global_experiment() 51 | 52 | # Resume the expriment using its API key and experiment ID. 53 | experiment = comet_ml.ExistingExperiment( 54 | api_key=stale_experiment.api_key, experiment_key=stale_experiment.id 55 | ) 56 | 57 | # log model as an experiment artifact 58 | logger.info(f"Starting logging model to Comet ML") 59 | experiment.log_model(model_name, str(model_file)) 60 | logger.info(f"Finished logging model {model_name}") 61 | 62 | # push model to the registry 63 | logger.info('Pushing model to the registry as "Production"') 64 | experiment.register_model(model_name, status='Production') 65 | 66 | # end the experiment 67 | experiment.end() 68 | 69 | # get model version of the latest production model 70 | return get_latest_model_version(model_name, status='Production') 71 | 72 | 73 | def get_latest_model_version(model_name: str, status: str) -> str: 74 | """ 75 | Returns the latest model version from the registry with the given `status` 76 | """ 77 | # find all model versions from the given `model_name` registry and `status` 78 | api = API(COMET_ML_API_KEY) 79 | model_details = api.get_registry_model_details(COMET_ML_WORKSPACE, model_name)['versions'] 80 | model_versions = [md['version'] for md in model_details if md['status'] == status] 81 | 82 | # return the latest model version 83 | return max(model_versions) 84 | 85 | 86 | def get_latest_model_from_registry(model_name: str, status: str) -> Pipeline: 87 | """Returns the latest model from the registry""" 88 | 89 | # get model version to download 90 | model_version = get_latest_model_version(model_name, status) 91 | 92 | # download model from registry 93 | api = API(COMET_ML_API_KEY) 94 | api.download_registry_model( 95 | COMET_ML_WORKSPACE, 96 | registry_name=model_name, 97 | version=model_version, 98 | output_path=MODELS_DIR, 99 | expand=True 100 | ) 101 | 102 | # load model from local file to memory 103 | with open(MODELS_DIR / 'model.pkl', "rb") as f: 104 | model = pickle.load(f) 105 | 106 | return model -------------------------------------------------------------------------------- /src/monitoring.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | import src.config as config 8 | from src.feature_store_api import get_feature_store, get_feature_group 9 | from src.data import get_cutoff_indices_features_and_target 10 | 11 | from datetime import datetime, timedelta 12 | import pandas as pd 13 | from src.data import transform_raw_data_into_ts_data 14 | from src.data import transform_ts_data_into_features_and_target 15 | from src.data import transform_ts_data_into_dataset_comparable_with_predictions 16 | 17 | 18 | def load_predictions_and_actual_values_from_store( 19 | from_date: datetime, 20 | to_date: datetime, 21 | ) -> pd.DataFrame: 22 | """Fetches model predictions and actuals values from 23 | `from_date` to `to_date` from the Feature Store and returns a dataframe 24 | 25 | Args: 26 | from_date (datetime): min datetime for which we want predictions and 27 | actual values 28 | 29 | to_date (datetime): max datetime for which we want predictions and 30 | actual values 31 | 32 | Returns: 33 | pd.DataFrame: 4 columns 34 | - `pickup_location_id` 35 
| - `predicted_demand` 36 | - `pickup_hour` 37 | - `rides` 38 | """ 39 | current_date = pd.to_datetime(datetime.utcnow(), utc=True).floor('H') 40 | 41 | fetch_data_from = pd.Timestamp('2023-01-01 0:00:00+0000', tz='UTC') #quizas cambiarlo y que sea solo el año en curso 42 | fetch_data_to = pd.to_datetime(current_date - timedelta(hours=1), utc=True) 43 | 44 | feature_store_1 = get_feature_store() 45 | predictions_fg = feature_store_1.get_feature_view(name=config.FEATURE_VIEW_MODEL_PREDICTIONS) 46 | ts_data_1 = predictions_fg.get_batch_data( 47 | start_time=pd.to_datetime(fetch_data_from, utc=True), 48 | end_time=pd.to_datetime(fetch_data_to, utc=True) 49 | ) 50 | 51 | feature_store_2 = get_feature_store() 52 | actuals_fg = feature_store_2.get_feature_view(name=config.FEATURE_VIEW_NAME) 53 | ts_data_2 = actuals_fg.get_batch_data( 54 | start_time=pd.to_datetime(fetch_data_from, utc=True), 55 | end_time=pd.to_datetime(fetch_data_to, utc=True) 56 | ) 57 | 58 | 59 | 60 | # # 2 feature groups we need to merge 61 | # predictions_fg = get_feature_group(name=config.FEATURE_GROUP_MODEL_PREDICTIONS) 62 | # actuals_fg = get_feature_group(name=config.FEATURE_GROUP_NAME) 63 | 64 | # # query to join the 2 features groups by `pickup_hour` and `pickup_location_id` 65 | # query = predictions_fg.select_all() \ 66 | # .join(actuals_fg.select_all(), on=['pickup_hour', 'pickup_location_id']) \ 67 | # .filter(predictions_fg.pickup_hour >= from_date) \ 68 | # .filter(predictions_fg.pickup_hour <= to_date) 69 | 70 | # # create the feature view `config.FEATURE_VIEW_MONITORING` if it does not 71 | # # exist yet 72 | # feature_store = get_feature_store() 73 | # try: 74 | # # create feature view as it does not exist yet 75 | # feature_store.create_feature_view( 76 | # name=config.FEATURE_VIEW_MONITORING, 77 | # version=1, 78 | # query=query 79 | # ) 80 | # except: 81 | # print('Feature view already existed. 
Skip creation.') 82 | 83 | # # feature view 84 | # monitoring_fv = feature_store.get_feature_view( 85 | # name=config.FEATURE_VIEW_MONITORING, 86 | # version=1 87 | # ) 88 | 89 | # # fetch data from the feature view 90 | # # fetch predicted and actual values for the last 30 days 91 | # monitoring_df = monitoring_fv.get_batch_data( 92 | # start_time=pd.to_datetime(from_date - timedelta(days=7), utc=True), 93 | # end_time=pd.to_datetime(to_date + timedelta(days=7), utc=True) 94 | # ) 95 | # monitoring_df = monitoring_df[monitoring_df.pickup_hour.between(from_date, to_date)] 96 | 97 | return ts_data_1, ts_data_2 98 | 99 | 100 | # def transform_ts_data_hopsworks_into_df_comparable_with_predictions( 101 | # ts_data: pd.DataFrame, 102 | # input_seq_len: int, 103 | # step_size: int, 104 | # output_seq_len: int # newly added parameter 105 | # ) -> pd.DataFrame: 106 | # """ 107 | # Slices and transposes data from time-series format into a (features, target) 108 | # format that we can use to train Supervised ML models 109 | # """ 110 | # assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'} 111 | 112 | # location_ids = ts_data['pickup_location_id'].unique() 113 | # #features = pd.DataFrame() 114 | # real_rides = pd.DataFrame() 115 | 116 | # for location_id in tqdm(location_ids): 117 | 118 | # # keep only ts data for this `location_id` 119 | # ts_data_one_location = ts_data.loc[ 120 | # ts_data.pickup_location_id == location_id, 121 | # ['pickup_hour', 'rides'] 122 | # ].sort_values(by=['pickup_hour']) 123 | 124 | # # pre-compute cutoff indices to split dataframe rows 125 | # indices = get_cutoff_indices_features_and_target( 126 | # ts_data_one_location, 127 | # input_seq_len, 128 | # step_size, 129 | # output_seq_len # newly added argument 130 | # ) 131 | 132 | # # slice and transpose data into numpy arrays for features and targets 133 | # n_examples = len(indices) 134 | # #x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32) 135 | # y = np.ndarray(shape=(n_examples, output_seq_len), dtype=np.float32) # use output_seq_len because we want that many hours 136 | # pickup_hours = [] 137 | # for i, idx in enumerate(indices): 138 | # #x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values 139 | # y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values 140 | # pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour']) 141 | 142 | # # numpy -> pandas 143 | # # features_one_location = pd.DataFrame( 144 | # # x, 145 | # # columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))] 146 | # # ) 147 | # # features_one_location['pickup_hour'] = pickup_hours 148 | # # features_one_location['pickup_location_id'] = location_id 149 | 150 | # # numpy -> pandas 151 | # real_rides_one_location = pd.DataFrame(y, columns=[f'real_rides_next_{i+1}_hour' for i in range(output_seq_len)]) 152 | # real_rides_one_location['pickup_hour'] = pickup_hours 153 | # real_rides_one_location['pickup_location_id'] = location_id 154 | 155 | # # concatenate results 156 | # #features = pd.concat([features, features_one_location]) 157 | # real_rides = pd.concat([real_rides, real_rides_one_location]) 158 | 159 | # #features.reset_index(inplace=True, drop=True) 160 | # real_rides.reset_index(inplace=True, drop=True) 161 | 162 | # return real_rides #,features #['target_rides_next_hour'] -------------------------------------------------------------------------------- /src/paths.py:
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | PARENT_DIR = Path(__file__).parent.resolve().parent 5 | DATA_DIR = PARENT_DIR / 'data' 6 | RAW_DATA_DIR = PARENT_DIR / 'data' / 'raw' 7 | TRANSFORMED_DATA_DIR = PARENT_DIR / 'data' / 'transformed' 8 | 9 | MODELS_DIR = PARENT_DIR / 'models' 10 | 11 | if not Path(DATA_DIR).exists(): 12 | os.mkdir(DATA_DIR) 13 | 14 | if not Path(RAW_DATA_DIR).exists(): 15 | os.mkdir(RAW_DATA_DIR) 16 | 17 | if not Path(TRANSFORMED_DATA_DIR).exists(): 18 | os.mkdir(TRANSFORMED_DATA_DIR) 19 | 20 | if not Path(MODELS_DIR).exists(): 21 | os.mkdir(MODELS_DIR) -------------------------------------------------------------------------------- /src/plot.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | from datetime import timedelta 3 | 4 | import pandas as pd 5 | import plotly.express as px 6 | import plotly.graph_objects as go 7 | 8 | # def plot_one_sample( 9 | # example_id: int, 10 | # features: pd.DataFrame, 11 | # targets: Optional[pd.DataFrame] = None, #pd.Series, 12 | # predictions: Optional[pd.DataFrame] = None, 13 | # directions: Optional[pd.DataFrame] = None 14 | # ): 15 | # """""" 16 | # if directions is not None: 17 | # features_ = pd.merge(features, directions, left_on='pickup_location_id', right_on='ID', how='left') 18 | # features_[features_.index == example_id].iloc[-1] # take the last row of the dataset in case the feature store query returns more than one hour 19 | # #features_ = features_.iloc[example_id] # changed because in other cases it was not filtering 20 | # else: 21 | # #features_ = features.iloc[example_id] # changed because in other cases it was not filtering 22 | # features_[features_.index == example_id].iloc[-1] # take the last row of the dataset in case the feature store query returns more than one hour 23 | 24 | 25 | # if targets is not None: 26 | # target_ = targets[targets.index == example_id].iloc[-1] # take the last row of the dataset in case the feature store query returns more than one hour 27 | # #target_ = targets.iloc[example_id] # changed because in other cases it was not filtering 28 | # ts_columns_targets = [c for c in targets.columns if c.startswith('rides_next_')] 29 | # ts_values_targets = [target_[c] for c in ts_columns_targets] 30 | # ts_dates_targets = pd.date_range( 31 | # features_['pickup_hour'].iloc[0], 32 | # features_['pickup_hour'].iloc[0] + timedelta(hours=len(ts_columns_targets)-1), 33 | # freq='H' 34 | # ) 35 | # else: 36 | # target_ = None 37 | 38 | # #features_df[features_df.index == 327].iloc[-1] # take the last row of the dataset in case the feature store query returns more than one hour 39 | # # features_ = features[features['pickup_location_id'] == example_id] 40 | # # target_ = targets[targets['pickup_location_id'] == example_id] 41 | 42 | 43 | 44 | # ts_columns_features = [c for c in features.columns if c.startswith('rides_previous_')] 45 | 46 | # ts_values_features = [features_[c] for c in ts_columns_features] 47 | 48 | 49 | # ts_dates_features = pd.date_range( 50 | # features_['pickup_hour'].iloc[0] - timedelta(hours=len(ts_columns_features)), # added .iloc[0] because we want the single value of that series 51 | # features_['pickup_hour'].iloc[0] - timedelta(hours=1), 52 | # freq='H' 53 | # ) 54 | 55 | 56 | # fig = go.Figure() 57 | # if directions is not None: 58 | # title = f'Pick up hour=
{features_["pickup_hour"]}, location_id= {features_.index}, direction= {features_["DIRECCION"]}' 59 | # else: 60 | # title = f'Pick up hour= {features_["pickup_hour"]}, location_id= {features_.index}' 61 | # fig = px.line( x=ts_dates_features, y=ts_values_features, 62 | # template='plotly_dark', 63 | # markers=True, title=title) 64 | 65 | # if targets is not None: 66 | # targets_fig = px.line(x=ts_dates_targets, y=ts_values_targets, 67 | # template='plotly_dark', 68 | # markers=True, title='actual values') 69 | # targets_fig.update_traces(line_color='green') 70 | # fig.add_traces(targets_fig.data) 71 | 72 | 73 | # if predictions is not None: 74 | 75 | # prediction_ = predictions.iloc[example_id] 76 | # #prediction_ = predictions[predictions['pickup_location_id'] == example_id] 77 | # ts_columns_predictions = [c for c in predictions.columns if c.startswith('rides_next_')] 78 | # ts_values_predictions = [prediction_[c] for c in ts_columns_predictions] 79 | # ts_dates_predictions = pd.date_range( 80 | # features_['pickup_hour'].iloc[0], 81 | # features_['pickup_hour'].iloc[0] + timedelta(hours=len(ts_columns_predictions)-1), 82 | # freq='H' 83 | # ) 84 | 85 | # prediction_fig = px.line(x=ts_dates_predictions, y=ts_values_predictions, 86 | # template='plotly_dark', 87 | # markers=True, title='predicted values') 88 | # prediction_fig.update_traces(line_color='darkorange') 89 | # fig.add_traces(prediction_fig.data) 90 | 91 | # return fig 92 | 93 | def plot_one_sample( 94 | features: pd.DataFrame, 95 | targets: pd.DataFrame, #pd.Series, 96 | example_id: int, 97 | predictions: Optional[pd.DataFrame] = None, 98 | ): 99 | """Plots the past rides, the actual future values and, optionally, the predicted values for one example""" 100 | features_ = features.iloc[example_id] 101 | target_ = targets.iloc[example_id] 102 | 103 | # ts_columns = [c for c in features.columns if c.startswith('rides_previous_')] 104 | # ts_values = [features_[c] for c in ts_columns] + [target_] 105 | ts_columns_features = [c for c in features.columns if c.startswith('rides_previous_')] 106 | ts_columns_targets = [c for c in targets.columns if c.startswith('rides_next_')] 107 | ts_values_features = [features_[c] for c in ts_columns_features] 108 | ts_values_targets = [target_[c] for c in ts_columns_targets] 109 | # ts_dates = pd.date_range( 110 | # features_['pickup_hour'] - timedelta(hours=len(ts_columns)), 111 | # features_['pickup_hour'], 112 | # freq='H' 113 | # ) 114 | ts_dates_features = pd.date_range( 115 | features_['pickup_hour'] - timedelta(hours=len(ts_columns_features)), 116 | features_['pickup_hour'] - timedelta(hours=1), 117 | freq='H' 118 | ) 119 | ts_dates_targets = pd.date_range( 120 | features_['pickup_hour'], 121 | features_['pickup_hour'] + timedelta(hours=len(ts_columns_targets)-1), 122 | freq='H' 123 | ) 124 | 125 | # line plot with past values 126 | # title = f'Pick up hour={features_["pickup_hour"]}, location_id={features_["pickup_location_id"]}' 127 | # fig = px.line( 128 | # x=ts_dates, y=ts_values, 129 | # template='plotly_dark', 130 | # markers=True, title=title 131 | # ) 132 | fig = go.Figure() 133 | title = f'Pick up hour={features_["pickup_hour"]}, location_id={features_["pickup_location_id"]}' 134 | fig = px.line( x=ts_dates_features, y=ts_values_features, 135 | template='plotly_dark', 136 | markers=True, title=title) 137 | #features_fig.update_traces(line_color='blue') 138 | #fig.add_traces(features_fig.data) 139 | 140 | # green line for the values we want to predict 141 | # fig.add_scatter(x=ts_dates[-1:], y=[target_], 142 | # line_color='green', 143 | # mode='markers', marker_size=10,
name='actual value') 144 | targets_fig = px.line(x=ts_dates_targets, y=ts_values_targets, 145 | template='plotly_dark', 146 | markers=True, title='actual values') 147 | targets_fig.update_traces(line_color='green') 148 | fig.add_traces(targets_fig.data) 149 | #fig.show() 150 | 151 | if predictions is not None: 152 | # orange line for the predicted values, if passed 153 | # prediction_ = predictions.iloc[example_id] 154 | # fig.add_scatter(x=ts_dates[-1:], y=[prediction_], 155 | # line_color='red', 156 | # mode='markers', marker_symbol='x', marker_size=15, 157 | # name='prediction') 158 | prediction_ = predictions.iloc[example_id] 159 | prediction_fig = px.line(x=ts_dates_targets, y=prediction_.values.tolist(), 160 | template='plotly_dark', 161 | markers=True, title='predicted values') 162 | prediction_fig.update_traces(line_color='darkorange') 163 | fig.add_traces(prediction_fig.data) 164 | 165 | return fig 166 | 167 | def plot_ts( 168 | ts_data: pd.DataFrame, 169 | locations: Optional[List[int]] = None 170 | ): 171 | """ 172 | Plot time-series data 173 | """ 174 | ts_data_to_plot = ts_data[ts_data.pickup_location_id.isin(locations)] if locations else ts_data 175 | 176 | fig = px.line( 177 | ts_data_to_plot, 178 | x="pickup_hour", 179 | y="rides", 180 | color='pickup_location_id', 181 | template='none', 182 | ) 183 | 184 | fig.show() 185 | --------------------------------------------------------------------------------
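
A minimal usage sketch (not part of the repository) of how the `plot_one_sample` helper from src/plot.py could be exercised with toy data; the dataframes, column values and the commented registry call are illustrative assumptions, while the function signatures match the code above.

# Hypothetical example -- toy data and the registry model name are assumptions, not repo code.
import pandas as pd

from src.plot import plot_one_sample
# To pull a real model first (requires Comet ML credentials in src/config.py):
# from src.model_registry_api import get_latest_model_from_registry
# model = get_latest_model_from_registry(model_name='<registry_model_name>', status='Production')

# One example with two past hours of rides and one target hour, using the
# `rides_previous_*_hour` / `rides_next_*_hour` naming that plot_one_sample expects.
features = pd.DataFrame({
    'rides_previous_2_hour': [3.0],
    'rides_previous_1_hour': [5.0],
    'pickup_hour': [pd.Timestamp('2023-01-01 10:00')],
    'pickup_location_id': [42],
})
targets = pd.DataFrame({'rides_next_1_hour': [4.0]})
predictions = pd.DataFrame({'rides_next_1_hour': [4.6]})  # stand-in for real model output

fig = plot_one_sample(features=features, targets=targets, example_id=0, predictions=predictions)
fig.show()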