├── data └── .gitkeep ├── .dockerignore ├── requirements-dev.txt ├── docs ├── Proposta tirocinio.pdf └── Proposta tirocinio.tex ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── Dockerfile ├── src ├── scraper │ ├── tests │ │ ├── data │ │ │ ├── train-stop_10911.json │ │ │ ├── train-stop_24955.json │ │ │ ├── station_S01608.json │ │ │ ├── train-stop_52.json │ │ │ ├── station_S01700.json │ │ │ ├── train-stop_22662.json │ │ │ ├── train-stop_10860.json │ │ │ ├── train-stop_3073.json │ │ │ ├── train-stop_555.json │ │ │ └── train-stops_2647.json │ │ ├── __init__.py │ │ ├── test_api.py │ │ ├── test_train_stop.py │ │ ├── test_station.py │ │ └── test_train.py │ ├── __init__.py │ ├── exceptions.py │ ├── main.py │ ├── api.py │ ├── station.py │ └── train_stop.py ├── analysis │ ├── __init__.py │ ├── assets │ │ ├── markers │ │ │ ├── trenord_reg.svg │ │ │ ├── obb_ec.svg │ │ │ ├── other.svg │ │ │ ├── trenitalia_fb.svg │ │ │ ├── trenitalia_ic.svg │ │ │ ├── trenitalia_icn.svg │ │ │ ├── trenitalia_reg.svg │ │ │ ├── tper_reg.svg │ │ │ ├── trenitalia_ec.svg │ │ │ ├── trenitalia_fa.svg │ │ │ └── trenitalia_fr.svg │ │ └── templates │ │ │ ├── stats_chart.html │ │ │ └── marker_legend.html │ ├── groupby.py │ ├── filter.py │ ├── timetable.py │ ├── load_data.py │ ├── stat.py │ ├── main.py │ └── trajectories_map.py ├── __init__.py ├── types.py ├── utils.py ├── const.py ├── station_extractor.py └── train_extractor.py ├── CONTRIBUTING.md ├── requirements.txt ├── .github └── workflows │ └── docker-build.yml ├── main.py ├── .gitignore └── README.md /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data/* 2 | venv/* 3 | -------------------------------------------------------------------------------- /requirements-dev.txt: 
-------------------------------------------------------------------------------- 1 | pre-commit==3.1.1 2 | pytest==7.2.2 3 | black==23.1.0 4 | isort==5.12.0 5 | -------------------------------------------------------------------------------- /docs/Proposta tirocinio.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MarcoBuster/railway-opendata/HEAD/docs/Proposta tirocinio.pdf -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | default: 2 | image: python:3.11 3 | 4 | before_script: 5 | - pip install -r requirements.txt 6 | - pip install -r requirements-dev.txt 7 | 8 | stages: 9 | - test 10 | - lint 11 | 12 | pytest: 13 | stage: test 14 | script: 15 | - pytest src/ 16 | when: always 17 | 18 | black-linter: 19 | stage: lint 20 | script: 21 | - black --check src/ 22 | when: always 23 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | 9 | - repo: https://github.com/psf/black 10 | rev: 23.1.0 11 | hooks: 12 | - id: black 13 | 14 | - repo: https://github.com/pycqa/isort 15 | rev: 5.12.0 16 | hooks: 17 | - id: isort 18 | name: isort (python) 19 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.13 2 | 3 | LABEL org.opencontainers.image.source=https://github.com/MarcoBuster/railway-opendata 4 | LABEL org.opencontainers.image.description="Italian railway opendata scraper and analyzer" 5 | LABEL 
org.opencontainers.image.licenses=GPL-2.0-or-later 6 | 7 | WORKDIR /app 8 | 9 | COPY requirements.txt . 10 | RUN pip install -r requirements.txt 11 | 12 | VOLUME /app/data 13 | ENV PYTHONHASHSEED=0 14 | 15 | COPY . . 16 | 17 | ENTRYPOINT ["python", "main.py"] 18 | -------------------------------------------------------------------------------- /src/scraper/tests/data/train-stop_10911.json: -------------------------------------------------------------------------------- 1 | { 2 | "arr_time": "01:35:00", 3 | "station": { 4 | "station_id": "S09999", 5 | "station_ori_name": "BRESCIA" 6 | }, 7 | "type": "D", 8 | "is_journey": true, 9 | "actual_data": { 10 | "actual_station_mir": "S09999", 11 | "actual_station_name": "BRESCIA", 12 | "actual_train_id": "110911", 13 | "actual_type": "D", 14 | "arr_delay_actual": 1, 15 | "arr_actual_time": "01:36:00" 16 | }, 17 | "cancelled": false, 18 | "platform": "3", 19 | "pass_count": 23, 20 | "date": "20230325" 21 | } 22 | -------------------------------------------------------------------------------- /src/scraper/tests/data/train-stop_24955.json: -------------------------------------------------------------------------------- 1 | { 2 | "dep_time": "14:35:00", 3 | "station": { 4 | "station_id": "S01933", 5 | "station_ori_name": "SARONNO" 6 | }, 7 | "type": "O", 8 | "is_journey": true, 9 | "actual_data": { 10 | "actual_station_mir": "S01933", 11 | "actual_station_name": "SARONNO", 12 | "actual_train_id": "124955", 13 | "actual_type": "O", 14 | "dep_actual_time": "14:37:34", 15 | "dep_delay_actual": 2 16 | }, 17 | "cancelled": false, 18 | "platform": "7", 19 | "pass_count": 1, 20 | "date": "20230325" 21 | } 22 | -------------------------------------------------------------------------------- /src/scraper/tests/data/station_S01608.json: -------------------------------------------------------------------------------- 1 | { 2 | "codReg": 1, 3 | "tipoStazione": 3, 4 | "dettZoomStaz": [], 5 | "pstaz": [], 6 | "mappaCitta": { 7 | 
"urlImagePinpoint": "", 8 | "urlImageBaloon": "" 9 | }, 10 | "codiceStazione": "S01608", 11 | "codStazione": "S01608", 12 | "lat": 45.577162, 13 | "lon": 9.606652, 14 | "latMappaCitta": 0, 15 | "lonMappaCitta": 0, 16 | "localita": { 17 | "nomeLungo": "ARCENE", 18 | "nomeBreve": "Arcene", 19 | "label": "", 20 | "id": "S01608" 21 | }, 22 | "esterno": false, 23 | "offsetX": 0, 24 | "offsetY": 0, 25 | "nomeCitta": "A" 26 | } 27 | -------------------------------------------------------------------------------- /src/scraper/tests/data/train-stop_52.json: -------------------------------------------------------------------------------- 1 | { 2 | "arr_time": "14:10:30", 3 | "dep_time": "14:11:30", 4 | "station": { 5 | "station_id": "S01739", 6 | "station_ori_name": "VARESE CASBENO" 7 | }, 8 | "type": "F", 9 | "is_journey": true, 10 | "actual_data": { 11 | "actual_station_mir": "S01739", 12 | "actual_station_name": "VARESE CASBENO", 13 | "actual_train_id": "1900052", 14 | "actual_type": "F", 15 | "dep_actual_time": "14:17:43", 16 | "arr_delay_actual": 5, 17 | "arr_actual_time": "14:15:42", 18 | "dep_delay_actual": 6 19 | }, 20 | "cancelled": false, 21 | "platform": "2", 22 | "pass_count": 9, 23 | "date": "20230325" 24 | } 25 | -------------------------------------------------------------------------------- /src/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 
8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | -------------------------------------------------------------------------------- /src/scraper/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 
8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | from src import scraper 19 | -------------------------------------------------------------------------------- /src/types.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
# Type alias for values decoded from JSON documents.
# NOTE(review): t.Union[t.Any, X] collapses to plain Any for type checkers,
# so this alias is effectively documentation-only — confirm intent before
# tightening it to a stricter recursive JSON type.
JSONType = t.Union[t.Any, t.Dict[t.Any, t.Any]]
Due to the inability to redistribute scraped train data (see [Licensing](#licensing) section), there are tests ([pytest](https://pytest.org)) only for the scraping module: run them with
"zoomStopRange": 9, 9 | "pinpointVisibile": true, 10 | "pinpointVisible": true, 11 | "labelVisibile": true, 12 | "labelVisible": true, 13 | "codiceRegione": null 14 | }, 15 | { 16 | "codiceStazione": "S01700", 17 | "zoomStartRange": 10, 18 | "zoomStopRange": 11, 19 | "pinpointVisibile": true, 20 | "pinpointVisible": true, 21 | "labelVisibile": true, 22 | "labelVisible": true, 23 | "codiceRegione": null 24 | } 25 | ], 26 | "pstaz": [ 27 | 28 | ], 29 | "mappaCitta": { 30 | "urlImagePinpoint": "", 31 | "urlImageBaloon": "" 32 | }, 33 | "codiceStazione": "S01700", 34 | "codStazione": "S01700", 35 | "lat": 45.486347, 36 | "lon": 9.204528, 37 | "latMappaCitta": 0, 38 | "lonMappaCitta": 0, 39 | "localita": { 40 | "nomeLungo": "MILANO CENTRALE", 41 | "nomeBreve": "Milano Centrale", 42 | "label": "Milano", 43 | "id": "S01700" 44 | }, 45 | "esterno": false, 46 | "offsetX": 0, 47 | "offsetY": 0, 48 | "nomeCitta": "Milano" 49 | } 50 | -------------------------------------------------------------------------------- /src/scraper/tests/data/train-stop_22662.json: -------------------------------------------------------------------------------- 1 | { 2 | "orientamento": null, 3 | "kcNumTreno": null, 4 | "stazione": "TREVIGLIO", 5 | "id": "S01708", 6 | "listaCorrispondenze": null, 7 | "programmata": 1678639800000, 8 | "programmataZero": null, 9 | "effettiva": null, 10 | "ritardo": 0, 11 | "partenzaTeoricaZero": null, 12 | "arrivoTeoricoZero": null, 13 | "partenza_teorica": null, 14 | "arrivo_teorico": 1678639800000, 15 | "isNextChanged": false, 16 | "partenzaReale": null, 17 | "arrivoReale": null, 18 | "ritardoPartenza": 0, 19 | "ritardoArrivo": 0, 20 | "progressivo": 7, 21 | "binarioEffettivoArrivoCodice": null, 22 | "binarioEffettivoArrivoTipo": null, 23 | "binarioEffettivoArrivoDescrizione": null, 24 | "binarioProgrammatoArrivoCodice": null, 25 | "binarioProgrammatoArrivoDescrizione": "2 TR Ovest", 26 | "binarioEffettivoPartenzaCodice": null, 27 | "binarioEffettivoPartenzaTipo": 
null, 28 | "binarioEffettivoPartenzaDescrizione": null, 29 | "binarioProgrammatoPartenzaCodice": null, 30 | "binarioProgrammatoPartenzaDescrizione": null, 31 | "tipoFermata": "A", 32 | "visualizzaPrevista": true, 33 | "nextChanged": false, 34 | "nextTrattaType": 2, 35 | "actualFermataType": 0, 36 | "materiale_label": null 37 | } 38 | -------------------------------------------------------------------------------- /src/scraper/tests/data/train-stop_10860.json: -------------------------------------------------------------------------------- 1 | { 2 | "orientamento": null, 3 | "kcNumTreno": null, 4 | "stazione": "PIACENZA", 5 | "id": "S05000", 6 | "listaCorrispondenze": null, 7 | "programmata": 1678608420000, 8 | "programmataZero": null, 9 | "effettiva": 1678608450000, 10 | "ritardo": 1, 11 | "partenzaTeoricaZero": null, 12 | "arrivoTeoricoZero": null, 13 | "partenza_teorica": 1678608420000, 14 | "arrivo_teorico": null, 15 | "isNextChanged": false, 16 | "partenzaReale": 1678608450000, 17 | "arrivoReale": null, 18 | "ritardoPartenza": 1, 19 | "ritardoArrivo": 0, 20 | "progressivo": 1, 21 | "binarioEffettivoArrivoCodice": "0", 22 | "binarioEffettivoArrivoTipo": "0", 23 | "binarioEffettivoArrivoDescrizione": "5", 24 | "binarioProgrammatoArrivoCodice": null, 25 | "binarioProgrammatoArrivoDescrizione": null, 26 | "binarioEffettivoPartenzaCodice": "0", 27 | "binarioEffettivoPartenzaTipo": "0", 28 | "binarioEffettivoPartenzaDescrizione": "5", 29 | "binarioProgrammatoPartenzaCodice": null, 30 | "binarioProgrammatoPartenzaDescrizione": "5", 31 | "tipoFermata": "P", 32 | "visualizzaPrevista": true, 33 | "nextChanged": false, 34 | "nextTrattaType": 0, 35 | "actualFermataType": 1, 36 | "materiale_label": null 37 | } 38 | -------------------------------------------------------------------------------- /src/scraper/tests/data/train-stop_3073.json: -------------------------------------------------------------------------------- 1 | { 2 | "orientamento": null, 3 | "kcNumTreno": null, 
4 | "stazione": "ARQUATA SCRIVIA", 5 | "id": "S04207", 6 | "listaCorrispondenze": null, 7 | "programmata": 1678639380000, 8 | "programmataZero": null, 9 | "effettiva": null, 10 | "ritardo": 0, 11 | "partenzaTeoricaZero": null, 12 | "arrivoTeoricoZero": null, 13 | "partenza_teorica": 1678639440000, 14 | "arrivo_teorico": 1678639380000, 15 | "isNextChanged": false, 16 | "partenzaReale": null, 17 | "arrivoReale": null, 18 | "ritardoPartenza": 0, 19 | "ritardoArrivo": 0, 20 | "progressivo": 17, 21 | "binarioEffettivoArrivoCodice": null, 22 | "binarioEffettivoArrivoTipo": null, 23 | "binarioEffettivoArrivoDescrizione": null, 24 | "binarioProgrammatoArrivoCodice": null, 25 | "binarioProgrammatoArrivoDescrizione": "5", 26 | "binarioEffettivoPartenzaCodice": null, 27 | "binarioEffettivoPartenzaTipo": null, 28 | "binarioEffettivoPartenzaDescrizione": null, 29 | "binarioProgrammatoPartenzaCodice": null, 30 | "binarioProgrammatoPartenzaDescrizione": "5", 31 | "tipoFermata": "F", 32 | "visualizzaPrevista": true, 33 | "nextChanged": false, 34 | "nextTrattaType": 2, 35 | "actualFermataType": 0, 36 | "materiale_label": null 37 | } 38 | -------------------------------------------------------------------------------- /src/scraper/tests/data/train-stop_555.json: -------------------------------------------------------------------------------- 1 | { 2 | "orientamento": null, 3 | "kcNumTreno": null, 4 | "stazione": "LATINA", 5 | "id": "S08608", 6 | "listaCorrispondenze": null, 7 | "programmata": 1678629480000, 8 | "programmataZero": null, 9 | "effettiva": 1678629690000, 10 | "ritardo": 4, 11 | "partenzaTeoricaZero": null, 12 | "arrivoTeoricoZero": null, 13 | "partenza_teorica": 1678629600000, 14 | "arrivo_teorico": 1678629480000, 15 | "isNextChanged": false, 16 | "partenzaReale": 1678629810000, 17 | "arrivoReale": 1678629690000, 18 | "ritardoPartenza": 4, 19 | "ritardoArrivo": 4, 20 | "progressivo": 7, 21 | "binarioEffettivoArrivoCodice": "0", 22 | "binarioEffettivoArrivoTipo": "0", 23 
| "binarioEffettivoArrivoDescrizione": "2", 24 | "binarioProgrammatoArrivoCodice": null, 25 | "binarioProgrammatoArrivoDescrizione": null, 26 | "binarioEffettivoPartenzaCodice": "0", 27 | "binarioEffettivoPartenzaTipo": "0", 28 | "binarioEffettivoPartenzaDescrizione": "2", 29 | "binarioProgrammatoPartenzaCodice": null, 30 | "binarioProgrammatoPartenzaDescrizione": null, 31 | "tipoFermata": "F", 32 | "visualizzaPrevista": true, 33 | "nextChanged": false, 34 | "nextTrattaType": 0, 35 | "actualFermataType": 1, 36 | "materiale_label": null 37 | } 38 | -------------------------------------------------------------------------------- /.github/workflows/docker-build.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: [ master ] 4 | workflow_dispatch: 5 | 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | 10 | permissions: 11 | packages: write 12 | 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v2 16 | with: 17 | fetch-depth: 10 18 | 19 | - name: Set up QEMU 20 | uses: docker/setup-qemu-action@v2 21 | 22 | - name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v2 24 | 25 | - name: Login to ghcr registry 26 | uses: docker/login-action@v1 27 | with: 28 | registry: ghcr.io 29 | username: ${{ github.actor }} 30 | password: ${{ secrets.GITHUB_TOKEN }} 31 | 32 | - name: Prepare version info 33 | run: | 34 | echo "LATEST_COMMIT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV 35 | 36 | - name: Write version file 37 | run: echo "${{ env.LATEST_COMMIT_SHA }}" > version.txt 38 | 39 | - name: Build and push Docker images 40 | uses: docker/build-push-action@v4 41 | with: 42 | context: . 
class BadRequestException(Exception):
    """Raised when a request to the ViaggiaTreno API fails."""

    def __init__(
        self, url: str, status_code: int, response: str, *args: object
    ) -> None:
        """Create a BadRequestException.

        Args:
            url (str): the request URL
            status_code (int): the response status code
            response (str): the response data
        """
        super().__init__(*args)
        # Keep the failing request's details for callers that want to
        # log or inspect them.
        self.url = url
        self.status_code = status_code
        self.response = response


class IncompleteTrenordStopDataException(Exception):
    """Raised when Trenord stop data lacks fields required by the scraper."""

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
def _arg_or_default(args: argparse.Namespace, field: str, default: t.Any) -> t.Any:
    """Return ``args.<field>`` if present and truthy, otherwise ``default``."""
    value = getattr(args, field, None)
    return value if value else default


def parse_input_format_output_args(
    args: argparse.Namespace,
) -> t.Tuple[Path, Path, str]:
    """Extract (input file, output file, format) from parsed CLI arguments.

    Args:
        args (argparse.Namespace): parsed CLI arguments; must provide
            ``pickle_file`` and may provide ``format`` (default ``"csv"``)
            and ``output_file``.

    Returns:
        t.Tuple[Path, Path, str]: the input path, output path and format.
    """
    input_f: Path = Path(args.pickle_file)
    fmt: str = _arg_or_default(args, "format", "csv")
    # Default output path: same location, extension swapped for the format.
    # Path.with_suffix avoids the old str.replace("pickle", fmt) bug, which
    # rewrote EVERY occurrence of "pickle" in the name and corrupted file
    # names such as "pickle_stats.pickle". The fallback preserves the old
    # behaviour for inputs without a ".pickle" extension.
    if input_f.suffix == ".pickle":
        default_output: Path = input_f.with_suffix(f".{fmt}")
    else:
        default_output = input_f.parents[0] / input_f.name.replace("pickle", fmt)
    output_f: Path = Path(_arg_or_default(args, "output_file", default_output))
    return input_f, output_f, fmt
def train_hash(df: pd.DataFrame) -> DataFrameGroupBy:
    """Group the dataframe by the train hash."""
    return df.groupby("train_hash")


def client_code(df: pd.DataFrame) -> DataFrameGroupBy:
    """Group the dataframe by the client code, dropping 'OTHER' rows."""
    known = df[df["client_code"] != "OTHER"]
    return known.groupby("client_code")


def weekday(df: pd.DataFrame) -> DataFrameGroupBy:
    """Group the dataframe by the (departure) weekday."""
    # Localized day names (see LOCALE in src.const) become the group keys.
    df["weekday"] = df["day"].dt.day_name(locale=LOCALE)
    return df.groupby("weekday")


def agg_last(df_grouped: DataFrameGroupBy) -> pd.DataFrame:
    """Aggregate each group by taking its last row."""
    return df_grouped.last()


def agg_mean(df_grouped: DataFrameGroupBy) -> pd.DataFrame:
    """Aggregate each group by taking its mean."""
    return df_grouped.mean()
# Tests for the ViaggiaTreno API wrapper.
# NOTE(review): these tests call the live ViaggiaTreno service — they need
# network access and may fail if the upstream API is unavailable.


def test_bad_request():
    """An invalid endpoint/argument pair must raise BadRequestException."""
    with pytest.raises(BadRequestException):
        ViaggiaTrenoAPI._raw_request("invalid", "method")


def test_ok_request():
    """A well-formed request returns the raw response body as a string."""
    # "regione" for Milano Centrale (S01700) is region code 1
    # (matches codReg in tests/data/station_S01700.json).
    response: str = ViaggiaTrenoAPI._raw_request("regione", "S01700")
    assert type(response) == str
    assert response == "1"


@pytest.mark.parametrize(
    "kind, station_code",
    itertools.product(
        # Every combination of board kind (departures/arrivals) and station.
        ("partenze", "arrivi"),
        [
            "S01700",
            "S08409",
            "S09218",
            "S01608",
        ],
    ),
)
def test_station_departures_or_arrivals(kind: str, station_code: str):
    """Station boards parse into Train objects with the required fields set."""
    response: t.List[Train] = ViaggiaTrenoAPI._station_departures_or_arrivals(
        kind, station_code
    )
    for train in response:
        assert type(train) == Train
        assert train.number is not None
        assert train.origin is not None
47 | 48 | 49 | -------------------------------------------------------------------------------- /src/analysis/assets/markers/trenitalia_fb.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 14 | 16 | 20 | 21 | 22 | 25 | 31 | 37 | FB 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/analysis/assets/markers/trenitalia_ic.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 14 | 16 | 20 | 21 | 22 | 25 | 31 | 37 | IC 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/analysis/assets/markers/trenitalia_icn.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 14 | 16 | 20 | 21 | 22 | 25 | 31 | 37 | ICN 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/const.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | from enum import Enum 19 | 20 | from dateutil import tz 21 | 22 | # Global timezone used in all datetime calls. 
23 | TIMEZONE = tz.gettz("Europe/Rome") 24 | TIMEZONE_GMT = tz.gettz("GMT") 25 | 26 | # Intra-day split hour 27 | INTRADAY_SPLIT_HOUR: int = 4 28 | 29 | # Pandas locale 30 | LOCALE: str = "it_IT.utf-8" 31 | 32 | # Italian weekdays - see 'LOCALE' 33 | WEEKDAYS = { 34 | "Lunedì": 1, # Monday 35 | "Martedì": 2, # Tuesday 36 | "Mercoledì": 3, # Wednesday 37 | "Giovedì": 4, # Thursday 38 | "Venerdì": 5, # Friday 39 | "Sabato": 6, # Saturday 40 | "Domenica": 7, # Sunday 41 | } 42 | 43 | # Railway company palette 44 | RAILWAY_COMPANIES_PALETTE = { 45 | "TRENITALIA_REG": "#fa1b0f", 46 | "TRENORD": "#298044", 47 | "TPER": "#d014fa", 48 | "TRENITALIA_AV": "#c2152e", 49 | "TRENITALIA_IC": "#1b48f2", 50 | "OBB": "#464644", 51 | "OTHER": "#858585", 52 | } 53 | 54 | 55 | class RailwayCompany(Enum): 56 | """Italian railway companies codes.""" 57 | 58 | TRENITALIA_AV = 1 59 | TRENITALIA_REG = 2 60 | TRENITALIA_IC = 4 61 | TPER = 18 62 | TRENORD = 63 63 | OBB = 64 64 | OTHER = -1 65 | 66 | @classmethod 67 | def from_code(cls, code: int) -> str: 68 | try: 69 | instance: "RailwayCompany" = cls(code) 70 | except ValueError: 71 | instance: "RailwayCompany" = cls.OTHER 72 | return instance.name 73 | -------------------------------------------------------------------------------- /src/analysis/filter.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | from datetime import datetime 19 | 20 | import pandas as pd 21 | 22 | 23 | def date_filter( 24 | df: pd.DataFrame, start_date: datetime | None, end_date: datetime | None 25 | ) -> pd.DataFrame: 26 | """Filter dataframe by date (day). 27 | 28 | Args: 29 | df (pd.DataFrame): the considered dataframe 30 | start_date (datetime | None): the start date 31 | end_date (datetime | None): the end date 32 | 33 | Returns: 34 | pd.DataFrame: the filtered dataframe 35 | """ 36 | if isinstance(start_date, datetime): 37 | start_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0) 38 | df = df.loc[df.day >= start_date] 39 | if isinstance(end_date, datetime): 40 | end_date = end_date.replace(hour=0, minute=0, second=0, microsecond=0) 41 | df = df.loc[df.day <= end_date] 42 | return df 43 | 44 | 45 | def railway_company_filter( 46 | df: pd.DataFrame, railway_companies: str | None 47 | ) -> pd.DataFrame: 48 | """Filter dataframe by the railway company. 49 | 50 | Args: 51 | df (pd.DataFrame): the considered dataframe 52 | client_codes (str | None): a comma-separated list of client names 53 | 54 | Returns: 55 | pd.DataFrame: the filtered dataframe 56 | """ 57 | if not railway_companies or len(railway_companies) < 1: 58 | return df 59 | 60 | code_list: list[str] = [ 61 | s.strip().lower() for s in railway_companies.strip().split(",") if len(s) > 0 62 | ] 63 | return df.loc[df.client_code.str.lower().isin(code_list)] 64 | 65 | 66 | def railway_lines_filter(df: pd.DataFrame, lines: str | None): 67 | """Filter dataframe by the railway line. 
68 | 69 | Args: 70 | df (pd.DataFrame): the considered dataframe 71 | line (str | None): a comma-separated list of railway lines 72 | 73 | Returns: 74 | pd.DataFrame: the filtered dataframe 75 | """ 76 | if not lines or len(lines) < 1: 77 | return df 78 | 79 | line_list: list[str] = [ 80 | l.strip().upper() for l in lines.strip().split(",") if len(l) > 0 81 | ] 82 | return df.loc[df.line.isin(line_list)] 83 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
16 | 17 | 18 | import argparse 19 | import logging 20 | import os 21 | import sys 22 | 23 | import src.analysis.main as analysis 24 | import src.scraper.main as scraper 25 | from src import station_extractor, train_extractor 26 | 27 | parser = argparse.ArgumentParser( 28 | prog="train-scraper", 29 | ) 30 | subparsers = parser.add_subparsers(dest="subcommand", required=True) 31 | parser.add_argument("-d", "--debug", action="store_true", help="activate debug logs") 32 | 33 | scraper_p = subparsers.add_parser( 34 | "scraper", 35 | help="station and train data scraper", 36 | ) 37 | 38 | train_extractor.register_args( 39 | subparsers.add_parser( 40 | "train-extractor", 41 | help="convert scraped train data", 42 | ) 43 | ) 44 | station_extractor.register_args( 45 | subparsers.add_parser( 46 | "station-extractor", 47 | help="convert scraped station data", 48 | ) 49 | ) 50 | analysis.register_args( 51 | subparsers.add_parser( 52 | "analyze", 53 | help="data analyzer and visualizer", 54 | ) 55 | ) 56 | 57 | 58 | def main(): 59 | print( 60 | "railway-opendata, Copyright (C) 2023 Marco Aceti" 61 | "\nrailway-opendata comes with ABSOLUTELY NO WARRANTY; " 62 | "for details read the LICENSE." 63 | ) 64 | print() 65 | 66 | hashseed: str | None = os.getenv("PYTHONHASHSEED") 67 | if not hashseed or hashseed != "0": 68 | logging.critical( 69 | "Hash seed randomization is not disabled. " 70 | "Please disable it by setting the PYTHONHASHSEED=0 environment variable." 
71 | ) 72 | sys.exit(1) 73 | 74 | args: argparse.Namespace = parser.parse_args() 75 | 76 | logging.basicConfig( 77 | stream=sys.stdout, 78 | format="[%(asctime)s - %(levelname)s] %(message)s", 79 | level=logging.INFO if not args.debug else logging.DEBUG, 80 | ) 81 | 82 | if args.subcommand == "scraper": 83 | scraper.main() 84 | 85 | if args.subcommand == "train-extractor": 86 | train_extractor.main(args) 87 | 88 | if args.subcommand == "station-extractor": 89 | station_extractor.main(args) 90 | 91 | if args.subcommand == "analyze": 92 | analysis.main(args) 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /src/analysis/assets/markers/trenitalia_reg.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 16 | 36 | 38 | 41 | 45 | 46 | 47 | 52 | 69 | RE 81 | 82 | 83 | -------------------------------------------------------------------------------- /src/analysis/assets/markers/tper_reg.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 39 | 41 | 44 | 48 | 49 | 50 | 55 | 73 | RE 85 | 86 | 87 | -------------------------------------------------------------------------------- /src/scraper/tests/test_train_stop.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | import json 19 | import pathlib 20 | from datetime import datetime 21 | 22 | import pytest 23 | 24 | from src import types 25 | from src.scraper.train_stop import TrainStop, TrainStopTime 26 | 27 | DATA_DIR = pathlib.Path("src/scraper/tests/data") 28 | 29 | 30 | t1 = datetime(year=2023, month=1, day=1, hour=12, minute=00, second=0) 31 | t2 = datetime(year=2023, month=1, day=1, hour=12, minute=5, second=30) 32 | t3 = datetime(year=2023, month=1, day=1, hour=12, minute=6, second=0) 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "expected, actual, passed, delay", 37 | [ 38 | (t1, t2, True, 5.5), 39 | (t1, None, False, None), 40 | (t3, t2, True, -0.5), 41 | (t3, t1, True, -6), 42 | ], 43 | ) 44 | def test_stop_time( 45 | expected: datetime, actual: datetime | None, passed: bool, delay: int | None 46 | ): 47 | stop_time: TrainStopTime = TrainStopTime(expected=expected, actual=actual) 48 | assert stop_time.passed() == passed 49 | assert stop_time.delay() == delay 50 | 51 | 52 | def test_stop_time_assumption(): 53 | with pytest.raises(AssertionError): 54 | TrainStopTime(None, actual=t1) # type: ignore 55 | 56 | 57 | @pytest.mark.parametrize( 58 | "data_file, expected_repr", 59 | [ 60 | ("train-stop_10860.json", "@ (P) Piacenza 09:07 ~ 09:07 +0.5m [5 ~ 5]"), 61 | ("train-stop_3073.json", "@ (F) Arquata Scrivia 17:43 --> 17:44 [5]"), 62 | ( 63 | "train-stop_555.json", 64 | "@ (F) Latina 14:58 ~ 15:01 +3.5m --> 15:00 ~ 15:03 +3.5m [? 
~ 2]", 65 | ), 66 | ("train-stop_22662.json", "@ (A) Treviglio 17:50 [2 TR Ovest]"), 67 | ], 68 | ) 69 | def test_stop_repr(data_file, expected_repr): 70 | with open(DATA_DIR / data_file, "r") as f: 71 | data: types.JSONType = json.load(f) 72 | 73 | stop: TrainStop = TrainStop._from_raw_data(stop_data=data) 74 | assert repr(stop) == expected_repr 75 | 76 | 77 | @pytest.mark.parametrize( 78 | "data_file, expected_repr", 79 | [ 80 | ("train-stop_24955.json", "@ (P) Saronno 14:35 ~ 14:37 +2.6m [7]"), 81 | ( 82 | "train-stop_52.json", 83 | "@ (F) Varese Casbeno 14:10 ~ 14:15 +5.2m --> 14:11 ~ 14:17 +6.2m [2]", 84 | ), 85 | ("train-stop_10911.json", "@ (A) Brescia 01:35 ~ 01:36 +1.0m [3]"), 86 | ], 87 | ) 88 | def test_stop_trenord(data_file, expected_repr): 89 | with open(DATA_DIR / data_file, "r") as f: 90 | data: types.JSONType = json.load(f) 91 | 92 | stop: TrainStop | None = TrainStop._from_trenord_raw_data( 93 | stop_data=data, day=datetime.now().date() 94 | ) 95 | assert repr(stop) == expected_repr 96 | -------------------------------------------------------------------------------- /src/scraper/tests/data/train-stops_2647.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "dep_time": "23:25:00", 4 | "station": { 5 | "station_id": "S01700", 6 | "station_ori_name": "MILANO CENTRALE" 7 | }, 8 | "type": "O", 9 | "is_journey": true, 10 | "cancelled": false, 11 | "actual_data": { 12 | "dep_actual_time": "23:27:00" 13 | } 14 | }, 15 | { 16 | "arr_time": "23:31:00", 17 | "dep_time": "23:33:00", 18 | "station": { 19 | "station_id": "S01701", 20 | "station_ori_name": "MILANO LAMBRATE" 21 | }, 22 | "type": "F", 23 | "is_journey": true, 24 | "cancelled": false, 25 | "actual_data": { 26 | "arr_actual_time": "23:33:00", 27 | "dep_actual_time": "23:35:00" 28 | } 29 | }, 30 | { 31 | "arr_time": "23:39:00", 32 | "dep_time": "23:40:00", 33 | "station": { 34 | "station_id": "S01703", 35 | "station_ori_name": "PIOLTELLO LIMITO" 36 
| }, 37 | "type": "F", 38 | "is_journey": true, 39 | "cancelled": false, 40 | "actual_data": { 41 | "arr_actual_time": "23:40:00", 42 | "dep_actual_time": "23:43:00" 43 | } 44 | }, 45 | { 46 | "arr_time": "23:54:00", 47 | "dep_time": "23:56:00", 48 | "station": { 49 | "station_id": "S01708", 50 | "station_ori_name": "TREVIGLIO" 51 | }, 52 | "type": "F", 53 | "is_journey": true, 54 | "cancelled": false, 55 | "actual_data": { 56 | "arr_actual_time": "23:54:00", 57 | "dep_actual_time": "23:56:00" 58 | } 59 | }, 60 | { 61 | "arr_time": "00:04:00", 62 | "dep_time": "00:05:00", 63 | "station": { 64 | "station_id": "S01711", 65 | "station_ori_name": "ROMANO" 66 | }, 67 | "type": "F", 68 | "is_journey": true, 69 | "cancelled": false, 70 | "actual_data": { 71 | "arr_actual_time": "00:03:00", 72 | "dep_actual_time": "00:05:00" 73 | } 74 | }, 75 | { 76 | "arr_time": "00:13:00", 77 | "dep_time": "00:14:00", 78 | "station": { 79 | "station_id": "S01713", 80 | "station_ori_name": "CHIARI" 81 | }, 82 | "type": "F", 83 | "is_journey": true, 84 | "cancelled": false, 85 | "actual_data": { 86 | "dep_actual_time": "00:15:00" 87 | } 88 | }, 89 | { 90 | "arr_time": "00:19:00", 91 | "dep_time": "00:20:00", 92 | "station": { 93 | "station_id": "S01714", 94 | "station_ori_name": "ROVATO" 95 | }, 96 | "type": "F", 97 | "is_journey": true, 98 | "cancelled": false, 99 | "actual_data": { 100 | "arr_actual_time": "00:18:00", 101 | "dep_actual_time": "00:20:00" 102 | } 103 | }, 104 | { 105 | "arr_time": "00:31:00", 106 | "dep_time": "00:33:00", 107 | "station": { 108 | "station_id": "S09999", 109 | "station_ori_name": "BRESCIA" 110 | }, 111 | "type": "F", 112 | "is_journey": true, 113 | "cancelled": false, 114 | "actual_data": { 115 | "arr_actual_time": "00:29:00", 116 | "dep_actual_time": "00:33:00" 117 | } 118 | }, 119 | { 120 | "arr_time": "00:48:00", 121 | "dep_time": "00:49:00", 122 | "station": { 123 | "station_id": "S02084", 124 | "station_ori_name": "DESENZANO DEL GARDA-SIRMIONE" 125 | 
}, 126 | "type": "F", 127 | "is_journey": true, 128 | "cancelled": false, 129 | "actual_data": { 130 | "arr_actual_time": "00:48:00", 131 | "dep_actual_time": "00:50:00" 132 | } 133 | }, 134 | { 135 | "arr_time": "00:57:00", 136 | "dep_time": "00:58:00", 137 | "station": { 138 | "station_id": "S02088", 139 | "station_ori_name": "PESCHIERA DEL GARDA" 140 | }, 141 | "type": "F", 142 | "is_journey": true, 143 | "cancelled": false, 144 | "actual_data": { 145 | "arr_actual_time": "00:56:00", 146 | "dep_actual_time": "00:58:00" 147 | } 148 | }, 149 | { 150 | "arr_time": "01:17:00", 151 | "station": { 152 | "station_id": "S02430", 153 | "station_ori_name": "VERONA PORTA NUOVA" 154 | }, 155 | "type": "D", 156 | "is_journey": true, 157 | "cancelled": false, 158 | "actual_data": { 159 | "arr_actual_time": "01:17:00" 160 | } 161 | } 162 | ] 163 | -------------------------------------------------------------------------------- /src/analysis/assets/markers/trenitalia_ec.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 39 | 41 | 44 | 48 | 49 | 50 | 55 | 73 | 91 | EC 104 | 105 | 106 | -------------------------------------------------------------------------------- /src/analysis/assets/markers/trenitalia_fa.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 39 | 41 | 44 | 48 | 49 | 50 | 55 | 73 | 91 | FA 104 | 105 | 106 | -------------------------------------------------------------------------------- /src/analysis/assets/markers/trenitalia_fr.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 39 | 41 | 44 | 48 | 49 | 50 | 55 | 73 | 91 | FR 104 | 105 | 106 | -------------------------------------------------------------------------------- /src/station_extractor.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # 
Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | import argparse 19 | import csv 20 | import pickle 21 | from pathlib import Path 22 | 23 | from geojson import Feature, FeatureCollection, Point 24 | 25 | from src.scraper.station import Station 26 | from src.utils import parse_input_format_output_args 27 | 28 | 29 | def load_file(file: Path) -> dict[str, Station]: 30 | """Load a station data pickle file and return it. 31 | 32 | Args: 33 | file (Path): the file to load 34 | 35 | Returns: 36 | dict[str, Station]: the station data contained in the file 37 | """ 38 | with open(file, "rb") as f: 39 | data: dict[str, Station] = pickle.load(f) 40 | 41 | return data 42 | 43 | 44 | def to_csv(data: dict[str, Station], output_file: Path) -> None: 45 | """Convert to CSV station data, one row per station. 
46 | 47 | Args: 48 | data (dict[int, Station]): the data to convert 49 | output_file (Path): the file to write 50 | """ 51 | FIELDS: tuple = ( 52 | "code", 53 | "region", 54 | "long_name", 55 | "short_name", 56 | "latitude", 57 | "longitude", 58 | ) 59 | 60 | csvfile = open(output_file, "w+", newline="") 61 | writer = csv.writer( 62 | csvfile, 63 | delimiter=",", 64 | quotechar="|", 65 | quoting=csv.QUOTE_MINIMAL, 66 | ) 67 | writer.writerow(FIELDS) 68 | 69 | for station_c in data: 70 | station: Station = data[station_c] 71 | writer.writerow( 72 | ( 73 | station.code, 74 | station.region_code, 75 | station.name, 76 | station.short_name if hasattr(station, "short_name") else None, 77 | station.position[0] if station.position else None, 78 | station.position[1] if station.position else None, 79 | ) 80 | ) 81 | csvfile.close() 82 | 83 | 84 | def to_geojson(data: dict[str, Station], output_file: Path) -> None: 85 | feature_list: list[Feature] = list() 86 | 87 | for station_c in data: 88 | station: Station = data[station_c] 89 | if not station.position: 90 | continue 91 | 92 | feature: Feature = Feature( 93 | geometry=Point((station.position[1], station.position[0])), 94 | properties={ 95 | "code": station.code, 96 | "name": station.name, 97 | "short_name": station.short_name 98 | if hasattr(station, "short_name") 99 | else None, 100 | "region": station.region_code, 101 | }, 102 | ) 103 | feature_list.append(feature) 104 | 105 | collection: FeatureCollection = FeatureCollection(feature_list) 106 | with open(output_file, "w+") as f: 107 | f.write(str(collection)) 108 | 109 | 110 | def register_args(parser: argparse.ArgumentParser): 111 | parser.add_argument( 112 | "pickle_file", 113 | help=".pickle file to parse", 114 | metavar="PICKLE_FILE", 115 | ) 116 | parser.add_argument( 117 | "-f", 118 | default="csv", 119 | choices=["csv", "geojson"], 120 | help="output file format", 121 | dest="format", 122 | ) 123 | parser.add_argument( 124 | "-o", 125 | help="output file 
name", 126 | metavar="OUTPUT_FILE", 127 | dest="output_file", 128 | ) 129 | 130 | 131 | def main(args: argparse.Namespace): 132 | input_f, output_f, format = parse_input_format_output_args(args) 133 | 134 | data: dict[str, Station] = load_file(input_f) 135 | 136 | if format == "csv": 137 | to_csv(data, output_f) 138 | 139 | if format == "geojson": 140 | to_geojson(data, output_f) 141 | -------------------------------------------------------------------------------- /src/scraper/tests/test_station.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
16 | 17 | 18 | import json 19 | import pathlib 20 | import typing as t 21 | 22 | import pytest 23 | 24 | from src import types 25 | from src.scraper import BadRequestException 26 | from src.scraper.station import Station 27 | 28 | DATA_DIR = pathlib.Path("src/scraper/tests/data") 29 | 30 | 31 | @pytest.mark.parametrize( 32 | "station_file, expected", 33 | [ 34 | ( 35 | "station_S01700.json", 36 | { 37 | "code": "S01700", 38 | "region_code": 1, 39 | "name": "Milano Centrale", 40 | "short_name": "Milano Centrale", 41 | "position": (45.486347, 9.204528), 42 | }, 43 | ), 44 | ( 45 | "station_S01608.json", 46 | { 47 | "code": "S01608", 48 | "region_code": 1, 49 | "name": "Arcene", 50 | "short_name": "Arcene", 51 | "position": (45.577162, 9.606652), 52 | }, 53 | ), 54 | ], 55 | ) 56 | def test_init(station_file: str, expected: dict): 57 | with open(DATA_DIR / station_file, "r") as f: 58 | data: types.JSONType = json.load(f) 59 | 60 | station = Station._from_raw(data) 61 | assert station.code == expected["code"] 62 | assert station.region_code == expected["region_code"] 63 | assert station.name == expected["name"] 64 | assert station.short_name == expected["short_name"] 65 | assert station.position == expected["position"] 66 | 67 | 68 | @pytest.mark.parametrize("region_code", range(0, 22 + 1)) 69 | def test_assumptions(region_code): 70 | """For each station returned by the API, we assume there is no None field.""" 71 | response: t.List[Station] = Station.by_region(region_code) 72 | for station in response: 73 | assert station.code is not None 74 | assert station.name is not None 75 | assert station.short_name is not None 76 | assert station.position is not None 77 | 78 | 79 | @pytest.mark.parametrize( 80 | "station_code, station_name", 81 | [ 82 | ("S01700", "Milano Centrale"), 83 | ("S08409", "Roma Termini"), 84 | ("S09218", "Napoli Centrale"), 85 | ("S01608", "Arcene"), 86 | ], 87 | ) 88 | def test_by_code(station_code, station_name): 89 | station: Station = 
Station.by_code(station_code) 90 | assert station.code == station_code 91 | assert station.name == station_name 92 | 93 | 94 | @pytest.mark.parametrize( 95 | "station_code, region_code", 96 | [ 97 | ("S01700", 1), # Milano Centrale 98 | ("S08409", 5), # Roma Termini 99 | ("S09218", 18), # Napoli Centrale 100 | ("S01608", 1), # Arcene 101 | ], 102 | ) 103 | def test_station_region_code(station_code, region_code): 104 | response: int = Station._region_code(station_code) 105 | assert type(response) == int 106 | assert response == region_code 107 | 108 | 109 | def test_station_region_code_invalid(): 110 | with pytest.raises(BadRequestException): 111 | Station._region_code("S00000") 112 | 113 | 114 | @pytest.mark.parametrize("region_code", range(0, 22 + 1)) 115 | def test_by_region(region_code): 116 | response: t.List[Station] = Station.by_region(region_code) 117 | for station in response: 118 | assert type(station) == Station 119 | try: 120 | assert station.region_code == region_code 121 | except AssertionError: 122 | # Recheck with the *actually* correct _region_code: 123 | # sometimes the 'elencoStazioni' call can be misleading. 124 | assert station.region_code == Station._region_code(station.code) 125 | 126 | 127 | def test_hash(): 128 | milan: Station = Station.by_code("S01700") 129 | rome: Station = Station.by_code("S08409") 130 | assert hash(milan) != hash(rome) 131 | -------------------------------------------------------------------------------- /src/scraper/tests/test_train.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 
8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | import itertools 19 | import json 20 | import pathlib 21 | import typing as t 22 | from datetime import date, datetime 23 | 24 | import pytest 25 | 26 | from src import types 27 | from src.scraper.station import Station 28 | from src.scraper.train import Train 29 | from src.scraper.train_stop import TrainStop, TrainStopTime 30 | 31 | DATA_DIR = pathlib.Path("src/scraper/tests/data") 32 | 33 | 34 | @pytest.mark.parametrize( 35 | "kind, station_code", 36 | itertools.product( 37 | ("partenze", "arrivi"), 38 | [ 39 | "S01700", 40 | "S08409", 41 | "S09218", 42 | "S01608", 43 | "N00001", 44 | "N00005", 45 | ], 46 | ), 47 | ) 48 | def test_fetch(kind, station_code): 49 | station: Station = Station.by_code(station_code) 50 | trains: t.List[Train] = ( 51 | station.departures() if kind == "partenze" else station.arrivals() 52 | ) 53 | for train in trains: 54 | train.fetch() 55 | if ( 56 | not train.departed 57 | and not train._phantom 58 | and not train._trenord_phantom 59 | and not train.cancelled 60 | ): 61 | assert not train.arrived() 62 | 63 | 64 | def test_unfetched_repr_1(): 65 | milan: Station = Station.by_code("S01700") 66 | train: Train = Train(10911, milan, datetime.now().date()) 67 | assert repr(train) == "Treno [?] ??? 10911 : Milano Centrale [S01700@1] -> ???" 
68 | 69 | 70 | def test_unfetched_repr_2(): 71 | train: Train = Train._from_station_departures_arrivals( 72 | { 73 | "numeroTreno": 10911, 74 | "codOrigine": "S01700", 75 | "categoriaDescrizione": "REG", 76 | "dataPartenzaTreno": 1678662000000, 77 | "codiceCliente": 1, 78 | "nonPartito": False, 79 | "provvedimento": 0, 80 | "compImgCambiNumerazione": "", 81 | } 82 | ) 83 | assert repr(train) == "Treno [D] REG 10911 : Milano Centrale [S01700@1] -> ???" 84 | 85 | 86 | def test_hash(): 87 | milan: Station = Station.by_code("S01700") 88 | trains: list[Train] = milan.departures() 89 | if not trains: 90 | return 91 | assert hash(trains[0]) is not None 92 | 93 | 94 | def test_fix_intraday_datetimes(): 95 | milan: Station = Station.by_code("S01700") 96 | mock_train: Train = Train(2647, milan, date(year=2023, month=3, day=25)) 97 | 98 | mock_train.category = "REG" 99 | mock_train.destination = Station.by_code("S02430") 100 | mock_train._phantom = False 101 | mock_train._trenord_phantom = False 102 | mock_train.cancelled = False 103 | mock_train._fetched = datetime.now() 104 | 105 | with open(DATA_DIR / "train-stops_2647.json") as f: 106 | stops: list[types.JSONType] = json.load(f) 107 | 108 | mock_train.stops = list() 109 | for stop in stops: 110 | fetched_stop = TrainStop._from_trenord_raw_data( 111 | stop, day=mock_train.departing_date 112 | ) 113 | if fetched_stop: 114 | mock_train.stops.append(fetched_stop) 115 | 116 | assert len(mock_train.stops) == 11 117 | 118 | mock_train._fix_intraday_datetimes() 119 | 120 | for i, stop in enumerate(mock_train.stops): 121 | expected_day = 25 if i < 4 else 26 122 | 123 | if i != 0: 124 | assert isinstance(stop.arrival, TrainStopTime) 125 | assert stop.arrival.expected.day == expected_day 126 | if isinstance(stop.arrival.actual, datetime): 127 | assert stop.arrival.actual.day == expected_day 128 | 129 | if i != len(mock_train.stops) - 1: 130 | assert isinstance(stop.departure, TrainStopTime) 131 | assert stop.departure.expected.day 
== expected_day 132 | if isinstance(stop.departure.actual, datetime): 133 | assert stop.departure.actual.day == expected_day 134 | -------------------------------------------------------------------------------- /src/analysis/timetable.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | import matplotlib.dates as mdates 19 | import matplotlib.pyplot as plt 20 | import pandas as pd 21 | import timple 22 | 23 | from src.const import TIMEZONE, TIMEZONE_GMT 24 | 25 | 26 | def same_line(df: pd.DataFrame) -> bool: 27 | """Check if the trains in the provided DataFrame are ALL on the same line 28 | 29 | Args: 30 | df (pd.DataFrame): the trains to check 31 | 32 | Return: 33 | bool: True if the trains are all on the same line, False otherwise 34 | """ 35 | return df.line.nunique() <= 1 36 | 37 | 38 | def timetable_train(train: pd.DataFrame, expected: bool = False, collapse: bool = True): 39 | """Generate a timetable graph of a train 40 | 41 | Args: 42 | train (pd.DataFrame): the train stop data to consider 43 | expected (bool, optional): determines whatever to consider the 'expected' or 'actual' arrival/departure times. Defaults to False. 
44 | collapse (bool, optional): determines whatever to _collapse_ the times in the graph, relative to the first. Defaults to True. 45 | """ 46 | 47 | if collapse: 48 | train.value -= train.value.min() 49 | 50 | train_f = train.loc[ 51 | train.variable.str.endswith("expected" if expected else "actual") 52 | ] 53 | plt.plot( 54 | train_f.value, 55 | train_f.long_name, 56 | "ko" if expected else "o", 57 | linestyle="-" if expected else "--", 58 | linewidth=3 if expected else 2, 59 | label=f"{train.iloc[0].category} {train.iloc[0].number}" 60 | if not expected 61 | else "expected", 62 | zorder=10 if expected else 5, 63 | ) 64 | 65 | 66 | def timetable_graph(trains: pd.DataFrame, st: pd.DataFrame, collapse: bool = True): 67 | """Generate a timetable graph of trains in a line. 68 | 69 | Args: 70 | trains (pd.DataFrame): the train stop data to consider 71 | st (pd.DataFrame): the station data 72 | collapse (bool, optional): determines whatever to _collapse_ the times in the graph, relative to the first. Defaults to True. 
73 | """ 74 | tmpl = timple.Timple() 75 | tmpl.enable() 76 | 77 | trains_j = ( 78 | trains.sort_values(by="stop_number") 79 | .join(st, on="stop_station_code") 80 | .reset_index(drop=True) 81 | ) 82 | trains_m = ( 83 | pd.melt( 84 | trains_j, 85 | id_vars=[ 86 | "long_name", 87 | "stop_number", 88 | "train_hash", 89 | "category", 90 | "number", 91 | "origin", 92 | ], 93 | value_vars=[ 94 | "departure_expected", 95 | "departure_actual", 96 | "arrival_expected", 97 | "arrival_actual", 98 | ], 99 | ) 100 | .sort_values(["stop_number", "variable"]) 101 | .dropna() 102 | ) 103 | 104 | # expected 105 | if collapse: 106 | for origin in trains_m.origin.unique(): 107 | train = list(trains_m.loc[trains_m.origin == origin].groupby("train_hash"))[0][1] # fmt: skip 108 | timetable_train(train, True) 109 | 110 | # actual 111 | for _, train in trains_m.groupby("train_hash"): 112 | timetable_train(train, False, collapse) 113 | 114 | # get station names for proper title 115 | st_names: pd.DataFrame = st.drop( 116 | ["region", "latitude", "longitude", "short_name"], 117 | axis=1, 118 | ) 119 | line: pd.DataFrame = ( 120 | trains.join(st_names, on="origin") 121 | .rename({"long_name": "station_a"}, axis=1) 122 | .join(st_names, on="destination") 123 | .rename({"long_name": "station_b"}, axis=1) 124 | )[["station_a", "station_b", "stop_number"]].agg( 125 | { 126 | "station_a": lambda s: s.iloc[0], 127 | "station_b": lambda s: s.iloc[0], 128 | "stop_number": lambda n: max(n) + 1, 129 | } 130 | ) 131 | 132 | plt.title(f"{line.station_a} ↔ {line.station_b} [{line.stop_number} stops]") 133 | start_day, end_day = trains.day.min().date(), trains.day.max().date() 134 | plt.title(f"{start_day} => {end_day}", loc="left") 135 | 136 | plt.ylabel("Station") 137 | plt.xlabel("Time") 138 | 139 | ax = plt.gca() 140 | ax.invert_yaxis() 141 | ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M", TIMEZONE if not collapse else TIMEZONE_GMT)) # type: ignore 142 | 143 | plt.show() 144 | 
-------------------------------------------------------------------------------- /src/analysis/load_data.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | from datetime import datetime 19 | from pathlib import Path 20 | 21 | import numpy as np 22 | import pandas as pd 23 | 24 | from src.const import RailwayCompany 25 | 26 | 27 | def read_train_csv(file: Path) -> pd.DataFrame: 28 | """Load train CSV to a pandas dataframe 29 | 30 | Args: 31 | file (Path): the train CSV file path 32 | 33 | Returns: 34 | pd.DataFrame: the loaded dataframe 35 | """ 36 | 37 | df: pd.DataFrame = pd.read_csv(file) 38 | 39 | @np.vectorize 40 | def _parse_dt(_string: str | None) -> datetime | None: 41 | try: 42 | if not isinstance(_string, str): 43 | return None 44 | return datetime.fromisoformat(_string) 45 | except ValueError: 46 | return None 47 | 48 | # Parse datetimes 49 | for dt_field in [ 50 | "arrival_expected", 51 | "arrival_actual", 52 | "departure_expected", 53 | "departure_actual", 54 | ]: 55 | df[dt_field] = ( 56 | df[dt_field] 57 | .apply(_parse_dt) 58 | .astype("object") 59 | .where(df[dt_field].notnull(), None) 60 | ) 61 | 62 | df.day = pd.to_datetime(df.day) 63 | 64 | # Map 
client codes 65 | df.client_code = df.client_code.apply(RailwayCompany.from_code) # type: ignore 66 | 67 | # Exclude phantom data 68 | df = df.loc[(df.phantom == False) & (df.trenord_phantom == False)].drop( 69 | ["phantom", "trenord_phantom"], axis=1 70 | ) 71 | 72 | # Fix incorrect origin and destination 73 | df["origin"] = (df.groupby("train_hash").transform("first"))["stop_station_code"] 74 | df["destination"] = df.groupby("train_hash").transform("last")["stop_station_code"] 75 | 76 | return df 77 | 78 | 79 | def read_station_csv(file: Path) -> pd.DataFrame: 80 | """Load station CSV to a pandas dataframe 81 | 82 | Args: 83 | file (Path): the station CSV file path 84 | 85 | Returns: 86 | pd.DataFrame: the loaded dataframe 87 | """ 88 | 89 | st: pd.DataFrame = pd.read_csv(file, index_col="code") 90 | 91 | # Some stations (like 'Brescia') have MULTIPLE codes, 92 | # but only one associated row has useful (non-NaN) information. 93 | for idx, station in st.iterrows(): 94 | # Search other stations with the same name 95 | other: pd.DataFrame = st.loc[st.long_name == station.long_name] 96 | if len(other) == 1: 97 | continue 98 | 99 | # If 'this' station has useful information, don't perform any actions 100 | if not np.isnan(station.latitude) and not np.isnan(station.longitude): 101 | continue 102 | 103 | # If present, select the 'oracle' station with information 104 | other = other.loc[~np.isnan(other.latitude)] 105 | if len(other) == 0: 106 | continue 107 | oracle = other.iloc[0] 108 | 109 | # Fill missing information using the oracle data 110 | st.loc[st.index == idx, ["short_name", "latitude", "longitude"]] = ( # type: ignore 111 | oracle.short_name, 112 | oracle.latitude, 113 | oracle.longitude, 114 | ) 115 | 116 | return st 117 | 118 | 119 | def tag_lines(df: pd.DataFrame, stations: pd.DataFrame) -> pd.DataFrame: 120 | """Add 'railway line' information to the 'trains' dataframe. 
121 | 122 | Args: 123 | trains (pd.DataFrame): the considered dataframe 124 | stations (pd.DataFrame): the station data 125 | 126 | Returns: 127 | pd.DataFrame: the tagged dataframe 128 | 129 | Notes: 130 | Two trains (t_1, t_2) are considered of the same 'railway line' iff: 131 | - t_1.railway_company == t_2.railway_company; 132 | - t_1.origin == t_2.origin and t_1.destination == t_2.destination or viceversa; 133 | - t_1.stop_set == t_2.stop_set (*). 134 | 135 | (*): can be simplified in t_1.stop_count == t_2.stop_count. 136 | 137 | The above definition is just a convenient approximation. 138 | More precise considerations can only be made on a case-by-case basis. 139 | """ 140 | 141 | df = df.sort_values(["train_hash", "stop_number"]) 142 | df["stop_set"] = df.groupby("train_hash").stop_station_code.transform( 143 | lambda stops: hash(frozenset(stops.unique())) 144 | ) 145 | df["track"] = df.apply( 146 | lambda r: (r.origin + "_" + r.destination) 147 | if r.origin > r.destination 148 | else (r.destination + "_" + r.origin), 149 | axis=1, 150 | ) 151 | df["line"] = df.apply( 152 | lambda r: f"{r.client_code}_{r.track}_{r.stop_set}", 153 | axis=1, 154 | ) 155 | return df 156 | -------------------------------------------------------------------------------- /src/scraper/main.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .


import itertools
import logging
import os
import pathlib
import pickle
import subprocess
import sys
import typing as t
from datetime import date, datetime, timedelta

import sentry_sdk
from tqdm import tqdm

from src.const import TIMEZONE
from src.scraper.station import Station
from src.scraper.train import Train

# Root directory for pickled scrape output (one subdirectory per day).
DATA_DIR = pathlib.Path("data/")


def get_git_revision_short_hash() -> str:
    """Return the current git short commit hash.

    Falls back to the contents of version.txt (for environments without
    a git checkout, e.g. Docker images), then to "unknown".
    """
    try:
        return (
            subprocess.check_output(["git", "rev-parse", "--short", "HEAD"])
            .decode("ascii")
            .strip()
        )
    except subprocess.CalledProcessError:
        try:
            with open("version.txt", "r") as f:
                return f.read().strip()
        except FileNotFoundError:
            return "unknown"


def load_dataset(file_path: pathlib.Path) -> dict[t.Any, t.Any]:
    """Load a pickled dict from file_path, or an empty dict if the file is missing."""
    try:
        with open(file_path, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        return dict()


def save_dataset(file_path: pathlib.Path, dataset: dict[t.Any, t.Any]) -> None:
    """Pickle dataset to file_path, overwriting any previous content."""
    with open(file_path, "wb") as f:
        pickle.dump(dataset, f)


def main() -> None:
    """Scraper entry point: fetch all stations and train runs for 'today'.

    Exits with status 1 unless PYTHONHASHSEED=0: datasets are keyed by
    hash(train) and persisted across runs, so hash randomization would
    make previously saved keys unmatchable.
    """
    hashseed = os.getenv("PYTHONHASHSEED")
    if not hashseed or hashseed != "0":
        logging.critical(
            "Hash seed randomization is not disabled. "
            "Please disable it by setting PYTHONHASHSEED=0 environment variable"
        )
        sys.exit(1)

    # Optional Sentry error reporting, enabled by the SENTRY_DSN env var.
    sentry_dsn = os.getenv("SENTRY_DSN")
    if sentry_dsn is not None:
        sentry_sdk.init(
            dsn=sentry_dsn,
            release=get_git_revision_short_hash(),
            traces_sample_rate=1.0,
        )
        logging.info("Activated sentry error reporting")

    # Today + ~3 hours: runs started shortly after midnight are attributed
    # to the previous calendar day.
    today: date = (datetime.now(tz=TIMEZONE) - timedelta(hours=3)).date()
    today_path: pathlib.Path = DATA_DIR / today.strftime("%Y-%m-%d")
    try:
        os.mkdir(today_path.absolute())
    except FileExistsError:
        pass

    # Resume from previous runs of the same day, if any.
    station_cache: dict[str, Station] = load_dataset(DATA_DIR / "stations.pickle")
    fetched_trains: dict[int, Train] = load_dataset(today_path / "trains.pickle")
    unfetched_trains: dict[int, Train] = load_dataset(today_path / "unfetched.pickle")

    fetched_old_n = len(fetched_trains)
    unfetched_old_n = len(unfetched_trains)
    logging.info(
        f"Loaded {fetched_old_n} already fetched and {unfetched_old_n} unfetched trains"
    )

    # Initialize Station cache
    if len(station_cache) != 0:
        Station._cache = station_cache
        logging.info(f"Initialized station cache with {len(station_cache)} elements")

    # Fetch stations (region codes 1..22)
    stations: set[Station] = set(
        itertools.chain.from_iterable([Station.by_region(r) for r in range(1, 23)])
    )
    logging.info(f"Retrieved {len(stations)} stations")

    # Try to fetch unfetched trains: retry trains that had not yet arrived
    # (or failed) in a previous run.
    logging.info(
        f"Starting fetching {len(unfetched_trains)} previously unfetched trains"
    )
    _fetched_trains_delete_later: list[int] = list()
    for unfetched_train_hash in tqdm(unfetched_trains):
        train = unfetched_trains[unfetched_train_hash]
        try:
            train.fetch()
        except Exception as e:
            # Best-effort: log and keep the train in the unfetched set.
            logging.exception(e, exc_info=True)
            continue

        # A train is final once it arrived (or was flagged as phantom data).
        if train._phantom or train.arrived():
            fetched_trains[unfetched_train_hash] = train
            logging.debug(f"Saved previously unfetched {train.category} {train.number}")

            # It is not possible to delete dict keys in-place
            _fetched_trains_delete_later.append(unfetched_train_hash)

    for to_delete in _fetched_trains_delete_later:
        del unfetched_trains[to_delete]

    logging.info("Starting fetching departures from all stations")
    for station in tqdm(stations):
        logging.debug(f"Processing {station}")

        departing: list[Train] = station.departures()
        for train in departing:
            # Skip trains already seen in either dataset.
            if hash(train) in fetched_trains or hash(train) in unfetched_trains:
                continue

            try:
                train.fetch()
            except Exception as e:
                logging.exception(e, exc_info=True)
                continue

            if train._phantom or train.arrived():
                fetched_trains[hash(train)] = train
                logging.debug(f"Saved {train.category} {train.number}")
            else:
                # Not arrived yet: retry in a later run.
                unfetched_trains[hash(train)] = train

    logging.info(f"Retrieved {len(fetched_trains) - fetched_old_n} new trains")
    logging.info(
        f"Unfetched trains: {len(unfetched_trains)} "
        f"({(len(unfetched_trains) - unfetched_old_n):+d})"
    )

    # Persist all three datasets for the next run.
    save_dataset(DATA_DIR / "stations.pickle", Station._cache)
    save_dataset(today_path / "trains.pickle", fetched_trains)
    save_dataset(today_path / "unfetched.pickle", unfetched_trains)

    logging.info(f"Trains saved today: {len(fetched_trains)}")
    logging.info(f"Station cache size: {len(Station._cache)}")
--------------------------------------------------------------------------------
/src/scraper/api.py:
--------------------------------------------------------------------------------
# railway-opendata: scrape and analyze italian railway data
# Copyright (C) 2023 Marco Aceti
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU
General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | import json 19 | import typing as t 20 | from datetime import datetime 21 | 22 | import requests 23 | from requests.adapters import HTTPAdapter, Retry 24 | 25 | import src.scraper.train as tr 26 | from src import types 27 | from src.const import TIMEZONE, TIMEZONE_GMT 28 | from src.scraper.exceptions import BadRequestException 29 | 30 | 31 | class ViaggiaTrenoAPI: 32 | BASE_URL: str = "http://www.viaggiatreno.it/infomobilita/resteasy/viaggiatreno/" 33 | 34 | # Initialize requests session with auto-retry and exponential backoff 35 | _session: requests.Session = requests.Session() 36 | _session.mount( 37 | "http://", 38 | HTTPAdapter( 39 | max_retries=Retry( 40 | total=10, 41 | read=5, 42 | status=10, 43 | status_forcelist=[403, 500, 502, 503, 504], 44 | backoff_factor=0.2, 45 | ) 46 | ), 47 | ) 48 | 49 | @classmethod 50 | def _raw_request(cls, method: str, *parameters: t.Any) -> str: 51 | """Perform a HTTP request to ViaggiaTreno API and return a raw string, 52 | if the request has been successful. 
53 | 54 | Args: 55 | method (str): the method to be called 56 | parameters (tuple[str]): a list of parameters 57 | 58 | Raises: 59 | BadRequestException: if the response is not ok 60 | 61 | Returns: 62 | str: the raw response from the API 63 | """ 64 | response: requests.Response = cls._session.get( 65 | f"{ViaggiaTrenoAPI.BASE_URL}{method}/" 66 | f"{'/'.join(map(lambda p: str(p), parameters))}" 67 | ) 68 | 69 | if response.status_code != 200 or "Error" in response.text: 70 | raise BadRequestException( 71 | url=response.url, 72 | status_code=response.status_code, 73 | response=response.text, 74 | ) 75 | 76 | return response.text 77 | 78 | @staticmethod 79 | def _decode_json(string: str) -> types.JSONType: 80 | """Decode a JSON string. 81 | 82 | Args: 83 | string (str): the string to decode 84 | 85 | Returns: 86 | types.JSONType: the decoded JSON value 87 | """ 88 | return json.loads(string) 89 | 90 | @staticmethod 91 | def _to_datetime(time: int | None) -> datetime | None: 92 | """Convert a UNIX timestamp with milliseconds to datetime. 93 | If None is passed, None is returned. 94 | 95 | Args: 96 | time (int | None): the UNIX timestamp to convert 97 | 98 | Returns: 99 | datetime | None: the resulting datetime object 100 | """ 101 | if not time: 102 | return None 103 | 104 | return datetime.fromtimestamp(time / 1000, tz=TIMEZONE) 105 | 106 | @staticmethod 107 | def _station_departures_or_arrivals( 108 | kind: str, station_code: str 109 | ) -> t.List["tr.Train"]: 110 | """Helper function to Station.departures and Station.arrivals methods. 
111 | 112 | Args: 113 | kind (str): either 'partenze' (departures) or 'arrivi' (arrivals) 114 | station_code (str): the code of the considered station 115 | 116 | Returns: 117 | t.List[Train]: a list of trains departing o arriving to the station 118 | """ 119 | assert kind in ["partenze", "arrivi"] 120 | 121 | now: str = datetime.now(tz=TIMEZONE_GMT).strftime("%a %b %d %Y %H:%M:%S %Z%z") 122 | raw_trains: str = ViaggiaTrenoAPI._raw_request(kind, station_code, now) 123 | trains: types.JSONType = ViaggiaTrenoAPI._decode_json(raw_trains) 124 | return list( 125 | map( 126 | lambda t: tr.Train._from_station_departures_arrivals(t), 127 | trains, 128 | ) 129 | ) 130 | 131 | 132 | class TrenordAPI: 133 | BASE_URL: str = "https://admin.trenord.it/store-management-api/mia/" 134 | 135 | TRENORD_CLIENT_CODE: int = 63 136 | 137 | # Initialize requests session with auto-retry and exponential backoff 138 | _session: requests.Session = requests.Session() 139 | _session.mount( 140 | "https://", 141 | HTTPAdapter( 142 | max_retries=Retry( 143 | total=10, 144 | read=5, 145 | status=5, 146 | status_forcelist=[403, 500, 502, 503, 504], 147 | backoff_factor=0.2, 148 | ) 149 | ), 150 | ) 151 | 152 | @classmethod 153 | def _raw_request(cls, method: str, *parameters: t.Any) -> str: 154 | """Perform a HTTP request to Trenord API and return a raw string, 155 | if the request has been successful. 
156 | 157 | Args: 158 | method (str): the method to be called 159 | parameters (tuple[str]): a list of parameters 160 | 161 | Raises: 162 | BadRequestException: if the response is not ok 163 | 164 | Returns: 165 | str: the raw response from the API 166 | """ 167 | 168 | response: requests.Response = cls._session.get( 169 | f"{TrenordAPI.BASE_URL}{method}/" 170 | f"{'/'.join(map(lambda p: str(p), parameters))}" 171 | ) 172 | 173 | if response.status_code != 200 or "Error" in response.text: 174 | raise BadRequestException( 175 | url=response.url, 176 | status_code=response.status_code, 177 | response=response.text, 178 | ) 179 | 180 | return response.text 181 | -------------------------------------------------------------------------------- /src/analysis/stat.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 


import argparse
import webbrowser
from tempfile import NamedTemporaryFile

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from itables import to_html_datatable
from pandas.core.groupby.generic import DataFrameGroupBy

from src.const import RAILWAY_COMPANIES_PALETTE, WEEKDAYS


def describe(df: pd.DataFrame | DataFrameGroupBy) -> None:
    """Call pandas.DataFrame.describe() on the main numeric columns and print it."""
    print(
        df[["stop_number", "arrival_delay", "departure_delay", "crowding"]].describe()
    )


def prepare_mpl(df: pd.DataFrame, args: argparse.Namespace) -> None:
    """Prepare matplotlib params (figure size, theme, titles).

    Only applies to the figure-based stats; other stats are a no-op.
    """
    if args.stat not in [
        "delay_boxplot",
        "day_train_count",
    ]:
        return

    mpl.rcParams["figure.figsize"] = (12, 12 * 5 / 7)
    sns.set_theme(style="whitegrid", palette="pastel")

    # Center title: the stat name; left title: the covered date range.
    plt.title(args.stat)

    start_day, end_day = df.day.min().date(), df.day.max().date()
    plt.title(f"{start_day} => {end_day}", loc="left")

    # Right title: grouping/aggregation summary, when grouping is requested.
    if args.group_by != "none":
        grouped_str = f" grouped by {args.group_by}"
        if args.agg_func == "none":
            grouped_str += ", unaggregated"
        else:
            grouped_str += f", aggr. with '{args.agg_func}' func"
        plt.title(grouped_str, loc="right")


def delay_boxplot(df: pd.DataFrame | DataFrameGroupBy) -> None:
    """Show a seaborn boxplot of departure and arrival delays.

    Accepts either a plain dataframe (one box per delay variable) or a
    grouped one (boxes split per group, hued by delay variable).
    """

    if isinstance(df, DataFrameGroupBy):
        # Name of the column the data was grouped by (e.g. 'weekday').
        grouped_by: str = df.any().index.name
        group_melt = pd.DataFrame()

        grouped: list = list(df)  # type: ignore

        # Re-order fields
        if grouped_by == "weekday":
            # Calendar order rather than alphabetical.
            grouped.sort(key=lambda t: WEEKDAYS[t[0]])
        elif grouped_by == "client_code":
            # Largest companies first.
            grouped.sort(key=lambda g: len(g[1]), reverse=True)

        # Melt each group so arrival/departure delay become a single
        # 'variable'/'value' pair, then stack all groups together.
        for group in grouped:  # type: ignore
            melt = pd.melt(
                group[1],
                id_vars=[
                    col
                    for col in df.obj.columns
                    if col
                    not in [
                        "arrival_delay",
                        "departure_delay",
                    ]
                ],
                value_name="value",
            )
            group_melt = pd.concat([group_melt, melt])

        ax = sns.boxplot(
            group_melt[[grouped_by, "variable", "value"]],
            x=grouped_by,
            y="value",
            hue="variable",
            showfliers=False,
        )
        ax.set(xlabel=grouped_by, ylabel="Delay (minutes)")

    elif isinstance(df, pd.DataFrame):
        ax = sns.boxplot(
            df[["arrival_delay", "departure_delay"]],
            showfliers=False,
        )
        ax.set(xlabel="Variable", ylabel="Delay (minutes)")


def day_train_count(df: pd.DataFrame | DataFrameGroupBy) -> None:
    """Show a seaborn barplot of unique train count, grouped by day."""

    if isinstance(df, DataFrameGroupBy):
        grouped_by: str = df.any().index.name

        palette: None | dict[str, str] = None
        hue_order: None | list[str] = None

        if grouped_by == "client_code":
            # Fixed per-company colors, companies ordered by train count.
            palette = RAILWAY_COMPANIES_PALETTE
            hue_order = (
                df.train_hash.nunique().sort_values(ascending=False).index.to_list()
            )

        # Re-group the underlying frame by (day, group) and count uniques.
        grouped = df.obj.groupby(["day", grouped_by]).nunique().reset_index()
        grouped["day"] = grouped["day"].apply(lambda d: d.date().isoformat())

        ax = sns.barplot(
            data=grouped,
            x="day",
            y="train_hash",
            hue=grouped_by,
            palette=palette,
            hue_order=hue_order,
        )

    elif isinstance(df, pd.DataFrame):
        grouped = df.groupby("day").nunique().reset_index()
        grouped["day"] = grouped["day"].apply(lambda d: d.date().isoformat())

        ax = sns.barplot(
            data=grouped,
            x="day",
            y="train_hash",
        )

    ax.set(xlabel="Day", ylabel="Train count")
    plt.xticks(rotation=45)


def detect_lines(df: pd.DataFrame, st: pd.DataFrame) -> None:
    """Show a interactive table with the detected (by tag_lines) railway lines.

    Renders the table to a temporary HTML file and opens it in a browser.
    """

    st_names: pd.DataFrame = st.drop(
        ["region", "latitude", "longitude", "short_name"],
        axis=1,
    )
    # Resolve origin/destination codes to station names, then aggregate
    # per detected line: endpoints, unique train count, stop count
    # (+1: stop_number appears 0-based).
    lines: pd.DataFrame = (
        (
            df.join(st_names, on="origin")
            .rename({"long_name": "station_a"}, axis=1)
            .join(st_names, on="destination")
            .rename({"long_name": "station_b"}, axis=1)
        )[["line", "station_a", "station_b", "train_hash", "stop_number"]]
        .groupby("line")
        .agg(
            {
                "station_a": "first",
                "station_b": "first",
                "train_hash": "nunique",
                "stop_number": lambda g: max(g) + 1,
            }
        )
        .rename({"train_hash": "train_count"}, axis=1)
        .sort_values(by="train_count", ascending=False)
        .reset_index()
    )
    html: str = to_html_datatable(
        lines,
        caption="Detected railway lines",
        lengthMenu=[20, 50, 100],
        order=[3, "desc"],
        maxBytes=2**17,
    )

    # delete=False: the browser opens the file asynchronously, so it must
    # survive this function returning.
    outfile = NamedTemporaryFile(delete=False, suffix=".html")
    outfile.write(html.encode("utf-8"))
    webbrowser.open(outfile.name)
--------------------------------------------------------------------------------
/src/analysis/assets/templates/stats_chart.html:
--------------------------------------------------------------------------------
18 | 19 | {% macro html(this, kwargs) %} 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 46 | 47 | 48 | 49 |
50 |
51 |
52 | 53 |
54 |
55 |
56 | 57 | 159 | 160 | 161 | 162 | {% endmacro %} 163 | -------------------------------------------------------------------------------- /src/analysis/main.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | import argparse 19 | import logging 20 | import pathlib 21 | import warnings 22 | from datetime import datetime 23 | 24 | import matplotlib.pyplot as plt 25 | import pandas as pd 26 | from dateparser import parse 27 | from joblib import Parallel, delayed 28 | from pandas.core.groupby.generic import DataFrameGroupBy 29 | 30 | from src.analysis import groupby, stat, timetable, trajectories_map 31 | from src.analysis.filter import * 32 | from src.analysis.load_data import read_station_csv, read_train_csv, tag_lines 33 | 34 | 35 | def register_args(parser: argparse.ArgumentParser): 36 | parser.add_argument( 37 | "--start-date", 38 | help="the start date in a 'dateparser'-friendly format", 39 | ) 40 | parser.add_argument( 41 | "--end-date", 42 | help="the end date in a 'dateparser'-friendly format", 43 | ) 44 | parser.add_argument( 45 | "--railway-companies", 46 | help="comma-separated list of railway companies to include. 
If not set, all companies will be included.", 47 | dest="client_codes", 48 | ) 49 | parser.add_argument( 50 | "--railway-lines", 51 | help=( 52 | "comma-separated list of railway lines to include. " 53 | "If not set, all lines will be include. " 54 | "Use --stat detect_lines to see available lines." 55 | ), 56 | dest="railway_lines", 57 | ) 58 | parser.add_argument( 59 | "--group-by", 60 | help="group by stops by a value", 61 | choices=( 62 | "none", 63 | "train_hash", 64 | "client_code", 65 | "weekday", 66 | ), 67 | default="none", 68 | ) 69 | parser.add_argument( 70 | "--agg-func", 71 | help="group by aggregation function", 72 | choices=( 73 | "none", 74 | "mean", 75 | "last", 76 | ), 77 | default="none", 78 | ) 79 | parser.add_argument( 80 | "--stat", 81 | help="the stat to calculate", 82 | choices=( 83 | "describe", 84 | "delay_boxplot", 85 | "day_train_count", 86 | "trajectories_map", 87 | "detect_lines", 88 | "timetable", 89 | ), 90 | default="describe", 91 | ) 92 | parser.add_argument( 93 | "--save-fig", 94 | metavar="FILENAME", 95 | help="save the output figure to a file if using delay_boxplot or day_train_count stats. If not specified, use pyplot.show()", 96 | default=None, 97 | ) 98 | parser.add_argument( 99 | "--timetable-collapse", 100 | help="collapse the train stop times in the graph, relative to the first (only for 'timetable' stat). 
Defaults to False", 101 | action=argparse.BooleanOptionalAction, 102 | default=False, 103 | ) 104 | parser.add_argument( 105 | "station_csv", 106 | help="exported station CSV", 107 | ) 108 | parser.add_argument( 109 | "trains_csv", 110 | nargs="+", 111 | help="exported train CSV", 112 | ) 113 | 114 | 115 | @delayed 116 | def _load_train_dataset(train_csv: str) -> pd.DataFrame: 117 | path = pathlib.Path(train_csv) 118 | train_df: pd.DataFrame = read_train_csv(pathlib.Path(train_csv)) 119 | logging.debug(f"Loaded {len(train_df)} data points @ {path}") 120 | return train_df 121 | 122 | 123 | def main(args: argparse.Namespace): 124 | with warnings.catch_warnings(): 125 | warnings.simplefilter("ignore") 126 | 127 | start_date: datetime | None = parse(args.start_date if args.start_date else "") 128 | if args.start_date and not start_date: 129 | raise argparse.ArgumentTypeError("invalid start_date") 130 | 131 | end_date: datetime | None = parse(args.end_date if args.end_date else "") 132 | if args.end_date and not end_date: 133 | raise argparse.ArgumentTypeError("invalid end_date") 134 | 135 | railway_companies: str | None = args.client_codes 136 | railway_lines: str | None = args.railway_lines 137 | 138 | # Load dataset 139 | df: pd.DataFrame | DataFrameGroupBy = pd.DataFrame() 140 | logging.info("Loading datasets...") 141 | 142 | for train_df in Parallel(n_jobs=-1, verbose=5)( 143 | _load_train_dataset(train_csv) for train_csv in args.trains_csv # type: ignore 144 | ): 145 | df = pd.concat([df, train_df], axis=0) 146 | 147 | df.reset_index(drop=True, inplace=True) 148 | 149 | stations: pd.DataFrame = read_station_csv(args.station_csv) 150 | original_length: int = len(df) 151 | 152 | # Tag lines 153 | df = tag_lines(df, stations) 154 | 155 | # Apply filters 156 | df = date_filter(df, start_date, end_date) 157 | df = railway_company_filter(df, railway_companies) 158 | df = railway_lines_filter(df, railway_lines) 159 | logging.info(f"Loaded {len(df)} data points 
({original_length} before filtering)") 160 | 161 | # Prepare graphics 162 | stat.prepare_mpl(df, args) 163 | 164 | if args.group_by != "none": 165 | df_grouped: DataFrameGroupBy | None = None 166 | 167 | if args.group_by == "train_hash": 168 | df_grouped = groupby.train_hash(df) 169 | elif args.group_by == "client_code": 170 | df_grouped = groupby.client_code(df) 171 | elif args.group_by == "weekday": 172 | df_grouped = groupby.weekday(df) 173 | 174 | assert df_grouped is not None 175 | 176 | if args.agg_func == "last": 177 | df = df_grouped.last() 178 | elif args.agg_func == "mean": 179 | df = df_grouped.mean(numeric_only=True) 180 | elif args.agg_func == "none": 181 | df = df_grouped 182 | 183 | if args.stat in [ 184 | "trajectories_map", 185 | "detect_lines", 186 | "timetable", 187 | ] and not isinstance(df, pd.DataFrame): 188 | raise ValueError(f"can't use {args.stat} with unaggregated data") 189 | 190 | if args.stat == "describe": 191 | stat.describe(df) 192 | elif args.stat == "delay_boxplot": 193 | stat.delay_boxplot(df) 194 | elif args.stat == "day_train_count": 195 | stat.day_train_count(df) 196 | elif args.stat == "trajectories_map": 197 | trajectories_map.build_map(stations, df) 198 | elif args.stat == "detect_lines": 199 | stat.detect_lines(df, stations) 200 | elif args.stat == "timetable": 201 | if not timetable.same_line(df): 202 | raise ValueError( 203 | f"can't use timetable if --railway-lines filter is not used" 204 | ) 205 | timetable.timetable_graph(df, stations, args.timetable_collapse) 206 | 207 | # Visualizations only 208 | if args.stat in ["delay_boxplot", "day_train_count", "timetable"]: 209 | plt.tight_layout() 210 | if args.save_fig: 211 | plt.savefig(args.save_fig) 212 | else: 213 | plt.show() 214 | -------------------------------------------------------------------------------- /src/train_extractor.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian 
# railway-opendata: scrape and analyze italian railway data
# Copyright (C) 2023 Marco Aceti
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.


import argparse
import csv
import hashlib
import pickle
from datetime import date, datetime, timedelta
from pathlib import Path

from src.const import TIMEZONE
from src.scraper.train import Train
from src.scraper.train_stop import TrainStopTime
from src.utils import parse_input_format_output_args


def load_file(file: Path) -> dict[int, Train]:
    """Load a train data pickle file, sanitize it, and return it.

    Args:
        file (Path): the file to load

    Returns:
        dict[int, Train]: the train data contained in the file

    Notes:
        Before commit 48966dfab25553650e3d743a4ecc77db02c4b30,
        departure and arrival timestamp dates of Trenord trains
        were all 1900-01-01.
        This function fixes such incorrect dates.
    """
    with open(file, "rb") as f:
        # NOTE(review): pickle.load is unsafe on untrusted input; only load
        # files produced by this project's scraper.
        data: dict[int, Train] = pickle.load(f)

    def _fix_datetime(train: Train, dt: datetime | None) -> datetime | None:
        """Rebase clearly-wrong (< year 2000) timestamps on the departing date."""
        if isinstance(dt, datetime) and dt.year < 2000:
            dep_date: date = train.departing_date
            dt = dt.replace(
                year=dep_date.year,
                month=dep_date.month,
                day=dep_date.day,
                tzinfo=TIMEZONE,
            )
        return dt

    def _detect_crazy_time_difference(train: Train, time: TrainStopTime) -> None:
        """Mark a train as phantom if the difference between expected and
        actual times in a stop is greater than one day.

        Example:
            REG Train 17907 operated by TPER. S05311 stop on 2023-03-30.
            arrival_expected 2025-08-30 17:33:00+02:00
            arrival_actual 2023-03-30 17:34:30+02:00
            arrival_delay -1438.5
        """
        if not time.actual or not time.expected:
            return

        # BUGFIX: the previous check `abs((actual - expected).days) > 1` was
        # asymmetric because timedelta.days floors towards -inf: a +25h
        # difference has .days == 1 and was NOT flagged, while -25h has
        # .days == -2 and was. Comparing the timedelta magnitude directly
        # flags any difference greater than one day, as documented.
        if abs(time.actual - time.expected) > timedelta(days=1):
            train._phantom = True

    for train_h in data:
        train: Train = data[train_h]

        for stop in train.stops if isinstance(train.stops, list) else []:
            # NOTE(review): phantom detection runs BEFORE the 1900-01-01 date
            # fix, so Trenord trains with unfixed dates may also be flagged;
            # order preserved from the original logic — confirm if intended.
            if isinstance(stop.arrival, TrainStopTime):
                _detect_crazy_time_difference(train, stop.arrival)
                stop.arrival.actual = _fix_datetime(train, stop.arrival.actual)
                stop.arrival.expected = _fix_datetime(train, stop.arrival.expected)  # type: ignore
            if isinstance(stop.departure, TrainStopTime):
                _detect_crazy_time_difference(train, stop.departure)
                stop.departure.actual = _fix_datetime(train, stop.departure.actual)
                stop.departure.expected = _fix_datetime(train, stop.departure.expected)  # type: ignore

        if train.client_code == 63:  # 63 == Trenord client code
            train._fix_intraday_datetimes()

    return data


def to_csv(data: dict[int, Train], output_file: Path) -> None:
    """Convert train data to CSV, one row per stop.

    Args:
        data (dict[int, Train]): the data to convert
        output_file (Path): the file to write
    """
    FIELDS: tuple = (
        "train_hash",
        "number",
        "day",
        "origin",
        "destination",
        "category",
        "client_code",
        "phantom",
        "trenord_phantom",
        "cancelled",
        "stop_number",
        "stop_station_code",
        "stop_type",
        "platform",
        "arrival_expected",
        "arrival_actual",
        "arrival_delay",
        "departure_expected",
        "departure_actual",
        "departure_delay",
        "crowding",
    )

    # Context manager ensures the file is closed even if a row write raises
    # (the original left the handle open on error).
    with open(output_file, "w+", newline="") as csvfile:
        writer = csv.writer(
            csvfile,
            delimiter=",",
            quotechar="|",
            quoting=csv.QUOTE_MINIMAL,
        )
        writer.writerow(FIELDS)

        for train_h in data:
            train: Train = data[train_h]

            stops = train.stops if isinstance(train.stops, list) else []
            for i, stop in enumerate(stops):
                writer.writerow(
                    (
                        # Stable, printable identifier derived from the train key.
                        hashlib.md5(str(train_h).encode("ascii")).hexdigest(),
                        train.number,
                        train.departing_date.isoformat(),
                        train.origin.code,
                        train.destination.code if train.destination else None,
                        train.category,
                        train.client_code,
                        train._phantom,
                        # Older pickles may predate these optional attributes.
                        getattr(train, "_trenord_phantom", False),
                        train.cancelled,
                        i,
                        stop.station.code,
                        stop.stop_type.value,
                        stop.platform_actual or stop.platform_expected,
                        stop.arrival.expected.isoformat()
                        if stop.arrival and stop.arrival.expected
                        else None,
                        stop.arrival.actual.isoformat()
                        if stop.arrival and stop.arrival.actual
                        else None,
                        stop.arrival.delay() if stop.arrival else None,
                        stop.departure.expected.isoformat()
                        if stop.departure and stop.departure.expected
                        else None,
                        stop.departure.actual.isoformat()
                        if stop.departure and stop.departure.actual
                        else None,
                        stop.departure.delay() if stop.departure else None,
                        getattr(train, "crowding", None),
                    )
                )


def register_args(parser: argparse.ArgumentParser) -> None:
    """Register the train-extractor CLI arguments on the given parser."""
    parser.add_argument(
        "pickle_file",
        help=".pickle file to parse",
        metavar="PICKLE_FILE",
    )
    parser.add_argument(
        "-f",
        default="csv",
        choices=[
            "csv",
        ],
        help="output file format",
        dest="format",
    )
    parser.add_argument(
        "-o",
        help="output file name",
        metavar="OUTPUT_FILE",
        dest="output_file",
    )


def main(args: argparse.Namespace) -> None:
    """Entry point: load a pickle dump and export it in the requested format.

    Args:
        args (argparse.Namespace): parsed CLI arguments (see register_args)
    """
    # Renamed local from `format` to avoid shadowing the builtin.
    input_f, output_f, out_format = parse_input_format_output_args(args)

    data: dict[int, Train] = load_file(input_f)
    if out_format == "csv":
        to_csv(data, output_f)
import logging
import typing as t

import src.scraper.api as api
import src.scraper.train as tr
from src import types
from src.scraper.exceptions import BadRequestException


class Station:
    """A ViaggiaTreno station.

    Attributes:
        code (str): the station code, used in API calls (e.g. S01700)
        region_code (int): the code of the region where the station is located
        name (str | None): the station name (e.g. Milano Centrale)
        short_name (str | None): a shortened version of the name (e.g. Milano C.le)
        position (Tuple[float, float] | None): the latitude and longitude of the station

    Other attributes:
        _phantom (bool): if True, the details of the station can't be fetched
    """

    # Process-wide cache keyed by station code, shared by all constructors.
    _cache: dict[str, "Station"] = dict()

    def __init__(
        self,
        code: str,
        region_code: int,
        name: str | None,
        short_name: str | None = None,
        position: t.Tuple[float, float] | None = None,
    ) -> None:
        """Initialize a new station.

        Args:
            code (str): the station code, used in API calls (e.g. S01700)
            region_code (int): the code of the region where the station is located
            name (str | None): the station name (e.g. Milano Centrale)
            short_name (str | None, optional): a shortened version of the name
                (e.g. Milano C.le)
            position (Tuple[float, float] | None, optional): the latitude and
                longitude of the station
        """
        self.code: str = code
        self.region_code: int = region_code
        self.name: str | None = None
        if name:
            self.name: str | None = name.title().strip()
            # NOTE(review): when short_name is missing, the raw (untitled)
            # `name` is used as fallback — preserved from the original.
            self.short_name: str | None = (
                short_name.title().strip() if short_name else name
            )
        self.position: t.Tuple[float, float] | None = position

        # Fixed `== None` comparison: identity check is the correct idiom.
        self._phantom: bool = self.name is None

    @classmethod
    def _from_raw(cls, raw_data: dict) -> "Station":
        """Initialize a new station from raw API data, or use the class cache.

        Args:
            raw_data (dict): raw data returned by the API

        Returns:
            Station: the (possibly cached) station
        """
        station_code = raw_data["codStazione"]

        if station_code not in cls._cache:
            cls._cache[station_code] = cls(
                code=station_code,
                region_code=raw_data["codReg"],
                name=raw_data["localita"]["nomeLungo"],
                short_name=raw_data["localita"]["nomeBreve"],
                position=(raw_data["lat"], raw_data["lon"]),
            )
        else:
            cached: Station = cls._cache[station_code]

            # codReg can have multiple values depending on the request.
            # If an inequality is detected, settle the correct region_code once for all.
            if raw_data["codReg"] != cached.region_code:
                logging.warning(
                    f"Provided region code for {station_code} is different from the cached one"
                )
                cached.region_code = Station._region_code(station_code)

        return cls._cache[station_code]

    def __repr__(self) -> str:
        return f"{self.name} [{self.code}@{self.region_code}]"

    @classmethod
    def by_code(cls, station_code: str) -> "Station":
        """Retrieve a station by its code, or use cache.

        Args:
            station_code (str): the station code

        Returns:
            Station: a station corresponding to the passed station code
        """
        if station_code not in cls._cache:
            try:
                region_code: int = cls._region_code(station_code)
            except BadRequestException as e:
                if e.status_code != 204:
                    # Bare raise preserves the original traceback.
                    raise

                # 204 No Content: region unknown, 0 is the "unknown" sentinel.
                region_code: int = 0

            try:
                response: str = api.ViaggiaTrenoAPI._raw_request(
                    "dettaglioStazione", station_code, region_code
                )
                raw_data: types.JSONType = api.ViaggiaTrenoAPI._decode_json(response)
                cls._cache[station_code] = cls._from_raw(raw_data)
            except BadRequestException as e:
                if e.status_code != 204:
                    raise

                # Details can't be fetched: cache a phantom placeholder so we
                # don't re-query the API for this code.
                cls._cache[station_code] = cls(
                    code=station_code,
                    region_code=region_code,
                    name=None,
                )

        return cls._cache[station_code]

    @staticmethod
    def _region_code(station_code: str) -> int:
        """Retrieve the region code of a given station (by its code).

        Args:
            station_code (str): the code of the station to check

        Raises:
            BadRequestException: if the response is not ok

        Returns:
            int: the region code of the given station
        """
        region_code = api.ViaggiaTrenoAPI._raw_request("regione", station_code)
        return int(region_code)

    @classmethod
    def by_region(cls, region_code: int) -> t.List["Station"]:
        """Retrieve the list of train stations of a given region.

        Args:
            region_code (int): the code of the region to query

        Returns:
            t.List[Station]: a list of train stations
        """
        raw_stations: str = api.ViaggiaTrenoAPI._raw_request(
            "elencoStazioni", region_code
        )
        stations: types.JSONType = api.ViaggiaTrenoAPI._decode_json(raw_stations)
        # tipoStazione == 4 marks placeholder entries — filter them out.
        return list(
            map(
                lambda s: cls._from_raw(s),
                filter(lambda s: s["tipoStazione"] != 4, stations),
            )
        )

    def departures(self) -> t.List["tr.Train"]:
        """Retrieve the departures of this train station.

        Returns:
            t.List[Train]: a list of trains departing from the station
        """
        return api.ViaggiaTrenoAPI._station_departures_or_arrivals(
            "partenze", self.code
        )

    def arrivals(self) -> t.List["tr.Train"]:
        """Retrieve the arrivals of this train station.

        Returns:
            t.List[Train]: a list of trains arriving at the station
        """
        return api.ViaggiaTrenoAPI._station_departures_or_arrivals("arrivi", self.code)

    def __hash__(self) -> int:
        # NOTE(review): hashing by name means all phantom stations (name is
        # None) collide, and no __eq__ is defined — preserved as-is because
        # the project relies on deterministic hash() (PYTHONHASHSEED=0).
        return hash(self.name)
67 |
68 |
69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 82 | 88 | 94 | 95 | 96 | 97 | 103 | 109 | 115 | 116 | 117 | 118 | 124 | 130 | 131 | 132 | 133 | 134 | 140 | 146 | 147 | 148 | 149 |

Regional

77 |
78 | 79 | Trenitalia 80 |
81 |
83 |
84 | 85 | Trenord 86 |
87 |
89 |
90 | 91 | TPER 92 |
93 |

High speed

98 |
99 | 100 | Frecciarossa 101 |
102 |
104 |
105 | 106 | Frecciargento 107 |
108 |
110 |
111 | 112 | Frecciabianca 113 |
114 |

Long haul

119 |
120 | 121 | Intercity 122 |
123 |
125 |
126 | 127 | IC Notte 128 |
129 |

International

135 |
136 | 137 | Eurocity 138 |
139 |
141 |
142 | 143 | OBB 144 |
145 |
150 |
151 |
152 |
153 | 154 | 155 | 156 | {% endmacro %} 157 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### VSCodium 2 | .vscode/* 3 | !.vscode/settings.json 4 | !.vscode/tasks.json 5 | !.vscode/launch.json 6 | !.vscode/extensions.json 7 | !.vscode/*.code-snippets 8 | 9 | # Local History for Visual Studio Code 10 | .history/ 11 | 12 | # Built Visual Studio Code Extensions 13 | *.vsix 14 | 15 | ### Python 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .nox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | *.py,cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | cover/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | .pybuilder/ 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | # For a library or package, you might want to ignore these files since the code is 102 | # intended to run in multiple environments; otherwise, check them in: 103 | # .python-version 104 | 105 | # pipenv 106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 109 | # install all needed dependencies. 110 | #Pipfile.lock 111 | 112 | # poetry 113 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 114 | # This is especially recommended for binary packages to ensure reproducibility, and is more 115 | # commonly ignored for libraries. 116 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 117 | #poetry.lock 118 | 119 | # pdm 120 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
121 | #pdm.lock 122 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 123 | # in version control. 124 | # https://pdm.fming.dev/#use-with-ide 125 | .pdm.toml 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | # pytype static type analyzer 165 | .pytype/ 166 | 167 | # Cython debug symbols 168 | cython_debug/ 169 | 170 | # PyCharm 171 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 172 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 173 | # and can be added to the global gitignore or merged into this file. For a more nuclear 174 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
175 | #.idea/ 176 | 177 | ### Rust 178 | # Generated by Cargo 179 | # will have compiled files and executables 180 | debug/ 181 | target/ 182 | 183 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 184 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 185 | Cargo.lock 186 | 187 | # These are backup files generated by rustfmt 188 | **/*.rs.bk 189 | 190 | # MSVC Windows builds of rustc generate these, which store debugging information 191 | *.pdb 192 | 193 | ### TeX 194 | ## Core latex/pdflatex auxiliary files: 195 | *.aux 196 | *.lof 197 | *.log 198 | *.lot 199 | *.fls 200 | *.out 201 | *.toc 202 | *.fmt 203 | *.fot 204 | *.cb 205 | *.cb2 206 | .*.lb 207 | 208 | ## Intermediate documents: 209 | *.dvi 210 | *.xdv 211 | *-converted-to.* 212 | # these rules might exclude image files for figures etc. 213 | # *.ps 214 | # *.eps 215 | # *.pdf 216 | 217 | ## Generated if empty string is given at "Please type another file name for output:" 218 | .pdf 219 | 220 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 221 | *.bbl 222 | *.bcf 223 | *.blg 224 | *-blx.aux 225 | *-blx.bib 226 | *.run.xml 227 | 228 | ## Build tool auxiliary files: 229 | *.fdb_latexmk 230 | *.synctex 231 | *.synctex(busy) 232 | *.synctex.gz 233 | *.synctex.gz(busy) 234 | *.pdfsync 235 | 236 | ## Build tool directories for auxiliary files 237 | # latexrun 238 | latex.out/ 239 | 240 | ## Auxiliary and intermediate files from other packages: 241 | # algorithms 242 | *.alg 243 | *.loa 244 | 245 | # achemso 246 | acs-*.bib 247 | 248 | # amsthm 249 | *.thm 250 | 251 | # beamer 252 | *.nav 253 | *.pre 254 | *.snm 255 | *.vrb 256 | 257 | # changes 258 | *.soc 259 | 260 | # comment 261 | *.cut 262 | 263 | # cprotect 264 | *.cpt 265 | 266 | # elsarticle (documentclass of Elsevier journals) 267 | *.spl 268 | 269 | # endnotes 270 | *.ent 271 | 272 | # fixme 273 | *.lox 274 | 275 | # feynmf/feynmp 276 | *.mf 277 | *.mp 278 | 
*.t[1-9] 279 | *.t[1-9][0-9] 280 | *.tfm 281 | 282 | #(r)(e)ledmac/(r)(e)ledpar 283 | *.end 284 | *.?end 285 | *.[1-9] 286 | *.[1-9][0-9] 287 | *.[1-9][0-9][0-9] 288 | *.[1-9]R 289 | *.[1-9][0-9]R 290 | *.[1-9][0-9][0-9]R 291 | *.eledsec[1-9] 292 | *.eledsec[1-9]R 293 | *.eledsec[1-9][0-9] 294 | *.eledsec[1-9][0-9]R 295 | *.eledsec[1-9][0-9][0-9] 296 | *.eledsec[1-9][0-9][0-9]R 297 | 298 | # glossaries 299 | *.acn 300 | *.acr 301 | *.glg 302 | *.glo 303 | *.gls 304 | *.glsdefs 305 | *.lzo 306 | *.lzs 307 | *.slg 308 | *.slo 309 | *.sls 310 | 311 | # uncomment this for glossaries-extra (will ignore makeindex's style files!) 312 | # *.ist 313 | 314 | # gnuplot 315 | *.gnuplot 316 | *.table 317 | 318 | # gnuplottex 319 | *-gnuplottex-* 320 | 321 | # gregoriotex 322 | *.gaux 323 | *.glog 324 | *.gtex 325 | 326 | # htlatex 327 | *.4ct 328 | *.4tc 329 | *.idv 330 | *.lg 331 | *.trc 332 | *.xref 333 | 334 | # hyperref 335 | *.brf 336 | 337 | # knitr 338 | *-concordance.tex 339 | # TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files 340 | # *.tikz 341 | *-tikzDictionary 342 | 343 | # listings 344 | *.lol 345 | 346 | # luatexja-ruby 347 | *.ltjruby 348 | 349 | # makeidx 350 | *.idx 351 | *.ilg 352 | *.ind 353 | 354 | # minitoc 355 | *.maf 356 | *.mlf 357 | *.mlt 358 | *.mtc[0-9]* 359 | *.slf[0-9]* 360 | *.slt[0-9]* 361 | *.stc[0-9]* 362 | 363 | # minted 364 | _minted* 365 | *.pyg 366 | 367 | # morewrites 368 | *.mw 369 | 370 | # newpax 371 | *.newpax 372 | 373 | # nomencl 374 | *.nlg 375 | *.nlo 376 | *.nls 377 | 378 | # pax 379 | *.pax 380 | 381 | # pdfpcnotes 382 | *.pdfpc 383 | 384 | # sagetex 385 | *.sagetex.sage 386 | *.sagetex.py 387 | *.sagetex.scmd 388 | 389 | # scrwfile 390 | *.wrt 391 | 392 | # svg 393 | svg-inkscape/ 394 | 395 | # sympy 396 | *.sout 397 | *.sympy 398 | sympy-plots-for-*.tex/ 399 | 400 | # pdfcomment 401 | *.upa 402 | *.upb 403 | 404 | # pythontex 405 | *.pytxcode 406 | pythontex-files-*/ 407 | 408 | # 
tcolorbox 409 | *.listing 410 | 411 | # thmtools 412 | *.loe 413 | 414 | # TikZ & PGF 415 | *.dpth 416 | *.md5 417 | *.auxlock 418 | 419 | # titletoc 420 | *.ptc 421 | 422 | # todonotes 423 | *.tdo 424 | 425 | # vhistory 426 | *.hst 427 | *.ver 428 | 429 | # easy-todo 430 | *.lod 431 | 432 | # xcolor 433 | *.xcp 434 | 435 | # xmpincl 436 | *.xmpi 437 | 438 | # xindy 439 | *.xdy 440 | 441 | # xypic precompiled matrices and outlines 442 | *.xyc 443 | *.xyd 444 | 445 | # endfloat 446 | *.ttt 447 | *.fff 448 | 449 | # Latexian 450 | TSWLatexianTemp* 451 | 452 | ## Editors: 453 | # WinEdt 454 | *.bak 455 | *.sav 456 | 457 | # Texpad 458 | .texpadtmp 459 | 460 | # LyX 461 | *.lyx~ 462 | 463 | # Kile 464 | *.backup 465 | 466 | # gummi 467 | .*.swp 468 | 469 | # KBibTeX 470 | *~[0-9]* 471 | 472 | # TeXnicCenter 473 | *.tps 474 | 475 | # auto folder when using emacs and auctex 476 | ./auto/* 477 | *.el 478 | 479 | # expex forward references with \gathertags 480 | *-tags.tex 481 | 482 | # standalone packages 483 | *.sta 484 | 485 | # Makeindex log files 486 | *.lpz 487 | 488 | # xwatermark package 489 | *.xwm 490 | 491 | # REVTeX puts footnotes in the bibliography by default, unless the nofootinbib 492 | # option is specified. Footnotes are the stored in a file with suffix Notes.bib. 493 | # Uncomment the next line to have this generated file ignored. 
494 | #*Notes.bib 495 | 496 | # Emacs .gitignore 497 | *~ 498 | \#*\# 499 | /.emacs.desktop 500 | /.emacs.desktop.lock 501 | *.elc 502 | auto-save-list 503 | tramp 504 | .\#* 505 | 506 | # Org-mode 507 | .org-id-locations 508 | *_archive 509 | 510 | # flymake-mode 511 | *_flymake.* 512 | 513 | # eshell files 514 | /eshell/history 515 | /eshell/lastdir 516 | 517 | # elpa packages 518 | /elpa/ 519 | 520 | # reftex files 521 | *.rel 522 | 523 | # AUCTeX auto folder 524 | /auto/ 525 | 526 | # cask packages 527 | .cask/ 528 | dist/ 529 | 530 | # Flycheck 531 | flycheck_*.el 532 | 533 | # server auth directory 534 | /server/ 535 | 536 | # projectiles files 537 | .projectile 538 | 539 | # directory configuration 540 | .dir-locals.el 541 | 542 | # network security 543 | /network-security.data 544 | 545 | ### Custom 546 | data/* 547 | !.gitkeep 548 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RailScrape (railway-opendata) 2 | 3 | In Italy there are no available official **Open Data** about the _performance_ (delays, cancellations, ...) of the **italian public rail transport**. 4 | This project offers a tool which allows anyone to gather it and run some stats and visualizations. 5 | 6 | ## Architecture 7 | 8 | ```mermaid 9 | flowchart TB 10 | 11 | S[Scraper] --> |Downloads data| D("ViaggiaTreno and Trenord APIs") 12 | S -->|Produces| P[(Daily .pickle dumps)] 13 | E[Extractor] -->|Reads| P 14 | E[Extractor] -->|Produces| C[(Daily .CSV dumps)] 15 | A2["(BYOD Analyzer)"] -.->|Reads| C 16 | A[Analyzer] -->|Reads| C 17 | A[Analyzer] -->|Produces| K(Stats, visualizations, etc...) 18 | ``` 19 | 20 | The application is composed by multiple modules, accessible via CLI: 21 | - **`scraper`**: unattended script to incrementally download and preserve the current status of the italian railway network. If run constantly (e.g. 
~every hour using `cron`) all trains will be captured and saved in `data/%Y-%m-%d/trains.pickle`. 22 | - **`train-extractor`** and **`station-extractor`**: converts raw scraped data to usable `.csv` files; 23 | - **`analyze`** : shows reproducible stats and visualizations. 24 | 25 | ## Running 26 | 27 | The project is written in Python and it uses modern typing annotations, so **Python >= 3.11** is needed. 28 | 29 | ### Using Docker (easy) 30 | 31 | A [Dockerfile](./Dockerfile) is available to avoid installing the dependencies manually. 32 | You can use the automatically updated [ghcr.io/marcobuster/railway-opendata:latest](https://github.com/MarcoBuster/railway-opendata/pkgs/container/railway-opendata) 33 | Docker image if you want the latest version available on the master branch. 34 | 35 | For instance, the following command will start the scraper on your machine. 36 | 37 | ```bash 38 | $ docker run -v ./data:/app/data ghcr.io/marcobuster/railway-opendata:latest scraper 39 | ``` 40 | 41 | ### Using virtual envs 42 | 43 | > ⚠️ __WARNING__: this project currently uses the builtin `hash(...)` function to quickly index objects. 44 | > To ensure reproducibility between runs, you need to disable Python's **hash seed randomization** by setting the `PYTHONHASHSEED=0` environment variable. 45 | > If you fail to do so, the software will refuse to start. 46 | 47 | ```bash 48 | $ export PYTHONHASHSEED=0 49 | $ virtualenv venv 50 | $ source ./venv/bin/activate 51 | $ pip install -r requirements.txt 52 | $ python main.py ... 53 | ``` 54 | 55 | ## Example usages 56 | 57 | - __Start the scraper__. For continuos data collection, it should be run every ~hour. 58 | 59 | `$ python main.py scraper` 60 | 61 | - __Extract train data__ from a pickle file and save it in CSV. 62 | 63 | `$ python main.py train-extractor -o data/2023/04-29/trains.csv data/2023-04-29/trains.pickle` 64 | 65 | - __Extract station data__ from a pickle file and save it in GeoJSON. 
66 | 67 | `$ python main.py station-extractor -f geojson data/stations.pickle` 68 | 69 | - __Describe a dataset__ and filter observation by date. 70 | 71 | `$ python main.py analyze --start-date 2023-05-01 --end-date today data/stations.csv data/2023-05-*/trains.csv --stat describe` 72 | 73 | - __Show delay stats__ of the last stop. 74 | 75 | `$ python main.py analyze --group-by train_hash --agg-func last [..]/stations.csv [..]/trains.csv --stat delay_box_plot` 76 | 77 | - __Show daily train count__ grouped by railway companies. 78 | 79 | `$ python main.py analyze --group-by client_code [..]/stations.csv [..]/trains.csv --stat day_train_count` 80 | 81 | - __Display an interactive map__ and open it in the web browser. 82 | 83 | `$ python main.py analyze [..]/stations.csv [..]/trains.csv --stat trajectories_map` 84 | 85 | - __Display a timetable graph__. 86 | 87 | `$ python main.py analyze [..]/stations.csv [..]/trains.csv --stat timetable --timetable-collapse` 88 | 89 | ## Fields 90 | 91 | ### Stations CSV 92 | 93 | | Column | Data type | Description | Notes | 94 | |--------|-----------|-------------|-------| 95 | | `code` | String | Station code | This field is not actually unique. One station can have multiple codes | 96 | | `region` | Integer | Region code | If zero, unknown. Used in API calls | 97 | | `long_name` | String | Station long name | | 98 | | `short_name` | String | Station short name | Can be empty | 99 | | `latitude` | Float | Station latitude | Can be empty | 100 | | `longitude` | Float | Station longitude | Can be empty | 101 | 102 | ### Trains CSV 103 | In the extracted trains CSV, each line is a _train stop_ (not station nor train). 104 | Many fields are actually duplicated. 
105 | 106 | | Column | Data type | Description | Notes | 107 | |--------|-----------|-------------|-------| 108 | | `train_hash` | MD5 hash | Unique identifier for a particular train | | 109 | | `number` | Integer | Train number | Can't be used to uniquely identify a train[^train_number_unique] | 110 | | `day` | Date | Train departing date | | 111 | | `origin` | Station (code) | Train absolute origin | | 112 | | `category` | String | Train Category | See table[^categories] | 113 | | `destination` | Station (code) | Train final destination | | 114 | | `client_code` | Integer | Railway company | See table[^client_codes] | 115 | | `phantom` | Boolean | True if train was only partially fetched | Trains with this flag should be safely ignored | 116 | | `trenord_phantom` | Boolean | True if the train was only partially fetched using Trenord APIs | Trains with this flag should be safely ignored[^trenord_phantom] | 117 | | `cancelled` | Boolean | True if the train is marked as cancelled | Not all cancelled trains are marked as cancelled: for more accuracy, you should always check `stop_type` | 118 | | `stop_number` | Integer | Stop progressive number (starting at 0) | | 119 | | `stop_station_code` | Station (code) | Stop station code | | 120 | | `stop_type` | Char | Stop type | `P` if first, `F` if intermediate, `A` if last, `C` if cancelled | 121 | | `platform` | String | Stop platform | Can be empty | 122 | | `arrival_expected` | ISO 8601 | Stop expected arrival time | Can be empty | 123 | | `arrival_actual` | ISO 8601 | Stop actual arriving time | Can be empty | 124 | | `arrival_delay` | Integer | Stop arriving delay in minutes | Is empty if `arrival_expected` or `arrival_actual` are both empty | 125 | | `departure_expected` | ISO 8601 | Stop expected departing time | Can be empty | 126 | | `departure_actual` | ISO 8601 | Stop actual departing time | Can be empty | 127 | | `departure_delay` | Integer | Stop departing delay in minutes | Is empty if `departing_expected` 
or `departing_actual` are both empty | 128 | | `crowding` | Integer | Train crowding in percentage | Reported by Trenord | 129 | 130 | [^train_number_unique]: In Italy, two different trains can share the same number. A train is only uniquely identified by the triple (number, origin, day). 131 | 132 | [^categories]: Known categories are listed below. 133 | 134 | | Category | Description | 135 | |----------|-------------| 136 | | REG | Regional trains | 137 | | MET | Metropolitan trains | 138 | | FR | Frecciarossa (red arrow) | 139 | | IC | Intercity | 140 | | ICN | Intercity Night | 141 | | EC | Eurocity | 142 | | FB | Frecciabianca (white arrow) | 143 | | FA | Frecciargento (silver arrow) | 144 | | EN | EuroNight | 145 | | EC ER | Eurocity | 146 | 147 | [^client_codes]: Known client codes are listed below. 148 | 149 | | Client code | Railway company | 150 | |-------------|-----------------| 151 | | 1 | TRENITALIA_AV | 152 | | 2 | TRENITALIA_REG | 153 | | 4 | TRENITALIA_IC | 154 | | 18 | TPER | 155 | | 63 | TRENORD | 156 | | 64 | OBB | 157 | 158 | [^trenord_phantom]: This flag is activated when a train is seen on ViaggiaTreno APIs and marked as Trenord's but it can't be fetched on Trenord's APIs. 159 | 160 | ## Contributing 161 | 162 | See [CONTRIBUTING.md](CONTRIBUTING.md). 163 | 164 | ## Notes and caveats 165 | 166 | ### Data completeness and correctness 167 | 168 | The [ViaggiaTreno](https://viaggiatreno.it) APIs are [known](https://medium.com/@albigiu/trenitalia-shock-non-crederete-mai-a-queste-api-painful-14433096502c) to be **buggy** and **unreliable**. 169 | As stated before, many fields (like `departure_expected` and `arrival_expected`) are not always guaranteed to be present and some concepts are counter-intuitive (a train number is not an unique identifier nor are station codes). 
170 | 171 | ViaggiaTreno is the main _source of truth_ for many final user applications (like [Trenìt!](https://play.google.com/store/apps/details?id=eu.baroncelli.oraritrenitalia) or [Orario Treni](https://play.google.com/store/apps/details?id=org.paoloconte.treni_lite)) and is itself linked on the Trenitalia official website. 172 | For instance, if the API does not return information for a train stop, no other application will display it: the data simply does not exists online. 173 | The scraper always tries to save as much data as possible (___"best effort"___) even when is probably incomplete; in those cases, proper flags (like `phantom` and `trenord_phantom`) are activated so the developer can choose for themselves. 174 | 175 | ### Licensing 176 | 177 | Copyright (c) 2023 Marco Aceti. Some rights reserved (see [LICENSE](./LICENSE)). 178 | 179 | Terms and conditions of the ViaggiaTreno web portal state that copying is prohibited (except for personal use) as **all rights for the content are reserved** to the original owner (Trenitalia or Gruppo FS). 180 | In July 2019 Trenitalia sued Trenìt for using train data in its app, but [partially lost](https://www.wired.it/lifestyle/mobilita/2019/09/06/trenitalia-tornata-online-trenit/). 181 | I think data about the performance of __public__ transport should be __open__ as well, but I'm not a lawyer and I'm not willing to risk lawsuits redistributing data; if someone wants to, the tool is now available. 182 | 183 | BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 184 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 185 | THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. 
186 | SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 187 | 188 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 189 | -------------------------------------------------------------------------------- /src/scraper/train_stop.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
16 | 17 | 18 | import typing as t 19 | from datetime import date, datetime 20 | from enum import Enum 21 | 22 | import src.scraper.api as api 23 | import src.scraper.station as st 24 | from src.scraper.exceptions import IncompleteTrenordStopDataException 25 | 26 | 27 | class TrainStopType(Enum): 28 | """A train stop type.""" 29 | 30 | FIRST = "P" 31 | STOP = "F" 32 | LAST = "A" 33 | CANCELLED = "C" 34 | 35 | 36 | class TrainStopTime: 37 | """Helper class to handle arrival and departures times. 38 | 39 | Attributes: 40 | expected (datetime): expected departing or arrival time 41 | actual (datetime | None): actual departing or arrival time 42 | """ 43 | 44 | def __init__(self, expected: datetime, actual: datetime | None) -> None: 45 | """Initialize a new TrainStopTime object. 46 | 47 | Args: 48 | expected (datetime): expected departing or arrival time 49 | actual (datetime | None): actual departing or arrival time 50 | """ 51 | assert expected is not None 52 | 53 | self.expected: datetime = expected 54 | self.actual: datetime | None = actual 55 | 56 | def passed(self) -> bool: 57 | """Return if the train actually arrived or departed from the station. 58 | 59 | Returns: 60 | bool: True if the actual time is not None 61 | """ 62 | return self.actual is not None 63 | 64 | def delay(self) -> float | None: 65 | """Return the delay in minutes. 66 | 67 | Returns: 68 | int | None: delay in minutes, None if not .passed(). 
69 | """ 70 | if not self.passed(): 71 | return None 72 | 73 | assert isinstance(self.actual, datetime) 74 | assert isinstance(self.expected, datetime) 75 | 76 | if self.actual >= self.expected: 77 | return (self.actual - self.expected).seconds / 60 78 | else: 79 | return -(self.expected - self.actual).seconds / 60 80 | 81 | def __repr__(self) -> str: 82 | hm = lambda d: d.strftime("%H:%M") 83 | 84 | ret: str = hm(self.expected) 85 | if not self.passed(): 86 | return ret 87 | 88 | ret += f" ~ {hm(self.actual)}" 89 | if self.delay() == 0: 90 | return ret 91 | 92 | delay: float | None = self.delay() 93 | assert isinstance(delay, float) 94 | 95 | sign: str = "+" if delay > 0 else "-" 96 | ret += f" {sign}{round(abs(delay), 1)}m" 97 | 98 | return ret 99 | 100 | 101 | class TrainStop: 102 | """A ViaggiaTreno train stop. 103 | 104 | Attributes: 105 | station (st.Station): the station the train is stopping by 106 | stop_type (TrainStopType): the type of stop (first, last, stop) 107 | platform_expected (str | None): expected platform 108 | platform_actual (str | None): actual platform 109 | arrival (TrainStopTime | None): arrival time, can be None if it's the first stop 110 | departure (TrainStopTime | None): departure time, can be None if it's the last stop 111 | """ 112 | 113 | def __init__( 114 | self, 115 | station: st.Station, 116 | stop_type: TrainStopType, 117 | platform_expected: str | None, 118 | platform_actual: str | None, 119 | arrival_expected: datetime | None, 120 | arrival_actual: datetime | None, 121 | departure_expected: datetime | None, 122 | departure_actual: datetime | None, 123 | ) -> None: 124 | """Initialize a new TrainStop object. 
class TrainStop:
    """A ViaggiaTreno train stop.

    Attributes:
        station (st.Station): the station the train is stopping by
        stop_type (TrainStopType): the type of stop (first, last, stop)
        platform_expected (str | None): expected platform
        platform_actual (str | None): actual platform
        arrival (TrainStopTime | None): arrival time, can be None if it's the first stop
        departure (TrainStopTime | None): departure time, can be None if it's the last stop
    """

    def __init__(
        self,
        station: st.Station,
        stop_type: TrainStopType,
        platform_expected: str | None,
        platform_actual: str | None,
        arrival_expected: datetime | None,
        arrival_actual: datetime | None,
        departure_expected: datetime | None,
        departure_actual: datetime | None,
    ) -> None:
        """Initialize a new TrainStop object.

        Args:
            station (st.Station): the station the train is stopping by
            stop_type (TrainStopType): the type of stop (first, last, stop)
            platform_expected (str | None): expected platform
            platform_actual (str | None): actual platform
            arrival_expected (datetime | None): expected arrival time
            arrival_actual (datetime | None): actual arrival time
            departure_expected (datetime | None): expected departure time
            departure_actual (datetime | None): actual departure time
        """
        self.station: st.Station = station
        self.stop_type: TrainStopType = stop_type

        self.platform_expected: str | None = platform_expected
        self.platform_actual: str | None = platform_actual

        self.arrival: TrainStopTime | None = None
        self.departure: TrainStopTime | None = None

        # Cancelled stops carry no usable timing information.
        if self.stop_type == TrainStopType.CANCELLED:
            return

        # First stops have no arrival, last stops have no departure.
        if self.stop_type != TrainStopType.FIRST:
            assert isinstance(arrival_expected, datetime)
            self.arrival = TrainStopTime(arrival_expected, arrival_actual)

        if self.stop_type != TrainStopType.LAST:
            assert isinstance(departure_expected, datetime)
            self.departure = TrainStopTime(departure_expected, departure_actual)

    @classmethod
    def _from_raw_data(cls, stop_data: dict) -> "TrainStop":
        """Initialize a new train stop from the data processed by Train.fetch()

        Args:
            stop_data (dict): the data to initialize the class with

        Returns:
            TrainStop: a constructed TrainStop object
        """
        station = st.Station.by_code(stop_data["id"])
        if station._phantom:
            # Unknown station: recover at least its name from the raw data.
            station.name = stop_data["stazione"].title().strip()

        stop_type: TrainStopType
        if stop_data["tipoFermata"] == "P":
            stop_type = TrainStopType.FIRST
        elif stop_data["tipoFermata"] == "A":
            stop_type = TrainStopType.LAST
        elif stop_data["tipoFermata"] == "F":
            stop_type = TrainStopType.STOP
        else:
            stop_type = TrainStopType.CANCELLED

        _to_dt = api.ViaggiaTrenoAPI._to_datetime

        return cls(
            station=station,
            stop_type=stop_type,
            platform_expected=(
                stop_data["binarioProgrammatoArrivoDescrizione"]
                or stop_data["binarioProgrammatoPartenzaDescrizione"]
            ),
            platform_actual=(
                stop_data["binarioEffettivoArrivoDescrizione"]
                or stop_data["binarioEffettivoPartenzaDescrizione"]
            ),
            arrival_expected=_to_dt(stop_data["arrivo_teorico"]),
            arrival_actual=_to_dt(stop_data["arrivoReale"]),
            departure_expected=_to_dt(stop_data["partenza_teorica"]),
            departure_actual=_to_dt(stop_data["partenzaReale"]),
        )

    @classmethod
    def _from_trenord_raw_data(
        cls, stop_data: dict, day: date
    ) -> t.Union["TrainStop", None]:
        """Initialize a new train stop from data processed by Train.trenord_fetch()

        Args:
            stop_data (dict): the data to initialize the class with
            day (date): the date of the train, used to parse datetimes

        Returns:
            TrainStop | None: a constructed TrainStop object,
            or None if there isn't actual data

        Raises:
            IncompleteTrenordStopDataException: if no station code can be found
        """

        def _hhmmss_to_dt(hhmmss: str | None) -> datetime | None:
            """Parse and return a Trenord time string into a datetime object.

            Args:
                hhmmss (str | None): the string to parse

            Returns:
                datetime | None: the parsed datetime object.
            """
            if not hhmmss:
                return None

            # Trenord only reports the time of day: combine it with `day`.
            return datetime.strptime(hhmmss, "%H:%M:%S").replace(
                year=day.year,
                month=day.month,
                day=day.day,
                tzinfo=api.TIMEZONE,
            )

        if not stop_data["actual_data"]:
            return None

        station_code: str | None = (
            stop_data["station"].get("station_id")
            or stop_data["actual_data"]["actual_station_mir"]
        )
        # Validate explicitly: the previous `assert` would have been
        # stripped under `python -O`, letting invalid codes through.
        if not isinstance(station_code, str) or len(station_code) == 0:
            raise IncompleteTrenordStopDataException

        station = st.Station.by_code(station_code)
        if station._phantom and stop_data.get("station", {}).get("station_ori_name"):
            station.name = stop_data["station"]["station_ori_name"].title().strip()

        stop_type: TrainStopType
        stop_type_raw = (
            stop_data["actual_data"].get("actual_type", None) or stop_data["type"]
        )
        if stop_type_raw == "O":
            stop_type = TrainStopType.FIRST
        elif stop_type_raw == "F":
            stop_type = TrainStopType.STOP
        elif stop_type_raw == "D":
            stop_type = TrainStopType.LAST
        else:
            stop_type = TrainStopType.CANCELLED

        # A cancelled flag overrides whatever type was reported.
        if stop_data["cancelled"]:
            stop_type = TrainStopType.CANCELLED

        return cls(
            station=station,
            stop_type=stop_type,
            platform_expected=stop_data.get("platform", None),
            platform_actual=None,
            arrival_expected=_hhmmss_to_dt(stop_data.get("arr_time")),
            arrival_actual=_hhmmss_to_dt(
                stop_data["actual_data"].get("arr_actual_time")
            ),
            departure_expected=_hhmmss_to_dt(stop_data.get("dep_time")),
            departure_actual=_hhmmss_to_dt(
                stop_data["actual_data"].get("dep_actual_time")
            ),
        )

    def __repr__(self) -> str:
        """Return a human-readable '@ (type) station times [platform]' string."""
        ret = f"@ ({self.stop_type.value}) {self.station.name} "
        if self.stop_type == TrainStopType.FIRST:
            ret += f"{self.departure}"
        elif self.stop_type == TrainStopType.LAST:
            ret += f"{self.arrival}"
        else:
            ret += f"{self.arrival} --> {self.departure}"

        platform_exp: str = self.platform_expected if self.platform_expected else "?"

        if self.platform_actual:
            return ret + f" [{platform_exp} ~ {self.platform_actual}]"
        else:
            return ret + f" [{platform_exp}]"
TrainStopType.LAST: 285 | ret += f"{self.arrival}" 286 | else: 287 | ret += f"{self.arrival} --> {self.departure}" 288 | 289 | platform_exp: str = self.platform_expected if self.platform_expected else "?" 290 | 291 | if self.platform_actual: 292 | return ret + f" [{platform_exp} ~ {self.platform_actual}]" 293 | else: 294 | return ret + f" [{platform_exp}]" 295 | -------------------------------------------------------------------------------- /docs/Proposta tirocinio.tex: -------------------------------------------------------------------------------- 1 | \documentclass[italian,11pt,a4paper,final]{article} 2 | \usepackage[a4paper, 3 | bindingoffset=0.2in, 4 | left=1in, 5 | right=1in, 6 | top=1in, 7 | bottom=1in, 8 | footskip=.25in]{geometry} 9 | \usepackage[utf8]{inputenc} 10 | \usepackage[T1]{fontenc} 11 | \usepackage{hyperref} 12 | \usepackage{babel} 13 | \date{2 marzo 2023} 14 | 15 | \newcommand{\hochkomma}{$^{,\,}$} 16 | 17 | \author{Marco Aceti} 18 | \title{ 19 | Open Data e trasporto ferroviario \\ 20 | \textit{\small{Proposta di tirocinio interno}} 21 | } 22 | 23 | \begin{document} 24 | \maketitle 25 | 26 | \begin{abstract} 27 | In Italia non esistono Open Data sulle performance del trasporto pubblico ferroviario: le metriche definite nei contratti di servizio tra gli enti locali committenti e le imprese ferroviarie sono insufficienti e spesso inaccessibili. 28 | La proposta di tirocinio si articola sull'idea di preservare i dati istantanei della circolazione ferroviaria dalla piattaforma ViaggiaTreno per produrre Open Data storici, \textit{machine-readable} e di qualità. 29 | Infine, si propone un'analisi dei dati raccolti a fini statistici e di verifica. 30 | \end{abstract} 31 | 32 | \section{Stato dell'arte} 33 | In Italia, il servizio di trasporto pubblico è operato da aziende\footnote{\url{https://it.wikipedia.org/wiki/Aziende_di_trasporto_pubblico_italiane}} private o partecipate. 
34 | Sul territorio nazionale sono autorizzate\footnote{\url{https://www.mit.gov.it/documentazione/elenco-imprese-ferroviarie-titolari-di-licenza-1}} una ventina di \textit{Imprese Ferroviarie} (IF) adibite al trasporto passeggeri aventi in essere numerosi \textit{Contratti di Servizio} (CdS) con gli enti locali (tipicamente le Regioni). 35 | La qualità del servizio è misurata da \textbf{metriche di performance} stabilite nei CdS e comunicate agli enti dalle IF. 36 | 37 | \subsection{Esempio: il servizio ferroviario lombardo} 38 | In Lombardia, Trenord S.r.l.\ definisce\footnote{\url{https://www.regione.lombardia.it/wps/wcm/connect/7144d5b9-7e3c-4e44-82ad-30a1652e2642/Contratto+Trenord+con+firme.pdf} -- Allegato 11} un \textit{indice di puntualità entro i 5 minuti} che considera il \textit{``numero di corse circolanti giunte puntuali o con ritardo fino a 5 minuti''}, ma esclude i \textit{``ritardi maturati per cause esterne''} o \textit{``per lavori''}. 39 | La Regione pubblica mensilmente un rapporto sulla puntualità dei treni\footnote{\url{https://www.regione.lombardia.it/wps/wcm/connect/4eae62eb-dfcf-4446-82ea-72dbfdfb2c4a/Puntualit\%C3\%A0.pdf}} in formato PDF, ma con diverse criticità: 40 | \begin{itemize} 41 | \item vengono considerati solo i ritardi in arrivo alla destinazione finale, escludendo quindi le stazioni intermedie; 42 | \item i dati forniti non sono granulari ma \textit{brutalmente} aggregati per mese; 43 | \item sono escluse le \textit{cause esterne} e le \textit{circostanze occasionali}: gli indici di puntualità effettivi non sono pubblicati; 44 | \item i rapporti non rispettano neanche una \textit{stella} dei livelli definiti da Tim Berners-Lee per valutare gli Open Data: non è nemmeno presente una licenza d'uso. 
45 | \end{itemize} 46 | 47 | C'è da considerare inoltre che Trenord (società tra l'altro partecipata al 50\% da Regione Lombardia stessa) comunica al committente gli indici già calcolati, senza che quest'ultimo abbia modo di verificarli. 48 | 49 | Infine, non tutti gli enti committenti pubblicano rapporti sulla qualità del servizio: per esempio, la Regione Campania prevede nel suo CdS\footnote{ 50 | \url{https://www.regione.campania.it/assets/documents/contratto-di-servizio-tpl-ferro.pdf} \\ 51 | sez.\ \textit{``Penali e forme di mitigazione delle stesse''} -- Allegato 7 52 | } con Trenitalia S.p.A.\ la fornitura di indici simili per il calcolo di penali e mitigazioni, 53 | ma non è reperibile nessun documento che li attesti. \\ 54 | 55 | \subsection{Open Data storici} 56 | In conclusione, non esistono attualmente Open Data {storici}, completi, strutturati e \textit{machine-readable} sul servizio di trasporto ferroviario in Italia. 57 | Gli indici di puntualità (e affidabilità) definiti nei CdS possono essere utili agli enti committenti per calcolare penali o comparare offerte di mercato, ma i Cittadini Digitali meritano una \textbf{maggiore trasparenza} per poter verificare autonomamente lo stato reale del \textit{Sistema Ferrovia}. 58 | 59 | \section{Rilevazioni istantanee} 60 | Nella sezione precedente si è discusso di \textbf{dati storici}; la situazione è molto più rosea per i \textbf{dati in tempo reale}. 61 | Esistono innumerevoli siti web e applicazioni, ufficiali e non, che mostrano lo stato attuale di un treno in viaggio. 62 | L'app \textit{Orario Treni}\footnote{\url{https://www.orariotreniapp.it/}} di Paolo Conte, per esempio, presenta con un'interfaccia molto semplice e intuitiva la possibilità di cercare treni per itinerario e numero, visualizzare arrivi e partenze di una stazione e consultare l'\textit{andamento istantaneo} di un treno. 
63 | Quest'ultimo è composto da informazioni come gli orari programmati ed \textit{effettivi} di partenza e arrivo ad ogni fermata intermedia, ritardo cumulato fino a quel momento e luogo di ultimo rilevamento (non necessariamente corrispondente ad una fermata). \\ 64 | 65 | L'idea fondante della proposta in oggetto è sfruttare la ghiotta quantità di dati offerta dalle rilevazioni istantanee nel corso del tempo per produrre Open Data storici. 66 | 67 | \subsection{ViaggiaTreno} 68 | Il Gruppo Ferrovie dello Stato Italiane (\textit{holding} di diverse società\footnote{\url{https://it.wikipedia.org/wiki/Ferrovie_dello_Stato_Italiane}} come Trenitalia, RFI, ANAS, ...) permette ai viaggiatori di trovare soluzioni di viaggio e visualizzare l'andamento di una corsa tramite la piattaforma web ViaggiaTreno\footnote{\url{http://www.viaggiatreno.it/infomobilita/index.jsp}}, similmente all'app \textit{Orario Treni}. 69 | Si può infatti speculare che quest'ultima utilizzi proprio ViaggiaTreno come fonte dei dati. 70 | 71 | \subsubsection{API} 72 | Il \textit{motore} dell'interfaccia web di ViaggiaTreno è un insieme di API ``REST'' non ufficialmente documentate e di scarsa qualità\footnote{\url{https://medium.com/@albigiu/trenitalia-shock-non-crederete-mai-a-queste-api-painful-14433096502c}}. 73 | In rete sono presenti diversi tentativi di documentazione, mantenuti dalla community open source\footnote{\url{https://github.com/sabas/trenitalia}}\hochkomma\footnote{\url{https://github.com/roughconsensusandrunningcode/TrainMonitor/wiki/API-del-sistema-Viaggiatreno}}\hochkomma\footnote{\url{https://github.com/Razorphyn/Informazioni-Treni-Italiani}}. 74 | 75 | \subsubsection{Copyright e licenza d'uso} 76 | Le \textit{note legali} riportate sul portale ViaggiaTreno sono abbastanza aggressive. 77 | \begin{quote} 78 | \textit{I contenuti, la grafica e le immagini sono soggetti a Copyright. 
\textbf{Ogni diritto sui contenuti} (a titolo esemplificativo e non esaustivo: l’architettura del servizio, i testi, le immagini grafiche e fotografiche, ecc.) \textbf{è riservato ai sensi della normativa vigente}. I contenuti di ViaggiaTreno non possono, neppure in parte, essere copiati, riprodotti, trasferiti, caricati, pubblicati o distribuiti in qualsiasi modo senza il preventivo consenso scritto della società Trenitalia S.p.A.. È possibile scaricare i contenuti nel proprio computer e/o stampare estratti \textbf{unicamente per utilizzo personale} di carattere informativo. \textbf{Qualsiasi forma di link al sito www.ViaggiaTreno.it deve essere preventivamente autorizzata}\footnote{L'autore di questo documento si dichiara reo del \textit{reato di linking non autorizzato}} e non deve recare danno all'immagine e alle attività di Trenitalia S.p.A.. è vietato il c.d.\ deep linking ossia l'utilizzo, su siti di soggetti terzi, di parti del Servizio Internet o, comunque, il collegamento diretto alle pagine senza passare per la home page del Servizio Internet. \textbf{L'eventuale inosservanza delle presenti disposizioni}, salvo esplicita autorizzazione scritta, \textbf{sarà perseguita} nelle competenti sedi giudiziarie civili e penali.} 79 | \end{quote} 80 | Il Gruppo Ferrovie dello Stato Italiane vieta formalmente ai soggetti non autorizzati l'utilizzo di ViaggiaTreno per fini diversi dal mero uso personale, riservando tutti i diritti sui contenuti. 81 | Nel 2019, l'applicazione Trenìt!\ è stata costretta\footnote{\url{https://www.startmag.it/smartcity/perche-trenitalia-ha-tamponato-lapp-trenit-per-il-momento/}} a interrompere il servizio in seguito a un processo giudiziario iniziato da Trenitalia, che contestava il riutilizzo dei dati sulla circolazione ferroviaria presenti su ViaggiaTreno. 
82 | Il giudice nella sua sentenza\footnote{\url{https://www.startmag.it/innovazione/trenit-trenitalia/}} ha invece stabilito che \textit{``la banca dati degli orari dei treni e i prezzi di questi, non è protetta da diritto d’autore''} e quindi Trenìt!\ li può utilizzare. \\ 83 | 84 | Ritengo quindi che non ci siano reali limiti legali nell'utilizzo della piattaforma ViaggiaTreno e in particolare delle sue API per i fini della proposta in oggetto. 85 | 86 | \subsection{Avvisi Trenord sulla circolazione} 87 | 88 | Trenord, oltre alla tracciabilità dei suoi treni in ViaggiaTreno, offre anche un servizio di avviso delle criticità di tutte le linee (simile all'InfoMobilità di Trenitalia). 89 | Gli avvisi sono rilasciati da esseri umani, ma hanno un formato simile. Di seguito ne sono riportati alcuni della linea \textit{Verona-Brescia-Milano}\footnote{\url{https://www.trenord.it/linee-e-orari/circolazione/le-nostre-linee/brescia-treviglio-milano/?code=R4}}. 90 | 91 | \begin{quote} 92 | \textbf{Criticità} --- 01/03/2023 06:24 93 | 94 | \texttt{Aggiornamento: 95 | Il treno 10913 (MILANO GRECO PIRELLI 05:52 - BRESCIA 07:12) sta viaggiando con un ritardo di 30 minuti perché è stato necessario prolungare i controlli tecnici che precedono la partenza del treno.} 96 | \end{quote} 97 | 98 | \begin{quote} 99 | \textbf{Criticità} --- 01/03/2023 10:07\nopagebreak 100 | 101 | \texttt{Il treno 2624 (VERONA PORTA NUOVA 09:43 - MILANO CENTRALE 11:35) viaggia con 12 minuti di ritardo in seguito alla sosta prolungata di un altro treno della linea.} 102 | \end{quote} 103 | 104 | \section{Proposta operativa} 105 | La proposta si articola in tre fasi. 106 | 107 | \subsection{Indagine esplorativa} 108 | Come concordato a voce nello scorso colloquio, in questa fase potrei indagare più a fondo sullo stato degli Open Data nel trasporto ferroviario in Italia e negli altri Paesi europei. 109 | Progetti simili potrebbero influenzare positivamente scelte come la granularità e il formato dei dati. 
Al fine di avere dati significativi nella fase successiva, è importante iniziare il prima possibile l'attività di raccolta dati.
129 | 130 | \end{document} 131 | -------------------------------------------------------------------------------- /src/analysis/trajectories_map.py: -------------------------------------------------------------------------------- 1 | # railway-opendata: scrape and analyze italian railway data 2 | # Copyright (C) 2023 Marco Aceti 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 16 | 17 | 18 | import itertools 19 | import logging 20 | import pathlib 21 | import typing as t 22 | import webbrowser 23 | from collections import defaultdict 24 | from datetime import datetime, timedelta 25 | from tempfile import NamedTemporaryFile 26 | 27 | import folium 28 | import folium.plugins 29 | import numpy as np 30 | import pandas as pd 31 | from branca.colormap import LinearColormap 32 | from branca.element import MacroElement, Template 33 | from colour import Color 34 | from joblib import Parallel, delayed 35 | 36 | # The 'length' (in minutes) of a frame 37 | WINDOW_SIZE: int = 2 38 | assert WINDOW_SIZE > 0 39 | 40 | # Minimum line weight 41 | MIN_WEIGHT: int = 4 42 | assert MIN_WEIGHT > 0 43 | 44 | # Safe values used in sanity checks 45 | MIN_YEAR: int = datetime.now().year - 50 46 | MAX_YEAR: int = datetime.now().year + 10 47 | 48 | # Folium map initialization arguments 49 | MAP_KWARGS: dict = { 50 | "location": (41.890, 12.492), 51 | "zoom_start": 7, 52 | "attr": "OSM", 53 | 
# Assets path (marker icons)
ASSETS_PATH = pathlib.Path("./src/analysis/assets/").resolve()

# Delay color range: (lower_bound, color)
_color_map: list[tuple[float, Color]] = [
    (-5, Color("#34ebc0")),
    (0, Color("green")),
    (10, Color("orange")),
    (30, Color("red")),
    (120, Color("black")),
]

# Statically populate the COLORS dict: every integer minute between two
# consecutive bounds gets an interpolated shade; unknown keys fall back
# to gray via the defaultdict factory.
COLORS: dict[int | float, Color] = defaultdict(lambda: Color("gray"))
for (low_bound, low_color), (high_bound, high_color) in zip(_color_map, _color_map[1:]):
    minute_range = range(round(low_bound), round(high_bound) + 1)
    gradient = low_color.range_to(high_color, len(minute_range))
    for minute, shade in zip(minute_range, gradient):
        COLORS[minute] = shade


def fill_time(start: datetime, end: datetime) -> t.Generator[datetime, None, None]:
    """Generate a consecutive list of times between the 'start' and 'end' period.

    Args:
        start (datetime): start time
        end (datetime): end time

    Returns:
        Generator[datetime, None, None]: the generated datetimes,
        WINDOW_SIZE minutes apart (both endpoints included)
    """
    step = timedelta(minutes=WINDOW_SIZE)

    # Degenerate interval: widen it by one window so that
    # at least two timestamps are produced.
    if start == end:
        start -= step

    current = start
    while current <= end:
        yield current
        current += step


def icon_marker(railway_company: str, category: str) -> str:
    """Select a proper marker (from the src/analysis/assets/markers/ directory)
    by railway_company and category.

    Args:
        railway_company (str): a railway company
        category (str): a category

    Returns:
        str: filename of the proper marker
    """
    # Normalize: metropolitan trains use the regional marker,
    # "EC FR" is displayed as a plain Eurocity.
    normalized = category.replace("MET", "REG").replace("EC FR", "EC")
    company = railway_company.lower()

    trenitalia_known = ("EC", "FA", "FB", "FR", "IC", "ICN", "REG")
    if company.startswith("trenitalia") and normalized in trenitalia_known:
        return f"trenitalia_{normalized.lower()}.svg"

    if company in ("trenord", "tper") and normalized == "REG":
        return f"{company}_reg.svg"

    if company == "obb" and normalized == "EC":
        return "obb_ec.svg"

    return "other.svg"
@delayed
def train_stop_geojson(st: pd.DataFrame, train: pd.DataFrame) -> list[dict]:
    """Generate a list of GeoJSON formatted data for train stops.

    Args:
        st (pd.DataFrame): global station data
        train (pd.DataFrame): the train stop data

    Returns:
        list[dict]: GeoJSON formatted dictionaries representing
        the train _geographic trajectory_.
    """

    def _locate(station_code) -> pd.Series:
        """Return the first station row matching station_code that has
        coordinates; raises IndexError when there is none."""
        return st.loc[
            (st.index == station_code) & ~st.latitude.isna() & ~st.longitude.isna()
        ].iloc[0]

    features: list[dict] = []
    train = train.sort_values(by="stop_number")

    # Walk the ordered stops pairwise: each pair is one trajectory segment.
    for idx in range(1, len(train)):
        prev_stop = train.iloc[idx - 1]
        curr_stop = train.iloc[idx]

        try:
            origin = _locate(prev_stop.stop_station_code)
            destination = _locate(curr_stop.stop_station_code)
        except IndexError:
            # The station location can't be retrieved
            continue

        segment_start: datetime | None = (
            prev_stop.departure_actual or prev_stop.departure_expected
        )
        segment_end: datetime | None = (
            curr_stop.arrival_actual or curr_stop.arrival_expected
        )
        delay: float = (
            round(prev_stop.departure_delay)
            if not np.isnan(prev_stop.departure_delay)
            else np.nan
        )

        # Sanity check: _time must be not null
        if not segment_start or not segment_end:
            continue

        # Sanity check: a train should arrive in a given station after
        # it departs from the previous one
        if not segment_end >= segment_start:
            continue

        # Sanity check: sometimes the API returns insane year values
        if segment_end.year > MAX_YEAR or segment_start.year < MIN_YEAR:
            continue

        # Tooltip pop up display.
        # NOTE(review): the arrival figure next to the destination reads
        # prev_stop.arrival_delay — possibly curr_stop was intended;
        # preserved as-is. TODO confirm.
        tooltip: str = (
            f"{curr_stop.client_code}{curr_stop.category} {curr_stop.number}"
            f"{origin.long_name} "
            f"{f'({round(prev_stop.departure_delay, 1):+g} min)' if not np.isnan(prev_stop.departure_delay) else ''}"
            f" → "
            f"{destination.long_name} "
            f"{f' ({round(prev_stop.arrival_delay, 1):+g} min)' if not np.isnan(prev_stop.arrival_delay) else ''}"
        )

        # Hoist loop invariants: color and weight do not change per frame.
        segment_color = COLORS[delay].get_hex()
        segment_weight = (
            int(curr_stop.crowding / 10)
            if not np.isnan(curr_stop.crowding) and curr_stop.crowding > MIN_WEIGHT * 10
            else MIN_WEIGHT
        )
        marker_url = str(
            ASSETS_PATH
            / "markers"
            / icon_marker(curr_stop.client_code, curr_stop.category)
        )

        for timestamp in fill_time(segment_start, segment_end):
            frame_time = timestamp.isoformat()
            features.append(
                {
                    "type": "Feature",
                    "geometry": {
                        "type": "LineString",
                        "coordinates": [
                            (origin.longitude, origin.latitude),
                            (destination.longitude, destination.latitude),
                        ],
                    },
                    "properties": {
                        "times": [frame_time] * 2,
                        "style": {
                            "color": segment_color,
                            "weight": segment_weight,
                        },
                        "tooltip": tooltip,
                    },
                }
            )
            features.append(
                {
                    "type": "Feature",
                    "geometry": {
                        "type": "Point",
                        "coordinates": (destination.longitude, destination.latitude),
                    },
                    "properties": {
                        "icon": "marker",
                        "iconstyle": {
                            "iconUrl": marker_url,
                            "iconSize": [24, 24],
                            "fillOpacity": 1,
                        },
                        "tooltip": tooltip,
                        "name": "",
                        "times": [frame_time],
                    },
                }
            )

    return features
class StatsChart(MacroElement):
    """Helper class to compute and embed the train count chart."""

    def __init__(self, df: pd.DataFrame, *args, **kwargs):
        """Initialize a new object.

        Args:
            df (pd.DataFrame): the train stop data
        """
        super().__init__(*args, **kwargs)

        # Prepare dataset: one row per train with its overall departure
        # time, arrival time and mean delay.
        trains = df.groupby("train_hash")
        self.data = pd.DataFrame(index=df.train_hash.unique())
        self.data["departure"] = trains.first()["departure_actual"].fillna(
            trains.first()["departure_expected"]
        )
        # Bug fix: fall back on the *last* stop's expected arrival.
        # First stops have no arrival data at all (see TrainStop), so the
        # previous fillna with trains.first()["arrival_expected"] was
        # effectively a no-op and left NaN arrivals in place.
        self.data["arrival"] = trains.last()["arrival_actual"].fillna(
            trains.last()["arrival_expected"]
        )
        self.data["delay"] = trains.mean(numeric_only=True)["departure_delay"].fillna(
            trains.mean(numeric_only=True)["arrival_delay"]
        )

    def get_train_count_data(self) -> list[dict[str, str | int]]:
        """Return circulating train count in a JS-likable format."""
        ret: list[dict[str, str | int]] = []
        for time in fill_time(self.data.departure.min(), self.data.arrival.max()):
            # A train is "circulating" when `time` falls inside its
            # departure-arrival interval.
            subset: pd.DataFrame = self.data.loc[
                (time >= self.data.departure) & (time <= self.data.arrival)
            ]
            ret.append(
                {
                    "x": time.isoformat(),
                    "y": len(subset),
                }
            )
        return ret

    def get_delays_data(self) -> list[dict[str, str | float]]:
        """Return the mean delay of circulating trains in a JS-likable format.

        Windows with 20 or fewer circulating trains yield the string "NaN"
        so the chart skips statistically weak points.
        """
        ret: list[dict[str, str | float]] = []
        for time in fill_time(self.data.departure.min(), self.data.arrival.max()):
            subset: pd.DataFrame = self.data.loc[
                (time >= self.data.departure) & (time <= self.data.arrival)
            ]
            ret.append(
                {
                    "x": time.isoformat(),
                    "y": subset.delay.mean() if len(subset) > 20 else "NaN",
                }
            )
        return ret


class MarkerLegend(MacroElement):
    """Helper class to embed the marker legend"""

    @staticmethod
    def get_markers_path() -> str:
        """Return the absolute path of assets"""
        return str(ASSETS_PATH / "markers")


def build_map(st: pd.DataFrame, df: pd.DataFrame) -> None:
    """Build a Folium map with train trajectories,
    and open it with a web browser.

    Args:
        st (pd.DataFrame): global station data
        df (pd.DataFrame): the train stop data
    """
    m = folium.Map(**MAP_KWARGS)

    # Drop cancelled stops and trains
    df = df.loc[(df.stop_type != "C") & (df.cancelled == False)].copy()

    logging.info("Generating GeoJSON features...")
    features = Parallel(n_jobs=-1, verbose=5)(
        train_stop_geojson(st, train_df) for _, train_df in df.groupby("train_hash")
    )

    # Add TimestampedGeoJson plugin
    folium.plugins.TimestampedGeoJson(
        {
            "type": "FeatureCollection",
            "features": list(itertools.chain(*features)),  # type: ignore
        },
        add_last_point=False,
        period=f"PT{WINDOW_SIZE}M",
        duration=f"PT{WINDOW_SIZE}M",
    ).add_to(m)

    # Add delay legend
    LinearColormap(
        colors=list(map(lambda c: c.get_rgb(), COLORS.values())),
        index=COLORS.keys(),
        vmin=min(COLORS.keys()),
        vmax=min(60, max(COLORS.keys())),
        max_labels=50,
        tick_labels=list(range(-5, 61, 5)),
        caption="Departure delay",
    ).add_to(m)

    # Add marker legend. f.read() preserves the template verbatim: the
    # previous "\n".join(f.readlines()) doubled every newline, because
    # readlines() keeps the line terminators.
    legend = MarkerLegend()
    with open(ASSETS_PATH / "templates" / "marker_legend.html", "r") as f:
        legend._template = Template(f.read())
    m.get_root().add_child(legend)

    # Add train count chart
    macro = StatsChart(df)
    with open(ASSETS_PATH / "templates" / "stats_chart.html", "r") as f:
        macro._template = Template(f.read())
    m.get_root().add_child(macro)

    # Save the map to a temporary file and open it with a web browser
    outfile = NamedTemporaryFile(delete=False, suffix=".html")
    m.save(outfile.file)

    webbrowser.open(outfile.name)