56 |
57 |
159 |
160 |
161 |
162 | {% endmacro %}
163 |
--------------------------------------------------------------------------------
/src/analysis/main.py:
--------------------------------------------------------------------------------
1 | # railway-opendata: scrape and analyze italian railway data
2 | # Copyright (C) 2023 Marco Aceti
3 | #
4 | # This program is free software; you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation; either version 2 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 |
18 | import argparse
19 | import logging
20 | import pathlib
21 | import warnings
22 | from datetime import datetime
23 |
24 | import matplotlib.pyplot as plt
25 | import pandas as pd
26 | from dateparser import parse
27 | from joblib import Parallel, delayed
28 | from pandas.core.groupby.generic import DataFrameGroupBy
29 |
30 | from src.analysis import groupby, stat, timetable, trajectories_map
31 | from src.analysis.filter import *
32 | from src.analysis.load_data import read_station_csv, read_train_csv, tag_lines
33 |
34 |
def register_args(parser: argparse.ArgumentParser):
    """Register the command line arguments of the analysis module.

    Optional arguments cover the date range, the railway company and
    railway line filters, the group-by key and its aggregation function,
    the stat to compute and the figure output. The two positional
    arguments are the station CSV and one or more train CSVs.

    Args:
        parser (argparse.ArgumentParser): the parser to add the arguments to
    """
    parser.add_argument(
        "--start-date",
        help="the start date in a 'dateparser'-friendly format",
    )
    parser.add_argument(
        "--end-date",
        help="the end date in a 'dateparser'-friendly format",
    )
    parser.add_argument(
        "--railway-companies",
        help="comma-separated list of railway companies to include. If not set, all companies will be included.",
        dest="client_codes",
    )
    parser.add_argument(
        "--railway-lines",
        help=(
            "comma-separated list of railway lines to include. "
            "If not set, all lines will be included. "
            "Use --stat detect_lines to see available lines."
        ),
        dest="railway_lines",
    )
    parser.add_argument(
        "--group-by",
        help="group by stops by a value",
        choices=(
            "none",
            "train_hash",
            "client_code",
            "weekday",
        ),
        default="none",
    )
    parser.add_argument(
        "--agg-func",
        help="group by aggregation function",
        choices=(
            "none",
            "mean",
            "last",
        ),
        default="none",
    )
    parser.add_argument(
        "--stat",
        help="the stat to calculate",
        choices=(
            "describe",
            "delay_boxplot",
            "day_train_count",
            "trajectories_map",
            "detect_lines",
            "timetable",
        ),
        default="describe",
    )
    parser.add_argument(
        "--save-fig",
        metavar="FILENAME",
        help="save the output figure to a file if using delay_boxplot or day_train_count stats. If not specified, use pyplot.show()",
        default=None,
    )
    parser.add_argument(
        "--timetable-collapse",
        help="collapse the train stop times in the graph, relative to the first (only for 'timetable' stat). Defaults to False",
        # Generates both --timetable-collapse and --no-timetable-collapse.
        action=argparse.BooleanOptionalAction,
        default=False,
    )
    parser.add_argument(
        "station_csv",
        help="exported station CSV",
    )
    parser.add_argument(
        "trains_csv",
        nargs="+",
        help="exported train CSV",
    )
113 |
114 |
@delayed
def _load_train_dataset(train_csv: str) -> pd.DataFrame:
    """Read one exported train CSV into a DataFrame.

    The function is wrapped with joblib's ``delayed``, so calling it only
    describes the task; the CSV is actually read when the task is executed
    by a ``Parallel`` runner.

    Args:
        train_csv (str): path of the train CSV to read

    Returns:
        pd.DataFrame: the data points read from the file
    """
    csv_path = pathlib.Path(train_csv)
    loaded: pd.DataFrame = read_train_csv(csv_path)
    logging.debug(f"Loaded {len(loaded)} data points @ {csv_path}")
    return loaded
121 |
122 |
def main(args: argparse.Namespace):
    """Load the train data, filter and group it, then run the selected stat.

    Steps, in order: parse the date bounds, load every train CSV in
    parallel, tag the railway lines, apply the date / company / line
    filters, optionally group and aggregate, compute the stat and finally
    show or save the figure for the plotting stats.

    Args:
        args (argparse.Namespace): the parsed command line arguments
            (see register_args)

    Raises:
        argparse.ArgumentTypeError: if a start or end date is given
            but can't be parsed
        ValueError: if the selected stat needs a plain DataFrame but the
            data is still grouped (no aggregation function), or if the
            timetable stat is requested and timetable.same_line(df) is false
    """
    # NOTE(review): warnings are silenced only while the dates are parsed —
    # confirm this is the intended scope of the filter.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        start_date: datetime | None = parse(args.start_date if args.start_date else "")
        if args.start_date and not start_date:
            raise argparse.ArgumentTypeError("invalid start_date")

        end_date: datetime | None = parse(args.end_date if args.end_date else "")
        if args.end_date and not end_date:
            raise argparse.ArgumentTypeError("invalid end_date")

    railway_companies: str | None = args.client_codes
    railway_lines: str | None = args.railway_lines

    # Load dataset
    logging.info("Loading datasets...")
    frames: list[pd.DataFrame] = list(
        Parallel(n_jobs=-1, verbose=5)(
            _load_train_dataset(train_csv) for train_csv in args.trains_csv  # type: ignore
        )
    )

    # Concatenate once: growing a frame inside the loop copies all the rows
    # loaded so far at every iteration.
    df: pd.DataFrame | DataFrameGroupBy = (
        pd.concat(frames, axis=0) if frames else pd.DataFrame()
    )
    df.reset_index(drop=True, inplace=True)

    stations: pd.DataFrame = read_station_csv(args.station_csv)
    original_length: int = len(df)

    # Tag lines
    df = tag_lines(df, stations)

    # Apply filters
    df = date_filter(df, start_date, end_date)
    df = railway_company_filter(df, railway_companies)
    df = railway_lines_filter(df, railway_lines)
    logging.info(f"Loaded {len(df)} data points ({original_length} before filtering)")

    # Prepare graphics
    stat.prepare_mpl(df, args)

    if args.group_by != "none":
        df_grouped: DataFrameGroupBy | None = None

        if args.group_by == "train_hash":
            df_grouped = groupby.train_hash(df)
        elif args.group_by == "client_code":
            df_grouped = groupby.client_code(df)
        elif args.group_by == "weekday":
            df_grouped = groupby.weekday(df)

        assert df_grouped is not None

        if args.agg_func == "last":
            df = df_grouped.last()
        elif args.agg_func == "mean":
            df = df_grouped.mean(numeric_only=True)
        elif args.agg_func == "none":
            # No aggregation: the stats below receive the grouped object.
            df = df_grouped

    if args.stat in [
        "trajectories_map",
        "detect_lines",
        "timetable",
    ] and not isinstance(df, pd.DataFrame):
        raise ValueError(f"can't use {args.stat} with unaggregated data")

    if args.stat == "describe":
        stat.describe(df)
    elif args.stat == "delay_boxplot":
        stat.delay_boxplot(df)
    elif args.stat == "day_train_count":
        stat.day_train_count(df)
    elif args.stat == "trajectories_map":
        trajectories_map.build_map(stations, df)
    elif args.stat == "detect_lines":
        stat.detect_lines(df, stations)
    elif args.stat == "timetable":
        if not timetable.same_line(df):
            raise ValueError(
                "can't use timetable if --railway-lines filter is not used"
            )
        timetable.timetable_graph(df, stations, args.timetable_collapse)

    # Visualizations only
    if args.stat in ["delay_boxplot", "day_train_count", "timetable"]:
        plt.tight_layout()
        if args.save_fig:
            plt.savefig(args.save_fig)
        else:
            plt.show()
214 |
--------------------------------------------------------------------------------
/src/train_extractor.py:
--------------------------------------------------------------------------------
1 | # railway-opendata: scrape and analyze italian railway data
2 | # Copyright (C) 2023 Marco Aceti
3 | #
4 | # This program is free software; you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation; either version 2 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 |
18 | import argparse
19 | import csv
20 | import hashlib
21 | import pickle
22 | from datetime import date, datetime
23 | from pathlib import Path
24 |
25 | from src.const import TIMEZONE
26 | from src.scraper.train import Train
27 | from src.scraper.train_stop import TrainStopTime
28 | from src.utils import parse_input_format_output_args
29 |
30 |
31 | def load_file(file: Path) -> dict[int, Train]:
32 | """Load a train data pickle file and return it.
33 |
34 | Args:
35 | file (Path): the file to load
36 |
37 | Returns:
38 | dict[int, Train]: the train data contained in the file
39 |
40 | Notes:
41 | Before commit 48966dfab25553650e3d743a4ecc77db02c4b30,
42 | departure and arrival timestamps dates of Trenord trains
43 | were all 1900-01-01.
44 | This function fixes such incorrect dates.
45 | """
46 | with open(file, "rb") as f:
47 | data: dict[int, Train] = pickle.load(f)
48 |
49 | def _fix_datetime(train: Train, dt: datetime | None) -> datetime | None:
50 | """Fix departure and arrival timestamps"""
51 | if isinstance(dt, datetime) and dt.year < 2000:
52 | dep_date: date = train.departing_date
53 | dt = dt.replace(
54 | year=dep_date.year,
55 | month=dep_date.month,
56 | day=dep_date.day,
57 | tzinfo=TIMEZONE,
58 | )
59 | return dt
60 |
61 | def _detect_crazy_time_difference(train: Train, time: TrainStopTime):
62 | """Ignore trains if the difference between expected and actual
63 | times in a stop is greater than one day.
64 |
65 | Example:
66 | REG Train 17907 operated by TPER. S05311 stop on 2023-03-30.
67 | arrival_expected 2025-08-30 17:33:00+02:00
68 | arrival_actual 2023-03-30 17:34:30+02:00
69 | arrival_delay -1438.5
70 | """
71 | if not time.actual or not time.expected:
72 | return
73 |
74 | if abs((time.actual - time.expected).days) > 1:
75 | train._phantom = True
76 |
77 | for train_h in data:
78 | train: Train = data[train_h]
79 |
80 | for stop in train.stops if isinstance(train.stops, list) else []:
81 | if isinstance(stop.arrival, TrainStopTime):
82 | _detect_crazy_time_difference(train, stop.arrival)
83 | stop.arrival.actual = _fix_datetime(train, stop.arrival.actual)
84 | stop.arrival.expected = _fix_datetime(train, stop.arrival.expected) # type: ignore
85 | if isinstance(stop.departure, TrainStopTime):
86 | _detect_crazy_time_difference(train, stop.departure)
87 | stop.departure.actual = _fix_datetime(train, stop.departure.actual)
88 | stop.departure.expected = _fix_datetime(train, stop.departure.expected) # type: ignore
89 |
90 | if train.client_code == 63:
91 | train._fix_intraday_datetimes()
92 |
93 | return data
94 |
95 |
96 | def to_csv(data: dict[int, Train], output_file: Path) -> None:
97 | """Convert to CSV train data, one row per stop.
98 |
99 | Args:
100 | data (dict[int, Train]): the data to convert
101 | output_file (Path): the file to write
102 | """
103 | FIELDS: tuple = (
104 | "train_hash",
105 | "number",
106 | "day",
107 | "origin",
108 | "destination",
109 | "category",
110 | "client_code",
111 | "phantom",
112 | "trenord_phantom",
113 | "cancelled",
114 | "stop_number",
115 | "stop_station_code",
116 | "stop_type",
117 | "platform",
118 | "arrival_expected",
119 | "arrival_actual",
120 | "arrival_delay",
121 | "departure_expected",
122 | "departure_actual",
123 | "departure_delay",
124 | "crowding",
125 | )
126 |
127 | csvfile = open(output_file, "w+", newline="")
128 | writer = csv.writer(
129 | csvfile,
130 | delimiter=",",
131 | quotechar="|",
132 | quoting=csv.QUOTE_MINIMAL,
133 | )
134 | writer.writerow(FIELDS)
135 |
136 | for train_h in data:
137 | train: Train = data[train_h]
138 |
139 | for i, stop in enumerate(train.stops) if isinstance(train.stops, list) else []:
140 | writer.writerow(
141 | (
142 | hashlib.md5(str(train_h).encode("ascii")).hexdigest(),
143 | train.number,
144 | train.departing_date.isoformat(),
145 | train.origin.code,
146 | train.destination.code if train.destination else None,
147 | train.category,
148 | train.client_code,
149 | train._phantom,
150 | train._trenord_phantom
151 | if hasattr(train, "_trenord_phantom")
152 | else False,
153 | train.cancelled,
154 | i,
155 | stop.station.code,
156 | stop.stop_type.value,
157 | stop.platform_actual or stop.platform_expected,
158 | stop.arrival.expected.isoformat()
159 | if stop.arrival and stop.arrival.expected
160 | else None,
161 | stop.arrival.actual.isoformat()
162 | if stop.arrival and stop.arrival.actual
163 | else None,
164 | stop.arrival.delay() if stop.arrival else None,
165 | stop.departure.expected.isoformat()
166 | if stop.departure and stop.departure.expected
167 | else None,
168 | stop.departure.actual.isoformat()
169 | if stop.departure and stop.departure.actual
170 | else None,
171 | stop.departure.delay() if stop.departure else None,
172 | train.crowding if hasattr(train, "crowding") else None,
173 | )
174 | )
175 |
176 | csvfile.close()
177 |
178 |
def register_args(parser: argparse.ArgumentParser):
    """Register the command line arguments of the train extractor.

    Args:
        parser (argparse.ArgumentParser): the parser to add the arguments to
    """
    add = parser.add_argument

    # Positional: the pickle dump to convert.
    add("pickle_file", metavar="PICKLE_FILE", help=".pickle file to parse")

    # Output format; "csv" is the only available choice.
    add("-f", dest="format", choices=["csv"], default="csv", help="output file format")

    # Output file name.
    add("-o", dest="output_file", metavar="OUTPUT_FILE", help="output file name")
200 |
201 |
def main(args: argparse.Namespace):
    """Load the pickle dump named in the arguments and export it.

    Args:
        args (argparse.Namespace): the parsed command line arguments
            (see register_args)
    """
    source, destination, fmt = parse_input_format_output_args(args)

    trains: dict[int, Train] = load_file(source)

    # "csv" is the only format that triggers an export.
    if fmt != "csv":
        return
    to_csv(trains, destination)
208 |
--------------------------------------------------------------------------------
/src/scraper/station.py:
--------------------------------------------------------------------------------
1 | # railway-opendata: scrape and analyze italian railway data
2 | # Copyright (C) 2023 Marco Aceti
3 | #
4 | # This program is free software; you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation; either version 2 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 |
18 | import logging
19 | import typing as t
20 |
21 | import src.scraper.api as api
22 | import src.scraper.train as tr
23 | from src import types
24 | from src.scraper.exceptions import BadRequestException
25 |
26 |
27 | class Station:
28 | """A ViaggiaTreno station.
29 |
30 | Attributes:
31 | code (str): the station code, used in API calls (e.g. S01700)
32 | region_code (int): the code of the region where the station is located
33 | name (str | None): the station name (e.g. Milano Centrale)
34 | short_name (str | None): a shortened version of the name (e.g. Milano C.le)
35 | position (Tuple[float, float] | None): the latitude and longitude of the station
36 |
37 | Other attributes:
38 | _phantom (bool): if True, the details of the station can't be fetched
39 | """
40 |
41 | _cache: dict[str, "Station"] = dict()
42 |
43 | def __init__(
44 | self,
45 | code: str,
46 | region_code: int,
47 | name: str | None,
48 | short_name: str | None = None,
49 | position: t.Tuple[float, float] | None = None,
50 | ) -> None:
51 | """Initialize a new station.
52 |
53 | Args:
54 | code (str): the station code, used in API calls (e.g. S01700)
55 | region_code (int): the code of the region where the station is located
56 | name (str | None): the station name (e.g. Milano Centrale)
57 | short_name (str | None, optional): a shortened version of the name (e.g. Milano C.le)
58 | position (Tuple[float, float] | None, optional): the latitude and longitude of the station
59 | """
60 | self.code: str = code
61 | self.region_code: int = region_code
62 | self.name: str | None = None
63 | if name:
64 | self.name: str | None = name.title().strip()
65 | self.short_name: str | None = (
66 | short_name.title().strip() if short_name else name
67 | )
68 | self.position: t.Tuple[float, float] | None = position
69 |
70 | self._phantom: bool = self.name == None
71 |
72 | @classmethod
73 | def _from_raw(cls, raw_data: dict) -> "Station":
74 | """Initialize a new station from raw API data, or use the class cache.
75 |
76 | Args:
77 | station_data (dict): raw data returned by the API.
78 | """
79 | station_code = raw_data["codStazione"]
80 |
81 | if station_code not in cls._cache:
82 | cls._cache[station_code] = cls(
83 | code=station_code,
84 | region_code=raw_data["codReg"],
85 | name=raw_data["localita"]["nomeLungo"],
86 | short_name=raw_data["localita"]["nomeBreve"],
87 | position=(raw_data["lat"], raw_data["lon"]),
88 | )
89 | else:
90 | cached: Station = cls._cache[station_code]
91 |
92 | # codReg can have multiple values depending on the request.
93 | # If an inequality is detected, settle the correct region_code once for all.
94 | if raw_data["codReg"] != cached.region_code:
95 | logging.warning(
96 | f"Provided region code for {station_code} is different from the cached one"
97 | )
98 | cached.region_code = Station._region_code(station_code)
99 |
100 | return cls._cache[station_code]
101 |
102 | def __repr__(self) -> str:
103 | return f"{self.name} [{self.code}@{self.region_code}]"
104 |
105 | @classmethod
106 | def by_code(cls, station_code: str) -> "Station":
107 | """Retrieve a station by its code, or use cache.
108 |
109 | Args:
110 | station_code (str): the station code
111 |
112 | Returns:
113 | Station: a station corresponding to the passed station code
114 | """
115 | if station_code not in cls._cache:
116 | try:
117 | region_code: int = cls._region_code(station_code)
118 | except BadRequestException as e:
119 | if e.status_code != 204:
120 | raise e
121 |
122 | region_code: int = 0
123 |
124 | try:
125 | response: str = api.ViaggiaTrenoAPI._raw_request(
126 | "dettaglioStazione", station_code, region_code
127 | )
128 | raw_data: types.JSONType = api.ViaggiaTrenoAPI._decode_json(response)
129 | cls._cache[station_code] = cls._from_raw(raw_data)
130 | except BadRequestException as e:
131 | if e.status_code != 204:
132 | raise e
133 |
134 | cls._cache[station_code] = cls(
135 | code=station_code,
136 | region_code=region_code,
137 | name=None,
138 | )
139 |
140 | return cls._cache[station_code]
141 |
142 | @staticmethod
143 | def _region_code(station_code: str) -> int:
144 | """Retrieve the region code of a given station (by its code).
145 |
146 | Args:
147 | station_code (str): the code of the station to check
148 |
149 | Raises:
150 | BadRequestException: if the response is not ok
151 |
152 | Returns:
153 | int: the region code of the given station
154 | """
155 | region_code = api.ViaggiaTrenoAPI._raw_request("regione", station_code)
156 | return int(region_code)
157 |
158 | @classmethod
159 | def by_region(cls, region_code: int) -> t.List["Station"]:
160 | """Retrieve the list of train stations of a given region.
161 |
162 | Args:
163 | region_code (int): the code of the region to query
164 |
165 | Returns:
166 | t.List[Station]: a list of train stations
167 | """
168 | raw_stations: str = api.ViaggiaTrenoAPI._raw_request(
169 | "elencoStazioni", region_code
170 | )
171 | stations: types.JSONType = api.ViaggiaTrenoAPI._decode_json(raw_stations)
172 | return list(
173 | map(
174 | lambda s: cls._from_raw(s),
175 | filter(lambda s: s["tipoStazione"] != 4, stations),
176 | )
177 | )
178 |
179 | def departures(self) -> t.List["tr.Train"]:
180 | """Retrieve the departures of a train station.
181 |
182 | Args:
183 | station_code (str): the code of the considered station
184 |
185 | Returns:
186 | t.List[Train]: a list of trains departing from the station
187 | """
188 | return api.ViaggiaTrenoAPI._station_departures_or_arrivals(
189 | "partenze", self.code
190 | )
191 |
192 | def arrivals(self) -> t.List["tr.Train"]:
193 | """Retrieve the arrivals of a train station.
194 |
195 | Args:
196 | station_code (str): the code of the considered station
197 |
198 | Returns:
199 | t.List[Train]: a list of trains departing from the station
200 | """
201 | return api.ViaggiaTrenoAPI._station_departures_or_arrivals("arrivi", self.code)
202 |
203 | def __hash__(self) -> int:
204 | return hash(self.name)
205 |
--------------------------------------------------------------------------------
/src/analysis/assets/templates/marker_legend.html:
--------------------------------------------------------------------------------
1 |
18 |
19 | {% macro html(this, kwargs) %}
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
Regional
76 |
77 |
78 |
79 | Trenitalia
80 |
81 |
82 |
83 |
84 |
85 | Trenord
86 |
87 |
88 |
89 |
90 |
91 | TPER
92 |
93 |
94 |
95 |
96 |
High speed
97 |
98 |
99 |
100 | Frecciarossa
101 |
102 |
103 |
104 |
105 |
106 | Frecciargento
107 |
108 |
109 |
110 |
111 |
112 | Frecciabianca
113 |
114 |
115 |
116 |
117 |
Long haul
118 |
119 |
120 |
121 | Intercity
122 |
123 |
124 |
125 |
126 |
127 | IC Notte
128 |
129 |
130 |
131 |
132 |
133 |
International
134 |
135 |
136 |
137 | Eurocity
138 |
139 |
140 |
141 |
142 |
143 | OBB
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 | {% endmacro %}
157 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### VSCodium
2 | .vscode/*
3 | !.vscode/settings.json
4 | !.vscode/tasks.json
5 | !.vscode/launch.json
6 | !.vscode/extensions.json
7 | !.vscode/*.code-snippets
8 |
9 | # Local History for Visual Studio Code
10 | .history/
11 |
12 | # Built Visual Studio Code Extensions
13 | *.vsix
14 |
15 | ### Python
16 | # Byte-compiled / optimized / DLL files
17 | __pycache__/
18 | *.py[cod]
19 | *$py.class
20 |
21 | # C extensions
22 | *.so
23 |
24 | # Distribution / packaging
25 | .Python
26 | build/
27 | develop-eggs/
28 | dist/
29 | downloads/
30 | eggs/
31 | .eggs/
32 | lib/
33 | lib64/
34 | parts/
35 | sdist/
36 | var/
37 | wheels/
38 | share/python-wheels/
39 | *.egg-info/
40 | .installed.cfg
41 | *.egg
42 | MANIFEST
43 |
44 | # PyInstaller
45 | # Usually these files are written by a python script from a template
46 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
47 | *.manifest
48 | *.spec
49 |
50 | # Installer logs
51 | pip-log.txt
52 | pip-delete-this-directory.txt
53 |
54 | # Unit test / coverage reports
55 | htmlcov/
56 | .tox/
57 | .nox/
58 | .coverage
59 | .coverage.*
60 | .cache
61 | nosetests.xml
62 | coverage.xml
63 | *.cover
64 | *.py,cover
65 | .hypothesis/
66 | .pytest_cache/
67 | cover/
68 |
69 | # Translations
70 | *.mo
71 | *.pot
72 |
73 | # Django stuff:
74 | *.log
75 | local_settings.py
76 | db.sqlite3
77 | db.sqlite3-journal
78 |
79 | # Flask stuff:
80 | instance/
81 | .webassets-cache
82 |
83 | # Scrapy stuff:
84 | .scrapy
85 |
86 | # Sphinx documentation
87 | docs/_build/
88 |
89 | # PyBuilder
90 | .pybuilder/
91 | target/
92 |
93 | # Jupyter Notebook
94 | .ipynb_checkpoints
95 |
96 | # IPython
97 | profile_default/
98 | ipython_config.py
99 |
100 | # pyenv
101 | # For a library or package, you might want to ignore these files since the code is
102 | # intended to run in multiple environments; otherwise, check them in:
103 | # .python-version
104 |
105 | # pipenv
106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
109 | # install all needed dependencies.
110 | #Pipfile.lock
111 |
112 | # poetry
113 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
114 | # This is especially recommended for binary packages to ensure reproducibility, and is more
115 | # commonly ignored for libraries.
116 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
117 | #poetry.lock
118 |
119 | # pdm
120 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
121 | #pdm.lock
122 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
123 | # in version control.
124 | # https://pdm.fming.dev/#use-with-ide
125 | .pdm.toml
126 |
127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128 | __pypackages__/
129 |
130 | # Celery stuff
131 | celerybeat-schedule
132 | celerybeat.pid
133 |
134 | # SageMath parsed files
135 | *.sage.py
136 |
137 | # Environments
138 | .env
139 | .venv
140 | env/
141 | venv/
142 | ENV/
143 | env.bak/
144 | venv.bak/
145 |
146 | # Spyder project settings
147 | .spyderproject
148 | .spyproject
149 |
150 | # Rope project settings
151 | .ropeproject
152 |
153 | # mkdocs documentation
154 | /site
155 |
156 | # mypy
157 | .mypy_cache/
158 | .dmypy.json
159 | dmypy.json
160 |
161 | # Pyre type checker
162 | .pyre/
163 |
164 | # pytype static type analyzer
165 | .pytype/
166 |
167 | # Cython debug symbols
168 | cython_debug/
169 |
170 | # PyCharm
171 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
172 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
173 | # and can be added to the global gitignore or merged into this file. For a more nuclear
174 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
175 | #.idea/
176 |
177 | ### Rust
178 | # Generated by Cargo
179 | # will have compiled files and executables
180 | debug/
181 | target/
182 |
183 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
184 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
185 | Cargo.lock
186 |
187 | # These are backup files generated by rustfmt
188 | **/*.rs.bk
189 |
190 | # MSVC Windows builds of rustc generate these, which store debugging information
191 | *.pdb
192 |
193 | ### TeX
194 | ## Core latex/pdflatex auxiliary files:
195 | *.aux
196 | *.lof
197 | *.log
198 | *.lot
199 | *.fls
200 | *.out
201 | *.toc
202 | *.fmt
203 | *.fot
204 | *.cb
205 | *.cb2
206 | .*.lb
207 |
208 | ## Intermediate documents:
209 | *.dvi
210 | *.xdv
211 | *-converted-to.*
212 | # these rules might exclude image files for figures etc.
213 | # *.ps
214 | # *.eps
215 | # *.pdf
216 |
217 | ## Generated if empty string is given at "Please type another file name for output:"
218 | .pdf
219 |
220 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
221 | *.bbl
222 | *.bcf
223 | *.blg
224 | *-blx.aux
225 | *-blx.bib
226 | *.run.xml
227 |
228 | ## Build tool auxiliary files:
229 | *.fdb_latexmk
230 | *.synctex
231 | *.synctex(busy)
232 | *.synctex.gz
233 | *.synctex.gz(busy)
234 | *.pdfsync
235 |
236 | ## Build tool directories for auxiliary files
237 | # latexrun
238 | latex.out/
239 |
240 | ## Auxiliary and intermediate files from other packages:
241 | # algorithms
242 | *.alg
243 | *.loa
244 |
245 | # achemso
246 | acs-*.bib
247 |
248 | # amsthm
249 | *.thm
250 |
251 | # beamer
252 | *.nav
253 | *.pre
254 | *.snm
255 | *.vrb
256 |
257 | # changes
258 | *.soc
259 |
260 | # comment
261 | *.cut
262 |
263 | # cprotect
264 | *.cpt
265 |
266 | # elsarticle (documentclass of Elsevier journals)
267 | *.spl
268 |
269 | # endnotes
270 | *.ent
271 |
272 | # fixme
273 | *.lox
274 |
275 | # feynmf/feynmp
276 | *.mf
277 | *.mp
278 | *.t[1-9]
279 | *.t[1-9][0-9]
280 | *.tfm
281 |
282 | #(r)(e)ledmac/(r)(e)ledpar
283 | *.end
284 | *.?end
285 | *.[1-9]
286 | *.[1-9][0-9]
287 | *.[1-9][0-9][0-9]
288 | *.[1-9]R
289 | *.[1-9][0-9]R
290 | *.[1-9][0-9][0-9]R
291 | *.eledsec[1-9]
292 | *.eledsec[1-9]R
293 | *.eledsec[1-9][0-9]
294 | *.eledsec[1-9][0-9]R
295 | *.eledsec[1-9][0-9][0-9]
296 | *.eledsec[1-9][0-9][0-9]R
297 |
298 | # glossaries
299 | *.acn
300 | *.acr
301 | *.glg
302 | *.glo
303 | *.gls
304 | *.glsdefs
305 | *.lzo
306 | *.lzs
307 | *.slg
308 | *.slo
309 | *.sls
310 |
311 | # uncomment this for glossaries-extra (will ignore makeindex's style files!)
312 | # *.ist
313 |
314 | # gnuplot
315 | *.gnuplot
316 | *.table
317 |
318 | # gnuplottex
319 | *-gnuplottex-*
320 |
321 | # gregoriotex
322 | *.gaux
323 | *.glog
324 | *.gtex
325 |
326 | # htlatex
327 | *.4ct
328 | *.4tc
329 | *.idv
330 | *.lg
331 | *.trc
332 | *.xref
333 |
334 | # hyperref
335 | *.brf
336 |
337 | # knitr
338 | *-concordance.tex
339 | # TODO Uncomment the next line if you use knitr and want to ignore its generated tikz files
340 | # *.tikz
341 | *-tikzDictionary
342 |
343 | # listings
344 | *.lol
345 |
346 | # luatexja-ruby
347 | *.ltjruby
348 |
349 | # makeidx
350 | *.idx
351 | *.ilg
352 | *.ind
353 |
354 | # minitoc
355 | *.maf
356 | *.mlf
357 | *.mlt
358 | *.mtc[0-9]*
359 | *.slf[0-9]*
360 | *.slt[0-9]*
361 | *.stc[0-9]*
362 |
363 | # minted
364 | _minted*
365 | *.pyg
366 |
367 | # morewrites
368 | *.mw
369 |
370 | # newpax
371 | *.newpax
372 |
373 | # nomencl
374 | *.nlg
375 | *.nlo
376 | *.nls
377 |
378 | # pax
379 | *.pax
380 |
381 | # pdfpcnotes
382 | *.pdfpc
383 |
384 | # sagetex
385 | *.sagetex.sage
386 | *.sagetex.py
387 | *.sagetex.scmd
388 |
389 | # scrwfile
390 | *.wrt
391 |
392 | # svg
393 | svg-inkscape/
394 |
395 | # sympy
396 | *.sout
397 | *.sympy
398 | sympy-plots-for-*.tex/
399 |
400 | # pdfcomment
401 | *.upa
402 | *.upb
403 |
404 | # pythontex
405 | *.pytxcode
406 | pythontex-files-*/
407 |
408 | # tcolorbox
409 | *.listing
410 |
411 | # thmtools
412 | *.loe
413 |
414 | # TikZ & PGF
415 | *.dpth
416 | *.md5
417 | *.auxlock
418 |
419 | # titletoc
420 | *.ptc
421 |
422 | # todonotes
423 | *.tdo
424 |
425 | # vhistory
426 | *.hst
427 | *.ver
428 |
429 | # easy-todo
430 | *.lod
431 |
432 | # xcolor
433 | *.xcp
434 |
435 | # xmpincl
436 | *.xmpi
437 |
438 | # xindy
439 | *.xdy
440 |
441 | # xypic precompiled matrices and outlines
442 | *.xyc
443 | *.xyd
444 |
445 | # endfloat
446 | *.ttt
447 | *.fff
448 |
449 | # Latexian
450 | TSWLatexianTemp*
451 |
452 | ## Editors:
453 | # WinEdt
454 | *.bak
455 | *.sav
456 |
457 | # Texpad
458 | .texpadtmp
459 |
460 | # LyX
461 | *.lyx~
462 |
463 | # Kile
464 | *.backup
465 |
466 | # gummi
467 | .*.swp
468 |
469 | # KBibTeX
470 | *~[0-9]*
471 |
472 | # TeXnicCenter
473 | *.tps
474 |
475 | # auto folder when using emacs and auctex
476 | ./auto/*
477 | *.el
478 |
479 | # expex forward references with \gathertags
480 | *-tags.tex
481 |
482 | # standalone packages
483 | *.sta
484 |
485 | # Makeindex log files
486 | *.lpz
487 |
488 | # xwatermark package
489 | *.xwm
490 |
491 | # REVTeX puts footnotes in the bibliography by default, unless the nofootinbib
492 | # option is specified. Footnotes are then stored in a file with suffix Notes.bib.
493 | # Uncomment the next line to have this generated file ignored.
494 | #*Notes.bib
495 |
496 | # Emacs .gitignore
497 | *~
498 | \#*\#
499 | /.emacs.desktop
500 | /.emacs.desktop.lock
501 | *.elc
502 | auto-save-list
503 | tramp
504 | .\#*
505 |
506 | # Org-mode
507 | .org-id-locations
508 | *_archive
509 |
510 | # flymake-mode
511 | *_flymake.*
512 |
513 | # eshell files
514 | /eshell/history
515 | /eshell/lastdir
516 |
517 | # elpa packages
518 | /elpa/
519 |
520 | # reftex files
521 | *.rel
522 |
523 | # AUCTeX auto folder
524 | /auto/
525 |
526 | # cask packages
527 | .cask/
528 | dist/
529 |
530 | # Flycheck
531 | flycheck_*.el
532 |
533 | # server auth directory
534 | /server/
535 |
536 | # projectiles files
537 | .projectile
538 |
539 | # directory configuration
540 | .dir-locals.el
541 |
542 | # network security
543 | /network-security.data
544 |
545 | ### Custom
546 | data/*
547 | !.gitkeep
548 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RailScrape (railway-opendata)
2 |
3 | In Italy there is no official **Open Data** available about the _performance_ (delays, cancellations, ...) of the **Italian public rail transport**.
4 | This project offers a tool which allows anyone to gather it and run some stats and visualizations.
5 |
6 | ## Architecture
7 |
8 | ```mermaid
9 | flowchart TB
10 |
11 | S[Scraper] --> |Downloads data| D("ViaggiaTreno and Trenord APIs")
12 | S -->|Produces| P[(Daily .pickle dumps)]
13 | E[Extractor] -->|Reads| P
14 | E[Extractor] -->|Produces| C[(Daily .CSV dumps)]
15 | A2["(BYOD Analyzer)"] -.->|Reads| C
16 | A[Analyzer] -->|Reads| C
17 | A[Analyzer] -->|Produces| K(Stats, visualizations, etc...)
18 | ```
19 |
20 | The application is composed of multiple modules, accessible via CLI:
21 | - **`scraper`**: unattended script to incrementally download and preserve the current status of the italian railway network. If run constantly (e.g. ~every hour using `cron`) all trains will be captured and saved in `data/%Y-%m-%d/trains.pickle`.
22 | - **`train-extractor`** and **`station-extractor`**: convert raw scraped data to usable `.csv` files.
23 | - **`analyze`** : shows reproducible stats and visualizations.
24 |
25 | ## Running
26 |
27 | The project is written in Python and it uses modern typing annotations, so **Python >= 3.11** is needed.
28 |
29 | ### Using Docker (easy)
30 |
31 | A [Dockerfile](./Dockerfile) is available to avoid installing the dependencies manually.
32 | You can use the automatically updated [ghcr.io/marcobuster/railway-opendata:latest](https://github.com/MarcoBuster/railway-opendata/pkgs/container/railway-opendata)
33 | Docker image if you want the latest version available on the master branch.
34 |
35 | For instance, the following command will start the scraper on your machine.
36 |
37 | ```bash
38 | $ docker run -v ./data:/app/data ghcr.io/marcobuster/railway-opendata:latest scraper
39 | ```
40 |
41 | ### Using virtual envs
42 |
43 | > ⚠️ __WARNING__: this project currently uses the builtin `hash(...)` function to quickly index objects.
44 | > To ensure reproducibility between runs, you need to disable Python's **hash seed randomization** by setting the `PYTHONHASHSEED=0` environment variable.
45 | > If you fail to do so, the software will refuse to start.
46 |
47 | ```bash
48 | $ export PYTHONHASHSEED=0
49 | $ virtualenv venv
50 | $ source ./venv/bin/activate
51 | $ pip install -r requirements.txt
52 | $ python main.py ...
53 | ```
54 |
55 | ## Example usages
56 |
57 | - __Start the scraper__. For continuous data collection, it should be run every ~hour.
58 |
59 | `$ python main.py scraper`
60 |
61 | - __Extract train data__ from a pickle file and save it in CSV.
62 |
63 | `$ python main.py train-extractor -o data/2023-04-29/trains.csv data/2023-04-29/trains.pickle`
64 |
65 | - __Extract station data__ from a pickle file and save it in GeoJSON.
66 |
67 | `$ python main.py station-extractor -f geojson data/stations.pickle`
68 |
69 | - __Describe a dataset__ and filter observations by date.
70 |
71 | `$ python main.py analyze --start-date 2023-05-01 --end-date today data/stations.csv data/2023-05-*/trains.csv --stat describe`
72 |
73 | - __Show delay stats__ of the last stop.
74 |
75 | `$ python main.py analyze --group-by train_hash --agg-func last [..]/stations.csv [..]/trains.csv --stat delay_box_plot`
76 |
77 | - __Show daily train count__ grouped by railway companies.
78 |
79 | `$ python main.py analyze --group-by client_code [..]/stations.csv [..]/trains.csv --stat day_train_count`
80 |
81 | - __Display an interactive map__ and open it in the web browser.
82 |
83 | `$ python main.py analyze [..]/stations.csv [..]/trains.csv --stat trajectories_map`
84 |
85 | - __Display a timetable graph__.
86 |
87 | `$ python main.py analyze [..]/stations.csv [..]/trains.csv --stat timetable --timetable-collapse`
88 |
89 | ## Fields
90 |
91 | ### Stations CSV
92 |
93 | | Column | Data type | Description | Notes |
94 | |--------|-----------|-------------|-------|
95 | | `code` | String | Station code | This field is not actually unique. One station can have multiple codes |
96 | | `region` | Integer | Region code | If zero, unknown. Used in API calls |
97 | | `long_name` | String | Station long name | |
98 | | `short_name` | String | Station short name | Can be empty |
99 | | `latitude` | Float | Station latitude | Can be empty |
100 | | `longitude` | Float | Station longitude | Can be empty |
101 |
102 | ### Trains CSV
103 | In the extracted trains CSV, each line is a _train stop_ (not station nor train).
104 | Many fields are actually duplicated.
105 |
106 | | Column | Data type | Description | Notes |
107 | |--------|-----------|-------------|-------|
108 | | `train_hash` | MD5 hash | Unique identifier for a particular train | |
109 | | `number` | Integer | Train number | Can't be used to uniquely identify a train[^train_number_unique] |
110 | | `day` | Date | Train departing date | |
111 | | `origin` | Station (code) | Train absolute origin | |
112 | | `category` | String | Train Category | See table[^categories] |
113 | | `destination` | Station (code) | Train final destination | |
114 | | `client_code` | Integer | Railway company | See table[^client_codes] |
115 | | `phantom` | Boolean | True if train was only partially fetched | Trains with this flag should be safely ignored |
116 | | `trenord_phantom` | Boolean | True if the train was only partially fetched using Trenord APIs | Trains with this flag should be safely ignored[^trenord_phantom] |
117 | | `cancelled` | Boolean | True if the train is marked as cancelled | Not all cancelled trains are marked as cancelled: for more accuracy, you should always check `stop_type` |
118 | | `stop_number` | Integer | Stop progressive number (starting at 0) | |
119 | | `stop_station_code` | Station (code) | Stop station code | |
120 | | `stop_type` | Char | Stop type | `P` if first, `F` if intermediate, `A` if last, `C` if cancelled |
121 | | `platform` | String | Stop platform | Can be empty |
122 | | `arrival_expected` | ISO 8601 | Stop expected arrival time | Can be empty |
123 | | `arrival_actual` | ISO 8601 | Stop actual arriving time | Can be empty |
124 | | `arrival_delay` | Integer | Stop arriving delay in minutes | Is empty if either `arrival_expected` or `arrival_actual` is empty |
125 | | `departure_expected` | ISO 8601 | Stop expected departing time | Can be empty |
126 | | `departure_actual` | ISO 8601 | Stop actual departing time | Can be empty |
127 | | `departure_delay` | Integer | Stop departing delay in minutes | Is empty if either `departure_expected` or `departure_actual` is empty |
128 | | `crowding` | Integer | Train crowding in percentage | Reported by Trenord |
129 |
130 | [^train_number_unique]: In Italy, two different trains can share the same number. A train is only uniquely identified by the triple (number, origin, day).
131 |
132 | [^categories]: Known categories are listed below.
133 |
134 | | Category | Description |
135 | |----------|-------------|
136 | | REG | Regional trains |
137 | | MET | Metropolitan trains |
138 | | FR | Frecciarossa (red arrow) |
139 | | IC | Intercity |
140 | | ICN | Intercity Night |
141 | | EC | Eurocity |
142 | | FB | Frecciabianca (white arrow) |
143 | | FA | Frecciargento (silver arrow) |
144 | | EN | EuroNight |
145 | | EC ER | Eurocity |
146 |
147 | [^client_codes]: Known client codes are listed below.
148 |
149 | | Client code | Railway company |
150 | |-------------|-----------------|
151 | | 1 | TRENITALIA_AV |
152 | | 2 | TRENITALIA_REG |
153 | | 4 | TRENITALIA_IC |
154 | | 18 | TPER |
155 | | 63 | TRENORD |
156 | | 64 | OBB |
157 |
158 | [^trenord_phantom]: This flag is activated when a train is seen on ViaggiaTreno APIs and marked as Trenord's but it can't be fetched on Trenord's APIs.
159 |
160 | ## Contributing
161 |
162 | See [CONTRIBUTING.md](CONTRIBUTING.md).
163 |
164 | ## Notes and caveats
165 |
166 | ### Data completeness and correctness
167 |
168 | The [ViaggiaTreno](https://viaggiatreno.it) APIs are [known](https://medium.com/@albigiu/trenitalia-shock-non-crederete-mai-a-queste-api-painful-14433096502c) to be **buggy** and **unreliable**.
169 | As stated before, many fields (like `departure_expected` and `arrival_expected`) are not always guaranteed to be present and some concepts are counter-intuitive (a train number is not a unique identifier nor are station codes).
170 |
171 | ViaggiaTreno is the main _source of truth_ for many final user applications (like [Trenìt!](https://play.google.com/store/apps/details?id=eu.baroncelli.oraritrenitalia) or [Orario Treni](https://play.google.com/store/apps/details?id=org.paoloconte.treni_lite)) and is itself linked on the Trenitalia official website.
172 | For instance, if the API does not return information for a train stop, no other application will display it: the data simply does not exist online.
173 | The scraper always tries to save as much data as possible (___"best effort"___) even when it is probably incomplete; in those cases, proper flags (like `phantom` and `trenord_phantom`) are activated so the developer can choose for themselves.
174 |
175 | ### Licensing
176 |
177 | Copyright (c) 2023 Marco Aceti. Some rights reserved (see [LICENSE](./LICENSE)).
178 |
179 | Terms and conditions of the ViaggiaTreno web portal state that copying is prohibited (except for personal use) as **all rights for the content are reserved** to the original owner (Trenitalia or Gruppo FS).
180 | In July 2019 Trenitalia sued Trenìt for using train data in its app, but [partially lost](https://www.wired.it/lifestyle/mobilita/2019/09/06/trenitalia-tornata-online-trenit/).
181 | I think data about the performance of __public__ transport should be __open__ as well, but I'm not a lawyer and I'm not willing to risk lawsuits redistributing data; if someone wants to, the tool is now available.
182 |
183 | BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
184 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
185 | THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.
186 | SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
187 |
188 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
189 |
--------------------------------------------------------------------------------
/src/scraper/train_stop.py:
--------------------------------------------------------------------------------
1 | # railway-opendata: scrape and analyze italian railway data
2 | # Copyright (C) 2023 Marco Aceti
3 | #
4 | # This program is free software; you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation; either version 2 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 |
18 | import typing as t
19 | from datetime import date, datetime
20 | from enum import Enum
21 |
22 | import src.scraper.api as api
23 | import src.scraper.station as st
24 | from src.scraper.exceptions import IncompleteTrenordStopDataException
25 |
26 |
class TrainStopType(Enum):
    """A train stop type.

    Each member's value is a one-letter code. ``P``, ``F`` and ``A`` are the
    same codes ViaggiaTreno reports in ``tipoFermata``; any other code is
    mapped to ``CANCELLED`` by ``TrainStop._from_raw_data``.
    """

    # First stop of the train: it has a departure but no arrival.
    FIRST = "P"
    # Intermediate stop: it has both an arrival and a departure.
    STOP = "F"
    # Last stop of the train: it has an arrival but no departure.
    LAST = "A"
    # Cancelled stop: TrainStop keeps neither arrival nor departure for it.
    CANCELLED = "C"
34 |
35 |
36 | class TrainStopTime:
37 | """Helper class to handle arrival and departures times.
38 |
39 | Attributes:
40 | expected (datetime): expected departing or arrival time
41 | actual (datetime | None): actual departing or arrival time
42 | """
43 |
44 | def __init__(self, expected: datetime, actual: datetime | None) -> None:
45 | """Initialize a new TrainStopTime object.
46 |
47 | Args:
48 | expected (datetime): expected departing or arrival time
49 | actual (datetime | None): actual departing or arrival time
50 | """
51 | assert expected is not None
52 |
53 | self.expected: datetime = expected
54 | self.actual: datetime | None = actual
55 |
56 | def passed(self) -> bool:
57 | """Return if the train actually arrived or departed from the station.
58 |
59 | Returns:
60 | bool: True if the actual time is not None
61 | """
62 | return self.actual is not None
63 |
64 | def delay(self) -> float | None:
65 | """Return the delay in minutes.
66 |
67 | Returns:
68 | int | None: delay in minutes, None if not .passed().
69 | """
70 | if not self.passed():
71 | return None
72 |
73 | assert isinstance(self.actual, datetime)
74 | assert isinstance(self.expected, datetime)
75 |
76 | if self.actual >= self.expected:
77 | return (self.actual - self.expected).seconds / 60
78 | else:
79 | return -(self.expected - self.actual).seconds / 60
80 |
81 | def __repr__(self) -> str:
82 | hm = lambda d: d.strftime("%H:%M")
83 |
84 | ret: str = hm(self.expected)
85 | if not self.passed():
86 | return ret
87 |
88 | ret += f" ~ {hm(self.actual)}"
89 | if self.delay() == 0:
90 | return ret
91 |
92 | delay: float | None = self.delay()
93 | assert isinstance(delay, float)
94 |
95 | sign: str = "+" if delay > 0 else "-"
96 | ret += f" {sign}{round(abs(delay), 1)}m"
97 |
98 | return ret
99 |
100 |
class TrainStop:
    """A ViaggiaTreno train stop.

    Attributes:
        station (st.Station): the station the train is stopping by
        stop_type (TrainStopType): the type of stop (first, last, stop, cancelled)
        platform_expected (str | None): expected platform
        platform_actual (str | None): actual platform
        arrival (TrainStopTime | None): arrival time; None if it's the first
            stop or the stop is cancelled
        departure (TrainStopTime | None): departure time; None if it's the
            last stop or the stop is cancelled
    """

    def __init__(
        self,
        station: st.Station,
        stop_type: TrainStopType,
        platform_expected: str | None,
        platform_actual: str | None,
        arrival_expected: datetime | None,
        arrival_actual: datetime | None,
        departure_expected: datetime | None,
        departure_actual: datetime | None,
    ) -> None:
        """Initialize a new TrainStop object.

        Args:
            station (st.Station): the station the train is stopping by
            stop_type (TrainStopType): the type of stop (first, last, stop)
            platform_expected (str | None): expected platform
            platform_actual (str | None): actual platform
            arrival_expected (datetime | None): expected arrival time
            arrival_actual (datetime | None): actual arrival time
            departure_expected (datetime | None): expected departure time
            departure_actual (datetime | None): actual departure time

        Raises:
            AssertionError: if a non-cancelled stop lacks the expected arrival
                (unless it is the first stop) or the expected departure
                (unless it is the last stop).
        """
        self.station: st.Station = station
        self.stop_type: TrainStopType = stop_type

        self.platform_expected: str | None = platform_expected
        self.platform_actual: str | None = platform_actual

        self.arrival: TrainStopTime | None = None
        self.departure: TrainStopTime | None = None

        # Cancelled stops keep no timing information at all.
        if self.stop_type == TrainStopType.CANCELLED:
            return

        # NOTE(review): these checks stay as `assert` because callers outside
        # this file may rely on AssertionError — confirm before changing.
        if self.stop_type != TrainStopType.FIRST:
            assert isinstance(arrival_expected, datetime)
            self.arrival = TrainStopTime(arrival_expected, arrival_actual)

        if self.stop_type != TrainStopType.LAST:
            assert isinstance(departure_expected, datetime)
            self.departure = TrainStopTime(departure_expected, departure_actual)

    @classmethod
    def _from_raw_data(cls, stop_data: dict) -> "TrainStop":
        """Initialize a new train stop from the data processed by Train.fetch()

        Args:
            stop_data (dict): the data to initialize the class with

        Returns:
            TrainStop: a constructed TrainStop object
        """
        station = st.Station.by_code(stop_data["id"])
        if station._phantom:
            # The station is flagged as phantom: take its name from this stop.
            station.name = stop_data["stazione"].title().strip()

        stop_type: TrainStopType
        if stop_data["tipoFermata"] == "P":
            stop_type = TrainStopType.FIRST
        elif stop_data["tipoFermata"] == "A":
            stop_type = TrainStopType.LAST
        elif stop_data["tipoFermata"] == "F":
            stop_type = TrainStopType.STOP
        else:
            # Any other code is treated as a cancelled stop.
            stop_type = TrainStopType.CANCELLED

        # presumably converts a raw API timestamp (or None) to a datetime;
        # verify against ViaggiaTrenoAPI._to_datetime
        _to_dt = api.ViaggiaTrenoAPI._to_datetime

        return cls(
            station=station,
            stop_type=stop_type,
            # Prefer the arrival platform, fall back to the departure one.
            platform_expected=(
                stop_data["binarioProgrammatoArrivoDescrizione"]
                or stop_data["binarioProgrammatoPartenzaDescrizione"]
            ),
            platform_actual=(
                stop_data["binarioEffettivoArrivoDescrizione"]
                or stop_data["binarioEffettivoPartenzaDescrizione"]
            ),
            arrival_expected=_to_dt(stop_data["arrivo_teorico"]),
            arrival_actual=_to_dt(stop_data["arrivoReale"]),
            departure_expected=_to_dt(stop_data["partenza_teorica"]),
            departure_actual=_to_dt(stop_data["partenzaReale"]),
        )

    @classmethod
    def _from_trenord_raw_data(
        cls, stop_data: dict, day: date
    ) -> t.Union["TrainStop", None]:
        """Initialize a new train stop from data processed by Train.trenord_fetch()

        Args:
            stop_data (dict): the data to initialize the class with
            day (date): the date of the train, used to parse datetimes

        Returns:
            TrainStop | None: a constructed TrainStop object,
                or None if there isn't actual data

        Raises:
            IncompleteTrenordStopDataException: if no non-empty station code
                string can be found in the stop data
        """

        def _hhmmss_to_dt(hhmmss: str | None) -> datetime | None:
            """Parse and return a Trenord time string into a datetime object.

            Args:
                hhmmss (str | None): the string to parse

            Returns:
                datetime | None: the parsed datetime object on `day`,
                    or None if the string is empty or None.
            """
            if not hhmmss:
                return None

            return datetime.strptime(hhmmss, "%H:%M:%S").replace(
                year=day.year,
                month=day.month,
                day=day.day,
                tzinfo=api.TIMEZONE,
            )

        if not stop_data["actual_data"]:
            return None

        station_code: str | None = (
            stop_data["station"].get("station_id")
            or stop_data["actual_data"]["actual_station_mir"]
        )
        # Explicit check instead of `assert`: asserts are stripped when Python
        # runs with -O, which would let an invalid code reach Station.by_code.
        if not (isinstance(station_code, str) and station_code):
            raise IncompleteTrenordStopDataException

        station = st.Station.by_code(station_code)
        if station._phantom and stop_data.get("station", {}).get("station_ori_name"):
            station.name = stop_data["station"]["station_ori_name"].title().strip()

        stop_type: TrainStopType
        stop_type_raw = (
            stop_data["actual_data"].get("actual_type", None) or stop_data["type"]
        )
        if stop_type_raw == "O":
            stop_type = TrainStopType.FIRST
        elif stop_type_raw == "F":
            stop_type = TrainStopType.STOP
        elif stop_type_raw == "D":
            stop_type = TrainStopType.LAST
        else:
            stop_type = TrainStopType.CANCELLED

        # The explicit cancellation flag overrides the reported stop type.
        if stop_data["cancelled"]:
            stop_type = TrainStopType.CANCELLED

        return cls(
            station=station,
            stop_type=stop_type,
            platform_expected=stop_data.get("platform", None),
            platform_actual=None,
            arrival_expected=_hhmmss_to_dt(stop_data.get("arr_time")),
            arrival_actual=_hhmmss_to_dt(
                stop_data["actual_data"].get("arr_actual_time")
            ),
            departure_expected=_hhmmss_to_dt(stop_data.get("dep_time")),
            departure_actual=_hhmmss_to_dt(
                stop_data["actual_data"].get("dep_actual_time")
            ),
        )

    def __repr__(self) -> str:
        """Return a one-line summary: stop type, station name, times and platform(s)."""
        ret = f"@ ({self.stop_type.value}) {self.station.name} "
        if self.stop_type == TrainStopType.FIRST:
            ret += f"{self.departure}"
        elif self.stop_type == TrainStopType.LAST:
            ret += f"{self.arrival}"
        else:
            ret += f"{self.arrival} --> {self.departure}"

        platform_exp: str = self.platform_expected if self.platform_expected else "?"

        if self.platform_actual:
            return ret + f" [{platform_exp} ~ {self.platform_actual}]"
        else:
            return ret + f" [{platform_exp}]"
295 |
--------------------------------------------------------------------------------
/docs/Proposta tirocinio.tex:
--------------------------------------------------------------------------------
1 | \documentclass[italian,11pt,a4paper,final]{article}
2 | \usepackage[a4paper,
3 | bindingoffset=0.2in,
4 | left=1in,
5 | right=1in,
6 | top=1in,
7 | bottom=1in,
8 | footskip=.25in]{geometry}
9 | \usepackage[utf8]{inputenc}
10 | \usepackage[T1]{fontenc}
11 | \usepackage{hyperref}
12 | \usepackage{babel}
13 | \date{2 marzo 2023}
14 |
15 | \newcommand{\hochkomma}{$^{,\,}$}
16 |
17 | \author{Marco Aceti}
18 | \title{
19 | Open Data e trasporto ferroviario \\
20 | \textit{\small{Proposta di tirocinio interno}}
21 | }
22 |
23 | \begin{document}
24 | \maketitle
25 |
26 | \begin{abstract}
27 | In Italia non esistono Open Data sulle performance del trasporto pubblico ferroviario: le metriche definite nei contratti di servizio tra gli enti locali committenti e le imprese ferroviarie sono insufficienti e spesso inaccessibili.
28 | La proposta di tirocinio si articola sull'idea di preservare i dati istantanei della circolazione ferroviaria dalla piattaforma ViaggiaTreno per produrre Open Data storici, \textit{machine-readable} e di qualità.
29 | Infine, si propone un'analisi dei dati raccolti a fini statistici e di verifica.
30 | \end{abstract}
31 |
32 | \section{Stato dell'arte}
33 | In Italia, il servizio di trasporto pubblico è operato da aziende\footnote{\url{https://it.wikipedia.org/wiki/Aziende_di_trasporto_pubblico_italiane}} private o partecipate.
34 | Sul territorio nazionale sono autorizzate\footnote{\url{https://www.mit.gov.it/documentazione/elenco-imprese-ferroviarie-titolari-di-licenza-1}} una ventina di \textit{Imprese Ferroviarie} (IF) adibite al trasporto passeggeri aventi in essere numerosi \textit{Contratti di Servizio} (CdS) con gli enti locali (tipicamente le Regioni).
35 | La qualità del servizio è misurata da \textbf{metriche di performance} stabilite nei CdS e comunicate agli enti dalle IF.
36 |
37 | \subsection{Esempio: il servizio ferroviario lombardo}
38 | In Lombardia, Trenord S.r.l.\ definisce\footnote{\url{https://www.regione.lombardia.it/wps/wcm/connect/7144d5b9-7e3c-4e44-82ad-30a1652e2642/Contratto+Trenord+con+firme.pdf} -- Allegato 11} un \textit{indice di puntualità entro i 5 minuti} che considera il \textit{``numero di corse circolanti giunte puntuali o con ritardo fino a 5 minuti''}, ma esclude i \textit{``ritardi maturati per cause esterne''} o \textit{``per lavori''}.
39 | La Regione pubblica mensilmente un rapporto sulla puntualità dei treni\footnote{\url{https://www.regione.lombardia.it/wps/wcm/connect/4eae62eb-dfcf-4446-82ea-72dbfdfb2c4a/Puntualit\%C3\%A0.pdf}} in formato PDF, ma con diverse criticità:
40 | \begin{itemize}
41 | \item vengono considerati solo i ritardi in arrivo alla destinazione finale, escludendo quindi le stazioni intermedie;
42 | \item i dati forniti non sono granulari ma \textit{brutalmente} aggregati per mese;
43 | \item sono escluse le \textit{cause esterne} e le \textit{circostanze occasionali}: gli indici di puntualità effettivi non sono pubblicati;
44 | \item i rapporti non rispettano neanche una \textit{stella} dei livelli definiti da Tim Berners-Lee per valutare gli Open Data: non è nemmeno presente una licenza d'uso.
45 | \end{itemize}
46 |
47 | C'è da considerare inoltre che Trenord (società tra l'altro partecipata al 50\% da Regione Lombardia stessa) comunica al committente gli indici già calcolati, senza che quest'ultimo abbia modo di verificarli.
48 |
49 | Infine, non tutti gli enti committenti pubblicano rapporti sulla qualità del servizio: per esempio, la Regione Campania prevede nel suo CdS\footnote{
50 | \url{https://www.regione.campania.it/assets/documents/contratto-di-servizio-tpl-ferro.pdf} \\
51 | sez.\ \textit{``Penali e forme di mitigazione delle stesse''} -- Allegato 7
52 | } con Trenitalia S.p.A.\ la fornitura di indici simili per il calcolo di penali e mitigazioni,
53 | ma non è reperibile nessun documento che li attesti. \\
54 |
55 | \subsection{Open Data storici}
56 | In conclusione, non esistono attualmente Open Data {storici}, completi, strutturati e \textit{machine-readable} sul servizio di trasporto ferroviario in Italia.
57 | Gli indici di puntualità (e affidabilità) definiti nei CdS possono essere utili agli enti committenti per calcolare penali o comparare offerte di mercato, ma i Cittadini Digitali meritano una \textbf{maggiore trasparenza} per poter verificare autonomamente lo stato reale del \textit{Sistema Ferrovia}.
58 |
59 | \section{Rilevazioni istantanee}
60 | Nella sezione precedente si è discusso di \textbf{dati storici}; la situazione è molto più rosea per i \textbf{dati in tempo reale}.
61 | Esistono innumerevoli siti web e applicazioni, ufficiali e non, che mostrano lo stato attuale di un treno in viaggio.
62 | L'app \textit{Orario Treni}\footnote{\url{https://www.orariotreniapp.it/}} di Paolo Conte, per esempio, presenta con un'interfaccia molto semplice e intuitiva la possibilità di cercare treni per itinerario e numero, visualizzare arrivi e partenze di una stazione e consultare l'\textit{andamento istantaneo} di un treno.
63 | Quest'ultimo è composto da informazioni come gli orari programmati ed \textit{effettivi} di partenza e arrivo ad ogni fermata intermedia, ritardo cumulato fino a quel momento e luogo di ultimo rilevamento (non necessariamente corrispondente ad una fermata). \\
64 |
65 | L'idea fondante della proposta in oggetto è sfruttare la ghiotta quantità di dati offerta dalle rilevazioni istantanee nel corso del tempo per produrre Open Data storici.
66 |
67 | \subsection{ViaggiaTreno}
68 | Il Gruppo Ferrovie dello Stato Italiane (\textit{holding} di diverse società\footnote{\url{https://it.wikipedia.org/wiki/Ferrovie_dello_Stato_Italiane}} come Trenitalia, RFI, ANAS, ...) permette ai viaggiatori di trovare soluzioni di viaggio e visualizzare l'andamento di una corsa tramite la piattaforma web ViaggiaTreno\footnote{\url{http://www.viaggiatreno.it/infomobilita/index.jsp}}, similmente all'app \textit{Orario Treni}.
69 | Si può infatti speculare che quest'ultima utilizzi proprio ViaggiaTreno come fonte dei dati.
70 |
71 | \subsubsection{API}
72 | Il \textit{motore} dell'interfaccia web di ViaggiaTreno è un insieme di API ``REST'' non ufficialmente documentate e di scarsa qualità\footnote{\url{https://medium.com/@albigiu/trenitalia-shock-non-crederete-mai-a-queste-api-painful-14433096502c}}.
73 | In rete sono presenti diversi tentativi di documentazione, mantenuti dalla community open source\footnote{\url{https://github.com/sabas/trenitalia}}\hochkomma\footnote{\url{https://github.com/roughconsensusandrunningcode/TrainMonitor/wiki/API-del-sistema-Viaggiatreno}}\hochkomma\footnote{\url{https://github.com/Razorphyn/Informazioni-Treni-Italiani}}.
74 |
75 | \subsubsection{Copyright e licenza d'uso}
76 | Le \textit{note legali} riportate sul portale ViaggiaTreno sono abbastanza aggressive.
77 | \begin{quote}
78 | \textit{I contenuti, la grafica e le immagini sono soggetti a Copyright. \textbf{Ogni diritto sui contenuti} (a titolo esemplificativo e non esaustivo: l’architettura del servizio, i testi, le immagini grafiche e fotografiche, ecc.) \textbf{è riservato ai sensi della normativa vigente}. I contenuti di ViaggiaTreno non possono, neppure in parte, essere copiati, riprodotti, trasferiti, caricati, pubblicati o distribuiti in qualsiasi modo senza il preventivo consenso scritto della società Trenitalia S.p.A.. È possibile scaricare i contenuti nel proprio computer e/o stampare estratti \textbf{unicamente per utilizzo personale} di carattere informativo. \textbf{Qualsiasi forma di link al sito www.ViaggiaTreno.it deve essere preventivamente autorizzata}\footnote{L'autore di questo documento si dichiara reo del \textit{reato di linking non autorizzato}} e non deve recare danno all'immagine e alle attività di Trenitalia S.p.A.. è vietato il c.d.\ deep linking ossia l'utilizzo, su siti di soggetti terzi, di parti del Servizio Internet o, comunque, il collegamento diretto alle pagine senza passare per la home page del Servizio Internet. \textbf{L'eventuale inosservanza delle presenti disposizioni}, salvo esplicita autorizzazione scritta, \textbf{sarà perseguita} nelle competenti sedi giudiziarie civili e penali.}
79 | \end{quote}
80 | Il Gruppo Ferrovie dello Stato Italiane vieta formalmente ai soggetti non autorizzati l'utilizzo di ViaggiaTreno per fini diversi dal mero uso personale, riservando tutti i diritti sui contenuti.
81 | Nel 2019, l'applicazione Trenìt!\ è stata costretta\footnote{\url{https://www.startmag.it/smartcity/perche-trenitalia-ha-tamponato-lapp-trenit-per-il-momento/}} a interrompere il servizio in seguito a un processo giudiziario iniziato da Trenitalia, che contestava il riutilizzo dei dati sulla circolazione ferroviaria presenti su ViaggiaTreno.
82 | Il giudice nella sua sentenza\footnote{\url{https://www.startmag.it/innovazione/trenit-trenitalia/}} ha invece stabilito che \textit{``la banca dati degli orari dei treni e i prezzi di questi, non è protetta da diritto d’autore''} e quindi Trenìt!\ li può utilizzare. \\
83 |
84 | Ritengo quindi che non ci siano reali limiti legali nell'utilizzo della piattaforma ViaggiaTreno e in particolare delle sue API per i fini della proposta in oggetto.
85 |
86 | \subsection{Avvisi Trenord sulla circolazione}
87 |
88 | Trenord, oltre alla tracciabilità dei suoi treni in ViaggiaTreno, offre anche un servizio di avviso delle criticità di tutte le linee (simile all'InfoMobilità di Trenitalia).
89 | Gli avvisi sono rilasciati da esseri umani, ma hanno un formato simile. Di seguito ne sono riportati alcuni della linea \textit{Verona-Brescia-Milano}\footnote{\url{https://www.trenord.it/linee-e-orari/circolazione/le-nostre-linee/brescia-treviglio-milano/?code=R4}}.
90 |
91 | \begin{quote}
92 | \textbf{Criticità} --- 01/03/2023 06:24
93 |
94 | \texttt{Aggiornamento:
95 | Il treno 10913 (MILANO GRECO PIRELLI 05:52 - BRESCIA 07:12) sta viaggiando con un ritardo di 30 minuti perché è stato necessario prolungare i controlli tecnici che precedono la partenza del treno.}
96 | \end{quote}
97 |
98 | \begin{quote}
99 | \textbf{Criticità} --- 01/03/2023 10:07\nopagebreak
100 |
101 | \texttt{Il treno 2624 (VERONA PORTA NUOVA 09:43 - MILANO CENTRALE 11:35) viaggia con 12 minuti di ritardo in seguito alla sosta prolungata di un altro treno della linea.}
102 | \end{quote}
103 |
104 | \section{Proposta operativa}
105 | La proposta si articola in tre fasi.
106 |
107 | \subsection{Indagine esplorativa}
108 | Come concordato a voce nello scorso colloquio, in questa fase potrei indagare più a fondo sullo stato degli Open Data nel trasporto ferroviario in Italia e negli altri Paesi europei.
109 | Progetti simili potrebbero influenzare positivamente scelte come la granularità e il formato dei dati.
110 |
111 | \subsection{Raccolta e produzione degli Open Data}
112 | Lo scopo di questa fase è progettare e implementare uno strumento che raccoglie i dati in tempo reale dal portale ViaggiaTreno di \textbf{tutti i treni} in circolazione e li salva in un database.
113 | Quindi, creare successivamente un altro strumento che li esporta in un formato concordato.
114 |
115 | Per quanto riguarda gli avvisi di Trenord, un semplice script che li scarica dal sito web dovrebbe essere sufficiente. \\
116 |
117 | Al fine di avere dati significativi nella fase successiva, è importante iniziare il prima possibile l'attività di raccolta dati.
118 |
119 | \subsection{Analisi dei dati raccolti}
120 | Innanzitutto, è necessario definire opportunamente i concetti di \textit{tratta} e \textit{corsa}: considerando che i numeri identificativi dei treni mutano da un giorno all'altro e non sono univoci, non è un compito banale.
121 | Quindi, si possono ricalcolare gli indici di puntualità e affidabilità \textit{effettivi} per ogni tratta individuata e trovare correlazioni tra le performance del servizio e giorno della settimana, orario, condizioni meteo, ecc\ldots
122 |
123 | Si potrebbe anche verificare la regolarità e correttezza degli avvisi ai passeggeri nelle tratte affidate a Trenord e analizzare le cause dichiarate più comuni. \\
124 |
125 | \section{Sviluppi futuri}
126 |
127 | Nonostante ritenga l'attività di analisi dei dati estremamente interessante, l'obiettivo principale della proposta in oggetto è fornire strumenti liberi e Open Data di qualità per permettere a chiunque dotato delle capacità necessarie di continuare il lavoro.
128 | Con il supporto del Dipartimento di Informatica si potrebbe rendere l'attività di raccolta dati permanente e costante fornendo \textit{dump} regolari accessibili da un portale web, anche di semplice costruzione.
129 |
130 | \end{document}
131 |
--------------------------------------------------------------------------------
/src/analysis/trajectories_map.py:
--------------------------------------------------------------------------------
1 | # railway-opendata: scrape and analyze italian railway data
2 | # Copyright (C) 2023 Marco Aceti
3 | #
4 | # This program is free software; you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation; either version 2 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
17 |
18 | import itertools
19 | import logging
20 | import pathlib
21 | import typing as t
22 | import webbrowser
23 | from collections import defaultdict
24 | from datetime import datetime, timedelta
25 | from tempfile import NamedTemporaryFile
26 |
27 | import folium
28 | import folium.plugins
29 | import numpy as np
30 | import pandas as pd
31 | from branca.colormap import LinearColormap
32 | from branca.element import MacroElement, Template
33 | from colour import Color
34 | from joblib import Parallel, delayed
35 |
# How many minutes a single frame spans
WINDOW_SIZE: int = 2
assert 0 < WINDOW_SIZE

# Lower limit for the weight of a line
MIN_WEIGHT: int = 4
assert 0 < MIN_WEIGHT

# Year bounds used by the sanity checks: 50 years before and 10 years after
# the year in which this module is loaded
MIN_YEAR: int = datetime.today().year - 50
MAX_YEAR: int = datetime.today().year + 10

# Keyword arguments used to initialize the Folium map
MAP_KWARGS: dict = dict(
    location=(41.890, 12.492),
    zoom_start=7,
    attr="OSM",
)
54 |
# Assets path (marker icons). It is anchored to the directory containing this
# module instead of the current working directory, so the assets are found no
# matter where the program is launched from. When launched from the repository
# root this is the same location as "./src/analysis/assets/".
ASSETS_PATH = pathlib.Path(__file__).resolve().parent / "assets"
57 |
# Anchor points of the delay color scale: (lower_bound, color)
_color_map: list[tuple[float, Color]] = [
    (-5, Color("#34ebc0")),
    (0, Color("green")),
    (10, Color("orange")),
    (30, Color("red")),
    (120, Color("black")),
]

# Fill the COLORS lookup once, at import time: each integer lying between two
# consecutive anchors (both ends included) receives a color from the gradient
# joining them. Keys that are never assigned here fall back to gray.
COLORS: dict[int | float, Color] = defaultdict(lambda: Color("gray"))
for (lo_bound, lo_color), (hi_bound, hi_color) in zip(_color_map, _color_map[1:]):
    steps: range = range(round(lo_bound), round(hi_bound) + 1)
    gradient: list[Color] = list(lo_color.range_to(hi_color, len(steps)))
    # A bound shared by two segments is written twice; the later segment wins
    COLORS.update(zip(steps, gradient))
75 |
76 |
def fill_time(start: datetime, end: datetime) -> t.Generator[datetime, None, None]:
    """Yield consecutive times, WINDOW_SIZE minutes apart, up to 'end'.

    The sequence normally begins at 'start'. When 'start' and 'end' coincide,
    it begins one step earlier instead, so that two values are produced.
    Nothing is produced when 'start' is later than 'end'.

    Args:
        start (datetime): start time
        end (datetime): end time

    Yields:
        datetime: the next time of the sequence, never later than 'end'
    """
    step = timedelta(minutes=WINDOW_SIZE)

    # An empty interval is stretched one step into the past
    cursor = start - step if start == end else start

    while cursor <= end:
        yield cursor
        cursor = cursor + step
94 |
95 |
def icon_marker(railway_company: str, category: str) -> str:
    """Pick the marker icon (from the src/analysis/assets/markers/ directory)
    that matches a railway company and a train category.

    Before matching, "MET" is replaced by "REG" and "EC FR" by "EC" in the
    category, and the company name is lowercased.

    Args:
        railway_company (str): a railway company
        category (str): a category

    Returns:
        str: filename of the matching marker, or "other.svg" when no
            dedicated marker applies
    """
    kind = category.replace("MET", "REG").replace("EC FR", "EC")
    company = railway_company.lower()

    # Categories for which a dedicated Trenitalia marker is selected
    trenitalia_kinds = ("EC", "FA", "FB", "FR", "IC", "ICN", "REG")

    if company.startswith("trenitalia") and kind in trenitalia_kinds:
        marker = f"trenitalia_{kind.lower()}.svg"
    elif kind == "REG" and company in ("trenord", "tper"):
        marker = f"{company}_reg.svg"
    elif kind == "EC" and company == "obb":
        marker = "obb_ec.svg"
    else:
        marker = "other.svg"

    return marker
129 |
130 |
131 | @delayed
132 | def train_stop_geojson(st: pd.DataFrame, train: pd.DataFrame) -> list[dict]:
133 | """Generate a list of GeoJSON formatted data for train stops.
134 |
135 | Args:
136 | st (pd.DataFrame): global station data
137 | train (pd.DataFrame): the train stop data
138 |
139 | Returns:
140 | Generator[dict, None, None]: a generator of GeoJSON formatted
141 | dictionaries representing the train _geographic trajectory_.
142 | """
143 | ret: list[dict] = list()
144 | train = train.sort_values(by="stop_number")
145 |
146 | # Iterate the train stops two by two
147 | for i in range(len(train))[1:]:
148 | prev = train.iloc[i - 1]
149 | curr = train.iloc[i]
150 |
151 | try:
152 | prev_st = st.loc[
153 | (st.index == prev.stop_station_code)
154 | & ~st.latitude.isna()
155 | & ~st.longitude.isna()
156 | ].iloc[0]
157 | curr_st = st.loc[
158 | (st.index == curr.stop_station_code)
159 | & ~st.latitude.isna()
160 | & ~st.longitude.isna()
161 | ].iloc[0]
162 | except IndexError:
163 | # The station location can't be retrieved
164 | continue
165 |
166 | prev_time: datetime | None = prev.departure_actual or prev.departure_expected
167 | curr_time: datetime | None = curr.arrival_actual or curr.arrival_expected
168 | delay: float = (
169 | round(prev.departure_delay)
170 | if not np.isnan(prev.departure_delay)
171 | else np.nan
172 | )
173 |
174 | # Sanity check: _time must be not null
175 | if not prev_time or not curr_time:
176 | continue
177 |
178 | # Sanity check: a train should arrive in a given station after
179 | # it departs from the previous one
180 | if not curr_time >= prev_time:
181 | continue
182 |
183 | # Sanity check: sometimes the API returns insane year values
184 | if curr_time.year > MAX_YEAR or prev_time.year < MIN_YEAR:
185 | continue
186 |
187 | # Tooltip pop up display
188 | tooltip: str = (
189 | f"{curr.client_code} ∙ {curr.category}{curr.number}"
190 | f"