├── .github
│   └── workflows
│       └── publish.yml
├── .gitignore
├── LICENSE.txt
├── README.md
├── build.sh
├── dist
│   ├── gtfs_functions-2.5-py3-none-any.whl
│   └── gtfs_functions-2.5.tar.gz
├── gtfs_functions
│   ├── __init__.py
│   ├── aux_functions.py
│   ├── gtfs_functions.py
│   └── gtfs_plots.py
├── images
│   ├── bus_segments.jpg
│   ├── fancy_speed_per_hour.jpg
│   ├── heatmap.jpg
│   ├── histogram.jpg
│   ├── kepler_seg_freq.jpg
│   ├── kepler_speeds.jpg
│   ├── line_frequencies.jpg
│   ├── map_line_freq.jpg
│   ├── map_stop_freq.jpg
│   ├── routes.jpg
│   ├── shapes.jpg
│   ├── speed_hour.jpg
│   ├── stop_times.jpg
│   ├── stops.jpg
│   ├── stops_freq_output.jpg
│   └── trips.jpg
├── pyproject.toml
├── setup.cfg
└── setup.py

--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
name: Publish Python Package

on:
  push:
    branches:
      - main
      - master
  pull_request:
    branches:
      - main
      - master

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel twine

      - name: Build package
        run: |
          python setup.py sdist bdist_wheel

      - name: Publish package
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          twine upload dist/*
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data
example
notebooks
*.pyc
*.sh
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright (c) 2018 The Python Packaging
Authority

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# GTFS functions

This package allows you to create various layers directly from the GTFS and visualize the results in the most straightforward way possible.

## Update November 2023:
* Possibility to check the `service_id` for a given date:

```python
parsed_calendar = Feed(gtfs_path).parse_calendar()
```

or if you want it already grouped by date:
```python
date_service = Feed(gtfs_path).get_dates_service_id()
```

## Update August 2023:
* Possibility to parse the GTFS for a specific date range.
```python
feed = Feed(gtfs_path, start_date='2023-03-31', end_date='2023-04-04')
```

## Update March 2023:
* Removed the dependency on [partridge](https://github.com/remix/partridge). As much as we love this package and think it is absolutely great, removing a dependency gives us more control and keeps this package from failing whenever something changes in `partridge`.
* We treat the GTFS as a class, where each file is a property. See the examples below to find out how to work with it. We hope this simplifies your code.
* Fixed and enhanced **segment cutting**. Shout out to [Mattijs De Paepe](https://github.com/mattijsdp)
* Support for identifying route patterns! Check it out using `feed.routes_patterns`. Shout out to [Tobias Bartsch](https://github.com/tobiasbartsch)
* The rest should stay the same.

#### Warning!
Make sure `stop_times.txt` has no `Null` values in the columns `arrival_time` and `departure_time`. If it does, some functions in this package might fail.

## Table of contents
* [Installation](#installation)
* [GTFS parsing](#gtfs_parsing)
* [Stop frequencies](#stop_freq)
* [Line frequencies](#line_freq)
* [Cut in Bus segments](#segments)
* [Speeds](#speeds)
* [Segment frequencies](#segments_freq)
* [Mapping the results](#map_gdf)
* [Other plots](#plotly)

## Python version
The package requires `python>=3.8`. You can create a new environment with this version using conda:
```console
conda create -n new-env python=3.8
```

## Installation

You can install the package by running the following in your console:
```console
pip install gtfs_functions
```

Import the package in your script/notebook:
```python
from gtfs_functions import Feed
```

# GTFS Import
Now you can interact with your GTFS with the class `Feed`.
Take a look at the class with `?Feed` to check what arguments you can specify.

```python
gtfs_path = 'data/sfmta.zip'

# It also works with URLs
gtfs_path = 'https://transitfeeds.com/p/sfmta/60/latest/download'

feed = Feed(gtfs_path, time_windows=[0, 6, 10, 12, 16, 19, 24])
```

```python
routes = feed.routes
routes.head(2)
```
|   | route_id | agency_id | route_short_name | route_long_name | route_desc | route_type | route_url | route_color | route_text_color |
|---|----------|-----------|------------------|-----------------|------------|------------|-----------|-------------|------------------|
| 0 | 15761 | SFMTA | 1 | CALIFORNIA |  | 3 | https://SFMTA.com/1 |  |  |
| 1 | 15766 | SFMTA | 5 | FULTON |  | 3 | https://SFMTA.com/5 |  |  |
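Several outputs below carry a `route_name` column, which the package builds by concatenating `route_short_name` and `route_long_name` (see `add_route_name` in `aux_functions.py`). A minimal sketch of that logic, using a tiny DataFrame transcribed from the two rows above rather than a real feed:

```python
import pandas as pd

# Sample transcribed from the routes output above (not re-fetched from the feed)
routes = pd.DataFrame({
    "route_id": [15761, 15766],
    "route_short_name": ["1", "5"],
    "route_long_name": ["CALIFORNIA", "FULTON"],
})

# Same idea as aux_functions.add_route_name: short name + " " + long name
routes["route_name"] = (
    routes.route_short_name.astype(str) + " " + routes.route_long_name.astype(str)
)
print(routes.route_name.tolist())  # ['1 CALIFORNIA', '5 FULTON']
```

This is why the speeds output later shows route names like `1 CALIFORNIA`.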
```python
stops = feed.stops
stops.head(2)
```
|   | stop_id | stop_code | stop_name | stop_desc | zone_id | stop_url | geometry |
|---|---------|-----------|-----------|-----------|---------|----------|----------|
| 0 | 390 | 10390 | 19th Avenue & Holloway St |  |  |  | POINT (-122.47510 37.72119) |
| 1 | 3016 | 13016 | 3rd St & 4th St |  |  |  | POINT (-122.38979 37.77262) |
```python
stop_times = feed.stop_times
stop_times.head(2)
```
|   | trip_id | arrival_time | departure_time | stop_id | stop_sequence | stop_headsign | pickup_type | drop_off_type | shape_dist_traveled | route_id | service_id | direction_id | shape_id | stop_code | stop_name | stop_desc | zone_id | stop_url | geometry |
|---|---------|--------------|----------------|---------|---------------|---------------|-------------|---------------|---------------------|----------|------------|--------------|----------|-----------|-----------|-----------|---------|----------|----------|
| 0 | 9413147 | 81840.0 | 81840.0 | 4015 | 1 |  | NaN | NaN |  | 15761 | 1 | 0 | 179928 | 14015 | Clay St & Drumm St |  |  |  | POINT (-122.39682 37.79544) |
| 1 | 9413147 | 81902.0 | 81902.0 | 6294 | 2 |  | NaN | NaN |  | 15761 | 1 | 0 | 179928 | 16294 | Sacramento St & Davis St |  |  |  | POINT (-122.39761 37.79450) |
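Notice that `arrival_time` and `departure_time` come back as seconds since midnight (81840.0 above is 22:44:00), which makes time arithmetic easy. A small helper (illustrative only, not part of the package) to turn them back into clock strings:

```python
def seconds_to_clock(seconds: float) -> str:
    """Convert GTFS seconds-since-midnight back to an HH:MM:SS string."""
    total = int(seconds)
    hours, rest = divmod(total, 3600)
    minutes, secs = divmod(rest, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

print(seconds_to_clock(81840.0))  # 22:44:00
print(seconds_to_clock(81902.0))  # 22:45:02
```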
```python
trips = feed.trips
trips.head(2)
```
|   | trip_id | route_id | service_id | direction_id | shape_id |
|---|---------|----------|------------|--------------|----------|
| 0 | 9547346 | 15804 | 1 | 0 | 180140 |
| 1 | 9547345 | 15804 | 1 | 0 | 180140 |
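With `trips` in hand, counting scheduled trips per route is a one-line groupby. A sketch using a hypothetical two-row sample mirroring the output above:

```python
import pandas as pd

# Hypothetical sample mirroring the trips output above
trips = pd.DataFrame({
    "trip_id": [9547346, 9547345],
    "route_id": [15804, 15804],
    "direction_id": [0, 0],
})

trips_per_route = trips.groupby("route_id").trip_id.count()
print(int(trips_per_route.loc[15804]))  # 2
```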
```python
shapes = feed.shapes
shapes.head(2)
```
|   | shape_id | geometry |
|---|----------|----------|
| 0 | 179928 | LINESTRING (-122.39697 37.79544, -122.39678 37... |
| 1 | 179929 | LINESTRING (-122.39697 37.79544, -122.39678 37... |
# Stop frequencies

Returns a geodataframe with the frequency for each combination of `stop`, `time of day` and `direction`, where each row has a **Point** geometry. If the default time windows don't suit you, pass your own list through the `time_windows` argument when creating the `Feed`; these cutoffs are the times of day used to aggregate trips.

```python
time_windows = [0, 6, 9, 15.5, 19, 22, 24]

feed = Feed(gtfs_path, time_windows=time_windows)
stop_freq = feed.stops_freq
stop_freq.head(2)
```
|       | stop_id | dir_id | window | ntrips | min_per_trip | stop_name | geometry |
|-------|---------|--------|--------|--------|--------------|-----------|----------|
| 8157  | 5763 | Inbound | 0:00-6:00 | 1 | 360 | Noriega St & 48th Ave | POINT (-122.50785 37.75293) |
| 13102 | 7982 | Outbound | 0:00-6:00 | 1 | 360 | Moscow St & Russia Ave | POINT (-122.42996 37.71804) |
| 9539  | 6113 | Inbound | 0:00-6:00 | 1 | 360 | Portola Dr & Laguna Honda Blvd | POINT (-122.45526 37.74310) |
| 12654 | 7719 | Inbound | 0:00-6:00 | 1 | 360 | Middle Point & Acacia | POINT (-122.37952 37.73707) |
| 9553  | 6116 | Inbound | 0:00-6:00 | 1 | 360 | Portola Dr & San Pablo Ave | POINT (-122.46107 37.74040) |
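`min_per_trip` is just the window length divided by the number of trips: one trip in the 0:00-6:00 window gives (6 - 0) * 60 / 1 = 360 minutes between trips. The same arithmetic the package applies internally (see `add_frequency` in `aux_functions.py`), in plain Python:

```python
def minutes_per_trip(window_start_h: float, window_end_h: float, ntrips: int) -> int:
    """Average minutes between trips within a time window."""
    return int((window_end_h - window_start_h) * 60 / ntrips)

print(minutes_per_trip(0, 6, 1))   # 360, as in the table above
print(minutes_per_trip(6, 9, 12))  # 15, i.e. 12 trips in 3 hours
```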
# Line frequencies

Returns a geodataframe with the frequency for each combination of `line`, `time of day` and `direction`, where each row has a **LineString** geometry. As with stop frequencies, the time windows can be customized through the `time_windows` argument when creating the `Feed`.

```python
line_freq = feed.lines_freq
line_freq.head()
```
|     | route_id | route_name | dir_id | window | min_per_trip | ntrips | geometry |
|-----|----------|------------|--------|--------|--------------|--------|----------|
| 376 | 15808 | 44 O'SHAUGHNESSY | Inbound | 0:00-6:00 | 360 | 1 | LINESTRING (-122.46459 37.78500, -122.46352 37... |
| 378 | 15808 | 44 O'SHAUGHNESSY | Inbound | 0:00-6:00 | 360 | 1 | LINESTRING (-122.43416 37.73355, -122.43299 37... |
| 242 | 15787 | 25 TREASURE ISLAND | Inbound | 0:00-6:00 | 360 | 1 | LINESTRING (-122.39611 37.79013, -122.39603 37... |
| 451 | 15814 | 54 FELTON | Inbound | 0:00-6:00 | 360 | 1 | LINESTRING (-122.38845 37.73994, -122.38844 37... |
| 241 | 15787 | 25 TREASURE ISLAND | Inbound | 0:00-6:00 | 360 | 1 | LINESTRING (-122.39542 37.78978, -122.39563 37... |
# Bus segments

Returns a geodataframe where each segment is a row and has a **LineString** geometry.

```python
segments_gdf = feed.segments
segments_gdf.head(2)
```
|   | route_id | direction_id | stop_sequence | start_stop_name | end_stop_name | start_stop_id | end_stop_id | segment_id | shape_id | geometry | distance_m |
|---|----------|--------------|---------------|-----------------|---------------|---------------|-------------|------------|----------|----------|------------|
| 0 | 15761 | 0 | 1 | Clay St & Drumm St | Sacramento St & Davis St | 4015 | 6294 | 4015-6294 | 179928 | LINESTRING (-122.39697 37.79544, -122.39678 37... | 205.281653 |
| 1 | 15761 | 0 | 2 | Sacramento St & Davis St | Sacramento St & Battery St | 6294 | 6290 | 6294-6290 | 179928 | LINESTRING (-122.39761 37.79446, -122.39781 37... | 238.047505 |
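Since every segment carries its `distance_m`, summing segments per route and direction gives a quick shape-based route length. A sketch using the two rows above as a sample:

```python
import pandas as pd

# Sample transcribed from the segments output above
segments = pd.DataFrame({
    "route_id": [15761, 15761],
    "direction_id": [0, 0],
    "segment_id": ["4015-6294", "6294-6290"],
    "distance_m": [205.281653, 238.047505],
})

length_m = segments.groupby(["route_id", "direction_id"]).distance_m.sum()
print(round(length_m.loc[(15761, 0)], 1))  # 443.3
```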
# Scheduled Speeds

Returns a geodataframe with the `speed_kmh` for each combination of `route`, `segment`, `time of day` and `direction`, where each row has a **LineString** geometry. The time windows can be customized as explained in previous sections.

```python
# With hourly cutoffs you get hourly speed values
speeds = feed.avg_speeds
speeds.head(1)
```
|   | route_id | route_name | direction_id | segment_id | window | speed_kmh | start_stop_id | start_stop_name | end_stop_id | end_stop_name | distance_m | stop_sequence | runtime_sec | segment_max_speed_kmh | geometry |
|---|----------|------------|--------------|------------|--------|-----------|---------------|-----------------|-------------|---------------|------------|---------------|-------------|-----------------------|----------|
| 0 | 15761 | 1 CALIFORNIA | Inbound | 4015-6294 | 10:00-11:00 | 12.0 | 4015 | Clay St & Drumm St | 6294 | Sacramento St & Davis St | 205.281653 | 1 | 61.9 | 12.0 | LINESTRING (-122.39697 37.79544, -122.39678 37... |
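The `speed_kmh` values come straight from the schedule: segment distance over scheduled runtime, converted to km/h and rounded, which is what `add_speed` in `aux_functions.py` does. Checking the row above by hand:

```python
def scheduled_speed_kmh(distance_m: float, runtime_sec: float) -> int:
    """Scheduled speed in km/h from segment distance (m) and runtime (s)."""
    # m/s -> km/h conversion factor is 3.6; rounding mirrors add_speed
    return round(distance_m / runtime_sec * 3.6)

print(scheduled_speed_kmh(205.281653, 61.9))  # 12, matching the table above
```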
# Segment frequencies

```python
segments_freq = feed.segments_freq
segments_freq.head(2)
```
|       | route_id | route_name | direction_id | segment_name | window | min_per_trip | ntrips | start_stop_id | start_stop_name | end_stop_name | geometry |
|-------|----------|------------|--------------|--------------|--------|--------------|--------|---------------|-----------------|---------------|----------|
| 23191 | ALL_LINES | All lines | NA | 3628-3622 | 0:00-6:00 | 360 | 1 | 3628 | Alemany Blvd & St Charles Ave | Alemany Blvd & Arch St | LINESTRING (-122.46949 37.71045, -122.46941 37... |
| 6160  | 15787 | 25 TREASURE ISLAND | Inbound | 7948-8017 | 0:00-6:00 | 360 | 1 | 7948 | Transit Center Bay 29 | Shoreline Access Road | LINESTRING (-122.39611 37.79013, -122.39603 37... |
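Note the `ALL_LINES` / `All lines` rows: they aggregate trips across every route that uses a segment (added internally by `add_all_lines` in `aux_functions.py`). When analyzing individual routes you may want to filter them out; a sketch with a hypothetical sample mirroring the rows above:

```python
import pandas as pd

# Hypothetical sample mirroring the segments_freq output above
segments_freq = pd.DataFrame({
    "route_id": ["ALL_LINES", 15787],
    "route_name": ["All lines", "25 TREASURE ISLAND"],
    "segment_name": ["3628-3622", "7948-8017"],
    "ntrips": [1, 1],
})

# Keep only per-route rows, dropping the systemwide aggregate
per_route = segments_freq.loc[segments_freq.route_id != "ALL_LINES"]
print(per_route.route_name.tolist())  # ['25 TREASURE ISLAND']
```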
# Map your work

## Stop frequencies
```python
# Stops
from gtfs_functions.gtfs_plots import map_gdf

condition_dir = stop_freq.dir_id == 'Inbound'
condition_window = stop_freq.window == '6:00-9:00'

gdf = stop_freq.loc[(condition_dir & condition_window), :].reset_index()

map_gdf(
    gdf=gdf,
    variable='ntrips',
    colors=["#d13870", "#e895b3", '#55d992', '#3ab071', '#0e8955', '#066a40'],
    tooltip_var=['min_per_trip'],
    tooltip_labels=['Frequency: '],
    breaks=[10, 20, 30, 40, 120, 200]
)
```
![stops](/images/map_stop_freq.jpg)

## Line frequencies
```python
# Line frequencies
from gtfs_functions.gtfs_plots import map_gdf

condition_dir = line_freq.dir_id == 'Inbound'
condition_window = line_freq.window == '6:00-9:00'

gdf = line_freq.loc[(condition_dir & condition_window), :].reset_index()

map_gdf(
    gdf=gdf,
    variable='ntrips',
    colors=["#d13870", "#e895b3", '#55d992', '#3ab071', '#0e8955', '#066a40'],
    tooltip_var=['route_name'],
    tooltip_labels=['Route: '],
    breaks=[5, 10, 20, 50]
)
```
![line](/images/map_line_freq.jpg)

## Speeds
If you are looking to visualize data at the segment level for all lines, I recommend something more powerful like kepler.gl (AKA my favorite data viz library).
For example, to check the scheduled speeds per segment:
```python
# Speeds
import keplergl as kp
m = kp.KeplerGl(data=dict(data=speeds, name='Speed Lines'), height=400)
m
```
![kepler_speeds](/images/kepler_speeds.jpg)

## Segment frequencies
```python
# Segment frequencies
import keplergl as kp
m = kp.KeplerGl(data=dict(data=segments_freq, name='Segment frequency'), height=400)
m
```
![kepler_segment_freq](/images/kepler_seg_freq.jpg)

# Other plots
## Histogram
```python
# Histogram
import plotly.express as px
px.histogram(
    stop_freq.loc[stop_freq.min_per_trip < 50],
    x='min_per_trip',
    title='Stop frequencies',
    template='simple_white',
    nbins=20)
```
![histogram](/images/histogram.jpg)

## Heatmap
```python
# Heatmap
import plotly.express as px
import plotly.graph_objects as go

dir_0 = speeds.loc[(speeds.direction_id == 'Inbound') & (speeds.route_name == '1 CALIFORNIA')].sort_values(by='stop_sequence')
dir_0['hour'] = dir_0.window.apply(lambda x: int(x.split(':')[0]))
dir_0.sort_values(by='hour', ascending=True, inplace=True)

fig = go.Figure(data=go.Heatmap(
    z=dir_0.speed_kmh,
    y=dir_0.start_stop_name,
    x=dir_0.window,
    hoverongaps=False,
    colorscale=px.colors.colorbrewer.RdYlBu,
    reversescale=False
))

fig.update_yaxes(title_text='Stop', autorange='reversed')
fig.update_xaxes(title_text='Hour of day', side='top')
fig.update_layout(showlegend=False, height=600, width=1000,
                  title='Speed heatmap per direction and hour of the day')

fig.show()
```
![heatmap](/images/heatmap.jpg)

## Line chart
```python
by_hour = speeds.pivot_table('speed_kmh', index=['window'], aggfunc=['mean', 'std']).reset_index()
by_hour.columns = ['_'.join(col).strip() for col in
by_hour.columns.values] 812 | by_hour['hour'] = by_hour.window_.apply(lambda x: int(x.split(':')[0])) 813 | by_hour.sort_values(by='hour', ascending=True, inplace=True) 814 | 815 | # Scatter 816 | fig = px.line(by_hour, 817 | x='window_', 818 | y='mean_speed_kmh', 819 | template='simple_white', 820 | #error_y = 'std_speed_kmh' 821 | ) 822 | 823 | fig.update_yaxes(rangemode='tozero') 824 | 825 | fig.show() 826 | ``` 827 | ![line_chart](/images/speed_hour.jpg) 828 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | python3 -m build 2 | python3 -m twine upload --repository pypi dist/* -------------------------------------------------------------------------------- /dist/gtfs_functions-2.5-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/dist/gtfs_functions-2.5-py3-none-any.whl -------------------------------------------------------------------------------- /dist/gtfs_functions-2.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/dist/gtfs_functions-2.5.tar.gz -------------------------------------------------------------------------------- /gtfs_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from gtfs_functions.gtfs_functions import Feed 2 | 3 | # from gtfs_functions.gtfs_plots import map_gdf 4 | -------------------------------------------------------------------------------- /gtfs_functions/aux_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import math 3 | import utm 4 | import geopandas as gpd 5 | import logging 6 | import 
numpy as np 7 | 8 | 9 | def add_runtime(st): 10 | # Get the runtime between stops 11 | logging.info("adding runtime") 12 | st.sort_values(by=["trip_id", "stop_sequence"], inplace=True, ascending=True) 13 | c = st.trip_id == st.trip_id.shift(-1) 14 | st.loc[c, "runtime_sec"] = st.arrival_time.shift(-1)[c] - st.arrival_time[c] 15 | st["end_stop_id"] = st.stop_id.shift(-1) 16 | 17 | return st 18 | 19 | 20 | def add_distance( 21 | stop_times, 22 | segments_gdf, 23 | seg_cols=[ 24 | "shape_id", 25 | "route_id", 26 | "direction_id", 27 | "stop_sequence", 28 | "segment_id", 29 | "segment_name", 30 | "start_stop_id", 31 | "end_stop_id", 32 | "start_stop_name", 33 | "end_stop_name", 34 | "distance_m", 35 | "geometry", 36 | ], 37 | st_cols=[ 38 | "shape_id", 39 | "route_id", 40 | "route_name", 41 | "direction_id", 42 | "stop_sequence", 43 | "stop_id", 44 | "end_stop_id", 45 | "runtime_sec", 46 | "arrival_time", 47 | "departure_time", 48 | ], 49 | ): 50 | logging.info("adding distance in meters") 51 | st = stop_times[st_cols] 52 | st.rename(columns={"stop_id": "start_stop_id"}, inplace=True) 53 | 54 | # Merge with segments_gdf to get the distance 55 | dist = pd.merge(st, segments_gdf[seg_cols], how="left") 56 | dist = gpd.GeoDataFrame(data=dist, geometry=dist.geometry, crs="EPSG:4326") 57 | 58 | return dist 59 | 60 | 61 | def add_speed(speeds): 62 | # Calculate the speed for runtimes != 0 63 | logging.info("calculating speed in km/h") 64 | c = speeds.runtime_sec != 0 65 | speeds.loc[c, "speed_kmh"] = round(speeds[c].distance_m / speeds[c].runtime_sec * 3.6) 66 | 67 | # Assign average speed to those with runtimes==0 68 | speeds.loc[~c, "speed_kmh"] = speeds[c].speed_kmh.mean() 69 | 70 | # Remove null values 71 | speeds = speeds.loc[~speeds.speed_kmh.isnull()] 72 | 73 | return speeds 74 | 75 | 76 | def fix_outliers(speeds): 77 | # Calculate average speed to modify outliers 78 | logging.info("fixing outliers") 79 | avg_speed_route = speeds.pivot_table( 80 | "speed_kmh", 
index=["route_id", "direction_id", "window"], aggfunc="mean" 81 | ).reset_index() 82 | 83 | avg_speed_route.rename(columns={"speed_kmh": "avg_route_speed_kmh"}, inplace=True) 84 | 85 | # Assign average speed to outliers 86 | speeds = pd.merge(speeds, avg_speed_route, how="left") 87 | out_c = speeds.speed_kmh > 120 88 | speeds.loc[out_c, "speed_kmh"] = speeds.loc[out_c, "avg_route_speed_kmh"] 89 | 90 | # Get the columns in the right format 91 | speeds["avg_route_speed_kmh"] = round(speeds.avg_route_speed_kmh, 1) 92 | 93 | return speeds 94 | 95 | 96 | def aggregate_speed(speeds, segments_gdf): 97 | # Get the average per route, direction, segment and time of day 98 | logging.info("aggregating speed by segment and window") 99 | speeds_agg = speeds.pivot_table( 100 | ["speed_kmh", "runtime_sec", "avg_route_speed_kmh"], 101 | index=["route_name", "direction_id", "segment_id", "window"], 102 | aggfunc="mean", 103 | ).reset_index() 104 | 105 | # Format the merge columns correctly 106 | speeds_agg["direction_id"] = speeds_agg.direction_id.astype(int) 107 | segments_gdf["direction_id"] = segments_gdf.direction_id.astype(int) 108 | 109 | # Add geometries to segments 110 | data = ( 111 | pd.merge( 112 | speeds_agg, 113 | segments_gdf, 114 | left_on=["route_name", "direction_id", "segment_id"], 115 | right_on=["route_name", "direction_id", "segment_id"], 116 | how="left", 117 | ) 118 | .reset_index(drop=True) 119 | .sort_values(by=["route_id", "direction_id", "window", "stop_sequence"], ascending=True) 120 | ) 121 | 122 | ordered_cols = [ 123 | "route_id", 124 | "route_name", 125 | "direction_id", 126 | "segment_id", 127 | "window", 128 | "speed_kmh", 129 | "avg_route_speed_kmh", 130 | "stop_sequence", 131 | "segment_name", 132 | "start_stop_name", 133 | "end_stop_name", 134 | "start_stop_id", 135 | "end_stop_id", 136 | "shape_id", 137 | "runtime_sec", 138 | "distance_m", 139 | "geometry", 140 | ] 141 | 142 | return data[ordered_cols] 143 | 144 | 145 | def 
get_all_lines_speed(speeds, segments_gdf): 146 | # Get the average per segment and time of day 147 | # Then add it to the rest of the data 148 | all_lines = speeds.pivot_table( 149 | ["speed_kmh", "runtime_sec", "avg_route_speed_kmh"], 150 | index=["segment_id", "window"], 151 | aggfunc="mean", 152 | ).reset_index() 153 | 154 | data_all_lines = ( 155 | pd.merge( 156 | all_lines, 157 | segments_gdf.drop_duplicates(subset=["segment_id"]), 158 | left_on=["segment_id"], 159 | right_on=["segment_id"], 160 | how="left", 161 | ) 162 | .reset_index(drop=True) 163 | .sort_values(by=["direction_id", "window", "stop_sequence"], ascending=True) 164 | ) 165 | 166 | data_all_lines["route_id"] = "ALL_LINES" 167 | data_all_lines["route_name"] = "All lines" 168 | data_all_lines["direction_id"] = "NA" 169 | 170 | return data_all_lines 171 | 172 | 173 | def add_all_lines_speed(data, speeds, segments_gdf): 174 | # Get data for all lines 175 | data_all_lines = get_all_lines_speed(speeds, segments_gdf) 176 | 177 | # Add it to the data we already had 178 | data_complete = pd.concat([data, data_all_lines]) 179 | 180 | # Clean data 181 | data_complete = data_complete[~data_complete.route_name.isnull()].reset_index(drop=True) 182 | 183 | # Get the columns in the right format 184 | data_complete["speed_kmh"] = round(data_complete.speed_kmh, 1) 185 | 186 | cols = [ 187 | "route_id", 188 | "route_name", 189 | "direction_id", 190 | "segment_name", 191 | "window", 192 | "speed_kmh", 193 | "segment_id", 194 | "start_stop_id", 195 | "start_stop_name", 196 | "end_stop_id", 197 | "end_stop_name", 198 | "distance_m", 199 | "stop_sequence", 200 | "shape_id", 201 | "runtime_sec", 202 | "geometry", 203 | ] 204 | 205 | return data_complete 206 | 207 | 208 | def add_free_flow(speeds, data_complete): 209 | # Calculate max speed per segment to have a free_flow reference 210 | max_speed_segment = speeds.pivot_table("speed_kmh", index="segment_name", aggfunc="max") 211 | 212 | 
max_speed_segment.rename(columns={"speed_kmh": "segment_max_speed_kmh"}, inplace=True) 213 | 214 | # Assign max speeds to each segment 215 | data_complete = pd.merge( 216 | data_complete, 217 | max_speed_segment, 218 | left_on=["segment_name"], 219 | right_index=True, 220 | how="left", 221 | ) 222 | 223 | order_cols = [ 224 | "route_name", 225 | "direction_id", 226 | "window", 227 | "segment_name", 228 | "stop_sequence", 229 | "speed_kmh", 230 | "avg_route_speed_kmh", 231 | "segment_max_speed_kmh", 232 | "route_id", 233 | "segment_id", 234 | "start_stop_name", 235 | "end_stop_name", 236 | "start_stop_id", 237 | "end_stop_id", 238 | "shape_id", 239 | "runtime_sec", 240 | "distance_m", 241 | "geometry", 242 | ] 243 | 244 | return data_complete 245 | 246 | 247 | def add_all_lines(line_frequencies, segments_gdf, labels, cutoffs): 248 | 249 | logging.info("adding data for all lines.") 250 | 251 | # Calculate sum of trips per segment with all lines 252 | all_lines = line_frequencies.pivot_table(["ntrips"], index=["segment_id", "window"], aggfunc="sum").reset_index() 253 | 254 | sort_these = ["direction_id", "window", "stop_sequence"] 255 | 256 | data_all_lines = ( 257 | pd.merge( 258 | all_lines, 259 | segments_gdf.drop_duplicates(subset=["segment_id"]), 260 | left_on=["segment_id"], 261 | right_on=["segment_id"], 262 | how="left", 263 | ) 264 | .reset_index() 265 | .sort_values(by=sort_these, ascending=True) 266 | ) 267 | 268 | data_all_lines.drop(["index"], axis=1, inplace=True) 269 | data_all_lines["route_id"] = "ALL_LINES" 270 | data_all_lines["route_name"] = "All lines" 271 | data_all_lines["direction_id"] = "NA" 272 | 273 | # Add frequency for all lines 274 | start_time = data_all_lines.window.apply(lambda x: cutoffs[labels.index(x)]) 275 | end_time = data_all_lines.window.apply(lambda x: cutoffs[labels.index(x) + 1]) 276 | 277 | data_all_lines["min_per_trip"] = ((end_time - start_time) * 60 / data_all_lines.ntrips).astype(int) 278 | 279 | # Append data for all 
lines to the input df 280 | data_complete = pd.concat([line_frequencies, data_all_lines]).reset_index(drop=True) 281 | 282 | return data_complete 283 | 284 | 285 | def fix_departure_time(times_to_fix): 286 | """ 287 | Reassigns departure time to trips that start after the hour 24 288 | for the to fit in a 0-24 hour range 289 | Input: 290 | - times_to_fix: np.array of integers with seconds past from midnight. 291 | """ 292 | 293 | next_day = times_to_fix >= 24 * 3600 294 | times_to_fix[next_day] = times_to_fix[next_day] - 24 * 3600 295 | 296 | return times_to_fix 297 | 298 | 299 | def label_creation(cutoffs): 300 | """ 301 | Creates the labels of the time windows. 302 | Input: 303 | - cutoffs: list of floats or int. 304 | Output: 305 | - labels: list of strings. 306 | 307 | Example: 308 | label_creation(cutoffs=[0, 10, 15.5, 25]) --> [0:00, 10:00, 15:30, 25:00] 309 | """ 310 | labels = [] 311 | if max(cutoffs) <= 24: 312 | for w in cutoffs: 313 | if float(w).is_integer(): 314 | label = str(w) + ":00" 315 | else: 316 | n = math.modf(w) 317 | label = str(int(n[1])) + ":" + str(int(n[0] * 60)) 318 | labels.append(label) 319 | else: 320 | labels = [] 321 | for w in cutoffs: 322 | if float(w).is_integer(): 323 | if w > 24: 324 | w1 = w - 24 325 | label = str(w1) + ":00" 326 | else: 327 | label = str(w) + ":00" 328 | labels.append(label) 329 | else: 330 | if w > 24: 331 | w1 = w - 24 332 | n = math.modf(w1) 333 | label = str(int(n[1])) + ":" + str(int(n[0] * 60)) 334 | else: 335 | n = math.modf(w) 336 | label = str(int(n[1])) + ":" + str(int(n[0] * 60)) 337 | labels.append(label) 338 | 339 | labels = [labels[i] + "-" + labels[i + 1] for i in range(0, len(labels) - 1)] 340 | 341 | return labels 342 | 343 | 344 | def window_creation(stop_times, cutoffs): 345 | "Adds the time time window and labels to stop_times" 346 | 347 | # If the cutoffs are withing 0 and 24 hours, let's make sure 348 | # the times of the GTFS fit this time period 349 | if max(cutoffs) <= 24: 350 | 
stop_times["departure_time"] = fix_departure_time(stop_times.departure_time.values) 351 | stop_times["arrival_time"] = fix_departure_time(stop_times.arrival_time.values) 352 | 353 | # Create the labels for the cutoffs 354 | labels = label_creation(cutoffs) 355 | 356 | # Get departure time as hour and a fraction 357 | departure_time = stop_times.departure_time / 3600 358 | 359 | # Put each trip in the right window 360 | stop_times["window"] = pd.cut(departure_time, bins=cutoffs, right=False, labels=labels) 361 | stop_times = stop_times.loc[~stop_times.window.isnull()] 362 | 363 | stop_times["window"] = stop_times.window.astype(str) 364 | 365 | return stop_times 366 | 367 | 368 | def seconds_since_midnight(times_string): 369 | """ 370 | Transforms a series of time strings of the form "10:00:10" 371 | to an integer that represents the seconds since midnight. 372 | """ 373 | 374 | vals = times_string.split(":") 375 | seconds = 0 376 | 377 | for p, v in enumerate(vals): 378 | seconds += int(v) * (3600 / (60**p)) 379 | 380 | return seconds 381 | 382 | 383 | def add_frequency( 384 | stop_times, 385 | labels, 386 | index_="stop_id", 387 | col="window", 388 | cutoffs=[0, 6, 9, 15, 19, 22, 24], 389 | ): 390 | 391 | if isinstance(index_, list): 392 | index_list = index_ + ["direction_id", col] 393 | elif isinstance(index_, str): 394 | index_list = [index_, "direction_id", col] 395 | 396 | # Some gtfs feeds only contain direction_id 0, use that as default 397 | trips_agg = stop_times.pivot_table("trip_id", index=index_list, aggfunc="count").reset_index() 398 | 399 | # direction_id is optional, as it is not needed to determine trip frequencies 400 | # However, if direction_id is NaN, pivot_table will return an empty DataFrame. 401 | # Therefore, use a sensible default if direction id is not known. 
403 | trips_agg.rename(columns={"trip_id": "ntrips"}, inplace=True) 404 | 405 | start_time = trips_agg.window.apply(lambda x: cutoffs[labels.index(x)]) 406 | end_time = trips_agg.window.apply(lambda x: cutoffs[labels.index(x) + 1]) 407 | 408 | trips_agg["min_per_trip"] = ((end_time - start_time) * 60 / trips_agg.ntrips).astype(int) 409 | 410 | return trips_agg 411 | 412 | 413 | def add_route_name(data, routes): 414 | # Add the route name 415 | routes["route_name"] = "" 416 | 417 | def check_null(col): 418 | # True when the whole column is null or the string "nan" 419 | check = ( 420 | routes[col].isnull().all() | (routes[col].astype(str) == "nan").all() 421 | ) 422 | 423 | return check 424 | 425 | if check_null("route_short_name"): 426 | routes["route_name"] = routes.route_long_name 427 | elif check_null("route_long_name"): 428 | routes["route_name"] = routes.route_short_name 429 | else: 430 | routes["route_name"] = routes.route_short_name.astype(str) + " " + routes.route_long_name.astype(str) 431 | 432 | data = pd.merge( 433 | data, 434 | routes[["route_id", "route_name"]], 435 | left_on="route_id", 436 | right_on="route_id", 437 | how="left", 438 | ) 439 | 440 | return data 441 | 442 | 443 | def code(gdf): 444 | gdf.index = list(range(0, len(gdf))) 445 | gdf.crs = "EPSG:4326" 446 | lat_referece = gdf.geometry[0].coords[0][1] 447 | lon_reference = gdf.geometry[0].coords[0][0] 448 | 449 | zone = utm.from_latlon(lat_referece, lon_reference) 450 | # The EPSG code is 32600+zone for positive latitudes and 32700+zone for negatives. 
451 | if lat_referece < 0: 452 | epsg_code = 32700 + zone[2] 453 | else: 454 | epsg_code = 32600 + zone[2] 455 | 456 | return epsg_code 457 | 458 | 459 | def num_to_letters(num): 460 | result = "" 461 | while num > 0: 462 | num -= 1 463 | digit = num % 26 464 | result = chr(digit + 65) + result 465 | num //= 26 466 | return result 467 | -------------------------------------------------------------------------------- /gtfs_functions/gtfs_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from zipfile import ZipFile 4 | import os 5 | import logging 6 | import geopandas as gpd 7 | import requests 8 | import io 9 | import pendulum as pl 10 | import hashlib 11 | from shapely.geometry import LineString, MultiPoint 12 | from gtfs_functions.aux_functions import ( 13 | add_all_lines, 14 | add_runtime, 15 | add_distance, 16 | add_speed, 17 | code, 18 | fix_outliers, 19 | num_to_letters, 20 | add_route_name, 21 | seconds_since_midnight, 22 | window_creation, 23 | label_creation, 24 | add_frequency, 25 | aggregate_speed, 26 | add_all_lines_speed, 27 | add_free_flow 28 | ) 29 | from itertools import permutations, chain 30 | from shapely import distance 31 | from h3 import latlng_to_cell, grid_ring 32 | from time import time 33 | import boto3 34 | import sys 35 | 36 | 37 | if not sys.warnoptions: 38 | import warnings 39 | 40 | warnings.simplefilter("ignore") 41 | 42 | logging.basicConfig(level=logging.INFO) 43 | 44 | 45 | class Feed: 46 | def __init__( 47 | self, 48 | gtfs_path: str, 49 | time_windows: list = [0, 6, 9, 15, 19, 22, 24], 50 | busiest_date: bool = True, 51 | geo: bool = True, 52 | patterns: bool = True, 53 | start_date: str = None, 54 | end_date: str = None, 55 | ): 56 | 57 | self._gtfs_path = gtfs_path 58 | self._time_windows = time_windows 59 | self._busiest_date = busiest_date 60 | self._geo = geo 61 | self._patterns = patterns 62 | self._start_date = start_date 63 | 
self._end_date = end_date 64 | self._dates = None 65 | self._routes_patterns = None 66 | self._trips_patterns = None 67 | self._files = None 68 | self._bbox = None 69 | self._busiest_service_id = None 70 | self._agency = None 71 | self._calendar = None 72 | self._calendar_dates = None 73 | self._trips = None 74 | self._routes = None 75 | self._stops = None 76 | self._stop_times = None 77 | self._shapes = None 78 | self._stops_freq = None 79 | self._lines_freq = None 80 | self._segments = None 81 | self._segments_freq = None 82 | self._speeds = None 83 | self._avg_speeds = None 84 | self._dist_matrix = None 85 | self._dates_service_id = None 86 | 87 | @property 88 | def gtfs_path(self): 89 | return self._gtfs_path 90 | 91 | @property 92 | def time_windows(self): 93 | return self._time_windows 94 | 95 | @property 96 | def busiest_date(self): 97 | return self._busiest_date 98 | 99 | @property 100 | def geo(self): 101 | return self._geo 102 | 103 | @property 104 | def files(self): 105 | if self._files is None: 106 | self._files = self.get_files() 107 | 108 | return self._files 109 | 110 | @property 111 | def bbox(self): 112 | if self._bbox is None: 113 | self._bbox = self.get_bbox() 114 | return self._bbox 115 | 116 | @property 117 | def start_date(self): 118 | return self._start_date 119 | 120 | @property 121 | def end_date(self): 122 | return self._end_date 123 | 124 | @property 125 | def dates(self): 126 | if self._dates is None: 127 | self._dates = self.get_dates() 128 | return self._dates 129 | 130 | @property 131 | def routes_patterns(self): 132 | """ 133 | Return the patterns of each route and the number of trips defined 134 | for each pattern. 
135 | """ 136 | if self._routes_patterns is None: 137 | (trips_patterns, routes_patterns) = self.get_routes_patterns(self.trips) 138 | self._trips_patterns = trips_patterns 139 | self._routes_patterns = routes_patterns 140 | return self._routes_patterns 141 | 142 | @property 143 | def trips_patterns(self): 144 | """ 145 | Return trips augmented with the patterns they belong to. 146 | """ 147 | if self._trips_patterns is None: 148 | 149 | (trips_patterns, routes_patterns) = self.get_routes_patterns(self.trips) 150 | self._trips_patterns = trips_patterns 151 | self._routes_patterns = routes_patterns 152 | return self._trips_patterns 153 | 154 | @property 155 | def busiest_service_id(self): 156 | """ 157 | Returns the service_id with most trips as a string. 158 | """ 159 | if self._busiest_service_id is None: 160 | self._busiest_service_id = self.get_busiest_service_id() 161 | 162 | return self._busiest_service_id 163 | 164 | @property 165 | def agency(self): 166 | if self._agency is None: 167 | self._agency = self.get_agency() 168 | 169 | return self._agency 170 | 171 | @property 172 | def calendar(self): 173 | if self._calendar is None: 174 | self._calendar = self.get_calendar() 175 | 176 | return self._calendar 177 | 178 | @property 179 | def calendar_dates(self): 180 | if self._calendar_dates is None: 181 | self._calendar_dates = self.get_calendar_dates() 182 | 183 | return self._calendar_dates 184 | 185 | @property 186 | def trips(self): 187 | logging.info("accessing trips") 188 | if self._trips is None: 189 | self._trips = self.get_trips() 190 | 191 | if self._patterns and self._trips_patterns is None: 192 | (trips_patterns, routes_patterns) = self.get_routes_patterns(self._trips) 193 | self._trips_patterns = trips_patterns 194 | self._routes_patterns = routes_patterns 195 | return self._trips_patterns 196 | elif self._patterns: 197 | return self._trips_patterns 198 | 199 | return self._trips 200 | 201 | @property 202 | def routes(self): 203 | if self._routes is 
None: 204 | self._routes = self.get_routes() 205 | 206 | return self._routes 207 | 208 | @property 209 | def stops(self): 210 | if self._stops is None: 211 | self._stops = self.get_stops() 212 | 213 | return self._stops 214 | 215 | @property 216 | def stop_times(self): 217 | if self._stop_times is None: 218 | self._stop_times = self.get_stop_times() 219 | 220 | return self._stop_times 221 | 222 | @property 223 | def shapes(self): 224 | if self._shapes is None: 225 | self._shapes = self.get_shapes() 226 | 227 | return self._shapes 228 | 229 | @property 230 | def stops_freq(self): 231 | if self._stops_freq is None: 232 | self._stops_freq = self.get_stops_freq() 233 | 234 | return self._stops_freq 235 | 236 | @property 237 | def lines_freq(self): 238 | if self._lines_freq is None: 239 | self._lines_freq = self.get_lines_freq() 240 | 241 | return self._lines_freq 242 | 243 | @property 244 | def segments(self): 245 | if self._segments is None: 246 | self._segments = self.get_segments() 247 | 248 | return self._segments 249 | 250 | @property 251 | def segments_freq(self): 252 | if self._segments_freq is None: 253 | self._segments_freq = self.get_segments_freq() 254 | 255 | return self._segments_freq 256 | 257 | @property 258 | def speeds(self): 259 | if self._speeds is None: 260 | self._speeds = self.get_speeds() 261 | 262 | return self._speeds 263 | 264 | @property 265 | def avg_speeds(self): 266 | if self._avg_speeds is None: 267 | self._avg_speeds = self.get_avg_speeds() 268 | 269 | return self._avg_speeds 270 | 271 | @property 272 | def distance_matrix(self): 273 | if self._dist_matrix is None: 274 | self._dist_matrix = self.get_distance_between_stops() 275 | 276 | return self._dist_matrix 277 | 278 | @property 279 | def dates_service_id(self): 280 | if self._dates_service_id is None: 281 | self._dates_service_id = self.get_dates_service_id() 282 | return self._dates_service_id 283 | 284 | @trips.setter 285 | def trips(self, value): 286 | self._trips = value 287 | 
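The `Feed` class above relies on a lazy-caching idiom: each property computes its table on first access, stores it in a `_`-prefixed attribute, and the matching setter lets callers overwrite that cache. A minimal, self-contained sketch of the same idiom (the `LazyFeed` class and `_load_trips` helper are hypothetical names for illustration, not part of this package):

```python
# Sketch of the lazy, cached-property pattern used throughout Feed.
class LazyFeed:
    def __init__(self):
        self._trips = None   # cache; filled on first access
        self.load_calls = 0  # counts how often the expensive load runs

    @property
    def trips(self):
        # Compute once, then serve the cached value on later accesses.
        if self._trips is None:
            self._trips = self._load_trips()
        return self._trips

    @trips.setter
    def trips(self, value):
        # The setter lets callers overwrite the cache, as Feed's setters do.
        self._trips = value

    def _load_trips(self):
        # Stand-in for an expensive GTFS parse.
        self.load_calls += 1
        return ["trip_a", "trip_b"]
```

Accessing `feed.trips` repeatedly triggers `_load_trips` only once; assigning to `feed.trips` replaces the cached value, which is how `Feed`'s setters support injecting pre-filtered tables.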
288 | @stop_times.setter 289 | def stop_times(self, value): 290 | self._stop_times = value 291 | 292 | @stops.setter 293 | def stops(self, value): 294 | self._stops = value 295 | 296 | @routes.setter 297 | def routes(self, value): 298 | self._routes = value 299 | 300 | @shapes.setter 301 | def shapes(self, value): 302 | self._shapes = value 303 | 304 | @dates_service_id.setter 305 | def dates_service_id(self, value): 306 | self._dates_service_id = value 307 | 308 | def get_files(self): 309 | gtfs_path = self.gtfs_path 310 | 311 | # S3 implementation 312 | if gtfs_path.split("://")[0] == "s3": 313 | s3 = boto3.resource("s3") 314 | bucket = gtfs_path.split("://")[1].split("/")[0] 315 | boto_bucket = s3.Bucket(bucket) 316 | key = "/".join(gtfs_path.split("/")[3:]) 317 | 318 | with io.BytesIO() as data: 319 | boto_bucket.download_fileobj(key, data) 320 | with ZipFile(data) as myzip: 321 | return myzip.namelist() 322 | else: 323 | try: 324 | with ZipFile(gtfs_path) as myzip: 325 | return myzip.namelist() 326 | # Try as a URL if the file is not in local 327 | except (FileNotFoundError, OSError) as e: 328 | logging.error(e) 329 | r = requests.get(self.gtfs_path) 330 | 331 | with ZipFile(io.BytesIO(r.content)) as myzip: 332 | return myzip.namelist() 333 | 334 | def get_bbox(self): 335 | logging.info("Getting the bounding box.") 336 | stops = extract_file("stops", self) 337 | 338 | max_x = stops.stop_lon.max() 339 | min_x = stops.stop_lon.min() 340 | max_y = stops.stop_lat.max() 341 | min_y = stops.stop_lat.min() 342 | 343 | geo = { 344 | "type": "Polygon", 345 | "coordinates": [ 346 | [ 347 | [max_x, max_y], 348 | [max_x, min_y], 349 | [min_x, min_y], 350 | [min_x, max_y], 351 | [max_x, max_y], 352 | ] 353 | ], 354 | } 355 | 356 | return geo 357 | 358 | def get_dates(self): 359 | start_date = self.start_date 360 | end_date = self.end_date 361 | if start_date is not None: 362 | pl_start_date = pl.from_format(start_date, "YYYY-MM-DD") 363 | 364 | if end_date is not None: 365 
| pl_end_date = pl.from_format(end_date, "YYYY-MM-DD") 366 | 367 | elif end_date is None: 368 | logging.info("End date is None so we will take today as end date.") 369 | 370 | pl_end_date = pl.today() 371 | 372 | # Get all dates between start and end date 373 | period = pl.interval(pl_start_date, pl_end_date) 374 | 375 | return [day.to_date_string() for day in period] 376 | else: 377 | logging.info("Start date is None. You should either specify a start date or set busiest_date to True.") 378 | return [] 379 | 380 | def get_routes_patterns(self, trips): 381 | """ 382 | Compute the different patterns of each route. 383 | returns (trips_patterns, routes_patterns) 384 | """ 385 | stop_times = self.stop_times 386 | logging.info("computing patterns") 387 | trip_stops = stop_times[ 388 | [ 389 | "route_id", 390 | "route_name", 391 | "direction_id", 392 | "shape_id", 393 | "trip_id", 394 | "stop_id", 395 | "stop_sequence", 396 | ] 397 | ] 398 | trip_stops["zipped_stops"] = list(zip(trip_stops.stop_id, trip_stops.stop_sequence)) 399 | 400 | trip_stops_zipped = trip_stops.pivot_table( 401 | "zipped_stops", 402 | index=["trip_id", "route_id", "route_name", "direction_id", "shape_id"], 403 | aggfunc=list, 404 | ).reset_index() 405 | 406 | trips_with_stops = trips.merge(trip_stops_zipped) 407 | 408 | def version_hash(x): 409 | hash = hashlib.sha1(f"{x.route_id}{x.direction_id}{str(x.zipped_stops)}".encode("UTF-8")).hexdigest() 410 | return hash[:18] 411 | 412 | trips_with_stops["pattern_id"] = trips_with_stops.apply(version_hash, axis=1) 413 | 414 | # Count number of trips per pattern to identify the main one 415 | route_patterns = trips_with_stops.pivot_table( 416 | "trip_id", 417 | index=[ 418 | "route_id", 419 | "route_name", 420 | "pattern_id", 421 | "direction_id", 422 | "shape_id", 423 | trips_with_stops.zipped_stops.astype(str), 424 | ], 425 | aggfunc="count", 426 | ).reset_index() 427 | 428 | route_patterns = ( 429 | route_patterns.rename({"trip_id": "cnt_trips"}, 
axis=1) 430 | .sort_values( 431 | by=["route_name", "direction_id", "cnt_trips"], 432 | ascending=[True, True, False], 433 | ) 434 | .reset_index(drop=True) 435 | ) 436 | 437 | # Add simple names to patterns: A, B, C, etc. 438 | n_patterns = route_patterns.pivot_table( 439 | "cnt_trips", index=["route_name", "direction_id"], aggfunc="count" 440 | ).reset_index() 441 | n_patterns["route_pattern"] = n_patterns.cnt_trips.apply(lambda row: tuple(np.arange(1, row + 1))) 442 | n_patterns = n_patterns.explode("route_pattern").reset_index(drop=True) 443 | n_patterns["route_pattern"] = n_patterns.route_pattern.apply(num_to_letters) 444 | n_patterns["pattern_name"] = ( 445 | n_patterns.route_name 446 | + " - " 447 | + n_patterns.direction_id.astype(int).astype(str) 448 | + " - " 449 | + n_patterns.route_pattern 450 | ) 451 | n_patterns.sort_values(by=["route_name", "direction_id", "route_pattern"], inplace=True) 452 | 453 | route_patterns = route_patterns.merge( 454 | n_patterns[["route_pattern", "pattern_name"]], 455 | right_index=True, 456 | left_index=True, 457 | how="left", 458 | ) 459 | 460 | # Bring the pattern names to trips 461 | trips_with_stops = trips_with_stops.merge( 462 | route_patterns[["pattern_id", "route_pattern", "pattern_name"]], how="left" 463 | ) 464 | trips_with_patterns = trips_with_stops[ 465 | [ 466 | "trip_id", 467 | "route_id", 468 | "pattern_id", 469 | "route_pattern", 470 | "pattern_name", 471 | "route_name", 472 | "service_id", 473 | "direction_id", 474 | "shape_id", 475 | ] 476 | ] 477 | 478 | return trips_with_patterns.copy(), route_patterns.copy() 479 | 480 | def get_busiest_service_id(self): 481 | """ 482 | Returns the service_id with most trips as a string. 
483 | """ 484 | trips = extract_file("trips", self) 485 | return ( 486 | trips.pivot_table("trip_id", index="service_id", aggfunc="count") 487 | .sort_values(by="trip_id", ascending=False) 488 | .index[0] 489 | ) 490 | 491 | def get_dates_service_id(self): 492 | dates_service_id = self.parse_calendar() 493 | return dates_service_id.groupby("date").service_id.apply(list) 494 | 495 | def get_agency(self): 496 | return extract_file("agency", self) 497 | 498 | def get_calendar(self): 499 | return extract_file("calendar", self) 500 | 501 | def get_calendar_dates(self): 502 | return extract_file("calendar_dates", self) 503 | 504 | def parse_calendar(self): 505 | calendar = self.calendar 506 | calendar_dates = self.calendar_dates 507 | busiest_date = self.busiest_date 508 | 509 | if calendar is not None: 510 | # Parse dates 511 | calendar["start_date_dt"] = calendar.start_date.astype(str).apply(pl.parse) 512 | calendar["end_date_dt"] = calendar.end_date.astype(str).apply(pl.parse) 513 | 514 | # Get all dates for a given service_id 515 | calendar["all_dates"] = calendar.apply( 516 | lambda x: np.array([d for d in pl.interval(x.start_date_dt, x.end_date_dt).range("days")]), 517 | axis=1, 518 | ) 519 | 520 | # Boolean variables for day types 521 | cols = [ 522 | "monday", 523 | "tuesday", 524 | "wednesday", 525 | "thursday", 526 | "friday", 527 | "saturday", 528 | "sunday", 529 | ] 530 | 531 | vf = np.vectorize(bool) 532 | calendar[cols] = vf(calendar[cols].values) 533 | 534 | # Hash weekdays to make it faster 535 | def get_hash_weekdays(row): 536 | return {i: v for i, v in enumerate(row[cols].values[0])} 537 | 538 | hash_weekdays = calendar.groupby("service_id").apply(get_hash_weekdays) 539 | 540 | # Filter dates depending on the days of the week 541 | calendar["filtered_dates"] = calendar.apply( 542 | lambda row: row.all_dates[[hash_weekdays[row.service_id][d.weekday()] for d in row.all_dates]], 543 | axis=1, 544 | ) 545 | 546 | # Explode filtered_dates 547 | t = 
calendar[["service_id", "filtered_dates"]].explode("filtered_dates") 548 | 549 | # Keep the service_ids that apply to at least one date 550 | t = t[t.filtered_dates.notnull()] 551 | t["filtered_dates"] = t.filtered_dates.dt.date.astype(str) 552 | 553 | t = t.groupby("filtered_dates").service_id.apply(list) 554 | 555 | # Create dictionary with dates as keys and service_id as items 556 | date_hash = t.apply(lambda x: dict(zip(x, [True] * len(x)))).to_dict() 557 | else: 558 | date_hash = {} 559 | 560 | if calendar_dates is not None: 561 | # --- Do the same for calendar_dates --- 562 | calendar_dates["date_str"] = calendar_dates.date.astype(str).apply(pl.parse).dt.date.astype(str) 563 | 564 | cdates_hash = ( 565 | calendar_dates[calendar_dates.exception_type == 1] 566 | .groupby("date_str") 567 | .service_id.apply(list) 568 | .apply(lambda x: dict(zip(x, [True] * len(x)))) 569 | .to_dict() 570 | ) 571 | else: 572 | cdates_hash = {} 573 | 574 | # Were dates provided, or are we looking for the busiest date? 575 | if busiest_date: 576 | # We need to look for the busiest date. 
577 | # To enable that we need to return the complete 578 | # list of dates to `get_trips()` 579 | # Get max date and min date 580 | dates = list(date_hash.keys()) + list(cdates_hash.keys()) 581 | else: 582 | dates = self.dates 583 | 584 | # Check if the dates have service in the calendars 585 | remove_dates = [] 586 | for i, d in enumerate(dates): 587 | if (d not in date_hash) & (d not in cdates_hash): 588 | print(f'The date "{d}" does not have service in this feed and will be removed from the analysis.') 589 | remove_dates.append(d) 590 | 591 | for d in remove_dates: 592 | dates.remove(d) 593 | 594 | # Create dataframe with the service_id that applies to each date 595 | aux = pd.concat([pd.DataFrame(date_hash), pd.DataFrame(cdates_hash)]).T.reset_index() 596 | dates_service_id = pd.melt(aux, id_vars="index", value_vars=aux.columns) 597 | dates_service_id.columns = ["date", "service_id", "keep"] 598 | 599 | return dates_service_id[~dates_service_id.keep.isnull()] 600 | 601 | def get_trips(self): 602 | routes = self.routes 603 | dates = self.dates 604 | 605 | trips = extract_file("trips", self) 606 | trips["trip_id"] = trips.trip_id.astype(str) 607 | trips["route_id"] = trips.route_id.astype(str) 608 | 609 | if "shape_id" in trips.columns: 610 | trips["shape_id"] = trips.shape_id.astype(str) 611 | 612 | # If we were asked to only fetch the busiest date 613 | # if self.busiest_date: 614 | # trips = trips[trips.service_id==self.busiest_service_id] 615 | 616 | # If we're looking for the busiest date or a specific list of 617 | # dates we need to parse the calendar 618 | if (self.busiest_date) | (dates != []): 619 | """ 620 | In this case we have three possibilities: 621 | 1. busiest_date=True & dates==[]: in this case the user looks for the 622 | busiest date in the entire feed 623 | 2. busiest_date=True & dates!=[]: in this case the user looks for the 624 | busiest date within the date range provided. 625 | 3. 
busiest_date=False & dates!=[]: in this case the user looks for the 626 | entire feed within the date range provided and we don't need to change 627 | the "dates" variable at all. 628 | """ 629 | dates_service_id = self.parse_calendar() 630 | 631 | # If busiest_date=True, we have to count the number of trips 632 | if self.busiest_date: 633 | # Trips per date 634 | date_ntrips = ( 635 | trips.merge(dates_service_id).groupby(["date"]).trip_id.count().sort_values(ascending=False) 636 | ) 637 | 638 | # If we are looking for the busiest date within our date period, 639 | # we only keep the dates in that period of time. 640 | if (self.busiest_date) & (dates != []): 641 | dates_service_id = dates_service_id[dates_service_id.date.isin(dates)] 642 | date_ntrips = date_ntrips[date_ntrips.index.isin(dates)] 643 | 644 | # Now that we've considered both cases we can just filter 645 | # with the busiest_date of the "dates" that made it this far 646 | if self.busiest_date: 647 | # In that case, if "dates" is empty we need to find the busiest date 648 | busiest_date = list(date_ntrips[date_ntrips == date_ntrips.max()].index) 649 | max_trips = date_ntrips[date_ntrips == date_ntrips.max()].values[0] 650 | 651 | logging.info( 652 | "The busiest date/s of this feed or your selected date range" 653 | + f" is/are: {busiest_date} with {max_trips} trips." 
654 | ) 655 | logging.info("In the case that more than one busiest date was found, the first one will be considered.") 656 | logging.info(f"In this case it is {busiest_date[0]}.") 657 | 658 | # We need "dates" to be a list 659 | dates = busiest_date[:1] 660 | 661 | # Keep only the trips that are relevant to the use case 662 | trips = ( 663 | trips.set_index("service_id") 664 | .join( 665 | dates_service_id[dates_service_id.date.isin(dates)].set_index("service_id"), 666 | how="inner", 667 | ) 668 | .reset_index(names="service_id") 669 | .drop(["keep", "date"], axis=1) 670 | .drop_duplicates() 671 | ) 672 | 673 | # Get routes info in trips 674 | # The GTFS feed might be missing some of the keys, e.g. direction_id or shape_id. 675 | # To allow processing incomplete GTFS data, we must reindex instead: 676 | # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike 677 | # This will add NaN for any missing columns. 678 | cols = [ 679 | "trip_id", 680 | "route_id", 681 | "route_name", 682 | "service_id", 683 | "direction_id", 684 | "shape_id", 685 | ] 686 | trips = add_route_name(trips, routes).reindex(columns=cols) 687 | 688 | # Fill null values 689 | trips["direction_id"] = trips.direction_id.fillna(0) 690 | 691 | return trips 692 | 693 | def get_routes(self): 694 | routes = extract_file("routes", self) 695 | routes["route_id"] = routes.route_id.astype(str) 696 | 697 | if "route_short_name" in routes.columns: 698 | routes["route_short_name"] = routes.route_short_name.astype(str) 699 | if "route_long_name" in routes.columns: 700 | routes["route_long_name"] = routes.route_long_name.astype(str) 701 | 702 | return routes 703 | 704 | def get_stops(self): 705 | stops = extract_file("stops", self) 706 | 707 | if self.geo: 708 | # Add geometry to stops 709 | stops = gpd.GeoDataFrame( 710 | data=stops, 711 | geometry=gpd.points_from_xy(stops.stop_lon, stops.stop_lat), 712 | crs=4326, 713 | ) 714 | 715 | stops["stop_id"] = 
stops.stop_id.astype(str) 716 | stops["stop_name"] = stops.stop_name.astype(str) 717 | 718 | return stops 719 | 720 | def get_stop_times(self): 721 | # Get trips, routes and stops info in stop_times 722 | stop_times = extract_file("stop_times", self) 723 | if self._trips is not None: # prevents infinite loop 724 | logging.info("_trips is defined in stop_times") 725 | trips = self._trips 726 | else: 727 | logging.info("get trips in stop_times") 728 | trips = self.trips 729 | stops = self.stops 730 | 731 | # Fix data types 732 | stop_times["trip_id"] = stop_times.trip_id.astype(str) 733 | stop_times["stop_id"] = stop_times.stop_id.astype(str) 734 | 735 | if "route_id" in stop_times.columns: 736 | stop_times["route_id"] = stop_times.route_id.astype(str) 737 | 738 | if "shape_id" in stop_times.columns: 739 | stop_times["shape_id"] = stop_times.shape_id.astype(str) 740 | 741 | # We merge stop_times to "trips" (not the other way around) because 742 | # "trips" have already been filtered by the busiest service_id 743 | stop_times = trips.merge(stop_times, how="inner") 744 | 745 | if self.geo: 746 | stop_times = stop_times.merge(stops, how="left") 747 | 748 | # stop_times needs to be geodataframe if we want to do geometry operations 749 | stop_times = gpd.GeoDataFrame(stop_times, geometry="geometry") 750 | 751 | # direction_id is optional, as it is not needed to determine route shapes 752 | # However, if direction_id is NaN, pivot_table will return an empty DataFrame. 753 | # Therefore, use a sensible default if direction id is not known. 
754 | # Some gtfs feeds only contain direction_id 0, use that as default 755 | stop_times["direction_id"] = stop_times["direction_id"].fillna(0) 756 | 757 | # Pass times to seconds since midnight 758 | stop_times["arrival_time"] = [ 759 | seconds_since_midnight(t) if t not in [None, np.nan] else None for t in stop_times.arrival_time 760 | ] 761 | stop_times["departure_time"] = [ 762 | seconds_since_midnight(t) if t not in [None, np.nan] else None for t in stop_times.departure_time 763 | ] 764 | 765 | return stop_times 766 | 767 | def get_shapes(self): 768 | if self.geo: 769 | aux = extract_file("shapes", self) 770 | 771 | # Sort shapes by shape_pt_sequence 772 | aux.sort_values(["shape_id", "shape_pt_sequence"], inplace=True) 773 | shapes = ( 774 | aux[["shape_id", "shape_pt_lat", "shape_pt_lon"]] 775 | .groupby("shape_id") 776 | .agg(list) 777 | .apply(lambda x: LineString(zip(x[1], x[0])), axis=1) 778 | ) 779 | 780 | shapes = gpd.GeoDataFrame(data=shapes.index, geometry=shapes.values, crs=4326) 781 | shapes["shape_id"] = shapes.shape_id.astype(str) 782 | 783 | return shapes 784 | else: 785 | shapes = extract_file("shapes", self) 786 | shapes["shape_id"] = shapes.shape_id.astype(str) 787 | return shapes 788 | 789 | def get_stops_freq(self): 790 | """ 791 | Get the stop frequencies. For each stop of each route it 792 | returns the bus frequency in minutes/bus broken down by 793 | time window. 
794 | """ 795 | stop_times = self.stop_times 796 | stops = self.stops 797 | cutoffs = self.time_windows 798 | 799 | if "window" not in stop_times.columns: 800 | stop_times = window_creation(stop_times, cutoffs) 801 | else: 802 | stop_times["window"] = stop_times.window.astype(str) 803 | 804 | labels = label_creation(cutoffs) 805 | stop_frequencies = add_frequency(stop_times, labels, index_="stop_id", col="window", cutoffs=cutoffs) 806 | 807 | if self.geo: 808 | stops_cols = ["stop_id", "stop_name", "geometry"] 809 | else: 810 | stops_cols = ["stop_id", "stop_name"] 811 | 812 | stop_frequencies = stop_frequencies.merge(stops[stops_cols], how="left") 813 | 814 | if self.geo: 815 | stop_frequencies = gpd.GeoDataFrame(data=stop_frequencies, geometry=stop_frequencies.geometry) 816 | 817 | return stop_frequencies 818 | 819 | def get_lines_freq(self): 820 | """ 821 | Calculates the frequency for each pattern of a route. 822 | Returns the bus frequency in minutes/bus broken down by 823 | time window. 824 | """ 825 | 826 | stop_times = self.stop_times 827 | shapes = self.shapes 828 | cutoffs = self.time_windows 829 | 830 | stop_times_first = stop_times.loc[stop_times.stop_sequence == 1, :] 831 | 832 | # Create time windows 833 | if "window" not in stop_times_first.columns: 834 | stop_times_first = window_creation(stop_times_first, cutoffs) 835 | else: 836 | stop_times_first["window"] = stop_times_first.window.astype(str) 837 | 838 | # Create labels 839 | labels = label_creation(cutoffs) 840 | 841 | # Get frequencies 842 | line_frequencies = add_frequency( 843 | stop_times_first, 844 | labels, 845 | index_=["route_id", "route_name", "shape_id"], 846 | col="window", 847 | cutoffs=cutoffs, 848 | ) 849 | 850 | # Do we want a geodataframe? 
851 | if self.geo: 852 | line_frequencies = pd.merge(line_frequencies, shapes, how="left") 853 | line_frequencies = gpd.GeoDataFrame(data=line_frequencies, geometry=line_frequencies.geometry, crs=4326) 854 | 855 | # Clean the df 856 | keep_these = [ 857 | "route_id", 858 | "route_name", 859 | "direction_id", 860 | "window", 861 | "min_per_trip", 862 | "ntrips", 863 | "geometry", 864 | ] 865 | 866 | line_frequencies = line_frequencies.loc[~line_frequencies.geometry.isnull(), keep_these] 867 | 868 | return line_frequencies 869 | 870 | def get_segments(self): 871 | """Splits each route's shape into stop-stop LineString called segments 872 | 873 | Returns the segment geometry as well as additional segment information 874 | """ 875 | logging.info("Getting segments...") 876 | stop_times = self.stop_times 877 | shapes = self.shapes 878 | 879 | req_columns = ["shape_id", "stop_sequence", "stop_id", "geometry"] 880 | add_columns = ["route_id", "route_name", "direction_id", "stop_name"] 881 | 882 | # merge stop_times and shapes to calculate cut distance and interpolated point 883 | df_shape_stop = ( 884 | stop_times[req_columns + add_columns] 885 | .drop_duplicates() 886 | .merge(shapes, on="shape_id", suffixes=("_stop", "_shape")) 887 | ) 888 | logging.info("Projecting stops onto shape...") 889 | df_shape_stop["cut_distance_stop_point"] = df_shape_stop[["geometry_stop", "geometry_shape"]].apply( 890 | lambda x: x[1].project(x[0], normalized=True), axis=1 891 | ) 892 | logging.info("Interpolating stops onto shape...") 893 | df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]].apply( 894 | lambda x: x[0].interpolate(x[1], normalized=True), axis=1 895 | ) 896 | 897 | # calculate cut distance for 898 | logging.info("Sorting shape points and stops...") 899 | df_shape = shapes[shapes.shape_id.isin(stop_times.shape_id.unique())] 900 | df_shape["list_of_points"] = df_shape.geometry.apply(lambda x: list(MultiPoint(x.coords).geoms)) 901 
| df_shape_exp = df_shape.explode("list_of_points") 902 | df_shape_exp["projected_line_points"] = df_shape_exp[["geometry", "list_of_points"]].apply( 903 | lambda x: x[0].project(x[1], normalized=True), axis=1 904 | ) 905 | 906 | # rename both dfs to concatenate 907 | df_shape_stop.rename( 908 | { 909 | "projected_stop_point": "geometry", 910 | "cut_distance_stop_point": "normalized_distance_along_shape", 911 | }, 912 | axis=1, 913 | inplace=True, 914 | ) 915 | df_shape_stop["cut_flag"] = True 916 | 917 | df_shape_exp = df_shape_exp[["shape_id", "list_of_points", "projected_line_points"]] 918 | df_shape_exp.rename( 919 | { 920 | "list_of_points": "geometry", 921 | "projected_line_points": "normalized_distance_along_shape", 922 | }, 923 | axis=1, 924 | inplace=True, 925 | ) 926 | 927 | df_shape_exp["cut_flag"] = False 928 | 929 | # combine stops and shape points 930 | gdf = pd.concat([df_shape_stop, df_shape_exp], ignore_index=False) 931 | gdf.sort_values(["shape_id", "normalized_distance_along_shape"], inplace=True) 932 | gdf.reset_index(inplace=True, drop=True) 933 | 934 | # drop all non-stops (had to combine first to get their gdf index) 935 | cuts = gdf.where(gdf.cut_flag).dropna(subset="cut_flag") 936 | cuts = cuts.astype({"shape_id": str, "stop_sequence": int, "direction_id": int}) 937 | cuts[["end_stop_id", "end_stop_name"]] = cuts.groupby("shape_id")[["stop_id", "stop_name"]].shift(-1) 938 | 939 | # Create a LineString for each stop-to-stop segment 940 | segment_geometries = [] 941 | for shape_id in cuts.shape_id.drop_duplicates(): 942 | cut_idx = cuts[cuts.shape_id == shape_id].index 943 | for i, cut in enumerate(cut_idx[:-1]): 944 | segment_geometries.append(LineString(gdf.iloc[cut_idx[i] : cut_idx[i + 1] + 1].geometry)) 945 | 946 | # create a GeoDataFrame, adding additional columns 947 | segment_df = cuts.dropna(subset="end_stop_id", axis=0) 948 | logging.info(f"segments_df: {len(segment_df)}, geometry: {len(segment_geometries)}") 949 | segment_gdf = 
gpd.GeoDataFrame(segment_df, geometry=segment_geometries) 950 | # drop irrelevant columns 951 | segment_gdf.drop( 952 | [ 953 | "geometry_shape", 954 | "cut_flag", 955 | "normalized_distance_along_shape", 956 | "geometry_stop", 957 | ], 958 | axis=1, 959 | inplace=True, 960 | ) 961 | segment_gdf.crs = "EPSG:4326" 962 | 963 | # Add segment length in meters 964 | segment_gdf["distance_m"] = segment_gdf.to_crs(code(segment_gdf)).length 965 | 966 | # Add segment_id and name 967 | segment_gdf["segment_id"] = segment_gdf.stop_id.astype(str) + " - " + segment_gdf.end_stop_id.astype(str) 968 | segment_gdf["segment_name"] = segment_gdf.stop_name + " - " + segment_gdf.end_stop_name 969 | 970 | # Order columns 971 | col_ordered = [ 972 | "shape_id", 973 | "route_id", 974 | "route_name", 975 | "direction_id", 976 | "stop_sequence", 977 | "segment_name", 978 | "stop_name", 979 | "end_stop_name", 980 | "segment_id", 981 | "stop_id", 982 | "end_stop_id", 983 | "distance_m", 984 | "geometry", 985 | ] 986 | 987 | segment_gdf = segment_gdf[col_ordered] 988 | segment_gdf.rename( 989 | columns=dict(stop_name="start_stop_name", stop_id="start_stop_id"), 990 | inplace=True, 991 | ) 992 | 993 | return segment_gdf 994 | 995 | def get_speeds(self): 996 | stop_times = self.stop_times 997 | segment_gdf = self.segments 998 | 999 | # Add runtime and distance to stop_times 1000 | aux = add_runtime(stop_times) 1001 | aux = add_distance(aux, segment_gdf) 1002 | 1003 | # Calculate the speed per segment 1004 | speeds = add_speed(aux) 1005 | 1006 | cols = [ 1007 | "route_name", 1008 | "direction_id", 1009 | "stop_sequence", 1010 | "segment_name", 1011 | "start_stop_name", 1012 | "end_stop_name", 1013 | "speed_kmh", 1014 | "runtime_sec", 1015 | "arrival_time", 1016 | "departure_time", 1017 | "distance_m", 1018 | "route_id", 1019 | "start_stop_id", 1020 | "end_stop_id", 1021 | "segment_id", 1022 | "shape_id", 1023 | "geometry", 1024 | ] 1025 | 1026 | return speeds[cols] 1027 | 1028 | def 
get_avg_speeds(self): 1029 | """ 1030 | Calculate the average speed per route, segment and window. 1031 | """ 1032 | speeds = self.speeds 1033 | segment_gdf = self.segments 1034 | cutoffs = self.time_windows 1035 | 1036 | # Create windows for aggregation 1037 | speeds = window_creation(speeds, cutoffs) 1038 | 1039 | # Fix outliers 1040 | speeds = fix_outliers(speeds) 1041 | 1042 | # Aggregate by route, segment, and window 1043 | agg_speed = aggregate_speed(speeds, segment_gdf) 1044 | 1045 | # Aggregate by segment and window (add ALL LINES level) 1046 | all_lines = add_all_lines_speed(agg_speed, speeds, segment_gdf) 1047 | 1048 | # Add free flow speed 1049 | data = add_free_flow(speeds, all_lines) 1050 | 1051 | # Do we want a geodataframe? 1052 | if self.geo: 1053 | data = gpd.GeoDataFrame(data=data, geometry=data.geometry, crs=4326) 1054 | 1055 | ordered_cols = [ 1056 | "route_id", 1057 | "route_name", 1058 | "direction_id", 1059 | "stop_sequence", 1060 | "segment_name", 1061 | "window", 1062 | "speed_kmh", 1063 | "avg_route_speed_kmh", 1064 | "segment_max_speed_kmh", 1065 | "runtime_sec", 1066 | "start_stop_name", 1067 | "end_stop_name", 1068 | "segment_id", 1069 | "start_stop_id", 1070 | "end_stop_id", 1071 | "shape_id", 1072 | "distance_m", 1073 | "geometry", 1074 | ] 1075 | 1076 | return data[ordered_cols] 1077 | 1078 | def get_segments_freq(self): 1079 | 1080 | stop_times = self.stop_times 1081 | segment_gdf = self.segments 1082 | cutoffs = self.time_windows 1083 | 1084 | if "window" not in stop_times.columns: 1085 | stop_times = window_creation(stop_times, cutoffs) 1086 | else: 1087 | stop_times["window"] = stop_times.window.astype(str) 1088 | 1089 | # Get labels 1090 | labels = label_creation(cutoffs) 1091 | 1092 | # Aggregate trips 1093 | line_frequencies = add_frequency( 1094 | stop_times, 1095 | labels, 1096 | index_=["route_id", "route_name", "stop_id"], 1097 | col="window", 1098 | cutoffs=cutoffs, 1099 | ) 1100 | 1101 | keep_these = [ 1102 | "route_id", 
1103 | "route_name", 1104 | "segment_name", 1105 | "start_stop_name", 1106 | "end_stop_name", 1107 | "segment_id", 1108 | "start_stop_id", 1109 | "end_stop_id", 1110 | "direction_id", 1111 | "geometry", 1112 | ] 1113 | 1114 | line_frequencies = pd.merge( 1115 | line_frequencies, 1116 | segment_gdf[keep_these], 1117 | left_on=["route_id", "route_name", "stop_id", "direction_id"], 1118 | right_on=["route_id", "route_name", "start_stop_id", "direction_id"], 1119 | how="left", 1120 | ) 1121 | 1122 | line_frequencies.drop("stop_id", axis=1, inplace=True) 1123 | 1124 | # Remove duplicates after merging 1125 | line_frequencies.drop_duplicates(inplace=True) 1126 | 1127 | # Aggregate for all lines 1128 | data_complete = add_all_lines(line_frequencies, segment_gdf, labels, cutoffs) 1129 | 1130 | # Do we want a geodataframe? 1131 | if self.geo is True: 1132 | data_complete = gpd.GeoDataFrame( 1133 | data=data_complete.drop("geometry", axis=1), 1134 | geometry=data_complete.geometry, 1135 | ) 1136 | 1137 | # Clean data 1138 | keep_these = [ 1139 | "route_id", 1140 | "route_name", 1141 | "direction_id", 1142 | "segment_name", 1143 | "start_stop_name", 1144 | "end_stop_name", 1145 | "window", 1146 | "min_per_trip", 1147 | "ntrips", 1148 | "start_stop_id", 1149 | "end_stop_id", 1150 | "segment_id", 1151 | "geometry", 1152 | ] 1153 | 1154 | data_complete = data_complete.loc[~data_complete.geometry.isnull()][keep_these] 1155 | 1156 | return data_complete 1157 | 1158 | def get_distance_between_stops(self): 1159 | """ 1160 | Compared H3 hex bins to DBSCAN clusters in this map: 1161 | https://studio.foursquare.com/public/8436b10c-4ccc-48a3-a232-e8026f81a117 1162 | 1163 | From what I see, the optimal resolution for our purposes is resolution=9. 1164 | Taking the Hex bin at this resolution and its neighbors works as a better 1165 | clustering method than DBSCAN. 
1166 | 1167 | We can then only calculate the distance between each stop and the ones that 1168 | are in the neighboring hex bins. 1169 | """ 1170 | stops_ = self.stops.copy() 1171 | 1172 | logging.info("Getting hex bins.") 1173 | RESOLUTION = 9 1174 | 1175 | stops_hash = stops_.to_dict()["stop_id"] 1176 | stops_.reset_index(inplace=True) 1177 | stops_.rename(columns=dict(index="stop_index"), inplace=True) 1178 | 1179 | stops_["hex"] = stops_.apply(lambda row: latlng_to_cell(row.stop_lat, row.stop_lon, RESOLUTION), axis=1) 1180 | 1181 | # stops.head() 1182 | 1183 | # Stops to UTM for distance calculation 1184 | utm_stops = stops_[["stop_index", "stop_id", "stop_name", "hex", "geometry"]].to_crs(code(stops_)) 1185 | 1186 | # # Hash for faster results 1187 | # stops_h3_hash = stops.set_index('stop_index').to_dict()['hex'] 1188 | # h3_stops_hash = stops.set_index('hex').to_dict()['stop_index'] 1189 | 1190 | # Stops in Hexbins 1191 | h3_stops = stops_.groupby("hex").stop_index.apply(list) 1192 | h3_geos = utm_stops.groupby("hex").geometry.apply(list) 1193 | 1194 | # Neighbors of each unique hex 1195 | h3_neighbors = {hex: grid_ring(hex, k=1) for hex in stops_.hex.unique()} 1196 | 1197 | st = time() 1198 | 1199 | stops_comb = [] 1200 | distances = [] 1201 | 1202 | logging.info("Looking for stop distances") 1203 | 1204 | for hex, h3_group in h3_neighbors.items(): 1205 | s_index = h3_stops[h3_stops.index.isin(h3_group)].values 1206 | s_geos = h3_geos[h3_geos.index.isin(h3_group)].values 1207 | 1208 | stops_list = list(chain.from_iterable(s_index)) 1209 | geo_list = list(chain.from_iterable(s_geos)) 1210 | geo_perm = list(permutations(geo_list, 2)) 1211 | 1212 | stops_comb.extend(list(permutations(stops_list, 2))) 1213 | distances.extend([distance(pair[0], pair[1]) for pair in geo_perm]) 1214 | 1215 | # Make dataframe 1216 | dist_df = pd.DataFrame(data=stops_comb, columns=["stop_index_1", "stop_index_2"]) 1217 | dist_df["distance_m"] = distances 1218 |
dist_df.drop_duplicates(subset=["stop_index_1", "stop_index_2"], inplace=True) 1219 | 1220 | et = time() 1221 | # print(f'Calculating distances took {et-st} seconds') 1222 | logging.info(f"Calculating distances took {et-st} seconds") 1223 | logging.info("Calculate walking times") 1224 | 1225 | # Calculate walking times 1226 | # Assume an average walking speed of 4 km/h = 1.11 m/s, as the literature suggests 1227 | walking_speed_ms = 4 / 3.6 1228 | dist_df["connection_time_min"] = dist_df.distance_m / walking_speed_ms / 60 1229 | 1230 | # Add stop_id to distance matrix 1231 | dist_df["stop_id_1"] = dist_df.stop_index_1.apply(lambda x: stops_hash[x]) 1232 | dist_df["stop_id_2"] = dist_df.stop_index_2.apply(lambda x: stops_hash[x]) 1233 | 1234 | return dist_df 1235 | 1236 | 1237 | def extract_file(file, feed): 1238 | data_types = {"shape_id": str, "stop_id": str, "route_id": str, "trip_id": str} 1239 | 1240 | files = feed.files 1241 | gtfs_path = feed.gtfs_path 1242 | 1243 | # check if the zip file came from a zipped folder 1244 | if len(files[0].split("/")) == 1: 1245 | file_path = f"{file}.txt" 1246 | else: 1247 | file_path = f"{files[0].split('/')[0]}/{file}.txt" 1248 | 1249 | # Try as a local file 1250 | try: 1251 | if file_path in files: 1252 | with ZipFile(gtfs_path) as myzip: 1253 | logging.info(f'Reading "{file}.txt".') 1254 | myzip.extract(file_path, path="/tmp") 1255 | data = pd.read_csv(f"/tmp/{file_path}", dtype=data_types) 1256 | 1257 | os.remove(f"/tmp/{file_path}") 1258 | return data 1259 | else: 1260 | return logging.info(f'File "{file}.txt" not found.') 1261 | 1262 | # Try as a URL 1263 | except (FileNotFoundError, OSError) as e: 1264 | logging.error(e) 1265 | if f"{file}.txt" in files: 1266 | r = requests.get(gtfs_path) 1267 | with ZipFile(io.BytesIO(r.content)) as myzip: 1268 | logging.info(f'Reading "{file}.txt".') 1269 | myzip.extract(f"{file_path}", path="/tmp") 1270 | data = pd.read_csv(f"/tmp/{file_path}", dtype=data_types) 1271 | 1272
| os.remove(f"/tmp/{file_path}") 1273 | return data 1274 | else: 1275 | return logging.info(f'File "{file}.txt" not found.') 1276 | -------------------------------------------------------------------------------- /gtfs_functions/gtfs_plots.py: -------------------------------------------------------------------------------- 1 | import branca 2 | import pandas as pd 3 | import os 4 | import plotly.express as px 5 | import jenkspy 6 | import folium 7 | import logging 8 | 9 | import warnings 10 | 11 | warnings.filterwarnings("ignore") 12 | 13 | 14 | def map_gdf( 15 | gdf, 16 | variable="min_per_trip", 17 | colors=["#d13870", "#e895b3", "#55d992", "#3ab071", "#0e8955", "#066a40"], 18 | tooltip_var=["min_per_trip"], 19 | tooltip_labels=["Headway: "], 20 | breaks=[], 21 | ): 22 | 23 | gdf.reset_index(inplace=True, drop=True) 24 | # Look for the center of the map 25 | minx, miny, maxx, maxy = gdf.geometry.total_bounds 26 | 27 | centroid_lat = miny + (maxy - miny) / 2 28 | centroid_lon = minx + (maxx - minx) / 2 29 | 30 | if isinstance(gdf[variable].values[0], str): 31 | categorical = True 32 | else: 33 | categorical = False 34 | 35 | # Calculate the breaks if they were not specified 36 | if (breaks == []) & (not categorical): 37 | breaks = jenkspy.jenks_breaks(gdf[variable], n_classes=len(colors)) 38 | breaks = [int(b) for b in breaks] 39 | 40 | m = folium.Map(location=[centroid_lat, centroid_lon], tiles="cartodbpositron", zoom_start=12) 41 | 42 | # If the variable is categorical 43 | if categorical: 44 | gdf["radius"] = 5 45 | 46 | # We start with Remix Lightrail colors 47 | # and then add default colors from Plotly 48 | qualitative_palette = [ 49 | "#0066a1", 50 | "#a92023", 51 | "#066a40", 52 | "#e89b01", 53 | "#613fa6", 54 | "#024b50", 55 | "#a72051", 56 | "#a72f00", 57 | "#476800", 58 | ] 59 | 60 | color_palette = ( 61 | qualitative_palette 62 | + px.colors.qualitative.Pastel 63 | + px.colors.qualitative.Prism 64 | + px.colors.qualitative.Vivid 65 | + 
px.colors.qualitative.Light24 66 | ) 67 | 68 | fill_color = pd.DataFrame( 69 | dict( 70 | variable=gdf[variable].unique(), 71 | fill_color=color_palette[0 : len(gdf[variable].unique())], 72 | ) 73 | ) 74 | 75 | gdf = pd.merge(gdf, fill_color, left_on=variable, right_on=variable, how="left") 76 | 77 | # If the variable is numerical 78 | else: 79 | gdf["radius"] = gdf[variable] / gdf[variable].max() * 10 80 | index = [int(b) for b in breaks] 81 | colorscale = branca.colormap.StepColormap(colors, index=index, caption=variable) 82 | gdf["fill_color"] = gdf[variable].apply(lambda x: colorscale(x)) 83 | 84 | if gdf.geom_type.values[0] == "Point": 85 | # my code for circles 86 | # Create the circles 87 | for i in range(int(len(gdf))): 88 | folium.CircleMarker( 89 | location=[gdf.loc[i, "geometry"].y, gdf.loc[i, "geometry"].x], 90 | radius=float(gdf.loc[i, "radius"]), 91 | tooltip=tooltip_labels[0] + str(gdf.loc[i, tooltip_var[0]]) + " min", 92 | color="#ffffff00", 93 | fill=True, 94 | fill_opacity=0.7, 95 | fill_color=str(gdf.loc[i, "fill_color"]), 96 | ).add_to(m) 97 | else: 98 | # Styling function for LineStrings 99 | def style_function(feature): 100 | return { 101 | "fillOpacity": 0.5, 102 | "weight": 3, # math.log2(feature['properties']['speed'])*2, 103 | "color": feature["properties"]["fill_color"], 104 | } 105 | 106 | # my code for lines 107 | geo_data = gdf.__geo_interface__ 108 | folium.GeoJson( 109 | geo_data, 110 | style_function=style_function, 111 | tooltip=folium.features.GeoJsonTooltip( 112 | fields=tooltip_var, aliases=tooltip_labels, labels=True, sticky=False 113 | ), 114 | ).add_to(m) 115 | 116 | return m 117 | -------------------------------------------------------------------------------- /images/bus_segments.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/bus_segments.jpg 
-------------------------------------------------------------------------------- /images/fancy_speed_per_hour.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/fancy_speed_per_hour.jpg -------------------------------------------------------------------------------- /images/heatmap.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/heatmap.jpg -------------------------------------------------------------------------------- /images/histogram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/histogram.jpg -------------------------------------------------------------------------------- /images/kepler_seg_freq.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/kepler_seg_freq.jpg -------------------------------------------------------------------------------- /images/kepler_speeds.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/kepler_speeds.jpg -------------------------------------------------------------------------------- /images/line_frequencies.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/line_frequencies.jpg -------------------------------------------------------------------------------- /images/map_line_freq.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/map_line_freq.jpg -------------------------------------------------------------------------------- /images/map_stop_freq.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/map_stop_freq.jpg -------------------------------------------------------------------------------- /images/routes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/routes.jpg -------------------------------------------------------------------------------- /images/shapes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/shapes.jpg -------------------------------------------------------------------------------- /images/speed_hour.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/speed_hour.jpg -------------------------------------------------------------------------------- /images/stop_times.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/stop_times.jpg -------------------------------------------------------------------------------- /images/stops.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/stops.jpg -------------------------------------------------------------------------------- /images/stops_freq_output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/stops_freq_output.jpg -------------------------------------------------------------------------------- /images/trips.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bondify/gtfs_functions/9cb098517641b1c8b2dfb73c8e961c8735085a2f/images/trips.jpg -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=43.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.isort] 6 | profile = "black" 7 | line_length = 120 8 | multi_line_output = 3 9 | include_trailing_comma = true 10 | filter_files = true 11 | skip = ".venv,.mypy_cache" 12 | 13 | [tool.black] 14 | line-length = 120 15 | exclude = "(\\.git|\\.mypy_cache|\\.venv|_build|buck-out|build|dist)" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = 2.7.0 3 | 4 | [flake8] 5 | select = B,B9,C,DAR,E,F,N,RST,S,W 6 | extend-ignore = E704,W503,E203,C901 7 | max-line-length = 120 8 | max-complexity = 10 9 | docstring-convention = google 10 | per-file-ignores = tests/*:S101 11 | rst-roles = class,const,func,meth,mod,ref 12 | rst-directives = deprecated 13 | exclude = 14 | notebooks 15 | ./notebooks 16 | notebooks/* 17 | ./notebooks/* 18 | .venv 19 | gtfs_functions/__init__.py 20 | data/* 21 | 22 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="gtfs-functions", 5 | description="Package to easily wrangle GTFS files geospatially.", 6 | project_urls={ 7 | "Source": "https://github.com/Bondify/gtfs_functions/tree/master", 8 | "Tracker": "https://github.com/Bondify/gtfs_functions/issues", 9 | }, 10 | author="Santiago Toso", 11 | author_email="santiagoa.toso@gmail.com", 12 | packages=find_packages(where="gtfs_functions"), 13 | package_dir={"gtfs_functions": "gtfs_functions"}, 14 | python_requires=">=3.8, <4", 15 | install_requires=[ 16 | # Data wrangling 17 | "pandas", 18 | "numpy", 19 | "pendulum>=3.0.0", 20 | # Geo 21 | "geopandas", 22 | "shapely", 23 | "utm>=0.7.0", 24 | "h3>3.7.7", 25 | "haversine", 26 | # Plotting 27 | "branca>=0.6.0", 28 | "plotly>=5.13.0", 29 | "jenkspy>=0.3.2", 30 | "folium>=0.14.0", 31 | "unicode>=2.9", 32 | ], 33 | classifiers=[ 34 | "Development Status :: 5 - Production/Stable", 35 | "Intended Audience :: Developers", 36 | "License :: OSI Approved :: MIT License", 37 | "Operating System :: OS Independent", 38 | ], 39 | keywords="gtfs", 40 | ) 41 | --------------------------------------------------------------------------------
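As a standalone sketch of the walking-connection-time step in `get_distance_between_stops` (time = distance / speed, converted to minutes): the stop IDs and distances below are made up for illustration and are not part of the package.

```python
import pandas as pd

# Hypothetical stop pairs with straight-line distances in meters,
# mirroring the distance matrix built by get_distance_between_stops.
dist_df = pd.DataFrame(
    {
        "stop_id_1": ["A", "A", "B"],
        "stop_id_2": ["B", "C", "C"],
        "distance_m": [100.0, 250.0, 400.0],
    }
)

# Average walking speed of 4 km/h = 1.11 m/s, as in the package
walking_speed_ms = 4 / 3.6

# time [s] = distance [m] / speed [m/s]; divide by 60 for minutes
dist_df["connection_time_min"] = dist_df.distance_m / walking_speed_ms / 60

print(dist_df)  # e.g. 100 m -> 1.5 min of walking
```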