├── README.md
├── plot_delays.py
├── plot_travel_time.py
├── plot_trips.py
├── plot_wait_time.py
├── run.py
├── time_between_arrivals.png
├── time_to_arrival_by_line.png
├── time_to_arrival_by_time_of_day.png
├── time_to_arrival_percentiles.png
├── time_to_next_arrival.png
├── travel_time_delay.png
├── trips.png
└── worst_stations.py


/README.md:
--------------------------------------------------------------------------------
1 | ![Cool plot](https://raw.github.com/erikbern/mta/master/time_between_arrivals.png)
2 | ![Cool plot](https://raw.github.com/erikbern/mta/master/time_to_next_arrival.png)
3 | ![Cool plot](https://raw.github.com/erikbern/mta/master/time_to_arrival_by_time_of_day.png)
4 | ![Cool plot](https://raw.github.com/erikbern/mta/master/time_to_arrival_percentiles.png)
5 | ![Cool plot](https://raw.github.com/erikbern/mta/master/travel_time_delay.png)
6 | 


--------------------------------------------------------------------------------
/plot_delays.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import datetime
  3 | import seaborn
  4 | import numpy
  5 | import math
  6 | import pandas
  7 | from matplotlib import pyplot
  8 | import matplotlib
  9 | 
 10 | matplotlib.rcParams.update({'font.size': 48})
 11 | 
 12 | stations = {}
 13 | for n_lines, line in enumerate(open('log.jsons')):
 14 |     for vehicle in json.loads(line.strip()):
 15 |         if vehicle.get('current_status') != 1: # STOPPED_AT
 16 |             continue
 17 |         try:
 18 |             line = vehicle['trip']['route_id'].rstrip('X') # fold express into normal
 19 |             if line not in ['1', '2', '3', '4', '5', '6', 'GS', 'L', 'SI']:
 20 |                 print 'weird line', line
 21 |                 continue
 22 |             if 'stop_id' in vehicle:
 23 |                 stop = vehicle['stop_id']
 24 |             else:
 25 |                 # L and SI stop at every station, need to use 
 26 |                 stop = '%d%s' % (vehicle['current_stop_sequence'], vehicle['trip']['trip_id'][-1])
 27 |             key = (line, stop)
 28 |             timestamp = vehicle['timestamp'] # datetime.datetime.utcfromtimestamp(vehicle['timestamp'])
 29 |             stations.setdefault(key, set()).add(timestamp)
 30 |         except:
 31 |             print 'weird vehicle', vehicle
 32 |             continue
 33 | 
 34 |     #if n_lines >= 10000:
 35 |     #    break
 36 | 
 37 | pyplot.figure(figsize=(10, 10))
 38 | 
 39 | # Look at all intervals between subway arrivals
 40 | def next_whole_minute(t):
 41 |     return t+59 - (t+59)%60
 42 | 
 43 | deltas = []
 44 | next_subway = []
 45 | next_subway_by_time_of_day = [[] for x in xrange(24 * 60)]
 46 | next_subway_by_line_ts = []
 47 | next_subway_by_line_ls = []
 48 | next_subway_rush_hour = []
 49 | max_limit = 4 * 3600 # cap max value so that Seaborn's KDE works better
 50 | for key, values in stations.iteritems():
 51 |     line, stop = key
 52 |     values = sorted(values)
 53 |     print key, len(values)
 54 |     last_value = None
 55 |     for i in xrange(1, len(values)):
 56 |         last_value, value = values[i-1], values[i]
 57 |         if value - last_value >= max_limit:
 58 |             continue
 59 |         deltas.append(1. / 60 * (value - last_value))
 60 |         for t in xrange(next_whole_minute(last_value), value, 60):
 61 |             x = (t // 60 + 19 * 60) % (24 * 60) # 19 from UTC offset
 62 |             waiting_time = 1. / 60 * (value - t)
 63 |             next_subway_by_time_of_day[x].append(waiting_time)
 64 |             next_subway.append(waiting_time)
 65 |             next_subway_by_line_ts.append(waiting_time)
 66 |             next_subway_by_line_ls.append(line)
 67 |             if x >= 7 * 60 and x < 19 * 60:
 68 |                 next_subway_rush_hour.append(waiting_time)
 69 | 
 70 | # Plot distributions of deltas
 71 | for data, fn, title, color in [(deltas, 'time_between_arrivals.png', 'Distribution of delays between subway arrivals', 'blue'),
 72 |                                (next_subway, 'time_to_next_arrival.png', 'Distribution of time until the next subway arrival', 'red')]:
 73 |     print 'got', len(data), 'points'
 74 |     pyplot.clf()
 75 |     lm = seaborn.distplot(data, bins=numpy.linspace(0, 60, num=61), color=color, kde_kws={'gridsize': 2000})
 76 |     pyplot.xlim([-1, 40])
 77 |     pyplot.title(title)
 78 |     pyplot.xlabel('Time (min)')
 79 |     pyplot.ylabel('Probability distribution')
 80 |     pyplot.savefig(fn)
 81 |     print 'mean', 60*numpy.mean(data), 'median', 60*numpy.median(data)
 82 | 
 83 | # Plot deltas by line
 84 | pyplot.clf()
 85 | seaborn.violinplot(orient='h',
 86 |                    x=next_subway_by_line_ts,
 87 |                    y=next_subway_by_line_ls,
 88 |                    order=['1', '2', '3', '4', '5', '6', 'GS', 'L', 'SI'],
 89 |                    scale='width',
 90 |                    palette=['#EE352E']*3 + ['#00933C']*3 + ['#808183', '#A7A9AC', '#555555'],
 91 |                    bw=0.03, cut=0, gridsize=2000)
 92 | pyplot.xlim([-1, 40])
 93 | pyplot.title('Time until the next subway')
 94 | pyplot.xlabel('Time (min)')
 95 | pyplot.ylabel('Line')
 96 | pyplot.savefig('time_to_arrival_by_line.png')
 97 | 
 98 | # Plot distribution of delays by time of day
 99 | percs = [50, 60, 70, 80, 90]
100 | results = [[] for perc in percs]
101 | xs = range(0, 24 * 60)
102 | for x, next_subway_slice in enumerate(next_subway_by_time_of_day):
103 |     print x, len(next_subway_slice), '...'
104 |     rs = numpy.percentile(next_subway_slice, percs)
105 |     for i, r in enumerate(rs):
106 |         results[i].append(r)
107 | 
108 | pyplot.clf()
109 | for i, result in enumerate(results):
110 |     pyplot.plot([x * 1.0 / 60 for x in xs], result, label='%d percentile' % percs[i])
111 | pyplot.ylim([0, 60])
112 | pyplot.xlim([0, 24])
113 | pyplot.title('How long do you have to wait given time of day')
114 | pyplot.xlabel('Time of day (h)')
115 | pyplot.ylabel('Time until subway arrives (min))')
116 | pyplot.legend()
117 | pyplot.savefig('time_to_arrival_by_time_of_day.png')
118 | 
# Compute all percentiles
# For every "already waited" offset, keep the samples that lasted at least
# that long and take percentiles of the time still remaining.
results = [[] for perc in percs]
offsets = numpy.arange(0, 40, 0.1)
# Convert once so each offset is a vectorised filter instead of a Python loop.
rush_hour_waits = numpy.asarray(next_subway_rush_hour, dtype=float)
for offset in offsets:
    print('%s ...' % offset)
    remaining = rush_hour_waits[rush_hour_waits >= offset] - offset
    if remaining.size:
        rs = numpy.percentile(remaining, percs)
    else:
        # Nobody waited this long: leave a gap in the curve.
        rs = [numpy.nan] * len(percs)
    for i, r in enumerate(rs):
        results[i].append(r)

pyplot.clf()
for i, result in enumerate(results):
    pyplot.plot(offsets, result, label='%d percentile' % percs[i])
pyplot.ylim([0, 60])
pyplot.title('How long do you have to wait given that you already waited?')
pyplot.xlabel('Time you have waited for the subway (min)')
pyplot.ylabel('Additional time until subway arrives (min)')
pyplot.legend()
pyplot.savefig('time_to_arrival_percentiles.png')
137 | 


--------------------------------------------------------------------------------
/plot_travel_time.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import random
 3 | import seaborn
 4 | import numpy
 5 | import scipy
 6 | from matplotlib import pyplot
 7 | 
 8 | def parse_time(t):
 9 |     h, m, s = map(int, t.split(':'))
10 |     return h*60*60 + m*60 + s
11 | 
# sched_trips[short_trip_id][full_trip_id][stop_id] -> scheduled arrival, in
# seconds as returned by parse_time. The short id is everything after the
# first underscore of the full trip id.
sched_trips = {}
with open('stop_times.txt') as stop_times_file:
    next(stop_times_file, None)  # skip the header row
    for row in stop_times_file:
        trip_id, arr, dep, stop_id = row.strip().split(',')[:4]
        trip_id_short = trip_id.split('_', 1)[1]
        sched_trips.setdefault(trip_id_short, {}).setdefault(trip_id, {})[stop_id] = parse_time(arr)
19 | 
# Maps (start_date, trip_id) -> list of (timestamp, stop) in log order, one
# entry per record in which the vehicle was reported as stopped.
real_trips = {}
with open('log.jsons') as log_file:
    for n_lines, raw in enumerate(log_file):
        for vehicle in json.loads(raw.strip()):
            if vehicle.get('current_status') != 1:  # STOPPED_AT
                continue
            try:
                line = vehicle['trip']['route_id'].rstrip('X')  # fold express into normal
                if line not in ['1', '2', '3', '4', '5', '6', 'GS', 'L', 'SI']:
                    print('weird line %s' % line)
                    continue
                if 'stop_id' in vehicle:
                    stop = vehicle['stop_id']
                else:
                    # No stop_id on this record (seen for L and SI): build a
                    # stand-in id from the stop sequence number and the last
                    # character of the trip id.
                    stop = '%d%s' % (vehicle['current_stop_sequence'], vehicle['trip']['trip_id'][-1])
                key = (vehicle['trip']['start_date'], vehicle['trip']['trip_id'])
                timestamp = vehicle['timestamp']
            except (KeyError, TypeError, IndexError, AttributeError):
                # Malformed record: report it and move on, but let
                # KeyboardInterrupt and real bugs propagate.
                print('weird vehicle %s' % (vehicle,))
                continue
            real_trips.setdefault(key, []).append((timestamp, stop))
41 | 
42 | xs = []
43 | ys = []
44 | 
45 | for key, stops in real_trips.iteritems():
46 |     _, trip_id = key
47 |     if trip_id not in sched_trips:
48 |         print 'unknown trip', trip_id
49 |         continue
50 | 
51 |     stops_compressed = []
52 |     last_stop = None
53 |     for t, stop in stops:
54 |         if stop != last_stop:
55 |             stops_compressed.append((t, stop))
56 |         last_stop = stop
57 | 
58 |     for t0, stop0 in stops_compressed:
59 |         sched = random.choice(sched_trips[trip_id].values())
60 |         if stop0 not in sched:
61 |             print 'weird stop for schedule', stop0
62 |             continue
63 |         t1, stop1 = random.choice(stops_compressed)
64 |         if stop1 not in sched:
65 |             print 'weird stop for schedule', stop1
66 |             continue
67 |         if stop0 == stop1:
68 |             continue
69 |         elif t0 > t1:
70 |             t0, t1, stop0, stop1 = t1, t0, stop1, stop0
71 |         xs.append(sched[stop1] - sched[stop0]) # timetable
72 |         ys.append(t1 - t0) # actual
73 | 
def _format_duration(seconds):
    """Render a duration in seconds as ``[-]MmSSs``, truncating to whole seconds."""
    sign = '-' if seconds < 0 else ''
    whole = int(abs(seconds))
    return '%s%dm%02ds' % (sign, whole // 60, whole % 60)


# Absolute delay in seconds: observed travel time minus timetable travel time.
delays = (numpy.array(ys) - numpy.array(xs))
xmin, xmax = -10, 30
# One histogram bin per minute between xmin and xmax.
seaborn.distplot(delays / 60., bins=numpy.linspace(xmin, xmax, num=(xmax-xmin+1)), kde_kws={'gridsize': 2000})
mean, median, pc90 = (_format_duration(t) for t in (numpy.mean(delays), numpy.median(delays), numpy.percentile(delays, 90)))
print('%s %s %s' % (mean, median, pc90))
pyplot.title('Travel time delays (mean = %s, median = %s, 90th percentile = %s)' % (mean, median, pc90))
pyplot.xlabel('Delay between real and scheduled (min)')
pyplot.ylabel('Probability distribution')
pyplot.xlim([xmin, xmax])
pyplot.savefig('travel_time_delay.png')
84 | 
pyplot.clf()
# Relative delay: observed / scheduled - 1.
# Convert to float first: xs and ys hold integers, and dividing integer
# arrays under Python 2 floors the result.
actual = numpy.array(ys, dtype=float)
scheduled = numpy.array(xs, dtype=float)
# A ratio is only meaningful against a positive scheduled duration; this also
# avoids dividing by zero.
usable = scheduled > 0
delays_frac = actual[usable] / scheduled[usable] - 1.0
delays_pct = delays_frac * 100
xmin, xmax = -1.0, 50
seaborn.distplot(delays_pct, kde_kws={'gridsize': 2000})
# Report the same percent values that are plotted.
mean, median, pc90 = ('%.2f%%' % x for x in (numpy.mean(delays_pct), numpy.median(delays_pct), numpy.percentile(delays_pct, 90)))
print('%s %s %s' % (mean, median, pc90))
pyplot.title('Travel time percent delays (mean = %s, median = %s, 90th percentile = %s)' % (mean, median, pc90))
pyplot.xlabel('Delay between real and scheduled (%)')
pyplot.ylabel('Probability distribution')
pyplot.xlim([xmin, xmax])
pyplot.savefig('travel_time_delay_frac.png')
96 | 


--------------------------------------------------------------------------------
/plot_trips.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import json
 3 | import matplotlib.pyplot as plt
 4 | import pytz
 5 | 
 6 | # nyc_tz = pytz.timezone('US/Eastern')
 7 | 
 8 | stops = {}
 9 | for line in open('stops.txt'):
10 |     stop_id, _, stop = line.strip().split(',')[:3]
11 |     if stop_id[-1] == 'N':
12 |         try:
13 |             stops[int(stop_id[:-1])] = stop
14 |         except:
15 |             print 'could not parse', stop_id, stop
16 | 
17 | print stops
18 | 
def datetime2timestamp(d):
    """Return the seconds elapsed between 1970-01-01 00:00:00 and naive datetime ``d``.

    No timezone conversion is applied; ``d`` is subtracted from the epoch as is.
    """
    epoch = datetime.datetime(1970, 1, 1)
    elapsed = d - epoch
    return elapsed.total_seconds()
21 | 
22 | 
def read_data(date_a, date_b, margin):
    """Collect stop observations for routes starting with '1' from ``log.jsons``.

    Args:
        date_a: start of the window, a naive datetime converted with
            ``datetime2timestamp``.
        date_b: end of the window, same convention as ``date_a``.
        margin: extra seconds kept on both sides of the window.

    Returns:
        A dict mapping ``(start_date, trip_id)`` to ``{timestamp: stop_id}``
        for every stopped-at record inside the widened window. Reading stops
        at the first matching record later than ``date_b + margin``, so the
        log is presumably expected to be roughly chronological. The dict is
        returned even when the log ends before that point.
    """
    date_a, date_b = (datetime2timestamp(d) for d in (date_a, date_b))
    trips = {}
    with open('log.jsons') as log_file:
        for raw in log_file:
            for vehicle in json.loads(raw.strip()):
                if vehicle.get('current_status') != 1:  # STOPPED_AT
                    continue

                if vehicle['trip']['route_id'][:1] != '1':
                    continue

                t = vehicle['timestamp']

                if t > date_b + margin:
                    # Past the window: everything of interest has been read.
                    return trips

                if t < date_a - margin:
                    continue

                stop = vehicle['stop_id']
                trip = (vehicle['trip']['start_date'], vehicle['trip']['trip_id'])
                trips.setdefault(trip, {})[t] = stop
    # The log ended inside the window: still hand back what was collected
    # instead of falling off the end and returning None.
    return trips
45 | 
date_a = datetime.datetime(2016, 3, 3, 14)
date_b = datetime.datetime(2016, 3, 3, 20)
margin = 0 # 3600 * 3
plt.figure(figsize=(24, 6))
# Every numeric stop id that ends up on the y axis.
unique_ys = set()
for trip, seq in read_data(date_a, date_b, margin).items():
    # Ignore trips with fewer than 10 observations.
    if len(seq) < 10:
        continue
    seq = sorted(seq.items())
    ts = []
    ys = []
    for t, s in seq:
        ts.append(datetime.datetime.utcfromtimestamp(t))
        ys.append(int(s[:-1]))  # drop the trailing character of the stop id
    unique_ys.update(ys)
    plt.plot(ts, ys, '#EE352E', lw=2.0) #, tz=nyc_tz)

# Label each y tick with the stop name instead of its numeric id.
tick_positions = sorted(unique_ys)
tick_labels = [stops[y] for y in tick_positions]
plt.yticks(tick_positions, tick_labels)
plt.xlim([date_a, date_b])
plt.tight_layout()
plt.savefig('trips.png')
63 | 
64 |         
65 | 


--------------------------------------------------------------------------------
/plot_wait_time.py:
--------------------------------------------------------------------------------
  1 | import bisect
  2 | import datetime
  3 | import json
  4 | import random
  5 | import seaborn
  6 | import numpy
  7 | import pandas
  8 | import scipy
  9 | from matplotlib import pyplot
 10 | 
 11 | def parse_time(t):
 12 |     h, m, s = map(int, t.split(':'))
 13 |     return h*60*60 + m*60 + s
 14 | 
 15 | sched_trips = {}
 16 | for i, line in enumerate(open('stop_times.txt')):
 17 |     line = line.strip().split(',')
 18 |     if i > 0:
 19 |         trip_id, arr, dep, stop_id  = line[:4]
 20 |         if 'WKD' not in trip_id:
 21 |             continue
 22 |         line = trip_id.split('_')[2].split('.')[0]
 23 |         key = (stop_id, line)
 24 |         arr = parse_time(arr)
 25 |         sched_trips.setdefault(key, []).append(arr)
 26 | 
 27 | for key, stops in sched_trips.iteritems():
 28 |     stops.sort()
 29 | 
 30 | real_trips = {}
 31 | for n_lines, line in enumerate(open('log.jsons')):
 32 |     for vehicle in json.loads(line.strip()):
 33 |         if vehicle.get('current_status') != 1: # STOPPED_AT
 34 |             continue
 35 |         try:
 36 |             line = vehicle['trip']['route_id'].rstrip('X') # fold express into normal
 37 |             if line not in ['1', '2', '3', '4', '5', '6', 'GS', 'L', 'SI']:
 38 |                 print 'weird line', line
 39 |                 continue
 40 |             if 'stop_id' in vehicle:
 41 |                 stop_id = vehicle['stop_id']
 42 |             else:
 43 |                 # L and SI stop at every station, need to use 
 44 |                 stop_id = '%d%s' % (vehicle['current_stop_sequence'], vehicle['trip']['trip_id'][-1])
 45 |             key = (stop_id, line)
 46 |             timestamp = vehicle['timestamp']
 47 |             t = datetime.datetime.utcfromtimestamp(vehicle['timestamp'])
 48 |             if t.weekday() < 5:
 49 |                 real_trips.setdefault(key, set()).add(timestamp)
 50 |         except:
 51 |             print 'weird vehicle', vehicle
 52 |             continue
 53 |     if n_lines % 1000 == 0:
 54 |         print n_lines, '...'
 55 | 
 56 | xs = []
 57 | ys = []
 58 | 
 59 | MAX = 1800
 60 | 
 61 | ys_by_x = [[] for x in xrange(MAX/60)]
 62 | print ys_by_x
 63 | 
 64 | max_time = 4 * 3600
 65 | 
 66 | for key, stops in real_trips.iteritems():
 67 |     stop_id, line = key
 68 | 
 69 |     stops = sorted(stops)
 70 |     if len(stops) < 5:
 71 |         print key, 'not enough stops'
 72 |         continue # stupid
 73 |     if key not in sched_trips:
 74 |         print key, 'has no schedule'
 75 |         continue
 76 | 
 77 |     # Sample random points in time and tie 
 78 |     lo = stops[0]
 79 |     hi = stops[-1]
 80 |     for i in xrange(len(stops)): # pretty arbitrary number of samples
 81 |         t = lo + random.random() * (hi - lo)
 82 |         j = bisect.bisect(stops, t)
 83 |         t0, t1 = stops[j-1], stops[j]
 84 |         if t1 - t0 > max_time:
 85 |             continue
 86 |         real_wait_time = t1 - t
 87 |         # transform t to day offset
 88 |         u = (t + (19 * 60 * 60)) % (24 * 60 * 60)
 89 |         j = bisect.bisect(sched_trips[key], u)
 90 |         if j < len(sched_trips[key]):
 91 |             u1 = sched_trips[key][j]
 92 |         else:
 93 |             u1 = 24 * 60 * 60 + sched_trips[key][0]
 94 |         sched_wait_time = u1 - u
 95 | 
 96 |         if max(sched_wait_time, real_wait_time) < MAX:
 97 |             xs.append(sched_wait_time / 60.)
 98 |             ys.append(real_wait_time / 60.)
 99 | 
100 |         if sched_wait_time < MAX:
101 |             ys_by_x[int(sched_wait_time / 60.0)].append(real_wait_time / 60.)
102 | 
# Joint distribution of scheduled vs. real waiting time (minutes).
seaborn.jointplot(numpy.array(xs), numpy.array(ys), kind='hex')
pyplot.savefig('wait_time_real_vs_sched_joint.png')

pyplot.clf()
percs = [50, 60, 70, 80, 90]
# results[i] holds one value per scheduled-wait minute for percentile percs[i].
results = [[] for p in percs]
for x, bucket in enumerate(ys_by_x):
    print('%d %d' % (x, len(bucket)))
    if bucket:
        ps = numpy.percentile(bucket, percs)
    else:
        # No samples for this scheduled wait: leave a gap in the curves
        # instead of asking numpy for percentiles of an empty list.
        ps = [numpy.nan] * len(percs)
    for i, y in enumerate(ps):
        results[i].append(y)

for i, curve in enumerate(results):
    pyplot.plot(range(len(curve)), curve, label='%d percentile' % percs[i])
pyplot.ylim([0, 60])
pyplot.title('How long do you have to wait given how much schedule predicts')
pyplot.xlabel('Scheduled waiting time (min)')
pyplot.ylabel('Real waiting time (min)')
pyplot.legend()
pyplot.savefig('wait_time_real_vs_sched_percentiles.png')
123 | 


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
 1 | from google.transit import gtfs_realtime_pb2
 2 | import urllib
 3 | import time
 4 | import traceback
 5 | from protobuf_to_dict import protobuf_to_dict
 6 | import itertools
 7 | import json
 8 | import random
 9 | import os
10 | 
# Poll the feeds round-robin forever and append every batch of vehicle
# positions to log.jsons as one JSON array per line.
feed_ids = [1, 2, 11]
# Read the key once, so a missing MTA_KEY fails immediately instead of being
# retried forever inside the loop.
api_key = os.environ['MTA_KEY']
for i in itertools.count():
    if i > 0:
        # Random pause of 2 to 7 seconds between requests.
        delay = 2.0 + 5 * random.random()
        print('sleeping %ss...' % delay)
        time.sleep(delay)

    feed_id = feed_ids[i % len(feed_ids)]
    try:
        feed = gtfs_realtime_pb2.FeedMessage()
        response = urllib.urlopen('http://datamine.mta.info/mta_esi.php?key=%s&feed_id=%d' % (api_key, feed_id))
        try:
            feed.ParseFromString(response.read())
        finally:
            response.close()
    except Exception:
        # Best effort: report the failure and try the next feed. Catching
        # Exception rather than everything keeps Ctrl-C working.
        traceback.print_exc()
        continue

    vehicles = [protobuf_to_dict(entity.vehicle) for entity in feed.entity if entity.HasField('vehicle')]
    print('got %d vehicles' % len(vehicles))

    with open('log.jsons', 'a') as f:
        json.dump(vehicles, f)
        f.write('\n')
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/time_between_arrivals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erikbern/mta/b518b3dab0fe616519f95e37161a87974ca79ff1/time_between_arrivals.png


--------------------------------------------------------------------------------
/time_to_arrival_by_line.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erikbern/mta/b518b3dab0fe616519f95e37161a87974ca79ff1/time_to_arrival_by_line.png


--------------------------------------------------------------------------------
/time_to_arrival_by_time_of_day.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erikbern/mta/b518b3dab0fe616519f95e37161a87974ca79ff1/time_to_arrival_by_time_of_day.png


--------------------------------------------------------------------------------
/time_to_arrival_percentiles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erikbern/mta/b518b3dab0fe616519f95e37161a87974ca79ff1/time_to_arrival_percentiles.png


--------------------------------------------------------------------------------
/time_to_next_arrival.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erikbern/mta/b518b3dab0fe616519f95e37161a87974ca79ff1/time_to_next_arrival.png


--------------------------------------------------------------------------------
/travel_time_delay.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erikbern/mta/b518b3dab0fe616519f95e37161a87974ca79ff1/travel_time_delay.png


--------------------------------------------------------------------------------
/trips.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erikbern/mta/b518b3dab0fe616519f95e37161a87974ca79ff1/trips.png


--------------------------------------------------------------------------------
/worst_stations.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import random
 3 | import bisect
 4 | import numpy
 5 | from matplotlib import pyplot
 6 | import datetime
 7 | 
 8 | def parse_time(t):
 9 |     h, m, s = map(int, t.split(':'))
10 |     return h*60*60 + m*60 + s
11 | 
12 | # Need to map stop sequence to stop id for L and SI
13 | #seq2stop = {}
14 | #for i, line in enumerate(open('stop_times.txt')):
15 | #    line = line.strip().split(',')
16 | #    if i > 0:
17 | #        trip_id, arr, dep, stop_id, stop_sequence  = line[:5]
18 | #        trip_id_short = trip_id.split('_', 1)[1][:11]
19 | #
20 | #        if 'WKD' not in trip_id:
21 | #            continue
22 | #        if 'L' not in trip_id_short and 'SI' not in trip_id_short:
23 | #            continue
24 | #
25 | #        k = (trip_id_short, stop_sequence)
26 | #        if k in seq2stop and seq2stop[k] != stop_id:
27 | #            print k, stop_id, seq2stop[k]
28 | #            raise
29 | #        seq2stop[k] = stop_id
30 | 
# Maps stop_id -> stop name, read from stops.txt.
stops = {}
with open('stops.txt') as stops_file:
    next(stops_file, None)  # skip the header row
    for row in stops_file:
        stop_id, _, stop_name = row.strip().split(',')[:3]
        if stops.get(stop_id, stop_name) != stop_name:
            # The same stop id appears with two different names: abort with
            # an explicit error (a bare `raise` has nothing to re-raise here).
            raise ValueError('conflicting names for stop %s: %r vs %r'
                             % (stop_id, stops[stop_id], stop_name))
        stops[stop_id] = stop_name

print(stops)
41 | 
# Maps (stop_id, line) -> set of timestamps at which a vehicle was reported
# as stopped there.
stations = {}
with open('log.jsons') as log_file:
    for n_lines, raw in enumerate(log_file):
        if n_lines % 1000 == 0:
            print('%d ...' % n_lines)
        try:
            vehicles = json.loads(raw.strip())
        except ValueError:
            # Row is not valid JSON; skip it.
            print('could not parse %s' % raw)
            continue
        for vehicle in vehicles:
            if vehicle.get('current_status') != 1:  # STOPPED_AT
                continue
            try:
                line = vehicle['trip']['route_id'].rstrip('X')  # fold express into normal
                if 'stop_id' not in vehicle:
                    continue
                timestamp = vehicle['timestamp']
            except (KeyError, TypeError, AttributeError):
                # Malformed record: report it and keep going. The original
                # handler re-raised first, which left this report unreachable.
                print('weird vehicle %s' % (vehicle,))
                continue
            stations.setdefault((vehicle['stop_id'], line), set()).add(timestamp)
64 | 
65 | delays = {}
66 | 
67 | for key, timestamps in stations.iteritems():
68 |     stop_id, line = key
69 |     timestamps = sorted(list(timestamps))
70 |     if len(timestamps) < 100:
71 |         continue
72 |     lo, hi = timestamps[0], timestamps[-1]
73 |     for i in xrange(1000):
74 |         t = lo + random.random() * (hi - lo)
75 |         j = bisect.bisect_left(timestamps, t)
76 |         t0, t1 = timestamps[j-1], timestamps[j]
77 |         if t1 - t0 < 4 * 3600:
78 |             delays.setdefault(key, []).append(t1 - t)
79 | 
80 | for k in sorted(delays.keys(), key=lambda k: numpy.median(delays[k])):
81 |     stop_id, line = k
82 |     print stop_id, line, len(stations[k]), numpy.median(delays[k]), stops[stop_id]
83 | 
84 | 


--------------------------------------------------------------------------------