# Context carried over from the flattened repository dump this script was
# embedded in:
#   files: README.md, plot_delays.py, plot_travel_time.py, plot_trips.py,
#          plot_wait_time.py, run.py, worst_stations.py and the rendered plots
#          time_between_arrivals.png, time_to_arrival_by_line.png,
#          time_to_arrival_by_time_of_day.png, time_to_arrival_percentiles.png,
#          time_to_next_arrival.png, travel_time_delay.png, trips.png
#   README.md embeds, from https://raw.github.com/erikbern/mta/master/:
#          time_between_arrivals.png, time_to_next_arrival.png,
#          time_to_arrival_by_time_of_day.png, time_to_arrival_percentiles.png,
#          travel_time_delay.png
"""plot_delays.py: plot how long you wait for the next subway.

Reads ``log.jsons`` (one JSON list of vehicle dicts per line), collects the
timestamps at which a vehicle was reported stopped at each (line, stop) and
writes these figures:

* time_between_arrivals.png / time_to_next_arrival.png
* time_to_arrival_by_line.png
* time_to_arrival_by_time_of_day.png
* time_to_arrival_percentiles.png

The module runs unchanged on Python 2 and Python 3; nothing happens at import
time, call ``main()`` (or run the file) to produce the plots.
"""
import json
import datetime
import math

import numpy
import pandas

STOPPED_AT = 1                      # current_status value kept by this script
ROUTES = ['1', '2', '3', '4', '5', '6', 'GS', 'L', 'SI']
MAX_GAP = 4 * 3600                  # cap max value so that Seaborn's KDE works better
UTC_OFFSET_MINUTES = 19 * 60        # 19 from UTC offset (fixed; DST is not modelled)
MINUTES_PER_DAY = 24 * 60
PERCENTILES = [50, 60, 70, 80, 90]


def next_whole_minute(t):
    """Round the timestamp *t* (seconds) up to the next multiple of 60."""
    return t + 59 - (t + 59) % 60


def safe_percentiles(samples, percs):
    """Return the *percs* percentiles of *samples* as a list of floats.

    An empty *samples* yields NaNs instead of asking numpy for the percentile
    of nothing, so a minute of the day without observations leaves a gap in
    the plot rather than aborting the script.
    """
    if len(samples) == 0:
        return [float('nan')] * len(percs)
    return [float(p) for p in numpy.percentile(samples, percs)]


def read_arrivals(lines):
    """Map (line, stop) -> set of timestamps of vehicles stopped there.

    *lines* is an iterable of log lines.  Lines that are not valid JSON and
    vehicles missing the expected fields are reported and skipped.
    """
    stations = {}
    for raw in lines:
        try:
            vehicles = json.loads(raw.strip())
        except ValueError:
            # e.g. a half-written last line while the poller is still running
            print('could not parse %s' % raw)
            continue
        for vehicle in vehicles:
            if vehicle.get('current_status') != STOPPED_AT:
                continue
            try:
                line = vehicle['trip']['route_id'].rstrip('X')  # fold express into normal
                if line not in ROUTES:
                    print('weird line %s' % line)
                    continue
                if 'stop_id' in vehicle:
                    stop = vehicle['stop_id']
                else:
                    # No stop_id on this record (the original notes this for
                    # L and SI): build a key from the stop sequence and the
                    # last character of the trip id.
                    stop = '%d%s' % (vehicle['current_stop_sequence'],
                                     vehicle['trip']['trip_id'][-1])
                timestamp = vehicle['timestamp']
            except (KeyError, IndexError, TypeError, AttributeError):
                print('weird vehicle %s' % (vehicle,))
                continue
            stations.setdefault((line, stop), set()).add(timestamp)
    return stations


def collect_waits(stations, max_limit=MAX_GAP):
    """Turn arrival timestamps into waiting-time samples (all in minutes).

    For every pair of consecutive arrivals closer than *max_limit* seconds,
    one sample is taken at each whole minute between them: the time left
    until the later arrival.

    Returns a dict with:
      deltas          gaps between consecutive arrivals
      next_subway     every waiting-time sample
      by_time_of_day  24*60 lists of samples, indexed by local minute of day
      by_line_ts / by_line_ls  parallel lists (sample, line) for the violin plot
      rush_hour       samples taken between 07:00 and 19:00 local time
    """
    deltas = []
    next_subway = []
    by_time_of_day = [[] for _ in range(MINUTES_PER_DAY)]
    by_line_ts = []
    by_line_ls = []
    rush_hour = []
    for key, values in stations.items():
        line, _stop = key
        values = sorted(values)
        print('%s %d' % (key, len(values)))
        for last_value, value in zip(values, values[1:]):
            gap = value - last_value
            if gap >= max_limit:
                continue
            deltas.append(gap / 60.0)
            for t in range(next_whole_minute(last_value), value, 60):
                x = (t // 60 + UTC_OFFSET_MINUTES) % MINUTES_PER_DAY
                waiting_time = (value - t) / 60.0
                by_time_of_day[x].append(waiting_time)
                next_subway.append(waiting_time)
                by_line_ts.append(waiting_time)
                by_line_ls.append(line)
                if 7 * 60 <= x < 19 * 60:
                    rush_hour.append(waiting_time)
    return {
        'deltas': deltas,
        'next_subway': next_subway,
        'by_time_of_day': by_time_of_day,
        'by_line_ts': by_line_ts,
        'by_line_ls': by_line_ls,
        'rush_hour': rush_hour,
    }


def main():
    """Read log.jsons and write the four families of plots."""
    # The plotting stack is imported here so the helpers above can be imported
    # without matplotlib/seaborn installed.
    import matplotlib
    import seaborn
    from matplotlib import pyplot

    matplotlib.rcParams.update({'font.size': 48})

    with open('log.jsons') as log:
        stations = read_arrivals(log)

    pyplot.figure(figsize=(10, 10))

    # Look at all intervals between subway arrivals
    waits = collect_waits(stations)

    # Plot distributions of deltas
    for data, fn, title, color in [
            (waits['deltas'], 'time_between_arrivals.png',
             'Distribution of delays between subway arrivals', 'blue'),
            (waits['next_subway'], 'time_to_next_arrival.png',
             'Distribution of time until the next subway arrival', 'red')]:
        print('got %d points' % len(data))
        pyplot.clf()
        seaborn.distplot(data, bins=numpy.linspace(0, 60, num=61), color=color,
                         kde_kws={'gridsize': 2000})
        pyplot.xlim([-1, 40])
        pyplot.title(title)
        pyplot.xlabel('Time (min)')
        pyplot.ylabel('Probability distribution')
        pyplot.savefig(fn)
        # data is in minutes, so these two numbers are seconds
        print('mean %s median %s' % (60 * numpy.mean(data), 60 * numpy.median(data)))

    # Plot deltas by line
    pyplot.clf()
    seaborn.violinplot(orient='h',
                       x=waits['by_line_ts'],
                       y=waits['by_line_ls'],
                       order=ROUTES,
                       scale='width',
                       palette=['#EE352E'] * 3 + ['#00933C'] * 3 + ['#808183', '#A7A9AC', '#555555'],
                       bw=0.03, cut=0, gridsize=2000)
    pyplot.xlim([-1, 40])
    pyplot.title('Time until the next subway')
    pyplot.xlabel('Time (min)')
    pyplot.ylabel('Line')
    pyplot.savefig('time_to_arrival_by_line.png')

    # Plot distribution of delays by time of day
    results = [[] for _ in PERCENTILES]
    xs = range(MINUTES_PER_DAY)
    for x, bucket in enumerate(waits['by_time_of_day']):
        print('%d %d ...' % (x, len(bucket)))
        for row, r in zip(results, safe_percentiles(bucket, PERCENTILES)):
            row.append(r)

    pyplot.clf()
    for perc, result in zip(PERCENTILES, results):
        pyplot.plot([x * 1.0 / 60 for x in xs], result, label='%d percentile' % perc)
    pyplot.ylim([0, 60])
    pyplot.xlim([0, 24])
    pyplot.title('How long do you have to wait given time of day')
    pyplot.xlabel('Time of day (h)')
    pyplot.ylabel('Time until subway arrives (min)')
    pyplot.legend()
    pyplot.savefig('time_to_arrival_by_time_of_day.png')

    # Compute all percentiles: remaining wait given the time already waited
    results = [[] for _ in PERCENTILES]
    offsets = numpy.arange(0, 40, 0.1)
    for offset in offsets:
        print('%s ...' % offset)
        remaining = [d - offset for d in waits['rush_hour'] if d >= offset]
        for row, r in zip(results, safe_percentiles(remaining, PERCENTILES)):
            row.append(r)

    pyplot.clf()
    for perc, result in zip(PERCENTILES, results):
        pyplot.plot(offsets, result, label='%d percentile' % perc)
    pyplot.ylim([0, 60])
    pyplot.title('How long do you have to wait given that you already waited?')
    pyplot.xlabel('Time you have waited for the subway (min)')
    pyplot.ylabel('Additional time until subway arrives (min)')
    pyplot.legend()
    pyplot.savefig('time_to_arrival_percentiles.png')


if __name__ == '__main__':
    main()
"""plot_travel_time.py: compare real travel times with the timetable.

Reads the timetable from ``stop_times.txt`` and the observed stops from
``log.jsons``, samples random pairs of stops on each observed trip and plots
the difference (and the relative difference) between the observed and the
scheduled travel time.

Runs on Python 2 and Python 3; call ``main()`` (or run the file) to plot.
"""
import json
import random

import numpy
import scipy

STOPPED_AT = 1  # current_status value kept by this script
ROUTES = ['1', '2', '3', '4', '5', '6', 'GS', 'L', 'SI']


def parse_time(t):
    """Convert an ``H:M:S`` string to a number of seconds."""
    h, m, s = map(int, t.split(':'))
    return h * 60 * 60 + m * 60 + s


def format_duration(seconds):
    """Format a (possibly negative) number of seconds as e.g. ``-1m05s``.

    The sign is handled separately: applying ``%d``/``%`` directly to a
    negative value renders -30 s as ``0m30s``.  Fractions of a second are
    truncated.
    """
    sign = '-' if seconds < 0 else ''
    total = int(abs(seconds))
    return '%s%dm%02ds' % (sign, total // 60, total % 60)


def load_schedule(lines):
    """Parse stop_times.txt rows into {short trip id: {trip id: {stop: arrival}}}.

    The first row is treated as the header.  The short trip id is the trip id
    with everything up to the first underscore removed; it is the key the
    observed trips are looked up by.
    """
    sched_trips = {}
    for i, raw in enumerate(lines):
        if i == 0 or not raw.strip():
            continue
        trip_id, arr, _dep, stop_id = raw.strip().split(',')[:4]
        trip_id_short = trip_id.split('_', 1)[1]
        sched_trips.setdefault(trip_id_short, {}).setdefault(trip_id, {})[stop_id] = parse_time(arr)
    return sched_trips


def read_real_trips(lines):
    """Map (start_date, trip_id) -> list of (timestamp, stop) in log order."""
    real_trips = {}
    for raw in lines:
        try:
            vehicles = json.loads(raw.strip())
        except ValueError:
            print('could not parse %s' % raw)
            continue
        for vehicle in vehicles:
            if vehicle.get('current_status') != STOPPED_AT:
                continue
            try:
                line = vehicle['trip']['route_id'].rstrip('X')  # fold express into normal
                if line not in ROUTES:
                    print('weird line %s' % line)
                    continue
                if 'stop_id' in vehicle:
                    stop = vehicle['stop_id']
                else:
                    # No stop_id on this record (the original notes this for
                    # L and SI): key on stop sequence + last char of trip id.
                    stop = '%d%s' % (vehicle['current_stop_sequence'],
                                     vehicle['trip']['trip_id'][-1])
                key = (vehicle['trip']['start_date'], vehicle['trip']['trip_id'])
                timestamp = vehicle['timestamp']
            except (KeyError, IndexError, TypeError, AttributeError):
                print('weird vehicle %s' % (vehicle,))
                continue
            real_trips.setdefault(key, []).append((timestamp, stop))
    return real_trips


def compress_stops(stops):
    """Drop consecutive repeats of the same stop, keeping the first sighting."""
    compressed = []
    last_stop = None
    for t, stop in stops:
        if stop != last_stop:
            compressed.append((t, stop))
        last_stop = stop
    return compressed


def sample_travel_times(real_trips, sched_trips, rng=random):
    """Sample (scheduled, actual) travel times between pairs of stops.

    For every observed stop of every known trip, a second stop of the same
    trip and one timetable variant are picked with *rng*; the pair is kept
    when both stops exist in that variant and differ.

    Returns two parallel lists ``(xs, ys)``: timetable seconds, real seconds.
    """
    xs = []
    ys = []
    for key, stops in real_trips.items():
        _, trip_id = key
        if trip_id not in sched_trips:
            print('unknown trip %s' % trip_id)
            continue

        stops_compressed = compress_stops(stops)
        # list(): dict views are not indexable on Python 3
        variants = list(sched_trips[trip_id].values())

        for t0, stop0 in stops_compressed:
            sched = rng.choice(variants)
            if stop0 not in sched:
                print('weird stop for schedule %s' % stop0)
                continue
            t1, stop1 = rng.choice(stops_compressed)
            if stop1 not in sched:
                print('weird stop for schedule %s' % stop1)
                continue
            if stop0 == stop1:
                continue
            if t0 > t1:
                t0, t1, stop0, stop1 = t1, t0, stop1, stop0
            xs.append(sched[stop1] - sched[stop0])  # timetable
            ys.append(t1 - t0)  # actual
    return xs, ys


def fractional_delays(actual, scheduled):
    """Return ``actual / scheduled - 1`` as floats, skipping scheduled == 0.

    Converting to float first avoids the integer division Python 2 performs
    on two integer arrays; pairs with a zero scheduled time are dropped
    because their ratio is undefined.
    """
    actual = numpy.asarray(actual, dtype=float)
    scheduled = numpy.asarray(scheduled, dtype=float)
    usable = scheduled != 0
    return actual[usable] / scheduled[usable] - 1.0


def main():
    """Read the inputs and write travel_time_delay(.png|_frac.png)."""
    # Imported here so the helpers above work without the plotting stack.
    import seaborn
    from matplotlib import pyplot

    with open('stop_times.txt') as f:
        sched_trips = load_schedule(f)
    with open('log.jsons') as f:
        real_trips = read_real_trips(f)

    xs, ys = sample_travel_times(real_trips, sched_trips)
    if not xs:
        print('no observed trip matched the schedule, nothing to plot')
        return

    delays = numpy.array(ys) - numpy.array(xs)
    xmin, xmax = -10, 30
    seaborn.distplot(delays / 60., bins=numpy.linspace(xmin, xmax, num=(xmax - xmin + 1)),
                     kde_kws={'gridsize': 2000})
    mean, median, pc90 = (format_duration(t) for t in
                          (numpy.mean(delays), numpy.median(delays), numpy.percentile(delays, 90)))
    print('%s %s %s' % (mean, median, pc90))
    pyplot.title('Travel time delays (mean = %s, median = %s, 90th percentile = %s)'
                 % (mean, median, pc90))
    pyplot.xlabel('Delay between real and scheduled (min)')
    pyplot.ylabel('Probability distribution')
    pyplot.xlim([xmin, xmax])
    pyplot.savefig('travel_time_delay.png')

    pyplot.clf()
    delays_frac = fractional_delays(ys, xs)
    if len(delays_frac) == 0:
        print('every sampled pair has a zero scheduled time, skipping the percent plot')
        return
    xmin, xmax = -1.0, 50
    seaborn.distplot(delays_frac * 100, kde_kws={'gridsize': 2000})
    # Scale to percent so the numbers match the '%' sign and the plotted axis.
    mean, median, pc90 = ('%.2f%%' % (100 * x) for x in
                          (numpy.mean(delays_frac), numpy.median(delays_frac),
                           numpy.percentile(delays_frac, 90)))
    print('%s %s %s' % (mean, median, pc90))
    pyplot.title('Travel time percent delays (mean = %s, median = %s, 90th percentile = %s)'
                 % (mean, median, pc90))
    pyplot.xlabel('Delay between real and scheduled (%)')
    pyplot.ylabel('Probability distribution')
    pyplot.xlim([xmin, xmax])
    pyplot.savefig('travel_time_delay_frac.png')


if __name__ == '__main__':
    main()
"""plot_trips.py: draw every observed 1-train trip as a line of stop vs. time.

Reads stop names from ``stops.txt`` and vehicle positions from ``log.jsons``
and writes ``trips.png``.  Runs on Python 2 and Python 3; call ``main()`` (or
run the file) to plot.
"""
import datetime
import json

STOPPED_AT = 1  # current_status value kept by this script


def load_stops(lines):
    """Map the numeric part of every ``...N`` stop id to its stop name.

    Only ids ending in ``N`` are used; ids whose remainder is not an integer
    are reported and skipped, as are rows with fewer than three columns.
    """
    stops = {}
    for raw in lines:
        fields = raw.strip().split(',')[:3]
        if len(fields) < 3:
            continue
        stop_id, _, stop = fields
        if stop_id.endswith('N'):
            try:
                stops[int(stop_id[:-1])] = stop
            except ValueError:
                print('could not parse %s %s' % (stop_id, stop))
    return stops


def datetime2timestamp(d):
    """Seconds between 1970-01-01 00:00:00 and the naive datetime *d*."""
    return (d - datetime.datetime(1970, 1, 1)).total_seconds()


def read_data(date_a, date_b, margin, path='log.jsons'):
    """Collect the stops of route-'1...' trips seen between two datetimes.

    date_a, date_b -- naive datetimes bounding the window
    margin         -- seconds by which the window is widened on both sides
    path           -- log file to read (one JSON list of vehicles per line)

    Returns {(start_date, trip_id): {timestamp: stop_id}}.

    Reading stops at the first vehicle later than ``date_b + margin``.
    NOTE(review): that shortcut assumes the log is in chronological order --
    confirm against how the log is written.
    """
    date_a, date_b = (datetime2timestamp(d) for d in (date_a, date_b))
    trips = {}
    with open(path) as log:
        for raw in log:
            for vehicle in json.loads(raw.strip()):
                if vehicle.get('current_status') != STOPPED_AT:
                    continue

                try:
                    if vehicle['trip']['route_id'][:1] != '1':
                        continue
                    t = vehicle['timestamp']
                except (KeyError, TypeError):
                    continue

                if t > date_b + margin:
                    return trips

                if t < date_a - margin:
                    continue

                try:
                    stop = vehicle['stop_id']
                    trip = (vehicle['trip']['start_date'], vehicle['trip']['trip_id'])
                except KeyError:
                    continue
                trips.setdefault(trip, {})[t] = stop
    # The log ended inside the window: still hand back what was collected
    # (falling off the end used to return None and crash the caller).
    return trips


def main():
    """Plot the trips observed on 2016-03-03 between 14:00 and 20:00."""
    # Imported here so the helpers above work without the plotting stack.
    import matplotlib.pyplot as plt
    import pytz  # noqa: F401 -- only needed if the tz= option below is re-enabled

    # nyc_tz = pytz.timezone('US/Eastern')

    with open('stops.txt') as f:
        stops = load_stops(f)
    print(stops)

    date_a, date_b = [datetime.datetime(2016, 3, 3, 14), datetime.datetime(2016, 3, 3, 20)]
    margin = 0  # 3600 * 3
    plt.figure(figsize=(24, 6))
    unique_ys = set()
    for trip, seq in read_data(date_a, date_b, margin).items():
        if len(seq) < 10:
            continue
        seq = sorted(seq.items())
        ts = [datetime.datetime.utcfromtimestamp(t) for t, _ in seq]
        ys = [int(s[:-1]) for _, s in seq]
        unique_ys.update(ys)
        plt.plot(ts, ys, '#EE352E', lw=2.0)  # , tz=nyc_tz)

    ticks = sorted(unique_ys)
    # Fall back to the number when stops.txt has no name for a stop.
    plt.yticks(ticks, [stops.get(y, str(y)) for y in ticks])
    plt.xlim([date_a, date_b])
    plt.tight_layout()
    plt.savefig('trips.png')


if __name__ == '__main__':
    main()
"""plot_wait_time.py: real waiting time vs. the wait the timetable predicts.

Reads weekday ('WKD') arrivals from ``stop_times.txt`` and observed arrivals
from ``log.jsons``, samples random instants at every (stop, line) and writes
``wait_time_real_vs_sched_joint.png`` and
``wait_time_real_vs_sched_percentiles.png``.

Runs on Python 2 and Python 3; call ``main()`` (or run the file) to plot.
"""
import bisect
import datetime
import json
import random

import numpy
import pandas
import scipy

STOPPED_AT = 1  # current_status value kept by this script
ROUTES = ['1', '2', '3', '4', '5', '6', 'GS', 'L', 'SI']
SECONDS_PER_DAY = 24 * 60 * 60
# Same fixed offset the original applied to time of day (+19 h == -5 h mod 24).
# DST is not modelled.
LOCAL_OFFSET = -5 * 60 * 60
MAX = 1800             # longest wait (s) shown in the plots
MAX_TIME = 4 * 3600    # gaps longer than this between arrivals are ignored
PERCENTILES = [50, 60, 70, 80, 90]
_EPOCH = datetime.datetime(1970, 1, 1)


def parse_time(t):
    """Convert an ``H:M:S`` string to a number of seconds."""
    h, m, s = map(int, t.split(':'))
    return h * 60 * 60 + m * 60 + s


def is_local_weekday(timestamp):
    """True when *timestamp* falls on Monday-Friday in local (UTC-5) time.

    The weekday is taken after applying the same offset that is used for the
    time of day, so a Friday evening is not classified by its UTC date.
    """
    local = _EPOCH + datetime.timedelta(seconds=timestamp + LOCAL_OFFSET)
    return local.weekday() < 5


def safe_percentiles(samples, percs):
    """Percentiles of *samples* as floats; NaNs when *samples* is empty."""
    if len(samples) == 0:
        return [float('nan')] * len(percs)
    return [float(p) for p in numpy.percentile(samples, percs)]


def load_schedule(lines):
    """Map (stop_id, line) -> sorted weekday arrival times in seconds of day.

    The first row is treated as the header and only trips whose id contains
    'WKD' are kept.  The line is the part of the third '_'-separated field of
    the trip id before its first '.'.  Arrival times are folded into
    [0, 86400) because GTFS writes times after midnight of the service day as
    24:00:00 and later, which would otherwise never match a time of day.
    """
    sched_trips = {}
    for i, raw in enumerate(lines):
        if i == 0 or not raw.strip():
            continue
        trip_id, arr, _dep, stop_id = raw.strip().split(',')[:4]
        if 'WKD' not in trip_id:
            continue
        line = trip_id.split('_')[2].split('.')[0]
        sched_trips.setdefault((stop_id, line), []).append(parse_time(arr) % SECONDS_PER_DAY)
    for arrivals in sched_trips.values():
        arrivals.sort()
    return sched_trips


def read_real_trips(lines):
    """Map (stop_id, line) -> set of weekday timestamps of stopped vehicles."""
    real_trips = {}
    for n_lines, raw in enumerate(lines):
        try:
            vehicles = json.loads(raw.strip())
        except ValueError:
            print('could not parse %s' % raw)
            vehicles = []
        for vehicle in vehicles:
            if vehicle.get('current_status') != STOPPED_AT:
                continue
            try:
                line = vehicle['trip']['route_id'].rstrip('X')  # fold express into normal
                if line not in ROUTES:
                    print('weird line %s' % line)
                    continue
                if 'stop_id' in vehicle:
                    stop_id = vehicle['stop_id']
                else:
                    # No stop_id on this record (the original notes this for
                    # L and SI): key on stop sequence + last char of trip id.
                    stop_id = '%d%s' % (vehicle['current_stop_sequence'],
                                        vehicle['trip']['trip_id'][-1])
                timestamp = vehicle['timestamp']
                weekday = is_local_weekday(timestamp)
            except (KeyError, IndexError, TypeError, AttributeError, ValueError, OverflowError):
                print('weird vehicle %s' % (vehicle,))
                continue
            if weekday:
                real_trips.setdefault((stop_id, line), set()).add(timestamp)
        if n_lines % 1000 == 0:
            print('%d ...' % n_lines)
    return real_trips


def scheduled_wait(arrivals, u):
    """Seconds from time of day *u* to the next arrival in sorted *arrivals*.

    When *u* is at or past the last arrival the first arrival of the next day
    is used.
    """
    j = bisect.bisect(arrivals, u)
    if j < len(arrivals):
        u1 = arrivals[j]
    else:
        u1 = SECONDS_PER_DAY + arrivals[0]
    return u1 - u


def sample_waits(real_trips, sched_trips, rng=random):
    """Sample scheduled and real waiting times at random instants.

    For each (stop, line) with at least 5 observed arrivals and a schedule,
    as many instants as there are arrivals are drawn uniformly between the
    first and the last arrival.

    Returns ``(xs, ys, ys_by_x)``: parallel lists of scheduled/real waits in
    minutes (both below MAX), and the real waits grouped by whole scheduled
    minute.
    """
    xs = []
    ys = []
    ys_by_x = [[] for _ in range(MAX // 60)]
    for key, stops in real_trips.items():
        stops = sorted(stops)
        if len(stops) < 5:
            print('%s not enough stops' % (key,))
            continue  # stupid
        if key not in sched_trips:
            print('%s has no schedule' % (key,))
            continue

        # Sample random points in time and tie
        lo = stops[0]
        hi = stops[-1]
        for _ in range(len(stops)):  # pretty arbitrary number of samples
            t = lo + rng.random() * (hi - lo)
            j = bisect.bisect(stops, t)
            if j >= len(stops):
                # float rounding can push t up to hi itself
                continue
            t0, t1 = stops[j - 1], stops[j]
            if t1 - t0 > MAX_TIME:
                continue
            real_wait_time = t1 - t
            # transform t to day offset
            u = (t + LOCAL_OFFSET) % SECONDS_PER_DAY
            sched_wait_time = scheduled_wait(sched_trips[key], u)

            if max(sched_wait_time, real_wait_time) < MAX:
                xs.append(sched_wait_time / 60.)
                ys.append(real_wait_time / 60.)

            if sched_wait_time < MAX:
                ys_by_x[int(sched_wait_time / 60.0)].append(real_wait_time / 60.)
    return xs, ys, ys_by_x


def main():
    """Read the inputs and write the joint and percentile plots."""
    # Imported here so the helpers above work without the plotting stack.
    import seaborn
    from matplotlib import pyplot

    with open('stop_times.txt') as f:
        sched_trips = load_schedule(f)
    with open('log.jsons') as f:
        real_trips = read_real_trips(f)

    xs, ys, ys_by_x = sample_waits(real_trips, sched_trips)

    seaborn.jointplot(x=numpy.array(xs), y=numpy.array(ys), kind='hex')
    pyplot.savefig('wait_time_real_vs_sched_joint.png')

    pyplot.clf()
    results = [[] for _ in PERCENTILES]
    for x, bucket in enumerate(ys_by_x):
        print('%d %d' % (x, len(bucket)))
        for row, y in zip(results, safe_percentiles(bucket, PERCENTILES)):
            row.append(y)

    for perc, row in zip(PERCENTILES, results):
        pyplot.plot(range(len(row)), row, label='%d percentile' % perc)
    pyplot.ylim([0, 60])
    pyplot.title('How long do you have to wait given how much schedule predicts')
    pyplot.xlabel('Scheduled waiting time (min)')
    pyplot.ylabel('Real waiting time (min)')
    pyplot.legend()
    pyplot.savefig('wait_time_real_vs_sched_percentiles.png')


if __name__ == '__main__':
    main()
# The repository dump this script was embedded in also lists these rendered
# plots, each as a link under
# https://raw.githubusercontent.com/erikbern/mta/b518b3dab0fe616519f95e37161a87974ca79ff1/ :
#   time_between_arrivals.png, time_to_arrival_by_line.png,
#   time_to_arrival_by_time_of_day.png, time_to_arrival_percentiles.png
"""run.py: poll the MTA realtime feeds forever and append to log.jsons.

Every iteration fetches one of the feeds in FEED_IDS (round robin), converts
the vehicle entities to dicts and appends them as one JSON line.  The API key
is read from the ``MTA_KEY`` environment variable.

Runs on Python 2 and Python 3; nothing happens at import time.
"""
import itertools
import json
import os
import random
import time
import traceback

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

FEED_IDS = [1, 2, 11]
FEED_URL = 'http://datamine.mta.info/mta_esi.php?key=%s&feed_id=%d'
LOG_PATH = 'log.jsons'


def feed_url(api_key, feed_id):
    """Return the URL of feed *feed_id* for *api_key*."""
    return FEED_URL % (api_key, feed_id)


def fetch_vehicles(url):
    """Download one feed and return its vehicle entities as a list of dicts."""
    # Imported here so the rest of the module is usable without the protobuf
    # bindings installed.
    from google.transit import gtfs_realtime_pb2
    from protobuf_to_dict import protobuf_to_dict

    feed = gtfs_realtime_pb2.FeedMessage()
    response = urlopen(url)
    try:
        feed.ParseFromString(response.read())
    finally:
        response.close()
    return [protobuf_to_dict(entity.vehicle)
            for entity in feed.entity if entity.HasField('vehicle')]


def append_vehicles(vehicles, path=LOG_PATH):
    """Append *vehicles* to *path* as a single JSON line."""
    with open(path, 'a') as f:
        json.dump(vehicles, f)
        f.write('\n')


def main():
    """Poll the feeds until interrupted."""
    # Read the key once: a missing MTA_KEY should stop the script instead of
    # being reported as a download failure on every iteration.
    api_key = os.environ['MTA_KEY']
    for i in itertools.count():
        if i > 0:
            delay = 2.0 + 5 * random.random()
            print('sleeping %ss...' % delay)
            time.sleep(delay)

        feed_id = FEED_IDS[i % len(FEED_IDS)]
        try:
            vehicles = fetch_vehicles(feed_url(api_key, feed_id))
        except Exception:
            # Best effort: log and try the next feed.  Catching Exception
            # (not everything) keeps Ctrl-C able to stop the poller.
            traceback.print_exc()
            continue

        print('got %d vehicles' % len(vehicles))
        append_vehicles(vehicles)


if __name__ == '__main__':
    main()
# The repository dump this script was embedded in also lists these rendered
# plots, each as a link under
# https://raw.githubusercontent.com/erikbern/mta/b518b3dab0fe616519f95e37161a87974ca79ff1/ :
#   time_to_next_arrival.png, travel_time_delay.png, trips.png
"""worst_stations.py: rank (stop, line) pairs by median wait for the next train.

Reads stop names from ``stops.txt`` and observed arrivals from ``log.jsons``,
samples random instants at every stop with enough data and prints the stops
ordered by the median time until the next arrival.

Runs on Python 2 and Python 3; call ``main()`` (or run the file) to print.
"""
import json
import random
import bisect
import datetime

import numpy

STOPPED_AT = 1  # current_status value kept by this script


def parse_time(t):
    """Convert an ``H:M:S`` string to a number of seconds."""
    h, m, s = map(int, t.split(':'))
    return h * 60 * 60 + m * 60 + s


# Need to map stop sequence to stop id for L and SI
#seq2stop = {}
#for i, line in enumerate(open('stop_times.txt')):
#    line = line.strip().split(',')
#    if i > 0:
#        trip_id, arr, dep, stop_id, stop_sequence = line[:5]
#        trip_id_short = trip_id.split('_', 1)[1][:11]
#
#        if 'WKD' not in trip_id:
#            continue
#        if 'L' not in trip_id_short and 'SI' not in trip_id_short:
#            continue
#
#        k = (trip_id_short, stop_sequence)
#        if k in seq2stop and seq2stop[k] != stop_id:
#            print k, stop_id, seq2stop[k]
#            raise
#        seq2stop[k] = stop_id


def load_stops(lines):
    """Map stop_id -> stop_name from stops.txt rows (first row is the header).

    Raises ValueError when the same stop id appears with two different names.
    """
    stops = {}
    for i, raw in enumerate(lines):
        if i == 0 or not raw.strip():
            continue
        stop_id, _, stop_name = raw.strip().split(',')[:3]
        if stop_id in stops and stops[stop_id] != stop_name:
            raise ValueError('stop %s has two names: %r and %r'
                             % (stop_id, stops[stop_id], stop_name))
        stops[stop_id] = stop_name
    return stops


def read_arrivals(lines):
    """Map (stop_id, line) -> set of timestamps of vehicles stopped there.

    Vehicles without a stop_id are ignored; unparsable lines and vehicles
    missing the expected fields are reported and skipped.
    """
    stations = {}
    for n_lines, raw in enumerate(lines):
        if n_lines % 1000 == 0:
            print('%d ...' % n_lines)
        try:
            vehicles = json.loads(raw.strip())
        except ValueError:
            print('could not parse %s' % raw)
            continue
        for vehicle in vehicles:
            if vehicle.get('current_status') != STOPPED_AT:
                continue
            try:
                line = vehicle['trip']['route_id'].rstrip('X')  # fold express into normal
                if 'stop_id' not in vehicle:
                    continue
                timestamp = vehicle['timestamp']
                key = (vehicle['stop_id'], line)
            except (KeyError, TypeError, AttributeError):
                print('weird vehicle %s' % (vehicle,))
                continue
            stations.setdefault(key, set()).add(timestamp)
    return stations


def sample_delays(stations, n_samples=1000, min_arrivals=100, max_gap=4 * 3600, rng=random):
    """Sample the time until the next arrival at random instants.

    stations     -- {(stop_id, line): iterable of arrival timestamps}
    n_samples    -- instants drawn per key, uniformly between its first and
                    last arrival
    min_arrivals -- keys with fewer arrivals are skipped
    max_gap      -- instants inside a gap of this many seconds or more are
                    discarded
    rng          -- object providing ``random()``

    Returns {key: [seconds until the next arrival, ...]}.
    """
    delays = {}
    for key, timestamps in stations.items():
        timestamps = sorted(timestamps)
        if len(timestamps) < min_arrivals:
            continue
        lo, hi = timestamps[0], timestamps[-1]
        for _ in range(n_samples):
            t = lo + rng.random() * (hi - lo)
            # bisect_right so that t == lo lands in the first interval rather
            # than wrapping round to index -1.
            j = bisect.bisect_right(timestamps, t)
            if j >= len(timestamps):
                # float rounding can push t up to hi itself
                continue
            t0, t1 = timestamps[j - 1], timestamps[j]
            if t1 - t0 < max_gap:
                delays.setdefault(key, []).append(t1 - t)
    return delays


def main():
    """Print every sampled (stop, line), best median wait first."""
    with open('stops.txt') as f:
        stops = load_stops(f)
    print(stops)

    with open('log.jsons') as f:
        stations = read_arrivals(f)

    delays = sample_delays(stations)

    for k in sorted(delays, key=lambda k: numpy.median(delays[k])):
        stop_id, line = k
        # stops.txt may not know every stop id seen in the feed
        print('%s %s %d %s %s' % (stop_id, line, len(stations[k]),
                                  numpy.median(delays[k]), stops.get(stop_id, '?')))


if __name__ == '__main__':
    main()