├── BayStations.txt ├── README.md ├── crawler.py └── requirements.txt /BayStations.txt: -------------------------------------------------------------------------------- 1 | 402000 2 | 400174 3 | 401846 4 | 401351 5 | 402291 6 | 401817 7 | 400264 8 | 400869 9 | 401845 10 | 400581 11 | 402290 12 | 401816 13 | 401949 14 | 407323 15 | 404640 16 | 404585 17 | 401538 18 | 409526 19 | 401507 20 | 407374 21 | 404586 22 | 401489 23 | 409529 24 | 401926 25 | 400528 26 | 400823 27 | 400687 28 | 401464 29 | 409525 30 | 401937 31 | 407348 32 | 400499 33 | 400274 34 | 413026 35 | 409528 36 | 401948 37 | 407325 38 | 400673 39 | 400970 40 | 402119 41 | 409524 42 | 401534 43 | 407367 44 | 400084 45 | 402288 46 | 402118 47 | 401936 48 | 407342 49 | 400560 50 | 400096 51 | 402289 52 | 402117 53 | 401998 54 | 407328 55 | 400429 56 | 400973 57 | 400088 58 | 402059 59 | 400435 60 | 407352 61 | 400414 62 | 400201 63 | 400238 64 | 402058 65 | 404522 66 | 407335 67 | 400292 68 | 400567 69 | 402286 70 | 402057 71 | 404554 72 | 407360 73 | 400714 74 | 400449 75 | 402287 76 | 402056 77 | 400545 78 | 407336 79 | 401400 80 | 400104 81 | 402120 82 | 400563 83 | 400743 84 | 407361 85 | 401388 86 | 401014 87 | 402285 88 | 400236 89 | 400330 90 | 402060 91 | 407711 92 | 400296 93 | 402283 94 | 400916 95 | 401997 96 | 402061 97 | 407710 98 | 400158 99 | 402284 100 | 400065 101 | 401996 102 | 407337 103 | 405701 104 | 400873 105 | 402281 106 | 400258 107 | 404553 108 | 407339 109 | 401167 110 | 413845 111 | 402282 112 | 400995 113 | 400904 114 | 407364 115 | 401163 116 | 402121 117 | 404453 118 | 400790 119 | 407341 120 | 403419 121 | 401541 122 | 404461 123 | 400168 124 | 407372 125 | 401327 126 | 400971 127 | 400664 128 | 400643 129 | 407373 130 | 414694 131 | 400122 132 | 400837 133 | 400794 134 | 407331 135 | 403414 136 | 404759 137 | 400654 138 | 400246 139 | 407321 140 | 413877 141 | 400045 142 | 400040 143 | 400069 144 | 407370 145 | 402067 146 | 400479 147 | 400418 148 | 401994 149 | 407344 150 | 401942 151 | 400030 152 | 400257 153 | 400372 154 | 400677 155 | 401943 156 | 401560 157 | 401597 158 | 400353 159 | 400209 160 | 403412 161 | 401440 162 | 401606 163 | 404521 164 | 400185 165 | 414284 166 | 403225 167 | 404452 168 | 400206 169 | 400648 170 | 403409 171 | 403265 172 | 405619 173 | 400895 174 | 400507 175 | 413878 176 | 400343 177 | 401567 178 | 400227 179 | 400828 180 | 401810 181 | 400508 182 | 401457 183 | 400586 184 | 401210 185 | 401811 186 | 400253 187 | 400178 188 | 400964 189 | 401224 190 | 403406 191 | 400147 192 | 400457 193 | 400172 194 | 400952 195 | 400953 196 | 400709 197 | 404462 198 | 400760 199 | 400097 200 | 400799 201 | 400057 202 | 404451 203 | 400911 204 | 400222 205 | 403404 206 | 400514 207 | 401957 208 | 400863 209 | 400582 210 | 403401 211 | 400723 212 | 401958 213 | 404753 214 | 400688 215 | 403402 216 | 400951 217 | 401129 218 | 400394 219 | 400213 220 | 401809 221 | 408911 222 | 401154 223 | 400001 224 | 400464 225 | 401808 226 | 408907 227 | 405613 228 | 400922 229 | 400907 230 | 401655 231 | 401611 232 | 400842 233 | 400965 234 | 400822 235 | 401391 236 | 404370 237 | 400280 238 | 400109 239 | 400792 240 | 401403 241 | 400100 242 | 401495 243 | 402362 244 | 400485 245 | 400665 246 | 400221 247 | 402363 248 | 400934 249 | 400298 250 | 400668 251 | 400017 252 | 407173 253 | 400436 254 | 400440 255 | 400713 256 | 407172 257 | 403329 258 | 400996 259 | 400700 260 | 407165 261 | 400278 262 | 400160 263 | 400336 264 | 407194 265 | 400240 266 | 401890 267 | 400772 268 | 400535 269 | 
400073 270 | 401891 271 | 400148 272 | 407167 273 | 400715 274 | 400400 275 | 400717 276 | 407196 277 | 400832 278 | 402361 279 | 400750 280 | 407197 281 | 400649 282 | 400804 283 | 400059 284 | 407170 285 | 404378 286 | 401906 287 | 400461 288 | 407176 289 | 401908 290 | 400690 291 | 407177 292 | 402359 293 | 400052 294 | 407202 295 | 402360 296 | 400085 297 | 407204 298 | 401555 299 | 407179 300 | 400268 301 | 407180 302 | 400637 303 | 407206 304 | 400519 305 | 407181 306 | 400145 307 | 407207 308 | 400754 309 | 407187 310 | 400444 311 | 407157 312 | 400577 313 | 407155 314 | 407186 315 | 407153 316 | 407200 317 | 407174 318 | 407161 319 | 407191 320 | 407190 321 | 407151 322 | 407185 323 | 407150 324 | 407184 325 | 407148 326 | 403237
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pems_crawler
PeMS crawler

This is a crawler that downloads station metadata and each station's time series data from PeMS (the Caltrans Performance Measurement System).

The file "BayStations.txt" contains the IDs of all Bay Area stations. Running the program downloads every listed station's metadata and its time series data.

To change the start time and end time of the time series you want to download, edit the last line of crawler.py (see the example at the end of the Usage section).

## Requirements
Python 3

+ requests

Use `pip install -r requirements.txt` to install the dependencies.

## Usage

```
python crawler.py
```
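
The download window is set by the call at the bottom of `crawler.py`; change the last two arguments (start and end dates, `%Y%m%d`) to crawl a different period, and the first two to use another area name and station-list file:

```python
a.start('Bay', 'BayStations.txt', '20170101', '20170601')
```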
--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

import os
import requests
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from urllib.parse import quote_plus
import concurrent.futures

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}

# directory containing this script; the output folders are created next to it
directory = os.path.dirname(os.path.abspath(__file__))

class spider:
    def __init__(self, username, password, proxies = False):
        self.home = "http://pems.dot.ca.gov/"
        self.s = requests.Session()
        if proxies:
            self.s.proxies = proxies
        self.username = username
        self.password = password

    def get(self, url):
        '''
        HTTP GET request
        '''
        return self.s.get(url, headers = headers)

    def post(self, url, data):
        '''
        HTTP POST request
        '''
        return self.s.post(url, data = data, headers = headers)

    def login(self):
        '''
        sign in to PeMS
        '''
        print('try to login')
        data = {'username': self.username,
                'password': self.password,
                'login': 'Login',
                'redirect': ""}
        self.post(self.home, data)

    def download_station_time_series_5min(self, station_id, start_time, end_time, areaname):
        '''
        download one station's time series data at a 5-minute interval

        Parameters
        ----------
        station_id: str, e.g. 402000

        start_time: str, %Y%m%d%H%M, e.g. 201701010000

        end_time: str, %Y%m%d%H%M, e.g. 201701012359

        areaname: str, e.g. Bay; used as the output sub-directory name
        '''
        print('try to download %s %s %s'%(station_id, start_time, end_time))

        s_time_id_f = datetime.strptime(start_time, "%Y%m%d%H%M").strftime("%m/%d/%Y+%H:%M")
        e_time_id_f = datetime.strptime(end_time, "%Y%m%d%H%M").strftime("%m/%d/%Y+%H:%M")

        # PeMS expects Unix timestamps that correspond to the requested calendar date
        # taken as UTC (the template below uses 1483228800, i.e. 2017-01-01 00:00 UTC,
        # for 01/01/2017 00:00), so the dates are converted in UTC here instead of
        # relying on the local timezone of the machine running the crawler.
        s_time_id = str(int(datetime.strptime(datetime.strptime(start_time, "%Y%m%d%H%M").strftime("%Y%m%d") + "0000",
                                              "%Y%m%d%H%M").replace(tzinfo = timezone.utc).timestamp()))
        e_time_id = str(int(datetime.strptime(datetime.strptime(end_time, "%Y%m%d%H%M").strftime("%Y%m%d") + "2359",
                                              "%Y%m%d%H%M").replace(tzinfo = timezone.utc).timestamp()))

        # template of the query parameters for a 5-minute flow export (text) request;
        # the station id and time fields are overwritten below
        data_str = '''report_form=1
dnode=VDS
content=loops
tab=det_timeseries
export=text
station_id=405572
s_time_id=1483228800
s_time_id_f=01%2F01%2F2017+00%3A00
e_time_id=1483315140
e_time_id_f=01%2F01%2F2017+23%3A59
tod=all
tod_from=0
tod_to=0
dow_0=on
dow_1=on
dow_2=on
dow_3=on
dow_4=on
dow_5=on
dow_6=on
holidays=on
q=flow
q2=
gn=5min
agg=on'''
        data = dict(line.strip().split('=') for line in data_str.split('\n'))
        data['station_id'] = station_id
        data['s_time_id'] = s_time_id
        data['e_time_id'] = e_time_id
        data['s_time_id_f'] = s_time_id_f
        data['e_time_id_f'] = e_time_id_f

        url = self.home + '?' + '&'.join('='.join((key, quote_plus(value, safe = "+"))) for key, value in data.items())
        response = self.get(url)

        area_dir = os.path.join(directory, 'time_series', areaname)
        if not os.path.exists(area_dir):
            os.makedirs(area_dir)

        with open(os.path.join(area_dir, '%s_%s_%s.txt'%(station_id, start_time, end_time)), 'w') as f:
            f.write(response.text)

    def download_station_metadata(self, station, areaname):
        print('try to download meta data of station %s'%(station))
        url = "%s?station_id=%s&dnode=VDS&content=sta_cfg"%(self.home, station)
        r = self.get(url)

        area_dir = os.path.join(directory, 'station_metadata', areaname)
        if not os.path.exists(area_dir):
            os.makedirs(area_dir)

        with open(os.path.join(area_dir, "%s.html"%(station)), "w") as f:
            f.write(r.text)

    def start(self, areaname, filename, start_time, end_time):
        '''
        Parameters
        ----------
        areaname: str, e.g. Bay

        filename: str, station-list file with one station id per line, e.g. station.txt

        start_time, end_time: str, %Y%m%d
        '''
        # download all stations' metadata, skipping stations already on disk
        with open(filename, 'r') as f:
            station_list = f.read().strip().split('\n')

        metadata_dir = os.path.join(directory, 'station_metadata', areaname)
        if not os.path.exists(metadata_dir):
            os.makedirs(metadata_dir)

        station_metadatas = os.listdir(metadata_dir)
        for station in station_list:
            if "%s.html"%(station) in station_metadatas:
                continue
            self.download_station_metadata(station, areaname)

        # split the requested period into one-week windows of
        # [YYYYMMDD0000, YYYYMMDD2359] pairs
        start_time = start_time + "0000"
        t = datetime.strptime(start_time, "%Y%m%d%H%M")
        delta = timedelta(days = 7)
        end_time = datetime.strptime(end_time, "%Y%m%d")
        timelist = []
        while t < end_time:
            tmp = [t.strftime("%Y%m%d") + "0000"]
            t = t + delta
            tmp.append((t - timedelta(days = 1)).strftime("%Y%m%d") + "2359")
            timelist.append(tmp)

        # build the list of (station, window start, window end) downloads,
        # skipping files that already exist on disk
        jobs = [(station, start_time, end_time)
                for station in station_list
                for start_time, end_time in timelist
                if not os.path.exists(os.path.join(directory, 'time_series', areaname, '%s_%s_%s.txt'%(station, start_time, end_time)))]

        # multi-threaded downloader
        with concurrent.futures.ThreadPoolExecutor(max_workers = 4) as executor:

            # map each submitted future to its job's name
            future_to_url = {executor.submit(self.download_station_time_series_5min, station, start_time, end_time, areaname): '_'.join((station, start_time, end_time)) for station, start_time, end_time in jobs}
            for future in concurrent.futures.as_completed(future_to_url):
                job_name = future_to_url[future]
                try:
                    future.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (job_name, exc))
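
# A minimal interactive sketch of using the class directly for a single station and a
# single day (the credentials are placeholders; the station id and dates are the
# illustrative values from the docstrings above):
#
#   s = spider('my_pems_username', 'my_pems_password')
#   s.login()
#   s.download_station_metadata('402000', 'Bay')
#   s.download_station_time_series_5min('402000', '201701010000', '201701012359', 'Bay')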

if __name__ == "__main__":
    # optional HTTP proxy for the requests session; pass it as the third argument
    # of spider() below to route traffic through a proxy
    proxies = {'http': 'http://127.0.0.1:1080'}

    username = None
    password = None

    if username is None:
        username = input('please input your username of PeMS: ')

    if password is None:
        password = input('please input your password: ')

    # spider initialization (use spider(username, password, proxies) to enable the proxy)
    a = spider(username, password)

    # sign in to PeMS
    a.login()

    # area name, station-list file, start date and end date (%Y%m%d)
    a.start('Bay', 'BayStations.txt', '20170101', '20170601')
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
--------------------------------------------------------------------------------