├── BayStations.txt ├── README.md ├── crawler.py └── requirements.txt /BayStations.txt: -------------------------------------------------------------------------------- 1 | 402000 2 | 400174 3 | 401846 4 | 401351 5 | 402291 6 | 401817 7 | 400264 8 | 400869 9 | 401845 10 | 400581 11 | 402290 12 | 401816 13 | 401949 14 | 407323 15 | 404640 16 | 404585 17 | 401538 18 | 409526 19 | 401507 20 | 407374 21 | 404586 22 | 401489 23 | 409529 24 | 401926 25 | 400528 26 | 400823 27 | 400687 28 | 401464 29 | 409525 30 | 401937 31 | 407348 32 | 400499 33 | 400274 34 | 413026 35 | 409528 36 | 401948 37 | 407325 38 | 400673 39 | 400970 40 | 402119 41 | 409524 42 | 401534 43 | 407367 44 | 400084 45 | 402288 46 | 402118 47 | 401936 48 | 407342 49 | 400560 50 | 400096 51 | 402289 52 | 402117 53 | 401998 54 | 407328 55 | 400429 56 | 400973 57 | 400088 58 | 402059 59 | 400435 60 | 407352 61 | 400414 62 | 400201 63 | 400238 64 | 402058 65 | 404522 66 | 407335 67 | 400292 68 | 400567 69 | 402286 70 | 402057 71 | 404554 72 | 407360 73 | 400714 74 | 400449 75 | 402287 76 | 402056 77 | 400545 78 | 407336 79 | 401400 80 | 400104 81 | 402120 82 | 400563 83 | 400743 84 | 407361 85 | 401388 86 | 401014 87 | 402285 88 | 400236 89 | 400330 90 | 402060 91 | 407711 92 | 400296 93 | 402283 94 | 400916 95 | 401997 96 | 402061 97 | 407710 98 | 400158 99 | 402284 100 | 400065 101 | 401996 102 | 407337 103 | 405701 104 | 400873 105 | 402281 106 | 400258 107 | 404553 108 | 407339 109 | 401167 110 | 413845 111 | 402282 112 | 400995 113 | 400904 114 | 407364 115 | 401163 116 | 402121 117 | 404453 118 | 400790 119 | 407341 120 | 403419 121 | 401541 122 | 404461 123 | 400168 124 | 407372 125 | 401327 126 | 400971 127 | 400664 128 | 400643 129 | 407373 130 | 414694 131 | 400122 132 | 400837 133 | 400794 134 | 407331 135 | 403414 136 | 404759 137 | 400654 138 | 400246 139 | 407321 140 | 413877 141 | 400045 142 | 400040 143 | 400069 144 | 407370 145 | 402067 146 | 400479 147 | 400418 148 | 401994 149 | 407344 150 | 401942 151 | 400030 152 | 400257 153 | 400372 154 | 400677 155 | 401943 156 | 401560 157 | 401597 158 | 400353 159 | 400209 160 | 403412 161 | 401440 162 | 401606 163 | 404521 164 | 400185 165 | 414284 166 | 403225 167 | 404452 168 | 400206 169 | 400648 170 | 403409 171 | 403265 172 | 405619 173 | 400895 174 | 400507 175 | 413878 176 | 400343 177 | 401567 178 | 400227 179 | 400828 180 | 401810 181 | 400508 182 | 401457 183 | 400586 184 | 401210 185 | 401811 186 | 400253 187 | 400178 188 | 400964 189 | 401224 190 | 403406 191 | 400147 192 | 400457 193 | 400172 194 | 400952 195 | 400953 196 | 400709 197 | 404462 198 | 400760 199 | 400097 200 | 400799 201 | 400057 202 | 404451 203 | 400911 204 | 400222 205 | 403404 206 | 400514 207 | 401957 208 | 400863 209 | 400582 210 | 403401 211 | 400723 212 | 401958 213 | 404753 214 | 400688 215 | 403402 216 | 400951 217 | 401129 218 | 400394 219 | 400213 220 | 401809 221 | 408911 222 | 401154 223 | 400001 224 | 400464 225 | 401808 226 | 408907 227 | 405613 228 | 400922 229 | 400907 230 | 401655 231 | 401611 232 | 400842 233 | 400965 234 | 400822 235 | 401391 236 | 404370 237 | 400280 238 | 400109 239 | 400792 240 | 401403 241 | 400100 242 | 401495 243 | 402362 244 | 400485 245 | 400665 246 | 400221 247 | 402363 248 | 400934 249 | 400298 250 | 400668 251 | 400017 252 | 407173 253 | 400436 254 | 400440 255 | 400713 256 | 407172 257 | 403329 258 | 400996 259 | 400700 260 | 407165 261 | 400278 262 | 400160 263 | 400336 264 | 407194 265 | 400240 266 | 401890 267 | 400772 268 | 400535 269 | 
400073 270 | 401891 271 | 400148 272 | 407167 273 | 400715 274 | 400400 275 | 400717 276 | 407196 277 | 400832 278 | 402361 279 | 400750 280 | 407197 281 | 400649 282 | 400804 283 | 400059 284 | 407170 285 | 404378 286 | 401906 287 | 400461 288 | 407176 289 | 401908 290 | 400690 291 | 407177 292 | 402359 293 | 400052 294 | 407202 295 | 402360 296 | 400085 297 | 407204 298 | 401555 299 | 407179 300 | 400268 301 | 407180 302 | 400637 303 | 407206 304 | 400519 305 | 407181 306 | 400145 307 | 407207 308 | 400754 309 | 407187 310 | 400444 311 | 407157 312 | 400577 313 | 407155 314 | 407186 315 | 407153 316 | 407200 317 | 407174 318 | 407161 319 | 407191 320 | 407190 321 | 407151 322 | 407185 323 | 407150 324 | 407184 325 | 407148 326 | 403237
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pems_crawler
PeMS crawler

This is a crawler that downloads station metadata and each station's time series data from PeMS (the Caltrans Performance Measurement System).

The file "BayStations.txt" contains the IDs of all Bay Area stations. Running the program downloads every listed station's metadata and its time series data.

To change the start time and end time of the time series you want to download, edit the last line of crawler.py (see the example at the end of the Usage section).

## Requirements
Python 3

+ requests

Use `pip install -r requirements.txt` to install the dependencies.

## Usage

```
python crawler.py
```
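
The download window is set by the call at the bottom of `crawler.py`; change the last two arguments (start and end dates, `%Y%m%d`) to crawl a different period, and the first two to use another area name and station-list file:

```python
a.start('Bay', 'BayStations.txt', '20170101', '20170601')
```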
--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

import os
import requests
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from urllib.parse import quote_plus
import concurrent.futures

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}

# directory containing this script; the output folders are created next to it
directory = os.path.dirname(os.path.abspath(__file__))

class spider:
    def __init__(self, username, password, proxies = False):
        self.home = "http://pems.dot.ca.gov/"
        self.s = requests.Session()
        if proxies:
            self.s.proxies = proxies
        self.username = username
        self.password = password

    def get(self, url):
        '''
        HTTP GET request
        '''
        return self.s.get(url, headers = headers)

    def post(self, url, data):
        '''
        HTTP POST request
        '''
        return self.s.post(url, data = data, headers = headers)

    def login(self):
        '''
        sign in to PeMS
        '''
        print('try to login')
        data = {'username': self.username,
                'password': self.password,
                'login': 'Login',
                'redirect': ""}
        self.post(self.home, data)

    def download_station_time_series_5min(self, station_id, start_time, end_time, areaname):
        '''
        download one station's time series data at a 5-minute interval

        Parameters
        ----------
        station_id: str, e.g. 402000

        start_time: str, %Y%m%d%H%M, e.g. 201701010000

        end_time: str, %Y%m%d%H%M, e.g. 201701012359

        areaname: str, e.g. Bay; used as the output sub-directory name
        '''
        print('try to download %s %s %s'%(station_id, start_time, end_time))

        s_time_id_f = datetime.strptime(start_time, "%Y%m%d%H%M").strftime("%m/%d/%Y+%H:%M")
        e_time_id_f = datetime.strptime(end_time, "%Y%m%d%H%M").strftime("%m/%d/%Y+%H:%M")

        # PeMS expects Unix timestamps that correspond to the requested calendar date
        # taken as UTC (the template below uses 1483228800, i.e. 2017-01-01 00:00 UTC,
        # for 01/01/2017 00:00), so the dates are converted in UTC here instead of
        # relying on the local timezone of the machine running the crawler.
        s_time_id = str(int(datetime.strptime(datetime.strptime(start_time, "%Y%m%d%H%M").strftime("%Y%m%d") + "0000",
                                              "%Y%m%d%H%M").replace(tzinfo = timezone.utc).timestamp()))
        e_time_id = str(int(datetime.strptime(datetime.strptime(end_time, "%Y%m%d%H%M").strftime("%Y%m%d") + "2359",
                                              "%Y%m%d%H%M").replace(tzinfo = timezone.utc).timestamp()))

        # template of the query parameters for a 5-minute flow export (text) request;
        # the station id and time fields are overwritten below
        data_str = '''report_form=1
dnode=VDS
content=loops
tab=det_timeseries
export=text
station_id=405572
s_time_id=1483228800
s_time_id_f=01%2F01%2F2017+00%3A00
e_time_id=1483315140
e_time_id_f=01%2F01%2F2017+23%3A59
tod=all
tod_from=0
tod_to=0
dow_0=on
dow_1=on
dow_2=on
dow_3=on
dow_4=on
dow_5=on
dow_6=on
holidays=on
q=flow
q2=
gn=5min
agg=on'''
        data = dict(line.strip().split('=') for line in data_str.split('\n'))
        data['station_id'] = station_id
        data['s_time_id'] = s_time_id
        data['e_time_id'] = e_time_id
        data['s_time_id_f'] = s_time_id_f
        data['e_time_id_f'] = e_time_id_f

        url = self.home + '?' + '&'.join('='.join((key, quote_plus(value, safe = "+"))) for key, value in data.items())
        response = self.get(url)

        area_dir = os.path.join(directory, 'time_series', areaname)
        if not os.path.exists(area_dir):
            os.makedirs(area_dir)

        with open(os.path.join(area_dir, '%s_%s_%s.txt'%(station_id, start_time, end_time)), 'w') as f:
            f.write(response.text)

    def download_station_metadata(self, station, areaname):
        print('try to download meta data of station %s'%(station))
        url = "%s?station_id=%s&dnode=VDS&content=sta_cfg"%(self.home, station)
        r = self.get(url)

        area_dir = os.path.join(directory, 'station_metadata', areaname)
        if not os.path.exists(area_dir):
            os.makedirs(area_dir)

        with open(os.path.join(area_dir, "%s.html"%(station)), "w") as f:
            f.write(r.text)

    def start(self, areaname, filename, start_time, end_time):
        '''
        Parameters
        ----------
        areaname: str, e.g. Bay

        filename: str, station-list file with one station id per line, e.g. station.txt

        start_time, end_time: str, %Y%m%d
        '''
        # download all stations' metadata, skipping stations already on disk
        with open(filename, 'r') as f:
            station_list = f.read().strip().split('\n')

        metadata_dir = os.path.join(directory, 'station_metadata', areaname)
        if not os.path.exists(metadata_dir):
            os.makedirs(metadata_dir)

        station_metadatas = os.listdir(metadata_dir)
        for station in station_list:
            if "%s.html"%(station) in station_metadatas:
                continue
            self.download_station_metadata(station, areaname)

        # split the requested period into one-week windows of
        # [YYYYMMDD0000, YYYYMMDD2359] pairs
        start_time = start_time + "0000"
        t = datetime.strptime(start_time, "%Y%m%d%H%M")
        delta = timedelta(days = 7)
        end_time = datetime.strptime(end_time, "%Y%m%d")
        timelist = []
        while t < end_time:
            tmp = [t.strftime("%Y%m%d") + "0000"]
            t = t + delta
            tmp.append((t - timedelta(days = 1)).strftime("%Y%m%d") + "2359")
            timelist.append(tmp)

        # build the list of (station, window start, window end) downloads,
        # skipping files that already exist on disk
        jobs = [(station, start_time, end_time)
                for station in station_list
                for start_time, end_time in timelist
                if not os.path.exists(os.path.join(directory, 'time_series', areaname, '%s_%s_%s.txt'%(station, start_time, end_time)))]

        # multi-threaded downloader
        with concurrent.futures.ThreadPoolExecutor(max_workers = 4) as executor:

            # map each submitted future to its job's name
            future_to_url = {executor.submit(self.download_station_time_series_5min, station, start_time, end_time, areaname): '_'.join((station, start_time, end_time)) for station, start_time, end_time in jobs}
            for future in concurrent.futures.as_completed(future_to_url):
                job_name = future_to_url[future]
                try:
                    future.result()
                except Exception as exc:
                    print('%r generated an exception: %s' % (job_name, exc))
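
# A minimal interactive sketch of using the class directly for a single station and a
# single day (the credentials are placeholders; the station id and dates are the
# illustrative values from the docstrings above):
#
#   s = spider('my_pems_username', 'my_pems_password')
#   s.login()
#   s.download_station_metadata('402000', 'Bay')
#   s.download_station_time_series_5min('402000', '201701010000', '201701012359', 'Bay')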

if __name__ == "__main__":
    # optional HTTP proxy for the requests session; pass it as the third argument
    # of spider() below to route traffic through a proxy
    proxies = {'http': 'http://127.0.0.1:1080'}

    username = None
    password = None

    if username is None:
        username = input('please input your username of PeMS: ')

    if password is None:
        password = input('please input your password: ')

    # spider initialization (use spider(username, password, proxies) to enable the proxy)
    a = spider(username, password)

    # sign in to PeMS
    a.login()

    # area name, station-list file, start date and end date (%Y%m%d)
    a.start('Bay', 'BayStations.txt', '20170101', '20170601')
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
--------------------------------------------------------------------------------