├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── crawler_TOEFL.iml
├── README.md
└── crawler_toefl.py

/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | 
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 6 | 
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 
--------------------------------------------------------------------------------
/.idea/crawler_TOEFL.iml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## NEEA托福考位爬虫 Getting Started with NEEA TOEFL Test Seat Crawler
2 | 
3 | 本文档简要介绍了NEEA托福考位本地爬虫的使用方法。
4 | This document provides a brief introduction to the usage of the NEEA TOEFL Test Seat Selenium Crawler.
5 | 
6 | ### 动机 Motivation
7 | NEEA 托福考位网站正在提供着不便的服务。在寻找考位时,我们需要按每个日期,每个城市一个个地搜索考位,
8 | 这为那些想尽快找到测试座位的人带来了无法忍受的体验。
9 | 
10 | 
<br>
11 | 12 |
13 | 
14 | 为什么不直接以表格形式显示所有考位?
15 | 
16 | The [NEEA TOEFL](https://toefl.neea.cn/) Test Seat website, run by the Chinese National Education
17 | Examinations Authority (NEEA), provides an inconvenient service. When looking for a test seat,
18 | we have to search one date and one city at a time, which is an intolerable experience for those
19 | who just want to find a test seat ASAP. Why not display all available test seats in a single table?
20 | 
21 | ### 安装要求 Requirements
22 | - Firefox [mozilla geckodriver](https://github.com/mozilla/geckodriver/releases) v0.26.0
23 |   - [How to install webdriver](https://www.cnblogs.com/LY-CC/p/11068244.html)
24 | - [Firefox](https://ftp.mozilla.org/pub/firefox/releases/) ≥ 60
25 | - pip install selenium
26 | #### 安装方式 Install
27 | - Firefox mozilla geckodriver: the default geckodriver path is "C:\Program Files\Mozilla Firefox\geckodriver.exe".
28 |   If you want to use a different executable path, please start the crawler with **--webdriver_path='your path'**.
29 | 
30 | - 默认Firefox mozilla geckodriver是安装在"C:\Program Files\Mozilla Firefox\geckodriver.exe"路径中,如果你希望使用其他路径,
31 |   请使用 **--webdriver_path='your path'** 来启动爬虫。
32 | 
33 | ### Getting started
34 | Default start (see below for the optional flags):
35 | ```
36 | python crawler_toefl.py --username='NEEA ID number' --password='password'
37 | ```
38 | When the crawler finishes, you will get a .csv spreadsheet file. 爬虫完成后将得到.csv表格文件。
39 | 
40 | 
<br>
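Besides `--username` and `--password`, the other optional flags defined in `parse_args` of `crawler_toefl.py` can be appended to the same command. For example, to point the crawler at a custom geckodriver location and enable the experimental eager page-loading mode (the path below is only a placeholder, replace it with your own):
```
python crawler_toefl.py --username='NEEA ID number' --password='password' --webdriver_path='D:\tools\geckodriver.exe' --eager
```
A `--headless` flag is also defined, but it is marked as not supported in this version.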
41 | 42 |
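For reference, all results are appended to one dated file named `toefl_YYYY-MM-DD_check.csv`, written with `utf-8-sig` encoding (see `save_date` in `crawler_toefl.py`). Below is a minimal sketch, using only the standard library, for reading that file back; it assumes the crawler was run today so the date in the file name matches.
```
import csv
import time

# File name pattern and encoding follow save_date() in crawler_toefl.py.
csv_name = "toefl_{}_check.csv".format(time.strftime('%Y-%m-%d', time.localtime(time.time())))

with open(csv_name, encoding="utf-8-sig", newline="") as f:
    for row in csv.reader(f):
        # Rows are either table headers, data rows
        # (test city, test venue, test fee, seat availability),
        # or a single line [city, date, "未查询到考位信息"] when no seat was found.
        print(row)
```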
43 | 
44 | ### Todo:
45 | 1. Make it faster: a full crawl currently takes about 30 minutes 爬虫速度太慢了, 爬完全部数据目前需要30分钟
46 | 2. Headless mode: how to bypass the anti-crawler checks without a visible browser? 无界面模式怎么绕开反爬虫?
47 | 3. Anti-anti-crawler when clicking the 'search seats' button 怎么绕开反爬虫?
48 | 4. Online crawler (run on a server) 在线爬虫(服务器)
49 | 5. Different modes (user-customized crawling) 用户定制化爬虫
50 | 
51 | ### Acknowledgement
52 | This idea originally came from https://www.jianshu.com/p/2541d918869e, thanks!
53 | 
--------------------------------------------------------------------------------
/crawler_toefl.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # tested on Python 3.6
3 | # thanks https://www.jianshu.com/p/2541d918869e
4 | # version 1.0
5 | # author cambridge.mo@foxmail.com
6 | # date Jul 2020
7 | 
8 | import os
9 | import csv
10 | import time
11 | import requests
12 | from PIL import Image
13 | from selenium import webdriver
14 | from selenium.webdriver.support.ui import WebDriverWait
15 | from selenium.webdriver.common.by import By
16 | from selenium.webdriver.support.select import Select
17 | from selenium.webdriver.support import expected_conditions as EC
18 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
19 | import random
20 | import win32gui
21 | import win32api
22 | import argparse
23 | 
24 | CITYS = []
25 | DATES = []
26 | 
27 | 
28 | def parse_args():
29 |     # Parse input arguments
30 |     parser = argparse.ArgumentParser(description='TOEFL crawler args')
31 |     parser.add_argument('--username', dest='USERNAME_TF',
32 |                         type=str, default=None)
33 |     parser.add_argument('--password', dest='PASSWORD_TF',
34 |                         type=str, default=None)
35 |     parser.add_argument('--headless', dest='headless',
36 |                         help='(Not supported in this version) start headless, the browser will not be displayed',
37 |                         default=False, action='store_true')
38 |     parser.add_argument('--eager', dest='eager',
39 |                         help='eager mode (unstable!) 
is faster when loading web-page', 40 | default=False, action='store_true') 41 | parser.add_argument('--webdriver_path', dest='webdriver_path', 42 | help='set Firefox webdriver path', 43 | type=str, default="C:\Program Files\Mozilla Firefox\geckodriver.exe") 44 | 45 | # parser.add_argument('--mode', dest='mode', 46 | # help='enum the mode', 47 | # type=int) 48 | args = parser.parse_args() 49 | return args 50 | 51 | 52 | class GetToeflTestInfos(): 53 | def __init__(self): 54 | args = parse_args() 55 | self.username = args.USERNAME_TF 56 | self.password = args.PASSWORD_TF 57 | if self.username==None: 58 | self.username = input('请输入账户名 Please enter username:') 59 | if self.password==None: 60 | self.password = input('请输入密码 Please enter password:') 61 | self.index_url = "https://toefl.neea.cn/login" 62 | self.hwnd = None 63 | self.option = webdriver.FirefoxOptions() # for anti-crawler, only FireFox can be used 64 | self.option.add_argument('--user-agent="Firefox/60.0"') 65 | if args.headless: 66 | self.option.add_argument('--headless') # start 'headless', browser will not display 67 | if args.eager: 68 | desired_capabilities = DesiredCapabilities.FIREFOX 69 | desired_capabilities["pageLoadStrategy"] = "eager" # eager mode (unstable) is faster when loading web-page 70 | 71 | try: 72 | self.driver = webdriver.Firefox(executable_path=args.webdriver_path, options=self.option) 73 | except: 74 | print("Your webdriver executable path is wrong: Cannot start webdriver.") 75 | print("Please use --webdriver_path to set webdriver executable path") 76 | print('See https://github.com/893374759/crawler_TOEFL#%E5%AE%89%E8%A3%85%E6%96%B9%E5%BC%8F-install') 77 | raise 78 | 79 | self.wait = WebDriverWait(self.driver, timeout=50) 80 | self.CITY = None 81 | self.DATE = None 82 | 83 | def input_infos(self): 84 | """ 85 | Enter username and password 86 | """ 87 | self.driver.get(self.index_url) 88 | print("自动输入用户名和密码 Automatically enter username and password") 89 | # username 90 | time.sleep(2) 91 | input_name = self.wait.until( 92 | EC.presence_of_element_located((By.ID, "userName")) 93 | ) 94 | input_name.clear() 95 | input_name.send_keys(self.username) 96 | # password 97 | input_pwd = self.wait.until( 98 | EC.presence_of_element_located((By.ID, "textPassword")) 99 | ) 100 | input_pwd.clear() 101 | input_pwd.send_keys(self.password) 102 | 103 | def get_captcha(self): 104 | """ 105 | get captcha, :return: captcha 106 | """ 107 | print("等待加载验证码 Loading captcha...") 108 | # 模拟点击 109 | input_code = self.wait.until( 110 | EC.element_to_be_clickable((By.ID, "verifyCode")) 111 | ) 112 | self.hwnd = win32gui.FindWindow('MozillaWindowClass', '首页 - 教育部考试中心托福网上报名 - Mozilla Firefox') 113 | win32api.keybd_event(27, 0, 0, 0) # VK_code 114 | win32gui.SetForegroundWindow(self.hwnd) 115 | while True: 116 | input_code.click() 117 | time.sleep(4) 118 | # get captcha link, send requests 119 | src = self.wait.until( 120 | EC.presence_of_element_located((By.ID, "chkImg")) 121 | ) 122 | time.sleep(2.5) 123 | src_url = src.get_attribute("src") 124 | print(src_url) 125 | if (not ('loading' in src_url)) and (src_url is not None): 126 | break 127 | 128 | res = requests.get(src_url) 129 | time.sleep(1.5) 130 | with open('code.png', 'wb') as f: 131 | f.write(res.content) 132 | # Open local captcha, manually identify 133 | try: 134 | im = Image.open('code.png') 135 | im.show() 136 | im.close() 137 | except: 138 | print('到本地目录打开code.png获取验证码 Go local directory, open code.png to see captcha') 139 | finally: 140 | captcha = input('请输入验证码 Please 
enter the captcha:') 141 | os.remove('code.png') 142 | print('尝试登录中 Logging in...') 143 | return captcha 144 | 145 | def login(self, code): 146 | input_code = self.wait.until( 147 | EC.presence_of_element_located((By.ID, "verifyCode")) 148 | ) 149 | input_code.send_keys(code) 150 | submit_button = self.wait.until( 151 | EC.element_to_be_clickable((By.ID, "btnLogin")) 152 | ) 153 | submit_button.click() 154 | # Check if the login is successful 155 | try: 156 | #TODO: http 500 error 157 | print(self.driver.title) 158 | success = self.wait.until( 159 | EC.text_to_be_present_in_element((By.XPATH, '//div[@class="myhome_info_cn"]/span[2]'), self.username) 160 | ) 161 | if success: 162 | print("==登录成功页面 Page Login Success==") 163 | except: 164 | self.input_infos() 165 | code_str = self.get_captcha() 166 | self.login(code_str) 167 | 168 | def find_seat(self): 169 | print('开始考位查询 Turn to Page Find-Seat') 170 | success = False 171 | while not success: 172 | self.driver.get("https://toefl.neea.cn/myHome/8625374/index#!/testSeat") 173 | time.sleep(1) 174 | try: 175 | success = self.wait.until( 176 | EC.text_to_be_present_in_element((By.XPATH, '//div[@class="span12"]/h4'), "查询条件") 177 | ) 178 | if success: 179 | print("==考位查询页面 Page Find-Seat==") 180 | except: 181 | success = False 182 | 183 | # self.driver.switch_to.alert.accept() 184 | 185 | def get_all_DATE(self): 186 | CITYS, DATES = [], [] 187 | CITY = "上海" 188 | time.sleep(1) 189 | city = Select(self.driver.find_element_by_id("centerProvinceCity")).select_by_visible_text(CITY) 190 | CITYS = self.driver.find_element_by_id("centerProvinceCity").text.split("\n") 191 | del CITYS[0] 192 | all_options = self.driver.find_element_by_id("testDays").find_elements_by_tag_name('option') 193 | for option in all_options: 194 | DATES.append(option.get_attribute("value")) 195 | del DATES[0] 196 | print("已获取全部城市、考试日期 get all test DATE/CITYs") 197 | return [CITYS, DATES] 198 | 199 | def send_query_condition(self, virgin=False): 200 | city = Select(self.driver.find_element_by_id("centerProvinceCity")).select_by_visible_text(self.CITY) 201 | date = Select(self.driver.find_element_by_id("testDays")).select_by_value(self.DATE) 202 | 203 | if virgin: 204 | click = False 205 | while not click: 206 | try: 207 | win32api.keybd_event(27, 0, 0, 0) # VK_code 208 | win32gui.SetForegroundWindow(self.hwnd) 209 | print("正在反-反爬虫, 或许需要您点一下火狐浏览器 Anti anti-crawler, you can click the Firefox browser...") 210 | scrool = random.randint(0, 100) 211 | self.driver.execute_script('window.scrollBy(0,%d)' % scrool) 212 | time.sleep(1) 213 | self.driver.execute_script('window.scrollBy(0,%d)' % -scrool) 214 | 215 | query_button = self.wait.until( 216 | EC.element_to_be_clickable((By.ID, "btnQuerySeat")) 217 | ) 218 | time.sleep(1) 219 | query_button.click() 220 | click = bool(WebDriverWait(self.driver, timeout=5).until(alert_or_success())) 221 | except: 222 | click = False 223 | else: 224 | time.sleep(0.2) 225 | 226 | query_button = self.wait.until( 227 | EC.element_to_be_clickable((By.ID, "btnQuerySeat")) 228 | ) 229 | query_button.click() 230 | 231 | def save_date(self, i=1): 232 | """ 233 | save to .csv 234 | """ 235 | csv_fp = open("toefl_{}_check.csv".format(time.strftime('%Y-%m-%d', time.localtime(time.time()))), "a+", 236 | encoding='utf-8-sig', newline='') 237 | writer = csv.writer(csv_fp) 238 | try: 239 | is_success = EC.text_to_be_present_in_element((By.XPATH, '//td[@style="text-align:center;vertical-align: middle"]'), s_city)( 240 | self.driver) 241 | except: 242 | is_success = 0 
243 | if bool(is_success): 244 | # head 1: test date 245 | boxhead1 = self.wait.until( 246 | EC.presence_of_all_elements_located( 247 | (By.XPATH, '//table[@class="table table-bordered table-striped"][{}]/thead/tr[1]/th/span'.format(i)) 248 | ) 249 | ) 250 | head1_ls = [] 251 | for head1 in boxhead1: 252 | if not head1.text: 253 | continue 254 | head1_ls.append(head1.text) 255 | writer.writerow(head1_ls) 256 | print(head1_ls) 257 | 258 | # head 2 259 | boxhead2 = self.wait.until( 260 | EC.presence_of_all_elements_located( 261 | (By.XPATH, '//table[@class="table table-bordered table-striped"][{}]/thead/tr[2]/th'.format(i)) 262 | ) 263 | ) 264 | head2_ls = [] 265 | for head2 in boxhead2: 266 | head2_ls.append(head2.text.replace('\n', '')) 267 | writer.writerow(head2_ls) 268 | print(head2_ls) 269 | 270 | # inquiry form 271 | items = self.wait.until( 272 | EC.presence_of_all_elements_located( 273 | (By.XPATH, '//table[@class="table table-bordered table-striped"][{}]/tbody/tr'.format(i)) 274 | ) 275 | ) 276 | try: 277 | for item in items: 278 | body_dict = {} 279 | body_dict["test_city"] = item.find_element_by_xpath('./td[1]').text 280 | body_dict["test_venues"] = item.find_element_by_xpath('./td[2]').text 281 | body_dict["test_fee"] = item.find_element_by_xpath('./td[3]').text 282 | body_dict["test_seat"] = item.find_element_by_xpath('./td[4]').text 283 | writer.writerow(body_dict.values()) 284 | print(body_dict) 285 | except: 286 | items = self.wait.until( 287 | EC.presence_of_all_elements_located( 288 | (By.XPATH, '//table[@class="table table-bordered table-striped"][{}]/tbody/tr'.format(i)) 289 | ) 290 | ) 291 | print(['refresh occur!']) 292 | writer.writerow(['refresh occur!']) 293 | for item in items: 294 | body_dict = {} 295 | body_dict["test_city"] = item.find_element_by_xpath('./td[1]').text 296 | body_dict["test_venues"] = item.find_element_by_xpath('./td[2]').text 297 | body_dict["test_fee"] = item.find_element_by_xpath('./td[3]').text 298 | body_dict["test_seat"] = item.find_element_by_xpath('./td[4]').text 299 | writer.writerow(body_dict.values()) 300 | print(body_dict) 301 | else: 302 | null_line = [self.CITY, self.DATE, "未查询到考位信息"] 303 | print(null_line) 304 | writer.writerow(null_line) 305 | csv_fp.close() 306 | 307 | 308 | class alert_or_success: 309 | def __init__(self): 310 | self.is_success, self.is_alert = 0, 0 311 | 312 | def __call__(self, driver): 313 | ''' 314 | wait to see whether is '考位查询结果' or '未查询到考位信息' 315 | ''' 316 | try: 317 | self.is_success = EC.text_to_be_present_in_element((By.XPATH, '//div[@id="qrySeatResult"]/h4'), "考位查询结果")( 318 | driver) 319 | except: 320 | self.is_alert = EC.visibility_of_element_located( 321 | (By.XPATH, '//i[@class="layui-layer-ico layui-layer-ico0"]'))(driver) 322 | if bool(self.is_success): 323 | self.is_alert = 0 324 | return True 325 | elif bool(self.is_alert): 326 | self.is_success = 0 327 | return True 328 | else: 329 | self.is_success, self.is_alert = 0, 0 330 | return False 331 | 332 | 333 | if __name__ == "__main__": 334 | GetToeflCrawler = GetToeflTestInfos() 335 | GetToeflCrawler.input_infos() 336 | captcha = GetToeflCrawler.get_captcha() 337 | GetToeflCrawler.login(captcha) 338 | GetToeflCrawler.find_seat() 339 | [CITYS, DATES] = GetToeflCrawler.get_all_DATE() 340 | CITYS.reverse() 341 | 342 | for s_date in DATES: 343 | for s_city in CITYS: 344 | GetToeflCrawler.CITY, GetToeflCrawler.DATE = s_city, s_date 345 | if [s_city, s_date] == [CITYS[0], DATES[0]]: 346 | virgin = True 347 | else: 348 | virgin = False 349 | 
GetToeflCrawler.send_query_condition(virgin)
350 |             flag = WebDriverWait(GetToeflCrawler.driver, timeout=50).until(alert_or_success())
351 |             GetToeflCrawler.save_date(i=1)
352 | 
353 |     GetToeflCrawler.driver.quit()
354 | 
--------------------------------------------------------------------------------
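
A note on Todo item 2 (headless mode): the script currently sets the user agent through a `--user-agent` browser argument and enables the eager page-load strategy by mutating `DesiredCapabilities.FIREFOX` in `GetToeflTestInfos.__init__`. If the project is ever moved to Selenium 4, where the `DesiredCapabilities`-based configuration is deprecated, an equivalent setup could look roughly like the sketch below. This is only a sketch under that assumption, not part of the repository, and it does not answer whether NEEA's anti-crawler checks will accept a headless browser.
```
# Sketch only: assumes Selenium 4+ and the default geckodriver path from the README.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

options = Options()
options.add_argument('--headless')  # run Firefox without a visible window
options.set_preference('general.useragent.override', 'Firefox/60.0')  # Firefox reads the UA from this preference
options.page_load_strategy = 'eager'  # same intent as the pageLoadStrategy capability in the script

service = Service(executable_path=r"C:\Program Files\Mozilla Firefox\geckodriver.exe")
driver = webdriver.Firefox(service=service, options=options)
```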