├── .idea
    ├── .gitignore
    ├── vcs.xml
    ├── misc.xml
    ├── inspectionProfiles
    │   └── profiles_settings.xml
    ├── modules.xml
    └── crawler_TOEFL.iml
├── README.md
└── crawler_toefl.py


/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | 


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (detectron2-master)" project-jdk-type="Python SDK" />
4 | </project>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="USE_PROJECT_PROFILE" value="false" />
4 |     <version value="1.0" />
5 |   </settings>
6 | </component>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/crawler_TOEFL.iml" filepath="$PROJECT_DIR$/.idea/crawler_TOEFL.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.idea/crawler_TOEFL.iml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <module type="PYTHON_MODULE" version="4">
3 |   <component name="NewModuleRootManager">
4 |     <content url="file://$MODULE_DIR$" />
5 |     <orderEntry type="inheritedJdk" />
6 |     <orderEntry type="sourceFolder" forTests="false" />
7 |   </component>
8 | </module>


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## NEEA托福考位爬虫 Getting Started with NEEA TOEFL Testseat Crawler
 2 | 
 3 | 本文档简要介绍了NEEA托福考位本地爬虫的使用方法。
 4 | This document briefly introduces how to use the NEEA TOEFL Test Seat Selenium Crawler.
 5 | 
 6 | ### 动机 Motivation
 7 | NEEA 托福考位网站正在提供着不便的服务。在寻找考位时，我们需要按每个日期，每个城市一个个地搜索考位，
 8 | 这为那些想尽快找到测试座位的人带来了无法忍受的体验。
 9 | 
10 | <div align="center">
11 |   <img src="https://s1.ax1x.com/2020/07/18/UcfnqP.gif"/>
12 | </div>
13 | 
14 | 为什么不直接以表格形式显示所有考位？
15 | 
16 | The [NEEA TOEFL](https://toefl.neea.cn/) test-seat website, supported by the Chinese National Education 
17 | Examinations Authority (NEEA), is inconvenient to use. When looking for a test seat, 
18 | we have to search one date and one city at a time, which is an intolerable experience for those 
19 | who just want to find a test seat ASAP. Why not display all the test seats in a single table?
20 | 
21 | ### 安装要求 Requirements
22 | - Firefox [mozilla geckodriver](https://github.com/mozilla/geckodriver/releases) v0.26.0
23 |     - [How to install webdriver](https://www.cnblogs.com/LY-CC/p/11068244.html)
24 | - [Firefox](https://ftp.mozilla.org/pub/firefox/releases/) ≥ 60
25 | - pip install selenium
26 | #### 安装方式 Install
27 | - Firefox mozilla geckodriver: the default geckodriver path is "C:\Program Files\Mozilla Firefox\geckodriver.exe". 
28 | If you want to set your executable path, please use **--webdriver_path='your path'** to start.
29 | 
30 | - 默认Firefox mozilla geckodriver是安装在"C:\Program Files\Mozilla Firefox\geckodriver.exe"路径中，如果你希望使用其他路径，
31 | 请使用 **--webdriver_path='your path'** 来启动爬虫。
32 | 
33 | ### Getting started
34 | Default start:
35 | ```
36 | python crawler_toefl.py --username='NEEA ID number' --password='password'
37 | ```
38 | When the crawler finishes, you will get a .csv file. 爬虫完成后将得到.csv表格文件。
39 | 
40 | <div align="center">
41 |   <img src="https://s1.ax1x.com/2020/07/18/Uch9Qs.png"/>
42 | </div>
43 | 
44 | ### Todo:
45 | 1. faster, test time is 30 min 爬虫速度太慢了, 爬完全部数据目前需要30分钟
46 | 2. headless mode 无界面模式怎么绕开反爬虫?
47 | 3. Anti anti-crawler when click the 'search seats' button 怎么绕开反爬虫?
48 | 4. online crawler (use a server) 在线爬虫(服务器)
49 | 5. different modes 用户定制化爬虫
50 | 
51 | ### Acknowledgement
52 | This idea originally came from https://www.jianshu.com/p/2541d918869e, thanks!  
53 | 


--------------------------------------------------------------------------------
/crawler_toefl.py:
--------------------------------------------------------------------------------
  1 | # *_*coding:utf-8 *_*
  2 | # test on python 3.6
  3 | # thanks https://www.jianshu.com/p/2541d918869e
  4 | # version 1.0
  5 | # author cambridge.mo@foxmail.com
  6 | # month Jul 2020
  7 | 
  8 | import os
  9 | import csv
 10 | import time
 11 | import requests
 12 | from PIL import Image
 13 | from selenium import webdriver
 14 | from selenium.webdriver.support.ui import WebDriverWait
 15 | from selenium.webdriver.common.by import By
 16 | from selenium.webdriver.support.select import Select
 17 | from selenium.webdriver.support import expected_conditions as EC
 18 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 19 | import random
 20 | import win32gui
 21 | import win32api
 22 | import argparse
 23 | 
# Module-level placeholders for the city names and test dates to crawl.
# Both are rebound in the ``__main__`` block with the lists returned by
# ``GetToeflTestInfos.get_all_DATE()``.
CITYS = []
DATES = []
 26 | 
 27 | 
 28 | def parse_args():
 29 |     # Parse input arguments
 30 |     parser = argparse.ArgumentParser(description='TOEFL crawler args')
 31 |     parser.add_argument('--username', dest='USERNAME_TF',
 32 |                         type=str, default=None)
 33 |     parser.add_argument('--password', dest='PASSWORD_TF',
 34 |                         type=str, default=None)
 35 |     parser.add_argument('--headless', dest='headless',
 36 |                         help='(Not suport in this version) start headless, browser will not display',
 37 |                         default=False, action='store_true')
 38 |     parser.add_argument('--eager', dest='eager',
 39 |                         help='eager mode (unstable!) is faster when loading web-page',
 40 |                         default=False, action='store_true')
 41 |     parser.add_argument('--webdriver_path', dest='webdriver_path',
 42 |                         help='set Firefox webdriver path',
 43 |                         type=str, default="C:\Program Files\Mozilla Firefox\geckodriver.exe")
 44 | 
 45 |     # parser.add_argument('--mode', dest='mode',
 46 |     #                     help='enum the mode',
 47 |     #                     type=int)
 48 |     args = parser.parse_args()
 49 |     return args
 50 | 
 51 | 
 52 | class GetToeflTestInfos():
 53 |     def __init__(self):
 54 |         args = parse_args()
 55 |         self.username = args.USERNAME_TF
 56 |         self.password = args.PASSWORD_TF
 57 |         if self.username==None:
 58 |             self.username = input('请输入账户名 Please enter username:')
 59 |         if self.password==None:
 60 |             self.password = input('请输入密码 Please enter password:')
 61 |         self.index_url = "https://toefl.neea.cn/login"
 62 |         self.hwnd = None
 63 |         self.option = webdriver.FirefoxOptions()  # for anti-crawler, only FireFox can be used
 64 |         self.option.add_argument('--user-agent="Firefox/60.0"')
 65 |         if args.headless:
 66 |             self.option.add_argument('--headless')  # start 'headless', browser will not display
 67 |         if args.eager:
 68 |             desired_capabilities = DesiredCapabilities.FIREFOX
 69 |             desired_capabilities["pageLoadStrategy"] = "eager"  # eager mode (unstable) is faster when loading web-page
 70 | 
 71 |         try:
 72 |             self.driver = webdriver.Firefox(executable_path=args.webdriver_path, options=self.option)
 73 |         except:
 74 |             print("Your webdriver executable path is wrong: Cannot start webdriver.")
 75 |             print("Please use --webdriver_path to set webdriver executable path")
 76 |             print('See https://github.com/893374759/crawler_TOEFL#%E5%AE%89%E8%A3%85%E6%96%B9%E5%BC%8F-install')
 77 |             raise
 78 | 
 79 |         self.wait = WebDriverWait(self.driver, timeout=50)
 80 |         self.CITY = None
 81 |         self.DATE = None
 82 | 
 83 |     def input_infos(self):
 84 |         """
 85 |         Enter username and password
 86 |         """
 87 |         self.driver.get(self.index_url)
 88 |         print("自动输入用户名和密码 Automatically enter username and password")
 89 |         # username
 90 |         time.sleep(2)
 91 |         input_name = self.wait.until(
 92 |             EC.presence_of_element_located((By.ID, "userName"))
 93 |         )
 94 |         input_name.clear()
 95 |         input_name.send_keys(self.username)
 96 |         # password
 97 |         input_pwd = self.wait.until(
 98 |             EC.presence_of_element_located((By.ID, "textPassword"))
 99 |         )
100 |         input_pwd.clear()
101 |         input_pwd.send_keys(self.password)
102 | 
103 |     def get_captcha(self):
104 |         """
105 |         get captcha, :return: captcha
106 |         """
107 |         print("等待加载验证码 Loading captcha...")
108 |         # 模拟点击
109 |         input_code = self.wait.until(
110 |             EC.element_to_be_clickable((By.ID, "verifyCode"))
111 |         )
112 |         self.hwnd = win32gui.FindWindow('MozillaWindowClass', '首页 - 教育部考试中心托福网上报名 - Mozilla Firefox')
113 |         win32api.keybd_event(27, 0, 0, 0)  # VK_code
114 |         win32gui.SetForegroundWindow(self.hwnd)
115 |         while True:
116 |             input_code.click()
117 |             time.sleep(4)
118 |             # get captcha link, send requests
119 |             src = self.wait.until(
120 |                 EC.presence_of_element_located((By.ID, "chkImg"))
121 |             )
122 |             time.sleep(2.5)
123 |             src_url = src.get_attribute("src")
124 |             print(src_url)
125 |             if (not ('loading' in src_url)) and (src_url is not None):
126 |                 break
127 | 
128 |         res = requests.get(src_url)
129 |         time.sleep(1.5)
130 |         with open('code.png', 'wb') as f:
131 |             f.write(res.content)
132 |         # Open local captcha, manually identify
133 |         try:
134 |             im = Image.open('code.png')
135 |             im.show()
136 |             im.close()
137 |         except:
138 |             print('到本地目录打开code.png获取验证码 Go local directory, open code.png to see captcha')
139 |         finally:
140 |             captcha = input('请输入验证码 Please enter the captcha:')
141 |             os.remove('code.png')
142 |             print('尝试登录中 Logging in...')
143 |         return captcha
144 | 
145 |     def login(self, code):
146 |         input_code = self.wait.until(
147 |             EC.presence_of_element_located((By.ID, "verifyCode"))
148 |         )
149 |         input_code.send_keys(code)
150 |         submit_button = self.wait.until(
151 |             EC.element_to_be_clickable((By.ID, "btnLogin"))
152 |         )
153 |         submit_button.click()
154 |         # Check if the login is successful
155 |         try:
156 |             #TODO: http 500 error
157 |             print(self.driver.title)
158 |             success = self.wait.until(
159 |                 EC.text_to_be_present_in_element((By.XPATH, '//div[@class="myhome_info_cn"]/span[2]'), self.username)
160 |             )
161 |             if success:
162 |                 print("==登录成功页面 Page Login Success==")
163 |         except:
164 |             self.input_infos()
165 |             code_str = self.get_captcha()
166 |             self.login(code_str)
167 | 
168 |     def find_seat(self):
169 |         print('开始考位查询 Turn to Page Find-Seat')
170 |         success = False
171 |         while not success:
172 |             self.driver.get("https://toefl.neea.cn/myHome/8625374/index#!/testSeat")
173 |             time.sleep(1)
174 |             try:
175 |                 success = self.wait.until(
176 |                     EC.text_to_be_present_in_element((By.XPATH, '//div[@class="span12"]/h4'), "查询条件")
177 |                 )
178 |                 if success:
179 |                     print("==考位查询页面 Page Find-Seat==")
180 |             except:
181 |                 success = False
182 | 
183 |         # self.driver.switch_to.alert.accept()
184 | 
185 |     def get_all_DATE(self):
186 |         CITYS, DATES = [], []
187 |         CITY = "上海"
188 |         time.sleep(1)
189 |         city = Select(self.driver.find_element_by_id("centerProvinceCity")).select_by_visible_text(CITY)
190 |         CITYS = self.driver.find_element_by_id("centerProvinceCity").text.split("\n")
191 |         del CITYS[0]
192 |         all_options = self.driver.find_element_by_id("testDays").find_elements_by_tag_name('option')
193 |         for option in all_options:
194 |             DATES.append(option.get_attribute("value"))
195 |         del DATES[0]
196 |         print("已获取全部城市、考试日期 get all test DATE/CITYs")
197 |         return [CITYS, DATES]
198 | 
199 |     def send_query_condition(self, virgin=False):
200 |         city = Select(self.driver.find_element_by_id("centerProvinceCity")).select_by_visible_text(self.CITY)
201 |         date = Select(self.driver.find_element_by_id("testDays")).select_by_value(self.DATE)
202 | 
203 |         if virgin:
204 |             click = False
205 |             while not click:
206 |                 try:
207 |                     win32api.keybd_event(27, 0, 0, 0)  # VK_code
208 |                     win32gui.SetForegroundWindow(self.hwnd)
209 |                     print("正在反-反爬虫, 或许需要您点一下火狐浏览器 Anti anti-crawler, you can click the Firefox browser...")
210 |                     scrool = random.randint(0, 100)
211 |                     self.driver.execute_script('window.scrollBy(0,%d)' % scrool)
212 |                     time.sleep(1)
213 |                     self.driver.execute_script('window.scrollBy(0,%d)' % -scrool)
214 | 
215 |                     query_button = self.wait.until(
216 |                         EC.element_to_be_clickable((By.ID, "btnQuerySeat"))
217 |                     )
218 |                     time.sleep(1)
219 |                     query_button.click()
220 |                     click = bool(WebDriverWait(self.driver, timeout=5).until(alert_or_success()))
221 |                 except:
222 |                     click = False
223 |         else:
224 |             time.sleep(0.2)
225 | 
226 |             query_button = self.wait.until(
227 |                 EC.element_to_be_clickable((By.ID, "btnQuerySeat"))
228 |             )
229 |             query_button.click()
230 | 
231 |     def save_date(self, i=1):
232 |         """
233 |         save to .csv
234 |         """
235 |         csv_fp = open("toefl_{}_check.csv".format(time.strftime('%Y-%m-%d', time.localtime(time.time()))), "a+",
236 |                       encoding='utf-8-sig', newline='')
237 |         writer = csv.writer(csv_fp)
238 |         try:
239 |             is_success = EC.text_to_be_present_in_element((By.XPATH, '//td[@style="text-align:center;vertical-align: middle"]'), s_city)(
240 |                 self.driver)
241 |         except:
242 |             is_success = 0
243 |         if bool(is_success):
244 |             # head 1: test date
245 |             boxhead1 = self.wait.until(
246 |                 EC.presence_of_all_elements_located(
247 |                     (By.XPATH, '//table[@class="table table-bordered table-striped"][{}]/thead/tr[1]/th/span'.format(i))
248 |                 )
249 |             )
250 |             head1_ls = []
251 |             for head1 in boxhead1:
252 |                 if not head1.text:
253 |                     continue
254 |                 head1_ls.append(head1.text)
255 |             writer.writerow(head1_ls)
256 |             print(head1_ls)
257 | 
258 |             # head 2
259 |             boxhead2 = self.wait.until(
260 |                 EC.presence_of_all_elements_located(
261 |                     (By.XPATH, '//table[@class="table table-bordered table-striped"][{}]/thead/tr[2]/th'.format(i))
262 |                 )
263 |             )
264 |             head2_ls = []
265 |             for head2 in boxhead2:
266 |                 head2_ls.append(head2.text.replace('\n', ''))
267 |             writer.writerow(head2_ls)
268 |             print(head2_ls)
269 | 
270 |             # inquiry form
271 |             items = self.wait.until(
272 |                 EC.presence_of_all_elements_located(
273 |                     (By.XPATH, '//table[@class="table table-bordered table-striped"][{}]/tbody/tr'.format(i))
274 |                 )
275 |             )
276 |             try:
277 |                 for item in items:
278 |                     body_dict = {}
279 |                     body_dict["test_city"] = item.find_element_by_xpath('./td[1]').text
280 |                     body_dict["test_venues"] = item.find_element_by_xpath('./td[2]').text
281 |                     body_dict["test_fee"] = item.find_element_by_xpath('./td[3]').text
282 |                     body_dict["test_seat"] = item.find_element_by_xpath('./td[4]').text
283 |                     writer.writerow(body_dict.values())
284 |                     print(body_dict)
285 |             except:
286 |                 items = self.wait.until(
287 |                     EC.presence_of_all_elements_located(
288 |                         (By.XPATH, '//table[@class="table table-bordered table-striped"][{}]/tbody/tr'.format(i))
289 |                     )
290 |                 )
291 |                 print(['refresh occur!'])
292 |                 writer.writerow(['refresh occur!'])
293 |                 for item in items:
294 |                     body_dict = {}
295 |                     body_dict["test_city"] = item.find_element_by_xpath('./td[1]').text
296 |                     body_dict["test_venues"] = item.find_element_by_xpath('./td[2]').text
297 |                     body_dict["test_fee"] = item.find_element_by_xpath('./td[3]').text
298 |                     body_dict["test_seat"] = item.find_element_by_xpath('./td[4]').text
299 |                     writer.writerow(body_dict.values())
300 |                     print(body_dict)
301 |         else:
302 |             null_line = [self.CITY, self.DATE, "未查询到考位信息"]
303 |             print(null_line)
304 |             writer.writerow(null_line)
305 |         csv_fp.close()
306 | 
307 | 
class alert_or_success:
    """Wait condition: the query produced either a result or an alert layer.

    Instances are callables meant for ``WebDriverWait(...).until(...)``.
    After a call that returned True, exactly one of ``is_success`` /
    ``is_alert`` is truthy and tells which of the two was detected.
    """

    def __init__(self):
        self.is_success, self.is_alert = 0, 0

    def __call__(self, driver):
        '''
        Check whether the page shows '考位查询结果' (result header) or the
        alert icon that accompanies '未查询到考位信息'.

        :param driver: the webdriver to inspect
        :return: True if either was found, otherwise False
        '''
        try:
            self.is_success = EC.text_to_be_present_in_element(
                (By.XPATH, '//div[@id="qrySeatResult"]/h4'), "考位查询结果")(driver)
        except Exception:
            # The result header could not be checked; look for the alert icon.
            # ``Exception`` (not a bare except) keeps Ctrl-C working.
            self.is_alert = EC.visibility_of_element_located(
                (By.XPATH, '//i[@class="layui-layer-ico layui-layer-ico0"]'))(driver)

        if self.is_success:
            self.is_alert = 0
            return True
        if self.is_alert:
            self.is_success = 0
            return True
        self.is_success, self.is_alert = 0, 0
        return False
331 | 
332 | 
if __name__ == "__main__":
    # Script entry point: log in, collect all cities/dates, then query every
    # (date, city) combination and append each result to the CSV file.
    GetToeflCrawler = GetToeflTestInfos()
    try:
        GetToeflCrawler.input_infos()
        captcha = GetToeflCrawler.get_captcha()
        GetToeflCrawler.login(captcha)
        GetToeflCrawler.find_seat()
        [CITYS, DATES] = GetToeflCrawler.get_all_DATE()
        CITYS.reverse()

        for s_date in DATES:
            for s_city in CITYS:
                GetToeflCrawler.CITY, GetToeflCrawler.DATE = s_city, s_date
                # Only the very first query gets the "virgin" treatment.
                virgin = [s_city, s_date] == [CITYS[0], DATES[0]]
                GetToeflCrawler.send_query_condition(virgin)
                # Block until the page shows a result or an alert layer.
                WebDriverWait(GetToeflCrawler.driver, timeout=50).until(alert_or_success())
                GetToeflCrawler.save_date(i=1)
    finally:
        # Close the browser even when a step above fails or is interrupted.
        GetToeflCrawler.driver.quit()
354 | 


--------------------------------------------------------------------------------