├── .idea
├── .gitignore
├── vcs.xml
├── misc.xml
├── inspectionProfiles
│ └── profiles_settings.xml
├── modules.xml
└── crawler_TOEFL.iml
├── README.md
└── crawler_toefl.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/crawler_TOEFL.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## NEEA托福考位爬虫 Getting Started with NEEA TOEFL Testseat Crawler
2 |
3 | 本文档简要介绍了NEEA托福考位本地爬虫的使用方法。
4 | This document briefly introduces how to use the NEEA TOEFL Test Seat Selenium Crawler.
5 |
6 | ### 动机 Motivation
7 | NEEA 托福考位网站正在提供着不便的服务。在寻找考位时,我们需要按每个日期,每个城市一个个地搜索考位,
8 | 这为那些想尽快找到测试座位的人带来了无法忍受的体验。
9 |
10 |
11 |
12 |
13 |
14 | 为什么不直接以表格形式显示所有考位?
15 |
16 | The [NEEA TOEFL](https://toefl.neea.cn/) test seat website, supported by the Chinese National Education
17 | Examinations Authority (NEEA), is inconvenient to use. When looking for a test seat,
18 | we have to search date by date and city by city, which is an intolerable experience for those
19 | who just want to find a test seat ASAP. Why not display all the test seats in a single table?
20 |
21 | ### 安装要求 Requirements
22 | - Firefox [mozilla geckodriver](https://github.com/mozilla/geckodriver/releases) v0.26.0
23 | - [How to install webdriver](https://www.cnblogs.com/LY-CC/p/11068244.html)
24 | - [Firefox](https://ftp.mozilla.org/pub/firefox/releases/) ≥ 60
25 | - pip install selenium
26 | #### 安装方式 Install
27 | - Firefox mozilla geckodriver: the default geckodriver path is "C:\Program Files\Mozilla Firefox\geckodriver.exe".
28 | To use a different executable path, start the crawler with **--webdriver_path='your path'**.
29 |
30 | - 默认Firefox mozilla geckodriver是安装在"C:\Program Files\Mozilla Firefox\geckodriver.exe"路径中,如果你希望使用其他路径,
31 | 请使用 **--webdriver_path='your path'** 来启动爬虫。
32 |
33 | ### Get started
34 | Default start:
35 | ```
36 | python crawler_toefl.py --username='NEEA ID number' --password='password'
37 | ```
38 | When the crawler finishes, you will get a .csv file. 爬虫完成后将得到.csv表格文件。
39 |
40 |
41 |
42 |
43 |
44 | ### Todo:
45 | 1. faster, test time is 30 min 爬虫速度太慢了, 爬完全部数据目前需要30分钟
46 | 2. headless mode 无界面模式怎么绕开反爬虫?
47 | 3. Anti anti-crawler when click the 'search seats' button 怎么绕开反爬虫?
48 | 4. online crawler (use a server) 在线爬虫(服务器)
49 | 5. different modes 用户定制化爬虫
50 |
51 | ### Acknowledgement
52 | This idea originally came from https://www.jianshu.com/p/2541d918869e, thanks!
53 |
--------------------------------------------------------------------------------
/crawler_toefl.py:
--------------------------------------------------------------------------------
1 | # *_*coding:utf-8 *_*
2 | # test on python 3.6
3 | # thanks https://www.jianshu.com/p/2541d918869e
4 | # version 1.0
5 | # author cambridge.mo@foxmail.com
6 | # month Jul 2020
7 |
8 | import os
9 | import csv
10 | import time
11 | import requests
12 | from PIL import Image
13 | from selenium import webdriver
14 | from selenium.webdriver.support.ui import WebDriverWait
15 | from selenium.webdriver.common.by import By
16 | from selenium.webdriver.support.select import Select
17 | from selenium.webdriver.support import expected_conditions as EC
18 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
19 | import random
20 | import win32gui
21 | import win32api
22 | import argparse
23 |
# Module-level placeholders for the city and test-date lists; the script
# entry point rebinds both names from the return value of get_all_DATE().
CITYS = []
DATES = []
26 |
27 |
def parse_args(argv=None):
    """Parse the crawler's command-line options.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``, which matches a
            plain ``parse_args()`` call.

    Returns:
        argparse.Namespace with the attributes ``USERNAME_TF``,
        ``PASSWORD_TF`` (both ``None`` when not given), ``headless``,
        ``eager`` (both ``False`` unless the flag is passed) and
        ``webdriver_path``.
    """
    parser = argparse.ArgumentParser(description='TOEFL crawler args')
    parser.add_argument('--username', dest='USERNAME_TF',
                        type=str, default=None)
    parser.add_argument('--password', dest='PASSWORD_TF',
                        type=str, default=None)
    parser.add_argument('--headless', dest='headless',
                        help='(Not suport in this version) start headless, browser will not display',
                        default=False, action='store_true')
    parser.add_argument('--eager', dest='eager',
                        help='eager mode (unstable!) is faster when loading web-page',
                        default=False, action='store_true')
    # Raw string: the Windows path contains backslashes that must not be
    # read as (invalid) escape sequences such as "\P" or "\g".
    parser.add_argument('--webdriver_path', dest='webdriver_path',
                        help='set Firefox webdriver path',
                        type=str, default=r"C:\Program Files\Mozilla Firefox\geckodriver.exe")

    # TODO: an integer --mode option was sketched here but is not implemented.
    return parser.parse_args(argv)
50 |
51 |
class GetToeflTestInfos():
    """Drive a Firefox session through the NEEA TOEFL site and dump seat tables to CSV.

    Call order used by the script entry point: ``input_infos`` ->
    ``get_captcha`` -> ``login`` -> ``find_seat`` -> ``get_all_DATE``, then
    for every city/date pair ``send_query_condition`` followed by
    ``save_date``.

    Attributes assigned by the caller before each query:
        CITY: visible text of the city option to select.
        DATE: ``value`` of the test-date option to select.
    """

    # XPath of the i-th result table; ``{}`` is filled with the table index.
    _TABLE_XPATH = '//table[@class="table table-bordered table-striped"][{}]'

    def __init__(self):
        """Read credentials (CLI first, then interactive prompt) and start Firefox.

        Raises:
            Exception: whatever ``webdriver.Firefox`` raised, re-raised after
                printing a hint about ``--webdriver_path``.
        """
        args = parse_args()
        self.username = args.USERNAME_TF
        self.password = args.PASSWORD_TF
        if self.username is None:
            self.username = input('请输入账户名 Please enter username:')
        if self.password is None:
            self.password = input('请输入密码 Please enter password:')
        self.index_url = "https://toefl.neea.cn/login"
        self.hwnd = None  # Win32 handle of the Firefox window; set by get_captcha()
        self.option = webdriver.FirefoxOptions()  # for anti-crawler, only FireFox can be used
        self.option.add_argument('--user-agent="Firefox/60.0"')
        if args.headless:
            self.option.add_argument('--headless')  # start 'headless', browser will not display
        if args.eager:
            # eager mode (unstable) is faster when loading web-page.
            # NOTE(review): this mutates the shared DesiredCapabilities.FIREFOX
            # dict and is never passed to the driver explicitly; it presumably
            # relies on the driver copying its defaults from that dict - confirm.
            DesiredCapabilities.FIREFOX["pageLoadStrategy"] = "eager"

        try:
            # NOTE(review): `executable_path` is the Selenium 3 keyword; confirm
            # the installed Selenium version still accepts it.
            self.driver = webdriver.Firefox(executable_path=args.webdriver_path, options=self.option)
        except Exception:
            print("Your webdriver executable path is wrong: Cannot start webdriver.")
            print("Please use --webdriver_path to set webdriver executable path")
            print('See https://github.com/893374759/crawler_TOEFL#%E5%AE%89%E8%A3%85%E6%96%B9%E5%BC%8F-install')
            raise

        self.wait = WebDriverWait(self.driver, timeout=50)
        self.CITY = None
        self.DATE = None

    def _focus_browser(self):
        """Send an Esc key-down event and bring the Firefox window to the foreground."""
        win32api.keybd_event(27, 0, 0, 0)  # 27 is the virtual-key code for Esc
        win32gui.SetForegroundWindow(self.hwnd)

    def _click_query_button(self, pause=0):
        """Wait for the query button to be clickable, optionally pause, then click it."""
        query_button = self.wait.until(
            EC.element_to_be_clickable((By.ID, "btnQuerySeat"))
        )
        if pause:
            time.sleep(pause)
        query_button.click()

    def input_infos(self):
        """Open the login page and type the username and password into the form."""
        self.driver.get(self.index_url)
        print("自动输入用户名和密码 Automatically enter username and password")
        time.sleep(2)
        for element_id, value in (("userName", self.username),
                                  ("textPassword", self.password)):
            field = self.wait.until(
                EC.presence_of_element_located((By.ID, element_id))
            )
            field.clear()
            field.send_keys(value)

    def get_captcha(self):
        """Download the captcha image, show it, and ask the user to type it in.

        The image is saved to ``code.png`` in the working directory and
        removed again once the user has answered the prompt.

        Returns:
            The captcha text entered by the user.
        """
        print("等待加载验证码 Loading captcha...")
        # Simulate a click on the captcha field so the site loads the image.
        input_code = self.wait.until(
            EC.element_to_be_clickable((By.ID, "verifyCode"))
        )
        self.hwnd = win32gui.FindWindow('MozillaWindowClass', '首页 - 教育部考试中心托福网上报名 - Mozilla Firefox')
        self._focus_browser()
        while True:
            input_code.click()
            time.sleep(4)
            src = self.wait.until(
                EC.presence_of_element_located((By.ID, "chkImg"))
            )
            time.sleep(2.5)
            src_url = src.get_attribute("src")
            print(src_url)
            # Check for a missing/empty src first: testing `'loading' in None`
            # would raise TypeError instead of retrying.
            if src_url and 'loading' not in src_url:
                break

        # A timeout keeps a stalled download from hanging the crawler forever.
        res = requests.get(src_url, timeout=30)
        time.sleep(1.5)
        with open('code.png', 'wb') as f:
            f.write(res.content)
        # Open the local captcha so the user can read it manually.
        try:
            im = Image.open('code.png')
            try:
                im.show()
            finally:
                im.close()
        except Exception:
            print('到本地目录打开code.png获取验证码 Go local directory, open code.png to see captcha')
        try:
            captcha = input('请输入验证码 Please enter the captcha:')
        finally:
            os.remove('code.png')
        print('尝试登录中 Logging in...')
        return captcha

    def login(self, code):
        """Submit the captcha and retry the whole login until it succeeds.

        Args:
            code: captcha text for the first attempt; later attempts ask the
                user again through ``get_captcha``.
        """
        # A loop replaces the former self-recursion so repeated failures
        # cannot exhaust the call stack.
        while True:
            input_code = self.wait.until(
                EC.presence_of_element_located((By.ID, "verifyCode"))
            )
            input_code.send_keys(code)
            submit_button = self.wait.until(
                EC.element_to_be_clickable((By.ID, "btnLogin"))
            )
            submit_button.click()
            # Login counts as successful once the username shows up on the page.
            try:
                # TODO: http 500 error
                print(self.driver.title)
                self.wait.until(
                    EC.text_to_be_present_in_element(
                        (By.XPATH, '//div[@class="myhome_info_cn"]/span[2]'), self.username)
                )
            except Exception:
                self.input_infos()
                code = self.get_captcha()
                continue
            print("==登录成功页面 Page Login Success==")
            return

    def find_seat(self):
        """Navigate to the seat-query page, reloading until its heading is shown."""
        print('开始考位查询 Turn to Page Find-Seat')
        while True:
            # NOTE(review): the numeric path segment is hard-coded - confirm it
            # is valid for accounts other than the author's.
            self.driver.get("https://toefl.neea.cn/myHome/8625374/index#!/testSeat")
            time.sleep(1)
            try:
                found = self.wait.until(
                    EC.text_to_be_present_in_element((By.XPATH, '//div[@class="span12"]/h4'), "查询条件")
                )
            except Exception:
                continue
            if found:
                print("==考位查询页面 Page Find-Seat==")
                return

    def get_all_DATE(self):
        """Collect every city name and test-date value offered by the query form.

        Returns:
            ``[cities, dates]``: the option texts of the city selector and the
            option values of the date selector, each without its first entry.
        """
        time.sleep(1)
        # A city is selected before the date options are read.
        Select(self.driver.find_element(By.ID, "centerProvinceCity")).select_by_visible_text("上海")
        city_texts = self.driver.find_element(By.ID, "centerProvinceCity").text.split("\n")
        date_options = self.driver.find_element(By.ID, "testDays").find_elements(By.TAG_NAME, 'option')
        date_values = [option.get_attribute("value") for option in date_options]
        # Slicing drops the first entry of each list without raising on an empty list.
        cities, dates = city_texts[1:], date_values[1:]
        print("已获取全部城市、考试日期 get all test DATE/CITYs")
        return [cities, dates]

    def send_query_condition(self, virgin=False):
        """Select ``self.CITY`` / ``self.DATE`` in the form and press the query button.

        Args:
            virgin: ``True`` for the first query of a session. The click is
                then repeated, with window focusing and a small random scroll,
                until a result heading or an alert appears.
        """
        Select(self.driver.find_element(By.ID, "centerProvinceCity")).select_by_visible_text(self.CITY)
        Select(self.driver.find_element(By.ID, "testDays")).select_by_value(self.DATE)

        if not virgin:
            time.sleep(0.2)
            self._click_query_button()
            return

        clicked = False
        while not clicked:
            try:
                self._focus_browser()
                print("正在反-反爬虫, 或许需要您点一下火狐浏览器 Anti anti-crawler, you can click the Firefox browser...")
                offset = random.randint(0, 100)
                self.driver.execute_script('window.scrollBy(0,%d)' % offset)
                time.sleep(1)
                self.driver.execute_script('window.scrollBy(0,%d)' % -offset)

                self._click_query_button(pause=1)
                clicked = bool(WebDriverWait(self.driver, timeout=5).until(alert_or_success()))
            except Exception:
                # Any failure (including the 5 s timeout) means "try again";
                # KeyboardInterrupt is not caught so Ctrl-C still stops the loop.
                clicked = False

    def _locate_all(self, xpath):
        """Wait until at least one element matches ``xpath`` and return all matches."""
        return self.wait.until(
            EC.presence_of_all_elements_located((By.XPATH, xpath))
        )

    @staticmethod
    def _write_rows(writer, rows):
        """Write city, venue, fee and seat cells of each table row to ``writer``."""
        for row in rows:
            record = {
                "test_city": row.find_element(By.XPATH, './td[1]').text,
                "test_venues": row.find_element(By.XPATH, './td[2]').text,
                "test_fee": row.find_element(By.XPATH, './td[3]').text,
                "test_seat": row.find_element(By.XPATH, './td[4]').text,
            }
            writer.writerow(list(record.values()))
            print(record)

    def save_date(self, i=1):
        """Append the currently displayed query result to today's CSV file.

        The file is ``toefl_<YYYY-MM-DD>_check.csv`` in the working directory.
        When a centred table cell contains ``self.CITY`` the two header rows
        and all body rows of result table ``i`` are written; otherwise one
        "no seat information" line is written.

        Args:
            i: 1-based index of the result table to read.
        """
        table = self._TABLE_XPATH.format(i)
        rows_xpath = table + '/tbody/tr'
        csv_name = "toefl_{}_check.csv".format(time.strftime('%Y-%m-%d', time.localtime(time.time())))
        # `with` closes the file even when scraping raises half-way through.
        with open(csv_name, "a+", encoding='utf-8-sig', newline='') as csv_fp:
            writer = csv.writer(csv_fp)
            try:
                # Use the instance's own city instead of a module-level global.
                has_result = EC.text_to_be_present_in_element(
                    (By.XPATH, '//td[@style="text-align:center;vertical-align: middle"]'), self.CITY)(
                    self.driver)
            except Exception:
                has_result = False

            if not has_result:
                null_line = [self.CITY, self.DATE, "未查询到考位信息"]
                print(null_line)
                writer.writerow(null_line)
                return

            # head 1: test date
            head1_ls = [cell.text for cell in self._locate_all(table + '/thead/tr[1]/th/span')
                        if cell.text]
            writer.writerow(head1_ls)
            print(head1_ls)

            # head 2: column titles
            head2_ls = [cell.text.replace('\n', '')
                        for cell in self._locate_all(table + '/thead/tr[2]/th')]
            writer.writerow(head2_ls)
            print(head2_ls)

            # inquiry form
            rows = self._locate_all(rows_xpath)
            try:
                self._write_rows(writer, rows)
            except Exception:
                # Reading the rows failed part-way (the message below suggests a
                # page refresh); mark it in the CSV and read the rows once more.
                rows = self._locate_all(rows_xpath)
                print(['refresh occur!'])
                writer.writerow(['refresh occur!'])
                self._write_rows(writer, rows)
306 |
307 |
class alert_or_success:
    """Wait condition that is true once a seat query has produced an outcome.

    Intended for ``WebDriverWait(...).until(alert_or_success())``. After a
    call, ``is_success`` is truthy when the result heading was found and
    ``is_alert`` is truthy when the alert icon was visible; at most one of
    them is truthy.
    """

    def __init__(self):
        self.is_success, self.is_alert = 0, 0

    def __call__(self, driver):
        """Check for the '考位查询结果' heading or the '未查询到考位信息' alert.

        Args:
            driver: the WebDriver instance passed in by ``WebDriverWait``.

        Returns:
            True if either the result heading or the alert icon is present,
            False otherwise.
        """
        # Start every poll from a clean state so an earlier poll cannot leak in.
        self.is_success, self.is_alert = 0, 0
        try:
            self.is_success = EC.text_to_be_present_in_element(
                (By.XPATH, '//div[@id="qrySeatResult"]/h4'), "考位查询结果")(driver)
        except Exception:
            # Heading could not be read; fall through to the alert check.
            # KeyboardInterrupt is deliberately not caught here.
            self.is_success = 0

        if self.is_success:
            return True

        # Look for the alert whenever the heading has not matched, not only
        # when the heading lookup raised. Errors from this lookup propagate to
        # the caller, as they did before.
        self.is_alert = EC.visibility_of_element_located(
            (By.XPATH, '//i[@class="layui-layer-ico layui-layer-ico0"]'))(driver)
        if self.is_alert:
            self.is_success = 0
            return True

        self.is_success, self.is_alert = 0, 0
        return False
331 |
332 |
if __name__ == "__main__":
    # Script entry point: log in, enumerate every city/date pair, query each
    # one and append the outcome to the CSV file.
    GetToeflCrawler = GetToeflTestInfos()
    try:
        GetToeflCrawler.input_infos()
        captcha = GetToeflCrawler.get_captcha()
        GetToeflCrawler.login(captcha)
        GetToeflCrawler.find_seat()
        [CITYS, DATES] = GetToeflCrawler.get_all_DATE()
        CITYS.reverse()

        # The loop variables stay at module level under these exact names
        # because other code in this file may read them as globals.
        for s_date in DATES:
            for s_city in CITYS:
                GetToeflCrawler.CITY, GetToeflCrawler.DATE = s_city, s_date
                # The first city/date pair gets the slower anti-anti-crawler click path.
                virgin = (s_city == CITYS[0] and s_date == DATES[0])
                GetToeflCrawler.send_query_condition(virgin)
                # Block until the page shows either a result table or an alert.
                WebDriverWait(GetToeflCrawler.driver, timeout=50).until(alert_or_success())
                GetToeflCrawler.save_date(i=1)
    finally:
        # Always close the browser, even when the crawl aborts with an error.
        GetToeflCrawler.driver.quit()
354 |
--------------------------------------------------------------------------------