├── .gitignore
├── crawl.py
├── demo.html
├── demo.ipynb
├── main.py
└── readme.md
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 |
--------------------------------------------------------------------------------
/crawl.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | from logging import handlers
3 | import pathlib
4 | import shutil
5 | import time
6 | import logging
7 | import os
8 | import re
9 | import tqdm
10 | from selenium.webdriver.common.by import By
11 | from selenium.webdriver.support import wait, expected_conditions
12 |
13 |
def wait_for_login(driver):
    '''Wait for the user to log in manually when WoS cannot be accessed directly.

    driver: a selenium webdriver handle pointed at webofscience.com.
    If an institutional (Shibboleth) login form is detected, block on input()
    until the user confirms the login is done; otherwise return immediately.
    '''
    try:
        # The login form is only rendered when authentication is required.
        driver.find_element(By.XPATH, '//div[contains(@class, "shibboleth-login-form")]')
        input('Login before going next...\n')
    except Exception:
        # No login form: the site is directly accessible, nothing to do.
        pass
21 |
22 |
def switch_language_to_Eng(driver):
    '''Switch the WoS interface language from Simplified Chinese to English.

    Waits for the main search box (page loaded), dismisses guide pop-ups,
    then clicks the language dropdown. The click sequence is retried once
    because a pendo guide window may intercept the first click.
    '''
    wait.WebDriverWait(driver, 10).until(
        expected_conditions.presence_of_element_located((By.XPATH, '//*[contains(@name, "search-main-box")]')))

    close_pendo_windows(driver)
    try:
        # Open the language menu (labelled with the current language) and pick English.
        driver.find_element(By.XPATH, '//*[normalize-space(text())="简体中文"]').click()
        driver.find_element(By.XPATH, '//button[@lang="en"]').click()
    except Exception:
        # A pop-up probably swallowed the click; close it and retry once.
        close_pendo_windows(driver)
        driver.find_element(By.XPATH, '//*[normalize-space(text())="简体中文"]').click()
        driver.find_element(By.XPATH, '//button[@lang="en"]').click()
37 |
38 |
def close_pendo_windows(driver):
    '''Best-effort dismissal of pendo.io guide pop-ups and overlays.

    Any subset of these elements may be present at any time, so each click
    is wrapped in its own try/except and a missing element is just skipped.
    '''
    # Cookie consent banner
    try:
        driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click()
    except Exception:
        pass
    # "Got it"
    try:
        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-primaryButton")]').click()
    except Exception:
        pass
    # "No thanks"
    try:
        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-secondaryButton")]').click()
    except Exception:
        pass
    # Close icon on a guide window.
    # FIX: the original XPath was missing its closing "]" and could never match.
    try:
        driver.find_element(By.XPATH, '//span[contains(@class, "_pendo-close-guide")]').click()
    except Exception:
        pass
    # Overlay
    try:
        driver.find_element(By.XPATH, '//div[contains(@class, "cdk-overlay-container")]').click()
    except Exception:
        pass
    # Overlay dialog
    try:
        driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-close-guide")]').click()
    except Exception:
        pass
71 |
72 |
def mark_flag(path):
    '''Drop a "completed.flag" file into `path` to record that the task finished.'''
    flag_file = os.path.join(path, 'completed.flag')
    with open(flag_file, 'w') as flag:
        flag.write('1')
77 |
78 |
def check_flag(path):
    '''Return True when `path` exists and already contains the completion flag.'''
    if not os.path.exists(path):
        return False
    return 'completed.flag' in os.listdir(path)
82 |
83 |
def search_query(driver, path, query):
    '''Go to the advanced-search page, type `query` into the search box and run it.

    driver: selenium webdriver handle.
    path:   destination folder for this task (created if missing); may be
            None when nothing should be written to disk.
    query:  WoS advanced-search query string, e.g. "TI=(pFind) AND PY=(2016-2022)".
    Returns True when a result page loaded, False on no results or failure.
    '''
    if path is not None:
        os.makedirs(path, exist_ok=True)
    logging.info(path)

    # Close extra browser windows left over from a previous task.
    if len(driver.window_handles) != 1:
        handles = driver.window_handles
        for i_handle in range(len(handles) - 1, 0, -1):  # traverse in reverse order
            # Switch to the window and close it
            driver.switch_to.window(handles[i_handle])
            driver.close()
        driver.switch_to.window(handles[0])

    ## Search query
    driver.get("https://www.webofscience.com/wos/alldb/advanced-search")
    max_retry = 3
    retry_times = 0
    while True:
        try:
            close_pendo_windows(driver)
            # Wait until the search form ("Clear" button) is present.
            wait.WebDriverWait(driver, 10).until(
                expected_conditions.presence_of_element_located((By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]')))

            # Clear the field
            driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]').click()
            # Insert the query
            driver.find_element(By.XPATH, '//*[@id="advancedSearchInputArea"]').send_keys("{}".format(query))
            # Click on the search button
            driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Search "]').click()
            break
        except Exception:
            retry_times += 1
            if retry_times > max_retry:
                logging.error("Search exceeded max retries")
                return False
            else:
                # Retry
                logging.debug("Search retrying")
    # Wait for the query page (record title links) to render.
    try:
        wait.WebDriverWait(driver, 5).until(
            expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link')))
    except Exception:
        try:
            # No results: mark the task completed so it is skipped next run.
            driver.find_element(By.XPATH, '//*[text()="Your search found no results"]')
            logging.warning(f'Your search found no results')
            if path is not None:
                mark_flag(path)
            return False
        except Exception:
            # Neither results nor the "no results" banner: log the error code shown.
            driver.find_element(By.XPATH, '//div[contains(@class, "error-code")]')
            logging.error(driver.find_element(By.XPATH, '//div[contains(@class, "error-code")]').text)
            return False
    # Go to the next step
    return True
145 |
146 |
def download_outbound(driver, default_download_path):
    '''Export the search results as outbound. The file is downloaded to default path set for the system.

    driver: selenium webdriver handle currently showing a result page.
    default_download_path: full path of the file the browser will create
        (e.g. ".../Downloads/savedrecs.txt"); asserted not to exist yet.
    Returns True once the file exists, False after max_retry failed attempts.
    '''
    max_retry = 3
    retry_times = 0
    while True:
        close_pendo_windows(driver)
        # Not support search for more than 1000 results yet
        assert int(driver.find_element(By.XPATH, '//span[contains(@class, "end-page")]').text) < 1000, "Sorry, too many results!"
        # File should not exist on default download folder
        assert not os.path.exists(default_download_path), "File existed on default download folder!"
        try:
            # Click on "Export"
            driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Plain text file "按钮 label varies, see both branches below' if False else '//span[contains(@class, "mat-button-wrapper") and text()=" Export "]').click()
            # Click on "Plain text file"
            try:
                driver.find_element(By.XPATH, '//button[contains(@class, "mat-menu-item") and text()=" Plain text file "]').click()
            except:
                driver.find_element(By.XPATH, '//button[contains(@class, "mat-menu-item") and @aria-label="Plain text file"]').click()
            # Click on "Records from:"
            driver.find_element(By.XPATH, '//*[text()[contains(string(), "Records from:")]]').click()
            # Click on "Export"
            driver.find_element(By.XPATH, '//span[contains(@class, "ng-star-inserted") and text()="Export"]').click()
            # Wait for download to complete
            for retry_download in range(4):
                time.sleep(2)
                try:
                    # If there is any "Internal error", click "Export" again
                    wait.WebDriverWait(driver, 2).until(
                        expected_conditions.presence_of_element_located((By.XPATH, '//div[text()="Server encountered an internal error"]')))
                    driver.find_element(By.XPATH, '//div[text()="Server encountered an internal error"]')
                    driver.find_element(By.XPATH, '//*[contains(@class, "ng-star-inserted") and text()="Export"]').click()
                except:
                    # No error dialog: stop polling as soon as the file appears.
                    if os.path.exists(default_download_path):
                        break
            # Download completed
            assert os.path.exists(default_download_path), "File not found!"
            return True
        except:
            retry_times += 1
            if retry_times > max_retry:
                logging.error("Crawl outbound exceeded max retries")
                return False
            else:
                # Retry
                logging.debug("Crawl outbound retrying")
                close_pendo_windows(driver)
                # Click on "Cancel" to dismiss the export dialog, or reload as fallback
                try:
                    driver.find_element(By.XPATH, '//*[contains(@class, "mat-button-wrapper") and text()="Cancel "]').click()
                except:
                    driver.refresh()
                    time.sleep(1)
                    wait.WebDriverWait(driver, 10).until(
                        expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link')))
                continue
202 |
203 |
def process_outbound(driver, default_download_path, dst_path):
    '''Move the exported outbound file to its destination and sanity-check it.

    The number of "ER" record terminators in the file must equal the result
    count displayed on the page, otherwise an AssertionError is raised.
    Returns True on success.
    '''
    assert os.path.exists(default_download_path), "File not found!"
    # When the destination is a folder, store the outbound as record.txt inside it.
    if pathlib.Path(dst_path).is_dir():
        dst_path = os.path.join(dst_path, 'record.txt')
    shutil.move(default_download_path, dst_path)
    logging.debug(f'Outbound saved in {dst_path}')

    # Compare the record count in the file with the count shown on the page.
    with open(dst_path, "r", encoding='utf-8') as f_outbound:
        n_record_ref = len(re.findall("\nER\n", f_outbound.read()))
    displayed = driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text
    # int("".join(...split(","))) strips thousands separators, e.g. "1,234" -> 1234
    assert n_record_ref == int("".join(displayed.split(","))), "Records num do not match outbound num"
    return True
219 |
220 |
def download_record(driver, path, records_id):
    '''Save the current record page's HTML source as record-<records_id>.html under `path`.'''
    # Wait for the record title so we know the page finished loading; raises on timeout.
    wait.WebDriverWait(driver, 10).until(
        expected_conditions.presence_of_element_located((By.XPATH, '//h2[contains(@class, "title")]')))

    target = os.path.join(path, f'record-{records_id}.html')
    with open(target, 'w', encoding='utf-8') as out:
        out.write(driver.page_source)
    logging.debug(f'record #{records_id} saved in {path}')
231 |
232 |
def process_record(driver, path, records_id):
    '''Expand the author list on the current record page and save the raw page.

    The raw HTML is written to record-<records_id>.dat under `path`; parsing
    of statistics happens downstream on these saved files.
    '''
    # Click "...More" to unfold the full author list; the button is absent
    # for short author lists, in which case the click is simply skipped.
    try:
        driver.find_element(By.XPATH, '//*[text()="...More"]').click()
    except Exception:
        pass
    with open(os.path.join(path, f'record-{records_id}.dat'), 'w', encoding='utf-8') as file:
        file.write(driver.page_source)
    logging.debug(f'record #{records_id} saved in {path}')
243 |
244 |
def roll_down(driver, fold = 40):
    '''Scroll the page downwards step by step so lazily-loaded results render.

    Issues `fold` scroll commands, each 500px further down than the last,
    pausing 0.1s between steps to give the page time to load content.
    '''
    offset = 0
    for _ in range(fold):
        time.sleep(0.1)
        offset += 500
        driver.execute_script(f'window.scrollTo(0, {offset});')
250 |
251 |
def save_screenshot(driver, prefix, pic_path):
    """Take a screenshot and store it as <pic_path><prefix>_<timestamp>.png"""
    # The timestamp keeps successive shots with the same prefix from overwriting each other.
    stamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
    driver.save_screenshot(f'{pic_path}{str(prefix)}_{stamp}.png')
258 |
259 |
def process_windows(driver, path, records_id):
    '''Download and parse every open record subpage, then close them all.

    driver:     selenium webdriver handle; window 0 is the result list and
                every other window holds one opened record page.
    path:       folder the record files are written into.
    records_id: id assigned to the first processed record; subsequent records
                get consecutive ids.
    Returns the number of subwindows processed, or -1 when any record failed
    (callers treat -1 as "abort this task").
    '''
    handles = driver.window_handles
    has_error = False
    for i_handle in range(len(driver.window_handles)-1, 0, -1): # traverse in reverse order
        # Switch to the window and load the page
        driver.switch_to.window(handles[i_handle])
        close_pendo_windows(driver)
        try:
            download_record(driver, path, records_id)
            process_record(driver, path, records_id)
        except:
            logging.error("Record downloading failed!")
            has_error = True
        # Ids advance even after a failure so file names stay unique.
        records_id += 1
        driver.close()
    # Return focus to the result-list window before handing control back.
    driver.switch_to.window(handles[0])
    return len(handles) - 1 if not has_error else -1
278 |
279 |
def process_records(driver, path):
    '''Open records as new subpages, download or parse subpages according to the setting.

    Walks through every result page (50 records per page), opens each record
    link in its own window, and hands batches of open windows to
    process_windows(). Returns True on success, False as soon as a batch
    reports a failure (-1).
    '''
    # init: the total record count is shown in the "brand-blue" span
    n_record = int(driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text)
    n_page = (n_record + 50 - 1) // 50  # ceiling division: 50 records per page
    assert n_page < 2000, "Too many pages"
    logging.debug(f'{n_record} records found, divided into {n_page} pages')

    records_id = 0
    url_set = set()  # hrefs already opened, so duplicate links to one record are skipped
    for i_page in range(n_page):
        assert len(driver.window_handles) == 1, "Unexpected windows"
        roll_down(driver)

        # Open every record in a new window
        windows_count = 0
        for record in driver.find_elements(By.XPATH, '//a[contains(@data-ta, "summary-record-title-link")]'):
            if record.get_attribute("href") in url_set:
                # coz some records have more than 1 href link
                continue
            else:
                url_set.add(record.get_attribute("href"))
                time.sleep(0.5)
                driver.execute_script(f'window.open(\"{record.get_attribute("href")}\");')
                windows_count += 1
            # Flush in batches (every 5 windows once at least 10 are open)
            # so the browser never holds too many windows at once.
            if windows_count >= 10 and not windows_count % 5:
                # Save records and close windows
                increment = process_windows(driver, path, records_id)
                if increment != -1:
                    records_id += increment
                else:
                    return False
                time.sleep(5)

        # Save records and close windows (remainder of the last batch)
        increment = process_windows(driver, path, records_id)
        if increment != -1:
            records_id += increment
        else:
            return False
        # Go to the next page
        if i_page + 1 < n_page:
            driver.find_element(By.XPATH, '//mat-icon[contains(@svgicon, "arrowRight")]').click()
    return True
324 |
325 |
def start_session(driver, task_list, default_download_path):
    '''
    Start the search of all tasks.
    driver: the handle of a selenium.webdriver object
    task_list: the zip of save paths and advanced query strings
    default_download_path: the default path set for the system, for example, C://Downloads/
    '''

    # Init: one timestamped log file per run, under ./logs/
    os.makedirs('logs', exist_ok=True)
    logging.basicConfig(level=logging.INFO,
                        filename=os.getcwd() + '/logs/log' + time.strftime('%Y%m%d%H%M',
                                                                           time.localtime(time.time())) + '.log',
                        filemode="w",
                        format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s"
                        )

    # The browser always names the export "savedrecs.txt" in its download folder.
    if not default_download_path.endswith("/savedrecs.txt"):
        default_download_path += "/savedrecs.txt"
    driver.get("https://www.webofscience.com/")
    wait_for_login(driver)
    # switch_language_to_Eng(driver)

    # Start Query
    for path, query in tqdm.tqdm(task_list):
        # Skip tasks already flagged as completed by a previous run.
        if path is not None and check_flag(path):
            continue

        # Search query; skip this task if the search failed or found nothing.
        if not search_query(driver, path, query):
            continue

        # Download the outbound
        if not download_outbound(driver, default_download_path):
            continue

        # Deal with the outbound
        if not process_outbound(driver, default_download_path, path):
            continue

        # Deal with records
        if not process_records(driver, path):
            continue

        # Search completed: leave a flag so the task is skipped next run.
        if path is not None:
            mark_flag(path)

    driver.quit()
375 |
--------------------------------------------------------------------------------
/demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## If you want to follow the procedure by functions, you can run the code below by sequence..."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Load packages"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 4,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "# encoding: utf-8\n",
24 | "from selenium import webdriver\n",
25 | "import shutil\n",
26 | "import time\n",
27 | "import logging\n",
28 | "import os\n",
29 | "import re\n",
30 | "import tqdm\n",
31 | "from selenium.webdriver.common.by import By\n",
32 | "from selenium.webdriver.support import wait, expected_conditions"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## Set up param"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 6,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stderr",
49 | "output_type": "stream",
50 | "text": [
51 | ":8: DeprecationWarning: executable_path has been deprecated, please pass in a Service object\n",
52 | " driver = webdriver.Chrome(\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "\n",
58 | "################ Set up parameters here #####################\n",
59 | "default_download_path = \"D:/Downloads/\"\n",
60 | " # The first string should be the path where your file is downloaded to by default.\n",
61 | " # Most likely, it should be like: \"C://Users/usr_name/Downloads\"\n",
62 | "path = \"results/search_1\"\n",
63 | "query = \"TI=(pFind) AND PY=(2016-2022)\"\n",
64 | " # These are the task to be searched\n",
65 | "driver = webdriver.Chrome(\n",
66 | " executable_path='C://Program Files//Google//Chrome//Application//chromedriver.exe'\n",
67 | " # This is the path where you place your chromedriver\n",
68 | ")\n",
69 | "#############################################################\n",
70 | "\n",
71 | "if not default_download_path.endswith(\"/savedrecs.txt\"):\n",
72 | " default_download_path += \"/savedrecs.txt\"\n",
73 | "driver.get(\"https://www.webofscience.com/\")"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "## Login in if your ip cannot visit WoS"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 7,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# Login\n",
90 | "try:\n",
91 | " driver.find_element(By.XPATH, '//div[contains(@class, \"shibboleth-login-form\")]')\n",
92 | " input('Login before going next...\\n')\n",
93 | "except:\n",
94 | " pass"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "## Close the annoying pop windows"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 8,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "def close_pendo_windows(driver):\n",
111 | " '''Close guiding windows'''\n",
112 | " # Cookies\n",
113 | " try:\n",
114 | " driver.find_element(By.XPATH, '//*[@id=\"onetrust-accept-btn-handler\"]').click()\n",
115 | " except:\n",
116 | " pass\n",
117 | " # \"Got it\"\n",
118 | " try:\n",
119 | " driver.find_element(By.XPATH, '//button[contains(@class, \"_pendo-button-primaryButton\")]').click()\n",
120 | " except:\n",
121 | " pass\n",
122 | " # \"No thanks\"\n",
123 | " try:\n",
124 | " driver.find_element(By.XPATH, '//button[contains(@class, \"_pendo-button-secondaryButton\")]').click()\n",
125 | " except:\n",
126 | " pass\n",
127 | " # What was it... I forgot...\n",
128 | " try:\n",
129 |     "        driver.find_element(By.XPATH, '//span[contains(@class, \"_pendo-close-guide\")]').click()\n",
130 | " except:\n",
131 | " pass\n",
132 | " # Overlay\n",
133 | " try:\n",
134 |     "        driver.find_element(By.XPATH, '//div[contains(@class, \"cdk-overlay-container\")]').click()\n",
135 | " except:\n",
136 | " pass "
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "## Switch language to English"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 10,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "\n",
153 | " wait.WebDriverWait(driver, 10).until(\n",
154 | " expected_conditions.presence_of_element_located((By.XPATH, '//*[contains(@name, \"search-main-box\")]')))\n",
155 | "\n",
156 | " close_pendo_windows(driver)\n",
157 | " try:\n",
158 | " driver.find_element(By.XPATH, '//*[normalize-space(text())=\"简体中文\"]').click()\n",
159 | " driver.find_element(By.XPATH, '//button[@lang=\"en\"]').click()\n",
160 | " except:\n",
161 | " close_pendo_windows(driver)\n",
162 | " driver.find_element(By.XPATH, '//*[normalize-space(text())=\"简体中文\"]').click()\n",
163 | " driver.find_element(By.XPATH, '//button[@lang=\"en\"]').click()"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "## Now open the search page and insert the query!\n",
171 | "ps: If this block does not run successfully, run it again."
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 11,
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "data": {
181 | "text/plain": [
182 | ""
183 | ]
184 | },
185 | "execution_count": 11,
186 | "metadata": {},
187 | "output_type": "execute_result"
188 | }
189 | ],
190 | "source": [
191 | "close_pendo_windows(driver)\n",
192 | "os.makedirs(path, exist_ok=True)\n",
193 | "\n",
194 | "driver.get(\"https://www.webofscience.com/wos/alldb/advanced-search\")\n",
195 | "\n",
196 | "close_pendo_windows(driver)\n",
197 | "# Load the page\n",
198 | "wait.WebDriverWait(driver, 10).until(\n",
199 | " expected_conditions.presence_of_element_located((By.XPATH, '//span[contains(@class, \"mat-button-wrapper\") and text()=\" Clear \"]')))\n",
200 | "\n",
201 | "# Clear the field\n",
202 | "driver.find_element(By.XPATH, '//span[contains(@class, \"mat-button-wrapper\") and text()=\" Clear \"]').click()\n",
203 | "# Insert the query\n",
204 | "driver.find_element(By.XPATH, '//*[@id=\"advancedSearchInputArea\"]').send_keys(\"{}\".format(query))\n",
205 | "# Click on the search button\n",
206 | "driver.find_element(By.XPATH, '//span[contains(@class, \"mat-button-wrapper\") and text()=\" Search \"]').click()\n",
207 | "\n",
208 | "# Wait for the query page\n",
209 | "wait.WebDriverWait(driver, 5).until(\n",
210 | " expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link')))\n"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "## Export the search results as outbound\n",
218 | "ps: If this block does not run successfully, run it again."
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 16,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "close_pendo_windows(driver)\n",
228 | "try:\n",
229 | " driver.find_element(By.XPATH, '//*[contains(@class, \"mat-button-wrapper\") and text()=\"Cancel \"]').click()\n",
230 | "except:\n",
231 | " pass\n",
232 | "# Not support search for more than 1000 results yet\n",
233 | "assert int(driver.find_element(By.XPATH, '//span[contains(@class, \"end-page\")]').text) < 1000, \"Sorry, too many results!\"\n",
234 | "# File should not exist on default download folder\n",
235 | "assert not os.path.exists(default_download_path), \"File existed on default download folder!\"\n",
236 | "\n",
237 | "# Click on \"Export\" \n",
238 | "driver.find_element(By.XPATH, '//span[contains(@class, \"mat-button-wrapper\") and text()=\" Export \"]').click()\n",
239 | "time.sleep(0.5)\n",
240 | "# Click on \"Plain text file\" \n",
241 | "try:\n",
242 | " driver.find_element(By.XPATH, '//button[contains(@class, \"mat-menu-item\") and text()=\" Plain text file \"]').click()\n",
243 | "except:\n",
244 | " driver.find_element(By.XPATH, '//button[contains(@class, \"mat-menu-item\") and @aria-label=\"Plain text file\"]').click()\n",
245 | "# Click on \"Records from:\"\n",
246 | "driver.find_element(By.XPATH, '//*[text()[contains(string(), \"Records from:\")]]').click()\n",
247 | "# Click on \"Export\"\n",
248 | "driver.find_element(By.XPATH, '//span[contains(@class, \"ng-star-inserted\") and text()=\"Export\"]').click()\n",
249 | "# Wait for download to complete\n",
250 | "for retry_download in range(4):\n",
251 | " time.sleep(1)\n",
252 | " try:\n",
253 | " # If there is any \"Internal error\"\n",
254 | " wait.WebDriverWait(driver, 2).until(\n",
255 | " expected_conditions.presence_of_element_located((By.XPATH, '//div[text()=\"Server encountered an internal error\"]'))) \n",
256 | " driver.find_element(By.XPATH, '//div[text()=\"Server encountered an internal error\"]')\n",
257 | " driver.find_element(By.XPATH, '//*[contains(@class, \"ng-star-inserted\") and text()=\"Export\"]').click()\n",
258 | " except:\n",
259 | " if os.path.exists(default_download_path):\n",
260 | " break\n",
261 | "# Download completed\n",
262 | "assert os.path.exists(default_download_path), \"File not found! Run this block again.\""
263 | ]
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "## Move the outbound to dest folder. Now there should be a file at results/search_1/record.txt"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 17,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "shutil.move(default_download_path, os.path.join(path, 'record.txt'))\n",
279 | "logging.debug(f'Outbound saved in {path}')\n",
280 | "# Load the downloaded outbound (for debug)\n",
281 | "with open(os.path.join(path, 'record.txt'), \"r\", encoding='utf-8') as f_outbound:\n",
282 | " n_record_ref = len(re.findall(\"\\nER\\n\", f_outbound.read()))\n",
283 | " assert n_record_ref == int(\"\".join(driver.find_element(By.XPATH, '//span[contains(@class, \"brand-blue\")]').text.split(\",\"))), \"Records num do not match outbound num\""
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {},
289 | "source": [
290 | "## Below are some functions required for the next block"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 18,
296 | "metadata": {},
297 | "outputs": [],
298 | "source": [
299 | "\n",
300 | "def download_record(driver, path, records_id):\n",
301 | " '''Download a page'''\n",
302 | " # Load the page or throw exception\n",
303 | " wait.WebDriverWait(driver, 10).until(\n",
304 | " expected_conditions.presence_of_element_located((By.XPATH, '//h2[contains(@class, \"title\")]')))\n",
305 | "\n",
306 | " # Download the record\n",
307 | " with open(os.path.join(path, f'record-{records_id}.html'), 'w', encoding='utf-8') as file:\n",
308 | " file.write(driver.page_source)\n",
309 | " logging.debug(f'record #{records_id} saved in {path}')\n",
310 | "\n",
311 | "def process_record(driver, path, records_id):\n",
312 | " '''Parse a page'''\n",
313 | " # Show all authors and save raw data\n",
314 | " try:\n",
315 | " driver.find_element(By.XPATH, '//button[text()=\"...More\"]').click() \n",
316 | " except:\n",
317 | " pass\n",
318 | " with open(os.path.join(path, f'record-{records_id}.dat'), 'w', encoding='utf-8') as file:\n",
319 | " file.write(driver.page_source)\n",
320 | " logging.debug(f'record #{records_id} saved in {path}') \n",
321 | "\n",
322 | "\n",
323 | "def process_windows(driver, path, records_id):\n",
324 | " '''Process all subpages'''\n",
325 | " handles = driver.window_handles\n",
326 | " has_error = False\n",
327 | " for i_handle in range(len(driver.window_handles)-1, 0, -1): # traverse in reverse order\n",
328 | " # Switch to the window and load the page\n",
329 | " driver.switch_to.window(handles[i_handle])\n",
330 | " close_pendo_windows(driver)\n",
331 | " try:\n",
332 | " download_record(driver, path, records_id)\n",
333 | " process_record(driver, path, records_id)\n",
334 | " except:\n",
335 | " logging.error(\"Record downloading failed!\")\n",
336 | " has_error = True\n",
337 | " records_id += 1\n",
338 | " driver.close()\n",
339 | " driver.switch_to.window(handles[0])\n",
340 | " return len(handles) - 1 if not has_error else -1"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "## What are going to do next: click on all the results in new windows, switch to the window and save the page!"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 19,
353 | "metadata": {},
354 | "outputs": [],
355 | "source": [
356 | "# Deal with records\n",
357 | "# init\n",
358 | "n_record = int(driver.find_element(By.XPATH, '//span[contains(@class, \"brand-blue\")]').text)\n",
359 | "n_page = (n_record + 50 - 1) // 50\n",
360 | "assert n_page < 2000, \"Too many pages\"\n",
361 | "logging.debug(f'{n_record} records found, divided into {n_page} pages')\n",
362 | "\n",
363 | "records_id = 0\n",
364 | "url_set = set()\n",
365 | "for i_page in range(n_page):\n",
366 | " assert len(driver.window_handles) == 1, \"Unexpected windows\"\n",
367 | " # Roll down to the bottom of the page to show all results\n",
368 | " for i_roll in range(1, 41):\n",
369 | " time.sleep(0.1)\n",
370 | " driver.execute_script(f'window.scrollTo(0, {i_roll * 500});') \n",
371 | " \n",
372 | " # Open every record in a new window\n",
373 | " windows_count = 0\n",
374 | " for record in driver.find_elements(By.XPATH, '//a[contains(@data-ta, \"summary-record-title-link\")]'):\n",
375 | " if record.get_attribute(\"href\") in url_set:\n",
376 | " # coz some records have more than 1 href link \n",
377 | " continue \n",
378 | " else:\n",
379 | " url_set.add(record.get_attribute(\"href\")) \n",
380 | " time.sleep(0.5)\n",
381 | " driver.execute_script(f'window.open(\\\"{record.get_attribute(\"href\")}\\\");')\n",
382 | " windows_count += 1\n",
383 | " if windows_count >= 10 and not windows_count % 5:\n",
384 | " # Save records and close windows\n",
385 | " records_id += process_windows(driver, path, records_id)\n",
386 | " time.sleep(5)\n",
387 | " \n",
388 | " # Save records and close windows\n",
389 | " records_id += process_windows(driver, path, records_id)\n",
390 | " # Go to the next page\n",
391 | " if i_page + 1 < n_page: \n",
392 | " driver.find_element(By.XPATH, '//mat-icon[contains(@svgicon, \"arrowRight\")]').click()"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "metadata": {},
398 | "source": [
399 | "## Now all work are done! Close the driver. That's all."
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 20,
405 | "metadata": {},
406 | "outputs": [],
407 | "source": [
408 | "driver.quit()"
409 | ]
410 | }
411 | ],
412 | "metadata": {
413 | "interpreter": {
414 | "hash": "b3ba2566441a7c06988d0923437866b63cedc61552a5af99d1f4fb67d367b25f"
415 | },
416 | "kernelspec": {
417 | "display_name": "Python 3.8.8 64-bit ('base': conda)",
418 | "name": "python3"
419 | },
420 | "language_info": {
421 | "codemirror_mode": {
422 | "name": "ipython",
423 | "version": 3
424 | },
425 | "file_extension": ".py",
426 | "mimetype": "text/x-python",
427 | "name": "python",
428 | "nbconvert_exporter": "python",
429 | "pygments_lexer": "ipython3",
430 | "version": "3.8.8"
431 | },
432 | "orig_nbformat": 4
433 | },
434 | "nbformat": 4,
435 | "nbformat_minor": 2
436 | }
437 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | from crawl import *
3 | from selenium import webdriver
4 |
if __name__ == '__main__':
    ################ Set up parameters here #####################
    default_download_path = "C://Users/bigwh/Downloads" + "/savedrecs.txt"
        # The first string should be the path where your file is downloaded to by default.
        # Most likely, it should be like: "C://Users/usr_nm/Downloads"
    task_list = [  # folder_name, query
        ["results/search_1", "TI=(pFind) AND PY=(2016-2022)"],
        ["results/search_2", "TI=(Attention is All you Need)"]
    ]
        # These are the tasks to be searched
    # Selenium 4 deprecated the executable_path argument (it emits a
    # DeprecationWarning); pass the driver path via a Service object instead.
    from selenium.webdriver.chrome.service import Service
    driver = webdriver.Chrome(
        service=Service('C://Program Files//Google//Chrome//Application//chromedriver.exe')
        # This is the path where you place your chromedriver
    )
    #############################################################
    start_session(driver, task_list, default_download_path)
21 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # wos-selenium
2 |
3 | Web of Science spider implemented with selenium.
4 |
5 | This project mimics mouse clicks to query WoS, export a plain text file and download the results automatically. Since the new WoS, guarded by Pendo.io (nice company though), no longer supports plain requests and posts easily, older platforms built on scrapy may no longer work.
6 |
7 | You can download the code, set up the config in `main.py` and run `main.py` to test the script. You can also follow the code and descriptions in `demo.ipynb`. You should install selenium and set up chromedriver (or firefox driver, etc.) before running the code.
8 |
9 | The logic of this project is: For each task, insert the query and do advanced search; then export the plain text file and move it to the destination path; finally open all results in new windows and download them to the destination path.
10 |
11 | This project supports English and Simplified Chinese for the time being. For other languages, please adapt the function `switch_language_to_Eng` in `crawl.py` with the help of your browser's developer tools. You are welcome to fork this project and improve on it.
12 |
13 | jinyangl
14 |
15 | 2022.2
16 |
17 | ---
18 |
19 |
20 | 针对新版WoS的爬虫,通过模拟浏览器鼠标点击进行WoS的批量化查询、下载、处理。
21 |
22 | 使用方法:修改`main.py`中的参数,运行文件。也可以跟随`demo.ipynb`的介绍,以更详细地了解爬虫的代码(~~解决我还没解决的bug~~)。运行前确保selenium和chromedriver(或者firefox driver等)已经安装完毕。
23 |
24 | 代码的逻辑:对于每个任务,首先模拟输入query进行高级检索,然后下载纯文本文件,移动到path中,再打开所有的结果页面,下载到path中。
25 |
26 | 至今为止这大概是第一个针对新版WoS的爬虫,希望可以抛砖引玉。代码逻辑应该比较清晰,如果有其他需求可以自行修改,也欢迎与我交流(~~挖坑~~)。
27 |
--------------------------------------------------------------------------------