├── .gitignore ├── crawl.py ├── demo.html ├── demo.ipynb ├── main.py └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | -------------------------------------------------------------------------------- /crawl.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | from logging import handlers 3 | import pathlib 4 | import shutil 5 | import time 6 | import logging 7 | import os 8 | import re 9 | import tqdm 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support import wait, expected_conditions 12 | 13 | 14 | def wait_for_login(driver): 15 | '''Wait for the user to login if wos cannot be accessed directly.''' 16 | try: 17 | driver.find_element(By.XPATH, '//div[contains(@class, "shibboleth-login-form")]') 18 | input('Login before going next...\n') 19 | except: 20 | pass 21 | 22 | 23 | def switch_language_to_Eng(driver): 24 | '''Switch language from zh-cn to English.''' 25 | 26 | wait.WebDriverWait(driver, 10).until( 27 | expected_conditions.presence_of_element_located((By.XPATH, '//*[contains(@name, "search-main-box")]'))) 28 | 29 | close_pendo_windows(driver) 30 | try: 31 | driver.find_element(By.XPATH, '//*[normalize-space(text())="简体中文"]').click() 32 | driver.find_element(By.XPATH, '//button[@lang="en"]').click() 33 | except: 34 | close_pendo_windows(driver) 35 | driver.find_element(By.XPATH, '//*[normalize-space(text())="简体中文"]').click() 36 | driver.find_element(By.XPATH, '//button[@lang="en"]').click() 37 | 38 | 39 | def close_pendo_windows(driver): 40 | '''Close guiding windows''' 41 | # Cookies 42 | try: 43 | driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]').click() 44 | except: 45 | pass 46 | # "Got it" 47 | try: 48 | driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-primaryButton")]').click() 49 | except: 50 | pass 51 | # "No thanks" 52 | try: 53 | 
driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-button-secondaryButton")]').click() 54 | except: 55 | pass 56 | # What was it... I forgot... 57 | try: 58 | driver.find_element(By.XPATH, '//span[contains(@class, "_pendo-close-guide")]').click() 59 | except: 60 | pass 61 | # Overlay 62 | try: 63 | driver.find_element(By.XPATH, '//div[contains(@class, "cdk-overlay-container")]').click() 64 | except: 65 | pass 66 | # Overlay dialog 67 | try: 68 | driver.find_element(By.XPATH, '//button[contains(@class, "_pendo-close-guide")]').click() 69 | except: 70 | pass 71 | 72 | 73 | def mark_flag(path): 74 | '''Create a flag in the path to mark the task as completed.''' 75 | with open(os.path.join(path, 'completed.flag'), 'w') as f: 76 | f.write('1') 77 | 78 | 79 | def check_flag(path): 80 | '''Check the flag in the path to see if the task has already been completed.''' 81 | return os.path.exists(path) and 'completed.flag' in os.listdir(path) 82 | 83 | 84 | def search_query(driver, path, query): 85 | '''Go to advanced search page, insert query into search frame and search the query.''' 86 | if not path == None: 87 | os.makedirs(path, exist_ok=True) 88 | logging.info(path) 89 | 90 | # Close extra windows 91 | if not len(driver.window_handles) == 1: 92 | handles = driver.window_handles 93 | for i_handle in range(len(handles)-1, 0, -1): # traverse in reverse order 94 | # Switch to the window and load the page 95 | driver.switch_to.window(handles[i_handle]) 96 | driver.close() 97 | driver.switch_to.window(handles[0]) 98 | 99 | ## Search query 100 | driver.get("https://www.webofscience.com/wos/alldb/advanced-search") 101 | max_retry = 3 102 | retry_times = 0 103 | while True: 104 | try: 105 | close_pendo_windows(driver) 106 | # Load the page 107 | wait.WebDriverWait(driver, 10).until( 108 | expected_conditions.presence_of_element_located((By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]'))) 109 | 110 | # Clear the field 111 | 
driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Clear "]').click() 112 | # Insert the query 113 | driver.find_element(By.XPATH, '//*[@id="advancedSearchInputArea"]').send_keys("{}".format(query)) 114 | # Click on the search button 115 | driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Search "]').click() 116 | break 117 | except: 118 | retry_times += 1 119 | if retry_times > max_retry: 120 | logging.error("Search exceeded max retries") 121 | return False 122 | else: 123 | # Retry 124 | logging.debug("Search retrying") 125 | # Wait for the query page 126 | try: 127 | wait.WebDriverWait(driver, 5).until( 128 | expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link'))) 129 | except: 130 | try: 131 | # No results 132 | driver.find_element(By.XPATH, '//*[text()="Your search found no results"]') 133 | logging.warning(f'Your search found no results') 134 | # Mark as completed 135 | if not path == None: 136 | mark_flag(path) 137 | return False 138 | except: 139 | # Search failed 140 | driver.find_element(By.XPATH, '//div[contains(@class, "error-code")]') 141 | logging.error(driver.find_element(By.XPATH, '//div[contains(@class, "error-code")]').text) 142 | return False 143 | # Go to the next step 144 | return True 145 | 146 | 147 | def download_outbound(driver, default_download_path): 148 | '''Export the search results as outbound. The file is downloaded to default path set for the system.''' 149 | max_retry = 3 150 | retry_times = 0 151 | while True: 152 | close_pendo_windows(driver) 153 | # Not support search for more than 1000 results yet 154 | assert int(driver.find_element(By.XPATH, '//span[contains(@class, "end-page")]').text) < 1000, "Sorry, too many results!" 155 | # File should not exist on default download folder 156 | assert not os.path.exists(default_download_path), "File existed on default download folder!" 
157 | try: 158 | # Click on "Export" 159 | driver.find_element(By.XPATH, '//span[contains(@class, "mat-button-wrapper") and text()=" Export "]').click() 160 | # Click on "Plain text file" 161 | try: 162 | driver.find_element(By.XPATH, '//button[contains(@class, "mat-menu-item") and text()=" Plain text file "]').click() 163 | except: 164 | driver.find_element(By.XPATH, '//button[contains(@class, "mat-menu-item") and @aria-label="Plain text file"]').click() 165 | # Click on "Records from:" 166 | driver.find_element(By.XPATH, '//*[text()[contains(string(), "Records from:")]]').click() 167 | # Click on "Export" 168 | driver.find_element(By.XPATH, '//span[contains(@class, "ng-star-inserted") and text()="Export"]').click() 169 | # Wait for download to complete 170 | for retry_download in range(4): 171 | time.sleep(2) 172 | try: 173 | # If there is any "Internal error" 174 | wait.WebDriverWait(driver, 2).until( 175 | expected_conditions.presence_of_element_located((By.XPATH, '//div[text()="Server encountered an internal error"]'))) 176 | driver.find_element(By.XPATH, '//div[text()="Server encountered an internal error"]') 177 | driver.find_element(By.XPATH, '//*[contains(@class, "ng-star-inserted") and text()="Export"]').click() 178 | except: 179 | if os.path.exists(default_download_path): 180 | break 181 | # Download completed 182 | assert os.path.exists(default_download_path), "File not found!" 
183 | return True 184 | except: 185 | retry_times += 1 186 | if retry_times > max_retry: 187 | logging.error("Crawl outbound exceeded max retries") 188 | return False 189 | else: 190 | # Retry 191 | logging.debug("Crawl outbound retrying") 192 | close_pendo_windows(driver) 193 | # Click on "Cancel" 194 | try: 195 | driver.find_element(By.XPATH, '//*[contains(@class, "mat-button-wrapper") and text()="Cancel "]').click() 196 | except: 197 | driver.refresh() 198 | time.sleep(1) 199 | wait.WebDriverWait(driver, 10).until( 200 | expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link'))) 201 | continue 202 | 203 | 204 | def process_outbound(driver, default_download_path, dst_path): 205 | '''Process the outbound downloaded to the default path set for the system.''' 206 | 207 | # Move the outbound to dest folder 208 | assert os.path.exists(default_download_path), "File not found!" 209 | if pathlib.Path(dst_path).is_dir(): 210 | dst_path = os.path.join(dst_path, 'record.txt') 211 | shutil.move(default_download_path, dst_path) 212 | logging.debug(f'Outbound saved in {dst_path}') 213 | 214 | # Load the downloaded outbound (for debug) 215 | with open(dst_path, "r", encoding='utf-8') as f_outbound: 216 | n_record_ref = len(re.findall("\nER\n", f_outbound.read())) 217 | assert n_record_ref == int("".join(driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text.split(","))), "Records num do not match outbound num" 218 | return True 219 | 220 | 221 | def download_record(driver, path, records_id): 222 | '''Download the page to the path''' 223 | # Load the page or throw exception 224 | wait.WebDriverWait(driver, 10).until( 225 | expected_conditions.presence_of_element_located((By.XPATH, '//h2[contains(@class, "title")]'))) 226 | 227 | # Download the record 228 | with open(os.path.join(path, f'record-{records_id}.html'), 'w', encoding='utf-8') as file: 229 | file.write(driver.page_source) 230 | logging.debug(f'record #{records_id} saved in 
{path}') 231 | 232 | 233 | def process_record(driver, path, records_id): 234 | '''Parse a page to get certain statistics''' 235 | # Show all authors and save raw data 236 | try: 237 | driver.find_element(By.XPATH, '//*[text()="...More"]').click() 238 | except: 239 | pass 240 | with open(os.path.join(path, f'record-{records_id}.dat'), 'w', encoding='utf-8') as file: 241 | file.write(driver.page_source) 242 | logging.debug(f'record #{records_id} saved in {path}') 243 | 244 | 245 | def roll_down(driver, fold = 40): 246 | '''Roll down to the bottom of the page to load all results''' 247 | for i_roll in range(1, fold+1): 248 | time.sleep(0.1) 249 | driver.execute_script(f'window.scrollTo(0, {i_roll * 500});') 250 | 251 | 252 | def save_screenshot(driver, prefix, pic_path): 253 | """Screenshot and save as a png""" 254 | 255 | # paper_id + current_time 256 | current_time = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time())) 257 | driver.save_screenshot(f'{pic_path}{str(prefix)}_{current_time}.png') 258 | 259 | 260 | def process_windows(driver, path, records_id): 261 | '''Process all subpages''' 262 | handles = driver.window_handles 263 | has_error = False 264 | for i_handle in range(len(driver.window_handles)-1, 0, -1): # traverse in reverse order 265 | # Switch to the window and load the page 266 | driver.switch_to.window(handles[i_handle]) 267 | close_pendo_windows(driver) 268 | try: 269 | download_record(driver, path, records_id) 270 | process_record(driver, path, records_id) 271 | except: 272 | logging.error("Record downloading failed!") 273 | has_error = True 274 | records_id += 1 275 | driver.close() 276 | driver.switch_to.window(handles[0]) 277 | return len(handles) - 1 if not has_error else -1 278 | 279 | 280 | def process_records(driver, path): 281 | '''Open records as new subpages, download or parse subpages according to the setting.''' 282 | # init 283 | n_record = int(driver.find_element(By.XPATH, '//span[contains(@class, "brand-blue")]').text) 284 | 
n_page = (n_record + 50 - 1) // 50 285 | assert n_page < 2000, "Too many pages" 286 | logging.debug(f'{n_record} records found, divided into {n_page} pages') 287 | 288 | records_id = 0 289 | url_set = set() 290 | for i_page in range(n_page): 291 | assert len(driver.window_handles) == 1, "Unexpected windows" 292 | roll_down(driver) 293 | 294 | # Open every record in a new window 295 | windows_count = 0 296 | for record in driver.find_elements(By.XPATH, '//a[contains(@data-ta, "summary-record-title-link")]'): 297 | if record.get_attribute("href") in url_set: 298 | # coz some records have more than 1 href link 299 | continue 300 | else: 301 | url_set.add(record.get_attribute("href")) 302 | time.sleep(0.5) 303 | driver.execute_script(f'window.open(\"{record.get_attribute("href")}\");') 304 | windows_count += 1 305 | if windows_count >= 10 and not windows_count % 5: 306 | # Save records and close windows 307 | increment = process_windows(driver, path, records_id) 308 | if increment != -1: 309 | records_id += increment 310 | else: 311 | return False 312 | time.sleep(5) 313 | 314 | # Save records and close windows 315 | increment = process_windows(driver, path, records_id) 316 | if increment != -1: 317 | records_id += increment 318 | else: 319 | return False 320 | # Go to the next page 321 | if i_page + 1 < n_page: 322 | driver.find_element(By.XPATH, '//mat-icon[contains(@svgicon, "arrowRight")]').click() 323 | return True 324 | 325 | 326 | def start_session(driver, task_list, default_download_path): 327 | ''' 328 | Start the search of all tasks. 
329 | driver: the handle of a selenium.webdriver object 330 | task_list: the zip of save paths and advanced query strings 331 | default_download_path: the default path set for the system, for example, C://Downloads/ 332 | ''' 333 | 334 | # Init 335 | os.makedirs('logs', exist_ok=True) 336 | logging.basicConfig(level=logging.INFO, 337 | filename=os.getcwd() + '/logs/log' + time.strftime('%Y%m%d%H%M', 338 | time.localtime(time.time())) + '.log', 339 | filemode="w", 340 | format="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s" 341 | ) 342 | 343 | if not default_download_path.endswith("/savedrecs.txt"): 344 | default_download_path += "/savedrecs.txt" 345 | driver.get("https://www.webofscience.com/") 346 | wait_for_login(driver) 347 | # switch_language_to_Eng(driver) 348 | 349 | # Start Query 350 | for path, query in tqdm.tqdm(task_list): 351 | if not path == None and check_flag(path): continue 352 | 353 | # Search query 354 | if not search_query(driver, path, query): 355 | # Stop if download failed for some reason 356 | continue 357 | 358 | # Download the outbound 359 | if not download_outbound(driver, default_download_path): 360 | continue 361 | 362 | # Deal with the outbound 363 | if not process_outbound(driver, default_download_path, path): 364 | continue 365 | 366 | # Deal with records 367 | if not process_records(driver, path): 368 | continue 369 | 370 | # Search completed 371 | if not path == None: 372 | mark_flag(path) 373 | 374 | driver.quit() 375 | -------------------------------------------------------------------------------- /demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## If you want to follow the procedure by functions, you can run the code below by sequence..." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Load packages" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 4, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# encoding: utf-8\n", 24 | "from selenium import webdriver\n", 25 | "import shutil\n", 26 | "import time\n", 27 | "import logging\n", 28 | "import os\n", 29 | "import re\n", 30 | "import tqdm\n", 31 | "from selenium.webdriver.common.by import By\n", 32 | "from selenium.webdriver.support import wait, expected_conditions" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Set up param" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 6, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stderr", 49 | "output_type": "stream", 50 | "text": [ 51 | ":8: DeprecationWarning: executable_path has been deprecated, please pass in a Service object\n", 52 | " driver = webdriver.Chrome(\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "\n", 58 | "################ Set up parameters here #####################\n", 59 | "default_download_path = \"D:/Downloads/\"\n", 60 | " # The first string should be the path where your file is downloaded to by default.\n", 61 | " # Most likely, it should be like: \"C://Users/usr_name/Downloads\"\n", 62 | "path = \"results/search_1\"\n", 63 | "query = \"TI=(pFind) AND PY=(2016-2022)\"\n", 64 | " # These are the task to be searched\n", 65 | "driver = webdriver.Chrome(\n", 66 | " executable_path='C://Program Files//Google//Chrome//Application//chromedriver.exe'\n", 67 | " # This is the path where you place your chromedriver\n", 68 | ")\n", 69 | "#############################################################\n", 70 | "\n", 71 | "if not default_download_path.endswith(\"/savedrecs.txt\"):\n", 72 | " default_download_path += \"/savedrecs.txt\"\n", 73 | "driver.get(\"https://www.webofscience.com/\")" 74 | ] 75 | }, 76 | { 77 | 
"cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Login in if your ip cannot visit WoS" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# Login\n", 90 | "try:\n", 91 | " driver.find_element(By.XPATH, '//div[contains(@class, \"shibboleth-login-form\")]')\n", 92 | " input('Login before going next...\\n')\n", 93 | "except:\n", 94 | " pass" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Close the annoying pop windows" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 8, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "def close_pendo_windows(driver):\n", 111 | " '''Close guiding windows'''\n", 112 | " # Cookies\n", 113 | " try:\n", 114 | " driver.find_element(By.XPATH, '//*[@id=\"onetrust-accept-btn-handler\"]').click()\n", 115 | " except:\n", 116 | " pass\n", 117 | " # \"Got it\"\n", 118 | " try:\n", 119 | " driver.find_element(By.XPATH, '//button[contains(@class, \"_pendo-button-primaryButton\")]').click()\n", 120 | " except:\n", 121 | " pass\n", 122 | " # \"No thanks\"\n", 123 | " try:\n", 124 | " driver.find_element(By.XPATH, '//button[contains(@class, \"_pendo-button-secondaryButton\")]').click()\n", 125 | " except:\n", 126 | " pass\n", 127 | " # What was it... 
I forgot...\n", 128 | " try:\n", 129 | " driver.find_element(By.XPATH, '//span[contains(@class, \"_pendo-close-guide\")]').click()\n", 130 | " except:\n", 131 | " pass\n", 132 | " # Overlay\n", 133 | " try:\n", 134 | " driver.find_element(By.XPATH, '//div[contains(@class, \"cdk-overlay-container\")]').click()\n", 135 | " except:\n", 136 | " pass " 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Switch language to English" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 10, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "\n", 153 | " wait.WebDriverWait(driver, 10).until(\n", 154 | " expected_conditions.presence_of_element_located((By.XPATH, '//*[contains(@name, \"search-main-box\")]')))\n", 155 | "\n", 156 | " close_pendo_windows(driver)\n", 157 | " try:\n", 158 | " driver.find_element(By.XPATH, '//*[normalize-space(text())=\"简体中文\"]').click()\n", 159 | " driver.find_element(By.XPATH, '//button[@lang=\"en\"]').click()\n", 160 | " except:\n", 161 | " close_pendo_windows(driver)\n", 162 | " driver.find_element(By.XPATH, '//*[normalize-space(text())=\"简体中文\"]').click()\n", 163 | " driver.find_element(By.XPATH, '//button[@lang=\"en\"]').click()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## Now open the search page and insert the query!\n", 171 | "ps: If this block does not run successfully, run it again." 
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 11, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "" 183 | ] 184 | }, 185 | "execution_count": 11, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "close_pendo_windows(driver)\n", 192 | "os.makedirs(path, exist_ok=True)\n", 193 | "\n", 194 | "driver.get(\"https://www.webofscience.com/wos/alldb/advanced-search\")\n", 195 | "\n", 196 | "close_pendo_windows(driver)\n", 197 | "# Load the page\n", 198 | "wait.WebDriverWait(driver, 10).until(\n", 199 | " expected_conditions.presence_of_element_located((By.XPATH, '//span[contains(@class, \"mat-button-wrapper\") and text()=\" Clear \"]')))\n", 200 | "\n", 201 | "# Clear the field\n", 202 | "driver.find_element(By.XPATH, '//span[contains(@class, \"mat-button-wrapper\") and text()=\" Clear \"]').click()\n", 203 | "# Insert the query\n", 204 | "driver.find_element(By.XPATH, '//*[@id=\"advancedSearchInputArea\"]').send_keys(\"{}\".format(query))\n", 205 | "# Click on the search button\n", 206 | "driver.find_element(By.XPATH, '//span[contains(@class, \"mat-button-wrapper\") and text()=\" Search \"]').click()\n", 207 | "\n", 208 | "# Wait for the query page\n", 209 | "wait.WebDriverWait(driver, 5).until(\n", 210 | " expected_conditions.presence_of_element_located((By.CLASS_NAME, 'title-link')))\n" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "## Export the search results as outbound\n", 218 | "ps: If this block does not run successfully, run it again." 
219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 16, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "close_pendo_windows(driver)\n", 228 | "try:\n", 229 | " driver.find_element(By.XPATH, '//*[contains(@class, \"mat-button-wrapper\") and text()=\"Cancel \"]').click()\n", 230 | "except:\n", 231 | " pass\n", 232 | "# Not support search for more than 1000 results yet\n", 233 | "assert int(driver.find_element(By.XPATH, '//span[contains(@class, \"end-page\")]').text) < 1000, \"Sorry, too many results!\"\n", 234 | "# File should not exist on default download folder\n", 235 | "assert not os.path.exists(default_download_path), \"File existed on default download folder!\"\n", 236 | "\n", 237 | "# Click on \"Export\" \n", 238 | "driver.find_element(By.XPATH, '//span[contains(@class, \"mat-button-wrapper\") and text()=\" Export \"]').click()\n", 239 | "time.sleep(0.5)\n", 240 | "# Click on \"Plain text file\" \n", 241 | "try:\n", 242 | " driver.find_element(By.XPATH, '//button[contains(@class, \"mat-menu-item\") and text()=\" Plain text file \"]').click()\n", 243 | "except:\n", 244 | " driver.find_element(By.XPATH, '//button[contains(@class, \"mat-menu-item\") and @aria-label=\"Plain text file\"]').click()\n", 245 | "# Click on \"Records from:\"\n", 246 | "driver.find_element(By.XPATH, '//*[text()[contains(string(), \"Records from:\")]]').click()\n", 247 | "# Click on \"Export\"\n", 248 | "driver.find_element(By.XPATH, '//span[contains(@class, \"ng-star-inserted\") and text()=\"Export\"]').click()\n", 249 | "# Wait for download to complete\n", 250 | "for retry_download in range(4):\n", 251 | " time.sleep(1)\n", 252 | " try:\n", 253 | " # If there is any \"Internal error\"\n", 254 | " wait.WebDriverWait(driver, 2).until(\n", 255 | " expected_conditions.presence_of_element_located((By.XPATH, '//div[text()=\"Server encountered an internal error\"]'))) \n", 256 | " driver.find_element(By.XPATH, '//div[text()=\"Server encountered an 
internal error\"]')\n", 257 | " driver.find_element(By.XPATH, '//*[contains(@class, \"ng-star-inserted\") and text()=\"Export\"]').click()\n", 258 | " except:\n", 259 | " if os.path.exists(default_download_path):\n", 260 | " break\n", 261 | "# Download completed\n", 262 | "assert os.path.exists(default_download_path), \"File not found! Run this block again.\"" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Move the outbound to dest folder. Now there should be a file at results/search_1/record.txt" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 17, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "shutil.move(default_download_path, os.path.join(path, 'record.txt'))\n", 279 | "logging.debug(f'Outbound saved in {path}')\n", 280 | "# Load the downloaded outbound (for debug)\n", 281 | "with open(os.path.join(path, 'record.txt'), \"r\", encoding='utf-8') as f_outbound:\n", 282 | " n_record_ref = len(re.findall(\"\\nER\\n\", f_outbound.read()))\n", 283 | " assert n_record_ref == int(\"\".join(driver.find_element(By.XPATH, '//span[contains(@class, \"brand-blue\")]').text.split(\",\"))), \"Records num do not match outbound num\"" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "## Below are some functions required for the next block" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 18, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "\n", 300 | "def download_record(driver, path, records_id):\n", 301 | " '''Download a page'''\n", 302 | " # Load the page or throw exception\n", 303 | " wait.WebDriverWait(driver, 10).until(\n", 304 | " expected_conditions.presence_of_element_located((By.XPATH, '//h2[contains(@class, \"title\")]')))\n", 305 | "\n", 306 | " # Download the record\n", 307 | " with open(os.path.join(path, f'record-{records_id}.html'), 'w', encoding='utf-8') as 
file:\n", 308 | " file.write(driver.page_source)\n", 309 | " logging.debug(f'record #{records_id} saved in {path}')\n", 310 | "\n", 311 | "def process_record(driver, path, records_id):\n", 312 | " '''Parse a page'''\n", 313 | " # Show all authors and save raw data\n", 314 | " try:\n", 315 | " driver.find_element(By.XPATH, '//button[text()=\"...More\"]').click() \n", 316 | " except:\n", 317 | " pass\n", 318 | " with open(os.path.join(path, f'record-{records_id}.dat'), 'w', encoding='utf-8') as file:\n", 319 | " file.write(driver.page_source)\n", 320 | " logging.debug(f'record #{records_id} saved in {path}') \n", 321 | "\n", 322 | "\n", 323 | "def process_windows(driver, path, records_id):\n", 324 | " '''Process all subpages'''\n", 325 | " handles = driver.window_handles\n", 326 | " has_error = False\n", 327 | " for i_handle in range(len(driver.window_handles)-1, 0, -1): # traverse in reverse order\n", 328 | " # Switch to the window and load the page\n", 329 | " driver.switch_to.window(handles[i_handle])\n", 330 | " close_pendo_windows(driver)\n", 331 | " try:\n", 332 | " download_record(driver, path, records_id)\n", 333 | " process_record(driver, path, records_id)\n", 334 | " except:\n", 335 | " logging.error(\"Record downloading failed!\")\n", 336 | " has_error = True\n", 337 | " records_id += 1\n", 338 | " driver.close()\n", 339 | " driver.switch_to.window(handles[0])\n", 340 | " return len(handles) - 1 if not has_error else -1" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "## What are going to do next: click on all the results in new windows, switch to the window and save the page!" 
348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 19, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "# Deal with records\n", 357 | "# init\n", 358 | "n_record = int(driver.find_element(By.XPATH, '//span[contains(@class, \"brand-blue\")]').text)\n", 359 | "n_page = (n_record + 50 - 1) // 50\n", 360 | "assert n_page < 2000, \"Too many pages\"\n", 361 | "logging.debug(f'{n_record} records found, divided into {n_page} pages')\n", 362 | "\n", 363 | "records_id = 0\n", 364 | "url_set = set()\n", 365 | "for i_page in range(n_page):\n", 366 | " assert len(driver.window_handles) == 1, \"Unexpected windows\"\n", 367 | " # Roll down to the bottom of the page to show all results\n", 368 | " for i_roll in range(1, 41):\n", 369 | " time.sleep(0.1)\n", 370 | " driver.execute_script(f'window.scrollTo(0, {i_roll * 500});') \n", 371 | " \n", 372 | " # Open every record in a new window\n", 373 | " windows_count = 0\n", 374 | " for record in driver.find_elements(By.XPATH, '//a[contains(@data-ta, \"summary-record-title-link\")]'):\n", 375 | " if record.get_attribute(\"href\") in url_set:\n", 376 | " # coz some records have more than 1 href link \n", 377 | " continue \n", 378 | " else:\n", 379 | " url_set.add(record.get_attribute(\"href\")) \n", 380 | " time.sleep(0.5)\n", 381 | " driver.execute_script(f'window.open(\\\"{record.get_attribute(\"href\")}\\\");')\n", 382 | " windows_count += 1\n", 383 | " if windows_count >= 10 and not windows_count % 5:\n", 384 | " # Save records and close windows\n", 385 | " records_id += process_windows(driver, path, records_id)\n", 386 | " time.sleep(5)\n", 387 | " \n", 388 | " # Save records and close windows\n", 389 | " records_id += process_windows(driver, path, records_id)\n", 390 | " # Go to the next page\n", 391 | " if i_page + 1 < n_page: \n", 392 | " driver.find_element(By.XPATH, '//mat-icon[contains(@svgicon, \"arrowRight\")]').click()" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 
| "metadata": {}, 398 | "source": [ 399 | "## Now all work are done! Close the driver. That's all." 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 20, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "driver.quit()" 409 | ] 410 | } 411 | ], 412 | "metadata": { 413 | "interpreter": { 414 | "hash": "b3ba2566441a7c06988d0923437866b63cedc61552a5af99d1f4fb67d367b25f" 415 | }, 416 | "kernelspec": { 417 | "display_name": "Python 3.8.8 64-bit ('base': conda)", 418 | "name": "python3" 419 | }, 420 | "language_info": { 421 | "codemirror_mode": { 422 | "name": "ipython", 423 | "version": 3 424 | }, 425 | "file_extension": ".py", 426 | "mimetype": "text/x-python", 427 | "name": "python", 428 | "nbconvert_exporter": "python", 429 | "pygments_lexer": "ipython3", 430 | "version": "3.8.8" 431 | }, 432 | "orig_nbformat": 4 433 | }, 434 | "nbformat": 4, 435 | "nbformat_minor": 2 436 | } 437 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | from crawl import * 3 | from selenium import webdriver 4 | 5 | if __name__ == '__main__': 6 | ################ Set up parameters here ##################### 7 | default_download_path = "C://Users/bigwh/Downloads" + "/savedrecs.txt" 8 | # The first string should be the path where your file is downloaded to by default. 
9 | # Most likely, it should be like: "C://Users/usr_nm/Downloads" 10 | task_list = [ # folder_name, query 11 | ["results/search_1", "TI=(pFind) AND PY=(2016-2022)"], 12 | ["results/search_2", "TI=(Attention is All you Need)"] 13 | ] 14 | # These are the tasks to be searched 15 | driver = webdriver.Chrome( 16 | executable_path='C://Program Files//Google//Chrome//Application//chromedriver.exe' 17 | # This is the path where you place your chromedriver 18 | ) 19 | ############################################################# 20 | start_session(driver, task_list, default_download_path) 21 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # wos-selenium 2 | 3 | Web of Science spider implemented with selenium. 4 | 5 | This project mimics mouse clicks to query WoS, export plain text files and download results automatically. Since the new WoS, with help from Pendo.io (nice company though), no longer supports simple requests and posts, older platforms built on scrapy can no longer be used. 6 | 7 | You can download the code, set up the config in `main.py` and run `main.py` to test the script. You can also follow the code and descriptions in `demo.ipynb`. You should install selenium and set up chromedriver (or firefox driver, etc.) before running the code. 8 | 9 | The logic of this project is: For each task, insert the query and do an advanced search; then export the plain text file and move it to the destination path; finally open all results in new windows and download them to the destination path. 10 | 11 | This project supports English and Simplified Chinese for the time being. For other languages, please change the function in `crawl.py/switch_language_to_Eng` with the help of the development tools in your browser. You are welcome to fork this project and make improvements on it. 
12 | 13 | jinyangl 14 | 15 | 2022.2 16 | 17 | --- 18 | 19 | 20 | 针对新版WoS的爬虫,通过模拟浏览器鼠标点击进行WoS的批量化查询、下载、处理。 21 | 22 | 使用方法:修改`main.py`中的参数,运行文件。也可以跟随`demo.ipynb`的介绍,以更详细地了解爬虫的代码(~~解决我还没解决的bug~~)。运行前确保selenium和chromedriver(或者firefox driver等)已经安装完毕。 23 | 24 | 代码的逻辑:对于每个任务,首先模拟输入query进行高级检索,然后下载纯文本文件,移动到path中,再打开所有的结果页面,下载到path中。 25 | 26 | 至今为止这大概是第一个针对新版WoS的爬虫,希望可以抛砖引玉。代码逻辑应该比较清晰,如果有其他需求可以自行修改,也欢迎与我交流(~~挖坑~~)。 27 | --------------------------------------------------------------------------------