├── LICENSE
├── README.md
├── crawled.json
├── crawler.py
└── start.txt

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Mohammad Sadegh Salimi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ResearchGate Crawler
A Python script for crawling papers on ResearchGate.net.

## About the script
The script starts crawling from the URLs listed in ```start.txt``` and writes the details of each crawled paper to ```crawled.json```.

## Requirements
First install Python, then install these libraries:
```
pip install selenium
pip install webdriver-manager
```

## Parameters
```MAX_FETCH_COUNT```: the maximum number of pages to crawl.

```MAX_CACHED_NUM```: results are flushed to ```crawled.json``` after every ```MAX_CACHED_NUM``` crawled papers.
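
## Output
Each entry in ```crawled.json``` is a JSON object with the fields filled in by ```PageData.save_to_json``` in ```crawler.py```: ```id```, ```title```, ```abstract```, ```date```, ```authors```, and ```references``` (the IDs of up to ten linked papers). As a rough sketch, the results of a run can be inspected like this:
```
import json

with open("crawled.json") as f:
    pages = json.load(f)

# Each record carries the fields written by PageData.save_to_json.
for page in pages:
    print(page["id"], page["date"], page["title"])
```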
--------------------------------------------------------------------------------
/crawled.json:
--------------------------------------------------------------------------------
[]
--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
# This code was written by @SMSadegh19.

import time
import json

from webdriver_manager.chrome import ChromeDriverManager

###### selenium imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
######

# IDs of publications that have already been crawled and saved.
crawled_ids = set()
# IDs of publications that have been seen (queued or already crawled).
page_ids = set()
# Queue of PageData objects waiting to be fetched.
pages_queue = []
MAX_FETCH_COUNT = 2000
MAX_CACHED_NUM = 10
BASE_URL = "https://www.researchgate.net/publication/"
DRIVER_PATH = ChromeDriverManager().install()

# Page records kept in memory until they are flushed to crawled.json.
cached_pages = []


def flush_cache():
    # Append the cached page records to crawled.json and clear the in-memory cache.
    global cached_pages
    if len(cached_pages) == 0:
        return
    with open("crawled.json") as f:
        crawled_pages = json.load(f)
    crawled_pages.extend(cached_pages)
    cached_pages = []
    with open("crawled.json", "w") as f:
        json.dump(crawled_pages, f, indent=4)


class PageData:
    def __init__(self, id, url):
        self.url = url
        self.id = id
        self.title = ""
        self.abstract = ""
        self.date = ""
        self.authors = []
        self.references = []

    def save_to_json(self):
        # Cache this page's record and flush to disk once MAX_CACHED_NUM have accumulated.
        new_obj = {'id': self.id, 'title': self.title, 'abstract': self.abstract, 'date': self.date, 'authors': self.authors, 'references': self.references}
        cached_pages.append(new_obj)
        if len(cached_pages) >= MAX_CACHED_NUM:
            flush_cache()

    def fetch_page(self):
        driver = None
        try:
            options = Options()
            options.add_argument("--headless")
            driver = webdriver.Chrome(service=Service(DRIVER_PATH), options=options)
            driver.get(self.url)
            print(self.url)
            time.sleep(3)
            self.url = driver.current_url

            ### get title
            self.title = driver.find_element(By.CSS_SELECTOR, '#lite-page > main > section > section.research-detail-header-section > div > div > h1').text
            print(self.title)

            ### get abstract
            try:
                self.abstract = driver.find_element(By.XPATH, '//*[@id="lite-page"]/main/section/div[1]/div[1]/div/div[2]/div').text
            except Exception:
                self.abstract = ""
            print(self.abstract)

            ### get date
            elem = driver.find_element(By.CSS_SELECTOR, '#lite-page > main > section > section.research-detail-header-section > div > div > div.research-detail-header-section__metadata > div:nth-child(1) > ul > li')
            self.date = elem.text.split()[1]
            print(self.date)

            ### get authors
            # Click "Show all authors" when the link is present so every author is captured.
            try:
                elem = driver.find_element(By.XPATH, '//*[@id="lite-page"]/main/section/section[1]/div/div/div[4]/div/span[1]/a')
                if "Show" in elem.text:
                    elem.click()
            except Exception:
                pass
            # get the authors' names
            elems = driver.find_elements(By.XPATH, '//*[@id="lite-page"]/main/section/section[1]/div/div/div[3]/div')
            for elem in elems:
                self.authors.append(elem.text.split('\n')[0])
            print(self.authors)

            ### get references
            # Use the references list when available; otherwise fall back to the citations list.
            elems = driver.find_elements(By.CSS_SELECTOR, '#references > div > div > div > div > div > div > div > div > div > div > div:nth-child(1) > div > a')
            if len(elems) == 0:
                elems = driver.find_elements(By.CSS_SELECTOR, '#citations > div > div > div > div > div > div > div > div > div > div > div:nth-child(1) > div > a')
            print(len(elems))
            # Keep at most the first ten linked papers.
            for elem in elems[:10]:
                ref_url = elem.get_attribute('href')
                self.references.append(get_id_from_url(ref_url))
            print(self.references)

            for ref_id in self.references:
                add_page_to_queue(ref_id)

            self.save_to_json()
            crawled_ids.add(self.id)
        except Exception as err:
            print(err)
            print("Error during fetch:", " id = ", self.id, " url = ", self.url)
        finally:
            # The driver may never have been created if Chrome failed to start.
            if driver is not None:
                driver.quit()


def get_id_from_url(url):
    # The publication ID is the number right after "publication/" in the URL.
    start = url.index("publication/") + len("publication/")
    return url[start:].split("_")[0]


def read_crawled_file():
    # Load previously crawled pages so they are not fetched again, and re-queue their references.
    global crawled_ids, page_ids
    with open("crawled.json") as f:
        crawled_pages = json.load(f)
    for x in crawled_pages:
        page_id, page_references = x['id'], x['references']
        for ref_id in page_references:
            add_page_to_queue(ref_id)
        crawled_ids.add(page_id)
        page_ids.add(page_id)


def read_start_file():
    # Seed the queue with the URLs listed in start.txt.
    with open("start.txt", "r") as f:
        urls = f.readlines()
    for x in urls:
        url = x.strip()
        pid = get_id_from_url(url)
        add_page_to_queue(pid)


def add_page_to_queue(pid):
    if (pid not in page_ids) and (pid not in crawled_ids):
        pages_queue.append(PageData(pid, BASE_URL + pid))
        page_ids.add(pid)


def get_a_page_to_fetch():
    global pages_queue, crawled_ids
    while len(pages_queue) > 0:
        page = pages_queue.pop()
        if page.id not in crawled_ids:
            return page
    return None


def fetch_pages():
    global crawled_ids
    while len(crawled_ids) < MAX_FETCH_COUNT:
        print("***************** Here we are at crawling ", len(crawled_ids))
        page = get_a_page_to_fetch()
        if page is None:
            print("Queue is empty. There is no more page to crawl. :D")
            break
        page.fetch_page()


read_crawled_file()
read_start_file()

fetch_pages()
# Write out any records still cached in memory so the last few results are not lost.
flush_cache()
--------------------------------------------------------------------------------
/start.txt:
--------------------------------------------------------------------------------
https://www.researchgate.net/publication/317558625_Attention_Is_All_You_Need
https://www.researchgate.net/publication/286512696_Deep_Residual_Learning_for_Image_Recognition
https://www.researchgate.net/publication/333444574_EfficientNet_Rethinking_Model_Scaling_for_Convolutional_Neural_Networks
https://www.researchgate.net/publication/352015995_SegFormer_Simple_and_Efficient_Design_for_Semantic_Segmentation_with_Transformers
https://www.researchgate.net/publication/354891122_PASS_An_ImageNet_replacement_for_self-supervised_pretraining_without_humans
https://www.researchgate.net/publication/328230984_BERT_Pre-training_of_Deep_Bidirectional_Transformers_for_Language_Understanding
https://www.researchgate.net/publication/344828174_An_Image_is_Worth_16x16_Words_Transformers_for_Image_Recognition_at_Scale
--------------------------------------------------------------------------------