├── LICENSE
├── README.md
├── crawled.json
├── crawler.py
└── start.txt

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Mohammad Sadegh Salimi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ResearchGate Crawler
A Python script for crawling papers on ResearchGate.net.

## About the script
The script starts crawling from the URLs listed in ```start.txt``` and writes the details of each crawled paper to ```crawled.json```.

## Requirements
First install Python, then install these libraries:
```
pip install selenium
pip install webdriver-manager
```

## Parameters
```MAX_FETCH_COUNT```: the maximum number of pages to crawl.

```MAX_CACHED_NUM```: results are flushed to ```crawled.json``` after every ```MAX_CACHED_NUM``` crawled papers.
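
## Output
Each entry in ```crawled.json``` is a JSON object with the fields filled in by ```PageData.save_to_json``` in ```crawler.py```: ```id```, ```title```, ```abstract```, ```date```, ```authors```, and ```references``` (the IDs of up to ten linked papers). As a rough sketch, the results of a run can be inspected like this:
```
import json

with open("crawled.json") as f:
    pages = json.load(f)

# Each record carries the fields written by PageData.save_to_json.
for page in pages:
    print(page["id"], page["date"], page["title"])
```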
--------------------------------------------------------------------------------
/crawled.json:
--------------------------------------------------------------------------------
[]
--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
# This code was written by @SMSadegh19.

import time
import json

from webdriver_manager.chrome import ChromeDriverManager

###### selenium imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
######

# IDs of publications that have already been crawled and saved.
crawled_ids = set()
# IDs of publications that have been seen (queued or already crawled).
page_ids = set()
# Queue of PageData objects waiting to be fetched.
pages_queue = []
MAX_FETCH_COUNT = 2000
MAX_CACHED_NUM = 10
BASE_URL = "https://www.researchgate.net/publication/"
DRIVER_PATH = ChromeDriverManager().install()

# Page records kept in memory until they are flushed to crawled.json.
cached_pages = []


def flush_cache():
    # Append the cached page records to crawled.json and clear the in-memory cache.
    global cached_pages
    if len(cached_pages) == 0:
        return
    with open("crawled.json") as f:
        crawled_pages = json.load(f)
    crawled_pages.extend(cached_pages)
    cached_pages = []
    with open("crawled.json", "w") as f:
        json.dump(crawled_pages, f, indent=4)


class PageData:
    def __init__(self, id, url):
        self.url = url
        self.id = id
        self.title = ""
        self.abstract = ""
        self.date = ""
        self.authors = []
        self.references = []

    def save_to_json(self):
        # Cache this page's record and flush to disk once MAX_CACHED_NUM have accumulated.
        new_obj = {'id': self.id, 'title': self.title, 'abstract': self.abstract, 'date': self.date, 'authors': self.authors, 'references': self.references}
        cached_pages.append(new_obj)
        if len(cached_pages) >= MAX_CACHED_NUM:
            flush_cache()

    def fetch_page(self):
        driver = None
        try:
            options = Options()
            options.add_argument("--headless")
            driver = webdriver.Chrome(service=Service(DRIVER_PATH), options=options)
            driver.get(self.url)
            print(self.url)
            time.sleep(3)
            self.url = driver.current_url

            ### get title
            self.title = driver.find_element(By.CSS_SELECTOR, '#lite-page > main > section > section.research-detail-header-section > div > div > h1').text
            print(self.title)

            ### get abstract
            try:
                self.abstract = driver.find_element(By.XPATH, '//*[@id="lite-page"]/main/section/div[1]/div[1]/div/div[2]/div').text
            except Exception:
                self.abstract = ""
            print(self.abstract)

            ### get date
            elem = driver.find_element(By.CSS_SELECTOR, '#lite-page > main > section > section.research-detail-header-section > div > div > div.research-detail-header-section__metadata > div:nth-child(1) > ul > li')
            self.date = elem.text.split()[1]
            print(self.date)

            ### get authors
            # Click "Show all authors" when the link is present so every author is captured.
            try:
                elem = driver.find_element(By.XPATH, '//*[@id="lite-page"]/main/section/section[1]/div/div/div[4]/div/span[1]/a')
                if "Show" in elem.text:
                    elem.click()
            except Exception:
                pass
            # get the authors' names
            elems = driver.find_elements(By.XPATH, '//*[@id="lite-page"]/main/section/section[1]/div/div/div[3]/div')
            for elem in elems:
                self.authors.append(elem.text.split('\n')[0])
            print(self.authors)

            ### get references
            # Use the references list when available; otherwise fall back to the citations list.
            elems = driver.find_elements(By.CSS_SELECTOR, '#references > div > div > div > div > div > div > div > div > div > div > div:nth-child(1) > div > a')
            if len(elems) == 0:
                elems = driver.find_elements(By.CSS_SELECTOR, '#citations > div > div > div > div > div > div > div > div > div > div > div:nth-child(1) > div > a')
            print(len(elems))
            # Keep at most the first ten linked papers.
            for elem in elems[:10]:
                ref_url = elem.get_attribute('href')
                self.references.append(get_id_from_url(ref_url))
            print(self.references)

            for ref_id in self.references:
                add_page_to_queue(ref_id)

            self.save_to_json()
            crawled_ids.add(self.id)
        except Exception as err:
            print(err)
            print("Error during fetch:", " id = ", self.id, " url = ", self.url)
        finally:
            # The driver may never have been created if Chrome failed to start.
            if driver is not None:
                driver.quit()


def get_id_from_url(url):
    # The publication ID is the number right after "publication/" in the URL.
    start = url.index("publication/") + len("publication/")
    return url[start:].split("_")[0]


def read_crawled_file():
    # Load previously crawled pages so they are not fetched again, and re-queue their references.
    global crawled_ids, page_ids
    with open("crawled.json") as f:
        crawled_pages = json.load(f)
    for x in crawled_pages:
        page_id, page_references = x['id'], x['references']
        for ref_id in page_references:
            add_page_to_queue(ref_id)
        crawled_ids.add(page_id)
        page_ids.add(page_id)


def read_start_file():
    # Seed the queue with the URLs listed in start.txt.
    with open("start.txt", "r") as f:
        urls = f.readlines()
    for x in urls:
        url = x.strip()
        pid = get_id_from_url(url)
        add_page_to_queue(pid)


def add_page_to_queue(pid):
    if (pid not in page_ids) and (pid not in crawled_ids):
        pages_queue.append(PageData(pid, BASE_URL + pid))
        page_ids.add(pid)


def get_a_page_to_fetch():
    global pages_queue, crawled_ids
    while len(pages_queue) > 0:
        page = pages_queue.pop()
        if page.id not in crawled_ids:
            return page
    return None


def fetch_pages():
    global crawled_ids
    while len(crawled_ids) < MAX_FETCH_COUNT:
        print("***************** Here we are at crawling ", len(crawled_ids))
        page = get_a_page_to_fetch()
        if page is None:
            print("Queue is empty. There is no more page to crawl. :D")
            break
        page.fetch_page()


read_crawled_file()
read_start_file()

fetch_pages()
# Write out any records still cached in memory so the last few results are not lost.
flush_cache()
--------------------------------------------------------------------------------
/start.txt:
--------------------------------------------------------------------------------
https://www.researchgate.net/publication/317558625_Attention_Is_All_You_Need
https://www.researchgate.net/publication/286512696_Deep_Residual_Learning_for_Image_Recognition
https://www.researchgate.net/publication/333444574_EfficientNet_Rethinking_Model_Scaling_for_Convolutional_Neural_Networks
https://www.researchgate.net/publication/352015995_SegFormer_Simple_and_Efficient_Design_for_Semantic_Segmentation_with_Transformers
https://www.researchgate.net/publication/354891122_PASS_An_ImageNet_replacement_for_self-supervised_pretraining_without_humans
https://www.researchgate.net/publication/328230984_BERT_Pre-training_of_Deep_Bidirectional_Transformers_for_Language_Understanding
https://www.researchgate.net/publication/344828174_An_Image_is_Worth_16x16_Words_Transformers_for_Image_Recognition_at_Scale
--------------------------------------------------------------------------------