├── .github └── workflows │ └── python-app.yml ├── .gitignore ├── .idea ├── .gitignore ├── Literature Review.iml ├── git_toolbox_prj.xml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── other.xml └── vcs.xml ├── README.md ├── config.json ├── demo ├── acm adv search string.jpg ├── ieee adv search string.jpg ├── science direct adv search string.jpg └── science direct adv search.jpg ├── main.py ├── requirements.txt ├── src ├── __init__.py ├── acm.py ├── ieee.py ├── scidirect.py └── utils.py └── tests ├── __init__.py ├── bad.json ├── empty.json └── validate_config.py /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | pip install pytest pytest-cov 40 | pytest ./tests/validate_config.py 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project exclude paths 2 | /data/ -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/Literature Review.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 19 | -------------------------------------------------------------------------------- /.idea/git_toolbox_prj.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 14 | 15 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 85 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LiteratureReview 2 | 3 | A scraper for various science databases; supported databases are IEEE Xplore, Science Direct and 4 | ACM. These scraping bots retrieve a link to each search result (i.e. each paper), its title and some 5 | other metadata such as keywords, abstract and type of paper (conference, journal etc.), which 6 | helps make the systematic literature review process easier. 7 | 8 | _*If you find this work useful, put a star on this repo ⭐*_ 9 | 10 | # Prerequisites 11 | 12 | - python 3.9 or higher 13 | - Chrome browser 14 | - Chrome web driver that matches your Chrome version. Download it from [here](https://chromedriver.chromium.org/downloads/) 15 | 16 | # How to use 17 | 18 | 1) go to the official site (advanced search page), create a search query using their form, 
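   For reference, the query text is simply the string that the advanced search form generates. The examples below are illustrative, modeled on the search terms in this repository's `config.json`; use whatever fields and keywords suit your own review (the screenshots below show the corresponding advanced search forms):

   ```text
   ACM:            AllField:(video processing) AND Title:(sign language detection) AND AllField:(sign language)
   IEEE Xplore:    video processing, sign language detection
   Science Direct: video processing; sign language detection
   ```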

   Science Direct (screenshot: demo/science direct adv search string.jpg)

   IEEE Xplore (screenshot: demo/ieee adv search string.jpg)

   ACM (screenshot: demo/acm adv search string.jpg)
26 | 2) copy that query text and use it to configure the tool 27 | 3) clone the repo (creating a virtual environment is the recommended way) and complete the configuration; 28 | you can configure a single bot, or all the bots at once, in one configuration file. 29 | 30 | ```shell 31 | git clone https://github.com/ashen007/LiteratureReview.git 32 | ``` 33 | - all bots with a single configuration 34 | 35 | ```json 36 | { 37 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 38 | "EXECUTABLE_PATH": "D:\\chromedriver.exe", 39 | "SCIDIR": { 40 | "search_term": "insert query string here", 41 | "link_file_save_to": "./temp/scidir_search_term.json", 42 | "abs_file_save_to": "./abs/scidir_search_term.json", 43 | "use_batches": true, 44 | "batch_size": 8, 45 | "keep_link_file": true 46 | }, 47 | "ACM": { 48 | "search_term": "insert query string here", 49 | "link_file_save_to": "./temp/acm_search_term.json", 50 | "abs_file_save_to": "./abs/acm_search_term.json", 51 | "use_batches": true, 52 | "batch_size": 8, 53 | "keep_link_file": true 54 | }, 55 | "IEEE": { 56 | "search_term": "insert query string here", 57 | "link_file_save_to": "./temp/ieee_search_term.json", 58 | "abs_file_save_to": "./abs/ieee_search_term.json", 59 | "use_batches": false, 60 | "batch_size": 8, 61 | "keep_link_file": true 62 | } 63 | } 64 | ``` 65 | 66 | - or use just one bot 67 | 68 | ```json 69 | { 70 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 71 | "EXECUTABLE_PATH": "D:\\chromedriver.exe", 72 | "SCIDIR": { 73 | "search_term": "insert query string here", 74 | "link_file_save_to": "./temp/scidir_search_term.json", 75 | "abs_file_save_to": "./abs/scidir_search_term.json", 76 | "use_batches": true, 77 | "batch_size": 8, 78 | "keep_link_file": true 79 | } 80 | } 81 | ``` 82 | 83 | - config `BINARY_LOCATION` 84 | the path to your chrome.exe file 85 | 86 | - config `EXECUTABLE_PATH` 87 | the path where you downloaded and extracted the Chrome web driver 88 | 89 | 4) install the dependencies and run main.py 90 | 91 | ```shell 92 | pip install -r ./requirements.txt 93 | ``` 94 | 95 | ```shell 96 | python main.py 97 | 98 | ``` 99 | 100 | 5) that's it 101 | 6) save the results into an Excel workbook; they are automatically saved into the `./SLR.xlsx` file. 
102 | ```python 103 | from src.utils import to_excel 104 | to_excel({"acm":'./abs/acm_search_term.json', "ieee": './abs/ieee_search_term.json', "science_direct": './abs/scidir_search_term.json'}) 105 | ``` 106 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 3 | "EXECUTABLE_PATH": "D:\\ML & DL\\chrome-win64\\chrome.exe", 4 | "ACM": { 5 | "search_term": "AllField:(video processing) AND Title:(sign language detection) AND AllField:(dumb and deff) AND AllField:(sign language)", 6 | "link_file_save_to": "./temp/acm_search_term_chris.json", 7 | "abs_file_save_to": "./abs/acm_search_term_chris.json", 8 | "use_batches": true, 9 | "batch_size": 8, 10 | "keep_link_file": true 11 | }, 12 | "IEEE": { 13 | "search_term": "video processing, sign language detection, dumb", 14 | "link_file_save_to": "./temp/ieee_search_term_chris.json", 15 | "abs_file_save_to": "./abs/ieee_search_term_chris.json", 16 | "use_batches": false, 17 | "batch_size": 8, 18 | "keep_link_file": true 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /demo/acm adv search string.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/acm adv search string.jpg -------------------------------------------------------------------------------- /demo/ieee adv search string.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/ieee adv search string.jpg -------------------------------------------------------------------------------- /demo/science direct adv search string.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/science direct adv search string.jpg -------------------------------------------------------------------------------- /demo/science direct adv search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/science direct adv search.jpg -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datetime import datetime 4 | from src.scidirect import ScienceDirect, Paper as SDP 5 | from src.acm import ACM, Paper as ACMP 6 | from src.ieee import IEEE, Paper as IXP 7 | from src.utils import * 8 | 9 | if __name__ == "__main__": 10 | config = read_json("./config.json") 11 | assert validate(config) 12 | 13 | if not os.path.isdir('temp'): 14 | os.mkdir('temp') 15 | 16 | if not os.path.isdir('abs'): 17 | os.mkdir('abs') 18 | 19 | scrappers = {'IEEE', 'ACM', 'SCIDIR'}.intersection(set(config.keys())) 20 | 21 | for s in scrappers: 22 | if s == 'IEEE': 23 | # get links to individual search results 24 | ieee = IEEE(config['IEEE']['search_term']) 25 | ieee.get_links_to_papers() 26 | 27 | # dump links 28 | if config['IEEE']['keep_link_file']: 29 | ieee.to_json(config['IEEE']['link_file_save_to']) 
30 | 31 | # get abstract of the and every search results 32 | ieee_paper = IXP(config['IEEE']['link_file_save_to']) 33 | 34 | if config['IEEE']['use_batches']: 35 | ieee_paper.batch_update_details(config['IEEE']['batch_size']) 36 | 37 | else: 38 | ieee_paper.update_paper_details() 39 | 40 | ieee_paper.to_json(config['IEEE']['abs_file_save_to']) 41 | 42 | if not config['IEEE']['keep_link_file']: 43 | os.remove(config['IEEE']['link_file_save_to']) 44 | 45 | elif s == 'ACM': 46 | # get links to individual search results 47 | current_year = datetime.now().year 48 | acm = ACM((current_year - 5), current_year, config['ACM']['search_term']) 49 | acm.get_links_to_papers() 50 | 51 | # dump links 52 | if config['ACM']['keep_link_file']: 53 | acm.to_json(config['ACM']['link_file_save_to']) 54 | 55 | # get abstract of the and every search results 56 | acm_paper = ACMP(config['ACM']['link_file_save_to']) 57 | 58 | if config['ACM']['use_batches']: 59 | acm_paper.batch_update_details(config['ACM']['batch_size']) 60 | 61 | else: 62 | acm_paper.update_paper_details() 63 | 64 | acm_paper.to_json(config['ACM']['abs_file_save_to']) 65 | 66 | if not config['ACM']['keep_link_file']: 67 | os.remove(config['ACM']['link_file_save_to']) 68 | 69 | elif s == 'SCIDIR': 70 | # get links to individual search results 71 | current_year = datetime.now().year 72 | sd = ScienceDirect((current_year - 5), current_year, config['SCIDIR']['search_term']) 73 | 74 | if sd.driver is not None: 75 | sd.driver.delete_all_cookies() 76 | 77 | sd.get_links_to_papers() 78 | 79 | # dump links 80 | if config['SCIDIR']['keep_link_file']: 81 | sd.to_json(config['SCIDIR']['link_file_save_to']) 82 | 83 | # get abstract of the and every search results 84 | sd_paper = SDP(config['SCIDIR']['link_file_save_to']) 85 | 86 | if sd_paper.driver is not None: 87 | sd_paper.driver.delete_all_cookies() 88 | 89 | if config['SCIDIR']['use_batches']: 90 | sd_paper.batch_update_details(config['SCIDIR']['batch_size']) 91 | 92 | else: 93 | sd_paper.update_paper_details() 94 | 95 | sd_paper.to_json(config['SCIDIR']['abs_file_save_to']) 96 | 97 | if not config['SCIDIR']['keep_link_file']: 98 | os.remove(config['SCIDIR']['link_file_save_to']) 99 | 100 | else: 101 | raise ConfigurationError(f"wrong scrapper {s}.") 102 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests~=2.28.1 2 | numpy~=1.22.4 3 | selenium~=4.3.0 4 | undetected-chromedriver~=3.4.7 5 | selenium-stealth~=1.0.6 6 | pytest~=7.3.1 7 | PyYAML~=6.0 8 | pandas~=2.0.1 9 | openpyxl~=3.1.2 -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/src/__init__.py -------------------------------------------------------------------------------- /src/acm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import numpy as np 5 | import undetected_chromedriver 6 | from selenium import webdriver 7 | from selenium.webdriver.common.by import By 8 | from selenium_stealth import stealth 9 | from src.utils import * 10 | 11 | 12 | class ACM: 13 | """ 14 | Parameters 15 | ---------- 16 | start: int 17 | start year of the date range filter 18 | 19 | end: int 20 | end year of the date 
range filter 21 | 22 | search_terms: str 23 | string of search terms (it can be comma seperated or semicolon 24 | seperated string) 25 | 26 | Attributes 27 | ---------- 28 | driver: undetected_chromedriver.Chrome 29 | web driver for selenium 30 | 31 | page_count: int 32 | number of pages in search results 33 | 34 | links_to_paper: dict 35 | mined links and additional details for results 36 | 37 | origin: str 38 | origin of science direct advanced search url 39 | 40 | date_filter: str 41 | date range to filter search results 42 | 43 | results_in_a_page: str 44 | number of records should show tin single page 45 | 46 | start_page: str 47 | where is the starting location in page numbering 48 | 49 | query_text: str 50 | encoded search query string to apply in URL 51 | 52 | Methods 53 | ------- 54 | encode_search_terms_into_query: 55 | encode user given search terms into URL string 56 | 57 | construct_full_link: 58 | create full link to make request from server 59 | 60 | create_query_text: 61 | create encoded query text to insert in URL 62 | 63 | init_driver: 64 | initiate web driver and session 65 | 66 | close_driver: 67 | close web driver and session 68 | 69 | post_request: 70 | post a request to science direct server 71 | 72 | check_for_multiple_pages: 73 | check weather search results contains multiple pages 74 | in results 75 | 76 | mine_links: 77 | get links to each search result (for each individual paper) 78 | 79 | get_links_to_papers: 80 | create paper link list 81 | 82 | to_json: 83 | dump results into json 84 | 85 | """ 86 | options = webdriver.ChromeOptions() 87 | config = read_json('./config.json') 88 | 89 | options.add_argument("--headless") 90 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 91 | options.add_experimental_option('useAutomationExtension', False) 92 | options.add_argument("--disable-blink-features=AutomationControlled") 93 | options.binary_location = config['BINARY_LOCATION'] 94 | 95 | def __init__(self, 96 | start, 97 | end, 98 | search_terms): 99 | self.driver = None 100 | self.page_count = None 101 | self.links_to_paper = {} 102 | self.search_terms = search_terms 103 | self.origin = "https://dl.acm.org/action/doSearch?" 
104 | self.quick_search = "fillQuickSearch=false" 105 | self.target = "&target=advanced&expand=dl" 106 | self.date_filter = f"&AfterYear={start}&BeforeYear={end}" 107 | self.query_text = self.create_query_text() 108 | self.start_page = "&startPage=0" 109 | self.results_in_a_page = "&pageSize=50" 110 | 111 | @staticmethod 112 | def encode_search_terms_into_query(keywords: str) -> str: 113 | """ 114 | encode user given search terms into URL string 115 | 116 | Parameters 117 | ---------- 118 | keywords: str 119 | search terms to create search query 120 | 121 | Returns 122 | ------- 123 | 124 | """ 125 | encode = keywords.replace(' ', "+") 126 | encode = encode.replace(';', "%3B") 127 | encode = encode.replace(':', "%3A") 128 | encode = encode.replace(',', "%2C") 129 | encode = encode.replace('(', "%28") 130 | encode = encode.replace(')', "%29") 131 | 132 | return encode 133 | 134 | def create_query_text(self) -> str: 135 | """ 136 | create query text 137 | 138 | Returns 139 | ------- 140 | 141 | """ 142 | return f"&AllField={self.encode_search_terms_into_query(self.search_terms)}" 143 | 144 | def construct_full_link(self) -> str: 145 | """ 146 | create full link to make request from server 147 | 148 | Returns 149 | ------- 150 | 151 | """ 152 | return ''.join([self.origin, 153 | self.quick_search, 154 | self.target, 155 | self.date_filter, 156 | self.query_text, 157 | self.start_page, 158 | self.results_in_a_page]) 159 | 160 | def init_driver(self) -> None: 161 | """ 162 | initiate web driver and session 163 | 164 | Returns 165 | ------- 166 | 167 | """ 168 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 169 | executable_path=self.config['EXECUTABLE_PATH']) 170 | clean_cookies_and_caches(self.driver) 171 | 172 | def close_driver(self) -> None: 173 | """ 174 | close web driver and session 175 | 176 | Returns 177 | ------- 178 | 179 | """ 180 | self.driver.close() 181 | 182 | def post_request(self, link) -> None: 183 | """ 184 | post a request to science direct server 185 | 186 | Parameters 187 | ---------- 188 | link: str 189 | URL to make request on 190 | 191 | Returns 192 | ------- 193 | 194 | """ 195 | stealth(self.driver, 196 | languages=["en-US", "en"], 197 | vendor="Google Inc.", 198 | platform="Win32", 199 | webgl_vendor="Intel Inc.", 200 | renderer="Intel Iris OpenGL Engine", 201 | fix_hairline=True, 202 | ) 203 | # make request 204 | self.driver.delete_all_cookies() 205 | self.driver.get(link) 206 | time.sleep(abs(np.random.normal(2, 0.4))) 207 | 208 | def check_for_multiple_pages(self) -> bool: 209 | """ 210 | check weather search results contains multiple pages 211 | in results 212 | 213 | Returns 214 | ------- 215 | 216 | """ 217 | link = self.construct_full_link() 218 | self.init_driver() 219 | self.post_request(link) 220 | 221 | tot_results = int(self.driver.find_element(By.CLASS_NAME, 222 | value="result__count").text.split(' ')[0]) 223 | 224 | self.page_count = int(np.round(tot_results / 50)) 225 | 226 | self.close_driver() 227 | 228 | return True if self.page_count > 1 else False 229 | 230 | def mine_links(self) -> None: 231 | """ 232 | get links to each search result (for each individual paper) 233 | 234 | Returns 235 | ------- 236 | 237 | """ 238 | types = self.driver.find_elements(By.CLASS_NAME, value="issue-heading") 239 | dates = self.driver.find_elements(By.CLASS_NAME, value="bookPubDate") 240 | titles = self.driver.find_elements(By.CLASS_NAME, value="issue-item__title") 241 | links = self.driver.find_elements(By.CSS_SELECTOR, 242 | 
value="h5[class='issue-item__title']>span[class='hlFld-Title']>a") 243 | 244 | for type_, date, title, link in zip(types, dates, titles, links): 245 | self.links_to_paper[f'{link.get_attribute("href").split("/")[-1]}'] = {"type_": type_.text, 246 | "date": date.text, 247 | "title": title.text, 248 | "link": link.get_attribute('href')} 249 | 250 | time.sleep(abs(np.random.uniform(2, 4))) 251 | 252 | def get_links_to_papers(self) -> None: 253 | """ 254 | create paper link list 255 | 256 | Returns 257 | ------- 258 | 259 | """ 260 | if self.check_for_multiple_pages(): 261 | for i in range(1, (self.page_count + 1)): 262 | self.start_page = f"&startPage={i}" 263 | self.init_driver() 264 | self.post_request(self.construct_full_link()) 265 | self.mine_links() 266 | 267 | print(f'reading page: {i + 1} from {self.page_count}', end='\r') 268 | 269 | self.close_driver() 270 | 271 | else: 272 | self.init_driver() 273 | self.post_request(self.construct_full_link()) 274 | self.mine_links() 275 | self.close_driver() 276 | 277 | def to_json(self, path) -> None: 278 | """ 279 | dump results into json 280 | 281 | Parameters 282 | ---------- 283 | path: str 284 | string path for save results (link and additional details) 285 | 286 | Returns 287 | ------- 288 | 289 | """ 290 | with open(path, 'w') as file: 291 | json.dump(self.links_to_paper, file) 292 | 293 | 294 | class Paper: 295 | options = webdriver.ChromeOptions() 296 | config = read_json('./config.json') 297 | 298 | options.add_argument("--headless") 299 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 300 | options.add_experimental_option('useAutomationExtension', False) 301 | options.add_argument("--disable-blink-features=AutomationControlled") 302 | options.binary_location = config['BINARY_LOCATION'] 303 | 304 | def __init__(self, file_name): 305 | self.driver = None 306 | self.destination = file_name 307 | 308 | with open(file_name, "r") as file: 309 | self.link_object = json.load(file) 310 | 311 | def init_driver(self) -> None: 312 | """ 313 | initiate web driver and session 314 | 315 | Returns 316 | ------- 317 | 318 | """ 319 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 320 | executable_path=self.config['EXECUTABLE_PATH']) 321 | clean_cookies_and_caches(self.driver) 322 | 323 | def close_driver(self) -> None: 324 | """ 325 | close web driver and session 326 | 327 | Returns 328 | ------- 329 | 330 | """ 331 | self.driver.close() 332 | 333 | def request_paper(self, page_link) -> None: 334 | """ 335 | post a request to science direct server 336 | 337 | Parameters 338 | ---------- 339 | page_link: str 340 | URL to make request on 341 | 342 | Returns 343 | ------- 344 | 345 | """ 346 | stealth(self.driver, 347 | languages=["en-US", "en"], 348 | vendor="Google Inc.", 349 | platform="Win32", 350 | webgl_vendor="Intel Inc.", 351 | renderer="Intel Iris OpenGL Engine", 352 | fix_hairline=True, 353 | ) 354 | 355 | URL = page_link 356 | 357 | # make request 358 | self.driver.delete_all_cookies() 359 | self.driver.get(URL) 360 | 361 | time.sleep(abs(np.random.normal(1, 0.4))) 362 | 363 | def get_abstract_text(self) -> str: 364 | """ 365 | get abstract from each publication 366 | 367 | Returns 368 | ------- 369 | abstract: str 370 | 371 | """ 372 | return self.driver.find_element(By.CLASS_NAME, 'abstractInFull').text 373 | 374 | # def click_kw_section(self) -> None: 375 | # self.driver.execute_script("arguments[0].scrollIntoView();", 376 | # self.driver.find_element(By.ID, 'keywords')) 377 | # 
WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, 'keywords'))).click() 378 | # time.sleep(1) 379 | 380 | # def get_keywords(self) -> list: 381 | # """ 382 | # get all type of keywords in ieee xplore for the publication 383 | # 384 | # Returns 385 | # ------- 386 | # list of keyword strings: list 387 | # 388 | # """ 389 | # kw_types = self.driver.find_elements(By.CSS_SELECTOR, 390 | # "ul[class='doc-keywords-list stats-keywords-list']>li[" 391 | # "class='doc-keywords-list-item']>ul") 392 | # return [kw.text.replace('\n', '') for kw in kw_types if kw.text != ''] 393 | 394 | def update_paper_details(self) -> None: 395 | """ 396 | update the detail object of the publications 397 | 398 | Returns 399 | ------- 400 | 401 | """ 402 | # start driver 403 | self.init_driver() 404 | 405 | for obj in self.link_object.values(): 406 | doc_link = obj['link'] 407 | self.request_paper(doc_link) 408 | # self.click_kw_section() 409 | 410 | time.sleep(abs(np.random.normal(1, 0.4))) 411 | 412 | try: 413 | abstract = self.get_abstract_text() 414 | # kws = self.get_keywords() 415 | 416 | except: 417 | abstract = np.NAN 418 | # kws = np.NAN 419 | 420 | obj['abs'] = abstract 421 | 422 | # if kws not in value: 423 | # value.append(kws) 424 | 425 | # close driver 426 | self.close_driver() 427 | 428 | def batch_update_details(self, size) -> None: 429 | """ 430 | update the detail object of the publications batch wise 431 | 432 | Parameters 433 | ---------- 434 | size: int 435 | size of a batch 436 | 437 | Returns 438 | ------- 439 | 440 | """ 441 | keys = list(self.link_object.keys()) 442 | 443 | for i in range(size, len(self.link_object), size): 444 | batch = keys[(i - size):i] 445 | self.init_driver() 446 | 447 | for p in batch: 448 | doc_link = self.link_object[p]["link"] 449 | self.request_paper(doc_link) 450 | 451 | try: 452 | abstract = self.get_abstract_text() 453 | 454 | except: 455 | abstract = np.NAN 456 | 457 | if abstract not in list(self.link_object[p].values()): 458 | self.link_object[p]["abs"] = abstract 459 | 460 | # dump updated link object to json 461 | with open('./acm_temp.json', 'w') as file: 462 | json.dump(self.link_object, file) 463 | 464 | # close driver 465 | self.close_driver() 466 | 467 | def to_json(self, path) -> None: 468 | """ 469 | dump results into json 470 | 471 | Parameters 472 | ---------- 473 | path: str 474 | string path for save results (link and additional details) 475 | 476 | Returns 477 | ------- 478 | 479 | """ 480 | if os.path.isfile('./acm_temp.json'): 481 | with open('./acm_temp.json') as file: 482 | self.link_object = json.load(file) 483 | 484 | os.remove('./acm_temp.json') 485 | 486 | with open(path, 'w') as file: 487 | json.dump(self.link_object, file) 488 | -------------------------------------------------------------------------------- /src/ieee.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import requests 4 | import numpy as np 5 | import json 6 | import undetected_chromedriver 7 | 8 | from selenium import webdriver 9 | from selenium.common import exceptions 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | from selenium_stealth import stealth 14 | from src.utils import * 15 | 16 | 17 | class IEEE: 18 | """ 19 | Parameters 20 | ---------- 21 | query: str 22 | search term for either simple search or advanced search, if for advanced 23 | 
search need to add AND, OR, NOT in between search keywords. 24 | 25 | Attributes 26 | ---------- 27 | headers: dict 28 | header to post for IEEE Xplore 29 | 30 | payload: dict 31 | additional details for filter results from request 32 | 33 | page_count: int 34 | total number of pages in the search results 35 | 36 | links_to_paper: dict 37 | mined links and additional details for results 38 | 39 | Methods 40 | ------- 41 | post_request: 42 | send request to IEEE server 43 | 44 | check_for_multiple_pages: 45 | check weather results has been divide to multiple 46 | web pages, if so update the page count. 47 | 48 | mine_links: 49 | get links for each document from search results 50 | 51 | get_links_to_papers: 52 | add all links to single object 53 | 54 | to_json: 55 | dump links to json file 56 | 57 | """ 58 | 59 | def __init__(self, query): 60 | self.headers = { 61 | "Accept": "application/json, text/plain, */*", 62 | "Origin": "https://ieeexplore.ieee.org", 63 | "Content-Type": "application/json", 64 | } 65 | self.payload = { 66 | "newsearch": True, 67 | "queryText": query, 68 | "highlight": True, 69 | "returnFacets": ["ALL"], 70 | "returnType": "SEARCH", 71 | "pageNumber": 1 72 | } 73 | self.page_count = None 74 | self.links_to_paper = {} 75 | 76 | @staticmethod 77 | def post_request(header: dict, json: dict) -> requests.Response: 78 | """ 79 | send request to IEEE server 80 | 81 | Parameters 82 | ---------- 83 | header: dict 84 | header to post for IEEE Xplore 85 | 86 | json: dict 87 | additional details for filter results from request 88 | 89 | Returns 90 | ------- 91 | 92 | """ 93 | result = requests.post("https://ieeexplore.ieee.org/rest/search", 94 | headers=header, 95 | json=json) 96 | 97 | return result 98 | 99 | def check_for_multiple_pages(self) -> bool: 100 | """ 101 | check weather results has been divide to multiple 102 | web pages, if so update the page count. 
103 | 104 | Returns 105 | ------- 106 | 107 | """ 108 | results = self.post_request(self.headers, self.payload).json() 109 | self.page_count = results['totalPages'] 110 | 111 | return True if self.page_count > 1 else False 112 | 113 | def mine_links(self) -> None: 114 | """ 115 | get links for each document from search results 116 | 117 | Returns 118 | ------- 119 | 120 | """ 121 | request = self.post_request(self.headers, self.payload) 122 | j = 1 123 | 124 | while request.status_code != 200: 125 | time.sleep(abs(np.random.normal(0.1, 2))) 126 | request = self.post_request(self.headers, self.payload) 127 | 128 | results = request.json() 129 | 130 | for record in results['records']: 131 | self.links_to_paper[record['articleNumber']] = {"title": record.get('articleTitle', None), 132 | "link": record.get('documentLink', None), 133 | "date": record.get('publicationYear', None)} 134 | 135 | def get_links_to_papers(self) -> None: 136 | """ 137 | add all links to single object 138 | 139 | Returns 140 | ------- 141 | 142 | """ 143 | if self.check_for_multiple_pages(): 144 | for i in range(1, (self.page_count + 1)): 145 | self.payload["pageNumber"] = i 146 | 147 | self.mine_links() 148 | 149 | print(f'reading page: {i} from {self.page_count}', end='\r') 150 | 151 | else: 152 | self.mine_links() 153 | 154 | def to_json(self, path: str) -> None: 155 | """ 156 | dump links to json file 157 | 158 | Parameters 159 | ---------- 160 | path: str 161 | string path for save results (link and additional details) 162 | 163 | Returns 164 | ------- 165 | 166 | """ 167 | with open(path, 'w') as file: 168 | json.dump(self.links_to_paper, file) 169 | 170 | 171 | class Paper: 172 | options = webdriver.ChromeOptions() 173 | config = read_json('./config.json') 174 | 175 | options.add_argument("--headless") 176 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 177 | options.add_experimental_option('useAutomationExtension', False) 178 | options.add_argument("--disable-blink-features=AutomationControlled") 179 | options.binary_location = config['BINARY_LOCATION'] 180 | 181 | def __init__(self, file_name): 182 | self.driver = None 183 | self.failure = [] 184 | self.destination = file_name 185 | 186 | with open(file_name, "r") as file: 187 | self.link_object = json.load(file) 188 | 189 | def init_driver(self) -> None: 190 | """ 191 | initiate a web driver and session 192 | 193 | Returns 194 | ------- 195 | 196 | """ 197 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 198 | executable_path=self.config['EXECUTABLE_PATH']) 199 | clean_cookies_and_caches(self.driver) 200 | 201 | def close_driver(self) -> None: 202 | """ 203 | close a web driver and session 204 | 205 | Returns 206 | ------- 207 | 208 | """ 209 | self.driver.close() 210 | 211 | def request_paper(self, page_link) -> None: 212 | """ 213 | post a request to science direct server 214 | 215 | Parameters 216 | ---------- 217 | page_link: str 218 | URL to make request on 219 | 220 | Returns 221 | ------- 222 | 223 | """ 224 | stealth(self.driver, 225 | languages=["en-US", "en"], 226 | vendor="Google Inc.", 227 | platform="Win32", 228 | webgl_vendor="Intel Inc.", 229 | renderer="Intel Iris OpenGL Engine", 230 | fix_hairline=True, 231 | ) 232 | 233 | URL = f"https://ieeexplore.ieee.org{page_link}" 234 | 235 | # make request 236 | self.driver.delete_all_cookies() 237 | 238 | try: 239 | self.driver.get(URL) 240 | 241 | except: 242 | self.fall_back() 243 | self.driver.get(URL) 244 | 245 | 
time.sleep(abs(np.random.normal(1, 0.4))) 246 | 247 | def fall_back(self): 248 | """ 249 | recover while errors happens when requesting page data 250 | 251 | Returns 252 | ------- 253 | 254 | """ 255 | self.close_driver() 256 | time.sleep(1) 257 | self.init_driver() 258 | 259 | def get_abstract_text(self) -> str: 260 | """ 261 | get abstract from each publication 262 | 263 | Returns 264 | ------- 265 | abstract: str 266 | 267 | """ 268 | return self.driver.find_element(By.CLASS_NAME, 'abstract-text').text.replace('Abstract:\n', '') 269 | 270 | def click_kw_section(self) -> None: 271 | self.driver.execute_script("arguments[0].scrollIntoView();", 272 | self.driver.find_element(By.ID, 'keywords')) 273 | WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, 'keywords'))).click() 274 | time.sleep(1) 275 | 276 | def get_keywords(self) -> list: 277 | """ 278 | get all type of keywords in ieee xplore for the publication 279 | 280 | Returns 281 | ------- 282 | list of keyword strings: list 283 | 284 | """ 285 | kw_types = self.driver.find_elements(By.CSS_SELECTOR, 286 | "ul[class='doc-keywords-list stats-keywords-list']>li[" 287 | "class='doc-keywords-list-item']>ul") 288 | return [kw.text.replace('\n', '') for kw in kw_types if kw.text != ''] 289 | 290 | def update_paper_details(self) -> None: 291 | """ 292 | update the detail object of the publications 293 | 294 | Returns 295 | ------- 296 | 297 | """ 298 | # start driver 299 | self.init_driver() 300 | 301 | for key, value in self.link_object.items(): 302 | doc_link = value["link"] 303 | 304 | try: 305 | self.request_paper(doc_link) 306 | self.click_kw_section() 307 | 308 | except exceptions.NoSuchElementException: 309 | self.fall_back() 310 | self.request_paper(doc_link) 311 | self.click_kw_section() 312 | 313 | except: 314 | continue 315 | 316 | time.sleep(abs(np.random.normal(1, 0.4))) 317 | 318 | try: 319 | abstract = self.get_abstract_text() 320 | kws = self.get_keywords() 321 | 322 | except: 323 | abstract = np.NAN 324 | kws = np.NAN 325 | 326 | value["abs"] = abstract 327 | value["kws"] = kws 328 | 329 | # close driver 330 | self.close_driver() 331 | 332 | def batch_update_details(self, size) -> None: 333 | """ 334 | update the detail object of the publications batch wise 335 | 336 | Parameters 337 | ---------- 338 | size: int 339 | size of a batch 340 | 341 | Returns 342 | ------- 343 | 344 | """ 345 | keys = list(self.link_object.keys()) 346 | 347 | for i in range(size, len(self.link_object), size): 348 | batch = keys[(i - size):i] 349 | self.init_driver() 350 | 351 | for p in batch: 352 | doc_link = self.link_object[p]["link"] 353 | 354 | try: 355 | self.request_paper(doc_link) 356 | self.click_kw_section() 357 | 358 | except exceptions.NoSuchElementException: 359 | self.fall_back() 360 | self.request_paper(doc_link) 361 | self.click_kw_section() 362 | 363 | except: 364 | continue 365 | 366 | try: 367 | abstract = self.get_abstract_text() 368 | kws = self.get_keywords() 369 | 370 | except: 371 | abstract = np.NAN 372 | kws = np.NAN 373 | 374 | self.link_object[p]["abs"] = abstract 375 | self.link_object[p]["abs"] = kws 376 | 377 | # dump updated link object to json 378 | with open('./ieee_temp.json', 'w') as file: 379 | json.dump(self.link_object, file) 380 | 381 | # close driver 382 | self.close_driver() 383 | 384 | def to_json(self, path) -> None: 385 | """ 386 | dump results into json 387 | 388 | Parameters 389 | ---------- 390 | path: str 391 | string path for save results (link and additional details) 392 | 393 | 
Returns 394 | ------- 395 | 396 | """ 397 | if os.path.isfile('./ieee_temp.json'): 398 | with open('./ieee_temp.json') as file: 399 | self.link_object = json.load(file) 400 | 401 | os.remove('./ieee_temp.json') 402 | 403 | with open(path, 'w') as file: 404 | json.dump(self.link_object, file) 405 | -------------------------------------------------------------------------------- /src/scidirect.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import numpy as np 5 | import undetected_chromedriver 6 | from selenium import webdriver 7 | from selenium.webdriver.common.by import By 8 | from selenium_stealth import stealth 9 | from src.utils import * 10 | 11 | 12 | class ScienceDirect: 13 | """ 14 | Parameters 15 | ---------- 16 | start: int 17 | start year of the date range filter 18 | 19 | end: int 20 | end year of the date range filter 21 | 22 | search_terms: str 23 | string of search terms (it can be comma seperated or semicolon 24 | seperated string) 25 | 26 | Attributes 27 | ---------- 28 | driver: undetected_chromedriver.Chrome 29 | web driver for selenium 30 | 31 | page_count: int 32 | number of pages in search results 33 | 34 | links_to_paper: dict 35 | mined links and additional details for results 36 | 37 | origin: str 38 | origin of science direct advanced search url 39 | 40 | date_filter: str 41 | date range to filter search results 42 | 43 | results_in_a_page: str 44 | number of records should show tin single page 45 | 46 | offset: str 47 | number of records should go forward for next page 48 | in search results 49 | 50 | query_text: str 51 | encoded search query string to apply in URL 52 | 53 | article_type: str 54 | science direct article type category indicator 55 | 56 | Methods 57 | ------- 58 | encode_search_terms_into_query: 59 | encode user given search terms into URL string 60 | 61 | construct_full_link: 62 | create full link to make request from server 63 | 64 | init_driver: 65 | initiate web driver and session 66 | 67 | close_driver: 68 | close web driver and session 69 | 70 | post_request: 71 | post a request to science direct server 72 | 73 | check_for_multiple_pages: 74 | check weather search results contains multiple pages 75 | in results 76 | 77 | mine_links: 78 | get links to each search result (for each individual paper) 79 | 80 | get_links_to_papers: 81 | create paper link list 82 | 83 | to_json: 84 | dump results into json 85 | 86 | """ 87 | 88 | options = webdriver.ChromeOptions() 89 | config = read_json('./config.json') 90 | 91 | options.add_argument("--headless") 92 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 93 | options.add_experimental_option('useAutomationExtension', False) 94 | options.add_argument("--disable-blink-features=AutomationControlled") 95 | options.binary_location = config['BINARY_LOCATION'] 96 | 97 | def __init__(self, start: int, end: int, search_terms: str): 98 | self.driver = None 99 | self.page_count = None 100 | self.links_to_paper = {} 101 | self.origin = "https://www.sciencedirect.com/search" 102 | self.date_filter = f"?date={start}-{end}" 103 | self.results_in_a_page = "&show=100" 104 | self.offset = "&offset=0" 105 | self.query_text = self.encode_search_terms_into_query(search_terms) 106 | self.article_type = "&articleTypes=FLA" 107 | 108 | @staticmethod 109 | def encode_search_terms_into_query(keywords: str) -> str: 110 | """ 111 | encode user given search terms into URL string 112 | 113 | Parameters 114 | ---------- 115 
| keywords: str 116 | search terms to create search query 117 | 118 | Returns 119 | ------- 120 | 121 | """ 122 | encode = keywords.replace(' ', "%20") 123 | encode = encode.replace(';', "%3B") 124 | encode = encode.replace(',', "%2C") 125 | 126 | return f"&qs={encode}" 127 | 128 | def construct_full_link(self) -> str: 129 | """ 130 | create full link to make request from server 131 | 132 | Returns 133 | ------- 134 | 135 | """ 136 | return ''.join([self.origin, 137 | self.date_filter, 138 | self.query_text, 139 | self.results_in_a_page, 140 | self.offset, 141 | self.article_type]) 142 | 143 | def init_driver(self) -> None: 144 | """ 145 | initiate web driver and session 146 | 147 | Returns 148 | ------- 149 | 150 | """ 151 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 152 | executable_path=self.config['EXECUTABLE_PATH']) 153 | clean_cookies_and_caches(self.driver) 154 | 155 | def close_driver(self) -> None: 156 | """ 157 | close web driver and session 158 | 159 | Returns 160 | ------- 161 | 162 | """ 163 | self.driver.close() 164 | 165 | def post_request(self, link: str) -> None: 166 | """ 167 | post a request to science direct server 168 | 169 | Parameters 170 | ---------- 171 | link: str 172 | URL to make request on 173 | 174 | Returns 175 | ------- 176 | 177 | """ 178 | stealth(self.driver, 179 | languages=["en-US", "en"], 180 | vendor="Google Inc.", 181 | platform="Win32", 182 | webgl_vendor="Intel Inc.", 183 | renderer="Intel Iris OpenGL Engine", 184 | fix_hairline=True, 185 | ) 186 | # make request 187 | self.driver.delete_all_cookies() 188 | self.driver.get(link) 189 | time.sleep(abs(np.random.normal(2, 0.4))) 190 | 191 | def check_for_multiple_pages(self) -> bool: 192 | """ 193 | check weather search results contains multiple pages 194 | in results 195 | 196 | Returns 197 | ------- 198 | 199 | """ 200 | link = self.construct_full_link() 201 | self.init_driver() 202 | self.post_request(link) 203 | 204 | tot_results = int(self.driver.find_element(By.CLASS_NAME, 205 | value="search-body-results-text").text.split(' ')[0]) 206 | self.page_count = int(np.round(tot_results / 100)) 207 | 208 | self.close_driver() 209 | 210 | return True if self.page_count > 1 else False 211 | 212 | def mine_links(self) -> None: 213 | """ 214 | get links to each search result (for each individual paper) 215 | 216 | Returns 217 | ------- 218 | 219 | """ 220 | for title, article in zip(self.driver.find_elements(By.CLASS_NAME, value="result-list-title-link"), 221 | self.driver.find_elements(By.CLASS_NAME, value="article-type")): 222 | self.links_to_paper[title.get_attribute('id')] = {"title": title.text, 223 | "link": title.get_attribute('href'), 224 | "type_": article.text} 225 | 226 | time.sleep(abs(np.random.uniform(2, 4))) 227 | 228 | def get_links_to_papers(self) -> None: 229 | """ 230 | create paper link list 231 | 232 | Returns 233 | ------- 234 | 235 | """ 236 | if self.check_for_multiple_pages(): 237 | for i in range(self.page_count): 238 | self.offset = f"&offset={100 * i}" 239 | self.init_driver() 240 | self.post_request(self.construct_full_link()) 241 | self.mine_links() 242 | 243 | print(f'reading page: {i + 1} from {self.page_count}', end='\r') 244 | 245 | self.close_driver() 246 | 247 | else: 248 | self.init_driver() 249 | self.post_request(self.construct_full_link()) 250 | self.mine_links() 251 | self.close_driver() 252 | 253 | def to_json(self, path: str) -> None: 254 | """ 255 | dump results into json 256 | 257 | Parameters 258 | ---------- 259 | path: str 260 | 
string path for save results (link and additional details) 261 | 262 | Returns 263 | ------- 264 | 265 | """ 266 | with open(path, 'w') as file: 267 | json.dump(self.links_to_paper, file) 268 | 269 | 270 | class Paper: 271 | options = webdriver.ChromeOptions() 272 | config = read_json('./config.json') 273 | 274 | options.add_argument("--headless") 275 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 276 | options.add_experimental_option('useAutomationExtension', False) 277 | options.add_argument("--disable-blink-features=AutomationControlled") 278 | options.binary_location = config['BINARY_LOCATION'] 279 | 280 | def __init__(self, file_name): 281 | self.driver = None 282 | self.destination = file_name 283 | 284 | with open(file_name, "r") as file: 285 | self.link_object = json.load(file) 286 | 287 | def init_driver(self) -> None: 288 | """ 289 | initiate web driver and session 290 | 291 | Returns 292 | ------- 293 | 294 | """ 295 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 296 | executable_path=self.config['EXECUTABLE_PATH']) 297 | clean_cookies_and_caches(self.driver) 298 | 299 | def close_driver(self) -> None: 300 | """ 301 | close web driver and session 302 | 303 | Returns 304 | ------- 305 | 306 | """ 307 | self.driver.close() 308 | 309 | def request_paper(self, page_link) -> None: 310 | """ 311 | post a request to science direct server 312 | 313 | Parameters 314 | ---------- 315 | page_link: str 316 | URL to make request on 317 | 318 | Returns 319 | ------- 320 | 321 | """ 322 | stealth(self.driver, 323 | languages=["en-US", "en"], 324 | vendor="Google Inc.", 325 | platform="Win32", 326 | webgl_vendor="Intel Inc.", 327 | renderer="Intel Iris OpenGL Engine", 328 | fix_hairline=True, 329 | ) 330 | 331 | URL = page_link 332 | 333 | # make request 334 | self.driver.delete_all_cookies() 335 | self.driver.get(URL) 336 | 337 | time.sleep(abs(np.random.normal(1, 0.4))) 338 | 339 | def get_abstract_text(self) -> str: 340 | """ 341 | get abstract from each publication 342 | 343 | Returns 344 | ------- 345 | abstract: str 346 | 347 | """ 348 | return self.driver.find_element(By.CLASS_NAME, 'abstract').text.replace('Abstract:\n', '') 349 | 350 | # def click_kw_section(self) -> None: 351 | # self.driver.execute_script("arguments[0].scrollIntoView();", 352 | # self.driver.find_element(By.ID, 'keywords')) 353 | # WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, 'keywords'))).click() 354 | # time.sleep(1) 355 | # 356 | # def get_keywords(self) -> list: 357 | # """ 358 | # get all type of keywords in ieee xplore for the publication 359 | # 360 | # Returns 361 | # ------- 362 | # list of keyword strings: list 363 | # 364 | # """ 365 | # kw_types = self.driver.find_elements(By.CSS_SELECTOR, 366 | # "ul[class='doc-keywords-list stats-keywords-list']>li[" 367 | # "class='doc-keywords-list-item']>ul") 368 | # return [kw.text.replace('\n', '') for kw in kw_types if kw.text != ''] 369 | 370 | def update_paper_details(self) -> None: 371 | """ 372 | update the detail object of the publications 373 | 374 | Returns 375 | ------- 376 | 377 | """ 378 | # start driver 379 | self.init_driver() 380 | 381 | for key, value in self.link_object.items(): 382 | doc_link = value["link"] 383 | self.request_paper(doc_link) 384 | 385 | time.sleep(abs(np.random.normal(1, 0.4))) 386 | 387 | try: 388 | abstract = self.get_abstract_text() 389 | 390 | except: 391 | abstract = np.NAN 392 | 393 | value["abs"] = abstract 394 | 395 | # close driver 396 | 
self.close_driver() 397 | 398 | def batch_update_details(self, size) -> None: 399 | """ 400 | update the detail object of the publications batch wise 401 | 402 | Parameters 403 | ---------- 404 | size: int 405 | size of a batch 406 | 407 | Returns 408 | ------- 409 | 410 | """ 411 | keys = list(self.link_object.keys()) 412 | 413 | for i in range(size, len(self.link_object), size): 414 | batch = keys[(i - size):i] 415 | self.init_driver() 416 | 417 | for p in batch: 418 | doc_link = self.link_object[p]["link"] 419 | self.request_paper(doc_link) 420 | 421 | try: 422 | abstract = self.get_abstract_text() 423 | 424 | except: 425 | abstract = np.NAN 426 | 427 | self.link_object[p]["abs"] = abstract 428 | 429 | # dump updated link object to json 430 | with open('./sci_temp.json', 'w') as file: 431 | json.dump(self.link_object, file) 432 | 433 | # close driver 434 | self.close_driver() 435 | 436 | def to_json(self, path) -> None: 437 | """ 438 | dump results into json 439 | 440 | Parameters 441 | ---------- 442 | path: str 443 | string path for save results (link and additional details) 444 | 445 | Returns 446 | ------- 447 | 448 | """ 449 | if os.path.isfile('./sci_temp.json'): 450 | with open('./sci_temp.json') as file: 451 | self.link_object = json.load(file) 452 | 453 | os.remove('./sci_temp.json') 454 | 455 | with open(path, 'w') as file: 456 | json.dump(self.link_object, file) 457 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | 6 | 7 | def clean_cookies_and_caches(driver): 8 | # first falls check 9 | if driver is not None: 10 | driver.delete_all_cookies() 11 | 12 | # step 2 13 | # method 1 14 | driver.execute_script('window.localStorage.clear()') 15 | 16 | # method 2 17 | driver.execute_script('window.sessionStorage.clear()') 18 | 19 | 20 | def read_json(file_path): 21 | with open(file_path, "r") as f: 22 | return json.load(f) 23 | 24 | 25 | def to_excel(sheets: dict): 26 | dfs = {key: pd.read_json(filename) for key, filename in sheets.items()} 27 | 28 | with pd.ExcelWriter('./SLR_chris.xlsx') as writer: 29 | for sheet, df in dfs.items(): 30 | df.T.to_excel(writer, sheet_name=sheet) 31 | 32 | 33 | def validate(obj: dict): 34 | if obj == {}: 35 | raise ConfigurationError() 36 | 37 | scrappers = {'IEEE', 'ACM', 'SCIDIR'}.intersection(set(obj.keys())) 38 | if not obj.get('BINARY_LOCATION', False): 39 | raise ConfigurationError() 40 | 41 | if not obj.get('EXECUTABLE_PATH', False): 42 | raise ConfigurationError() 43 | 44 | assert len(scrappers) != 0 45 | 46 | print(f"detected scrappers: {scrappers}") 47 | print('=' * 25) 48 | 49 | validate_scrapper_keys(obj, scrappers) 50 | 51 | return True 52 | 53 | 54 | def validate_scrapper_keys(obj: dict, detected: set): 55 | expected_keys = ['search_term', 'link_file_save_to', 56 | 'abs_file_save_to', 'use_batches', 57 | 'batch_size', 'keep_link_file'] 58 | for s in detected: 59 | if list(obj[s].keys()) != expected_keys: 60 | raise ConfigurationError(expected_keys) 61 | 62 | 63 | class ConfigurationError(Exception): 64 | """ 65 | raise when scrapper configuration misses 66 | expected key or keys 67 | """ 68 | 69 | def __int__(self, exp_keys: list): 70 | self.exp_keys = exp_keys 71 | 72 | def __repr__(self): 73 | return f"{' '.join(self.exp_keys)} one or more keys missing from those." 
74 | 75 | 76 | # TODO: complete this class 77 | # class GetSummery: 78 | # config = read_json('./config.json') 79 | # openai.api_key = config['API_KEY'] 80 | # 81 | # def __init__(self): 82 | # self.text_generator = None 83 | # self.paper_dtls = None 84 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/tests/__init__.py -------------------------------------------------------------------------------- /tests/bad.json: -------------------------------------------------------------------------------- 1 | { 2 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 3 | "IEEE": { 4 | "search_term": "", 5 | "link_file_save_to": "./temp/ieee_search_term.json", 6 | "abs_file_save_to": "./abs/ieee_search_term.json", 7 | "use_batches": true, 8 | "batch_size": 8, 9 | "keep_link_file": true 10 | }, 11 | "ACM": { 12 | "search_term": "", 13 | "link_file_save_to": "./temp/acm_search_term.json", 14 | "abs_file_save_to": "./abs/acm_search_term.json", 15 | "use_batches": true, 16 | "batch_size": 8, 17 | "keep_link_file": true 18 | }, 19 | "SCIDIR": {} 20 | } -------------------------------------------------------------------------------- /tests/empty.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/validate_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from src.utils import * 4 | from pytest import raises 5 | 6 | 7 | def test_is_config_exists(file_name='x.json'): 8 | with raises(FileNotFoundError): 9 | read_json(file_name) 10 | 11 | 12 | def test_whether_config_empty(file_name='tests/empty.json'): 13 | config = read_json(file_name) 14 | 15 | with raises(ConfigurationError): 16 | validate(config) 17 | 18 | 19 | def test_config_file(): 20 | config = read_json('config.json') 21 | validate(config) 22 | 23 | 24 | def test_able_to_identify_bad_config(): 25 | config = read_json('tests/bad.json') 26 | 27 | with raises(ConfigurationError): 28 | validate(config) 29 | 30 | with raises(ConfigurationError): 31 | obj = { 32 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 33 | "EXECUTABLE_PATH": "D:\\chromedriver.exe", 34 | "ACM": { 35 | "search_term": "", 36 | "link_file_save_to": "./temp/acm_search_term.json", 37 | "abs_file_save_to": "./abs/acm_search_term.json", 38 | "use_batches": True, 39 | "batch_size": 8, 40 | "keep_link_file": True 41 | }, 42 | "SCIDIR": { 43 | } 44 | } 45 | 46 | validate(obj) 47 | --------------------------------------------------------------------------------
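A closing note on what these tests exercise: `validate` in `src/utils.py` requires `BINARY_LOCATION`, `EXECUTABLE_PATH` and at least one of the `IEEE`, `ACM`, `SCIDIR` blocks, and `validate_scrapper_keys` compares each block's key list against a fixed list, so every scrapper entry must contain exactly the six keys shown in the README examples, in that order. Below is a minimal sketch (not part of the repository) of a configuration dict that passes validation; the two paths are placeholders:

```python
from src.utils import validate

# A minimal configuration object accepted by validate().
# The two paths below are placeholders; point them at your own Chrome binary
# and the chromedriver executable you downloaded.
config = {
    "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
    "EXECUTABLE_PATH": "D:\\chromedriver.exe",
    "IEEE": {
        # keys must appear in exactly this order, because
        # validate_scrapper_keys() compares the whole key list at once
        "search_term": "video processing, sign language detection",
        "link_file_save_to": "./temp/ieee_search_term.json",
        "abs_file_save_to": "./abs/ieee_search_term.json",
        "use_batches": False,
        "batch_size": 8,
        "keep_link_file": True,
    },
}

assert validate(config)  # raises ConfigurationError when a required key is missing
```

The same structure applies to `config.json`; `bad.json` above fails because `EXECUTABLE_PATH` is missing and its `SCIDIR` block is empty.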