5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LiteratureReview
2 |
3 | A scraper for various science databases. Supported databases are IEEE Xplore, Science Direct and
4 | ACM. These scraping bots retrieve the link to each search result (i.e. each paper), its title and
5 | other metadata such as keywords, abstract and the type of paper (conference, journal etc.), which
6 | makes the systematic literature review process easier.
7 |
8 | _*If you find this work useful, put a star on this repo ⭐*_
9 |
10 | # Prerequisites
11 |
12 | - Python 3.9 or higher
13 | - Chrome browser
14 | - Chrome web driver that matches your Chrome version; download it from [here](https://chromedriver.chromium.org/downloads/)
15 |
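To check that the driver you downloaded matches your installed Chrome (a quick manual sanity check, not something the tool runs for you), you can print the driver version from the folder where you extracted it:

```shell
chromedriver --version
```
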
16 | # How to use
17 |
18 | 1) Go to the official site (advanced search page) and create a search query using their form.
19 |    Example advanced-search screenshots for each database are in the `demo/` folder:
20 |
21 |    Science Direct
22 |
23 |    IEEE Xplore
24 |
25 |    ACM
26 | 2) Copy that query text and use it to configure the tool.
27 | 3) Clone the repo (creating a virtual environment is recommended; a sketch follows the clone command below)
28 |    and complete the configuration. You can configure a single bot or all the bots in one configuration file.
29 |
30 | ```shell
31 | git clone https://github.com/ashen007/LiteratureReview.git
32 | ```
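
If you prefer to work inside a virtual environment, a minimal sketch (assuming the standard `venv` module and default folder names) looks like this:

```shell
cd LiteratureReview
python -m venv .venv

# Windows
.venv\Scripts\activate

# Linux / macOS
# source .venv/bin/activate
```
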
33 | - all bots with a single configuration
34 |
35 | ```json
36 | {
37 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
38 | "EXECUTABLE_PATH": "D:\\chromedriver.exe",
39 | "SCIDIR": {
40 | "search_term": "insert query string here",
41 | "link_file_save_to": "./temp/scidir_search_term.json",
42 | "abs_file_save_to": "./abs/scidir_search_term.json",
43 | "use_batches": true,
44 | "batch_size": 8,
45 | "keep_link_file": true
46 | },
47 | "ACM": {
48 | "search_term": "insert query string here",
49 | "link_file_save_to": "./temp/acm_search_term.json",
50 | "abs_file_save_to": "./abs/acm_search_term.json",
51 | "use_batches": true,
52 | "batch_size": 8,
53 | "keep_link_file": true
54 | },
55 | "IEEE": {
56 | "search_term": "insert query string here",
57 | "link_file_save_to": "./temp/ieee_search_term.json",
58 | "abs_file_save_to": "./abs/ieee_search_term.json",
59 | "use_batches": false,
60 | "batch_size": 8,
61 | "keep_link_file": true
62 | }
63 | }
64 | ```
65 |
66 | - or use just one bot
67 |
68 | ```json
69 | {
70 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
71 | "EXECUTABLE_PATH": "D:\\chromedriver.exe",
72 | "SCIDIR": {
73 | "search_term": "insert query string here",
74 | "link_file_save_to": "./temp/scidir_search_term.json",
75 | "abs_file_save_to": "./abs/scidir_search_term.json",
76 | "use_batches": true,
77 | "batch_size": 8,
78 | "keep_link_file": true
79 | }
80 | }
81 | ```
82 |
83 | - config `BINARY_LOCATION`
84 |   the path to your chrome.exe file
85 |
86 | - config `EXECUTABLE_PATH`
87 |   the path to where you downloaded and extracted the Chrome web driver
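
For example, with a default Chrome install on Windows and the driver extracted to `D:\`, these two entries could look like this (example paths only, adjust them to your machine):

```json
{
  "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
  "EXECUTABLE_PATH": "D:\\chromedriver.exe"
}
```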
88 |
89 | 4) Install the dependencies and run main.py:
90 |
91 | ```shell
92 | pip install -r ./requirements.txt
93 | ```
94 |
95 | ```shell
96 | python main.py
97 | ```
98 |
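While the bots run, each one prints its progress (for example `reading page: 2 from 10`) and, with the configuration above, writes the mined links into `./temp/` and the collected abstracts into `./abs/`.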
99 |
100 | 5) That's it.
101 | 6) Save the results into an Excel workbook; by default they are written to the `./SLR.xlsx` file.
102 | ```python
103 | from src.utils import to_excel
104 | to_excel({"acm":'./abs/acm_search_term.json', "ieee": './abs/ieee_search_term.json', "science_direct": './abs/scidir_search_term.json'})
105 | ```
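
Each key in the dictionary becomes a sheet name in the workbook and each value is the path of the abstract JSON file produced by the corresponding bot, so pass only the bots you actually ran.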
106 |
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
3 | "EXECUTABLE_PATH": "D:\\ML & DL\\chrome-win64\\chrome.exe",
4 | "ACM": {
5 | "search_term": "AllField:(video processing) AND Title:(sign language detection) AND AllField:(dumb and deff) AND AllField:(sign language)",
6 | "link_file_save_to": "./temp/acm_search_term_chris.json",
7 | "abs_file_save_to": "./abs/acm_search_term_chris.json",
8 | "use_batches": true,
9 | "batch_size": 8,
10 | "keep_link_file": true
11 | },
12 | "IEEE": {
13 | "search_term": "video processing, sign language detection, dumb",
14 | "link_file_save_to": "./temp/ieee_search_term_chris.json",
15 | "abs_file_save_to": "./abs/ieee_search_term_chris.json",
16 | "use_batches": false,
17 | "batch_size": 8,
18 | "keep_link_file": true
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/demo/acm adv search string.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/acm adv search string.jpg
--------------------------------------------------------------------------------
/demo/ieee adv search string.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/ieee adv search string.jpg
--------------------------------------------------------------------------------
/demo/science direct adv search string.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/science direct adv search string.jpg
--------------------------------------------------------------------------------
/demo/science direct adv search.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/science direct adv search.jpg
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from datetime import datetime
4 | from src.scidirect import ScienceDirect, Paper as SDP
5 | from src.acm import ACM, Paper as ACMP
6 | from src.ieee import IEEE, Paper as IXP
7 | from src.utils import *
8 |
9 | if __name__ == "__main__":
10 | config = read_json("./config.json")
11 | assert validate(config)
12 |
13 | if not os.path.isdir('temp'):
14 | os.mkdir('temp')
15 |
16 | if not os.path.isdir('abs'):
17 | os.mkdir('abs')
18 |
19 | scrappers = {'IEEE', 'ACM', 'SCIDIR'}.intersection(set(config.keys()))
20 |
21 | for s in scrappers:
22 | if s == 'IEEE':
23 | # get links to individual search results
24 | ieee = IEEE(config['IEEE']['search_term'])
25 | ieee.get_links_to_papers()
26 |
27 | # dump links
28 | if config['IEEE']['keep_link_file']:
29 | ieee.to_json(config['IEEE']['link_file_save_to'])
30 |
31 |             # get abstract of each search result
32 | ieee_paper = IXP(config['IEEE']['link_file_save_to'])
33 |
34 | if config['IEEE']['use_batches']:
35 | ieee_paper.batch_update_details(config['IEEE']['batch_size'])
36 |
37 | else:
38 | ieee_paper.update_paper_details()
39 |
40 | ieee_paper.to_json(config['IEEE']['abs_file_save_to'])
41 |
42 | if not config['IEEE']['keep_link_file']:
43 | os.remove(config['IEEE']['link_file_save_to'])
44 |
45 | elif s == 'ACM':
46 | # get links to individual search results
47 | current_year = datetime.now().year
48 | acm = ACM((current_year - 5), current_year, config['ACM']['search_term'])
49 | acm.get_links_to_papers()
50 |
51 | # dump links
52 | if config['ACM']['keep_link_file']:
53 | acm.to_json(config['ACM']['link_file_save_to'])
54 |
55 |             # get abstract of each search result
56 | acm_paper = ACMP(config['ACM']['link_file_save_to'])
57 |
58 | if config['ACM']['use_batches']:
59 | acm_paper.batch_update_details(config['ACM']['batch_size'])
60 |
61 | else:
62 | acm_paper.update_paper_details()
63 |
64 | acm_paper.to_json(config['ACM']['abs_file_save_to'])
65 |
66 | if not config['ACM']['keep_link_file']:
67 | os.remove(config['ACM']['link_file_save_to'])
68 |
69 | elif s == 'SCIDIR':
70 | # get links to individual search results
71 | current_year = datetime.now().year
72 | sd = ScienceDirect((current_year - 5), current_year, config['SCIDIR']['search_term'])
73 |
74 | if sd.driver is not None:
75 | sd.driver.delete_all_cookies()
76 |
77 | sd.get_links_to_papers()
78 |
79 | # dump links
80 | if config['SCIDIR']['keep_link_file']:
81 | sd.to_json(config['SCIDIR']['link_file_save_to'])
82 |
83 |             # get abstract of each search result
84 | sd_paper = SDP(config['SCIDIR']['link_file_save_to'])
85 |
86 | if sd_paper.driver is not None:
87 | sd_paper.driver.delete_all_cookies()
88 |
89 | if config['SCIDIR']['use_batches']:
90 | sd_paper.batch_update_details(config['SCIDIR']['batch_size'])
91 |
92 | else:
93 | sd_paper.update_paper_details()
94 |
95 | sd_paper.to_json(config['SCIDIR']['abs_file_save_to'])
96 |
97 | if not config['SCIDIR']['keep_link_file']:
98 | os.remove(config['SCIDIR']['link_file_save_to'])
99 |
100 | else:
101 | raise ConfigurationError(f"wrong scrapper {s}.")
102 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests~=2.28.1
2 | numpy~=1.22.4
3 | selenium~=4.3.0
4 | undetected-chromedriver~=3.4.7
5 | selenium-stealth~=1.0.6
6 | pytest~=7.3.1
7 | PyYAML~=6.0
8 | pandas~=2.0.1
9 | openpyxl~=3.1.2
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/src/__init__.py
--------------------------------------------------------------------------------
/src/acm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import json
4 | import numpy as np
5 | import undetected_chromedriver
6 | from selenium import webdriver
7 | from selenium.webdriver.common.by import By
8 | from selenium_stealth import stealth
9 | from src.utils import *
10 |
11 |
12 | class ACM:
13 | """
14 | Parameters
15 | ----------
16 | start: int
17 | start year of the date range filter
18 |
19 | end: int
20 | end year of the date range filter
21 |
22 | search_terms: str
23 |         string of search terms (it can be a comma-separated or semicolon-separated
24 |         string)
25 |
26 | Attributes
27 | ----------
28 | driver: undetected_chromedriver.Chrome
29 | web driver for selenium
30 |
31 | page_count: int
32 | number of pages in search results
33 |
34 | links_to_paper: dict
35 | mined links and additional details for results
36 |
37 | origin: str
38 |         origin of the ACM advanced search url
39 |
40 | date_filter: str
41 | date range to filter search results
42 |
43 | results_in_a_page: str
44 |         number of records to show in a single page
45 |
46 | start_page: str
47 |         starting position in the page numbering
48 |
49 | query_text: str
50 | encoded search query string to apply in URL
51 |
52 | Methods
53 | -------
54 | encode_search_terms_into_query:
55 | encode user given search terms into URL string
56 |
57 | construct_full_link:
58 | create full link to make request from server
59 |
60 | create_query_text:
61 | create encoded query text to insert in URL
62 |
63 | init_driver:
64 | initiate web driver and session
65 |
66 | close_driver:
67 | close web driver and session
68 |
69 | post_request:
70 |         post a request to the ACM server
71 |
72 | check_for_multiple_pages:
73 |         check whether the search results contain multiple pages
74 |         of results
75 |
76 | mine_links:
77 | get links to each search result (for each individual paper)
78 |
79 | get_links_to_papers:
80 | create paper link list
81 |
82 | to_json:
83 | dump results into json
84 |
85 | """
86 | options = webdriver.ChromeOptions()
87 | config = read_json('./config.json')
88 |
89 | options.add_argument("--headless")
90 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
91 | options.add_experimental_option('useAutomationExtension', False)
92 | options.add_argument("--disable-blink-features=AutomationControlled")
93 | options.binary_location = config['BINARY_LOCATION']
94 |
95 | def __init__(self,
96 | start,
97 | end,
98 | search_terms):
99 | self.driver = None
100 | self.page_count = None
101 | self.links_to_paper = {}
102 | self.search_terms = search_terms
103 | self.origin = "https://dl.acm.org/action/doSearch?"
104 | self.quick_search = "fillQuickSearch=false"
105 | self.target = "&target=advanced&expand=dl"
106 | self.date_filter = f"&AfterYear={start}&BeforeYear={end}"
107 | self.query_text = self.create_query_text()
108 | self.start_page = "&startPage=0"
109 | self.results_in_a_page = "&pageSize=50"
110 |
111 | @staticmethod
112 | def encode_search_terms_into_query(keywords: str) -> str:
113 | """
114 | encode user given search terms into URL string
115 |
116 | Parameters
117 | ----------
118 | keywords: str
119 | search terms to create search query
120 |
121 | Returns
122 | -------
123 |
124 | """
125 | encode = keywords.replace(' ', "+")
126 | encode = encode.replace(';', "%3B")
127 | encode = encode.replace(':', "%3A")
128 | encode = encode.replace(',', "%2C")
129 | encode = encode.replace('(', "%28")
130 | encode = encode.replace(')', "%29")
131 |
132 | return encode
133 |
134 | def create_query_text(self) -> str:
135 | """
136 | create query text
137 |
138 | Returns
139 | -------
140 |
141 | """
142 | return f"&AllField={self.encode_search_terms_into_query(self.search_terms)}"
143 |
144 | def construct_full_link(self) -> str:
145 | """
146 | create full link to make request from server
147 |
148 | Returns
149 | -------
150 |
151 | """
152 | return ''.join([self.origin,
153 | self.quick_search,
154 | self.target,
155 | self.date_filter,
156 | self.query_text,
157 | self.start_page,
158 | self.results_in_a_page])
159 |
160 | def init_driver(self) -> None:
161 | """
162 | initiate web driver and session
163 |
164 | Returns
165 | -------
166 |
167 | """
168 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options,
169 | executable_path=self.config['EXECUTABLE_PATH'])
170 | clean_cookies_and_caches(self.driver)
171 |
172 | def close_driver(self) -> None:
173 | """
174 | close web driver and session
175 |
176 | Returns
177 | -------
178 |
179 | """
180 | self.driver.close()
181 |
182 | def post_request(self, link) -> None:
183 | """
184 |         post a request to the ACM server
185 |
186 | Parameters
187 | ----------
188 | link: str
189 | URL to make request on
190 |
191 | Returns
192 | -------
193 |
194 | """
195 | stealth(self.driver,
196 | languages=["en-US", "en"],
197 | vendor="Google Inc.",
198 | platform="Win32",
199 | webgl_vendor="Intel Inc.",
200 | renderer="Intel Iris OpenGL Engine",
201 | fix_hairline=True,
202 | )
203 | # make request
204 | self.driver.delete_all_cookies()
205 | self.driver.get(link)
206 | time.sleep(abs(np.random.normal(2, 0.4)))
207 |
208 | def check_for_multiple_pages(self) -> bool:
209 | """
210 |         check whether the search results contain multiple pages
211 |         of results
212 |
213 | Returns
214 | -------
215 |
216 | """
217 | link = self.construct_full_link()
218 | self.init_driver()
219 | self.post_request(link)
220 |
221 | tot_results = int(self.driver.find_element(By.CLASS_NAME,
222 | value="result__count").text.split(' ')[0])
223 |
224 | self.page_count = int(np.round(tot_results / 50))
225 |
226 | self.close_driver()
227 |
228 | return True if self.page_count > 1 else False
229 |
230 | def mine_links(self) -> None:
231 | """
232 | get links to each search result (for each individual paper)
233 |
234 | Returns
235 | -------
236 |
237 | """
238 | types = self.driver.find_elements(By.CLASS_NAME, value="issue-heading")
239 | dates = self.driver.find_elements(By.CLASS_NAME, value="bookPubDate")
240 | titles = self.driver.find_elements(By.CLASS_NAME, value="issue-item__title")
241 | links = self.driver.find_elements(By.CSS_SELECTOR,
242 | value="h5[class='issue-item__title']>span[class='hlFld-Title']>a")
243 |
244 | for type_, date, title, link in zip(types, dates, titles, links):
245 | self.links_to_paper[f'{link.get_attribute("href").split("/")[-1]}'] = {"type_": type_.text,
246 | "date": date.text,
247 | "title": title.text,
248 | "link": link.get_attribute('href')}
249 |
250 | time.sleep(abs(np.random.uniform(2, 4)))
251 |
252 | def get_links_to_papers(self) -> None:
253 | """
254 | create paper link list
255 |
256 | Returns
257 | -------
258 |
259 | """
260 | if self.check_for_multiple_pages():
261 | for i in range(1, (self.page_count + 1)):
262 | self.start_page = f"&startPage={i}"
263 | self.init_driver()
264 | self.post_request(self.construct_full_link())
265 | self.mine_links()
266 |
267 | print(f'reading page: {i + 1} from {self.page_count}', end='\r')
268 |
269 | self.close_driver()
270 |
271 | else:
272 | self.init_driver()
273 | self.post_request(self.construct_full_link())
274 | self.mine_links()
275 | self.close_driver()
276 |
277 | def to_json(self, path) -> None:
278 | """
279 | dump results into json
280 |
281 | Parameters
282 | ----------
283 | path: str
284 | string path for save results (link and additional details)
285 |
286 | Returns
287 | -------
288 |
289 | """
290 | with open(path, 'w') as file:
291 | json.dump(self.links_to_paper, file)
292 |
293 |
294 | class Paper:
295 | options = webdriver.ChromeOptions()
296 | config = read_json('./config.json')
297 |
298 | options.add_argument("--headless")
299 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
300 | options.add_experimental_option('useAutomationExtension', False)
301 | options.add_argument("--disable-blink-features=AutomationControlled")
302 | options.binary_location = config['BINARY_LOCATION']
303 |
304 | def __init__(self, file_name):
305 | self.driver = None
306 | self.destination = file_name
307 |
308 | with open(file_name, "r") as file:
309 | self.link_object = json.load(file)
310 |
311 | def init_driver(self) -> None:
312 | """
313 | initiate web driver and session
314 |
315 | Returns
316 | -------
317 |
318 | """
319 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options,
320 | executable_path=self.config['EXECUTABLE_PATH'])
321 | clean_cookies_and_caches(self.driver)
322 |
323 | def close_driver(self) -> None:
324 | """
325 | close web driver and session
326 |
327 | Returns
328 | -------
329 |
330 | """
331 | self.driver.close()
332 |
333 | def request_paper(self, page_link) -> None:
334 | """
335 |         post a request to the ACM server
336 |
337 | Parameters
338 | ----------
339 | page_link: str
340 | URL to make request on
341 |
342 | Returns
343 | -------
344 |
345 | """
346 | stealth(self.driver,
347 | languages=["en-US", "en"],
348 | vendor="Google Inc.",
349 | platform="Win32",
350 | webgl_vendor="Intel Inc.",
351 | renderer="Intel Iris OpenGL Engine",
352 | fix_hairline=True,
353 | )
354 |
355 | URL = page_link
356 |
357 | # make request
358 | self.driver.delete_all_cookies()
359 | self.driver.get(URL)
360 |
361 | time.sleep(abs(np.random.normal(1, 0.4)))
362 |
363 | def get_abstract_text(self) -> str:
364 | """
365 | get abstract from each publication
366 |
367 | Returns
368 | -------
369 | abstract: str
370 |
371 | """
372 | return self.driver.find_element(By.CLASS_NAME, 'abstractInFull').text
373 |
374 | # def click_kw_section(self) -> None:
375 | # self.driver.execute_script("arguments[0].scrollIntoView();",
376 | # self.driver.find_element(By.ID, 'keywords'))
377 | # WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, 'keywords'))).click()
378 | # time.sleep(1)
379 |
380 | # def get_keywords(self) -> list:
381 | # """
382 | # get all type of keywords in ieee xplore for the publication
383 | #
384 | # Returns
385 | # -------
386 | # list of keyword strings: list
387 | #
388 | # """
389 | # kw_types = self.driver.find_elements(By.CSS_SELECTOR,
390 | # "ul[class='doc-keywords-list stats-keywords-list']>li["
391 | # "class='doc-keywords-list-item']>ul")
392 | # return [kw.text.replace('\n', '') for kw in kw_types if kw.text != '']
393 |
394 | def update_paper_details(self) -> None:
395 | """
396 | update the detail object of the publications
397 |
398 | Returns
399 | -------
400 |
401 | """
402 | # start driver
403 | self.init_driver()
404 |
405 | for obj in self.link_object.values():
406 | doc_link = obj['link']
407 | self.request_paper(doc_link)
408 | # self.click_kw_section()
409 |
410 | time.sleep(abs(np.random.normal(1, 0.4)))
411 |
412 | try:
413 | abstract = self.get_abstract_text()
414 | # kws = self.get_keywords()
415 |
416 | except:
417 | abstract = np.NAN
418 | # kws = np.NAN
419 |
420 | obj['abs'] = abstract
421 |
422 | # if kws not in value:
423 | # value.append(kws)
424 |
425 | # close driver
426 | self.close_driver()
427 |
428 | def batch_update_details(self, size) -> None:
429 | """
430 | update the detail object of the publications batch wise
431 |
432 | Parameters
433 | ----------
434 | size: int
435 | size of a batch
436 |
437 | Returns
438 | -------
439 |
440 | """
441 | keys = list(self.link_object.keys())
442 |
443 | for i in range(size, len(self.link_object), size):
444 | batch = keys[(i - size):i]
445 | self.init_driver()
446 |
447 | for p in batch:
448 | doc_link = self.link_object[p]["link"]
449 | self.request_paper(doc_link)
450 |
451 | try:
452 | abstract = self.get_abstract_text()
453 |
454 | except:
455 | abstract = np.NAN
456 |
457 | if abstract not in list(self.link_object[p].values()):
458 | self.link_object[p]["abs"] = abstract
459 |
460 | # dump updated link object to json
461 | with open('./acm_temp.json', 'w') as file:
462 | json.dump(self.link_object, file)
463 |
464 | # close driver
465 | self.close_driver()
466 |
467 | def to_json(self, path) -> None:
468 | """
469 | dump results into json
470 |
471 | Parameters
472 | ----------
473 | path: str
474 | string path for save results (link and additional details)
475 |
476 | Returns
477 | -------
478 |
479 | """
480 | if os.path.isfile('./acm_temp.json'):
481 | with open('./acm_temp.json') as file:
482 | self.link_object = json.load(file)
483 |
484 | os.remove('./acm_temp.json')
485 |
486 | with open(path, 'w') as file:
487 | json.dump(self.link_object, file)
488 |
--------------------------------------------------------------------------------
/src/ieee.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import requests
4 | import numpy as np
5 | import json
6 | import undetected_chromedriver
7 |
8 | from selenium import webdriver
9 | from selenium.common import exceptions
10 | from selenium.webdriver.common.by import By
11 | from selenium.webdriver.support.wait import WebDriverWait
12 | from selenium.webdriver.support import expected_conditions as EC
13 | from selenium_stealth import stealth
14 | from src.utils import *
15 |
16 |
17 | class IEEE:
18 | """
19 | Parameters
20 | ----------
21 | query: str
22 | search term for either simple search or advanced search, if for advanced
23 | search need to add AND, OR, NOT in between search keywords.
24 |
25 | Attributes
26 | ----------
27 | headers: dict
28 | header to post for IEEE Xplore
29 |
30 | payload: dict
31 | additional details for filter results from request
32 |
33 | page_count: int
34 | total number of pages in the search results
35 |
36 | links_to_paper: dict
37 | mined links and additional details for results
38 |
39 | Methods
40 | -------
41 | post_request:
42 | send request to IEEE server
43 |
44 | check_for_multiple_pages:
45 |         check whether results have been divided into multiple
46 |         web pages; if so, update the page count.
47 |
48 | mine_links:
49 | get links for each document from search results
50 |
51 | get_links_to_papers:
52 | add all links to single object
53 |
54 | to_json:
55 | dump links to json file
56 |
57 | """
58 |
59 | def __init__(self, query):
60 | self.headers = {
61 | "Accept": "application/json, text/plain, */*",
62 | "Origin": "https://ieeexplore.ieee.org",
63 | "Content-Type": "application/json",
64 | }
65 | self.payload = {
66 | "newsearch": True,
67 | "queryText": query,
68 | "highlight": True,
69 | "returnFacets": ["ALL"],
70 | "returnType": "SEARCH",
71 | "pageNumber": 1
72 | }
73 | self.page_count = None
74 | self.links_to_paper = {}
75 |
76 | @staticmethod
77 | def post_request(header: dict, json: dict) -> requests.Response:
78 | """
79 | send request to IEEE server
80 |
81 | Parameters
82 | ----------
83 | header: dict
84 | header to post for IEEE Xplore
85 |
86 | json: dict
87 | additional details for filter results from request
88 |
89 | Returns
90 | -------
91 |
92 | """
93 | result = requests.post("https://ieeexplore.ieee.org/rest/search",
94 | headers=header,
95 | json=json)
96 |
97 | return result
98 |
99 | def check_for_multiple_pages(self) -> bool:
100 | """
101 |         check whether results have been divided into multiple
102 |         web pages; if so, update the page count.
103 |
104 | Returns
105 | -------
106 |
107 | """
108 | results = self.post_request(self.headers, self.payload).json()
109 | self.page_count = results['totalPages']
110 |
111 | return True if self.page_count > 1 else False
112 |
113 | def mine_links(self) -> None:
114 | """
115 | get links for each document from search results
116 |
117 | Returns
118 | -------
119 |
120 | """
121 | request = self.post_request(self.headers, self.payload)
122 | j = 1
123 |
124 | while request.status_code != 200:
125 | time.sleep(abs(np.random.normal(0.1, 2)))
126 | request = self.post_request(self.headers, self.payload)
127 |
128 | results = request.json()
129 |
130 | for record in results['records']:
131 | self.links_to_paper[record['articleNumber']] = {"title": record.get('articleTitle', None),
132 | "link": record.get('documentLink', None),
133 | "date": record.get('publicationYear', None)}
134 |
135 | def get_links_to_papers(self) -> None:
136 | """
137 | add all links to single object
138 |
139 | Returns
140 | -------
141 |
142 | """
143 | if self.check_for_multiple_pages():
144 | for i in range(1, (self.page_count + 1)):
145 | self.payload["pageNumber"] = i
146 |
147 | self.mine_links()
148 |
149 | print(f'reading page: {i} from {self.page_count}', end='\r')
150 |
151 | else:
152 | self.mine_links()
153 |
154 | def to_json(self, path: str) -> None:
155 | """
156 | dump links to json file
157 |
158 | Parameters
159 | ----------
160 | path: str
161 | string path for save results (link and additional details)
162 |
163 | Returns
164 | -------
165 |
166 | """
167 | with open(path, 'w') as file:
168 | json.dump(self.links_to_paper, file)
169 |
170 |
171 | class Paper:
172 | options = webdriver.ChromeOptions()
173 | config = read_json('./config.json')
174 |
175 | options.add_argument("--headless")
176 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
177 | options.add_experimental_option('useAutomationExtension', False)
178 | options.add_argument("--disable-blink-features=AutomationControlled")
179 | options.binary_location = config['BINARY_LOCATION']
180 |
181 | def __init__(self, file_name):
182 | self.driver = None
183 | self.failure = []
184 | self.destination = file_name
185 |
186 | with open(file_name, "r") as file:
187 | self.link_object = json.load(file)
188 |
189 | def init_driver(self) -> None:
190 | """
191 | initiate a web driver and session
192 |
193 | Returns
194 | -------
195 |
196 | """
197 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options,
198 | executable_path=self.config['EXECUTABLE_PATH'])
199 | clean_cookies_and_caches(self.driver)
200 |
201 | def close_driver(self) -> None:
202 | """
203 | close a web driver and session
204 |
205 | Returns
206 | -------
207 |
208 | """
209 | self.driver.close()
210 |
211 | def request_paper(self, page_link) -> None:
212 | """
213 |         post a request to the IEEE Xplore server
214 |
215 | Parameters
216 | ----------
217 | page_link: str
218 | URL to make request on
219 |
220 | Returns
221 | -------
222 |
223 | """
224 | stealth(self.driver,
225 | languages=["en-US", "en"],
226 | vendor="Google Inc.",
227 | platform="Win32",
228 | webgl_vendor="Intel Inc.",
229 | renderer="Intel Iris OpenGL Engine",
230 | fix_hairline=True,
231 | )
232 |
233 | URL = f"https://ieeexplore.ieee.org{page_link}"
234 |
235 | # make request
236 | self.driver.delete_all_cookies()
237 |
238 | try:
239 | self.driver.get(URL)
240 |
241 | except:
242 | self.fall_back()
243 | self.driver.get(URL)
244 |
245 | time.sleep(abs(np.random.normal(1, 0.4)))
246 |
247 | def fall_back(self):
248 | """
249 |         recover when an error happens while requesting page data
250 |
251 | Returns
252 | -------
253 |
254 | """
255 | self.close_driver()
256 | time.sleep(1)
257 | self.init_driver()
258 |
259 | def get_abstract_text(self) -> str:
260 | """
261 | get abstract from each publication
262 |
263 | Returns
264 | -------
265 | abstract: str
266 |
267 | """
268 | return self.driver.find_element(By.CLASS_NAME, 'abstract-text').text.replace('Abstract:\n', '')
269 |
270 | def click_kw_section(self) -> None:
271 | self.driver.execute_script("arguments[0].scrollIntoView();",
272 | self.driver.find_element(By.ID, 'keywords'))
273 | WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, 'keywords'))).click()
274 | time.sleep(1)
275 |
276 | def get_keywords(self) -> list:
277 | """
278 | get all type of keywords in ieee xplore for the publication
279 |
280 | Returns
281 | -------
282 | list of keyword strings: list
283 |
284 | """
285 | kw_types = self.driver.find_elements(By.CSS_SELECTOR,
286 | "ul[class='doc-keywords-list stats-keywords-list']>li["
287 | "class='doc-keywords-list-item']>ul")
288 | return [kw.text.replace('\n', '') for kw in kw_types if kw.text != '']
289 |
290 | def update_paper_details(self) -> None:
291 | """
292 | update the detail object of the publications
293 |
294 | Returns
295 | -------
296 |
297 | """
298 | # start driver
299 | self.init_driver()
300 |
301 | for key, value in self.link_object.items():
302 | doc_link = value["link"]
303 |
304 | try:
305 | self.request_paper(doc_link)
306 | self.click_kw_section()
307 |
308 | except exceptions.NoSuchElementException:
309 | self.fall_back()
310 | self.request_paper(doc_link)
311 | self.click_kw_section()
312 |
313 | except:
314 | continue
315 |
316 | time.sleep(abs(np.random.normal(1, 0.4)))
317 |
318 | try:
319 | abstract = self.get_abstract_text()
320 | kws = self.get_keywords()
321 |
322 | except:
323 | abstract = np.NAN
324 | kws = np.NAN
325 |
326 | value["abs"] = abstract
327 | value["kws"] = kws
328 |
329 | # close driver
330 | self.close_driver()
331 |
332 | def batch_update_details(self, size) -> None:
333 | """
334 | update the detail object of the publications batch wise
335 |
336 | Parameters
337 | ----------
338 | size: int
339 | size of a batch
340 |
341 | Returns
342 | -------
343 |
344 | """
345 | keys = list(self.link_object.keys())
346 |
347 | for i in range(size, len(self.link_object), size):
348 | batch = keys[(i - size):i]
349 | self.init_driver()
350 |
351 | for p in batch:
352 | doc_link = self.link_object[p]["link"]
353 |
354 | try:
355 | self.request_paper(doc_link)
356 | self.click_kw_section()
357 |
358 | except exceptions.NoSuchElementException:
359 | self.fall_back()
360 | self.request_paper(doc_link)
361 | self.click_kw_section()
362 |
363 | except:
364 | continue
365 |
366 | try:
367 | abstract = self.get_abstract_text()
368 | kws = self.get_keywords()
369 |
370 | except:
371 | abstract = np.NAN
372 | kws = np.NAN
373 |
374 | self.link_object[p]["abs"] = abstract
375 |                 self.link_object[p]["kws"] = kws
376 |
377 | # dump updated link object to json
378 | with open('./ieee_temp.json', 'w') as file:
379 | json.dump(self.link_object, file)
380 |
381 | # close driver
382 | self.close_driver()
383 |
384 | def to_json(self, path) -> None:
385 | """
386 | dump results into json
387 |
388 | Parameters
389 | ----------
390 | path: str
391 | string path for save results (link and additional details)
392 |
393 | Returns
394 | -------
395 |
396 | """
397 | if os.path.isfile('./ieee_temp.json'):
398 | with open('./ieee_temp.json') as file:
399 | self.link_object = json.load(file)
400 |
401 | os.remove('./ieee_temp.json')
402 |
403 | with open(path, 'w') as file:
404 | json.dump(self.link_object, file)
405 |
--------------------------------------------------------------------------------
/src/scidirect.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import json
4 | import numpy as np
5 | import undetected_chromedriver
6 | from selenium import webdriver
7 | from selenium.webdriver.common.by import By
8 | from selenium_stealth import stealth
9 | from src.utils import *
10 |
11 |
12 | class ScienceDirect:
13 | """
14 | Parameters
15 | ----------
16 | start: int
17 | start year of the date range filter
18 |
19 | end: int
20 | end year of the date range filter
21 |
22 | search_terms: str
23 |         string of search terms (it can be a comma-separated or semicolon-separated
24 |         string)
25 |
26 | Attributes
27 | ----------
28 | driver: undetected_chromedriver.Chrome
29 | web driver for selenium
30 |
31 | page_count: int
32 | number of pages in search results
33 |
34 | links_to_paper: dict
35 | mined links and additional details for results
36 |
37 | origin: str
38 | origin of science direct advanced search url
39 |
40 | date_filter: str
41 | date range to filter search results
42 |
43 | results_in_a_page: str
44 |         number of records to show in a single page
45 |
46 | offset: str
47 |         number of records to skip forward to reach the next page
48 |         of search results
49 |
50 | query_text: str
51 | encoded search query string to apply in URL
52 |
53 | article_type: str
54 | science direct article type category indicator
55 |
56 | Methods
57 | -------
58 | encode_search_terms_into_query:
59 | encode user given search terms into URL string
60 |
61 | construct_full_link:
62 | create full link to make request from server
63 |
64 | init_driver:
65 | initiate web driver and session
66 |
67 | close_driver:
68 | close web driver and session
69 |
70 | post_request:
71 | post a request to science direct server
72 |
73 | check_for_multiple_pages:
74 |         check whether the search results contain multiple pages
75 |         of results
76 |
77 | mine_links:
78 | get links to each search result (for each individual paper)
79 |
80 | get_links_to_papers:
81 | create paper link list
82 |
83 | to_json:
84 | dump results into json
85 |
86 | """
87 |
88 | options = webdriver.ChromeOptions()
89 | config = read_json('./config.json')
90 |
91 | options.add_argument("--headless")
92 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
93 | options.add_experimental_option('useAutomationExtension', False)
94 | options.add_argument("--disable-blink-features=AutomationControlled")
95 | options.binary_location = config['BINARY_LOCATION']
96 |
97 | def __init__(self, start: int, end: int, search_terms: str):
98 | self.driver = None
99 | self.page_count = None
100 | self.links_to_paper = {}
101 | self.origin = "https://www.sciencedirect.com/search"
102 | self.date_filter = f"?date={start}-{end}"
103 | self.results_in_a_page = "&show=100"
104 | self.offset = "&offset=0"
105 | self.query_text = self.encode_search_terms_into_query(search_terms)
106 | self.article_type = "&articleTypes=FLA"
107 |
108 | @staticmethod
109 | def encode_search_terms_into_query(keywords: str) -> str:
110 | """
111 | encode user given search terms into URL string
112 |
113 | Parameters
114 | ----------
115 | keywords: str
116 | search terms to create search query
117 |
118 | Returns
119 | -------
120 |
121 | """
122 | encode = keywords.replace(' ', "%20")
123 | encode = encode.replace(';', "%3B")
124 | encode = encode.replace(',', "%2C")
125 |
126 | return f"&qs={encode}"
127 |
128 | def construct_full_link(self) -> str:
129 | """
130 | create full link to make request from server
131 |
132 | Returns
133 | -------
134 |
135 | """
136 | return ''.join([self.origin,
137 | self.date_filter,
138 | self.query_text,
139 | self.results_in_a_page,
140 | self.offset,
141 | self.article_type])
142 |
143 | def init_driver(self) -> None:
144 | """
145 | initiate web driver and session
146 |
147 | Returns
148 | -------
149 |
150 | """
151 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options,
152 | executable_path=self.config['EXECUTABLE_PATH'])
153 | clean_cookies_and_caches(self.driver)
154 |
155 | def close_driver(self) -> None:
156 | """
157 | close web driver and session
158 |
159 | Returns
160 | -------
161 |
162 | """
163 | self.driver.close()
164 |
165 | def post_request(self, link: str) -> None:
166 | """
167 | post a request to science direct server
168 |
169 | Parameters
170 | ----------
171 | link: str
172 | URL to make request on
173 |
174 | Returns
175 | -------
176 |
177 | """
178 | stealth(self.driver,
179 | languages=["en-US", "en"],
180 | vendor="Google Inc.",
181 | platform="Win32",
182 | webgl_vendor="Intel Inc.",
183 | renderer="Intel Iris OpenGL Engine",
184 | fix_hairline=True,
185 | )
186 | # make request
187 | self.driver.delete_all_cookies()
188 | self.driver.get(link)
189 | time.sleep(abs(np.random.normal(2, 0.4)))
190 |
191 | def check_for_multiple_pages(self) -> bool:
192 | """
193 |         check whether the search results contain multiple pages
194 |         of results
195 |
196 | Returns
197 | -------
198 |
199 | """
200 | link = self.construct_full_link()
201 | self.init_driver()
202 | self.post_request(link)
203 |
204 | tot_results = int(self.driver.find_element(By.CLASS_NAME,
205 | value="search-body-results-text").text.split(' ')[0])
206 | self.page_count = int(np.round(tot_results / 100))
207 |
208 | self.close_driver()
209 |
210 | return True if self.page_count > 1 else False
211 |
212 | def mine_links(self) -> None:
213 | """
214 | get links to each search result (for each individual paper)
215 |
216 | Returns
217 | -------
218 |
219 | """
220 | for title, article in zip(self.driver.find_elements(By.CLASS_NAME, value="result-list-title-link"),
221 | self.driver.find_elements(By.CLASS_NAME, value="article-type")):
222 | self.links_to_paper[title.get_attribute('id')] = {"title": title.text,
223 | "link": title.get_attribute('href'),
224 | "type_": article.text}
225 |
226 | time.sleep(abs(np.random.uniform(2, 4)))
227 |
228 | def get_links_to_papers(self) -> None:
229 | """
230 | create paper link list
231 |
232 | Returns
233 | -------
234 |
235 | """
236 | if self.check_for_multiple_pages():
237 | for i in range(self.page_count):
238 | self.offset = f"&offset={100 * i}"
239 | self.init_driver()
240 | self.post_request(self.construct_full_link())
241 | self.mine_links()
242 |
243 | print(f'reading page: {i + 1} from {self.page_count}', end='\r')
244 |
245 | self.close_driver()
246 |
247 | else:
248 | self.init_driver()
249 | self.post_request(self.construct_full_link())
250 | self.mine_links()
251 | self.close_driver()
252 |
253 | def to_json(self, path: str) -> None:
254 | """
255 | dump results into json
256 |
257 | Parameters
258 | ----------
259 | path: str
260 | string path for save results (link and additional details)
261 |
262 | Returns
263 | -------
264 |
265 | """
266 | with open(path, 'w') as file:
267 | json.dump(self.links_to_paper, file)
268 |
269 |
270 | class Paper:
271 | options = webdriver.ChromeOptions()
272 | config = read_json('./config.json')
273 |
274 | options.add_argument("--headless")
275 | options.add_experimental_option("excludeSwitches", ["enable-automation"])
276 | options.add_experimental_option('useAutomationExtension', False)
277 | options.add_argument("--disable-blink-features=AutomationControlled")
278 | options.binary_location = config['BINARY_LOCATION']
279 |
280 | def __init__(self, file_name):
281 | self.driver = None
282 | self.destination = file_name
283 |
284 | with open(file_name, "r") as file:
285 | self.link_object = json.load(file)
286 |
287 | def init_driver(self) -> None:
288 | """
289 | initiate web driver and session
290 |
291 | Returns
292 | -------
293 |
294 | """
295 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options,
296 | executable_path=self.config['EXECUTABLE_PATH'])
297 | clean_cookies_and_caches(self.driver)
298 |
299 | def close_driver(self) -> None:
300 | """
301 | close web driver and session
302 |
303 | Returns
304 | -------
305 |
306 | """
307 | self.driver.close()
308 |
309 | def request_paper(self, page_link) -> None:
310 | """
311 | post a request to science direct server
312 |
313 | Parameters
314 | ----------
315 | page_link: str
316 | URL to make request on
317 |
318 | Returns
319 | -------
320 |
321 | """
322 | stealth(self.driver,
323 | languages=["en-US", "en"],
324 | vendor="Google Inc.",
325 | platform="Win32",
326 | webgl_vendor="Intel Inc.",
327 | renderer="Intel Iris OpenGL Engine",
328 | fix_hairline=True,
329 | )
330 |
331 | URL = page_link
332 |
333 | # make request
334 | self.driver.delete_all_cookies()
335 | self.driver.get(URL)
336 |
337 | time.sleep(abs(np.random.normal(1, 0.4)))
338 |
339 | def get_abstract_text(self) -> str:
340 | """
341 | get abstract from each publication
342 |
343 | Returns
344 | -------
345 | abstract: str
346 |
347 | """
348 | return self.driver.find_element(By.CLASS_NAME, 'abstract').text.replace('Abstract:\n', '')
349 |
350 | # def click_kw_section(self) -> None:
351 | # self.driver.execute_script("arguments[0].scrollIntoView();",
352 | # self.driver.find_element(By.ID, 'keywords'))
353 | # WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, 'keywords'))).click()
354 | # time.sleep(1)
355 | #
356 | # def get_keywords(self) -> list:
357 | # """
358 | # get all type of keywords in ieee xplore for the publication
359 | #
360 | # Returns
361 | # -------
362 | # list of keyword strings: list
363 | #
364 | # """
365 | # kw_types = self.driver.find_elements(By.CSS_SELECTOR,
366 | # "ul[class='doc-keywords-list stats-keywords-list']>li["
367 | # "class='doc-keywords-list-item']>ul")
368 | # return [kw.text.replace('\n', '') for kw in kw_types if kw.text != '']
369 |
370 | def update_paper_details(self) -> None:
371 | """
372 | update the detail object of the publications
373 |
374 | Returns
375 | -------
376 |
377 | """
378 | # start driver
379 | self.init_driver()
380 |
381 | for key, value in self.link_object.items():
382 | doc_link = value["link"]
383 | self.request_paper(doc_link)
384 |
385 | time.sleep(abs(np.random.normal(1, 0.4)))
386 |
387 | try:
388 | abstract = self.get_abstract_text()
389 |
390 | except:
391 | abstract = np.NAN
392 |
393 | value["abs"] = abstract
394 |
395 | # close driver
396 | self.close_driver()
397 |
398 | def batch_update_details(self, size) -> None:
399 | """
400 | update the detail object of the publications batch wise
401 |
402 | Parameters
403 | ----------
404 | size: int
405 | size of a batch
406 |
407 | Returns
408 | -------
409 |
410 | """
411 | keys = list(self.link_object.keys())
412 |
413 | for i in range(size, len(self.link_object), size):
414 | batch = keys[(i - size):i]
415 | self.init_driver()
416 |
417 | for p in batch:
418 | doc_link = self.link_object[p]["link"]
419 | self.request_paper(doc_link)
420 |
421 | try:
422 | abstract = self.get_abstract_text()
423 |
424 | except:
425 | abstract = np.NAN
426 |
427 | self.link_object[p]["abs"] = abstract
428 |
429 | # dump updated link object to json
430 | with open('./sci_temp.json', 'w') as file:
431 | json.dump(self.link_object, file)
432 |
433 | # close driver
434 | self.close_driver()
435 |
436 | def to_json(self, path) -> None:
437 | """
438 | dump results into json
439 |
440 | Parameters
441 | ----------
442 | path: str
443 | string path for save results (link and additional details)
444 |
445 | Returns
446 | -------
447 |
448 | """
449 | if os.path.isfile('./sci_temp.json'):
450 | with open('./sci_temp.json') as file:
451 | self.link_object = json.load(file)
452 |
453 | os.remove('./sci_temp.json')
454 |
455 | with open(path, 'w') as file:
456 | json.dump(self.link_object, file)
457 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pandas as pd
3 |
4 | from selenium.webdriver.support.ui import WebDriverWait
5 |
6 |
7 | def clean_cookies_and_caches(driver):
8 |     # nothing to clean if the driver was never initialised
9 |     if driver is None:
10 |         return
11 |
12 |     # remove all cookies from the current session
13 |     driver.delete_all_cookies()
14 |
15 |     # clear local storage and session storage as well
16 |     driver.execute_script('window.localStorage.clear()')
17 |     driver.execute_script('window.sessionStorage.clear()')
18 |
19 |
20 | def read_json(file_path):
21 | with open(file_path, "r") as f:
22 | return json.load(f)
23 |
24 |
25 | def to_excel(sheets: dict, path: str = './SLR.xlsx'):
26 |     dfs = {key: pd.read_json(filename) for key, filename in sheets.items()}
27 |
28 |     with pd.ExcelWriter(path) as writer:
29 | for sheet, df in dfs.items():
30 | df.T.to_excel(writer, sheet_name=sheet)
31 |
32 |
33 | def validate(obj: dict):
34 | if obj == {}:
35 | raise ConfigurationError()
36 |
37 | scrappers = {'IEEE', 'ACM', 'SCIDIR'}.intersection(set(obj.keys()))
38 | if not obj.get('BINARY_LOCATION', False):
39 | raise ConfigurationError()
40 |
41 | if not obj.get('EXECUTABLE_PATH', False):
42 | raise ConfigurationError()
43 |
44 | assert len(scrappers) != 0
45 |
46 | print(f"detected scrappers: {scrappers}")
47 | print('=' * 25)
48 |
49 | validate_scrapper_keys(obj, scrappers)
50 |
51 | return True
52 |
53 |
54 | def validate_scrapper_keys(obj: dict, detected: set):
55 | expected_keys = ['search_term', 'link_file_save_to',
56 | 'abs_file_save_to', 'use_batches',
57 | 'batch_size', 'keep_link_file']
58 | for s in detected:
59 |         if set(obj[s].keys()) != set(expected_keys):
60 | raise ConfigurationError(expected_keys)
61 |
62 |
63 | class ConfigurationError(Exception):
64 |     """
65 |     raised when the scrapper configuration is missing
66 |     an expected key or keys
67 |     """
68 |
69 |     def __init__(self, exp_keys=None):
70 |         self.exp_keys = exp_keys
71 |
72 |     def __repr__(self):
73 |         return f"one or more keys missing from: {self.exp_keys}"
74 |
75 |
76 | # TODO: complete this class
77 | # class GetSummery:
78 | # config = read_json('./config.json')
79 | # openai.api_key = config['API_KEY']
80 | #
81 | # def __init__(self):
82 | # self.text_generator = None
83 | # self.paper_dtls = None
84 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/tests/__init__.py
--------------------------------------------------------------------------------
/tests/bad.json:
--------------------------------------------------------------------------------
1 | {
2 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
3 | "IEEE": {
4 | "search_term": "",
5 | "link_file_save_to": "./temp/ieee_search_term.json",
6 | "abs_file_save_to": "./abs/ieee_search_term.json",
7 | "use_batches": true,
8 | "batch_size": 8,
9 | "keep_link_file": true
10 | },
11 | "ACM": {
12 | "search_term": "",
13 | "link_file_save_to": "./temp/acm_search_term.json",
14 | "abs_file_save_to": "./abs/acm_search_term.json",
15 | "use_batches": true,
16 | "batch_size": 8,
17 | "keep_link_file": true
18 | },
19 | "SCIDIR": {}
20 | }
--------------------------------------------------------------------------------
/tests/empty.json:
--------------------------------------------------------------------------------
1 | {}
--------------------------------------------------------------------------------
/tests/validate_config.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from src.utils import *
4 | from pytest import raises
5 |
6 |
7 | def test_is_config_exists(file_name='x.json'):
8 | with raises(FileNotFoundError):
9 | read_json(file_name)
10 |
11 |
12 | def test_whether_config_empty(file_name='tests/empty.json'):
13 | config = read_json(file_name)
14 |
15 | with raises(ConfigurationError):
16 | validate(config)
17 |
18 |
19 | def test_config_file():
20 | config = read_json('config.json')
21 | validate(config)
22 |
23 |
24 | def test_able_to_identify_bad_config():
25 | config = read_json('tests/bad.json')
26 |
27 | with raises(ConfigurationError):
28 | validate(config)
29 |
30 | with raises(ConfigurationError):
31 | obj = {
32 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
33 | "EXECUTABLE_PATH": "D:\\chromedriver.exe",
34 | "ACM": {
35 | "search_term": "",
36 | "link_file_save_to": "./temp/acm_search_term.json",
37 | "abs_file_save_to": "./abs/acm_search_term.json",
38 | "use_batches": True,
39 | "batch_size": 8,
40 | "keep_link_file": True
41 | },
42 | "SCIDIR": {
43 | }
44 | }
45 |
46 | validate(obj)
47 |
--------------------------------------------------------------------------------