├── .github └── workflows │ └── python-app.yml ├── .gitignore ├── .idea ├── .gitignore ├── Literature Review.iml ├── git_toolbox_prj.xml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── other.xml └── vcs.xml ├── README.md ├── config.json ├── demo ├── acm adv search string.jpg ├── ieee adv search string.jpg ├── science direct adv search string.jpg └── science direct adv search.jpg ├── main.py ├── requirements.txt ├── src ├── __init__.py ├── acm.py ├── ieee.py ├── scidirect.py └── utils.py └── tests ├── __init__.py ├── bad.json ├── empty.json └── validate_config.py /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | pip install pytest pytest-cov 40 | pytest ./tests/validate_config.py 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project exclude paths 2 | /data/ -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/Literature Review.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 19 | -------------------------------------------------------------------------------- /.idea/git_toolbox_prj.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 14 | 15 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 85 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LiteratureReview 2 | 3 | A scraper for various science databases; supported databases are IEEE Xplore, Science Direct and 4 | ACM. These scraping bots retrieve a link to each search result (i.e. each paper), its title and some 5 | other metadata such as keywords, abstract and type of paper (conference, journal etc.), which 6 | helps make the systematic literature review process easier. 7 | 8 | _*If you find this work useful, put a star on this repo ⭐*_ 9 | 10 | # Prerequisites 11 | 12 | - python 3.9 or higher 13 | - Chrome browser 14 | - Chrome web driver that matches your Chrome version. Download it from [here](https://chromedriver.chromium.org/downloads/) 15 | 16 | # How to use 17 | 18 | 1) go to the official site (advanced search page), create a search query using their form, 
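   For reference, the query text is simply the string that the advanced search form generates. The examples below are illustrative, modeled on the search terms in this repository's `config.json`; use whatever fields and keywords suit your own review (the screenshots below show the corresponding advanced search forms):

   ```text
   ACM:            AllField:(video processing) AND Title:(sign language detection) AND AllField:(sign language)
   IEEE Xplore:    video processing, sign language detection
   Science Direct: video processing; sign language detection
   ```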

   Science Direct (screenshot: demo/science direct adv search string.jpg)

   IEEE Xplore (screenshot: demo/ieee adv search string.jpg)

   ACM (screenshot: demo/acm adv search string.jpg)
26 | 2) copy that query text and use it to configure the tool 27 | 3) clone the repo (creating a virtual environment is the recommended way) and complete the configuration; 28 | you can configure a single bot, or all the bots at once, in one configuration file. 29 | 30 | ```shell 31 | git clone https://github.com/ashen007/LiteratureReview.git 32 | ``` 33 | - all bots with a single configuration 34 | 35 | ```json 36 | { 37 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 38 | "EXECUTABLE_PATH": "D:\\chromedriver.exe", 39 | "SCIDIR": { 40 | "search_term": "insert query string here", 41 | "link_file_save_to": "./temp/scidir_search_term.json", 42 | "abs_file_save_to": "./abs/scidir_search_term.json", 43 | "use_batches": true, 44 | "batch_size": 8, 45 | "keep_link_file": true 46 | }, 47 | "ACM": { 48 | "search_term": "insert query string here", 49 | "link_file_save_to": "./temp/acm_search_term.json", 50 | "abs_file_save_to": "./abs/acm_search_term.json", 51 | "use_batches": true, 52 | "batch_size": 8, 53 | "keep_link_file": true 54 | }, 55 | "IEEE": { 56 | "search_term": "insert query string here", 57 | "link_file_save_to": "./temp/ieee_search_term.json", 58 | "abs_file_save_to": "./abs/ieee_search_term.json", 59 | "use_batches": false, 60 | "batch_size": 8, 61 | "keep_link_file": true 62 | } 63 | } 64 | ``` 65 | 66 | - or use just one bot 67 | 68 | ```json 69 | { 70 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 71 | "EXECUTABLE_PATH": "D:\\chromedriver.exe", 72 | "SCIDIR": { 73 | "search_term": "insert query string here", 74 | "link_file_save_to": "./temp/scidir_search_term.json", 75 | "abs_file_save_to": "./abs/scidir_search_term.json", 76 | "use_batches": true, 77 | "batch_size": 8, 78 | "keep_link_file": true 79 | } 80 | } 81 | ``` 82 | 83 | - config `BINARY_LOCATION` 84 | the path to your chrome.exe file 85 | 86 | - config `EXECUTABLE_PATH` 87 | the path where you downloaded and extracted the Chrome web driver 88 | 89 | 4) install the dependencies and run main.py 90 | 91 | ```shell 92 | pip install -r ./requirements.txt 93 | ``` 94 | 95 | ```shell 96 | python main.py 97 | 98 | ``` 99 | 100 | 5) that's it 101 | 6) save the results into an Excel workbook; they are automatically saved into the `./SLR.xlsx` file. 
102 | ```python 103 | from src.utils import to_excel 104 | to_excel({"acm":'./abs/acm_search_term.json', "ieee": './abs/ieee_search_term.json', "science_direct": './abs/scidir_search_term.json'}) 105 | ``` 106 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 3 | "EXECUTABLE_PATH": "D:\\ML & DL\\chrome-win64\\chrome.exe", 4 | "ACM": { 5 | "search_term": "AllField:(video processing) AND Title:(sign language detection) AND AllField:(dumb and deff) AND AllField:(sign language)", 6 | "link_file_save_to": "./temp/acm_search_term_chris.json", 7 | "abs_file_save_to": "./abs/acm_search_term_chris.json", 8 | "use_batches": true, 9 | "batch_size": 8, 10 | "keep_link_file": true 11 | }, 12 | "IEEE": { 13 | "search_term": "video processing, sign language detection, dumb", 14 | "link_file_save_to": "./temp/ieee_search_term_chris.json", 15 | "abs_file_save_to": "./abs/ieee_search_term_chris.json", 16 | "use_batches": false, 17 | "batch_size": 8, 18 | "keep_link_file": true 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /demo/acm adv search string.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/acm adv search string.jpg -------------------------------------------------------------------------------- /demo/ieee adv search string.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/ieee adv search string.jpg -------------------------------------------------------------------------------- /demo/science direct adv search string.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/science direct adv search string.jpg -------------------------------------------------------------------------------- /demo/science direct adv search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/demo/science direct adv search.jpg -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from datetime import datetime 4 | from src.scidirect import ScienceDirect, Paper as SDP 5 | from src.acm import ACM, Paper as ACMP 6 | from src.ieee import IEEE, Paper as IXP 7 | from src.utils import * 8 | 9 | if __name__ == "__main__": 10 | config = read_json("./config.json") 11 | assert validate(config) 12 | 13 | if not os.path.isdir('temp'): 14 | os.mkdir('temp') 15 | 16 | if not os.path.isdir('abs'): 17 | os.mkdir('abs') 18 | 19 | scrappers = {'IEEE', 'ACM', 'SCIDIR'}.intersection(set(config.keys())) 20 | 21 | for s in scrappers: 22 | if s == 'IEEE': 23 | # get links to individual search results 24 | ieee = IEEE(config['IEEE']['search_term']) 25 | ieee.get_links_to_papers() 26 | 27 | # dump links 28 | if config['IEEE']['keep_link_file']: 29 | ieee.to_json(config['IEEE']['link_file_save_to']) 
30 | 31 | # get abstract of the and every search results 32 | ieee_paper = IXP(config['IEEE']['link_file_save_to']) 33 | 34 | if config['IEEE']['use_batches']: 35 | ieee_paper.batch_update_details(config['IEEE']['batch_size']) 36 | 37 | else: 38 | ieee_paper.update_paper_details() 39 | 40 | ieee_paper.to_json(config['IEEE']['abs_file_save_to']) 41 | 42 | if not config['IEEE']['keep_link_file']: 43 | os.remove(config['IEEE']['link_file_save_to']) 44 | 45 | elif s == 'ACM': 46 | # get links to individual search results 47 | current_year = datetime.now().year 48 | acm = ACM((current_year - 5), current_year, config['ACM']['search_term']) 49 | acm.get_links_to_papers() 50 | 51 | # dump links 52 | if config['ACM']['keep_link_file']: 53 | acm.to_json(config['ACM']['link_file_save_to']) 54 | 55 | # get abstract of the and every search results 56 | acm_paper = ACMP(config['ACM']['link_file_save_to']) 57 | 58 | if config['ACM']['use_batches']: 59 | acm_paper.batch_update_details(config['ACM']['batch_size']) 60 | 61 | else: 62 | acm_paper.update_paper_details() 63 | 64 | acm_paper.to_json(config['ACM']['abs_file_save_to']) 65 | 66 | if not config['ACM']['keep_link_file']: 67 | os.remove(config['ACM']['link_file_save_to']) 68 | 69 | elif s == 'SCIDIR': 70 | # get links to individual search results 71 | current_year = datetime.now().year 72 | sd = ScienceDirect((current_year - 5), current_year, config['SCIDIR']['search_term']) 73 | 74 | if sd.driver is not None: 75 | sd.driver.delete_all_cookies() 76 | 77 | sd.get_links_to_papers() 78 | 79 | # dump links 80 | if config['SCIDIR']['keep_link_file']: 81 | sd.to_json(config['SCIDIR']['link_file_save_to']) 82 | 83 | # get abstract of the and every search results 84 | sd_paper = SDP(config['SCIDIR']['link_file_save_to']) 85 | 86 | if sd_paper.driver is not None: 87 | sd_paper.driver.delete_all_cookies() 88 | 89 | if config['SCIDIR']['use_batches']: 90 | sd_paper.batch_update_details(config['SCIDIR']['batch_size']) 91 | 92 | else: 93 | sd_paper.update_paper_details() 94 | 95 | sd_paper.to_json(config['SCIDIR']['abs_file_save_to']) 96 | 97 | if not config['SCIDIR']['keep_link_file']: 98 | os.remove(config['SCIDIR']['link_file_save_to']) 99 | 100 | else: 101 | raise ConfigurationError(f"wrong scrapper {s}.") 102 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests~=2.28.1 2 | numpy~=1.22.4 3 | selenium~=4.3.0 4 | undetected-chromedriver~=3.4.7 5 | selenium-stealth~=1.0.6 6 | pytest~=7.3.1 7 | PyYAML~=6.0 8 | pandas~=2.0.1 9 | openpyxl~=3.1.2 -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/src/__init__.py -------------------------------------------------------------------------------- /src/acm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import numpy as np 5 | import undetected_chromedriver 6 | from selenium import webdriver 7 | from selenium.webdriver.common.by import By 8 | from selenium_stealth import stealth 9 | from src.utils import * 10 | 11 | 12 | class ACM: 13 | """ 14 | Parameters 15 | ---------- 16 | start: int 17 | start year of the date range filter 18 | 19 | end: int 20 | end year of the date 
range filter 21 | 22 | search_terms: str 23 | string of search terms (it can be comma seperated or semicolon 24 | seperated string) 25 | 26 | Attributes 27 | ---------- 28 | driver: undetected_chromedriver.Chrome 29 | web driver for selenium 30 | 31 | page_count: int 32 | number of pages in search results 33 | 34 | links_to_paper: dict 35 | mined links and additional details for results 36 | 37 | origin: str 38 | origin of science direct advanced search url 39 | 40 | date_filter: str 41 | date range to filter search results 42 | 43 | results_in_a_page: str 44 | number of records should show tin single page 45 | 46 | start_page: str 47 | where is the starting location in page numbering 48 | 49 | query_text: str 50 | encoded search query string to apply in URL 51 | 52 | Methods 53 | ------- 54 | encode_search_terms_into_query: 55 | encode user given search terms into URL string 56 | 57 | construct_full_link: 58 | create full link to make request from server 59 | 60 | create_query_text: 61 | create encoded query text to insert in URL 62 | 63 | init_driver: 64 | initiate web driver and session 65 | 66 | close_driver: 67 | close web driver and session 68 | 69 | post_request: 70 | post a request to science direct server 71 | 72 | check_for_multiple_pages: 73 | check weather search results contains multiple pages 74 | in results 75 | 76 | mine_links: 77 | get links to each search result (for each individual paper) 78 | 79 | get_links_to_papers: 80 | create paper link list 81 | 82 | to_json: 83 | dump results into json 84 | 85 | """ 86 | options = webdriver.ChromeOptions() 87 | config = read_json('./config.json') 88 | 89 | options.add_argument("--headless") 90 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 91 | options.add_experimental_option('useAutomationExtension', False) 92 | options.add_argument("--disable-blink-features=AutomationControlled") 93 | options.binary_location = config['BINARY_LOCATION'] 94 | 95 | def __init__(self, 96 | start, 97 | end, 98 | search_terms): 99 | self.driver = None 100 | self.page_count = None 101 | self.links_to_paper = {} 102 | self.search_terms = search_terms 103 | self.origin = "https://dl.acm.org/action/doSearch?" 
104 | self.quick_search = "fillQuickSearch=false" 105 | self.target = "&target=advanced&expand=dl" 106 | self.date_filter = f"&AfterYear={start}&BeforeYear={end}" 107 | self.query_text = self.create_query_text() 108 | self.start_page = "&startPage=0" 109 | self.results_in_a_page = "&pageSize=50" 110 | 111 | @staticmethod 112 | def encode_search_terms_into_query(keywords: str) -> str: 113 | """ 114 | encode user given search terms into URL string 115 | 116 | Parameters 117 | ---------- 118 | keywords: str 119 | search terms to create search query 120 | 121 | Returns 122 | ------- 123 | 124 | """ 125 | encode = keywords.replace(' ', "+") 126 | encode = encode.replace(';', "%3B") 127 | encode = encode.replace(':', "%3A") 128 | encode = encode.replace(',', "%2C") 129 | encode = encode.replace('(', "%28") 130 | encode = encode.replace(')', "%29") 131 | 132 | return encode 133 | 134 | def create_query_text(self) -> str: 135 | """ 136 | create query text 137 | 138 | Returns 139 | ------- 140 | 141 | """ 142 | return f"&AllField={self.encode_search_terms_into_query(self.search_terms)}" 143 | 144 | def construct_full_link(self) -> str: 145 | """ 146 | create full link to make request from server 147 | 148 | Returns 149 | ------- 150 | 151 | """ 152 | return ''.join([self.origin, 153 | self.quick_search, 154 | self.target, 155 | self.date_filter, 156 | self.query_text, 157 | self.start_page, 158 | self.results_in_a_page]) 159 | 160 | def init_driver(self) -> None: 161 | """ 162 | initiate web driver and session 163 | 164 | Returns 165 | ------- 166 | 167 | """ 168 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 169 | executable_path=self.config['EXECUTABLE_PATH']) 170 | clean_cookies_and_caches(self.driver) 171 | 172 | def close_driver(self) -> None: 173 | """ 174 | close web driver and session 175 | 176 | Returns 177 | ------- 178 | 179 | """ 180 | self.driver.close() 181 | 182 | def post_request(self, link) -> None: 183 | """ 184 | post a request to science direct server 185 | 186 | Parameters 187 | ---------- 188 | link: str 189 | URL to make request on 190 | 191 | Returns 192 | ------- 193 | 194 | """ 195 | stealth(self.driver, 196 | languages=["en-US", "en"], 197 | vendor="Google Inc.", 198 | platform="Win32", 199 | webgl_vendor="Intel Inc.", 200 | renderer="Intel Iris OpenGL Engine", 201 | fix_hairline=True, 202 | ) 203 | # make request 204 | self.driver.delete_all_cookies() 205 | self.driver.get(link) 206 | time.sleep(abs(np.random.normal(2, 0.4))) 207 | 208 | def check_for_multiple_pages(self) -> bool: 209 | """ 210 | check weather search results contains multiple pages 211 | in results 212 | 213 | Returns 214 | ------- 215 | 216 | """ 217 | link = self.construct_full_link() 218 | self.init_driver() 219 | self.post_request(link) 220 | 221 | tot_results = int(self.driver.find_element(By.CLASS_NAME, 222 | value="result__count").text.split(' ')[0]) 223 | 224 | self.page_count = int(np.round(tot_results / 50)) 225 | 226 | self.close_driver() 227 | 228 | return True if self.page_count > 1 else False 229 | 230 | def mine_links(self) -> None: 231 | """ 232 | get links to each search result (for each individual paper) 233 | 234 | Returns 235 | ------- 236 | 237 | """ 238 | types = self.driver.find_elements(By.CLASS_NAME, value="issue-heading") 239 | dates = self.driver.find_elements(By.CLASS_NAME, value="bookPubDate") 240 | titles = self.driver.find_elements(By.CLASS_NAME, value="issue-item__title") 241 | links = self.driver.find_elements(By.CSS_SELECTOR, 242 | 
value="h5[class='issue-item__title']>span[class='hlFld-Title']>a") 243 | 244 | for type_, date, title, link in zip(types, dates, titles, links): 245 | self.links_to_paper[f'{link.get_attribute("href").split("/")[-1]}'] = {"type_": type_.text, 246 | "date": date.text, 247 | "title": title.text, 248 | "link": link.get_attribute('href')} 249 | 250 | time.sleep(abs(np.random.uniform(2, 4))) 251 | 252 | def get_links_to_papers(self) -> None: 253 | """ 254 | create paper link list 255 | 256 | Returns 257 | ------- 258 | 259 | """ 260 | if self.check_for_multiple_pages(): 261 | for i in range(1, (self.page_count + 1)): 262 | self.start_page = f"&startPage={i}" 263 | self.init_driver() 264 | self.post_request(self.construct_full_link()) 265 | self.mine_links() 266 | 267 | print(f'reading page: {i + 1} from {self.page_count}', end='\r') 268 | 269 | self.close_driver() 270 | 271 | else: 272 | self.init_driver() 273 | self.post_request(self.construct_full_link()) 274 | self.mine_links() 275 | self.close_driver() 276 | 277 | def to_json(self, path) -> None: 278 | """ 279 | dump results into json 280 | 281 | Parameters 282 | ---------- 283 | path: str 284 | string path for save results (link and additional details) 285 | 286 | Returns 287 | ------- 288 | 289 | """ 290 | with open(path, 'w') as file: 291 | json.dump(self.links_to_paper, file) 292 | 293 | 294 | class Paper: 295 | options = webdriver.ChromeOptions() 296 | config = read_json('./config.json') 297 | 298 | options.add_argument("--headless") 299 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 300 | options.add_experimental_option('useAutomationExtension', False) 301 | options.add_argument("--disable-blink-features=AutomationControlled") 302 | options.binary_location = config['BINARY_LOCATION'] 303 | 304 | def __init__(self, file_name): 305 | self.driver = None 306 | self.destination = file_name 307 | 308 | with open(file_name, "r") as file: 309 | self.link_object = json.load(file) 310 | 311 | def init_driver(self) -> None: 312 | """ 313 | initiate web driver and session 314 | 315 | Returns 316 | ------- 317 | 318 | """ 319 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 320 | executable_path=self.config['EXECUTABLE_PATH']) 321 | clean_cookies_and_caches(self.driver) 322 | 323 | def close_driver(self) -> None: 324 | """ 325 | close web driver and session 326 | 327 | Returns 328 | ------- 329 | 330 | """ 331 | self.driver.close() 332 | 333 | def request_paper(self, page_link) -> None: 334 | """ 335 | post a request to science direct server 336 | 337 | Parameters 338 | ---------- 339 | page_link: str 340 | URL to make request on 341 | 342 | Returns 343 | ------- 344 | 345 | """ 346 | stealth(self.driver, 347 | languages=["en-US", "en"], 348 | vendor="Google Inc.", 349 | platform="Win32", 350 | webgl_vendor="Intel Inc.", 351 | renderer="Intel Iris OpenGL Engine", 352 | fix_hairline=True, 353 | ) 354 | 355 | URL = page_link 356 | 357 | # make request 358 | self.driver.delete_all_cookies() 359 | self.driver.get(URL) 360 | 361 | time.sleep(abs(np.random.normal(1, 0.4))) 362 | 363 | def get_abstract_text(self) -> str: 364 | """ 365 | get abstract from each publication 366 | 367 | Returns 368 | ------- 369 | abstract: str 370 | 371 | """ 372 | return self.driver.find_element(By.CLASS_NAME, 'abstractInFull').text 373 | 374 | # def click_kw_section(self) -> None: 375 | # self.driver.execute_script("arguments[0].scrollIntoView();", 376 | # self.driver.find_element(By.ID, 'keywords')) 377 | # 
WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, 'keywords'))).click() 378 | # time.sleep(1) 379 | 380 | # def get_keywords(self) -> list: 381 | # """ 382 | # get all type of keywords in ieee xplore for the publication 383 | # 384 | # Returns 385 | # ------- 386 | # list of keyword strings: list 387 | # 388 | # """ 389 | # kw_types = self.driver.find_elements(By.CSS_SELECTOR, 390 | # "ul[class='doc-keywords-list stats-keywords-list']>li[" 391 | # "class='doc-keywords-list-item']>ul") 392 | # return [kw.text.replace('\n', '') for kw in kw_types if kw.text != ''] 393 | 394 | def update_paper_details(self) -> None: 395 | """ 396 | update the detail object of the publications 397 | 398 | Returns 399 | ------- 400 | 401 | """ 402 | # start driver 403 | self.init_driver() 404 | 405 | for obj in self.link_object.values(): 406 | doc_link = obj['link'] 407 | self.request_paper(doc_link) 408 | # self.click_kw_section() 409 | 410 | time.sleep(abs(np.random.normal(1, 0.4))) 411 | 412 | try: 413 | abstract = self.get_abstract_text() 414 | # kws = self.get_keywords() 415 | 416 | except: 417 | abstract = np.NAN 418 | # kws = np.NAN 419 | 420 | obj['abs'] = abstract 421 | 422 | # if kws not in value: 423 | # value.append(kws) 424 | 425 | # close driver 426 | self.close_driver() 427 | 428 | def batch_update_details(self, size) -> None: 429 | """ 430 | update the detail object of the publications batch wise 431 | 432 | Parameters 433 | ---------- 434 | size: int 435 | size of a batch 436 | 437 | Returns 438 | ------- 439 | 440 | """ 441 | keys = list(self.link_object.keys()) 442 | 443 | for i in range(size, len(self.link_object), size): 444 | batch = keys[(i - size):i] 445 | self.init_driver() 446 | 447 | for p in batch: 448 | doc_link = self.link_object[p]["link"] 449 | self.request_paper(doc_link) 450 | 451 | try: 452 | abstract = self.get_abstract_text() 453 | 454 | except: 455 | abstract = np.NAN 456 | 457 | if abstract not in list(self.link_object[p].values()): 458 | self.link_object[p]["abs"] = abstract 459 | 460 | # dump updated link object to json 461 | with open('./acm_temp.json', 'w') as file: 462 | json.dump(self.link_object, file) 463 | 464 | # close driver 465 | self.close_driver() 466 | 467 | def to_json(self, path) -> None: 468 | """ 469 | dump results into json 470 | 471 | Parameters 472 | ---------- 473 | path: str 474 | string path for save results (link and additional details) 475 | 476 | Returns 477 | ------- 478 | 479 | """ 480 | if os.path.isfile('./acm_temp.json'): 481 | with open('./acm_temp.json') as file: 482 | self.link_object = json.load(file) 483 | 484 | os.remove('./acm_temp.json') 485 | 486 | with open(path, 'w') as file: 487 | json.dump(self.link_object, file) 488 | -------------------------------------------------------------------------------- /src/ieee.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import requests 4 | import numpy as np 5 | import json 6 | import undetected_chromedriver 7 | 8 | from selenium import webdriver 9 | from selenium.common import exceptions 10 | from selenium.webdriver.common.by import By 11 | from selenium.webdriver.support.wait import WebDriverWait 12 | from selenium.webdriver.support import expected_conditions as EC 13 | from selenium_stealth import stealth 14 | from src.utils import * 15 | 16 | 17 | class IEEE: 18 | """ 19 | Parameters 20 | ---------- 21 | query: str 22 | search term for either simple search or advanced search, if for advanced 23 | 
search need to add AND, OR, NOT in between search keywords. 24 | 25 | Attributes 26 | ---------- 27 | headers: dict 28 | header to post for IEEE Xplore 29 | 30 | payload: dict 31 | additional details for filter results from request 32 | 33 | page_count: int 34 | total number of pages in the search results 35 | 36 | links_to_paper: dict 37 | mined links and additional details for results 38 | 39 | Methods 40 | ------- 41 | post_request: 42 | send request to IEEE server 43 | 44 | check_for_multiple_pages: 45 | check weather results has been divide to multiple 46 | web pages, if so update the page count. 47 | 48 | mine_links: 49 | get links for each document from search results 50 | 51 | get_links_to_papers: 52 | add all links to single object 53 | 54 | to_json: 55 | dump links to json file 56 | 57 | """ 58 | 59 | def __init__(self, query): 60 | self.headers = { 61 | "Accept": "application/json, text/plain, */*", 62 | "Origin": "https://ieeexplore.ieee.org", 63 | "Content-Type": "application/json", 64 | } 65 | self.payload = { 66 | "newsearch": True, 67 | "queryText": query, 68 | "highlight": True, 69 | "returnFacets": ["ALL"], 70 | "returnType": "SEARCH", 71 | "pageNumber": 1 72 | } 73 | self.page_count = None 74 | self.links_to_paper = {} 75 | 76 | @staticmethod 77 | def post_request(header: dict, json: dict) -> requests.Response: 78 | """ 79 | send request to IEEE server 80 | 81 | Parameters 82 | ---------- 83 | header: dict 84 | header to post for IEEE Xplore 85 | 86 | json: dict 87 | additional details for filter results from request 88 | 89 | Returns 90 | ------- 91 | 92 | """ 93 | result = requests.post("https://ieeexplore.ieee.org/rest/search", 94 | headers=header, 95 | json=json) 96 | 97 | return result 98 | 99 | def check_for_multiple_pages(self) -> bool: 100 | """ 101 | check weather results has been divide to multiple 102 | web pages, if so update the page count. 
103 | 104 | Returns 105 | ------- 106 | 107 | """ 108 | results = self.post_request(self.headers, self.payload).json() 109 | self.page_count = results['totalPages'] 110 | 111 | return True if self.page_count > 1 else False 112 | 113 | def mine_links(self) -> None: 114 | """ 115 | get links for each document from search results 116 | 117 | Returns 118 | ------- 119 | 120 | """ 121 | request = self.post_request(self.headers, self.payload) 122 | j = 1 123 | 124 | while request.status_code != 200: 125 | time.sleep(abs(np.random.normal(0.1, 2))) 126 | request = self.post_request(self.headers, self.payload) 127 | 128 | results = request.json() 129 | 130 | for record in results['records']: 131 | self.links_to_paper[record['articleNumber']] = {"title": record.get('articleTitle', None), 132 | "link": record.get('documentLink', None), 133 | "date": record.get('publicationYear', None)} 134 | 135 | def get_links_to_papers(self) -> None: 136 | """ 137 | add all links to single object 138 | 139 | Returns 140 | ------- 141 | 142 | """ 143 | if self.check_for_multiple_pages(): 144 | for i in range(1, (self.page_count + 1)): 145 | self.payload["pageNumber"] = i 146 | 147 | self.mine_links() 148 | 149 | print(f'reading page: {i} from {self.page_count}', end='\r') 150 | 151 | else: 152 | self.mine_links() 153 | 154 | def to_json(self, path: str) -> None: 155 | """ 156 | dump links to json file 157 | 158 | Parameters 159 | ---------- 160 | path: str 161 | string path for save results (link and additional details) 162 | 163 | Returns 164 | ------- 165 | 166 | """ 167 | with open(path, 'w') as file: 168 | json.dump(self.links_to_paper, file) 169 | 170 | 171 | class Paper: 172 | options = webdriver.ChromeOptions() 173 | config = read_json('./config.json') 174 | 175 | options.add_argument("--headless") 176 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 177 | options.add_experimental_option('useAutomationExtension', False) 178 | options.add_argument("--disable-blink-features=AutomationControlled") 179 | options.binary_location = config['BINARY_LOCATION'] 180 | 181 | def __init__(self, file_name): 182 | self.driver = None 183 | self.failure = [] 184 | self.destination = file_name 185 | 186 | with open(file_name, "r") as file: 187 | self.link_object = json.load(file) 188 | 189 | def init_driver(self) -> None: 190 | """ 191 | initiate a web driver and session 192 | 193 | Returns 194 | ------- 195 | 196 | """ 197 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 198 | executable_path=self.config['EXECUTABLE_PATH']) 199 | clean_cookies_and_caches(self.driver) 200 | 201 | def close_driver(self) -> None: 202 | """ 203 | close a web driver and session 204 | 205 | Returns 206 | ------- 207 | 208 | """ 209 | self.driver.close() 210 | 211 | def request_paper(self, page_link) -> None: 212 | """ 213 | post a request to science direct server 214 | 215 | Parameters 216 | ---------- 217 | page_link: str 218 | URL to make request on 219 | 220 | Returns 221 | ------- 222 | 223 | """ 224 | stealth(self.driver, 225 | languages=["en-US", "en"], 226 | vendor="Google Inc.", 227 | platform="Win32", 228 | webgl_vendor="Intel Inc.", 229 | renderer="Intel Iris OpenGL Engine", 230 | fix_hairline=True, 231 | ) 232 | 233 | URL = f"https://ieeexplore.ieee.org{page_link}" 234 | 235 | # make request 236 | self.driver.delete_all_cookies() 237 | 238 | try: 239 | self.driver.get(URL) 240 | 241 | except: 242 | self.fall_back() 243 | self.driver.get(URL) 244 | 245 | 
time.sleep(abs(np.random.normal(1, 0.4))) 246 | 247 | def fall_back(self): 248 | """ 249 | recover while errors happens when requesting page data 250 | 251 | Returns 252 | ------- 253 | 254 | """ 255 | self.close_driver() 256 | time.sleep(1) 257 | self.init_driver() 258 | 259 | def get_abstract_text(self) -> str: 260 | """ 261 | get abstract from each publication 262 | 263 | Returns 264 | ------- 265 | abstract: str 266 | 267 | """ 268 | return self.driver.find_element(By.CLASS_NAME, 'abstract-text').text.replace('Abstract:\n', '') 269 | 270 | def click_kw_section(self) -> None: 271 | self.driver.execute_script("arguments[0].scrollIntoView();", 272 | self.driver.find_element(By.ID, 'keywords')) 273 | WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, 'keywords'))).click() 274 | time.sleep(1) 275 | 276 | def get_keywords(self) -> list: 277 | """ 278 | get all type of keywords in ieee xplore for the publication 279 | 280 | Returns 281 | ------- 282 | list of keyword strings: list 283 | 284 | """ 285 | kw_types = self.driver.find_elements(By.CSS_SELECTOR, 286 | "ul[class='doc-keywords-list stats-keywords-list']>li[" 287 | "class='doc-keywords-list-item']>ul") 288 | return [kw.text.replace('\n', '') for kw in kw_types if kw.text != ''] 289 | 290 | def update_paper_details(self) -> None: 291 | """ 292 | update the detail object of the publications 293 | 294 | Returns 295 | ------- 296 | 297 | """ 298 | # start driver 299 | self.init_driver() 300 | 301 | for key, value in self.link_object.items(): 302 | doc_link = value["link"] 303 | 304 | try: 305 | self.request_paper(doc_link) 306 | self.click_kw_section() 307 | 308 | except exceptions.NoSuchElementException: 309 | self.fall_back() 310 | self.request_paper(doc_link) 311 | self.click_kw_section() 312 | 313 | except: 314 | continue 315 | 316 | time.sleep(abs(np.random.normal(1, 0.4))) 317 | 318 | try: 319 | abstract = self.get_abstract_text() 320 | kws = self.get_keywords() 321 | 322 | except: 323 | abstract = np.NAN 324 | kws = np.NAN 325 | 326 | value["abs"] = abstract 327 | value["kws"] = kws 328 | 329 | # close driver 330 | self.close_driver() 331 | 332 | def batch_update_details(self, size) -> None: 333 | """ 334 | update the detail object of the publications batch wise 335 | 336 | Parameters 337 | ---------- 338 | size: int 339 | size of a batch 340 | 341 | Returns 342 | ------- 343 | 344 | """ 345 | keys = list(self.link_object.keys()) 346 | 347 | for i in range(size, len(self.link_object), size): 348 | batch = keys[(i - size):i] 349 | self.init_driver() 350 | 351 | for p in batch: 352 | doc_link = self.link_object[p]["link"] 353 | 354 | try: 355 | self.request_paper(doc_link) 356 | self.click_kw_section() 357 | 358 | except exceptions.NoSuchElementException: 359 | self.fall_back() 360 | self.request_paper(doc_link) 361 | self.click_kw_section() 362 | 363 | except: 364 | continue 365 | 366 | try: 367 | abstract = self.get_abstract_text() 368 | kws = self.get_keywords() 369 | 370 | except: 371 | abstract = np.NAN 372 | kws = np.NAN 373 | 374 | self.link_object[p]["abs"] = abstract 375 | self.link_object[p]["abs"] = kws 376 | 377 | # dump updated link object to json 378 | with open('./ieee_temp.json', 'w') as file: 379 | json.dump(self.link_object, file) 380 | 381 | # close driver 382 | self.close_driver() 383 | 384 | def to_json(self, path) -> None: 385 | """ 386 | dump results into json 387 | 388 | Parameters 389 | ---------- 390 | path: str 391 | string path for save results (link and additional details) 392 | 393 | 
Returns 394 | ------- 395 | 396 | """ 397 | if os.path.isfile('./ieee_temp.json'): 398 | with open('./ieee_temp.json') as file: 399 | self.link_object = json.load(file) 400 | 401 | os.remove('./ieee_temp.json') 402 | 403 | with open(path, 'w') as file: 404 | json.dump(self.link_object, file) 405 | -------------------------------------------------------------------------------- /src/scidirect.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import numpy as np 5 | import undetected_chromedriver 6 | from selenium import webdriver 7 | from selenium.webdriver.common.by import By 8 | from selenium_stealth import stealth 9 | from src.utils import * 10 | 11 | 12 | class ScienceDirect: 13 | """ 14 | Parameters 15 | ---------- 16 | start: int 17 | start year of the date range filter 18 | 19 | end: int 20 | end year of the date range filter 21 | 22 | search_terms: str 23 | string of search terms (it can be comma seperated or semicolon 24 | seperated string) 25 | 26 | Attributes 27 | ---------- 28 | driver: undetected_chromedriver.Chrome 29 | web driver for selenium 30 | 31 | page_count: int 32 | number of pages in search results 33 | 34 | links_to_paper: dict 35 | mined links and additional details for results 36 | 37 | origin: str 38 | origin of science direct advanced search url 39 | 40 | date_filter: str 41 | date range to filter search results 42 | 43 | results_in_a_page: str 44 | number of records should show tin single page 45 | 46 | offset: str 47 | number of records should go forward for next page 48 | in search results 49 | 50 | query_text: str 51 | encoded search query string to apply in URL 52 | 53 | article_type: str 54 | science direct article type category indicator 55 | 56 | Methods 57 | ------- 58 | encode_search_terms_into_query: 59 | encode user given search terms into URL string 60 | 61 | construct_full_link: 62 | create full link to make request from server 63 | 64 | init_driver: 65 | initiate web driver and session 66 | 67 | close_driver: 68 | close web driver and session 69 | 70 | post_request: 71 | post a request to science direct server 72 | 73 | check_for_multiple_pages: 74 | check weather search results contains multiple pages 75 | in results 76 | 77 | mine_links: 78 | get links to each search result (for each individual paper) 79 | 80 | get_links_to_papers: 81 | create paper link list 82 | 83 | to_json: 84 | dump results into json 85 | 86 | """ 87 | 88 | options = webdriver.ChromeOptions() 89 | config = read_json('./config.json') 90 | 91 | options.add_argument("--headless") 92 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 93 | options.add_experimental_option('useAutomationExtension', False) 94 | options.add_argument("--disable-blink-features=AutomationControlled") 95 | options.binary_location = config['BINARY_LOCATION'] 96 | 97 | def __init__(self, start: int, end: int, search_terms: str): 98 | self.driver = None 99 | self.page_count = None 100 | self.links_to_paper = {} 101 | self.origin = "https://www.sciencedirect.com/search" 102 | self.date_filter = f"?date={start}-{end}" 103 | self.results_in_a_page = "&show=100" 104 | self.offset = "&offset=0" 105 | self.query_text = self.encode_search_terms_into_query(search_terms) 106 | self.article_type = "&articleTypes=FLA" 107 | 108 | @staticmethod 109 | def encode_search_terms_into_query(keywords: str) -> str: 110 | """ 111 | encode user given search terms into URL string 112 | 113 | Parameters 114 | ---------- 115 
| keywords: str 116 | search terms to create search query 117 | 118 | Returns 119 | ------- 120 | 121 | """ 122 | encode = keywords.replace(' ', "%20") 123 | encode = encode.replace(';', "%3B") 124 | encode = encode.replace(',', "%2C") 125 | 126 | return f"&qs={encode}" 127 | 128 | def construct_full_link(self) -> str: 129 | """ 130 | create full link to make request from server 131 | 132 | Returns 133 | ------- 134 | 135 | """ 136 | return ''.join([self.origin, 137 | self.date_filter, 138 | self.query_text, 139 | self.results_in_a_page, 140 | self.offset, 141 | self.article_type]) 142 | 143 | def init_driver(self) -> None: 144 | """ 145 | initiate web driver and session 146 | 147 | Returns 148 | ------- 149 | 150 | """ 151 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 152 | executable_path=self.config['EXECUTABLE_PATH']) 153 | clean_cookies_and_caches(self.driver) 154 | 155 | def close_driver(self) -> None: 156 | """ 157 | close web driver and session 158 | 159 | Returns 160 | ------- 161 | 162 | """ 163 | self.driver.close() 164 | 165 | def post_request(self, link: str) -> None: 166 | """ 167 | post a request to science direct server 168 | 169 | Parameters 170 | ---------- 171 | link: str 172 | URL to make request on 173 | 174 | Returns 175 | ------- 176 | 177 | """ 178 | stealth(self.driver, 179 | languages=["en-US", "en"], 180 | vendor="Google Inc.", 181 | platform="Win32", 182 | webgl_vendor="Intel Inc.", 183 | renderer="Intel Iris OpenGL Engine", 184 | fix_hairline=True, 185 | ) 186 | # make request 187 | self.driver.delete_all_cookies() 188 | self.driver.get(link) 189 | time.sleep(abs(np.random.normal(2, 0.4))) 190 | 191 | def check_for_multiple_pages(self) -> bool: 192 | """ 193 | check weather search results contains multiple pages 194 | in results 195 | 196 | Returns 197 | ------- 198 | 199 | """ 200 | link = self.construct_full_link() 201 | self.init_driver() 202 | self.post_request(link) 203 | 204 | tot_results = int(self.driver.find_element(By.CLASS_NAME, 205 | value="search-body-results-text").text.split(' ')[0]) 206 | self.page_count = int(np.round(tot_results / 100)) 207 | 208 | self.close_driver() 209 | 210 | return True if self.page_count > 1 else False 211 | 212 | def mine_links(self) -> None: 213 | """ 214 | get links to each search result (for each individual paper) 215 | 216 | Returns 217 | ------- 218 | 219 | """ 220 | for title, article in zip(self.driver.find_elements(By.CLASS_NAME, value="result-list-title-link"), 221 | self.driver.find_elements(By.CLASS_NAME, value="article-type")): 222 | self.links_to_paper[title.get_attribute('id')] = {"title": title.text, 223 | "link": title.get_attribute('href'), 224 | "type_": article.text} 225 | 226 | time.sleep(abs(np.random.uniform(2, 4))) 227 | 228 | def get_links_to_papers(self) -> None: 229 | """ 230 | create paper link list 231 | 232 | Returns 233 | ------- 234 | 235 | """ 236 | if self.check_for_multiple_pages(): 237 | for i in range(self.page_count): 238 | self.offset = f"&offset={100 * i}" 239 | self.init_driver() 240 | self.post_request(self.construct_full_link()) 241 | self.mine_links() 242 | 243 | print(f'reading page: {i + 1} from {self.page_count}', end='\r') 244 | 245 | self.close_driver() 246 | 247 | else: 248 | self.init_driver() 249 | self.post_request(self.construct_full_link()) 250 | self.mine_links() 251 | self.close_driver() 252 | 253 | def to_json(self, path: str) -> None: 254 | """ 255 | dump results into json 256 | 257 | Parameters 258 | ---------- 259 | path: str 260 | 
string path for save results (link and additional details) 261 | 262 | Returns 263 | ------- 264 | 265 | """ 266 | with open(path, 'w') as file: 267 | json.dump(self.links_to_paper, file) 268 | 269 | 270 | class Paper: 271 | options = webdriver.ChromeOptions() 272 | config = read_json('./config.json') 273 | 274 | options.add_argument("--headless") 275 | options.add_experimental_option("excludeSwitches", ["enable-automation"]) 276 | options.add_experimental_option('useAutomationExtension', False) 277 | options.add_argument("--disable-blink-features=AutomationControlled") 278 | options.binary_location = config['BINARY_LOCATION'] 279 | 280 | def __init__(self, file_name): 281 | self.driver = None 282 | self.destination = file_name 283 | 284 | with open(file_name, "r") as file: 285 | self.link_object = json.load(file) 286 | 287 | def init_driver(self) -> None: 288 | """ 289 | initiate web driver and session 290 | 291 | Returns 292 | ------- 293 | 294 | """ 295 | self.driver = undetected_chromedriver.Chrome(chrome_options=self.options, 296 | executable_path=self.config['EXECUTABLE_PATH']) 297 | clean_cookies_and_caches(self.driver) 298 | 299 | def close_driver(self) -> None: 300 | """ 301 | close web driver and session 302 | 303 | Returns 304 | ------- 305 | 306 | """ 307 | self.driver.close() 308 | 309 | def request_paper(self, page_link) -> None: 310 | """ 311 | post a request to science direct server 312 | 313 | Parameters 314 | ---------- 315 | page_link: str 316 | URL to make request on 317 | 318 | Returns 319 | ------- 320 | 321 | """ 322 | stealth(self.driver, 323 | languages=["en-US", "en"], 324 | vendor="Google Inc.", 325 | platform="Win32", 326 | webgl_vendor="Intel Inc.", 327 | renderer="Intel Iris OpenGL Engine", 328 | fix_hairline=True, 329 | ) 330 | 331 | URL = page_link 332 | 333 | # make request 334 | self.driver.delete_all_cookies() 335 | self.driver.get(URL) 336 | 337 | time.sleep(abs(np.random.normal(1, 0.4))) 338 | 339 | def get_abstract_text(self) -> str: 340 | """ 341 | get abstract from each publication 342 | 343 | Returns 344 | ------- 345 | abstract: str 346 | 347 | """ 348 | return self.driver.find_element(By.CLASS_NAME, 'abstract').text.replace('Abstract:\n', '') 349 | 350 | # def click_kw_section(self) -> None: 351 | # self.driver.execute_script("arguments[0].scrollIntoView();", 352 | # self.driver.find_element(By.ID, 'keywords')) 353 | # WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.ID, 'keywords'))).click() 354 | # time.sleep(1) 355 | # 356 | # def get_keywords(self) -> list: 357 | # """ 358 | # get all type of keywords in ieee xplore for the publication 359 | # 360 | # Returns 361 | # ------- 362 | # list of keyword strings: list 363 | # 364 | # """ 365 | # kw_types = self.driver.find_elements(By.CSS_SELECTOR, 366 | # "ul[class='doc-keywords-list stats-keywords-list']>li[" 367 | # "class='doc-keywords-list-item']>ul") 368 | # return [kw.text.replace('\n', '') for kw in kw_types if kw.text != ''] 369 | 370 | def update_paper_details(self) -> None: 371 | """ 372 | update the detail object of the publications 373 | 374 | Returns 375 | ------- 376 | 377 | """ 378 | # start driver 379 | self.init_driver() 380 | 381 | for key, value in self.link_object.items(): 382 | doc_link = value["link"] 383 | self.request_paper(doc_link) 384 | 385 | time.sleep(abs(np.random.normal(1, 0.4))) 386 | 387 | try: 388 | abstract = self.get_abstract_text() 389 | 390 | except: 391 | abstract = np.NAN 392 | 393 | value["abs"] = abstract 394 | 395 | # close driver 396 | 
self.close_driver() 397 | 398 | def batch_update_details(self, size) -> None: 399 | """ 400 | update the detail object of the publications batch wise 401 | 402 | Parameters 403 | ---------- 404 | size: int 405 | size of a batch 406 | 407 | Returns 408 | ------- 409 | 410 | """ 411 | keys = list(self.link_object.keys()) 412 | 413 | for i in range(size, len(self.link_object), size): 414 | batch = keys[(i - size):i] 415 | self.init_driver() 416 | 417 | for p in batch: 418 | doc_link = self.link_object[p]["link"] 419 | self.request_paper(doc_link) 420 | 421 | try: 422 | abstract = self.get_abstract_text() 423 | 424 | except: 425 | abstract = np.NAN 426 | 427 | self.link_object[p]["abs"] = abstract 428 | 429 | # dump updated link object to json 430 | with open('./sci_temp.json', 'w') as file: 431 | json.dump(self.link_object, file) 432 | 433 | # close driver 434 | self.close_driver() 435 | 436 | def to_json(self, path) -> None: 437 | """ 438 | dump results into json 439 | 440 | Parameters 441 | ---------- 442 | path: str 443 | string path for save results (link and additional details) 444 | 445 | Returns 446 | ------- 447 | 448 | """ 449 | if os.path.isfile('./sci_temp.json'): 450 | with open('./sci_temp.json') as file: 451 | self.link_object = json.load(file) 452 | 453 | os.remove('./sci_temp.json') 454 | 455 | with open(path, 'w') as file: 456 | json.dump(self.link_object, file) 457 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | 6 | 7 | def clean_cookies_and_caches(driver): 8 | # first falls check 9 | if driver is not None: 10 | driver.delete_all_cookies() 11 | 12 | # step 2 13 | # method 1 14 | driver.execute_script('window.localStorage.clear()') 15 | 16 | # method 2 17 | driver.execute_script('window.sessionStorage.clear()') 18 | 19 | 20 | def read_json(file_path): 21 | with open(file_path, "r") as f: 22 | return json.load(f) 23 | 24 | 25 | def to_excel(sheets: dict): 26 | dfs = {key: pd.read_json(filename) for key, filename in sheets.items()} 27 | 28 | with pd.ExcelWriter('./SLR_chris.xlsx') as writer: 29 | for sheet, df in dfs.items(): 30 | df.T.to_excel(writer, sheet_name=sheet) 31 | 32 | 33 | def validate(obj: dict): 34 | if obj == {}: 35 | raise ConfigurationError() 36 | 37 | scrappers = {'IEEE', 'ACM', 'SCIDIR'}.intersection(set(obj.keys())) 38 | if not obj.get('BINARY_LOCATION', False): 39 | raise ConfigurationError() 40 | 41 | if not obj.get('EXECUTABLE_PATH', False): 42 | raise ConfigurationError() 43 | 44 | assert len(scrappers) != 0 45 | 46 | print(f"detected scrappers: {scrappers}") 47 | print('=' * 25) 48 | 49 | validate_scrapper_keys(obj, scrappers) 50 | 51 | return True 52 | 53 | 54 | def validate_scrapper_keys(obj: dict, detected: set): 55 | expected_keys = ['search_term', 'link_file_save_to', 56 | 'abs_file_save_to', 'use_batches', 57 | 'batch_size', 'keep_link_file'] 58 | for s in detected: 59 | if list(obj[s].keys()) != expected_keys: 60 | raise ConfigurationError(expected_keys) 61 | 62 | 63 | class ConfigurationError(Exception): 64 | """ 65 | raise when scrapper configuration misses 66 | expected key or keys 67 | """ 68 | 69 | def __int__(self, exp_keys: list): 70 | self.exp_keys = exp_keys 71 | 72 | def __repr__(self): 73 | return f"{' '.join(self.exp_keys)} one or more keys missing from those." 
74 | 75 | 76 | # TODO: complete this class 77 | # class GetSummery: 78 | # config = read_json('./config.json') 79 | # openai.api_key = config['API_KEY'] 80 | # 81 | # def __init__(self): 82 | # self.text_generator = None 83 | # self.paper_dtls = None 84 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ashen007/LiteratureReview/1bcf3d7f383a966d22c8337bc2ffd0296a78fd2f/tests/__init__.py -------------------------------------------------------------------------------- /tests/bad.json: -------------------------------------------------------------------------------- 1 | { 2 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 3 | "IEEE": { 4 | "search_term": "", 5 | "link_file_save_to": "./temp/ieee_search_term.json", 6 | "abs_file_save_to": "./abs/ieee_search_term.json", 7 | "use_batches": true, 8 | "batch_size": 8, 9 | "keep_link_file": true 10 | }, 11 | "ACM": { 12 | "search_term": "", 13 | "link_file_save_to": "./temp/acm_search_term.json", 14 | "abs_file_save_to": "./abs/acm_search_term.json", 15 | "use_batches": true, 16 | "batch_size": 8, 17 | "keep_link_file": true 18 | }, 19 | "SCIDIR": {} 20 | } -------------------------------------------------------------------------------- /tests/empty.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/validate_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from src.utils import * 4 | from pytest import raises 5 | 6 | 7 | def test_is_config_exists(file_name='x.json'): 8 | with raises(FileNotFoundError): 9 | read_json(file_name) 10 | 11 | 12 | def test_whether_config_empty(file_name='tests/empty.json'): 13 | config = read_json(file_name) 14 | 15 | with raises(ConfigurationError): 16 | validate(config) 17 | 18 | 19 | def test_config_file(): 20 | config = read_json('config.json') 21 | validate(config) 22 | 23 | 24 | def test_able_to_identify_bad_config(): 25 | config = read_json('tests/bad.json') 26 | 27 | with raises(ConfigurationError): 28 | validate(config) 29 | 30 | with raises(ConfigurationError): 31 | obj = { 32 | "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", 33 | "EXECUTABLE_PATH": "D:\\chromedriver.exe", 34 | "ACM": { 35 | "search_term": "", 36 | "link_file_save_to": "./temp/acm_search_term.json", 37 | "abs_file_save_to": "./abs/acm_search_term.json", 38 | "use_batches": True, 39 | "batch_size": 8, 40 | "keep_link_file": True 41 | }, 42 | "SCIDIR": { 43 | } 44 | } 45 | 46 | validate(obj) 47 | --------------------------------------------------------------------------------
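A closing note on what these tests exercise: `validate` in `src/utils.py` requires `BINARY_LOCATION`, `EXECUTABLE_PATH` and at least one of the `IEEE`, `ACM`, `SCIDIR` blocks, and `validate_scrapper_keys` compares each block's key list against a fixed list, so every scrapper entry must contain exactly the six keys shown in the README examples, in that order. Below is a minimal sketch (not part of the repository) of a configuration dict that passes validation; the two paths are placeholders:

```python
from src.utils import validate

# A minimal configuration object accepted by validate().
# The two paths below are placeholders; point them at your own Chrome binary
# and the chromedriver executable you downloaded.
config = {
    "BINARY_LOCATION": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
    "EXECUTABLE_PATH": "D:\\chromedriver.exe",
    "IEEE": {
        # keys must appear in exactly this order, because
        # validate_scrapper_keys() compares the whole key list at once
        "search_term": "video processing, sign language detection",
        "link_file_save_to": "./temp/ieee_search_term.json",
        "abs_file_save_to": "./abs/ieee_search_term.json",
        "use_batches": False,
        "batch_size": 8,
        "keep_link_file": True,
    },
}

assert validate(config)  # raises ConfigurationError when a required key is missing
```

The same structure applies to `config.json`; `bad.json` above fails because `EXECUTABLE_PATH` is missing and its `SCIDIR` block is empty.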