├── .gitignore ├── LICENSE ├── README.md ├── after ├── config.json ├── main.py ├── scrape │ ├── __init__.py │ ├── config.py │ ├── dir.py │ ├── export.py │ ├── fetch.py │ ├── json.py │ ├── log.py │ ├── pdf.py │ ├── scihub.py │ └── scraper.py └── test_pdf.py ├── before └── main.py ├── papers ├── Amygdala structure and the tendency to regard the social system as legitimate and desirable.pdf ├── An Ideological Asymmetry in the Diffusion of Moralized Content on Social Media Among Political Leaders.pdf ├── Anyone Can Become a Troll_ Causes of Trolling Behavior in Online Discussions.pdf ├── Association of an Educational Program in Mindful Communication With Burnout, Empathy, and Attitudes Among Primary Care Physicians.pdf └── Attentional capture helps explain why moral and emotional content go viral.docx └── words ├── bycatch.txt ├── research.txt └── target.txt /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | **/.DS_Store 3 | **/*.log -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 John Fallot and ArjanCodes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web and PDF Scraper Refactoring 2 | 3 | This repository contains the example code of the Web and PDF scraper code roast. 
Here are the links to the videos: 4 | 5 | - Part 1: https://youtu.be/MXM6VEtf8SE 6 | - Part 2: https://youtu.be/6ac4Um2Vicg 7 | -------------------------------------------------------------------------------- /after/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "export_dir": "PDN Scraper Exports", 3 | "prime_src": "211001_PDN_studies_9.csv", 4 | "url_dmnsns": "https://app.dimensions.ai/discover/publication/results.json", 5 | "research_dir": "PDN Research Papers From Scrape", 6 | "url_scihub": "https://sci-hubtw.hkvisa.net/", 7 | "paper_folder": "../papers", 8 | "research_words": "../words/research.txt", 9 | "bycatch_words": "../words/bycatch.txt", 10 | "target_words": "../words/target.txt" 11 | } -------------------------------------------------------------------------------- /after/main.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from scrape.config import read_config 4 | from scrape.export import export_data 5 | from scrape.fetch import fetch_terms_from_pdf_files 6 | from scrape.log import log_msg 7 | 8 | 9 | def main() -> None: 10 | 11 | # read the configuration settings from a JSON file 12 | config = read_config("./config.json") 13 | 14 | # fetch data from pdf files and export it 15 | start = time.perf_counter() 16 | result = fetch_terms_from_pdf_files(config) 17 | export_data(result, config.export_dir) 18 | elapsed = time.perf_counter() - start 19 | log_msg(f"\n[sciscraper]: Extraction finished in {elapsed} seconds.\n") 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /after/scrape/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/after/scrape/__init__.py -------------------------------------------------------------------------------- /after/scrape/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class ScrapeConfig: 7 | export_dir: str 8 | prime_src: str 9 | url_dmnsns: str 10 | research_dir: str 11 | url_scihub: str 12 | paper_folder: str 13 | research_words: str 14 | bycatch_words: str 15 | target_words: str 16 | 17 | 18 | def read_config(config_file: str) -> ScrapeConfig: 19 | with open(config_file) as file: 20 | data = json.load(file) 21 | return ScrapeConfig(**data) 22 | -------------------------------------------------------------------------------- /after/scrape/dir.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import contextmanager 3 | 4 | 5 | @contextmanager 6 | def change_dir(destination: str): 7 | """Sets a destination for exported files.""" 8 | cwd = os.getcwd() 9 | try: 10 | __dest = os.path.realpath(destination) 11 | if not os.path.exists(__dest): 12 | os.mkdir(__dest) 13 | os.chdir(__dest) 14 | yield 15 | finally: 16 | os.chdir(cwd) 17 | -------------------------------------------------------------------------------- /after/scrape/export.py: -------------------------------------------------------------------------------- 1 | import random 2 | from datetime import datetime 3 | from typing import Optional 4 | 5 | import pandas as pd 6 | 7 | from scrape.dir import change_dir 8 | from scrape.log import log_msg 9 | 10 | 11 
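# Hypothetical usage sketch: how export_data (defined below) is driven from
# after/main.py -- read config.json, build a DataFrame, and hand both over.
# The sample row and DOI are invented for the example.
def _example_export_run() -> None:
    from scrape.config import read_config  # imported here to keep the sketch self-contained

    config = read_config("./config.json")
    sample = pd.DataFrame([{"DOI": "10.1000/demo123", "wordscore": 2}])
    export_data(sample, config.export_dir)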
| def export_data(dataframe: Optional[pd.DataFrame], export_dir: str): 12 | now = datetime.now() 13 | date = now.strftime("%y%m%d") 14 | with change_dir(export_dir): 15 | print_id = random.randint(0, 100) 16 | export_name = f"{date}_DIMScrape_Refactor_{print_id}.csv" 17 | dataframe.to_csv(export_name) 18 | print(dataframe.head()) 19 | log_msg( 20 | f"\n[sciscraper]: A spreadsheet was exported as {export_name} in {export_dir}.\n" 21 | ) 22 | -------------------------------------------------------------------------------- /after/scrape/fetch.py: -------------------------------------------------------------------------------- 1 | from fnmatch import fnmatch 2 | from os import listdir, path 3 | 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | from scrape.config import ScrapeConfig 8 | from scrape.pdf import PDFScraper 9 | from scrape.scraper import Scraper 10 | 11 | 12 | def fetch_terms_from_doi(target: str, scraper: Scraper) -> pd.DataFrame: 13 | print(f"\n[sciscraper]: Getting entries from file: {target}") 14 | with open(target, newline="") as f: 15 | df = [doi for doi in pd.read_csv(f, usecols=["DOI"])["DOI"]] 16 | search_terms = [search_text for search_text in df if search_text is not None] 17 | return pd.DataFrame( 18 | [scraper.scrape(search_text) for search_text in tqdm(search_terms)] 19 | ) 20 | 21 | 22 | def fetch_terms_from_pubid(target: pd.DataFrame, scraper: Scraper) -> pd.DataFrame: 23 | df = target.explode("cited_dimensions_ids", "title") 24 | search_terms = ( 25 | search_text 26 | for search_text in df["cited_dimensions_ids"] 27 | if search_text is not None 28 | ) 29 | src_title = pd.Series(df["title"]) 30 | 31 | return pd.DataFrame( 32 | [scraper.scrape(search_text) for search_text in tqdm(list(search_terms))] 33 | ).join(src_title) 34 | 35 | 36 | def fetch_terms_from_pdf_files(config: ScrapeConfig) -> pd.DataFrame: 37 | 38 | search_terms = [ 39 | path.join(config.paper_folder, file) 40 | for file in listdir(config.paper_folder) 41 | if fnmatch(path.basename(file), "*.pdf") 42 | ] 43 | scraper = PDFScraper( 44 | config.research_words, config.bycatch_words, config.target_words 45 | ) 46 | return pd.DataFrame([scraper.scrape(file) for file in tqdm(search_terms)]) 47 | -------------------------------------------------------------------------------- /after/scrape/json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from json.decoder import JSONDecodeError 4 | from typing import Optional 5 | 6 | ## Scraping Related Imports 7 | import requests 8 | from requests.exceptions import HTTPError, RequestException 9 | 10 | from scrape.log import log_msg 11 | 12 | 13 | class JSONScraper: 14 | """The JSONScrape class takes the provided string from a prior list comprehension. 15 | Using that string value, it gets the resulting JSON data, parses it, and then returns a dictionary, which gets appended to a list. 16 | """ 17 | 18 | def __init__(self, dimensions_url: str) -> None: 19 | self.dimensions_url = dimensions_url 20 | 21 | def download(self, search_text: str) -> dict: 22 | """The download method generates a session and a querystring that gets sent to the website. This returns a JSON entry. 23 | The JSON entry is loaded and specific values are identified for passing along, back to a dataframe. 
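        For a DOI-style lookup, the querystring sent to dimensions.ai looks roughly like
        this (the DOI value is an invented placeholder):

            {"search_mode": "content", "search_text": "10.1000/demo123",
             "search_type": "kws", "search_field": "doi"}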
24 | """ 25 | self.sessions = requests.Session() 26 | self.search_field = self.specify_search(search_text) 27 | print( 28 | f"[sciscraper]: Searching for {search_text} via a {self.search_field}-style search.", 29 | end="\r", 30 | ) 31 | querystring = { 32 | "search_mode": "content", 33 | "search_text": f"{search_text}", 34 | "search_type": "kws", 35 | "search_field": f"{self.search_field}", 36 | } 37 | time.sleep(1) 38 | 39 | try: 40 | r = self.sessions.get(self.dimensions_url, params=querystring) 41 | r.raise_for_status() 42 | log_msg(str(r.status_code)) 43 | self.docs = json.loads(r.text)["docs"] 44 | 45 | except (JSONDecodeError, RequestException) as e: 46 | print( 47 | f"\n[sciscraper]: An error occurred while searching for {search_text}.\ 48 | \n[sciscraper]: Proceeding to next item in sequence.\ 49 | Cause of error: {e}\n" 50 | ) 51 | pass 52 | 53 | except HTTPError as f: 54 | print( 55 | f"\n[sciscraper]: Access to {self.dimensions_url} denied while searching for {search_text}.\ 56 | \n[sciscraper]: Terminating sequence. Cause of error: {f}\ 57 | \n" 58 | ) 59 | quit() 60 | 61 | for item in self.docs: 62 | self.data = self.get_data_entry( 63 | item, 64 | keys=[ 65 | "title", 66 | "author_list", 67 | "publisher", 68 | "pub_date", 69 | "doi", 70 | "id", 71 | "abstract", 72 | "acknowledgements", 73 | "journal_title", 74 | "volume", 75 | "issue", 76 | "times_cited", 77 | "mesh_terms", 78 | "cited_dimensions_ids", 79 | ], 80 | ) 81 | return self.data 82 | 83 | def specify_search(self, search_text: str) -> str: 84 | """Determines whether the dimensions.ai query will be for a full_search or just for the doi.""" 85 | if search_text.startswith("pub"): 86 | self.search_field = "full_search" 87 | else: 88 | self.search_field = "doi" 89 | return self.search_field 90 | 91 | def get_data_entry(self, item, keys: Optional[list]) -> dict: 92 | """Based on a provided list of keys and items in the JSON data, 93 | generates a dictionary entry. 94 | """ 95 | return {_key: item.get(_key, "") for _key in keys} 96 | -------------------------------------------------------------------------------- /after/scrape/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | now = datetime.now() 5 | date = now.strftime("%y%m%d") 6 | 7 | logging.basicConfig( 8 | filename=f"{date}_scraper.log", 9 | level=logging.DEBUG, 10 | format="%(asctime)s - %(message)s", 11 | datefmt="%d-%b-%y %H:%M:%S", 12 | ) 13 | 14 | 15 | def log_msg(msg: str) -> None: 16 | logging.info(msg) 17 | print(msg) 18 | -------------------------------------------------------------------------------- /after/scrape/pdf.py: -------------------------------------------------------------------------------- 1 | import re 2 | from os import path 3 | from typing import Any 4 | 5 | import pdfplumber 6 | from nltk import FreqDist 7 | from nltk.corpus import names, stopwords 8 | from nltk.tokenize import word_tokenize 9 | 10 | from scrape.scraper import Scraper, ScrapeResult 11 | 12 | STOP_WORDS: set[str] = set(stopwords.words("english")) 13 | 14 | NAME_WORDS: set[str] = set(names.words()) 15 | 16 | 17 | def guess_doi(path_name: str) -> str: 18 | basename = path.basename(path_name) 19 | doi = basename[7:-4] 20 | return f"{doi[:7]}/{doi[7:]}" 21 | 22 | 23 | def compute_filtered_tokens(text: list[str]) -> set[str]: 24 | """Takes a lowercase string, now removed of its non-alphanumeric characters. 
25 |     It returns a parsed and tokenized version of the text as a set,
26 |     with stopwords and names removed.
27 |     """
28 |     word_tokens = word_tokenize("\n".join(text))
29 |     return {w for w in word_tokens if w not in STOP_WORDS and w not in NAME_WORDS}
30 | 
31 | 
32 | def most_common_words(word_set: set[str], n: int) -> list[tuple[str, int]]:
33 |     return FreqDist(word_set).most_common(n)
34 | 
35 | 
36 | class PDFScraper(Scraper):
37 |     def __init__(self, research_words: str, bycatch_words: str, target_words: str):
38 |         with open(research_words, encoding="utf8") as f:
39 |             self.research_words = {line.strip() for line in f}  # strip newlines so entries can match tokens
40 |         with open(bycatch_words, encoding="utf8") as f:
41 |             self.bycatch_words = {line.strip() for line in f}
42 |         with open(target_words, encoding="utf8") as f:
43 |             self.target_words = {line.strip() for line in f}
44 | 
45 |     def scrape(self, search_text: str) -> ScrapeResult:
46 |         preprints: list[str] = []
47 |         with pdfplumber.open(search_text) as study:
48 |             pages: list[Any] = study.pages
49 |             n = len(pages)
50 |             pages_to_check: list[Any] = [page for page in pages][:n]
51 |             for page_number, page in enumerate(pages_to_check):
52 |                 page: str = pages[page_number].extract_text(
53 |                     x_tolerance=3, y_tolerance=3
54 |                 )
55 |                 print(
56 |                     f"[sciscraper]: Processing Page {page_number} of {n-1} | {search_text}...",
57 |                     end="\r",
58 |                 )
59 |                 preprints.append(
60 |                     page
61 |                 )  # Each page's string gets appended to preprints
62 | 
63 |         manuscripts = [str(preprint).strip().lower() for preprint in preprints]
64 |         # The preprints are stripped of extraneous characters and all made lower case.
65 |         postprints = [re.sub(r"\W+", " ", manuscript) for manuscript in manuscripts]
66 |         # The ensuing manuscripts are stripped of lingering whitespace and non-alphanumeric characters.
67 |         all_words = compute_filtered_tokens(postprints)
68 |         research_word_overlap = self.research_words.intersection(all_words)
69 | 
70 |         doi = guess_doi(search_text)
71 | 
72 |         target_intersection = self.target_words.intersection(all_words)
73 |         bycatch_intersection = self.bycatch_words.intersection(all_words)
74 |         wordscore = len(target_intersection) - len(bycatch_intersection)
75 |         frequency = most_common_words(all_words, 5)
76 |         study_design = most_common_words(research_word_overlap, 3)
77 | 
78 |         return ScrapeResult(
79 |             doi,
80 |             wordscore,
81 |             frequency,
82 |             study_design,
83 |         )
84 | 
--------------------------------------------------------------------------------
/after/scrape/scihub.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from contextlib import suppress
4 | from datetime import datetime
5 | 
6 | import requests
7 | from bs4 import BeautifulSoup
8 | 
9 | from scrape.dir import change_dir
10 | from scrape.log import log_msg
11 | 
12 | 
13 | class SciHubScraper:
14 |     def __init__(self, scihub_url: str, research_dir: str) -> None:
15 |         self.scihub_url = scihub_url
16 | 
17 |         now = datetime.now()
18 |         date = now.strftime("%y%m%d")
19 |         self.research_dir = os.path.realpath(f"{research_dir}_{date}")
20 | 
21 |     """The SciHubScraper class takes the provided string from a prior list comprehension.
22 |     Using that string value, it posts it to the selected website.
23 |     Then, it downloads the ensuing pdf file that appears as a result of that query.
24 |     """
25 | 
26 |     def download(self, search_text: str):
27 |         """The download method generates a session and a payload that gets posted as a search query to the website.
28 |         This search should return a pdf.
29 |         Once the search is found, it is parsed with BeautifulSoup.
30 |         Then, the link to download that pdf is isolated.
31 |         """
32 |         self.sessions = requests.Session()
33 |         print(
34 |             f"[sciscraper]: Delving too greedily and too deep for download links for {search_text}, by means of dark and arcane magicks.",
35 |             end="\r",
36 |         )
37 |         self.payload = {"request": f"{search_text}"}
38 |         with change_dir(self.research_dir):
39 |             time.sleep(1)
40 |             with suppress(
41 |                 requests.exceptions.HTTPError, requests.exceptions.RequestException
42 |             ):
43 |                 r = self.sessions.post(url=self.scihub_url, data=self.payload)
44 |                 r.raise_for_status()
45 |                 log_msg(str(r.status_code))
46 |                 soup = BeautifulSoup(r.text, "lxml")
47 |                 self.links = list(
48 |                     ((item["onclick"]).split("=")[1]).strip("'")
49 |                     for item in soup.select("button[onclick^='location.href=']")
50 |                 )
51 |                 self.enrich_scrape(search_text)
52 | 
53 |     def enrich_scrape(self, search_text: str):
54 |         """With the link to download isolated, it is followed and thereby downloaded.
55 |         It is sent as bytes to a temporary text file, as a middleman of sorts.
56 |         The temporary text file is then used as a basis to generate a new pdf.
57 |         The temporary text file is then deleted in preparation for the next pdf.
58 |         """
59 |         now = datetime.now()
60 |         date = now.strftime("%y%m%d")
61 |         for link in self.links:
62 |             paper_url = f"{link}=true"
63 |             paper_title = f'{date}_{search_text.replace("/","")}.pdf'
64 |             time.sleep(1)
65 |             paper_content = (
66 |                 requests.get(paper_url, stream=True, allow_redirects=True)
67 |             ).content
68 |             with open("temp_file.txt", "wb") as _tempfile:
69 |                 _tempfile.write(paper_content)
70 |             with open(paper_title, "wb") as file:
71 |                 for line in open("temp_file.txt", "rb").readlines():
72 |                     file.write(line)
73 |             os.remove("temp_file.txt")
74 | 
--------------------------------------------------------------------------------
/after/scrape/scraper.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Protocol
3 | 
4 | 
5 | @dataclass
6 | class ScrapeResult:
7 |     DOI: str
8 |     wordscore: int
9 |     frequency: list[tuple[str, int]]
10 |     study_design: list[tuple[str, int]]
11 | 
12 | 
13 | class Scraper(Protocol):
14 |     def scrape(self, search_text: str) -> ScrapeResult:
15 |         ...
16 | 
--------------------------------------------------------------------------------
/after/test_pdf.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from scrape.pdf import compute_filtered_tokens
4 | 
5 | 
6 | class TestPdfScraper(unittest.TestCase):
7 |     def test_filtered_tokens_empty(self):
8 |         self.assertEqual(len(compute_filtered_tokens([])), 0)
9 | 
10 |     def test_filtered_tokens_stop(self):
11 |         tokens = ["please like and subscribe"]
12 |         filtered_tokens = compute_filtered_tokens(tokens)
13 |         print(tokens)
14 |         print(filtered_tokens)
15 |         self.assertEqual(filtered_tokens, {"please", "like", "subscribe"})
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     unittest.main()
20 | 
--------------------------------------------------------------------------------
/before/main.py:
--------------------------------------------------------------------------------
1 | r"""This module looks up bibliographic information from directories, csv files and pandas databases.
2 | 
3 | From a .csv file input, you can get:
4 | - a folder of .pdf downloads (experimental).
5 | - a pandas dataframe 6 | - another .csv file 7 | 8 | From a pandas dataframe file input, you can get: 9 | - a folder of .pdf downloads (experimental). 10 | - another pandas dataframe, but the citations of the input are each returned with their own citation information. 11 | 12 | From a folder input, you can get: 13 | - a pandas dataframe 14 | - a .csv file 15 | """ 16 | 17 | # ============================================== 18 | # SPECIAL THANKS 19 | # 20 | # Michele Cotrufo 21 | # Nathan Lippi 22 | # Jon Watson Rooney 23 | # Colin Meret 24 | # ArjanCodes 25 | # James Murphy 26 | # Micael Jarniac 27 | # 28 | # ----------------Maintainer---------------- 29 | # John Fallot 30 | # 31 | # ----------------License---------------- 32 | # The MIT License [https://opensource.org/licenses/MIT] 33 | # Copyright (c) 2021 John Fallot 34 | # ============================================== 35 | 36 | # ============================================== 37 | # IMPORTS 38 | # ============================================== 39 | 40 | ## File Structure Related Imports 41 | import __future__ 42 | 43 | __version__ = "1.01" 44 | __author__ = "John Fallot" 45 | 46 | import datetime 47 | import json 48 | import logging 49 | import os 50 | import random 51 | import re 52 | import time 53 | from contextlib import contextmanager, suppress 54 | from fnmatch import fnmatch 55 | from json.decoder import JSONDecodeError 56 | from os import PathLike, listdir, path 57 | from os.path import isdir 58 | from typing import Optional 59 | 60 | import pandas as pd 61 | 62 | ## Language Processing Related Imports 63 | import pdfplumber 64 | 65 | ## Scraping Related Imports 66 | import requests 67 | from bs4 import BeautifulSoup 68 | from nltk import FreqDist 69 | from nltk.corpus import names, stopwords 70 | from nltk.tokenize import word_tokenize 71 | from requests.exceptions import HTTPError, RequestException 72 | from tqdm import tqdm 73 | 74 | # ============================================== 75 | # CONFIGS 76 | # ============================================== 77 | 78 | now = datetime.datetime.now() 79 | date = now.strftime("%y%m%d") 80 | export_dir = os.path.realpath("PDN Scraper Exports") 81 | msg_error_1 = "[sciscraper]: HTTP Error Encountered, moving to next available object. Reason Given:" 82 | 83 | logging.basicConfig( 84 | filename=f"{date}_scraper.log", 85 | level=logging.DEBUG, 86 | format="%(asctime)s - %(message)s", 87 | datefmt="%d-%b-%y %H:%M:%S", 88 | ) 89 | 90 | PRIME_SRC = os.path.realpath("211001_PDN_studies_9.csv") 91 | URL_DMNSNS = "https://app.dimensions.ai/discover/publication/results.json" 92 | RESEARCH_DIR = os.path.realpath(f"{date}_PDN Research Papers From Scrape") 93 | URL_SCIHUB = "https://sci-hubtw.hkvisa.net/" 94 | 95 | # ============================================== 96 | # SCRAPE RELATED CLASSES & SUBCLASSES 97 | # ============================================== 98 | 99 | 100 | class ScrapeRequest: 101 | """The abstraction of the program's web scraping requests, which dynamically returns its appropriate subclasses based on the provided inputs.""" 102 | 103 | _registry = {} 104 | 105 | def __init_subclass__(cls, slookup_code, **kwargs): 106 | super().__init_subclass__(**kwargs) 107 | cls._registry[slookup_code] = cls 108 | 109 | def __new__(cls, s_bool: bool): 110 | """The ScrapeRequest class looks for the boolean value passed to it from the FileRequest class. 111 | A value of True, or 1, would return a SciHubScrape subclass. 112 | Whereas a value of False, of 0, would return a JSONScrape subclass. 
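        For example (illustrative calls):

            ScrapeRequest(True)   # dispatches to SciHubScrape via the "sci" registry key
            ScrapeRequest(False)  # dispatches to JSONScrape via the "json" registry key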
113 | """ 114 | if not isinstance(s_bool, bool): 115 | raise TypeError 116 | if s_bool: 117 | slookup_code = "sci" 118 | else: 119 | slookup_code = "json" 120 | 121 | subclass = cls._registry[slookup_code] 122 | 123 | obj = object.__new__(subclass) 124 | return obj 125 | 126 | def download(self) -> None: 127 | raise NotImplementedError 128 | 129 | 130 | class SciHubScrape(ScrapeRequest, slookup_code="sci"): 131 | """The SciHubScrape class takes the provided string from a prior list comprehension. 132 | Using that string value, it posts it to the selected website. 133 | Then, it downloads the ensuing pdf file that appears as a result of that query. 134 | """ 135 | 136 | def download(self, search_text: str): 137 | """The download method generates a session and a payload that gets posted as a search query to the website. 138 | This search should return a pdf. 139 | Once the search is found, it is parsed with BeautifulSoup. 140 | Then, the link to download that pdf is isolated. 141 | """ 142 | self.sessions = requests.Session() 143 | self.base_url = URL_SCIHUB 144 | print( 145 | f"[sciscraper]: Delving too greedily and too deep for download links for {search_text}, by means of dark and arcane magicx.", 146 | end="\r", 147 | ) 148 | self.payload = {"request": f"{search_text}"} 149 | with change_dir(RESEARCH_DIR): 150 | time.sleep(1) 151 | with suppress( 152 | requests.exceptions.HTTPError, requests.exceptions.RequestException 153 | ): 154 | r = self.sessions.post(url=self.base_url, data=self.payload) 155 | r.raise_for_status() 156 | logging.info(r.status_code) 157 | soup = BeautifulSoup(r.text, "lxml") 158 | self.links = list( 159 | ((item["onclick"]).split("=")[1]).strip("'") 160 | for item in soup.select("button[onclick^='location.href=']") 161 | ) 162 | self.enrich_scrape() 163 | 164 | def enrich_scrape(self, search_text: str): 165 | """With the link to download isolated, it is followed and thereby downloaded. 166 | It is sent as bytes to a temporary text file, as a middleman of sorts. 167 | The temporary text file is then used as a basis to generate a new pdf. 168 | The temporary text file is then deleted in preparation for the next pdf. 169 | """ 170 | for link in self.links: 171 | paper_url = f"{link}=true" 172 | paper_title = f'{date}_{search_text.replace("/","")}.pdf' 173 | time.sleep(1) 174 | paper_content = ( 175 | requests.get(paper_url, stream=True, allow_redirects=True) 176 | ).content 177 | with open("temp_file.txt", "wb") as _tempfile: 178 | _tempfile.write(paper_content) 179 | with open(paper_title, "wb") as file: 180 | for line in open("temp_file.txt", "rb").readlines(): 181 | file.write(line) 182 | os.remove("temp_file.txt") 183 | 184 | 185 | class JSONScrape(ScrapeRequest, slookup_code="json"): 186 | """The JSONScrape class takes the provided string from a prior list comprehension. 187 | Using that string value, it gets the resulting JSON data, parses it, and then returns a dictionary, which gets appended to a list. 188 | """ 189 | 190 | def download(self, search_text: str) -> dict: 191 | """The download method generates a session and a querystring that gets sent to the website. This returns a JSON entry. 192 | The JSON entry is loaded and specific values are identified for passing along, back to a dataframe. 
193 | """ 194 | self.sessions = requests.Session() 195 | self.search_field = self.specify_search(search_text) 196 | self.base_url = URL_DMNSNS 197 | print( 198 | f"[sciscraper]: Searching for {search_text} via a {self.search_field}-style search.", 199 | end="\r", 200 | ) 201 | querystring = { 202 | "search_mode": "content", 203 | "search_text": f"{search_text}", 204 | "search_type": "kws", 205 | "search_field": f"{self.search_field}", 206 | } 207 | time.sleep(1) 208 | 209 | try: 210 | r = self.sessions.get(self.base_url, params=querystring) 211 | r.raise_for_status() 212 | logging.info(r.status_code) 213 | self.docs = json.loads(r.text)["docs"] 214 | 215 | except (JSONDecodeError, RequestException) as e: 216 | print( 217 | f"\n[sciscraper]: An error occurred while searching for {search_text}.\ 218 | \n\[sciscraper]: Proceeding to next item in sequence.\ 219 | Cause of error: {e}\n" 220 | ) 221 | pass 222 | 223 | except HTTPError as f: 224 | print( 225 | f"\n[sciscraper]: Access to {self.base_url} denied while searching for {search_text}.\ 226 | \n[sciscraper]: Terminating sequence. Cause of error: {f}\ 227 | \n" 228 | ) 229 | quit() 230 | 231 | for item in self.docs: 232 | self.data = self.get_data_entry( 233 | item, 234 | keys=[ 235 | "title", 236 | "author_list", 237 | "publisher", 238 | "pub_date", 239 | "doi", 240 | "id", 241 | "abstract", 242 | "acknowledgements", 243 | "journal_title", 244 | "volume", 245 | "issue", 246 | "times_cited", 247 | "mesh_terms", 248 | "cited_dimensions_ids", 249 | ], 250 | ) 251 | return self.data 252 | 253 | def specify_search(self, search_text: str) -> str: 254 | """Determines whether the dimensions.ai query will be for a full_search or just for the doi.""" 255 | if search_text.startswith("pub"): 256 | self.search_field = "full_search" 257 | else: 258 | self.search_field = "doi" 259 | return self.search_field 260 | 261 | def get_data_entry(self, item, keys: Optional[list]) -> dict: 262 | """Based on a provided list of keys and items in the JSON data, 263 | generates a dictionary entry. 264 | """ 265 | return {_key: item.get(_key, "") for _key in keys} 266 | 267 | 268 | class PDFScrape: 269 | """The PDFScrape class takes the provided string from a prior list 270 | comprehension of PDF files in a directory. From each pdf file, it parses the document 271 | and returns metrics about its composition and relevance. 272 | """ 273 | 274 | def download(self, search_text: str) -> dict: 275 | self.search_text = search_text 276 | self.preprints = [] 277 | with pdfplumber.open(self.search_text) as self.study: 278 | self.n = len(self.study.pages) 279 | self.pages_to_check = [page for page in self.study.pages][: self.n] 280 | for page_number, page in enumerate(self.pages_to_check): 281 | page = self.study.pages[page_number].extract_text( 282 | x_tolerance=3, y_tolerance=3 283 | ) 284 | print( 285 | f"[sciscraper]: Processing Page {page_number} of {self.n-1} | {search_text}...", 286 | end="\r", 287 | ) 288 | self.preprints.append( 289 | page 290 | ) # Each page's string gets appended to preprint [] 291 | 292 | self.manuscripts = [ 293 | str(preprint).strip().lower() for preprint in self.preprints 294 | ] 295 | # The preprints are stripped of extraneous characters and all made lower case. 296 | self.postprints = [ 297 | re.sub(r"\W+", " ", manuscript) for manuscript in self.manuscripts 298 | ] 299 | # The ensuing manuscripts are stripped of lingering whitespace and non-alphanumeric characters. 
300 | self.all_words = self.get_tokens() 301 | self.research_word_overlap = self.get_research_words() 302 | return self.get_data_entry() 303 | 304 | def get_tokens(self) -> list: 305 | """Takes a lowercase string, now removed of its non-alphanumeric characters. 306 | It returns (as a list comprehension) a parsed and tokenized 307 | version of the postprint, with stopwords and names removed. 308 | """ 309 | self.stop_words = set(stopwords.words("english")) 310 | self.name_words = set(names.words()) 311 | self.word_tokens = word_tokenize(str(self.postprints)) 312 | return [ 313 | w for w in self.word_tokens if not w in self.stop_words and self.name_words 314 | ] # Filters out the stopwords 315 | 316 | def _overlap(self, li) -> list: 317 | """Checks if token words match words in a provided list.""" 318 | return [w for w in li if w in self.all_words] 319 | 320 | def get_target_words(self): 321 | """Checks for words that match the user's primary query.""" 322 | self.target_words = [ 323 | "prosocial", 324 | "design", 325 | "intervention", 326 | "reddit", 327 | "humane", 328 | "social media", 329 | "user experience", 330 | "nudge", 331 | "choice architecture", 332 | "user interface", 333 | "misinformation", 334 | "disinformation", 335 | "Trump", 336 | "conspiracy", 337 | "dysinformation", 338 | "users", 339 | "Thaler", 340 | "Sunstein", 341 | "boost", 342 | ] 343 | self.target_word_overlap = self._overlap(self.target_words) 344 | return self.target_word_overlap 345 | 346 | def get_bycatch_words(self): 347 | """Checks for words that often occur in conjunction with the 348 | user's primary query, but are deemed undesirable. 349 | """ 350 | self.bycatch_words = [ 351 | "psychology", 352 | "pediatric", 353 | "pediatry", 354 | "autism", 355 | "mental", 356 | "medical", 357 | "oxytocin", 358 | "adolescence", 359 | "infant", 360 | "health", 361 | "wellness", 362 | "child", 363 | "care", 364 | "mindfulness", 365 | ] 366 | self.bycatch_word_overlap = self._overlap(self.bycatch_words) 367 | return self.bycatch_word_overlap 368 | 369 | def get_research_words(self): 370 | """Checks for words that correspond to specific experimental designs.""" 371 | self.research_words = [ 372 | "big data", 373 | "data", 374 | "analytics", 375 | "randomized controlled trial", 376 | "RCT", 377 | "moderation", 378 | "community", 379 | "social media", 380 | "conversational", 381 | "control", 382 | "randomized", 383 | "systemic", 384 | "analysis", 385 | "thematic", 386 | "review", 387 | "study", 388 | "case series", 389 | "case report", 390 | "double blind", 391 | "ecological", 392 | "survey", 393 | ] 394 | self.research_word_overlap = self._overlap(self.research_words) 395 | return self.research_word_overlap 396 | 397 | def get_wordscore(self) -> int: 398 | """Returns a score, which is the number of target words minus the number of undesirable words. 399 | A positive score suggests that the paper is more likely than not to be a match. 400 | A negative score suggests that the paper is likely to be unrelated to the user's primary query.""" 401 | return len(self.get_target_words()) - len(self.get_bycatch_words()) 402 | 403 | def get_doi(self) -> str: 404 | """Approximates a possible DOI, assuming the file is saved in YYMMDD_DOI.pdf format.""" 405 | self.getting_doi = path.basename(self.search_text) 406 | self.doi = self.getting_doi[7:-4] 407 | self.doi = self.doi[:7] + "/" + self.doi[7:] 408 | return self.doi 409 | 410 | def get_data_entry(self) -> dict: 411 | """Returns a dictionary entry. 
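        A returned entry has this shape (values invented for illustration):

            {"DOI": "10.1000/demo123", "wordscore": 2,
             "frequency": [("data", 31), ("media", 17), ...],
             "study_design": [("survey", 6), ...]}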
Ideally, this will someday work through a DataEntry class.""" 412 | self.data = { 413 | "DOI": self.get_doi(), 414 | "wordscore": self.get_wordscore(), 415 | "frequency": FreqDist(self.all_words).most_common(5), 416 | "study_design": FreqDist(self.research_word_overlap).most_common(3), 417 | } 418 | 419 | return self.data 420 | 421 | 422 | # ============================================== 423 | # CONTEXT MANAGER METACLASS 424 | # ============================================== 425 | 426 | 427 | @contextmanager 428 | def change_dir(destination: str): 429 | """Sets a destination for exported files.""" 430 | try: 431 | __dest = os.path.realpath(destination) 432 | cwd = os.getcwd() 433 | if not os.path.exists(__dest): 434 | os.mkdir(__dest) 435 | os.chdir(__dest) 436 | yield 437 | finally: 438 | os.chdir(cwd) 439 | 440 | 441 | # ============================================== 442 | # FILE REQUEST CLASSES & SUBCLASSES 443 | # ============================================== 444 | 445 | 446 | class FileRequest: 447 | """The abstraction of the program's input file classes. 448 | It dynamically returns its appropriate subclasses based on the provided inputs. 449 | """ 450 | 451 | _registry = {} 452 | 453 | def __init_subclass__(cls, dlookup_code, **kwargs): 454 | super().__init_subclass__(**kwargs) 455 | cls._registry[dlookup_code] = cls 456 | 457 | def __new__(cls, target, slookup_key: bool = None): 458 | if isdir(target): 459 | dlookup_code = "fold" 460 | elif str(target).endswith("csv"): 461 | dlookup_code = "doi" 462 | elif isinstance(target, pd.DataFrame): 463 | dlookup_code = "pub" 464 | else: 465 | raise Exception("[sciscraper]: Invalid prefix detected.") 466 | 467 | subclass = cls._registry[dlookup_code] 468 | 469 | obj = object.__new__(subclass) 470 | obj.target = target 471 | obj.slookup_key = slookup_key 472 | obj.scraper = ScrapeRequest(slookup_key) 473 | return obj 474 | 475 | def fetch_terms(self) -> None: 476 | raise NotImplementedError 477 | 478 | 479 | class DOIRequest(FileRequest, dlookup_code="doi"): 480 | """The DOIRequest class takes a csv and generates a list comprehension. 481 | The list comprehension is scraped, and then returns a DataFrame. 482 | """ 483 | 484 | def __init__(self, target: str, slookup_key: bool = False): 485 | self.target = target 486 | self.slookup_key = slookup_key 487 | self.scraper = ScrapeRequest(self.slookup_key) 488 | 489 | def fetch_terms(self): 490 | print(f"\n[sciscraper]: Getting entries from file: {self.target}") 491 | with open(self.target, newline="") as f: 492 | self.df = [doi for doi in pd.read_csv(f, usecols=["DOI"])["DOI"]] 493 | self.search_terms = [ 494 | search_text for search_text in self.df if search_text is not None 495 | ] 496 | return pd.DataFrame( 497 | [ 498 | self.scraper.download(search_text) 499 | for search_text in tqdm(self.search_terms) 500 | ] 501 | ) 502 | 503 | 504 | class PubIDRequest(FileRequest, dlookup_code="pub"): 505 | """The PubIDRequest class takes a DataFrame and generates a list comprehension. 506 | The list comprehension is scraped, and then returns a DataFrame. 507 | """ 508 | 509 | def __init__(self, target: pd.DataFrame, slookup_key: bool = False): 510 | if slookup_key: 511 | print( 512 | "\n[sciscraper]: Getting Pub IDs from dataframe to download from web..." 513 | ) 514 | else: 515 | print( 516 | "\n[sciscraper]: Expounding upon existing PubIDs to generate a new dataframe..." 
517 | ) 518 | self.target = target 519 | self.slookup_key = slookup_key 520 | self.scraper = ScrapeRequest(self.slookup_key) 521 | 522 | def fetch_terms(self): 523 | self.df = self.target.explode("cited_dimensions_ids", "title") 524 | self.search_terms = ( 525 | search_text 526 | for search_text in self.df["cited_dimensions_ids"] 527 | if search_text is not None 528 | ) 529 | self.src_title = pd.Series(self.df["title"]) 530 | 531 | return pd.DataFrame( 532 | [ 533 | self.scraper.download(search_text) 534 | for search_text in tqdm(list(self.search_terms)) 535 | ] 536 | ).join(self.src_title) 537 | 538 | 539 | class FolderRequest(FileRequest, dlookup_code="fold"): 540 | """ 541 | The Folder class takes a directory and generates a list comprehension. 542 | The list comprehension is scraped, and then returns a DataFrame. 543 | Unlike other classes, it cannot undergo a SciScrape. 544 | """ 545 | 546 | def __init__(self, target: PathLike[str], slookup_key: bool = False): 547 | print(f"\n[sciscraper]: Getting files from folder: {target}") 548 | self.target = target 549 | if self.slookup_key: 550 | raise Exception( 551 | "This action is prohibited. \ 552 | You already have the files that this query would return." 553 | ) 554 | self.slookup_key = slookup_key 555 | self.scraper = PDFScrape() 556 | 557 | def fetch_terms(self): 558 | self.search_terms = [ 559 | path.join(self.target, file) 560 | for file in listdir(self.target) 561 | if fnmatch(path.basename(file), "*.pdf") 562 | ] 563 | return pd.DataFrame( 564 | [self.scraper.download(file) for file in tqdm(self.search_terms)] 565 | ) 566 | 567 | 568 | # ============================================== 569 | # EXPORTING, MAIN LOOP, AND MISCELLANY 570 | # ============================================== 571 | 572 | 573 | def export(dataframe: Optional[pd.DataFrame]): 574 | with change_dir(export_dir): 575 | print_id = random.randint(0, 100) 576 | export_name = f"{date}_DIMScrape_Refactor_{print_id}.csv" 577 | msg_spreadsheetexported = f"\n[sciscraper]: A spreadsheet was exported as {export_name} in {export_dir}.\n" 578 | dataframe.to_csv(export_name) 579 | print(dataframe.head()) 580 | logging.info(msg_spreadsheetexported) 581 | print(msg_spreadsheetexported) 582 | 583 | 584 | def main(): 585 | start = time.perf_counter() 586 | file_request = FileRequest(target="../papers", slookup_key=False) 587 | print(file_request.__class__.__name__) 588 | # results = file_request.fetch_terms() 589 | # export(results) 590 | elapsed = time.perf_counter() - start 591 | msg_timestamp = f"\n[sciscraper]: Extraction finished in {elapsed} seconds.\n" 592 | logging.info(msg_timestamp) 593 | print(msg_timestamp) 594 | quit() 595 | 596 | 597 | if __name__ == "__main__": 598 | main() # %% 599 | -------------------------------------------------------------------------------- /papers/Amygdala structure and the tendency to regard the social system as legitimate and desirable.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/Amygdala structure and the tendency to regard the social system as legitimate and desirable.pdf -------------------------------------------------------------------------------- /papers/An Ideological Asymmetry in the Diffusion of Moralized Content on Social Media Among Political Leaders.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/An Ideological Asymmetry in the Diffusion of Moralized Content on Social Media Among Political Leaders.pdf -------------------------------------------------------------------------------- /papers/Anyone Can Become a Troll_ Causes of Trolling Behavior in Online Discussions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/Anyone Can Become a Troll_ Causes of Trolling Behavior in Online Discussions.pdf -------------------------------------------------------------------------------- /papers/Association of an Educational Program in Mindful Communication With Burnout, Empathy, and Attitudes Among Primary Care Physicians.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/Association of an Educational Program in Mindful Communication With Burnout, Empathy, and Attitudes Among Primary Care Physicians.pdf -------------------------------------------------------------------------------- /papers/Attentional capture helps explain why moral and emotional content go viral.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/Attentional capture helps explain why moral and emotional content go viral.docx -------------------------------------------------------------------------------- /words/bycatch.txt: -------------------------------------------------------------------------------- 1 | psychology 2 | pediatric 3 | pediatry 4 | autism 5 | mental 6 | medical 7 | oxytocin 8 | adolescence 9 | infant 10 | health 11 | wellness 12 | child 13 | care 14 | mindfulness -------------------------------------------------------------------------------- /words/research.txt: -------------------------------------------------------------------------------- 1 | big data 2 | data 3 | analytics 4 | randomized controlled trial 5 | RCT 6 | moderation 7 | community 8 | social media 9 | conversational 10 | control 11 | randomized 12 | systemic 13 | analysis 14 | thematic 15 | review 16 | study 17 | case series 18 | case report 19 | double blind 20 | ecological 21 | survey -------------------------------------------------------------------------------- /words/target.txt: -------------------------------------------------------------------------------- 1 | prosocial 2 | design 3 | intervention 4 | reddit 5 | humane 6 | social media 7 | user experience 8 | nudge 9 | choice architecture 10 | user interface 11 | misinformation 12 | disinformation 13 | Trump 14 | conspiracy 15 | dysinformation 16 | users 17 | Thaler 18 | Sunstein 19 | boost --------------------------------------------------------------------------------
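As a closing illustration of how the refactored pieces under after/ fit together, here is a hedged sketch that scores one of the bundled papers directly with PDFScraper, bypassing the folder walk in scrape/fetch.py. It assumes the snippet is run from inside after/ (so the relative paths in config.json resolve) and that the NLTK stopwords and names corpora are already installed; because the bundled papers do not follow the YYMMDD_DOI.pdf naming scheme that guess_doi expects, the DOI field of the result will be a meaningless guess. The snippet is not a file in the repository.

from scrape.config import read_config
from scrape.pdf import PDFScraper

config = read_config("./config.json")
scraper = PDFScraper(config.research_words, config.bycatch_words, config.target_words)
result = scraper.scrape(
    "../papers/Anyone Can Become a Troll_ Causes of Trolling Behavior in Online Discussions.pdf"
)
# ScrapeResult (see after/scrape/scraper.py) carries DOI, wordscore, frequency, study_design
print(result.wordscore, result.study_design)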