├── .gitignore ├── LICENSE ├── README.md ├── after ├── config.json ├── main.py ├── scrape │ ├── __init__.py │ ├── config.py │ ├── dir.py │ ├── export.py │ ├── fetch.py │ ├── json.py │ ├── log.py │ ├── pdf.py │ ├── scihub.py │ └── scraper.py └── test_pdf.py ├── before └── main.py ├── papers ├── Amygdala structure and the tendency to regard the social system as legitimate and desirable.pdf ├── An Ideological Asymmetry in the Diffusion of Moralized Content on Social Media Among Political Leaders.pdf ├── Anyone Can Become a Troll_ Causes of Trolling Behavior in Online Discussions.pdf ├── Association of an Educational Program in Mindful Communication With Burnout, Empathy, and Attitudes Among Primary Care Physicians.pdf └── Attentional capture helps explain why moral and emotional content go viral.docx └── words ├── bycatch.txt ├── research.txt └── target.txt /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | **/.DS_Store 3 | **/*.log -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 John Fallot and ArjanCodes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web and PDF Scraper Refactoring 2 | 3 | This repository contains the example code of the Web and PDF scraper code roast. 
Here are the links to the videos: 4 | 5 | - Part 1: https://youtu.be/MXM6VEtf8SE 6 | - Part 2: https://youtu.be/6ac4Um2Vicg 7 | -------------------------------------------------------------------------------- /after/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "export_dir": "PDN Scraper Exports", 3 | "prime_src": "211001_PDN_studies_9.csv", 4 | "url_dmnsns": "https://app.dimensions.ai/discover/publication/results.json", 5 | "research_dir": "PDN Research Papers From Scrape", 6 | "url_scihub": "https://sci-hubtw.hkvisa.net/", 7 | "paper_folder": "../papers", 8 | "research_words": "../words/research.txt", 9 | "bycatch_words": "../words/bycatch.txt", 10 | "target_words": "../words/target.txt" 11 | } -------------------------------------------------------------------------------- /after/main.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from scrape.config import read_config 4 | from scrape.export import export_data 5 | from scrape.fetch import fetch_terms_from_pdf_files 6 | from scrape.log import log_msg 7 | 8 | 9 | def main() -> None: 10 | 11 | # read the configuration settings from a JSON file 12 | config = read_config("./config.json") 13 | 14 | # fetch data from pdf files and export it 15 | start = time.perf_counter() 16 | result = fetch_terms_from_pdf_files(config) 17 | export_data(result, config.export_dir) 18 | elapsed = time.perf_counter() - start 19 | log_msg(f"\n[sciscraper]: Extraction finished in {elapsed} seconds.\n") 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /after/scrape/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/after/scrape/__init__.py -------------------------------------------------------------------------------- /after/scrape/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class ScrapeConfig: 7 | export_dir: str 8 | prime_src: str 9 | url_dmnsns: str 10 | research_dir: str 11 | url_scihub: str 12 | paper_folder: str 13 | research_words: str 14 | bycatch_words: str 15 | target_words: str 16 | 17 | 18 | def read_config(config_file: str) -> ScrapeConfig: 19 | with open(config_file) as file: 20 | data = json.load(file) 21 | return ScrapeConfig(**data) 22 | -------------------------------------------------------------------------------- /after/scrape/dir.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import contextmanager 3 | 4 | 5 | @contextmanager 6 | def change_dir(destination: str): 7 | """Sets a destination for exported files.""" 8 | cwd = os.getcwd() 9 | try: 10 | __dest = os.path.realpath(destination) 11 | if not os.path.exists(__dest): 12 | os.mkdir(__dest) 13 | os.chdir(__dest) 14 | yield 15 | finally: 16 | os.chdir(cwd) 17 | -------------------------------------------------------------------------------- /after/scrape/export.py: -------------------------------------------------------------------------------- 1 | import random 2 | from datetime import datetime 3 | from typing import Optional 4 | 5 | import pandas as pd 6 | 7 | from scrape.dir import change_dir 8 | from scrape.log import log_msg 9 | 10 | 11 
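# Hypothetical usage sketch: how export_data (defined below) is driven from
# after/main.py -- read config.json, build a DataFrame, and hand both over.
# The sample row and DOI are invented for the example.
def _example_export_run() -> None:
    from scrape.config import read_config  # imported here to keep the sketch self-contained

    config = read_config("./config.json")
    sample = pd.DataFrame([{"DOI": "10.1000/demo123", "wordscore": 2}])
    export_data(sample, config.export_dir)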
| def export_data(dataframe: Optional[pd.DataFrame], export_dir: str): 12 | now = datetime.now() 13 | date = now.strftime("%y%m%d") 14 | with change_dir(export_dir): 15 | print_id = random.randint(0, 100) 16 | export_name = f"{date}_DIMScrape_Refactor_{print_id}.csv" 17 | dataframe.to_csv(export_name) 18 | print(dataframe.head()) 19 | log_msg( 20 | f"\n[sciscraper]: A spreadsheet was exported as {export_name} in {export_dir}.\n" 21 | ) 22 | -------------------------------------------------------------------------------- /after/scrape/fetch.py: -------------------------------------------------------------------------------- 1 | from fnmatch import fnmatch 2 | from os import listdir, path 3 | 4 | import pandas as pd 5 | from tqdm import tqdm 6 | 7 | from scrape.config import ScrapeConfig 8 | from scrape.pdf import PDFScraper 9 | from scrape.scraper import Scraper 10 | 11 | 12 | def fetch_terms_from_doi(target: str, scraper: Scraper) -> pd.DataFrame: 13 | print(f"\n[sciscraper]: Getting entries from file: {target}") 14 | with open(target, newline="") as f: 15 | df = [doi for doi in pd.read_csv(f, usecols=["DOI"])["DOI"]] 16 | search_terms = [search_text for search_text in df if search_text is not None] 17 | return pd.DataFrame( 18 | [scraper.scrape(search_text) for search_text in tqdm(search_terms)] 19 | ) 20 | 21 | 22 | def fetch_terms_from_pubid(target: pd.DataFrame, scraper: Scraper) -> pd.DataFrame: 23 | df = target.explode("cited_dimensions_ids", "title") 24 | search_terms = ( 25 | search_text 26 | for search_text in df["cited_dimensions_ids"] 27 | if search_text is not None 28 | ) 29 | src_title = pd.Series(df["title"]) 30 | 31 | return pd.DataFrame( 32 | [scraper.scrape(search_text) for search_text in tqdm(list(search_terms))] 33 | ).join(src_title) 34 | 35 | 36 | def fetch_terms_from_pdf_files(config: ScrapeConfig) -> pd.DataFrame: 37 | 38 | search_terms = [ 39 | path.join(config.paper_folder, file) 40 | for file in listdir(config.paper_folder) 41 | if fnmatch(path.basename(file), "*.pdf") 42 | ] 43 | scraper = PDFScraper( 44 | config.research_words, config.bycatch_words, config.target_words 45 | ) 46 | return pd.DataFrame([scraper.scrape(file) for file in tqdm(search_terms)]) 47 | -------------------------------------------------------------------------------- /after/scrape/json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from json.decoder import JSONDecodeError 4 | from typing import Optional 5 | 6 | ## Scraping Related Imports 7 | import requests 8 | from requests.exceptions import HTTPError, RequestException 9 | 10 | from scrape.log import log_msg 11 | 12 | 13 | class JSONScraper: 14 | """The JSONScrape class takes the provided string from a prior list comprehension. 15 | Using that string value, it gets the resulting JSON data, parses it, and then returns a dictionary, which gets appended to a list. 16 | """ 17 | 18 | def __init__(self, dimensions_url: str) -> None: 19 | self.dimensions_url = dimensions_url 20 | 21 | def download(self, search_text: str) -> dict: 22 | """The download method generates a session and a querystring that gets sent to the website. This returns a JSON entry. 23 | The JSON entry is loaded and specific values are identified for passing along, back to a dataframe. 
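        For a DOI-style lookup, the querystring sent to dimensions.ai looks roughly like
        this (the DOI value is an invented placeholder):

            {"search_mode": "content", "search_text": "10.1000/demo123",
             "search_type": "kws", "search_field": "doi"}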
24 | """ 25 | self.sessions = requests.Session() 26 | self.search_field = self.specify_search(search_text) 27 | print( 28 | f"[sciscraper]: Searching for {search_text} via a {self.search_field}-style search.", 29 | end="\r", 30 | ) 31 | querystring = { 32 | "search_mode": "content", 33 | "search_text": f"{search_text}", 34 | "search_type": "kws", 35 | "search_field": f"{self.search_field}", 36 | } 37 | time.sleep(1) 38 | 39 | try: 40 | r = self.sessions.get(self.dimensions_url, params=querystring) 41 | r.raise_for_status() 42 | log_msg(str(r.status_code)) 43 | self.docs = json.loads(r.text)["docs"] 44 | 45 | except (JSONDecodeError, RequestException) as e: 46 | print( 47 | f"\n[sciscraper]: An error occurred while searching for {search_text}.\ 48 | \n[sciscraper]: Proceeding to next item in sequence.\ 49 | Cause of error: {e}\n" 50 | ) 51 | pass 52 | 53 | except HTTPError as f: 54 | print( 55 | f"\n[sciscraper]: Access to {self.dimensions_url} denied while searching for {search_text}.\ 56 | \n[sciscraper]: Terminating sequence. Cause of error: {f}\ 57 | \n" 58 | ) 59 | quit() 60 | 61 | for item in self.docs: 62 | self.data = self.get_data_entry( 63 | item, 64 | keys=[ 65 | "title", 66 | "author_list", 67 | "publisher", 68 | "pub_date", 69 | "doi", 70 | "id", 71 | "abstract", 72 | "acknowledgements", 73 | "journal_title", 74 | "volume", 75 | "issue", 76 | "times_cited", 77 | "mesh_terms", 78 | "cited_dimensions_ids", 79 | ], 80 | ) 81 | return self.data 82 | 83 | def specify_search(self, search_text: str) -> str: 84 | """Determines whether the dimensions.ai query will be for a full_search or just for the doi.""" 85 | if search_text.startswith("pub"): 86 | self.search_field = "full_search" 87 | else: 88 | self.search_field = "doi" 89 | return self.search_field 90 | 91 | def get_data_entry(self, item, keys: Optional[list]) -> dict: 92 | """Based on a provided list of keys and items in the JSON data, 93 | generates a dictionary entry. 94 | """ 95 | return {_key: item.get(_key, "") for _key in keys} 96 | -------------------------------------------------------------------------------- /after/scrape/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | now = datetime.now() 5 | date = now.strftime("%y%m%d") 6 | 7 | logging.basicConfig( 8 | filename=f"{date}_scraper.log", 9 | level=logging.DEBUG, 10 | format="%(asctime)s - %(message)s", 11 | datefmt="%d-%b-%y %H:%M:%S", 12 | ) 13 | 14 | 15 | def log_msg(msg: str) -> None: 16 | logging.info(msg) 17 | print(msg) 18 | -------------------------------------------------------------------------------- /after/scrape/pdf.py: -------------------------------------------------------------------------------- 1 | import re 2 | from os import path 3 | from typing import Any 4 | 5 | import pdfplumber 6 | from nltk import FreqDist 7 | from nltk.corpus import names, stopwords 8 | from nltk.tokenize import word_tokenize 9 | 10 | from scrape.scraper import Scraper, ScrapeResult 11 | 12 | STOP_WORDS: set[str] = set(stopwords.words("english")) 13 | 14 | NAME_WORDS: set[str] = set(names.words()) 15 | 16 | 17 | def guess_doi(path_name: str) -> str: 18 | basename = path.basename(path_name) 19 | doi = basename[7:-4] 20 | return f"{doi[:7]}/{doi[7:]}" 21 | 22 | 23 | def compute_filtered_tokens(text: list[str]) -> set[str]: 24 | """Takes a lowercase string, now removed of its non-alphanumeric characters. 
25 |     It returns a parsed and tokenized version of the text as a set,
26 |     with stopwords and names removed.
27 |     """
28 |     word_tokens = word_tokenize("\n".join(text))
29 |     return {w for w in word_tokens if w not in STOP_WORDS and w not in NAME_WORDS}
30 | 
31 | 
32 | def most_common_words(word_set: set[str], n: int) -> list[tuple[str, int]]:
33 |     return FreqDist(word_set).most_common(n)
34 | 
35 | 
36 | class PDFScraper(Scraper):
37 |     def __init__(self, research_words: str, bycatch_words: str, target_words: str):
38 |         with open(research_words, encoding="utf8") as f:
39 |             self.research_words = {line.strip() for line in f}  # strip newlines so entries can match tokens
40 |         with open(bycatch_words, encoding="utf8") as f:
41 |             self.bycatch_words = {line.strip() for line in f}
42 |         with open(target_words, encoding="utf8") as f:
43 |             self.target_words = {line.strip() for line in f}
44 | 
45 |     def scrape(self, search_text: str) -> ScrapeResult:
46 |         preprints: list[str] = []
47 |         with pdfplumber.open(search_text) as study:
48 |             pages: list[Any] = study.pages
49 |             n = len(pages)
50 |             pages_to_check: list[Any] = [page for page in pages][:n]
51 |             for page_number, page in enumerate(pages_to_check):
52 |                 page: str = pages[page_number].extract_text(
53 |                     x_tolerance=3, y_tolerance=3
54 |                 )
55 |                 print(
56 |                     f"[sciscraper]: Processing Page {page_number} of {n-1} | {search_text}...",
57 |                     end="\r",
58 |                 )
59 |                 preprints.append(
60 |                     page
61 |                 )  # Each page's string gets appended to preprints
62 | 
63 |         manuscripts = [str(preprint).strip().lower() for preprint in preprints]
64 |         # The preprints are stripped of extraneous characters and all made lower case.
65 |         postprints = [re.sub(r"\W+", " ", manuscript) for manuscript in manuscripts]
66 |         # The ensuing manuscripts are stripped of lingering whitespace and non-alphanumeric characters.
67 |         all_words = compute_filtered_tokens(postprints)
68 |         research_word_overlap = self.research_words.intersection(all_words)
69 | 
70 |         doi = guess_doi(search_text)
71 | 
72 |         target_intersection = self.target_words.intersection(all_words)
73 |         bycatch_intersection = self.bycatch_words.intersection(all_words)
74 |         wordscore = len(target_intersection) - len(bycatch_intersection)
75 |         frequency = most_common_words(all_words, 5)
76 |         study_design = most_common_words(research_word_overlap, 3)
77 | 
78 |         return ScrapeResult(
79 |             doi,
80 |             wordscore,
81 |             frequency,
82 |             study_design,
83 |         )
84 | 
--------------------------------------------------------------------------------
/after/scrape/scihub.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from contextlib import suppress
4 | from datetime import datetime
5 | 
6 | import requests
7 | from bs4 import BeautifulSoup
8 | 
9 | from scrape.dir import change_dir
10 | from scrape.log import log_msg
11 | 
12 | 
13 | class SciHubScraper:
14 |     def __init__(self, scihub_url: str, research_dir: str) -> None:
15 |         self.scihub_url = scihub_url
16 | 
17 |         now = datetime.now()
18 |         date = now.strftime("%y%m%d")
19 |         self.research_dir = os.path.realpath(f"{research_dir}_{date}")
20 | 
21 |     """The SciHubScraper class takes the provided string from a prior list comprehension.
22 |     Using that string value, it posts it to the selected website.
23 |     Then, it downloads the ensuing pdf file that appears as a result of that query.
24 |     """
25 | 
26 |     def download(self, search_text: str):
27 |         """The download method generates a session and a payload that gets posted as a search query to the website.
28 |         This search should return a pdf.
29 |         Once the search is found, it is parsed with BeautifulSoup.
30 |         Then, the link to download that pdf is isolated.
31 |         """
32 |         self.sessions = requests.Session()
33 |         print(
34 |             f"[sciscraper]: Delving too greedily and too deep for download links for {search_text}, by means of dark and arcane magicks.",
35 |             end="\r",
36 |         )
37 |         self.payload = {"request": f"{search_text}"}
38 |         with change_dir(self.research_dir):
39 |             time.sleep(1)
40 |             with suppress(
41 |                 requests.exceptions.HTTPError, requests.exceptions.RequestException
42 |             ):
43 |                 r = self.sessions.post(url=self.scihub_url, data=self.payload)
44 |                 r.raise_for_status()
45 |                 log_msg(str(r.status_code))
46 |                 soup = BeautifulSoup(r.text, "lxml")
47 |                 self.links = list(
48 |                     ((item["onclick"]).split("=")[1]).strip("'")
49 |                     for item in soup.select("button[onclick^='location.href=']")
50 |                 )
51 |                 self.enrich_scrape(search_text)
52 | 
53 |     def enrich_scrape(self, search_text: str):
54 |         """With the link to download isolated, it is followed and thereby downloaded.
55 |         It is sent as bytes to a temporary text file, as a middleman of sorts.
56 |         The temporary text file is then used as a basis to generate a new pdf.
57 |         The temporary text file is then deleted in preparation for the next pdf.
58 |         """
59 |         now = datetime.now()
60 |         date = now.strftime("%y%m%d")
61 |         for link in self.links:
62 |             paper_url = f"{link}=true"
63 |             paper_title = f'{date}_{search_text.replace("/","")}.pdf'
64 |             time.sleep(1)
65 |             paper_content = (
66 |                 requests.get(paper_url, stream=True, allow_redirects=True)
67 |             ).content
68 |             with open("temp_file.txt", "wb") as _tempfile:
69 |                 _tempfile.write(paper_content)
70 |             with open(paper_title, "wb") as file:
71 |                 for line in open("temp_file.txt", "rb").readlines():
72 |                     file.write(line)
73 |             os.remove("temp_file.txt")
74 | 
--------------------------------------------------------------------------------
/after/scrape/scraper.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Protocol
3 | 
4 | 
5 | @dataclass
6 | class ScrapeResult:
7 |     DOI: str
8 |     wordscore: int
9 |     frequency: list[tuple[str, int]]
10 |     study_design: list[tuple[str, int]]
11 | 
12 | 
13 | class Scraper(Protocol):
14 |     def scrape(self, search_text: str) -> ScrapeResult:
15 |         ...
16 | 
--------------------------------------------------------------------------------
/after/test_pdf.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from scrape.pdf import compute_filtered_tokens
4 | 
5 | 
6 | class TestPdfScraper(unittest.TestCase):
7 |     def test_filtered_tokens_empty(self):
8 |         self.assertEqual(len(compute_filtered_tokens([])), 0)
9 | 
10 |     def test_filtered_tokens_stop(self):
11 |         tokens = ["please like and subscribe"]
12 |         filtered_tokens = compute_filtered_tokens(tokens)
13 |         print(tokens)
14 |         print(filtered_tokens)
15 |         self.assertEqual(filtered_tokens, {"please", "like", "subscribe"})
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     unittest.main()
20 | 
--------------------------------------------------------------------------------
/before/main.py:
--------------------------------------------------------------------------------
1 | r"""This module looks up bibliographic information from directories, csv files and pandas databases.
2 | 
3 | From a .csv file input, you can get:
4 | - a folder of .pdf downloads (experimental).
5 | - a pandas dataframe 6 | - another .csv file 7 | 8 | From a pandas dataframe file input, you can get: 9 | - a folder of .pdf downloads (experimental). 10 | - another pandas dataframe, but the citations of the input are each returned with their own citation information. 11 | 12 | From a folder input, you can get: 13 | - a pandas dataframe 14 | - a .csv file 15 | """ 16 | 17 | # ============================================== 18 | # SPECIAL THANKS 19 | # 20 | # Michele Cotrufo 21 | # Nathan Lippi 22 | # Jon Watson Rooney 23 | # Colin Meret 24 | # ArjanCodes 25 | # James Murphy 26 | # Micael Jarniac 27 | # 28 | # ----------------Maintainer---------------- 29 | # John Fallot 30 | # 31 | # ----------------License---------------- 32 | # The MIT License [https://opensource.org/licenses/MIT] 33 | # Copyright (c) 2021 John Fallot 34 | # ============================================== 35 | 36 | # ============================================== 37 | # IMPORTS 38 | # ============================================== 39 | 40 | ## File Structure Related Imports 41 | import __future__ 42 | 43 | __version__ = "1.01" 44 | __author__ = "John Fallot" 45 | 46 | import datetime 47 | import json 48 | import logging 49 | import os 50 | import random 51 | import re 52 | import time 53 | from contextlib import contextmanager, suppress 54 | from fnmatch import fnmatch 55 | from json.decoder import JSONDecodeError 56 | from os import PathLike, listdir, path 57 | from os.path import isdir 58 | from typing import Optional 59 | 60 | import pandas as pd 61 | 62 | ## Language Processing Related Imports 63 | import pdfplumber 64 | 65 | ## Scraping Related Imports 66 | import requests 67 | from bs4 import BeautifulSoup 68 | from nltk import FreqDist 69 | from nltk.corpus import names, stopwords 70 | from nltk.tokenize import word_tokenize 71 | from requests.exceptions import HTTPError, RequestException 72 | from tqdm import tqdm 73 | 74 | # ============================================== 75 | # CONFIGS 76 | # ============================================== 77 | 78 | now = datetime.datetime.now() 79 | date = now.strftime("%y%m%d") 80 | export_dir = os.path.realpath("PDN Scraper Exports") 81 | msg_error_1 = "[sciscraper]: HTTP Error Encountered, moving to next available object. Reason Given:" 82 | 83 | logging.basicConfig( 84 | filename=f"{date}_scraper.log", 85 | level=logging.DEBUG, 86 | format="%(asctime)s - %(message)s", 87 | datefmt="%d-%b-%y %H:%M:%S", 88 | ) 89 | 90 | PRIME_SRC = os.path.realpath("211001_PDN_studies_9.csv") 91 | URL_DMNSNS = "https://app.dimensions.ai/discover/publication/results.json" 92 | RESEARCH_DIR = os.path.realpath(f"{date}_PDN Research Papers From Scrape") 93 | URL_SCIHUB = "https://sci-hubtw.hkvisa.net/" 94 | 95 | # ============================================== 96 | # SCRAPE RELATED CLASSES & SUBCLASSES 97 | # ============================================== 98 | 99 | 100 | class ScrapeRequest: 101 | """The abstraction of the program's web scraping requests, which dynamically returns its appropriate subclasses based on the provided inputs.""" 102 | 103 | _registry = {} 104 | 105 | def __init_subclass__(cls, slookup_code, **kwargs): 106 | super().__init_subclass__(**kwargs) 107 | cls._registry[slookup_code] = cls 108 | 109 | def __new__(cls, s_bool: bool): 110 | """The ScrapeRequest class looks for the boolean value passed to it from the FileRequest class. 111 | A value of True, or 1, would return a SciHubScrape subclass. 112 | Whereas a value of False, of 0, would return a JSONScrape subclass. 
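        For example (illustrative calls):

            ScrapeRequest(True)   # dispatches to SciHubScrape via the "sci" registry key
            ScrapeRequest(False)  # dispatches to JSONScrape via the "json" registry key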
113 | """ 114 | if not isinstance(s_bool, bool): 115 | raise TypeError 116 | if s_bool: 117 | slookup_code = "sci" 118 | else: 119 | slookup_code = "json" 120 | 121 | subclass = cls._registry[slookup_code] 122 | 123 | obj = object.__new__(subclass) 124 | return obj 125 | 126 | def download(self) -> None: 127 | raise NotImplementedError 128 | 129 | 130 | class SciHubScrape(ScrapeRequest, slookup_code="sci"): 131 | """The SciHubScrape class takes the provided string from a prior list comprehension. 132 | Using that string value, it posts it to the selected website. 133 | Then, it downloads the ensuing pdf file that appears as a result of that query. 134 | """ 135 | 136 | def download(self, search_text: str): 137 | """The download method generates a session and a payload that gets posted as a search query to the website. 138 | This search should return a pdf. 139 | Once the search is found, it is parsed with BeautifulSoup. 140 | Then, the link to download that pdf is isolated. 141 | """ 142 | self.sessions = requests.Session() 143 | self.base_url = URL_SCIHUB 144 | print( 145 | f"[sciscraper]: Delving too greedily and too deep for download links for {search_text}, by means of dark and arcane magicx.", 146 | end="\r", 147 | ) 148 | self.payload = {"request": f"{search_text}"} 149 | with change_dir(RESEARCH_DIR): 150 | time.sleep(1) 151 | with suppress( 152 | requests.exceptions.HTTPError, requests.exceptions.RequestException 153 | ): 154 | r = self.sessions.post(url=self.base_url, data=self.payload) 155 | r.raise_for_status() 156 | logging.info(r.status_code) 157 | soup = BeautifulSoup(r.text, "lxml") 158 | self.links = list( 159 | ((item["onclick"]).split("=")[1]).strip("'") 160 | for item in soup.select("button[onclick^='location.href=']") 161 | ) 162 | self.enrich_scrape() 163 | 164 | def enrich_scrape(self, search_text: str): 165 | """With the link to download isolated, it is followed and thereby downloaded. 166 | It is sent as bytes to a temporary text file, as a middleman of sorts. 167 | The temporary text file is then used as a basis to generate a new pdf. 168 | The temporary text file is then deleted in preparation for the next pdf. 169 | """ 170 | for link in self.links: 171 | paper_url = f"{link}=true" 172 | paper_title = f'{date}_{search_text.replace("/","")}.pdf' 173 | time.sleep(1) 174 | paper_content = ( 175 | requests.get(paper_url, stream=True, allow_redirects=True) 176 | ).content 177 | with open("temp_file.txt", "wb") as _tempfile: 178 | _tempfile.write(paper_content) 179 | with open(paper_title, "wb") as file: 180 | for line in open("temp_file.txt", "rb").readlines(): 181 | file.write(line) 182 | os.remove("temp_file.txt") 183 | 184 | 185 | class JSONScrape(ScrapeRequest, slookup_code="json"): 186 | """The JSONScrape class takes the provided string from a prior list comprehension. 187 | Using that string value, it gets the resulting JSON data, parses it, and then returns a dictionary, which gets appended to a list. 188 | """ 189 | 190 | def download(self, search_text: str) -> dict: 191 | """The download method generates a session and a querystring that gets sent to the website. This returns a JSON entry. 192 | The JSON entry is loaded and specific values are identified for passing along, back to a dataframe. 
193 | """ 194 | self.sessions = requests.Session() 195 | self.search_field = self.specify_search(search_text) 196 | self.base_url = URL_DMNSNS 197 | print( 198 | f"[sciscraper]: Searching for {search_text} via a {self.search_field}-style search.", 199 | end="\r", 200 | ) 201 | querystring = { 202 | "search_mode": "content", 203 | "search_text": f"{search_text}", 204 | "search_type": "kws", 205 | "search_field": f"{self.search_field}", 206 | } 207 | time.sleep(1) 208 | 209 | try: 210 | r = self.sessions.get(self.base_url, params=querystring) 211 | r.raise_for_status() 212 | logging.info(r.status_code) 213 | self.docs = json.loads(r.text)["docs"] 214 | 215 | except (JSONDecodeError, RequestException) as e: 216 | print( 217 | f"\n[sciscraper]: An error occurred while searching for {search_text}.\ 218 | \n\[sciscraper]: Proceeding to next item in sequence.\ 219 | Cause of error: {e}\n" 220 | ) 221 | pass 222 | 223 | except HTTPError as f: 224 | print( 225 | f"\n[sciscraper]: Access to {self.base_url} denied while searching for {search_text}.\ 226 | \n[sciscraper]: Terminating sequence. Cause of error: {f}\ 227 | \n" 228 | ) 229 | quit() 230 | 231 | for item in self.docs: 232 | self.data = self.get_data_entry( 233 | item, 234 | keys=[ 235 | "title", 236 | "author_list", 237 | "publisher", 238 | "pub_date", 239 | "doi", 240 | "id", 241 | "abstract", 242 | "acknowledgements", 243 | "journal_title", 244 | "volume", 245 | "issue", 246 | "times_cited", 247 | "mesh_terms", 248 | "cited_dimensions_ids", 249 | ], 250 | ) 251 | return self.data 252 | 253 | def specify_search(self, search_text: str) -> str: 254 | """Determines whether the dimensions.ai query will be for a full_search or just for the doi.""" 255 | if search_text.startswith("pub"): 256 | self.search_field = "full_search" 257 | else: 258 | self.search_field = "doi" 259 | return self.search_field 260 | 261 | def get_data_entry(self, item, keys: Optional[list]) -> dict: 262 | """Based on a provided list of keys and items in the JSON data, 263 | generates a dictionary entry. 264 | """ 265 | return {_key: item.get(_key, "") for _key in keys} 266 | 267 | 268 | class PDFScrape: 269 | """The PDFScrape class takes the provided string from a prior list 270 | comprehension of PDF files in a directory. From each pdf file, it parses the document 271 | and returns metrics about its composition and relevance. 272 | """ 273 | 274 | def download(self, search_text: str) -> dict: 275 | self.search_text = search_text 276 | self.preprints = [] 277 | with pdfplumber.open(self.search_text) as self.study: 278 | self.n = len(self.study.pages) 279 | self.pages_to_check = [page for page in self.study.pages][: self.n] 280 | for page_number, page in enumerate(self.pages_to_check): 281 | page = self.study.pages[page_number].extract_text( 282 | x_tolerance=3, y_tolerance=3 283 | ) 284 | print( 285 | f"[sciscraper]: Processing Page {page_number} of {self.n-1} | {search_text}...", 286 | end="\r", 287 | ) 288 | self.preprints.append( 289 | page 290 | ) # Each page's string gets appended to preprint [] 291 | 292 | self.manuscripts = [ 293 | str(preprint).strip().lower() for preprint in self.preprints 294 | ] 295 | # The preprints are stripped of extraneous characters and all made lower case. 296 | self.postprints = [ 297 | re.sub(r"\W+", " ", manuscript) for manuscript in self.manuscripts 298 | ] 299 | # The ensuing manuscripts are stripped of lingering whitespace and non-alphanumeric characters. 
300 | self.all_words = self.get_tokens() 301 | self.research_word_overlap = self.get_research_words() 302 | return self.get_data_entry() 303 | 304 | def get_tokens(self) -> list: 305 | """Takes a lowercase string, now removed of its non-alphanumeric characters. 306 | It returns (as a list comprehension) a parsed and tokenized 307 | version of the postprint, with stopwords and names removed. 308 | """ 309 | self.stop_words = set(stopwords.words("english")) 310 | self.name_words = set(names.words()) 311 | self.word_tokens = word_tokenize(str(self.postprints)) 312 | return [ 313 | w for w in self.word_tokens if not w in self.stop_words and self.name_words 314 | ] # Filters out the stopwords 315 | 316 | def _overlap(self, li) -> list: 317 | """Checks if token words match words in a provided list.""" 318 | return [w for w in li if w in self.all_words] 319 | 320 | def get_target_words(self): 321 | """Checks for words that match the user's primary query.""" 322 | self.target_words = [ 323 | "prosocial", 324 | "design", 325 | "intervention", 326 | "reddit", 327 | "humane", 328 | "social media", 329 | "user experience", 330 | "nudge", 331 | "choice architecture", 332 | "user interface", 333 | "misinformation", 334 | "disinformation", 335 | "Trump", 336 | "conspiracy", 337 | "dysinformation", 338 | "users", 339 | "Thaler", 340 | "Sunstein", 341 | "boost", 342 | ] 343 | self.target_word_overlap = self._overlap(self.target_words) 344 | return self.target_word_overlap 345 | 346 | def get_bycatch_words(self): 347 | """Checks for words that often occur in conjunction with the 348 | user's primary query, but are deemed undesirable. 349 | """ 350 | self.bycatch_words = [ 351 | "psychology", 352 | "pediatric", 353 | "pediatry", 354 | "autism", 355 | "mental", 356 | "medical", 357 | "oxytocin", 358 | "adolescence", 359 | "infant", 360 | "health", 361 | "wellness", 362 | "child", 363 | "care", 364 | "mindfulness", 365 | ] 366 | self.bycatch_word_overlap = self._overlap(self.bycatch_words) 367 | return self.bycatch_word_overlap 368 | 369 | def get_research_words(self): 370 | """Checks for words that correspond to specific experimental designs.""" 371 | self.research_words = [ 372 | "big data", 373 | "data", 374 | "analytics", 375 | "randomized controlled trial", 376 | "RCT", 377 | "moderation", 378 | "community", 379 | "social media", 380 | "conversational", 381 | "control", 382 | "randomized", 383 | "systemic", 384 | "analysis", 385 | "thematic", 386 | "review", 387 | "study", 388 | "case series", 389 | "case report", 390 | "double blind", 391 | "ecological", 392 | "survey", 393 | ] 394 | self.research_word_overlap = self._overlap(self.research_words) 395 | return self.research_word_overlap 396 | 397 | def get_wordscore(self) -> int: 398 | """Returns a score, which is the number of target words minus the number of undesirable words. 399 | A positive score suggests that the paper is more likely than not to be a match. 400 | A negative score suggests that the paper is likely to be unrelated to the user's primary query.""" 401 | return len(self.get_target_words()) - len(self.get_bycatch_words()) 402 | 403 | def get_doi(self) -> str: 404 | """Approximates a possible DOI, assuming the file is saved in YYMMDD_DOI.pdf format.""" 405 | self.getting_doi = path.basename(self.search_text) 406 | self.doi = self.getting_doi[7:-4] 407 | self.doi = self.doi[:7] + "/" + self.doi[7:] 408 | return self.doi 409 | 410 | def get_data_entry(self) -> dict: 411 | """Returns a dictionary entry. 
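        A returned entry has this shape (values invented for illustration):

            {"DOI": "10.1000/demo123", "wordscore": 2,
             "frequency": [("data", 31), ("media", 17), ...],
             "study_design": [("survey", 6), ...]}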
Ideally, this will someday work through a DataEntry class.""" 412 | self.data = { 413 | "DOI": self.get_doi(), 414 | "wordscore": self.get_wordscore(), 415 | "frequency": FreqDist(self.all_words).most_common(5), 416 | "study_design": FreqDist(self.research_word_overlap).most_common(3), 417 | } 418 | 419 | return self.data 420 | 421 | 422 | # ============================================== 423 | # CONTEXT MANAGER METACLASS 424 | # ============================================== 425 | 426 | 427 | @contextmanager 428 | def change_dir(destination: str): 429 | """Sets a destination for exported files.""" 430 | try: 431 | __dest = os.path.realpath(destination) 432 | cwd = os.getcwd() 433 | if not os.path.exists(__dest): 434 | os.mkdir(__dest) 435 | os.chdir(__dest) 436 | yield 437 | finally: 438 | os.chdir(cwd) 439 | 440 | 441 | # ============================================== 442 | # FILE REQUEST CLASSES & SUBCLASSES 443 | # ============================================== 444 | 445 | 446 | class FileRequest: 447 | """The abstraction of the program's input file classes. 448 | It dynamically returns its appropriate subclasses based on the provided inputs. 449 | """ 450 | 451 | _registry = {} 452 | 453 | def __init_subclass__(cls, dlookup_code, **kwargs): 454 | super().__init_subclass__(**kwargs) 455 | cls._registry[dlookup_code] = cls 456 | 457 | def __new__(cls, target, slookup_key: bool = None): 458 | if isdir(target): 459 | dlookup_code = "fold" 460 | elif str(target).endswith("csv"): 461 | dlookup_code = "doi" 462 | elif isinstance(target, pd.DataFrame): 463 | dlookup_code = "pub" 464 | else: 465 | raise Exception("[sciscraper]: Invalid prefix detected.") 466 | 467 | subclass = cls._registry[dlookup_code] 468 | 469 | obj = object.__new__(subclass) 470 | obj.target = target 471 | obj.slookup_key = slookup_key 472 | obj.scraper = ScrapeRequest(slookup_key) 473 | return obj 474 | 475 | def fetch_terms(self) -> None: 476 | raise NotImplementedError 477 | 478 | 479 | class DOIRequest(FileRequest, dlookup_code="doi"): 480 | """The DOIRequest class takes a csv and generates a list comprehension. 481 | The list comprehension is scraped, and then returns a DataFrame. 482 | """ 483 | 484 | def __init__(self, target: str, slookup_key: bool = False): 485 | self.target = target 486 | self.slookup_key = slookup_key 487 | self.scraper = ScrapeRequest(self.slookup_key) 488 | 489 | def fetch_terms(self): 490 | print(f"\n[sciscraper]: Getting entries from file: {self.target}") 491 | with open(self.target, newline="") as f: 492 | self.df = [doi for doi in pd.read_csv(f, usecols=["DOI"])["DOI"]] 493 | self.search_terms = [ 494 | search_text for search_text in self.df if search_text is not None 495 | ] 496 | return pd.DataFrame( 497 | [ 498 | self.scraper.download(search_text) 499 | for search_text in tqdm(self.search_terms) 500 | ] 501 | ) 502 | 503 | 504 | class PubIDRequest(FileRequest, dlookup_code="pub"): 505 | """The PubIDRequest class takes a DataFrame and generates a list comprehension. 506 | The list comprehension is scraped, and then returns a DataFrame. 507 | """ 508 | 509 | def __init__(self, target: pd.DataFrame, slookup_key: bool = False): 510 | if slookup_key: 511 | print( 512 | "\n[sciscraper]: Getting Pub IDs from dataframe to download from web..." 513 | ) 514 | else: 515 | print( 516 | "\n[sciscraper]: Expounding upon existing PubIDs to generate a new dataframe..." 
517 | ) 518 | self.target = target 519 | self.slookup_key = slookup_key 520 | self.scraper = ScrapeRequest(self.slookup_key) 521 | 522 | def fetch_terms(self): 523 | self.df = self.target.explode("cited_dimensions_ids", "title") 524 | self.search_terms = ( 525 | search_text 526 | for search_text in self.df["cited_dimensions_ids"] 527 | if search_text is not None 528 | ) 529 | self.src_title = pd.Series(self.df["title"]) 530 | 531 | return pd.DataFrame( 532 | [ 533 | self.scraper.download(search_text) 534 | for search_text in tqdm(list(self.search_terms)) 535 | ] 536 | ).join(self.src_title) 537 | 538 | 539 | class FolderRequest(FileRequest, dlookup_code="fold"): 540 | """ 541 | The Folder class takes a directory and generates a list comprehension. 542 | The list comprehension is scraped, and then returns a DataFrame. 543 | Unlike other classes, it cannot undergo a SciScrape. 544 | """ 545 | 546 | def __init__(self, target: PathLike[str], slookup_key: bool = False): 547 | print(f"\n[sciscraper]: Getting files from folder: {target}") 548 | self.target = target 549 | if self.slookup_key: 550 | raise Exception( 551 | "This action is prohibited. \ 552 | You already have the files that this query would return." 553 | ) 554 | self.slookup_key = slookup_key 555 | self.scraper = PDFScrape() 556 | 557 | def fetch_terms(self): 558 | self.search_terms = [ 559 | path.join(self.target, file) 560 | for file in listdir(self.target) 561 | if fnmatch(path.basename(file), "*.pdf") 562 | ] 563 | return pd.DataFrame( 564 | [self.scraper.download(file) for file in tqdm(self.search_terms)] 565 | ) 566 | 567 | 568 | # ============================================== 569 | # EXPORTING, MAIN LOOP, AND MISCELLANY 570 | # ============================================== 571 | 572 | 573 | def export(dataframe: Optional[pd.DataFrame]): 574 | with change_dir(export_dir): 575 | print_id = random.randint(0, 100) 576 | export_name = f"{date}_DIMScrape_Refactor_{print_id}.csv" 577 | msg_spreadsheetexported = f"\n[sciscraper]: A spreadsheet was exported as {export_name} in {export_dir}.\n" 578 | dataframe.to_csv(export_name) 579 | print(dataframe.head()) 580 | logging.info(msg_spreadsheetexported) 581 | print(msg_spreadsheetexported) 582 | 583 | 584 | def main(): 585 | start = time.perf_counter() 586 | file_request = FileRequest(target="../papers", slookup_key=False) 587 | print(file_request.__class__.__name__) 588 | # results = file_request.fetch_terms() 589 | # export(results) 590 | elapsed = time.perf_counter() - start 591 | msg_timestamp = f"\n[sciscraper]: Extraction finished in {elapsed} seconds.\n" 592 | logging.info(msg_timestamp) 593 | print(msg_timestamp) 594 | quit() 595 | 596 | 597 | if __name__ == "__main__": 598 | main() # %% 599 | -------------------------------------------------------------------------------- /papers/Amygdala structure and the tendency to regard the social system as legitimate and desirable.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/Amygdala structure and the tendency to regard the social system as legitimate and desirable.pdf -------------------------------------------------------------------------------- /papers/An Ideological Asymmetry in the Diffusion of Moralized Content on Social Media Among Political Leaders.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/An Ideological Asymmetry in the Diffusion of Moralized Content on Social Media Among Political Leaders.pdf -------------------------------------------------------------------------------- /papers/Anyone Can Become a Troll_ Causes of Trolling Behavior in Online Discussions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/Anyone Can Become a Troll_ Causes of Trolling Behavior in Online Discussions.pdf -------------------------------------------------------------------------------- /papers/Association of an Educational Program in Mindful Communication With Burnout, Empathy, and Attitudes Among Primary Care Physicians.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/Association of an Educational Program in Mindful Communication With Burnout, Empathy, and Attitudes Among Primary Care Physicians.pdf -------------------------------------------------------------------------------- /papers/Attentional capture helps explain why moral and emotional content go viral.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ArjanCodes/2021-coderoast-scrape/22719bb71af11a88f410802697a4a29b316163f8/papers/Attentional capture helps explain why moral and emotional content go viral.docx -------------------------------------------------------------------------------- /words/bycatch.txt: -------------------------------------------------------------------------------- 1 | psychology 2 | pediatric 3 | pediatry 4 | autism 5 | mental 6 | medical 7 | oxytocin 8 | adolescence 9 | infant 10 | health 11 | wellness 12 | child 13 | care 14 | mindfulness -------------------------------------------------------------------------------- /words/research.txt: -------------------------------------------------------------------------------- 1 | big data 2 | data 3 | analytics 4 | randomized controlled trial 5 | RCT 6 | moderation 7 | community 8 | social media 9 | conversational 10 | control 11 | randomized 12 | systemic 13 | analysis 14 | thematic 15 | review 16 | study 17 | case series 18 | case report 19 | double blind 20 | ecological 21 | survey -------------------------------------------------------------------------------- /words/target.txt: -------------------------------------------------------------------------------- 1 | prosocial 2 | design 3 | intervention 4 | reddit 5 | humane 6 | social media 7 | user experience 8 | nudge 9 | choice architecture 10 | user interface 11 | misinformation 12 | disinformation 13 | Trump 14 | conspiracy 15 | dysinformation 16 | users 17 | Thaler 18 | Sunstein 19 | boost --------------------------------------------------------------------------------
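As a closing illustration of how the refactored pieces under after/ fit together, here is a hedged sketch that scores one of the bundled papers directly with PDFScraper, bypassing the folder walk in scrape/fetch.py. It assumes the snippet is run from inside after/ (so the relative paths in config.json resolve) and that the NLTK stopwords and names corpora are already installed; because the bundled papers do not follow the YYMMDD_DOI.pdf naming scheme that guess_doi expects, the DOI field of the result will be a meaningless guess. The snippet is not a file in the repository.

from scrape.config import read_config
from scrape.pdf import PDFScraper

config = read_config("./config.json")
scraper = PDFScraper(config.research_words, config.bycatch_words, config.target_words)
result = scraper.scrape(
    "../papers/Anyone Can Become a Troll_ Causes of Trolling Behavior in Online Discussions.pdf"
)
# ScrapeResult (see after/scrape/scraper.py) carries DOI, wordscore, frequency, study_design
print(result.wordscore, result.study_design)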