├── lib
│   └── .placeholder
├── docs
│   ├── corpus.png
│   └── uptod.png
├── MANIFEST.txt
├── vocab.json
├── requirements.txt
├── bin
│   ├── parsr_test.sh
│   ├── download_s3.py
│   ├── index_phrases.py
│   ├── parsr.py
│   ├── textrank_test.py
│   ├── sampleConfig.json
│   ├── parsed_json_interpreter.py
│   ├── phrase_extraction_pipeline.py
│   ├── extract_text.py
│   ├── parsr_output_interpreter.py
│   ├── upload_s3.py
│   ├── parsr_client.py
│   └── download_resources.py
├── .gitignore
├── DOWNLOAD.md
├── test.py
├── README.md
├── LICENSE
└── errors.txt

/lib/.placeholder:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/docs/corpus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Coleridge-Initiative/rclc/HEAD/docs/corpus.png
--------------------------------------------------------------------------------
/docs/uptod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Coleridge-Initiative/rclc/HEAD/docs/uptod.png
--------------------------------------------------------------------------------
/MANIFEST.txt:
--------------------------------------------------------------------------------
date: 2020-03-01
release: v1.0.8
uploaded_pdf: 1452
uploaded_json: 76
uploaded_txt: 1279
--------------------------------------------------------------------------------
/vocab.json:
--------------------------------------------------------------------------------
{
    "@language": "en",
    "adrf": "https://github.com/Coleridge-Initiative/adrf-onto/wiki/Vocabulary#",
    "cito": "http://purl.org/spar/cito/",
    "dct": "http://purl.org/dc/terms/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
}
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
GitPython >= 3.0.5
awscli >= 1.16.302
beautifulsoup4 >= 4.8.0
boto3 >= 1.10.38
html5lib >= 1.0.1
mypy >= 0.730
networkx >= 2.4
pdfminer.six == 20181108
pytextrank >= 2.0.0
ray >= 0.6.5
rdflib >= 4.2.2
rdflib-jsonld >= 0.4.0
requests-html >= 0.10.0
spacy >= 2.2.3
sxsdiff >= 0.3.0
tqdm >= 4.37.0
--------------------------------------------------------------------------------
/bin/parsr_test.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

PARSR=34.82.183.17:3001

curl -X GET \
    http://$PARSR/api/v1/queue/1ec3b910b86f2bb329684cd5763d56


exit 0

curl -X POST \
    http://$PARSR/api/v1/document \
    -H 'Content-Type: multipart/form-data' \
    -F 'file=@resources/pub/pdf/a6024f82cef41d533019.pdf;type=application/pdf' \
    -F 'config=@bin/sampleConfig.json;type=application/json'
--------------------------------------------------------------------------------
/bin/download_s3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

import boto3

# initialize access to the storage grid bucket
bucket_name = "richcontext"
bucket = boto3.resource("s3").Bucket(bucket_name)

# list up to N keys for files within our pseudo-directory
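# note: boto3 paginates this collection lazily, so iteration scales to
# large prefixes; bucket.objects.filter(Prefix=prefix).limit(10) would be
# an equivalent, more concise way to cap the listing than the manual
# countdown below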
prefix = "corpus_docs" 12 | limit = 10 13 | 14 | for obj in bucket.objects.filter(Prefix=prefix): 15 | if limit < 1: 16 | break 17 | else: 18 | print(obj.key) 19 | limit -= 1 20 | 21 | # show an example of how to download a specific file 22 | local_file = "001966ac583b67a965cf.json" 23 | key = prefix + "/pub/json/" + local_file 24 | 25 | bucket.download_file(key, local_file) 26 | 27 | -------------------------------------------------------------------------------- /bin/index_phrases.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | from collections import OrderedDict 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | import codecs 8 | import json 9 | import pytextrank 10 | import ray 11 | import spacy 12 | import sys 13 | 14 | 15 | @ray.remote 16 | def extract_phrases (txt_file, dir_path, nlp): 17 | tr_path = dir_path / "tr" 18 | 19 | with codecs.open(txt_file, "r", encoding="utf8") as f: 20 | text = f.read() 21 | doc = nlp(text) 22 | view = OrderedDict() 23 | 24 | for phrase in doc._.phrases[:20]: 25 | view[phrase.text] = { "count": phrase.count, "rank_score": phrase.rank } 26 | 27 | file_name = txt_file.stem + ".json" 28 | tr_file = tr_path / file_name 29 | 30 | with codecs.open(tr_file, "wb", encoding="utf8") as f: 31 | json.dump(view, f, indent=4, ensure_ascii=False) 32 | 33 | 34 | def main (): 35 | nlp = spacy.load("en_core_web_sm") 36 | tr = pytextrank.TextRank(logger=None) 37 | nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True) 38 | 39 | dir_path = Path("resources/pub") 40 | txt_path = dir_path / "txt" 41 | 42 | task_ids = [] 43 | ray.init() 44 | 45 | for txt_file in tqdm(list(txt_path.glob(f"*txt")), ascii=True, desc=f"extracted text files"): 46 | id = extract_phrases.remote(txt_file, dir_path, nlp) 47 | task_ids.append(id) 48 | 49 | ray.get(task_ids) 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .DS_Store 3 | 4 | example/pub/tr 5 | tmp.ttl 6 | tmp.jsonld 7 | todo.tsv 8 | resources/ 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # Environments 94 | .env 95 | .venv 96 | env/ 97 | venv/ 98 | ENV/ 99 | env.bak/ 100 | venv.bak/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | -------------------------------------------------------------------------------- /bin/parsr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | from parsed_json_interpreter import mkdir 5 | from parsr_client import ParserClient 6 | from pathlib import Path 7 | import codecs 8 | import json 9 | import os 10 | import sys 11 | import traceback 12 | 13 | 14 | def Convert (base_path=".", force=False): 15 | config_path = Path(base_path) / "bin/sampleConfig.json" 16 | 17 | pub_dir = Path(base_path) / "resources/pub" 18 | 19 | json_dir = pub_dir / "json" 20 | mkdir(json_dir) 21 | 22 | txt_dir = pub_dir / "txt" 23 | mkdir(txt_dir) 24 | 25 | pdf_dir = pub_dir / "pdf" 26 | 27 | for pdf_file in list(pdf_dir.glob("*.pdf")): 28 | json_file = pdf_file.stem + ".json" 29 | json_path = json_dir / json_file 30 | 31 | if json_path.exists() and not force: 32 | # ignore the PDFs that were already parsed 33 | continue 34 | 35 | # send document to Parsr server for processing 36 | try: 37 | print(f"parsing {pdf_file}") 38 | 39 | job = parsr.send_document( 40 | file=pdf_file.as_posix(), 41 | config=config_path.as_posix(), 42 | wait_till_finished=True, 43 | save_request_id=True, 44 | ) 45 | 46 | # output the full results in JSON 47 | with codecs.open(json_path, "wb", encoding="utf8") as f: 48 | json.dump(parsr.get_json(), f, indent=2, ensure_ascii=False) 49 | 50 | # output the raw text 51 | txt_file = pdf_file.stem + ".txt" 52 | txt_path = txt_dir / txt_file 53 | 54 | with codecs.open(txt_path, "wb", encoding="utf8") as f: 55 | f.write(parsr.get_text()) 56 | 57 | except: 58 | traceback.print_exc() 59 | 60 | 61 | if __name__ == "__main__": 62 | if len(sys.argv) < 2: 63 | print("usage: parsr.py host:port") 64 | sys.exit(-1) 65 | 66 | server = sys.argv[1] 67 | path = os.path.dirname(os.path.dirname(__file__)) 68 | 69 | print(f"using Parsr server {server}") 70 | print(f"save to path {path}") 71 | 72 | parsr = ParserClient(server) 73 | Convert(path) 74 | -------------------------------------------------------------------------------- /DOWNLOAD.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | To install the Python library dependencies: 4 | 5 | ``` 6 | pip install -r requirements.txt 7 | ``` 8 | 9 | 10 | ## Download Parsed PDFs 11 | 12 | The 
`bin/download_s3.py` script provides example code for downloading
PDF files (open access publications) and JSON files (extracted text)
from the public S3 bucket.


## Collecting Open Access PDFs

**For those on the NYU-CI team who update the corpus:**

Download the corpus PDFs and other resource files:

```
python bin/download_resources.py --logger errors.txt
```

The PDF files get stored in the `resources/pub/pdf` subdirectory.


## Extract text from PDFs

We use `science-parse` to extract text from research publications.
Download the latest `science-parse-cli-assembly-*.jar` from the
`science-parse` project releases,
and copy that JAR file into the `lib/` subdirectory.

Then run the `science-parse` CLI to extract text from the PDF files,
and be sure to use the correct version number for the JAR that you
downloaded:

```
mkdir -p resources/pub/json
SPJAR=lib/science-parse-cli-assembly-2.0.3.jar
java -jar $SPJAR -o ./resources/pub/json ./resources/pub/pdf
```

That command will download multiple resources from the Allen AI public
datastore, which may take several minutes.

TODO: replace this step with use of a containerized `SPv2` server.


## Upload PDF and JSON files

**For those on the NYU-CI team who update the corpus:**

Upload the PDF files (open access publications) and JSON files
(extracted text) to the public S3 bucket:

```
python bin/upload_s3.py
```


## S3 Bucket Specs

View the public AWS S3 Bucket `richcontext` online:

-
-

The directory structure of the public S3 bucket is similar to the
directory structure used for resources in this repo:

- richcontext
    - corpus_docs
        - pdf
        - json
--------------------------------------------------------------------------------
/bin/textrank_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8


from phrase_extraction_pipeline import setup, extract_phrases
from parsed_json_interpreter import ParsedJsonInterpreter
from pathlib import Path
import pytextrank
import codecs
import json
import spacy
import unittest


class TestVerifyTextRank (unittest.TestCase):
    EXAMPLE_TITLE = [
        "trajGANs: Using generative adversarial networks for geo-privacy protection of trajectory data (Vision paper) ",
        "1 Introduction and motivation ",
        "2 Trajectory types and data generation scenarios ",
        "3 The trajGANs framework ",
        "5 Conclusions and Discussion ",
        "References "
    ]


    EXAMPLE_TEXTRANK = [
        ["generative adversarial networks", "real data", "xi hanzhou chen1"],
        ["real data", "data", "trajectory data"],
        ["place- based trajectories", "synthetic trajectories", "human trajectories"],
        ["place- based trajectories", "synthetic trajectory samples", "synthetic trajectories"],
        ["real data", "original data", "pre-calculated statistical metrics"],
        ["generative adversarial networks", "deep convolutional generative adversarial networks", "s."]
    ]


    def setUp (self):
        '''run the example file'''
        nlp, resource_path = setup(testing=True)
        extract_phrases(nlp, resource_path, limit_keyphrase=3, verbose=False)

        tr_path = resource_path / "tr"
        tr_file =
tr_path / "PE_Example.json" 43 | 44 | with codecs.open(tr_file, "r", encoding="utf8") as f: 45 | self.example_file = json.load(f) 46 | 47 | 48 | def test_key_phrases (self): 49 | for i, section in enumerate(self.example_file): 50 | for j, textrank in enumerate(section["text_rank"]): 51 | self.assertTrue(textrank == self.EXAMPLE_TEXTRANK[i][j]) 52 | 53 | 54 | def test_section_titles (self): 55 | for i, section in enumerate(self.example_file): 56 | self.assertTrue(section["section_title"] == self.EXAMPLE_TITLE[i]) 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | 62 | 63 | -------------------------------------------------------------------------------- /bin/sampleConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 0.5, 3 | "extractor": { 4 | "pdf": "pdfminer", 5 | "img": "tesseract", 6 | "language": ["eng", "fra"] 7 | }, 8 | "cleaner": [ 9 | "out-of-page-removal", 10 | [ 11 | "whitespace-removal", 12 | { 13 | "minWidth": 0 14 | } 15 | ], 16 | [ 17 | "redundancy-detection", 18 | { 19 | "minOverlap": 0.5 20 | } 21 | ], 22 | [ 23 | "table-detection", 24 | { 25 | "runConfig": [ 26 | { 27 | "pages": [], 28 | "flavor": "lattice" 29 | } 30 | ] 31 | } 32 | ], 33 | [ 34 | "header-footer-detection", 35 | { 36 | "ignorePages": [], 37 | "maxMarginPercentage": 15 38 | } 39 | ], 40 | [ 41 | "reading-order-detection", 42 | { 43 | "minVerticalGapWidth": 5, 44 | "minColumnWidthInPagePercent": 15 45 | } 46 | ], 47 | "link-detection", 48 | [ 49 | "words-to-line", 50 | { 51 | "lineHeightUncertainty": 0.2, 52 | "topUncertainty": 0.4, 53 | "maximumSpaceBetweenWords": 100, 54 | "mergeTableElements": false 55 | } 56 | ], 57 | [ 58 | "lines-to-paragraph", 59 | { 60 | "tolerance": 0.25 61 | } 62 | ], 63 | "heading-detection", 64 | "list-detection", 65 | "page-number-detection", 66 | "hierarchy-detection", 67 | [ 68 | "regex-matcher", 69 | { 70 | "isCaseSensitive": true, 71 | "isGlobal": true, 72 | "queries": [ 73 | { 74 | "label": "Car", 75 | "regex": "([A-Z]{2}\\-[\\d]{3}\\-[A-Z]{2})" 76 | }, 77 | { 78 | "label": "Age", 79 | "regex": "(\\d+)[ -]*(ans|jarige)" 80 | }, 81 | { 82 | "label": "Percent", 83 | "regex": "([\\-]?(\\d)+[\\.\\,]*(\\d)*)[ ]*(%|per|percent|pourcent|procent)" 84 | } 85 | ] 86 | } 87 | ] 88 | ], 89 | "output": { 90 | "granularity": "word", 91 | "includeMarginals": false, 92 | "formats": { 93 | "json": true, 94 | "text": true, 95 | "csv": true, 96 | "markdown": true, 97 | "pdf": false 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import json 5 | import networkx as nx 6 | import os 7 | import rdflib 8 | import sys 9 | import tempfile 10 | 11 | 12 | ###################################################################### 13 | ## NetworkX 14 | 15 | LABEL = {} 16 | 17 | def get_item_index (item): 18 | global LABEL 19 | item = str(item) 20 | 21 | if item not in LABEL: 22 | index = len(LABEL) 23 | LABEL[item] = index 24 | else: 25 | index = LABEL[item] 26 | 27 | return index 28 | 29 | 30 | def make_nxgraph (graph): 31 | g = nx.Graph() 32 | 33 | for s, p, o in graph: 34 | s_idx = get_item_index(s) 35 | o_idx = get_item_index(o) 36 | 37 | print(s_idx, str(s)) 38 | print(o_idx, str(o)) 39 | 40 | g.graph[s_idx] = str(s) 41 | g.graph[o_idx] = str(o) 42 | 43 | e = (s_idx, o_idx) 44 | g.add_edge(*e) 45 | 46 | 
g[s_idx][o_idx]["label"] = str(p) 47 | 48 | print(g.graph) 49 | 50 | 51 | def wrap_token (token): 52 | if token.startswith("http"): 53 | return "<{}>".format(token) 54 | else: 55 | return "\"{}\"".format(token) 56 | 57 | 58 | PREAMBLE = """ 59 | @base . 60 | 61 | @prefix cito: . 62 | @prefix dct: . 63 | @prefix foaf: . 64 | @prefix rdf: . 65 | @prefix xsd: . 66 | """ 67 | 68 | 69 | if __name__ == "__main__": 70 | # load the graph 71 | filename = sys.argv[1] 72 | graph = rdflib.Graph().parse(filename, format="n3") 73 | 74 | # enumerate all of the relations 75 | term = "dataset-11a95bfc951f7d23206a" 76 | out_triples = set([]) 77 | 78 | for s, p, o in graph: 79 | if s.endswith(term): 80 | out_triples.add((s, p, o,)) 81 | 82 | elif o.endswith(term): 83 | out_triples.add((s, p, o,)) 84 | 85 | ## write to in-memory file 86 | f = tempfile.NamedTemporaryFile(delete=False) 87 | f.write(PREAMBLE.encode("utf-8")) 88 | 89 | for s, p, o in out_triples: 90 | line = "{} {} {} .\n".format(wrap_token(s), wrap_token(p), wrap_token(o)) 91 | f.write(line.encode("utf-8")) 92 | 93 | f.close() 94 | 95 | # serialize the graph as JSON-LD 96 | with open("vocab.json", "r") as v: 97 | context = json.load(v) 98 | 99 | graph = rdflib.Graph().parse(f.name, format="n3") 100 | os.unlink(f.name) 101 | 102 | buf = graph.serialize(format="json-ld", context=context, indent=None) 103 | print(buf) 104 | 105 | 106 | -------------------------------------------------------------------------------- /bin/parsed_json_interpreter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import errno 5 | import os 6 | 7 | 8 | class ParsedJsonInterpreter (object): 9 | 10 | def __init__ (self, object): 11 | self.object = object 12 | 13 | 14 | def FindObject (self, object): 15 | """ 16 | Parse the JSON and convert it to text, divided by sections, 17 | and extract the title of the section 18 | """ 19 | texts = [] 20 | res = [] 21 | titles = [] 22 | 23 | for page in self.object['pages']: 24 | for element in page['elements']: 25 | try: 26 | if element['type'] == 'heading': 27 | title = self.GetText(element) 28 | titles.append(title) 29 | texts.append(res) 30 | res = [] 31 | 32 | if element['type'] in ['word', 'line', 'character', 'paragraph', 'heading', 'list']: 33 | res.append(element) 34 | except TypeError: 35 | continue 36 | 37 | texts.append(res) 38 | return texts[1:], titles 39 | 40 | 41 | def GetText (self, text_object): 42 | result = "" 43 | 44 | if text_object['type'] in ['paragraph','heading','list']: 45 | for i in text_object['content']: 46 | result += self.GetText(i) 47 | 48 | if text_object['type'] in ['line']: 49 | for i in text_object['content']: 50 | result += self.GetText(i) 51 | 52 | elif text_object['type'] in ['word']: 53 | if type(text_object['content']) is list: 54 | for i in text_object['content']: 55 | result += self.GetText(i) 56 | else: 57 | result += text_object['content'] 58 | result += ' ' 59 | 60 | elif text_object['type'] in ['character']: 61 | result += text_object['content'] 62 | 63 | return result 64 | 65 | 66 | def GetSectionalText (self, object): 67 | """ 68 | Get the text of a section 69 | """ 70 | text = "" 71 | sections = [] 72 | text_lists, titles = self.FindObject(object) 73 | 74 | for text_list in text_lists: 75 | for text_Obj in text_list: 76 | text += self.GetText(text_Obj) 77 | text += '\n\n' 78 | 79 | sections.append(text) 80 | text = "" 81 | 82 | return sections, titles 83 | 84 | 85 | def mkdir (path): 86 | """ 87 
    """
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
--------------------------------------------------------------------------------
/bin/phrase_extraction_pipeline.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

from parsed_json_interpreter import ParsedJsonInterpreter, mkdir
from pathlib import Path
import codecs
import json
import pytextrank
import spacy
import sys


def setup (base_path=".", testing=False):
    """
    add PyTextRank into the spaCy pipeline, then set up the input
    directory path for test vs. production env
    """
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank(logger=None)

    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    if testing:
        resource_path = Path(base_path) / "example/pub"
    else:
        resource_path = Path(base_path) / "resources/pub"

    return nlp, resource_path


def extract_phrases (nlp, resource_path, limit_keyphrase=15, verbose=True):
    """
    run PyTextRank on Parsr output to extract and rank key phrases
    """
    json_dir = resource_path / "json"

    if verbose:
        print(f"scanning input directory: {json_dir}")

    for parse_file in list(json_dir.glob("*.json")):
        if verbose:
            print(f"loading {parse_file}")

        with codecs.open(parse_file, "r", encoding="utf8") as f:
            parsr_object = json.load(f)

        # parse the JSON and convert it to text, divided by sections,
        # then extract the title of each section
        parsr_interpreter = ParsedJsonInterpreter(parsr_object)
        sections, titles = parsr_interpreter.GetSectionalText(parsr_object)

        # run TextRank and collect the ranked keyphrases
        results = []

        for i, section in enumerate(sections):
            doc = nlp(section)
            phrases = {}
            final = {}

            for phrase in doc._.phrases[:limit_keyphrase]:
                phrases[phrase.text] = {"count": phrase.count, "rank_score": phrase.rank}

            final["section_title"] = titles[i]
            final["text_rank"] = phrases
            results.append(final)

            if verbose:
                print("section: {}".format(final["section_title"]))

        # output the ranked results to JSON
        tr_path = resource_path / "tr"
        mkdir(tr_path)

        tr_file = parse_file.stem + ".json"
        output_path = tr_path / tr_file

        with codecs.open(output_path, "wb", encoding="utf8") as f:
            json.dump(results, f, indent=4, ensure_ascii=False)

        if verbose:
            print(f"completed: {output_path}")


if __name__ == "__main__":
    nlp, resource_path = setup(testing=True)
    extract_phrases(nlp, resource_path)
--------------------------------------------------------------------------------
/bin/extract_text.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

from pathlib import Path
from tqdm import tqdm
import codecs
import os
import pdfx
import ray
import sys
import time
import traceback


def extract_text (file_path):
    """
    parse text from PDF
    """
    text = None
    page_count = 0

    try:
        pdf_meta = pdfx.PDFx(file_path)
        meta = pdf_meta.get_metadata()
        page_count = meta["Pages"]

        # split into sections
        buf = []
        grafs = []

        for line in pdf_meta.get_text().split("\n"):
            line = line.strip()
            buf.append(line)

            if len(line) < 1:
                section = " ".join(buf).strip().replace("- ", "") + "\n"
                grafs.append(section)
                buf = []

        text = "\n".join(grafs)
    except:
        print(f"ERROR parsing {file_path}")
        traceback.print_exc()
    finally:
        return text, page_count


def enum_pdfs (pdf_dir, txt_dir):
    """
    enumerate all of the non-zero downloaded PDF files
    """
    uuid_set = set([])

    for pdf_file in list(pdf_dir.glob("*.pdf")):
        if os.path.getsize(pdf_file) > 0:
            uuid_set.add(pdf_file.stem)

    # filter out PDF files that have already been converted to text
    for txt_file in list(txt_dir.glob("*.txt")):
        if txt_file.stem in uuid_set and os.path.getsize(txt_file) > 0:
            uuid_set.remove(txt_file.stem)

    for uuid in uuid_set:
        yield uuid


@ray.remote
def convert_pdf (pdf_dir, txt_dir, uuid):
    t0 = time.time()
    pdf_file = pdf_dir / f"{uuid}.pdf"
    txt_file = txt_dir / f"{uuid}.txt"

    text, page_count = extract_text(pdf_file.as_posix())

    if text and len(text) > 0:
        with codecs.open(txt_file, "wb", encoding="utf8") as f:
            f.write(text)

    timing = time.time() - t0
    print("\n{} {:.3f} s".format(uuid, timing))


def main ():
    pdf_dir = Path.cwd() / "resources/pub/pdf"
    txt_dir = Path.cwd() / "resources/pub/txt"
    task_ids = []

    ray.init()

    for uuid in tqdm(enum_pdfs(pdf_dir, txt_dir), ascii=True, desc="convert pdf"):
        task_ids.append(convert_pdf.remote(pdf_dir, txt_dir, uuid))

    ray.get(task_ids)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/bin/parsr_output_interpreter.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

#
# Copyright 2019 AXA Group Operations S.A.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
18 | # 19 | 20 | import logging 21 | import pandas as pd 22 | from io import StringIO 23 | 24 | class ParsrOutputInterpreter(object): 25 | def __init__(self, object=None): 26 | logging.basicConfig(level=logging.DEBUG, format='%(name)s - %(levelname)s - %(message)s') 27 | self.object = None 28 | if object is not None: 29 | self.load_object(object) 30 | 31 | def __get_text_types(self): 32 | return ['word', 'line', 'character', 'paragraph', 'heading'] 33 | 34 | def __get_text_objects(self, page_number=None): 35 | texts = [] 36 | if page_number is not None: 37 | page = self.get_page(page_number) 38 | if page is None: 39 | logging.error("Cannot get text elements for the requested page; Page {} not found".format(page_number)) 40 | return None 41 | else: 42 | for element in page['elements']: 43 | if element['type'] in self.__get_text_types(): 44 | texts.append(element) 45 | else: 46 | for page in self.object['pages']: 47 | for element in page['elements']: 48 | if element['type'] in self.__get_text_types(): 49 | texts.append(element) 50 | return texts 51 | 52 | def __text_from_text_object(self, text_object:dict) -> str: 53 | result = "" 54 | if text_object['type'] in ['paragraph', 'heading']: 55 | for i in text_object['content']: 56 | result += self.__text_from_text_object(i) 57 | elif text_object['type'] in ['line']: 58 | for i in text_object['content']: 59 | result += self.__text_from_text_object(i) 60 | elif text_object['type'] in ['word']: 61 | if type(text_object['content']) is list: 62 | for i in text_object['content']: 63 | result += self.__text_from_text_object(i) 64 | else: 65 | result += text_object['content'] 66 | result += ' ' 67 | elif text_object['type'] in ['character']: 68 | result += text_object['content'] 69 | return result 70 | 71 | def load_object(self, object): 72 | self.object = object 73 | 74 | def get_page(self, page_number): 75 | for p in self.object['pages']: 76 | if p['pageNumber'] == page_number: 77 | return p 78 | logging.error("Page {} not found".format(page_number)) 79 | return None 80 | 81 | def get_text(self, page_number:int=None) -> str: 82 | final_text = "" 83 | for textObj in self.__get_text_objects(page_number): 84 | final_text += self.__text_from_text_object(textObj) 85 | final_text += "\n\n" 86 | return final_text 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tracking Progress in Rich Context 2 | 3 | [The Coleridge Initiative](https://coleridgeinitiative.org/richcontext) 4 | at NYU has been researching [*Rich Context*](https://coleridgeinitiative.org/richcontext) 5 | to enhance search and discovery of datasets used in scientific research – see the 6 | [_Background Info_](https://github.com/Coleridge-Initiative/rclc/wiki/Background-Info) 7 | section for more details. 8 | Partnering with experts throughout academia and industry, NYU-CI has 9 | worked to leverage the closely adjacent fields of NLP/NLU, knowledge 10 | graph, recommender systems, scholarly infrastructure, data mining from 11 | scientific literature, dataset discovery, linked data, open vocabularies, 12 | metadata management, data governance, and so on. 13 | Leaderboards are published here on GitHub to track _state-of-the-art_ 14 | (SOTA) progress among the top results. 

---

## Leaderboard 1

### Entity Linking for Datasets in Publications

The first challenge is to identify the datasets used in research
publications, initially focused on the problem of
[_entity linking_](https://nlpprogress.com/english/entity_linking.html).
Research papers generally mention the datasets they've used, although there
are limited formal means to describe that metadata in a machine-readable way.
The goal here is to predict a set of dataset IDs for each publication.
The dataset IDs within the corpus represent the set of all possible datasets
which will appear.

Identifying dataset mentions typically requires:

* extracting text from an open access PDF
* some NLP parsing of the text
* feature engineering (e.g., attention to where text is located in a paper)
* modeling to identify up to 5 datasets per publication

See [_Evaluating Models for Entity Linking with Datasets_](https://github.com/Coleridge-Initiative/rclc/wiki/Evaluating-Models-for-Entity-Linking-with-Datasets)
for details about how the `Top5uptoD` leaderboard metric is calculated.
An illustrative sketch of this prediction target appears at the end of this README.


## Instructions

* [How To Participate](https://github.com/Coleridge-Initiative/rclc/wiki/How-To-Participate)
* [Corpus Description](https://github.com/Coleridge-Initiative/rclc/wiki/Corpus-Description)
* [Download Resource Files](https://github.com/Coleridge-Initiative/rclc/wiki/Downloading-Resource-Files)
* [Background Info](https://github.com/Coleridge-Initiative/rclc/wiki/Background-Info)
* [Workflow Stages](https://github.com/Coleridge-Initiative/rclc/wiki/Workflow-Stages)
* [Glossary Terms](https://github.com/Coleridge-Initiative/rclc/wiki/Glossary-Terms)

Use of open source and open standards is especially important to
further the cause of effective, reproducible research.
We're hosting this competition to focus on the research challenges
of specific machine learning use cases encountered within Rich Context – see the
[_Workflow Stages_](https://github.com/Coleridge-Initiative/rclc/wiki/Workflow-Stages)
section.

If you have any questions about the Rich Context leaderboard
competition – and especially if you identify any problems in the
corpus (e.g., data quality, incorrect metadata, broken links, etc.) –
please use the GitHub issues for this repo and pull requests to
report, discuss, and resolve them.
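
As an illustration of the prediction target described above – the IDs below
are hypothetical placeholders in the corpus's `publication-…`/`dataset-…`
form, not real corpus entries, and this is not an official submission format:

```python
# one publication's predicted dataset links: up to 5 dataset IDs,
# drawn from the set of dataset IDs that appear in the corpus
# (both IDs below are hypothetical placeholders)
prediction = {
    "publication": "publication-0123456789abcdef0123",
    "datasets": [
        "dataset-fedcba9876543210fedc",
    ],
}
```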
63 | -------------------------------------------------------------------------------- /bin/upload_s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | from git import Repo 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | import boto3 8 | import datetime 9 | import json 10 | import os 11 | import sys 12 | 13 | BUCKET_NAME = "richcontext" 14 | 15 | 16 | def access_bucket (handle): 17 | """ 18 | initialize access to the bucket 19 | """ 20 | bucket = handle.Bucket(BUCKET_NAME) 21 | return bucket 22 | 23 | 24 | def upload_file (handle, local_path, grid_path): 25 | """ 26 | upload a local file to the bucket 27 | """ 28 | handle.meta.client.upload_file(local_path, BUCKET_NAME, grid_path) 29 | 30 | 31 | def list_uploaded_files (bucket, prefix, kind): 32 | """ 33 | list the files of a particular kind which have already been 34 | uploaded to the bucket 35 | """ 36 | done = set([]) 37 | extension = f".{kind}" 38 | 39 | for obj in bucket.objects.filter(Prefix=prefix + "/pub/" + kind): 40 | if obj.key.endswith(extension): 41 | uuid = obj.key.split("/")[3].split(extension)[0] 42 | done.add(uuid) 43 | 44 | return done 45 | 46 | 47 | def iter_needed_files (dir_path, kind, done): 48 | """ 49 | iterator for the local files of a particular kind which 50 | haven't been uploaded yet 51 | """ 52 | for file_name in tqdm(list(dir_path.glob(f"*.{kind}")), ascii=True, desc=f"{kind} files"): 53 | uuid = file_name.stem 54 | 55 | if uuid not in done: 56 | yield uuid 57 | 58 | 59 | def upload_needed_files (handle, bucket, prefix, dir_path, kind, iter): 60 | """ 61 | upload the needed local files of a particular kind 62 | """ 63 | extension = f".{kind}" 64 | count = 0 65 | 66 | for uuid in iter: 67 | file_name = uuid + extension 68 | local_path = dir_path / file_name 69 | grid_path = prefix + "/pub/" + kind + "/" 70 | 71 | #print("uploading {} to {}".format(local_path, grid_path)) 72 | 73 | upload_file(handle, local_path.as_posix(), grid_path + file_name) 74 | count += 1 75 | 76 | return count 77 | 78 | 79 | def manage_upload (handle, bucket, prefix, pub_dir, kind): 80 | """ 81 | manage the upload for a particular kind of file 82 | """ 83 | dir_path = pub_dir / kind 84 | done = list_uploaded_files(bucket, prefix, kind) 85 | iter = iter_needed_files(dir_path, kind, done) 86 | count = upload_needed_files(handle, bucket, prefix, dir_path, kind, iter) 87 | 88 | return len(done), count 89 | 90 | 91 | def write_manifest (handle, prefix, manifest_data, file_name="MANIFEST.txt"): 92 | """ 93 | summarize details about the upload to a `MANIFEST.txt` 94 | file in the bucket 95 | """ 96 | with open(file_name, "w") as f: 97 | for key, val in manifest_data.items(): 98 | f.write("{}: {}\n".format(key, str(val))) 99 | 100 | grid_path = prefix + "/" + file_name 101 | upload_file(handle, file_name, grid_path) 102 | 103 | 104 | def main (): 105 | # locate the Git tag info 106 | git_path = Path.cwd().as_posix() 107 | repo = Repo(git_path) 108 | tags = sorted(repo.tags, key=lambda t: t.commit.committed_datetime) 109 | 110 | # set up the manifest 111 | manifest_data = {} 112 | manifest_data["date"] = datetime.date.today().strftime("%Y-%m-%d") 113 | manifest_data["release"] = tags[-1] 114 | 115 | # connect to the storage grid bucket 116 | handle = boto3.resource("s3") 117 | bucket = access_bucket(handle) 118 | prefix = "corpus_docs" 119 | 120 | # set up the local paths 121 | pub_dir = Path.cwd() / "resources/pub" 122 | 123 | # which PDF files 
do we need to upload? 124 | count, prev_count = manage_upload(handle, bucket, prefix, pub_dir, "pdf") 125 | manifest_data["uploaded_pdf"] = count + prev_count 126 | 127 | # which JSON files do we need to upload? 128 | count, prev_count = manage_upload(handle, bucket, prefix, pub_dir, "json") 129 | manifest_data["uploaded_json"] = count + prev_count 130 | 131 | # which TXT files do we need to upload? 132 | count, prev_count = manage_upload(handle, bucket, prefix, pub_dir, "txt") 133 | manifest_data["uploaded_txt"] = count + prev_count 134 | 135 | # write upload details to manifest 136 | write_manifest(handle, prefix, manifest_data) 137 | 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. 
publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. 
Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /bin/parsr_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # 5 | # Copyright 2019 AXA Group Operations S.A. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | from glob import glob 21 | from itertools import chain 22 | import os 23 | import sys 24 | import json 25 | import time 26 | 27 | from sxsdiff import DiffCalculator 28 | from sxsdiff.generators.github import GitHubStyledGenerator 29 | 30 | import diff_match_patch 31 | import pandas as pd 32 | import requests 33 | from io import StringIO 34 | 35 | 36 | 37 | class ParserClient(): 38 | def __init__(self, server): 39 | self.version_history = {} 40 | self.set_server(server) 41 | self.set_current_request_id("") 42 | 43 | def __supported_input_files(self) -> list: 44 | return ['*.pdf', '*.jpg', '*.jpeg', '*.png', '*.tiff', '*.tif',] 45 | 46 | def set_server(self, server:str): 47 | self.server = server 48 | 49 | def set_current_request_id(self, request_id:str): 50 | self.request_id = request_id 51 | 52 | def send_document(self, file:str, config:str, server:str="", document_name:str=None, wait_till_finished:bool=False, save_request_id:bool=False) -> dict: 53 | if server == "": 54 | if self.server == "": 55 | raise Exception('No server address provided') 56 | else: 57 | server = self.server 58 | packet = { 59 | 'file': (file, open(file, 'rb'), 'application/pdf'), 60 | 'config': (config, open(config, 'rb'), 'application/json'), 61 | } 62 | 63 | r = requests.post('http://'+server+'/api/v1/document', files=packet) 64 | jobId = r.text 65 | 66 | if not document_name: 67 | document_name = os.path.splitext(os.path.basename(file))[0] 68 | 69 | if document_name not in self.version_history: 70 | self.version_history[document_name] = [jobId] 71 | else: 72 | self.version_history[document_name].append(jobId) 73 | if save_request_id: 74 | self.set_current_request_id(jobId) 75 | if not wait_till_finished: 76 | return {'file': file, 'config': config, 'status_code': r.status_code, 'server_response': r.text} 77 | else: 78 | print('> Polling server for the job {}...'.format(jobId)) 79 | server_status_response = self.get_status(jobId)['server_response'] 80 | while ('progress-percentage' in server_status_response): 81 | print('>> Progress percentage: {}'.format(server_status_response['progress-percentage'])) 82 | time.sleep(2) 83 | server_status_response = self.get_status(jobId)['server_response'] 84 | print('>> Job done!') 85 | return {'file': file, 'config': config, 'status_code': r.status_code, 'server_response': r.text} 86 | 87 | def get_versions(self, document_name:str) -> list: 88 | if document_name in self.version_history: 89 | return self.version_history[document_name] 90 | else: 91 | return [] 92 | 93 | def send_documents_folder(self, folder:str, config:str, server:str="") -> list: 94 | if server == "": 95 | if self.server == "": 96 | raise Exception('No server address provided') 97 | else: 98 | server = self.server 99 | responses = [] 100 | os.chdir(folder) 101 | files = [glob.glob(e) for e in self.__supported_input_files()] 102 | files_flat = list(chain.from_iterable(files)) 103 | for file in files_flat: 104 | packet = { 105 | 'file': (file, open(file, 'rb'), 'application/pdf'), 106 | 'config': (config, open(config, 'rb'), 'application/json'), 107 | } 108 | r = requests.post('http://'+server+'/api/v1/document', files=packet) 109 | responses.append({'file': file, 'config': config, 'status_code': r.status_code, 'server_response': r.text}) 110 | return responses 111 | 112 | def get_status(self, request_id:str="", server:str=""): 113 | if server == "": 114 | if self.server == "": 115 | raise Exception('No server address provided') 116 | else: 117 | server = self.server 118 | if request_id == "": 
119 | if self.request_id == "": 120 | raise Exception('No request ID provided') 121 | else: 122 | request_id = self.request_id 123 | if self.server == "": 124 | raise Exception('No server address provided') 125 | r = requests.get('http://{}/api/v1/queue/{}'.format(server, request_id)) 126 | return {'request_id': request_id, 'server_response': json.loads(r.text)} 127 | 128 | def get_json(self, request_id:str="", server:str=""): 129 | if server == "": 130 | if self.server == "": 131 | raise Exception('No server address provided') 132 | else: 133 | server = self.server 134 | if request_id == "": 135 | if self.request_id == "": 136 | raise Exception('No request ID provided') 137 | else: 138 | request_id = self.request_id 139 | r = requests.get('http://{}/api/v1/json/{}'.format(server, request_id)) 140 | if r.text != "": 141 | return r.json() 142 | else: 143 | return {'request_id': request_id, 'server_response': r.json()} 144 | 145 | def get_markdown(self, request_id:str="", server:str=""): 146 | if server == "": 147 | if self.server == "": 148 | raise Exception('No server address provided') 149 | else: 150 | server = self.server 151 | if request_id == "": 152 | if self.request_id == "": 153 | raise Exception('No request ID provided') 154 | else: 155 | request_id = self.request_id 156 | r = requests.get('http://{}/api/v1/markdown/{}'.format(server, request_id)) 157 | if r.text != "": 158 | return r.text 159 | else: 160 | return {'request_id': request_id, 'server_response': r.text} 161 | 162 | def get_text(self, request_id:str="", server:str=""): 163 | if server == "": 164 | if self.server == "": 165 | raise Exception('No server address provided') 166 | else: 167 | server = self.server 168 | if request_id == "": 169 | if self.request_id == "": 170 | raise Exception('No request ID provided') 171 | else: 172 | request_id = self.request_id 173 | r = requests.get('http://{}/api/v1/text/{}'.format(server, request_id)) 174 | if r.text != "": 175 | return r.text 176 | else: 177 | return {'request_id': request_id, 'server_response': r.text} 178 | 179 | def get_table(self, request_id:str="", page=None, table=None, seperator=";", server:str=""): 180 | if server == "": 181 | if self.server == "": 182 | raise Exception('No server address provided') 183 | else: 184 | server = self.server 185 | if request_id == "": 186 | if self.request_id == "": 187 | raise Exception('No request ID provided') 188 | else: 189 | request_id = self.request_id 190 | if page is None and table is None: 191 | r = requests.get('http://{}/api/v1/csv/{}'.format(server, request_id)) 192 | else: 193 | r = requests.get('http://{}/api/v1/csv/{}/{}/{}'.format(server, request_id, page, table)) 194 | if r.text != "": 195 | try: 196 | df = pd.read_csv(StringIO(r.text), sep=seperator) 197 | df.loc[:, ~df.columns.str.match('Unnamed')] 198 | df = df.where((pd.notnull(df)), " ") 199 | return df 200 | except Exception as e: 201 | return {'request_id': request_id, 'server_response': r.text} 202 | else: 203 | return {'request_id': request_id, 'server_response': r.text} 204 | 205 | def compare_versions(self, request_ids:list, pretty_html:bool = False): 206 | diffs = [] 207 | for i in range(0, len(request_ids) - 1): 208 | request_id1 = request_ids[i] 209 | request_id2 = request_ids[i + 1] 210 | md1 = self.get_markdown(request_id1) 211 | md2 = self.get_markdown(request_id2) 212 | 213 | if pretty_html: 214 | sxsdiff_result = DiffCalculator().run(md1, md2) 215 | html_store = StringIO() 216 | GitHubStyledGenerator(file=html_store).run(sxsdiff_result) 217 | 
                html_diff = html_store.getvalue()
                diffs.append(html_diff)
            else:
                dmp = diff_match_patch.diff_match_patch()
                diff = dmp.diff_main(md1, md2)
                dmp.diff_cleanupSemantic(diff)
                diffs.append(diff)
        return diffs
--------------------------------------------------------------------------------
/bin/download_resources.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

from bs4 import BeautifulSoup # type: ignore
from pathlib import Path
from pdfminer.pdfdocument import PDFDocument # type: ignore
from pdfminer.pdfpage import PDFTextExtractionNotAllowed # type: ignore
from pdfminer.pdfparser import PDFParser, PDFSyntaxError # type: ignore
from requests_html import HTMLSession # type: ignore
from tqdm import tqdm # type: ignore
from typing import Any, Dict, List, Tuple
from urllib.parse import urlparse
import argparse
import csv
import json
import logging # type: ignore
import ray # type: ignore
import requests
import sys
import time
import traceback

DEFAULT_LOGGER_FILE = None
DEFAULT_CORPUS_FILE = "corpus.jsonld"
DEFAULT_TODO_FILE = "todo.tsv"
DEFAULT_TODO_LIST = None
DEFAULT_OUTPUT_RESOURCE = "resources/"
DEFAULT_FORCE_DOWNLOAD = False

MAX_DOWNLOAD_TRIAL = 3

PUB_PDF_PATH = "pub/pdf/"
DAT_PAGE_PATH = "dat/"

LOGGER = None # type: ignore


@ray.remote
class Worker (object):
    def __init__ (self):
        self.logger = LOGGER

    def train (self):
        self.logger.warning("print from inside worker")


def setup_logger (args) -> None:
    """ logging is optional: to debug, set the logging level
    """
    global LOGGER
    level = logging.WARNING

    if args.logger:
        logging.basicConfig(filename=args.logger, filemode="w", level=level)
    else:
        logging.basicConfig(stream=sys.stdout, level=level)

    LOGGER = logging.getLogger("RichContext")


def load_corpus (filename: str) -> dict:
    """ Load the corpus file (in JSON-LD format)
    """
    global LOGGER
    corpus = None

    with open(filename, "r") as f:
        jld_corpus = json.load(f)
        corpus = jld_corpus["@graph"]

    LOGGER.warning(f"number of records in the corpus: {len(corpus)}")
    return corpus


def generate_todo (flag: str, todo_pub: list, todo_dat: list) -> None:
    """ Generate a TODO file for downloads
    """
    if not flag:
        with open(DEFAULT_TODO_FILE, "wt") as f:
            writer = csv.writer(f, delimiter="\t")

            for t in todo_pub:
                writer.writerow(t)

            for t in todo_dat:
                writer.writerow(t)


def load_todo (filename: str) -> List[Any]:
    """ load a TSV file for the list of files to be downloaded
    """
    todo = []

    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter="\t")

        for row in reader:
            todo.append(row)

    return todo


def enum_pub_resources (corpus: dict, output_path: Path, force_download: bool) -> Tuple[Path, List[List[Any]]]:
    """ Enumerate all publication PDF files from the corpus data to be
    downloaded, if not downloaded yet. We use the entity id as filename.
    All downloaded files are stored under `output_path` folder.
    Input:
    - corpus: corpus file containing a list of publications.
107 | - output_path: path to store downloaded resources. 108 | - force_download: always download resources. 109 | """ 110 | global LOGGER 111 | pub_path = output_path / PUB_PDF_PATH 112 | 113 | if not pub_path.exists(): 114 | pub_path.mkdir(parents=True) 115 | 116 | pubs = [e for e in corpus if e["@type"] == "ResearchPublication"] 117 | 118 | if force_download: 119 | downloaded_pubs_id = set([]) 120 | else: 121 | downloaded_pubs = list(pub_path.glob("*.pdf")) 122 | downloaded_pubs_id = set([f.stem for f in downloaded_pubs]) 123 | 124 | todo = [] 125 | 126 | for entity in pubs: 127 | e_id = urlparse(entity["@id"]).fragment.split("-")[1] 128 | downloaded_before = e_id in downloaded_pubs_id 129 | 130 | if force_download or not downloaded_before: 131 | if isinstance(entity["openAccess"], list): 132 | ## this only happens in the error case where 133 | ## publications are duplicated in the corpus 134 | res_url = entity["openAccess"][0]["@value"] 135 | LOGGER.warning("duplicate: {} {}".format(e_id, entity["openAccess"])) 136 | else: 137 | res_url = entity["openAccess"]["@value"] 138 | 139 | todo.append(["pdf", e_id, res_url, pub_path]) 140 | 141 | return pub_path, todo 142 | 143 | 144 | def enum_dat_resources (corpus: dict, output_path: Path, force_download: bool) -> Tuple[Path, List[List[Any]]]: 145 | """ Enumerate all dataset "foaf:page" files from corpus data to be 146 | downloaded, if not downloaded yet. Uses the entity id as filename. 147 | All downloaded files are stored under `output_path` folder. 148 | Input: 149 | - corpus: corpus file containing a list of datasets. 150 | - output_path: path to store downloaded resources. 151 | - force_download: always download resources. 152 | """ 153 | global LOGGER 154 | dat_path = output_path / DAT_PAGE_PATH 155 | 156 | if not dat_path.exists(): 157 | dat_path.mkdir(parents=True) 158 | 159 | dats = [e for e in corpus if e["@type"] == "Dataset"] 160 | 161 | if force_download: 162 | downloaded_dat_id = set([]) 163 | else: 164 | downloaded_datasets = list(dat_path.glob("*.*")) 165 | downloaded_dat_id = set([f.stem for f in downloaded_datasets]) 166 | 167 | todo = [] 168 | 169 | for entity in dats: 170 | e_id = urlparse(entity["@id"]).fragment.split("-")[1] 171 | downloaded_before = e_id in downloaded_dat_id 172 | 173 | if force_download or not downloaded_before: 174 | if "foaf:page" in entity: 175 | res_url = entity["foaf:page"]["@value"] 176 | 177 | if res_url.startswith("http://example.com"): 178 | # ignore these placeholder URLs 179 | continue 180 | else: 181 | todo.append(["unknown", e_id, res_url, dat_path]) 182 | 183 | return dat_path, todo 184 | 185 | 186 | def is_valid_pdf_file (filename: str) -> bool: 187 | global LOGGER 188 | 189 | try: 190 | with open(filename, "rb") as f: 191 | parser = PDFParser(f) 192 | document = PDFDocument(parser, "") 193 | 194 | if not document.is_extractable: 195 | raise PDFTextExtractionNotAllowed(filename) 196 | 197 | return True 198 | except: 199 | LOGGER.debug(traceback.format_exc()) 200 | LOGGER.debug(f"not valid PDF file: {filename}") 201 | return False 202 | 203 | 204 | @ray.remote 205 | def _download (res_type: str, e_id: str, url: str, output_path: Path) -> Tuple[bool, str]: 206 | """ Download a resource for the corpus and store it in a file 207 | """ 208 | global LOGGER 209 | 210 | if res_type not in ["pdf", "html", "unknown"]: 211 | raise ValueError(f"Invalid resource type: {res_type}") 212 | 213 | headers = requests.utils.default_headers() 214 | headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) 
217 | 
218 |     trial = 0
219 |     status = None
220 | 
221 |     while trial < MAX_DOWNLOAD_TRIAL:
222 |         try:
223 |             parsed_url = urlparse(url)
224 | 
225 |             if parsed_url.netloc == "www.sciencedirect.com":
226 |                 """ special case: sciencedirect.com auto generates PDF
227 |                 download link in an intermediate page
228 |                 """
229 |                 try:
230 |                     session = HTMLSession()
231 |                     r0 = session.get(url)
232 |                     res = session.get(list(r0.html.absolute_links)[0])
233 |                 except:
234 |                     LOGGER.debug(traceback.format_exc())
235 |                     LOGGER.debug(list(r0.html.absolute_links))
236 | 
237 |                     status = f"session.get failed: {e_id} {url}"
238 |                     LOGGER.warning(status)
239 | 
240 |                     return False, status
241 | 
242 |             elif parsed_url.netloc.endswith("onlinelibrary.wiley.com"):
243 |                 """ special case: wiley.com auto embeds to render PDF
244 |                 """
245 |                 r0 = requests.get(url)
246 |                 soup = BeautifulSoup(r0.content, "html5lib")
247 | 
248 |                 if soup.find("embed") is None:
249 |                     status = f"no embedded PDF: {e_id} {url}"
250 |                     LOGGER.warning(status)
251 | 
252 |                     trial += 1
253 |                     continue
254 | 
255 |                 src = soup.find("embed")["src"]
256 |                 res = requests.get(parsed_url.scheme + "://" + parsed_url.netloc + src)
257 |             else:
258 |                 res = requests.get(url, headers=headers, timeout=(10, 20))
259 | 
260 |             if res_type == "unknown":
261 |                 content_type = res.headers["content-type"]
262 |                 res_type = "html" if "text/html" in content_type else "pdf"
263 | 
264 |             out_file = output_path / (e_id + "." + res_type)
265 |             out_file.write_bytes(res.content)
266 | 
267 |             if res_type == "pdf":
268 |                 filename = out_file.resolve().as_posix()
269 | 
270 |                 try:
271 |                     if is_valid_pdf_file(filename):
272 |                         LOGGER.debug(f"writing: {filename}")
273 |                     else:
274 |                         out_file.unlink()
275 |                         trial += 1
276 |                         continue
277 |                 except:
278 |                     LOGGER.debug(traceback.format_exc())
279 | 
280 |                     status = f"{e_id} {url} not valid PDF: {filename}"
281 |                     LOGGER.warning(status)
282 | 
283 |                     return False, status
284 | 
285 |             return True, status
286 | 
287 |         except KeyboardInterrupt:
288 |             break  # stop retrying this download on interrupt
289 | 
290 |         except requests.exceptions.RequestException as err:
291 |             status = f"{e_id} {url} request exception {err}"
292 |             LOGGER.warning(status)
293 | 
294 |         time.sleep(1)
295 |         trial += 1
296 | 
297 |     if trial == MAX_DOWNLOAD_TRIAL:
298 |         LOGGER.debug(f"aborted {e_id} {url} after {MAX_DOWNLOAD_TRIAL} attempts")
299 | 
300 |     return False, status
301 | 
302 | 
303 | def download_resource_files (todo: list) -> None:
304 |     """ Download all resource files on the TODO list.
305 |     We use the entity id as filename.
306 |     Input:
307 |     - todo: list of URLs to download
308 |     """
309 |     global LOGGER
310 | 
311 |     try:
312 |         for _type, e_id, res_url, path in tqdm(todo, ascii=True, desc="download files"):
313 |             obj_id = _download.remote(_type, e_id, res_url, Path(path))
314 |             success, status = ray.get(obj_id)
315 | 
316 |             if not success:
317 |                 LOGGER.warning(f"failed: {e_id} {res_url}")
318 | 
319 |                 if status:
320 |                     LOGGER.warning(status)
321 | 
322 |             time.sleep(0.1)
323 | 
324 |     except KeyboardInterrupt:
325 |         # this function may run for a long while and is much more
326 |         # likely to get interrupted
327 |         pass
328 | 
329 | 
330 | def get_resources_stats (corpus: list, pub_path: Path, dat_path: Path) -> None:
331 |     global LOGGER
332 | 
333 |     pubs = [e for e in corpus if e["@type"] == "ResearchPublication"]
334 |     missing_pub = set()
335 | 
336 |     downloaded_pubs = list(pub_path.glob("*.pdf"))
337 |     downloaded_pubs_id = set([f.stem for f in downloaded_pubs])
338 | 
339 |     for entity in pubs:
340 |         e_id = urlparse(entity["@id"]).fragment.split("-")[1]
341 | 
342 |         if e_id not in downloaded_pubs_id:
343 |             missing_pub.add(e_id)
344 | 
345 |     LOGGER.warning(f"number of research publications: {len(pubs)}")
346 |     LOGGER.warning(f"successfully downloaded {len(pubs) - len(missing_pub)} PDF files")
347 |     LOGGER.debug(f"missing publication resources: {missing_pub}")
348 | 
349 |     dats = [e for e in corpus if e["@type"] == "Dataset"]
350 |     missing_dat_res = set()
351 | 
352 |     downloaded_dats = list(dat_path.glob("*.*"))
353 |     downloaded_dat_id = set([f.stem for f in downloaded_dats])
354 | 
355 |     for entity in dats:
356 |         e_id = urlparse(entity["@id"]).fragment.split("-")[1]
357 | 
358 |         if e_id not in downloaded_dat_id:
359 |             missing_dat_res.add(e_id)
360 | 
361 |     LOGGER.warning(f"number of datasets: {len(dats)}")
362 |     LOGGER.warning(f"successfully downloaded {len(dats) - len(missing_dat_res)} resource files")
363 |     LOGGER.debug(f"missing dataset resources: {missing_dat_res}")
364 | 
365 | 
366 | def main (args) -> None:
367 |     # load and parse the corpus
368 |     setup_logger(args)
369 |     corpus = load_corpus(args.input)
370 | 
371 |     # enumerate the resource files to download
372 |     output_path = Path(args.output_dir)
373 |     pub_path, todo_pub = enum_pub_resources(corpus, output_path, args.force)
374 |     dat_path, todo_dat = enum_dat_resources(corpus, output_path, args.force)
375 | 
376 |     # manage the TODO file for downloads
377 |     generate_todo(args.todo, todo_pub, todo_dat)
378 | 
379 |     if not args.todo:
380 |         todo = todo_pub + todo_dat
381 |     else:
382 |         todo = load_todo(args.todo)
383 | 
384 |     # NB: when connecting to an existing cluster, instead use
385 |     # ray.init(address=)
386 |     ray.init()
387 | 
388 |     # run the downloads and report
389 |     download_resource_files(todo)
390 |     get_resources_stats(corpus, pub_path, dat_path)
391 | 
392 | 
393 | if __name__ == "__main__":
394 |     # parse the command line arguments, if any
395 |     parser = argparse.ArgumentParser(
396 |         description="download publication PDFs and dataset foaf pages for the rclc corpus"
397 |     )
398 | 
399 |     parser.add_argument(
400 |         "--logger",
401 |         type=str,
402 |         default=DEFAULT_LOGGER_FILE,
403 |         help="logger file"
404 |     )
405 | 
406 |     parser.add_argument(
407 |         "--input",
408 |         type=str,
409 |         default=DEFAULT_CORPUS_FILE,
410 |         help="rclc corpus file"
411 |     )
412 | 
413 |     parser.add_argument(
414 |         "--todo",
415 |         type=str,
416 |         default=DEFAULT_TODO_LIST,
417 |         help="download todo file"
418 |     )
419 | 
420 |     parser.add_argument(
421 |         "--output_dir",
422 |         type=str,
423 |         default=DEFAULT_OUTPUT_RESOURCE,
424 |         help="path to store downloaded resources"
425 |     )
426 | 
427 |     parser.add_argument(
428 |         "--force",
429 |         action="store_true",
430 |         default=DEFAULT_FORCE_DOWNLOAD,
431 |         help="always download resources"
432 |     )
433 | 
434 |     main(parser.parse_args())
--------------------------------------------------------------------------------
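The errors.txt listing that follows appears to be the WARNING-level log captured from a full run of bin/download_resources.py: one "failed:" line per resource that could not be fetched, usually paired with the specific cause that was logged ("no embedded PDF", "request exception", "session.get failed"). A minimal sketch, not part of the repo, for tallying those failure modes; it assumes only the "WARNING:RichContext:" prefix produced by setup_logger() above and the file name errors.txt:

    #!/usr/bin/env python
    # encoding: utf-8

    from collections import Counter

    PREFIX = "WARNING:RichContext:"

    counts = Counter()

    with open("errors.txt", "r") as f:
        for line in f:
            msg = line.strip()

            # skip anything not written by the RichContext logger
            if not msg.startswith(PREFIX):
                continue

            msg = msg[len(PREFIX):]

            # bucket each entry by the failure mode it reports
            if msg.startswith("failed:"):
                counts["failed"] += 1
            elif msg.startswith("no embedded PDF:"):
                counts["no embedded PDF"] += 1
            elif msg.startswith("session.get failed:"):
                counts["session.get failed"] += 1
            elif "request exception" in msg:
                counts["request exception"] += 1

    # print a tab-separated tally, most common first
    for reason, n in counts.most_common():
        print(f"{reason}\t{n}")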
/errors.txt:
--------------------------------------------------------------------------------
1 | WARNING:RichContext:number of records in the corpus: 6712
2 | WARNING:RichContext:failed: 639196e25a9adcadaaf2 https://journals.sagepub.com/doi/pdf/10.1177/0002716216678391
3 | WARNING:RichContext:failed: 847facc16baf543ccece https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/cdev.12753
4 | WARNING:RichContext:no embedded PDF: 847facc16baf543ccece https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/cdev.12753
5 | WARNING:RichContext:failed: be43d18d3aad47195bc4 https://doi.org/10.1016/j.childyouth.2016.10.018
6 | WARNING:RichContext:failed: c1863bb4c766796ef81e http://hdl.handle.net/10.1111/agec.12222
7 | WARNING:RichContext:failed: 8b66f7ad41758568afed http://respec.tamu.edu/zervoufirmsfinance.pdf
8 | WARNING:RichContext:8b66f7ad41758568afed http://respec.tamu.edu/zervoufirmsfinance.pdf request exception HTTPConnectionPool(host='respec.tamu.edu', port=80): Max retries exceeded with url: /zervoufirmsfinance.pdf (Caused by ConnectTimeoutError(, 'Connection to respec.tamu.edu timed out. (connect timeout=10)'))
9 | WARNING:RichContext:failed: df70ec30c4976e90d10b http://jhl.sagepub.com/content/22/1/27.full.pdf
10 | WARNING:RichContext:failed: c19ee91f63a30add42a2 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/qj.3598
11 | WARNING:RichContext:no embedded PDF: c19ee91f63a30add42a2 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/qj.3598
12 | WARNING:RichContext:failed: 296018d551a969ddbabc http://www.soc.jhu.edu/people/DeLuca/documents/SSR%20Mendenhall%2C%20DeLuca%2C%20Duncan.%202006.pdf
13 | WARNING:RichContext:failed: 2c890a18a848d22568f2 https://doi.org/10.1016/j.landurbplan.2018.04.018
14 | WARNING:RichContext:failed: cd9eed5e481bb49e8ada http://pdfs.journals.lww.com/academicmedicine/1997/09000/Profile_of_the_graduate_student_population_in_U_S_.20.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1506389721225;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzd++Xw1xrBvVGm3URNugOVtYfrjQcCVLgbjq4J7SGuNkd0xVDWMKnPVPA0AC2fd0y7;hash|Ze5loqJ0DcYD9lPI8UJDcw==
15 | WARNING:RichContext:failed: 68e25c63ffb13771638e https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/2017JD027629
16 | WARNING:RichContext:no embedded PDF: 68e25c63ffb13771638e https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/2017JD027629
17 | WARNING:RichContext:failed: cd5e96bdcfd516a8c765 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jtsa.12467
18 | WARNING:RichContext:no embedded PDF: cd5e96bdcfd516a8c765 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jtsa.12467
19 | WARNING:RichContext:failed: 61a59fa620c7a33304a7 https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/j.2379-3988.2015.tb00082.x
20 | WARNING:RichContext:no embedded PDF: 61a59fa620c7a33304a7 https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/j.2379-3988.2015.tb00082.x
21 | WARNING:RichContext:failed: 
6d4a21c9f74b2ba41094 https://manuscript.elsevier.com/S1570677X1500060X/pdf/S1570677X1500060X.pdf 22 | WARNING:RichContext:6d4a21c9f74b2ba41094 https://manuscript.elsevier.com/S1570677X1500060X/pdf/S1570677X1500060X.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S1570677X1500060X/pdf/S1570677X1500060X.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 23 | WARNING:RichContext:failed: 948ccb64c9b6a83d9784 https://pdfs.journals.lww.com/topicsinclinicalnutrition/2004/01000/Child_Nutrition_Programs_Legislation__Past_and.3.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1545548711867;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdjMrFmUeNGda7afeiVO1Pd1ATOUOd1u0sL/wUiMcn9MnKKnFNjYZHw94sfJcYNY6Y;hash|N5oAa31wWF2UlO119Unaug== 24 | WARNING:RichContext:failed: 3baebd7472bc23445521 https://manuscript.elsevier.com/S0964569115001076/pdf/S0964569115001076.pdf 25 | WARNING:RichContext:3baebd7472bc23445521 https://manuscript.elsevier.com/S0964569115001076/pdf/S0964569115001076.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S0964569115001076/pdf/S0964569115001076.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 26 | WARNING:RichContext:failed: c27ce9a0ba19a4ef1c7c http://link.springer.com/10.1007/s11113-018-9496-y 27 | WARNING:RichContext:failed: debcdac22bbbccc57da5 https://doi.org/10.1016/j.isprsjprs.2018.03.019 28 | WARNING:RichContext:failed: 9d4491e6ad99b405cdea https://onlinelibrary.wiley.com/doi/pdf/10.1002/oby.22395 29 | WARNING:RichContext:no embedded PDF: 9d4491e6ad99b405cdea https://onlinelibrary.wiley.com/doi/pdf/10.1002/oby.22395 30 | WARNING:RichContext:failed: 251fb8b407821b95d6f7 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/he.20336 31 | WARNING:RichContext:no embedded PDF: 251fb8b407821b95d6f7 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/he.20336 32 | WARNING:RichContext:failed: 4f84b0f40e7664f53590 http://hdl.handle.net/10.1002/pam.21842 33 | WARNING:RichContext:failed: 578e5c1bf1b8a2f93347 https://onlinelibrary.wiley.com/doi/pdf/10.1111/rurd.12057 34 | WARNING:RichContext:no embedded PDF: 578e5c1bf1b8a2f93347 https://onlinelibrary.wiley.com/doi/pdf/10.1111/rurd.12057 35 | WARNING:RichContext:failed: 2dc1f36c8e8c269799f9 https://www.philadelphiafed.org/-/media/research-and-data/publications/working-papers/2017/wp17-19.pdf?utm_campaign=WorkingPapers&utm_source=2017/07/17&utm_medium=E-mail 36 | WARNING:RichContext:2dc1f36c8e8c269799f9 https://www.philadelphiafed.org/-/media/research-and-data/publications/working-papers/2017/wp17-19.pdf?utm_campaign=WorkingPapers&utm_source=2017/07/17&utm_medium=E-mail request exception HTTPSConnectionPool(host='www.philadelphiafed.org', port=443): Max retries exceeded with url: /-/media/research-and-data/publications/working-papers/2017/wp17-19.pdf?utm_campaign=WorkingPapers&utm_source=2017/07/17&utm_medium=E-mail (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 37 | WARNING:RichContext:failed: 053066f2eb3b45d300e2 
https://academic.oup.com/condor/article-pdf/121/2/duz007/28981665/duz007.pdf 38 | WARNING:RichContext:failed: 9c5d244917fe12f7be8d https://doi.org/10.1093/scipol/scz039 39 | WARNING:RichContext:failed: fdc935f99d70620d2a6f https://pediatrics.aappublications.org/content/pediatrics/135/1/e109.full.pdf 40 | WARNING:RichContext:failed: a0241d06c6cf070b947e https://www.tandfonline.com/doi/pdf/10.1080/00220388.2017.1324144?needAccess=true 41 | WARNING:RichContext:failed: 08979188655b9cd921c5 https://academic.oup.com/erae/article-pdf/42/3/499/7003763/jbu033.pdf 42 | WARNING:RichContext:failed: 8a7aa077551d3d9db6fc http://nsgl.gso.uri.edu/flsgp/flsgpm10001.pdf 43 | WARNING:RichContext:failed: 6e8c0b3329601bafa144 http://www.tandfonline.com/doi/pdf/10.1080/13636820.2016.1238837?needAccess=true 44 | WARNING:RichContext:failed: 08a045baa4fd84c47ddb http://europepmc.org/articles/PMC1497475?pdf=render 45 | WARNING:RichContext:failed: fc95db7f7d32cd5f6143 https://doi.org/10.1093/ajcn/nqz064 46 | WARNING:RichContext:failed: d0d445cafcdf2052eaa0 https://journals.sagepub.com/doi/pdf/10.1177/0002716219881628 47 | WARNING:RichContext:failed: a14d6bc056bd36399bef https://doi.org/10.1016/j.crm.2018.03.005 48 | WARNING:RichContext:failed: 6bc42ef8479d897dd5b8 https://manuscript.elsevier.com/S0195666316300952/pdf/S0195666316300952.pdf 49 | WARNING:RichContext:6bc42ef8479d897dd5b8 https://manuscript.elsevier.com/S0195666316300952/pdf/S0195666316300952.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S0195666316300952/pdf/S0195666316300952.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 50 | WARNING:RichContext:failed: 27bb9fe7e9fb3d742e26 https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2019MS001741 51 | WARNING:RichContext:no embedded PDF: 27bb9fe7e9fb3d742e26 https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2019MS001741 52 | WARNING:RichContext:failed: c930ff7ed86374e60faf http://pdfs.semanticscholar.org/dde2/64a68c1b893921be3b2875b861667e4260c4.pdf 53 | WARNING:RichContext:failed: 4e29b682e33b1288ea1a https://onlinelibrary.wiley.com/doi/pdf/10.1111/joca.12158 54 | WARNING:RichContext:no embedded PDF: 4e29b682e33b1288ea1a https://onlinelibrary.wiley.com/doi/pdf/10.1111/joca.12158 55 | WARNING:RichContext:failed: a4f9bb4fd4330c60a42d https://onlinelibrary.wiley.com/doi/pdf/10.1111/bjir.12469 56 | WARNING:RichContext:no embedded PDF: a4f9bb4fd4330c60a42d https://onlinelibrary.wiley.com/doi/pdf/10.1111/bjir.12469 57 | WARNING:RichContext:failed: caf688b9379490044130 https://www.sciencedirect.com/sdfe/reader/pii/S2210784316300973/pdf 58 | WARNING:RichContext:session.get failed: caf688b9379490044130 https://www.sciencedirect.com/sdfe/reader/pii/S2210784316300973/pdf 59 | WARNING:RichContext:failed: 590c800055988048ae2c https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pam.22104 60 | WARNING:RichContext:no embedded PDF: 590c800055988048ae2c https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pam.22104 61 | WARNING:RichContext:failed: 81c8c68f70538befd0f4 https://esajournals.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/ecy.2600 62 | WARNING:RichContext:no embedded PDF: 81c8c68f70538befd0f4 https://esajournals.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/ecy.2600 63 | WARNING:RichContext:failed: 94f7e2a4bb7a663c358b http://pdfs.semanticscholar.org/01cf/d33f38b51a55a75ffd59b30c5813850a77cc.pdf 64 | WARNING:RichContext:failed: 
6cc89963f723ce427738 http://www.degruyter.com/downloadpdf/j/alr.ahead-of-print/applirev-2018-0020/applirev-2018-0020.xml 65 | WARNING:RichContext:failed: d0447f01dcf36d16bc19 https://link.springer.com/content/pdf/10.1007%2F978-3-030-18072-0.pdf 66 | WARNING:RichContext:failed: 8d7664e43439a2a679f8 http://journals.sagepub.com/doi/pdf/10.1177/1049731516630385 67 | WARNING:RichContext:failed: 1d26e90c12bc8cc7fb2e http://jhppl.dukejournals.org/cgi/reprint/32/3/415.pdf 68 | WARNING:RichContext:failed: 3a9f49659008444cd656 http://europepmc.org/articles/PMC2690384?pdf=render 69 | WARNING:RichContext:failed: aaa54f5cf1d57869edfe https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jmcb.12574 70 | WARNING:RichContext:no embedded PDF: aaa54f5cf1d57869edfe https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jmcb.12574 71 | WARNING:RichContext:failed: 6ec3fd89ae20c802af4f https://agupubs.onlinelibrary.wiley.com/doi/pdf/10.1002/2013JC009604 72 | WARNING:RichContext:no embedded PDF: 6ec3fd89ae20c802af4f https://agupubs.onlinelibrary.wiley.com/doi/pdf/10.1002/2013JC009604 73 | WARNING:RichContext:failed: 88e5e048845a9ada8b5d https://doi.org/10.1016/j.rse.2019.03.022 74 | WARNING:RichContext:failed: 0b8c73a4cc256431bd0d https://onlinelibrary.wiley.com/doi/pdf/10.1111/joes.12122 75 | WARNING:RichContext:no embedded PDF: 0b8c73a4cc256431bd0d https://onlinelibrary.wiley.com/doi/pdf/10.1111/joes.12122 76 | WARNING:RichContext:failed: 85bc3a4d4f38ff39baf4 http://www.annualreviews.org/doi/pdf/10.1146/annurev-criminol-032317-091915 77 | WARNING:RichContext:failed: 4a112f8ef5541d2a89a5 https://www.tandfonline.com/doi/pdf/10.1080/19320248.2013.786663?needAccess=true 78 | WARNING:RichContext:failed: 4895cb2001777057a23c https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/pbaf.12244 79 | WARNING:RichContext:no embedded PDF: 4895cb2001777057a23c https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/pbaf.12244 80 | WARNING:RichContext:failed: df4879004c5f8b41da49 https://onlinelibrary.wiley.com/doi/pdf/10.1111/ecoj.12415 81 | WARNING:RichContext:no embedded PDF: df4879004c5f8b41da49 https://onlinelibrary.wiley.com/doi/pdf/10.1111/ecoj.12415 82 | WARNING:RichContext:failed: 0de2b787b9fbf0067276 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ijpo.12562 83 | WARNING:RichContext:no embedded PDF: 0de2b787b9fbf0067276 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ijpo.12562 84 | WARNING:RichContext:failed: 64d397e1e2a4c4b7ca7a https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pam.22093 85 | WARNING:RichContext:no embedded PDF: 64d397e1e2a4c4b7ca7a https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pam.22093 86 | WARNING:RichContext:failed: d670e19e556e0a43fa05 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/coep.12266 87 | WARNING:RichContext:no embedded PDF: d670e19e556e0a43fa05 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/coep.12266 88 | WARNING:RichContext:failed: 0a54d3ff7a70f775f98c https://onlinelibrary.wiley.com/doi/pdf/10.1111/fare.12310 89 | WARNING:RichContext:no embedded PDF: 0a54d3ff7a70f775f98c https://onlinelibrary.wiley.com/doi/pdf/10.1111/fare.12310 90 | WARNING:RichContext:failed: 3519f2665128f8589e19 http://onlinelibrary.wiley.com/doi/10.1002/pop4.171/pdf 91 | WARNING:RichContext:no embedded PDF: 3519f2665128f8589e19 http://onlinelibrary.wiley.com/doi/10.1002/pop4.171/pdf 92 | WARNING:RichContext:failed: 118d90c6757823a66d60 https://doi.org/10.1093/aepp/ppy018 93 | WARNING:RichContext:failed: 18c64ba865ad68d6cf90 Retail oligopoly power, dairy compact, and Boston 
milk prices 94 | WARNING:RichContext:18c64ba865ad68d6cf90 Retail oligopoly power, dairy compact, and Boston milk prices request exception Invalid URL 'Retail oligopoly power, dairy compact, and Boston milk prices': No schema supplied. Perhaps you meant http://Retail oligopoly power, dairy compact, and Boston milk prices? 95 | WARNING:RichContext:failed: 0522d4cdb559dd0b03ee https://doi.org/10.21105/joss.01462 96 | WARNING:RichContext:failed: 8175cb931990b98a72d8 http://dataspace.princeton.edu/jspui/bitstream/88435/dsp01zw12z530b/1/441.pdf 97 | WARNING:RichContext:failed: 830fe95fd71f124a5e68 https://link.springer.com/content/pdf/10.1007%2Fs10896-019-00058-y.pdf 98 | WARNING:RichContext:failed: 95a93b5949b7868fcfe9 https://bmcmusculoskeletdisord.biomedcentral.com/track/pdf/10.1186/1471-2474-12-182 99 | WARNING:RichContext:failed: 756a33253efcecd3b9e5 https://onlinelibrary.wiley.com/doi/pdf/10.1111/mcn.12488 100 | WARNING:RichContext:no embedded PDF: 756a33253efcecd3b9e5 https://onlinelibrary.wiley.com/doi/pdf/10.1111/mcn.12488 101 | WARNING:RichContext:failed: efba7abdb9ccd597a997 https://doi.org/10.1016/j.appet.2014.12.003 102 | WARNING:RichContext:failed: a1d317452d59817cd6ff https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/oby.22540 103 | WARNING:RichContext:no embedded PDF: a1d317452d59817cd6ff https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/oby.22540 104 | WARNING:RichContext:failed: 89f6c098e8444e7a583b https://www.frontiersin.org/articles/10.3389/fenvs.2019.00068/pdf 105 | WARNING:RichContext:failed: e74e590413a621376e10 http://www.umass.edu/resec/faculty/rojas/docs/Taxes.pdf 106 | WARNING:RichContext:failed: 67e991d67225171ce669 http://journals.sagepub.com/doi/pdf/10.1177/0275074018765809 107 | WARNING:RichContext:failed: 571089b7e3a18f69602a https://doi.org/10.3934/environsci.2016.3.509 108 | WARNING:RichContext:failed: 526d34e526fe153f418b https://link.springer.com/content/pdf/10.1007%2F978-3-319-31816-5_2669-1.pdf 109 | WARNING:RichContext:failed: ac5b23a0dbc3312409c2 https://izajolp.springeropen.com/track/pdf/10.1186/s40173-015-0043-8 110 | WARNING:RichContext:ac5b23a0dbc3312409c2 https://izajolp.springeropen.com/track/pdf/10.1186/s40173-015-0043-8 request exception HTTPSConnectionPool(host='izajolp.springeropen.com', port=443): Read timed out. 
(read timeout=20) 111 | WARNING:RichContext:failed: 42660e147ce2722ca8c0 https://onlinelibrary.wiley.com/doi/pdf/10.1002/ajpa.22161 112 | WARNING:RichContext:no embedded PDF: 42660e147ce2722ca8c0 https://onlinelibrary.wiley.com/doi/pdf/10.1002/ajpa.22161 113 | WARNING:RichContext:failed: e3c089f9a76fa109516a http://policy.rutgers.edu/faculty/hetling/Hetling_Diversion_JPP.pdf 114 | WARNING:RichContext:failed: 9f66008fbb6b3f813781 http://dl.acm.org/ft_gateway.cfm?id=3232775&type=pdf 115 | WARNING:RichContext:failed: 28fe310677131a723ac9 https://dergipark.org.tr/tr/download/article-file/855095 116 | WARNING:RichContext:failed: d8f33b8436040b1230d0 https://academic.oup.com/restud/article-pdf/86/3/1170/28529276/rdy021.pdf 117 | WARNING:RichContext:failed: 70846c225e81d8944ef2 https://doi.org/10.7758/rsf.2015.1.1.07 118 | WARNING:RichContext:failed: 30c402dcc869061d60fb http://journals.sagepub.com/doi/pdf/10.1177/0734016818769705 119 | WARNING:RichContext:failed: 87699b623cea5b299453 https://papers.ssrn.com/sol3/Delivery.cfm?abstractid=966252 120 | WARNING:RichContext:failed: 45585929322868e33f9e https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2019JD031232 121 | WARNING:RichContext:no embedded PDF: 45585929322868e33f9e https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2019JD031232 122 | WARNING:RichContext:failed: 3e1934281bcca346b4fb https://doi.org/10.4103/jfmpc.jfmpc_185_17 123 | WARNING:RichContext:failed: 5a6683af271dd94c08bf https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/joc.6367 124 | WARNING:RichContext:no embedded PDF: 5a6683af271dd94c08bf https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/joc.6367 125 | WARNING:RichContext:failed: fe2001372b6842192771 https://onlinelibrary.wiley.com/doi/pdf/10.1111/birt.12394 126 | WARNING:RichContext:no embedded PDF: fe2001372b6842192771 https://onlinelibrary.wiley.com/doi/pdf/10.1111/birt.12394 127 | WARNING:RichContext:failed: 27f9858462c84b19ec3b https://journals.sagepub.com/doi/pdf/10.1177/0002716219884546 128 | WARNING:RichContext:failed: 9502d3e234e31a774907 https://doi.org/10.1016/j.amepre.2016.08.023 129 | WARNING:RichContext:failed: 835616fa5075a00a4948 https://doi.org/10.5530/ami.2015.1.10 130 | WARNING:RichContext:failed: bab9c37e1d71bc2d046a https://onlinelibrary.wiley.com/doi/pdf/10.1111/1745-9133.12359 131 | WARNING:RichContext:no embedded PDF: bab9c37e1d71bc2d046a https://onlinelibrary.wiley.com/doi/pdf/10.1111/1745-9133.12359 132 | WARNING:RichContext:failed: 30f40b6634264fc34af1 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6179934 133 | WARNING:RichContext:30f40b6634264fc34af1 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6179934 request exception ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) 134 | WARNING:RichContext:failed: d11ed61dee9ed87ac448 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/qj.3651 135 | WARNING:RichContext:no embedded PDF: d11ed61dee9ed87ac448 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/qj.3651 136 | WARNING:RichContext:failed: 9ce19e2215388a5a3821 https://www.frontiersin.org/articles/10.3389/fbuil.2019.00105/pdf 137 | WARNING:RichContext:failed: 36bc15e99d84db9278fa https://link.springer.com/content/pdf/10.1007%2Fs12571-017-0733-8.pdf 138 | WARNING:RichContext:failed: 9d2897d8e568f46e0472 https://onlinelibrary.wiley.com/doi/pdfdirect/10.4073/csr.2015.2 139 | WARNING:RichContext:no embedded PDF: 9d2897d8e568f46e0472 https://onlinelibrary.wiley.com/doi/pdfdirect/10.4073/csr.2015.2 140 | 
WARNING:RichContext:failed: 2dad560eb70b1ed8f114 https://onlinelibrary.wiley.com/doi/pdf/10.1002/agr.21537 141 | WARNING:RichContext:no embedded PDF: 2dad560eb70b1ed8f114 https://onlinelibrary.wiley.com/doi/pdf/10.1002/agr.21537 142 | WARNING:RichContext:failed: 136ec8e7c40289836352 http://fmpc.uconn.edu/publications/rr/rr37.pdf 143 | WARNING:RichContext:136ec8e7c40289836352 http://fmpc.uconn.edu/publications/rr/rr37.pdf request exception HTTPConnectionPool(host='fmpc.uconn.edu', port=80): Max retries exceeded with url: /publications/rr/rr37.pdf (Caused by NewConnectionError(': Failed to establish a new connection: [Errno -2] Name or service not known')) 144 | WARNING:RichContext:failed: 95cd3d20b62594528334 http://hdl.handle.net/20.500.11794/10748 145 | WARNING:RichContext:95cd3d20b62594528334 http://hdl.handle.net/20.500.11794/10748 request exception HTTPSConnectionPool(host='corpus.ulaval.ca', port=443): Read timed out. (read timeout=20) 146 | WARNING:RichContext:failed: 0b29b6675fc51a1cc47d https://doi.org/10.1016/j.ijlp.2018.10.004 147 | WARNING:RichContext:failed: c3c44096bf0e3c9ffbf3 https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2018MS001595 148 | WARNING:RichContext:no embedded PDF: c3c44096bf0e3c9ffbf3 https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2018MS001595 149 | WARNING:RichContext:failed: 9aa50a5f06b9166f4715 https://doi.org/10.1016/j.jbusvent.2018.10.005 150 | WARNING:RichContext:failed: b52a4d9ed708d23ab31f https://manuscript.elsevier.com/S027795361730237X/pdf/S027795361730237X.pdf 151 | WARNING:RichContext:b52a4d9ed708d23ab31f https://manuscript.elsevier.com/S027795361730237X/pdf/S027795361730237X.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S027795361730237X/pdf/S027795361730237X.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 152 | WARNING:RichContext:failed: 6b8a515fa898d061c94e https://doi.org/10.1016/j.jrurstud.2016.02.001 153 | WARNING:RichContext:failed: 6cea458960193ca6c4c8 https://doi.org/10.1093/ajae/aay002 154 | WARNING:RichContext:failed: d1f47b6efca463251268 https://nyaspubs.onlinelibrary.wiley.com/doi/pdf/10.1111/nyas.12594 155 | WARNING:RichContext:no embedded PDF: d1f47b6efca463251268 https://nyaspubs.onlinelibrary.wiley.com/doi/pdf/10.1111/nyas.12594 156 | WARNING:RichContext:failed: b14b63bddb9a8e336432 http://journals.sagepub.com/doi/pdf/10.1177/0269215518758483 157 | WARNING:RichContext:failed: d1b427e76cad5fdac247 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/padr.12227 158 | WARNING:RichContext:no embedded PDF: d1b427e76cad5fdac247 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/padr.12227 159 | WARNING:RichContext:failed: 505caa8f96f691006fc6 https://journals.sagepub.com/doi/pdf/10.1177/0890117118786871 160 | WARNING:RichContext:failed: becacc4f9fe1589c4db4 https://manuscript.elsevier.com/S1499404616300318/pdf/S1499404616300318.pdf 161 | WARNING:RichContext:becacc4f9fe1589c4db4 https://manuscript.elsevier.com/S1499404616300318/pdf/S1499404616300318.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S1499404616300318/pdf/S1499404616300318.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 162 | WARNING:RichContext:failed: 21783645b6d33194d0f7 
https://www.tandfonline.com/doi/pdf/10.1080/10511482.2014.1003190?needAccess=true 163 | WARNING:RichContext:failed: d149447ba1b82d8b9ad2 https://onlinelibrary.wiley.com/doi/pdf/10.1111/ecog.04028 164 | WARNING:RichContext:no embedded PDF: d149447ba1b82d8b9ad2 https://onlinelibrary.wiley.com/doi/pdf/10.1111/ecog.04028 165 | WARNING:RichContext:failed: ba8305c710558f122b1d http://www.jneb.org/article/S1499404619300740/pdf 166 | WARNING:RichContext:failed: 86ac8d47e706be6c8219 https://doi.org/10.17000/kspr.25.4.201812.199 167 | WARNING:RichContext:failed: a9ca248fe2f8bdb1d751 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/1745-9125.12199 168 | WARNING:RichContext:no embedded PDF: a9ca248fe2f8bdb1d751 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/1745-9125.12199 169 | WARNING:RichContext:failed: ef8f24963245a184e40f http://crc.nv.gov/docs/forecst_wksp/developmentspaper.pdf 170 | WARNING:RichContext:failed: 8c9ccf399eebc2338418 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/jia2.25306 171 | WARNING:RichContext:no embedded PDF: 8c9ccf399eebc2338418 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/jia2.25306 172 | WARNING:RichContext:failed: 209e57ded73099de4596 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/met.1862 173 | WARNING:RichContext:no embedded PDF: 209e57ded73099de4596 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/met.1862 174 | WARNING:RichContext:failed: 1af374454453aa5d5825 https://manuscript.elsevier.com/S1570677X16300326/pdf/S1570677X16300326.pdf 175 | WARNING:RichContext:1af374454453aa5d5825 https://manuscript.elsevier.com/S1570677X16300326/pdf/S1570677X16300326.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S1570677X16300326/pdf/S1570677X16300326.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 176 | WARNING:RichContext:failed: ac1a920b63e1d7fb783f http://www.degruyter.com/downloadpdf/j/jetl.2019.10.issue-2/jetl-2019-0108/jetl-2019-0108.xml 177 | WARNING:RichContext:failed: 6326552bd07f2ec4aeaf http://pdfs.semanticscholar.org/ca60/d8c02ca0590e15110e1b7b854c34ca9a52f2.pdf 178 | WARNING:RichContext:failed: ec6d8c7e87ae9bfbf142 http://www.tandfonline.com/doi/pdf/10.1080/00036846.2011.568397 179 | WARNING:RichContext:failed: 618e73b7a15fbdd9f814 https://pdfs.journals.lww.com/jaids/2015/07010/The_Association_Between_Food_Insufficiency_and_HIV.11.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1542048624015;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdmXWu5dPlhDubainwsnRl16RkxGril9arViLOJfOXddA=;hash|jfKu6Iv3RW2TNcA9hXUj8A== 180 | WARNING:RichContext:failed: 83a4e67742aabc45f9dc https://doi.org/10.1016/j.ocecoaman.2016.12.014 181 | WARNING:RichContext:failed: 77f7e285b82c924fd01d https://manuscript.elsevier.com/S030691921600018X/pdf/S030691921600018X.pdf 182 | WARNING:RichContext:77f7e285b82c924fd01d https://manuscript.elsevier.com/S030691921600018X/pdf/S030691921600018X.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S030691921600018X/pdf/S030691921600018X.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify 
failed')])"))) 183 | WARNING:RichContext:failed: ada821e909a30a8c6e0d https://doi.org/10.1016/j.childyouth.2014.04.003 184 | WARNING:RichContext:failed: 034349cf00da5e130480 http://archive.nyu.edu/bitstream/2451/31847/2/Wachtel_ProCyclicalCapital_Jul2013.pdf 185 | WARNING:RichContext:034349cf00da5e130480 http://archive.nyu.edu/bitstream/2451/31847/2/Wachtel_ProCyclicalCapital_Jul2013.pdf request exception HTTPSConnectionPool(host='archive.nyu.edu', port=443): Max retries exceeded with url: /bitstream/2451/31847/2/Wachtel_ProCyclicalCapital_Jul2013.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 186 | WARNING:RichContext:failed: 49f839e31fe22701509f http://pdfs.semanticscholar.org/cb1e/f995d26cda189479736850b8c10b6fe3cf79.pdf 187 | WARNING:RichContext:failed: d2207b8ab7654cac902c https://doi.org/10.7554/elife.32822 188 | WARNING:RichContext:failed: 0ff3b036d813af980976 https://doi.org/10.1093/erae/jby026 189 | WARNING:RichContext:failed: 33d8143055f59f2829af https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/hsc.12619 190 | WARNING:RichContext:no embedded PDF: 33d8143055f59f2829af https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/hsc.12619 191 | WARNING:RichContext:failed: 417299798d839a7ef7e8 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4330833/pdf 192 | WARNING:RichContext:417299798d839a7ef7e8 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4330833/pdf request exception ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) 193 | WARNING:RichContext:failed: f4a39744928ec6a912bb https://doi.org/10.3168/jds.s0022-0302(78)94422-3 194 | WARNING:RichContext:failed: f65c9b9345c259191f8c https://academic.oup.com/psychsocgerontology/article-pdf/62/4/S209/1383923/S209.pdf 195 | WARNING:RichContext:failed: e26969208ccf52d42846 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jftr.12302 196 | WARNING:RichContext:no embedded PDF: e26969208ccf52d42846 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jftr.12302 197 | WARNING:RichContext:failed: 646197a01f532c943874 https://manuscript.elsevier.com/S027795361530280X/pdf/S027795361530280X.pdf 198 | WARNING:RichContext:646197a01f532c943874 https://manuscript.elsevier.com/S027795361530280X/pdf/S027795361530280X.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S027795361530280X/pdf/S027795361530280X.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 199 | WARNING:RichContext:failed: c6cbfb4533f33f1f895d https://doi.org/10.1016/j.marpol.2016.04.030 200 | WARNING:RichContext:failed: 1a2665361435e5929d9a https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pop4.203 201 | WARNING:RichContext:no embedded PDF: 1a2665361435e5929d9a https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pop4.203 202 | WARNING:RichContext:failed: 144fa5560b013a344936 http://repositorio.ul.pt/bitstream/10451/38733/1/ICS_SGCardoso_et_al_School.pdf 203 | WARNING:RichContext:144fa5560b013a344936 http://repositorio.ul.pt/bitstream/10451/38733/1/ICS_SGCardoso_et_al_School.pdf request exception HTTPSConnectionPool(host='repositorio.ul.pt', port=443): Max retries exceeded with url: /bitstream/10451/38733/1/ICS_SGCardoso_et_al_School.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 204 | 
WARNING:RichContext:failed: 47e1bc82160141ce0352 https://doi.org/10.1093/jssam/smz024 205 | WARNING:RichContext:failed: 1c94ef1e2f327d388feb https://papers.ssrn.com/sol3/Delivery.cfm?abstractid=2056045 206 | WARNING:RichContext:failed: aad7a5f99885cfa59d83 https://onlinelibrary.wiley.com/doi/pdf/10.1111/jora.12272 207 | WARNING:RichContext:no embedded PDF: aad7a5f99885cfa59d83 https://onlinelibrary.wiley.com/doi/pdf/10.1111/jora.12272 208 | WARNING:RichContext:failed: 40f8b8f798ae78fef5b3 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ecin.12842 209 | WARNING:RichContext:no embedded PDF: 40f8b8f798ae78fef5b3 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ecin.12842 210 | WARNING:RichContext:failed: 8f616ddb80faace26d72 https://doi.org/10.3982/qe564 211 | WARNING:RichContext:failed: 778a15d3c9f53050675e https://doi.org/10.1016/j.childyouth.2016.10.018 212 | WARNING:RichContext:failed: 1df1bb6a76cf596bccf8 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ecin.12837 213 | WARNING:RichContext:no embedded PDF: 1df1bb6a76cf596bccf8 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ecin.12837 214 | WARNING:RichContext:failed: 70a5a100bf7301a203bf https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/oby.21862 215 | WARNING:RichContext:no embedded PDF: 70a5a100bf7301a203bf https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/oby.21862 216 | WARNING:RichContext:failed: 2d0084671761a8116774 https://nyaspubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/nyas.13396 217 | WARNING:RichContext:no embedded PDF: 2d0084671761a8116774 https://nyaspubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/nyas.13396 218 | WARNING:RichContext:failed: f74c7adc3a94432f1535 https://onlinelibrary.wiley.com/doi/pdf/10.1111/phn.12375 219 | WARNING:RichContext:no embedded PDF: f74c7adc3a94432f1535 https://onlinelibrary.wiley.com/doi/pdf/10.1111/phn.12375 220 | WARNING:RichContext:failed: 436d4a380504abf1cb3c https://doi.org/10.5993/ajhb.42.1.3 221 | WARNING:RichContext:failed: 050f7524a00fccc84e03 https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/cdev.12764 222 | WARNING:RichContext:no embedded PDF: 050f7524a00fccc84e03 https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/cdev.12764 223 | WARNING:RichContext:failed: fddf252c8b7f7ab55e69 https://www.dllr.state.md.us/employment/unemployment.shtml 224 | WARNING:RichContext:fddf252c8b7f7ab55e69 https://www.dllr.state.md.us/employment/unemployment.shtml request exception HTTPSConnectionPool(host='www.dllr.state.md.us', port=443): Max retries exceeded with url: /employment/unemployment.shtml (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 225 | WARNING:RichContext:failed: 722701806b8efd709030 https://www.edd.ca.gov/unemployment/ 226 | WARNING:RichContext:722701806b8efd709030 https://www.edd.ca.gov/unemployment/ request exception HTTPSConnectionPool(host='www.edd.ca.gov', port=443): Max retries exceeded with url: /unemployment/ (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 227 | WARNING:RichContext:failed: ae027bea3fce790000b1 https://dew.sc.gov/tools-resources/data-statistics 228 | WARNING:RichContext:ae027bea3fce790000b1 https://dew.sc.gov/tools-resources/data-statistics request exception HTTPSConnectionPool(host='dew.sc.gov', port=443): Max retries exceeded with url: /tools-resources/data-statistics (Caused by SSLError(SSLError("bad handshake: Error([('SSL 
routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 229 | WARNING:RichContext:failed: e84ccf3e64a9f10a03a5 https://otda.ny.gov/resources/caseload/ 230 | WARNING:RichContext:e84ccf3e64a9f10a03a5 https://otda.ny.gov/resources/caseload/ request exception HTTPSConnectionPool(host='otda.ny.gov', port=443): Max retries exceeded with url: /resources/caseload/ (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 231 | WARNING:RichContext:failed: 7785c8d5d201ff8ba4da http://www23.statcan.gc.ca/imdb/p2SV.pl?Function=getSurvey&Id=795204 232 | WARNING:RichContext:7785c8d5d201ff8ba4da http://www23.statcan.gc.ca/imdb/p2SV.pl?Function=getSurvey&Id=795204 request exception ('Connection aborted.', OSError("(104, 'ECONNRESET')")) 233 | WARNING:RichContext:number of research publications: 1604 234 | WARNING:RichContext:successfully downloaded 1452 PDF files 235 | WARNING:RichContext:number of datasets: 208 236 | WARNING:RichContext:successfully downloaded 199 resource files 237 | --------------------------------------------------------------------------------