├── lib
│   └── .placeholder
├── docs
│   ├── corpus.png
│   └── uptod.png
├── MANIFEST.txt
├── vocab.json
├── requirements.txt
├── bin
│   ├── parsr_test.sh
│   ├── download_s3.py
│   ├── index_phrases.py
│   ├── parsr.py
│   ├── textrank_test.py
│   ├── sampleConfig.json
│   ├── parsed_json_interpreter.py
│   ├── phrase_extraction_pipeline.py
│   ├── extract_text.py
│   ├── parsr_output_interpreter.py
│   ├── upload_s3.py
│   ├── parsr_client.py
│   └── download_resources.py
├── .gitignore
├── DOWNLOAD.md
├── test.py
├── README.md
├── LICENSE
└── errors.txt

/lib/.placeholder:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/docs/corpus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Coleridge-Initiative/rclc/HEAD/docs/corpus.png
--------------------------------------------------------------------------------
/docs/uptod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Coleridge-Initiative/rclc/HEAD/docs/uptod.png
--------------------------------------------------------------------------------
/MANIFEST.txt:
--------------------------------------------------------------------------------
date: 2020-03-01
release: v1.0.8
uploaded_pdf: 1452
uploaded_json: 76
uploaded_txt: 1279
--------------------------------------------------------------------------------
/vocab.json:
--------------------------------------------------------------------------------
{
    "@language": "en",
    "adrf": "https://github.com/Coleridge-Initiative/adrf-onto/wiki/Vocabulary#",
    "cito": "http://purl.org/spar/cito/",
    "dct": "http://purl.org/dc/terms/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
}
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
GitPython >= 3.0.5
awscli >= 1.16.302
beautifulsoup4 >= 4.8.0
boto3 >= 1.10.38
html5lib >= 1.0.1
mypy >= 0.730
networkx >= 2.4
pdfminer.six == 20181108
pytextrank >= 2.0.0
ray >= 0.6.5
rdflib >= 4.2.2
rdflib-jsonld >= 0.4.0
requests-html >= 0.10.0
spacy >= 2.2.3
sxsdiff >= 0.3.0
tqdm >= 4.37.0
--------------------------------------------------------------------------------
/bin/parsr_test.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

PARSR=34.82.183.17:3001

curl -X GET \
    http://$PARSR/api/v1/queue/1ec3b910b86f2bb329684cd5763d56


exit 0

curl -X POST \
    http://$PARSR/api/v1/document \
    -H 'Content-Type: multipart/form-data' \
    -F 'file=@resources/pub/pdf/a6024f82cef41d533019.pdf;type=application/pdf' \
    -F 'config=@bin/sampleConfig.json;type=application/json'
--------------------------------------------------------------------------------
/bin/download_s3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

import boto3

# initialize access to the storage grid bucket
bucket_name = "richcontext"
bucket = boto3.resource("s3").Bucket(bucket_name)

# list up to N keys for files within our pseudo-directory
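# note: boto3 paginates this collection lazily, so iteration scales to
# large prefixes; bucket.objects.filter(Prefix=prefix).limit(10) would be
# an equivalent, more concise way to cap the listing than the manual
# countdown below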
prefix = "corpus_docs" 12 | limit = 10 13 | 14 | for obj in bucket.objects.filter(Prefix=prefix): 15 | if limit < 1: 16 | break 17 | else: 18 | print(obj.key) 19 | limit -= 1 20 | 21 | # show an example of how to download a specific file 22 | local_file = "001966ac583b67a965cf.json" 23 | key = prefix + "/pub/json/" + local_file 24 | 25 | bucket.download_file(key, local_file) 26 | 27 | -------------------------------------------------------------------------------- /bin/index_phrases.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | from collections import OrderedDict 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | import codecs 8 | import json 9 | import pytextrank 10 | import ray 11 | import spacy 12 | import sys 13 | 14 | 15 | @ray.remote 16 | def extract_phrases (txt_file, dir_path, nlp): 17 | tr_path = dir_path / "tr" 18 | 19 | with codecs.open(txt_file, "r", encoding="utf8") as f: 20 | text = f.read() 21 | doc = nlp(text) 22 | view = OrderedDict() 23 | 24 | for phrase in doc._.phrases[:20]: 25 | view[phrase.text] = { "count": phrase.count, "rank_score": phrase.rank } 26 | 27 | file_name = txt_file.stem + ".json" 28 | tr_file = tr_path / file_name 29 | 30 | with codecs.open(tr_file, "wb", encoding="utf8") as f: 31 | json.dump(view, f, indent=4, ensure_ascii=False) 32 | 33 | 34 | def main (): 35 | nlp = spacy.load("en_core_web_sm") 36 | tr = pytextrank.TextRank(logger=None) 37 | nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True) 38 | 39 | dir_path = Path("resources/pub") 40 | txt_path = dir_path / "txt" 41 | 42 | task_ids = [] 43 | ray.init() 44 | 45 | for txt_file in tqdm(list(txt_path.glob(f"*txt")), ascii=True, desc=f"extracted text files"): 46 | id = extract_phrases.remote(txt_file, dir_path, nlp) 47 | task_ids.append(id) 48 | 49 | ray.get(task_ids) 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .DS_Store 3 | 4 | example/pub/tr 5 | tmp.ttl 6 | tmp.jsonld 7 | todo.tsv 8 | resources/ 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # Environments 94 | .env 95 | .venv 96 | env/ 97 | venv/ 98 | ENV/ 99 | env.bak/ 100 | venv.bak/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | -------------------------------------------------------------------------------- /bin/parsr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | from parsed_json_interpreter import mkdir 5 | from parsr_client import ParserClient 6 | from pathlib import Path 7 | import codecs 8 | import json 9 | import os 10 | import sys 11 | import traceback 12 | 13 | 14 | def Convert (base_path=".", force=False): 15 | config_path = Path(base_path) / "bin/sampleConfig.json" 16 | 17 | pub_dir = Path(base_path) / "resources/pub" 18 | 19 | json_dir = pub_dir / "json" 20 | mkdir(json_dir) 21 | 22 | txt_dir = pub_dir / "txt" 23 | mkdir(txt_dir) 24 | 25 | pdf_dir = pub_dir / "pdf" 26 | 27 | for pdf_file in list(pdf_dir.glob("*.pdf")): 28 | json_file = pdf_file.stem + ".json" 29 | json_path = json_dir / json_file 30 | 31 | if json_path.exists() and not force: 32 | # ignore the PDFs that were already parsed 33 | continue 34 | 35 | # send document to Parsr server for processing 36 | try: 37 | print(f"parsing {pdf_file}") 38 | 39 | job = parsr.send_document( 40 | file=pdf_file.as_posix(), 41 | config=config_path.as_posix(), 42 | wait_till_finished=True, 43 | save_request_id=True, 44 | ) 45 | 46 | # output the full results in JSON 47 | with codecs.open(json_path, "wb", encoding="utf8") as f: 48 | json.dump(parsr.get_json(), f, indent=2, ensure_ascii=False) 49 | 50 | # output the raw text 51 | txt_file = pdf_file.stem + ".txt" 52 | txt_path = txt_dir / txt_file 53 | 54 | with codecs.open(txt_path, "wb", encoding="utf8") as f: 55 | f.write(parsr.get_text()) 56 | 57 | except: 58 | traceback.print_exc() 59 | 60 | 61 | if __name__ == "__main__": 62 | if len(sys.argv) < 2: 63 | print("usage: parsr.py host:port") 64 | sys.exit(-1) 65 | 66 | server = sys.argv[1] 67 | path = os.path.dirname(os.path.dirname(__file__)) 68 | 69 | print(f"using Parsr server {server}") 70 | print(f"save to path {path}") 71 | 72 | parsr = ParserClient(server) 73 | Convert(path) 74 | -------------------------------------------------------------------------------- /DOWNLOAD.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | 3 | To install the Python library dependencies: 4 | 5 | ``` 6 | pip install -r requirements.txt 7 | ``` 8 | 9 | 10 | ## Download Parsed PDFs 11 | 12 | The 
`bin/download_s3.py` script provides example code for downloading
PDF files (open access publications) and JSON files (extracted text)
from the public S3 bucket.


## Collecting Open Access PDFs

**For those on the NYU-CI team who update the corpus:**

Download the corpus PDFs and other resource files:

```
python bin/download_resources.py --logger errors.txt
```

The PDF files get stored in the `resources/pub/pdf` subdirectory.


## Extract text from PDFs

We use `science-parse` to extract text from research publications.
Download the latest `science-parse-cli-assembly-*.jar` from the
`science-parse` project releases,
and copy that JAR file into the `lib/` subdirectory.

Then run the `science-parse` CLI to extract text from the PDF files,
and be sure to use the correct version number for the JAR that you
downloaded:

```
mkdir -p resources/pub/json
SPJAR=lib/science-parse-cli-assembly-2.0.3.jar
java -jar $SPJAR -o ./resources/pub/json ./resources/pub/pdf
```

That command will download multiple resources from the Allen AI public
datastore, which may take several minutes.

TODO: replace this step with use of a containerized `SPv2` server.


## Upload PDF and JSON files

**For those on the NYU-CI team who update the corpus:**

Upload the PDF files (open access publications) and JSON files
(extracted text) to the public S3 bucket:

```
python bin/upload_s3.py
```


## S3 Bucket Specs

View the public AWS S3 Bucket `richcontext` online:

-
-

The directory structure of the public S3 bucket is similar to the
directory structure used for resources in this repo:

- richcontext
    - corpus_docs
        - pdf
        - json
--------------------------------------------------------------------------------
/bin/textrank_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8


from phrase_extraction_pipeline import setup, extract_phrases
from parsed_json_interpreter import ParsedJsonInterpreter
from pathlib import Path
import pytextrank
import codecs
import json
import spacy
import unittest


class TestVerifyTextRank (unittest.TestCase):
    EXAMPLE_TITLE = [
        "trajGANs: Using generative adversarial networks for geo-privacy protection of trajectory data (Vision paper) ",
        "1 Introduction and motivation ",
        "2 Trajectory types and data generation scenarios ",
        "3 The trajGANs framework ",
        "5 Conclusions and Discussion ",
        "References "
    ]


    EXAMPLE_TEXTRANK = [
        ["generative adversarial networks", "real data", "xi hanzhou chen1"],
        ["real data", "data", "trajectory data"],
        ["place- based trajectories", "synthetic trajectories", "human trajectories"],
        ["place- based trajectories", "synthetic trajectory samples", "synthetic trajectories"],
        ["real data", "original data", "pre-calculated statistical metrics"],
        ["generative adversarial networks", "deep convolutional generative adversarial networks", "s."]
    ]


    def setUp (self):
        '''run the example file'''
        nlp, resource_path = setup(testing=True)
        extract_phrases(nlp, resource_path, limit_keyphrase=3, verbose=False)

        tr_path = resource_path / "tr"
        tr_file =
tr_path / "PE_Example.json" 43 | 44 | with codecs.open(tr_file, "r", encoding="utf8") as f: 45 | self.example_file = json.load(f) 46 | 47 | 48 | def test_key_phrases (self): 49 | for i, section in enumerate(self.example_file): 50 | for j, textrank in enumerate(section["text_rank"]): 51 | self.assertTrue(textrank == self.EXAMPLE_TEXTRANK[i][j]) 52 | 53 | 54 | def test_section_titles (self): 55 | for i, section in enumerate(self.example_file): 56 | self.assertTrue(section["section_title"] == self.EXAMPLE_TITLE[i]) 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | 62 | 63 | -------------------------------------------------------------------------------- /bin/sampleConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 0.5, 3 | "extractor": { 4 | "pdf": "pdfminer", 5 | "img": "tesseract", 6 | "language": ["eng", "fra"] 7 | }, 8 | "cleaner": [ 9 | "out-of-page-removal", 10 | [ 11 | "whitespace-removal", 12 | { 13 | "minWidth": 0 14 | } 15 | ], 16 | [ 17 | "redundancy-detection", 18 | { 19 | "minOverlap": 0.5 20 | } 21 | ], 22 | [ 23 | "table-detection", 24 | { 25 | "runConfig": [ 26 | { 27 | "pages": [], 28 | "flavor": "lattice" 29 | } 30 | ] 31 | } 32 | ], 33 | [ 34 | "header-footer-detection", 35 | { 36 | "ignorePages": [], 37 | "maxMarginPercentage": 15 38 | } 39 | ], 40 | [ 41 | "reading-order-detection", 42 | { 43 | "minVerticalGapWidth": 5, 44 | "minColumnWidthInPagePercent": 15 45 | } 46 | ], 47 | "link-detection", 48 | [ 49 | "words-to-line", 50 | { 51 | "lineHeightUncertainty": 0.2, 52 | "topUncertainty": 0.4, 53 | "maximumSpaceBetweenWords": 100, 54 | "mergeTableElements": false 55 | } 56 | ], 57 | [ 58 | "lines-to-paragraph", 59 | { 60 | "tolerance": 0.25 61 | } 62 | ], 63 | "heading-detection", 64 | "list-detection", 65 | "page-number-detection", 66 | "hierarchy-detection", 67 | [ 68 | "regex-matcher", 69 | { 70 | "isCaseSensitive": true, 71 | "isGlobal": true, 72 | "queries": [ 73 | { 74 | "label": "Car", 75 | "regex": "([A-Z]{2}\\-[\\d]{3}\\-[A-Z]{2})" 76 | }, 77 | { 78 | "label": "Age", 79 | "regex": "(\\d+)[ -]*(ans|jarige)" 80 | }, 81 | { 82 | "label": "Percent", 83 | "regex": "([\\-]?(\\d)+[\\.\\,]*(\\d)*)[ ]*(%|per|percent|pourcent|procent)" 84 | } 85 | ] 86 | } 87 | ] 88 | ], 89 | "output": { 90 | "granularity": "word", 91 | "includeMarginals": false, 92 | "formats": { 93 | "json": true, 94 | "text": true, 95 | "csv": true, 96 | "markdown": true, 97 | "pdf": false 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import json 5 | import networkx as nx 6 | import os 7 | import rdflib 8 | import sys 9 | import tempfile 10 | 11 | 12 | ###################################################################### 13 | ## NetworkX 14 | 15 | LABEL = {} 16 | 17 | def get_item_index (item): 18 | global LABEL 19 | item = str(item) 20 | 21 | if item not in LABEL: 22 | index = len(LABEL) 23 | LABEL[item] = index 24 | else: 25 | index = LABEL[item] 26 | 27 | return index 28 | 29 | 30 | def make_nxgraph (graph): 31 | g = nx.Graph() 32 | 33 | for s, p, o in graph: 34 | s_idx = get_item_index(s) 35 | o_idx = get_item_index(o) 36 | 37 | print(s_idx, str(s)) 38 | print(o_idx, str(o)) 39 | 40 | g.graph[s_idx] = str(s) 41 | g.graph[o_idx] = str(o) 42 | 43 | e = (s_idx, o_idx) 44 | g.add_edge(*e) 45 | 46 | 
g[s_idx][o_idx]["label"] = str(p) 47 | 48 | print(g.graph) 49 | 50 | 51 | def wrap_token (token): 52 | if token.startswith("http"): 53 | return "<{}>".format(token) 54 | else: 55 | return "\"{}\"".format(token) 56 | 57 | 58 | PREAMBLE = """ 59 | @base . 60 | 61 | @prefix cito: . 62 | @prefix dct: . 63 | @prefix foaf: . 64 | @prefix rdf: . 65 | @prefix xsd: . 66 | """ 67 | 68 | 69 | if __name__ == "__main__": 70 | # load the graph 71 | filename = sys.argv[1] 72 | graph = rdflib.Graph().parse(filename, format="n3") 73 | 74 | # enumerate all of the relations 75 | term = "dataset-11a95bfc951f7d23206a" 76 | out_triples = set([]) 77 | 78 | for s, p, o in graph: 79 | if s.endswith(term): 80 | out_triples.add((s, p, o,)) 81 | 82 | elif o.endswith(term): 83 | out_triples.add((s, p, o,)) 84 | 85 | ## write to in-memory file 86 | f = tempfile.NamedTemporaryFile(delete=False) 87 | f.write(PREAMBLE.encode("utf-8")) 88 | 89 | for s, p, o in out_triples: 90 | line = "{} {} {} .\n".format(wrap_token(s), wrap_token(p), wrap_token(o)) 91 | f.write(line.encode("utf-8")) 92 | 93 | f.close() 94 | 95 | # serialize the graph as JSON-LD 96 | with open("vocab.json", "r") as v: 97 | context = json.load(v) 98 | 99 | graph = rdflib.Graph().parse(f.name, format="n3") 100 | os.unlink(f.name) 101 | 102 | buf = graph.serialize(format="json-ld", context=context, indent=None) 103 | print(buf) 104 | 105 | 106 | -------------------------------------------------------------------------------- /bin/parsed_json_interpreter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import errno 5 | import os 6 | 7 | 8 | class ParsedJsonInterpreter (object): 9 | 10 | def __init__ (self, object): 11 | self.object = object 12 | 13 | 14 | def FindObject (self, object): 15 | """ 16 | Parse the JSON and convert it to text, divided by sections, 17 | and extract the title of the section 18 | """ 19 | texts = [] 20 | res = [] 21 | titles = [] 22 | 23 | for page in self.object['pages']: 24 | for element in page['elements']: 25 | try: 26 | if element['type'] == 'heading': 27 | title = self.GetText(element) 28 | titles.append(title) 29 | texts.append(res) 30 | res = [] 31 | 32 | if element['type'] in ['word', 'line', 'character', 'paragraph', 'heading', 'list']: 33 | res.append(element) 34 | except TypeError: 35 | continue 36 | 37 | texts.append(res) 38 | return texts[1:], titles 39 | 40 | 41 | def GetText (self, text_object): 42 | result = "" 43 | 44 | if text_object['type'] in ['paragraph','heading','list']: 45 | for i in text_object['content']: 46 | result += self.GetText(i) 47 | 48 | if text_object['type'] in ['line']: 49 | for i in text_object['content']: 50 | result += self.GetText(i) 51 | 52 | elif text_object['type'] in ['word']: 53 | if type(text_object['content']) is list: 54 | for i in text_object['content']: 55 | result += self.GetText(i) 56 | else: 57 | result += text_object['content'] 58 | result += ' ' 59 | 60 | elif text_object['type'] in ['character']: 61 | result += text_object['content'] 62 | 63 | return result 64 | 65 | 66 | def GetSectionalText (self, object): 67 | """ 68 | Get the text of a section 69 | """ 70 | text = "" 71 | sections = [] 72 | text_lists, titles = self.FindObject(object) 73 | 74 | for text_list in text_lists: 75 | for text_Obj in text_list: 76 | text += self.GetText(text_Obj) 77 | text += '\n\n' 78 | 79 | sections.append(text) 80 | text = "" 81 | 82 | return sections, titles 83 | 84 | 85 | def mkdir (path): 86 | """ 87 
    """
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
--------------------------------------------------------------------------------
/bin/phrase_extraction_pipeline.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

from parsed_json_interpreter import ParsedJsonInterpreter, mkdir
from pathlib import Path
import codecs
import json
import pytextrank
import spacy
import sys


def setup (base_path=".", testing=False):
    """
    add PyTextRank into the spaCy pipeline, then set up the input
    directory path for test vs. production env
    """
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank(logger=None)

    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    if testing:
        resource_path = Path(base_path) / "example/pub"
    else:
        resource_path = Path(base_path) / "resources/pub"

    return nlp, resource_path


def extract_phrases (nlp, resource_path, limit_keyphrase=15, verbose=True):
    """
    run PyTextRank on Parsr output to extract and rank key phrases
    """
    json_dir = resource_path / "json"

    if verbose:
        print(f"scanning input directory: {json_dir}")

    for parse_file in list(json_dir.glob("*.json")):
        if verbose:
            print(f"loading {parse_file}")

        with codecs.open(parse_file, "r", encoding="utf8") as f:
            parsr_object = json.load(f)

        # parse the JSON and convert it to text, divided by sections,
        # then extract the title of each section
        parsr_interpreter = ParsedJsonInterpreter(parsr_object)
        sections, titles = parsr_interpreter.GetSectionalText(parsr_object)

        # run TextRank and collect the ranked keyphrases
        results = []

        for i, section in enumerate(sections):
            doc = nlp(section)
            phrases = {}
            final = {}

            for phrase in doc._.phrases[:limit_keyphrase]:
                phrases[phrase.text] = {"count": phrase.count, "rank_score": phrase.rank}

            final["section_title"] = titles[i]
            final["text_rank"] = phrases
            results.append(final)

            if verbose:
                print("section: {}".format(final["section_title"]))

        # output the ranked results to JSON
        tr_path = resource_path / "tr"
        mkdir(tr_path)

        tr_file = parse_file.stem + ".json"
        output_path = tr_path / tr_file

        with codecs.open(output_path, "wb", encoding="utf8") as f:
            json.dump(results, f, indent=4, ensure_ascii=False)

        if verbose:
            print(f"completed: {output_path}")


if __name__ == "__main__":
    nlp, resource_path = setup(testing=True)
    extract_phrases(nlp, resource_path)
--------------------------------------------------------------------------------
/bin/extract_text.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

from pathlib import Path
from tqdm import tqdm
import codecs
import os
import pdfx
import ray
import sys
import time
import traceback


def extract_text (file_path):
    """
    parse text from PDF
    """
    text = None
    page_count = 0

    try:
        pdf_meta = pdfx.PDFx(file_path)
        meta = pdf_meta.get_metadata()
        page_count = meta["Pages"]

        # split into sections
        buf = []
        grafs = []

        for line in pdf_meta.get_text().split("\n"):
            line = line.strip()
            buf.append(line)

            if len(line) < 1:
                section = " ".join(buf).strip().replace("- ", "") + "\n"
                grafs.append(section)
                buf = []

        text = "\n".join(grafs)
    except:
        print(f"ERROR parsing {file_path}")
        traceback.print_exc()
    finally:
        return text, page_count


def enum_pdfs (pdf_dir, txt_dir):
    """
    enumerate all of the non-zero downloaded PDF files
    """
    uuid_set = set([])

    for pdf_file in list(pdf_dir.glob("*.pdf")):
        if os.path.getsize(pdf_file) > 0:
            uuid_set.add(pdf_file.stem)

    # filter out PDF files that have already been converted to text
    for txt_file in list(txt_dir.glob("*.txt")):
        if txt_file.stem in uuid_set and os.path.getsize(txt_file) > 0:
            uuid_set.remove(txt_file.stem)

    for uuid in uuid_set:
        yield uuid


@ray.remote
def convert_pdf (pdf_dir, txt_dir, uuid):
    t0 = time.time()
    pdf_file = pdf_dir / f"{uuid}.pdf"
    txt_file = txt_dir / f"{uuid}.txt"

    text, page_count = extract_text(pdf_file.as_posix())

    if text and len(text) > 0:
        with codecs.open(txt_file, "wb", encoding="utf8") as f:
            f.write(text)

    timing = time.time() - t0
    print("\n{} {:.3f} s".format(uuid, timing))


def main ():
    pdf_dir = Path.cwd() / "resources/pub/pdf"
    txt_dir = Path.cwd() / "resources/pub/txt"
    task_ids = []

    ray.init()

    for uuid in tqdm(enum_pdfs(pdf_dir, txt_dir), ascii=True, desc="convert pdf"):
        task_ids.append(convert_pdf.remote(pdf_dir, txt_dir, uuid))

    ray.get(task_ids)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/bin/parsr_output_interpreter.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

#
# Copyright 2019 AXA Group Operations S.A.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
18 | # 19 | 20 | import logging 21 | import pandas as pd 22 | from io import StringIO 23 | 24 | class ParsrOutputInterpreter(object): 25 | def __init__(self, object=None): 26 | logging.basicConfig(level=logging.DEBUG, format='%(name)s - %(levelname)s - %(message)s') 27 | self.object = None 28 | if object is not None: 29 | self.load_object(object) 30 | 31 | def __get_text_types(self): 32 | return ['word', 'line', 'character', 'paragraph', 'heading'] 33 | 34 | def __get_text_objects(self, page_number=None): 35 | texts = [] 36 | if page_number is not None: 37 | page = self.get_page(page_number) 38 | if page is None: 39 | logging.error("Cannot get text elements for the requested page; Page {} not found".format(page_number)) 40 | return None 41 | else: 42 | for element in page['elements']: 43 | if element['type'] in self.__get_text_types(): 44 | texts.append(element) 45 | else: 46 | for page in self.object['pages']: 47 | for element in page['elements']: 48 | if element['type'] in self.__get_text_types(): 49 | texts.append(element) 50 | return texts 51 | 52 | def __text_from_text_object(self, text_object:dict) -> str: 53 | result = "" 54 | if text_object['type'] in ['paragraph', 'heading']: 55 | for i in text_object['content']: 56 | result += self.__text_from_text_object(i) 57 | elif text_object['type'] in ['line']: 58 | for i in text_object['content']: 59 | result += self.__text_from_text_object(i) 60 | elif text_object['type'] in ['word']: 61 | if type(text_object['content']) is list: 62 | for i in text_object['content']: 63 | result += self.__text_from_text_object(i) 64 | else: 65 | result += text_object['content'] 66 | result += ' ' 67 | elif text_object['type'] in ['character']: 68 | result += text_object['content'] 69 | return result 70 | 71 | def load_object(self, object): 72 | self.object = object 73 | 74 | def get_page(self, page_number): 75 | for p in self.object['pages']: 76 | if p['pageNumber'] == page_number: 77 | return p 78 | logging.error("Page {} not found".format(page_number)) 79 | return None 80 | 81 | def get_text(self, page_number:int=None) -> str: 82 | final_text = "" 83 | for textObj in self.__get_text_objects(page_number): 84 | final_text += self.__text_from_text_object(textObj) 85 | final_text += "\n\n" 86 | return final_text 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tracking Progress in Rich Context 2 | 3 | [The Coleridge Initiative](https://coleridgeinitiative.org/richcontext) 4 | at NYU has been researching [*Rich Context*](https://coleridgeinitiative.org/richcontext) 5 | to enhance search and discovery of datasets used in scientific research – see the 6 | [_Background Info_](https://github.com/Coleridge-Initiative/rclc/wiki/Background-Info) 7 | section for more details. 8 | Partnering with experts throughout academia and industry, NYU-CI has 9 | worked to leverage the closely adjacent fields of NLP/NLU, knowledge 10 | graph, recommender systems, scholarly infrastructure, data mining from 11 | scientific literature, dataset discovery, linked data, open vocabularies, 12 | metadata management, data governance, and so on. 13 | Leaderboards are published here on GitHub to track _state-of-the-art_ 14 | (SOTA) progress among the top results. 

---

## Leaderboard 1

### Entity Linking for Datasets in Publications

The first challenge is to identify the datasets used in research
publications, initially focused on the problem of
[_entity linking_](https://nlpprogress.com/english/entity_linking.html).
Research papers generally mention the datasets they've used, although there
are limited formal means to describe that metadata in a machine-readable way.
The goal here is to predict a set of dataset IDs for each publication.
The dataset IDs within the corpus represent the set of all possible datasets
which will appear.

Identifying dataset mentions typically requires:

* extracting text from an open access PDF
* some NLP parsing of the text
* feature engineering (e.g., attention to where text is located in a paper)
* modeling to identify up to 5 datasets per publication

See [_Evaluating Models for Entity Linking with Datasets_](https://github.com/Coleridge-Initiative/rclc/wiki/Evaluating-Models-for-Entity-Linking-with-Datasets)
for details about how the `Top5uptoD` leaderboard metric is calculated.
An illustrative sketch of this prediction target appears at the end of this README.


## Instructions

* [How To Participate](https://github.com/Coleridge-Initiative/rclc/wiki/How-To-Participate)
* [Corpus Description](https://github.com/Coleridge-Initiative/rclc/wiki/Corpus-Description)
* [Download Resource Files](https://github.com/Coleridge-Initiative/rclc/wiki/Downloading-Resource-Files)
* [Background Info](https://github.com/Coleridge-Initiative/rclc/wiki/Background-Info)
* [Workflow Stages](https://github.com/Coleridge-Initiative/rclc/wiki/Workflow-Stages)
* [Glossary Terms](https://github.com/Coleridge-Initiative/rclc/wiki/Glossary-Terms)

Use of open source and open standards is especially important to
further the cause of effective, reproducible research.
We're hosting this competition to focus on the research challenges
of specific machine learning use cases encountered within Rich Context – see the
[_Workflow Stages_](https://github.com/Coleridge-Initiative/rclc/wiki/Workflow-Stages)
section.

If you have any questions about the Rich Context leaderboard
competition – and especially if you identify any problems in the
corpus (e.g., data quality, incorrect metadata, broken links, etc.) –
please use the GitHub issues for this repo and pull requests to
report, discuss, and resolve them.
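
As an illustration of the prediction target described above – the IDs below
are hypothetical placeholders in the corpus's `publication-…`/`dataset-…`
form, not real corpus entries, and this is not an official submission format:

```python
# one publication's predicted dataset links: up to 5 dataset IDs,
# drawn from the set of dataset IDs that appear in the corpus
# (both IDs below are hypothetical placeholders)
prediction = {
    "publication": "publication-0123456789abcdef0123",
    "datasets": [
        "dataset-fedcba9876543210fedc",
    ],
}
```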
63 | -------------------------------------------------------------------------------- /bin/upload_s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | from git import Repo 5 | from pathlib import Path 6 | from tqdm import tqdm 7 | import boto3 8 | import datetime 9 | import json 10 | import os 11 | import sys 12 | 13 | BUCKET_NAME = "richcontext" 14 | 15 | 16 | def access_bucket (handle): 17 | """ 18 | initialize access to the bucket 19 | """ 20 | bucket = handle.Bucket(BUCKET_NAME) 21 | return bucket 22 | 23 | 24 | def upload_file (handle, local_path, grid_path): 25 | """ 26 | upload a local file to the bucket 27 | """ 28 | handle.meta.client.upload_file(local_path, BUCKET_NAME, grid_path) 29 | 30 | 31 | def list_uploaded_files (bucket, prefix, kind): 32 | """ 33 | list the files of a particular kind which have already been 34 | uploaded to the bucket 35 | """ 36 | done = set([]) 37 | extension = f".{kind}" 38 | 39 | for obj in bucket.objects.filter(Prefix=prefix + "/pub/" + kind): 40 | if obj.key.endswith(extension): 41 | uuid = obj.key.split("/")[3].split(extension)[0] 42 | done.add(uuid) 43 | 44 | return done 45 | 46 | 47 | def iter_needed_files (dir_path, kind, done): 48 | """ 49 | iterator for the local files of a particular kind which 50 | haven't been uploaded yet 51 | """ 52 | for file_name in tqdm(list(dir_path.glob(f"*.{kind}")), ascii=True, desc=f"{kind} files"): 53 | uuid = file_name.stem 54 | 55 | if uuid not in done: 56 | yield uuid 57 | 58 | 59 | def upload_needed_files (handle, bucket, prefix, dir_path, kind, iter): 60 | """ 61 | upload the needed local files of a particular kind 62 | """ 63 | extension = f".{kind}" 64 | count = 0 65 | 66 | for uuid in iter: 67 | file_name = uuid + extension 68 | local_path = dir_path / file_name 69 | grid_path = prefix + "/pub/" + kind + "/" 70 | 71 | #print("uploading {} to {}".format(local_path, grid_path)) 72 | 73 | upload_file(handle, local_path.as_posix(), grid_path + file_name) 74 | count += 1 75 | 76 | return count 77 | 78 | 79 | def manage_upload (handle, bucket, prefix, pub_dir, kind): 80 | """ 81 | manage the upload for a particular kind of file 82 | """ 83 | dir_path = pub_dir / kind 84 | done = list_uploaded_files(bucket, prefix, kind) 85 | iter = iter_needed_files(dir_path, kind, done) 86 | count = upload_needed_files(handle, bucket, prefix, dir_path, kind, iter) 87 | 88 | return len(done), count 89 | 90 | 91 | def write_manifest (handle, prefix, manifest_data, file_name="MANIFEST.txt"): 92 | """ 93 | summarize details about the upload to a `MANIFEST.txt` 94 | file in the bucket 95 | """ 96 | with open(file_name, "w") as f: 97 | for key, val in manifest_data.items(): 98 | f.write("{}: {}\n".format(key, str(val))) 99 | 100 | grid_path = prefix + "/" + file_name 101 | upload_file(handle, file_name, grid_path) 102 | 103 | 104 | def main (): 105 | # locate the Git tag info 106 | git_path = Path.cwd().as_posix() 107 | repo = Repo(git_path) 108 | tags = sorted(repo.tags, key=lambda t: t.commit.committed_datetime) 109 | 110 | # set up the manifest 111 | manifest_data = {} 112 | manifest_data["date"] = datetime.date.today().strftime("%Y-%m-%d") 113 | manifest_data["release"] = tags[-1] 114 | 115 | # connect to the storage grid bucket 116 | handle = boto3.resource("s3") 117 | bucket = access_bucket(handle) 118 | prefix = "corpus_docs" 119 | 120 | # set up the local paths 121 | pub_dir = Path.cwd() / "resources/pub" 122 | 123 | # which PDF files 
do we need to upload? 124 | count, prev_count = manage_upload(handle, bucket, prefix, pub_dir, "pdf") 125 | manifest_data["uploaded_pdf"] = count + prev_count 126 | 127 | # which JSON files do we need to upload? 128 | count, prev_count = manage_upload(handle, bucket, prefix, pub_dir, "json") 129 | manifest_data["uploaded_json"] = count + prev_count 130 | 131 | # which TXT files do we need to upload? 132 | count, prev_count = manage_upload(handle, bucket, prefix, pub_dir, "txt") 133 | manifest_data["uploaded_txt"] = count + prev_count 134 | 135 | # write upload details to manifest 136 | write_manifest(handle, prefix, manifest_data) 137 | 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. 
publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. 
Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /bin/parsr_client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # 5 | # Copyright 2019 AXA Group Operations S.A. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | # 19 | 20 | from glob import glob 21 | from itertools import chain 22 | import os 23 | import sys 24 | import json 25 | import time 26 | 27 | from sxsdiff import DiffCalculator 28 | from sxsdiff.generators.github import GitHubStyledGenerator 29 | 30 | import diff_match_patch 31 | import pandas as pd 32 | import requests 33 | from io import StringIO 34 | 35 | 36 | 37 | class ParserClient(): 38 | def __init__(self, server): 39 | self.version_history = {} 40 | self.set_server(server) 41 | self.set_current_request_id("") 42 | 43 | def __supported_input_files(self) -> list: 44 | return ['*.pdf', '*.jpg', '*.jpeg', '*.png', '*.tiff', '*.tif',] 45 | 46 | def set_server(self, server:str): 47 | self.server = server 48 | 49 | def set_current_request_id(self, request_id:str): 50 | self.request_id = request_id 51 | 52 | def send_document(self, file:str, config:str, server:str="", document_name:str=None, wait_till_finished:bool=False, save_request_id:bool=False) -> dict: 53 | if server == "": 54 | if self.server == "": 55 | raise Exception('No server address provided') 56 | else: 57 | server = self.server 58 | packet = { 59 | 'file': (file, open(file, 'rb'), 'application/pdf'), 60 | 'config': (config, open(config, 'rb'), 'application/json'), 61 | } 62 | 63 | r = requests.post('http://'+server+'/api/v1/document', files=packet) 64 | jobId = r.text 65 | 66 | if not document_name: 67 | document_name = os.path.splitext(os.path.basename(file))[0] 68 | 69 | if document_name not in self.version_history: 70 | self.version_history[document_name] = [jobId] 71 | else: 72 | self.version_history[document_name].append(jobId) 73 | if save_request_id: 74 | self.set_current_request_id(jobId) 75 | if not wait_till_finished: 76 | return {'file': file, 'config': config, 'status_code': r.status_code, 'server_response': r.text} 77 | else: 78 | print('> Polling server for the job {}...'.format(jobId)) 79 | server_status_response = self.get_status(jobId)['server_response'] 80 | while ('progress-percentage' in server_status_response): 81 | print('>> Progress percentage: {}'.format(server_status_response['progress-percentage'])) 82 | time.sleep(2) 83 | server_status_response = self.get_status(jobId)['server_response'] 84 | print('>> Job done!') 85 | return {'file': file, 'config': config, 'status_code': r.status_code, 'server_response': r.text} 86 | 87 | def get_versions(self, document_name:str) -> list: 88 | if document_name in self.version_history: 89 | return self.version_history[document_name] 90 | else: 91 | return [] 92 | 93 | def send_documents_folder(self, folder:str, config:str, server:str="") -> list: 94 | if server == "": 95 | if self.server == "": 96 | raise Exception('No server address provided') 97 | else: 98 | server = self.server 99 | responses = [] 100 | os.chdir(folder) 101 | files = [glob.glob(e) for e in self.__supported_input_files()] 102 | files_flat = list(chain.from_iterable(files)) 103 | for file in files_flat: 104 | packet = { 105 | 'file': (file, open(file, 'rb'), 'application/pdf'), 106 | 'config': (config, open(config, 'rb'), 'application/json'), 107 | } 108 | r = requests.post('http://'+server+'/api/v1/document', files=packet) 109 | responses.append({'file': file, 'config': config, 'status_code': r.status_code, 'server_response': r.text}) 110 | return responses 111 | 112 | def get_status(self, request_id:str="", server:str=""): 113 | if server == "": 114 | if self.server == "": 115 | raise Exception('No server address provided') 116 | else: 117 | server = self.server 118 | if request_id == "": 
119 | if self.request_id == "": 120 | raise Exception('No request ID provided') 121 | else: 122 | request_id = self.request_id 123 | if self.server == "": 124 | raise Exception('No server address provided') 125 | r = requests.get('http://{}/api/v1/queue/{}'.format(server, request_id)) 126 | return {'request_id': request_id, 'server_response': json.loads(r.text)} 127 | 128 | def get_json(self, request_id:str="", server:str=""): 129 | if server == "": 130 | if self.server == "": 131 | raise Exception('No server address provided') 132 | else: 133 | server = self.server 134 | if request_id == "": 135 | if self.request_id == "": 136 | raise Exception('No request ID provided') 137 | else: 138 | request_id = self.request_id 139 | r = requests.get('http://{}/api/v1/json/{}'.format(server, request_id)) 140 | if r.text != "": 141 | return r.json() 142 | else: 143 | return {'request_id': request_id, 'server_response': r.json()} 144 | 145 | def get_markdown(self, request_id:str="", server:str=""): 146 | if server == "": 147 | if self.server == "": 148 | raise Exception('No server address provided') 149 | else: 150 | server = self.server 151 | if request_id == "": 152 | if self.request_id == "": 153 | raise Exception('No request ID provided') 154 | else: 155 | request_id = self.request_id 156 | r = requests.get('http://{}/api/v1/markdown/{}'.format(server, request_id)) 157 | if r.text != "": 158 | return r.text 159 | else: 160 | return {'request_id': request_id, 'server_response': r.text} 161 | 162 | def get_text(self, request_id:str="", server:str=""): 163 | if server == "": 164 | if self.server == "": 165 | raise Exception('No server address provided') 166 | else: 167 | server = self.server 168 | if request_id == "": 169 | if self.request_id == "": 170 | raise Exception('No request ID provided') 171 | else: 172 | request_id = self.request_id 173 | r = requests.get('http://{}/api/v1/text/{}'.format(server, request_id)) 174 | if r.text != "": 175 | return r.text 176 | else: 177 | return {'request_id': request_id, 'server_response': r.text} 178 | 179 | def get_table(self, request_id:str="", page=None, table=None, seperator=";", server:str=""): 180 | if server == "": 181 | if self.server == "": 182 | raise Exception('No server address provided') 183 | else: 184 | server = self.server 185 | if request_id == "": 186 | if self.request_id == "": 187 | raise Exception('No request ID provided') 188 | else: 189 | request_id = self.request_id 190 | if page is None and table is None: 191 | r = requests.get('http://{}/api/v1/csv/{}'.format(server, request_id)) 192 | else: 193 | r = requests.get('http://{}/api/v1/csv/{}/{}/{}'.format(server, request_id, page, table)) 194 | if r.text != "": 195 | try: 196 | df = pd.read_csv(StringIO(r.text), sep=seperator) 197 | df.loc[:, ~df.columns.str.match('Unnamed')] 198 | df = df.where((pd.notnull(df)), " ") 199 | return df 200 | except Exception as e: 201 | return {'request_id': request_id, 'server_response': r.text} 202 | else: 203 | return {'request_id': request_id, 'server_response': r.text} 204 | 205 | def compare_versions(self, request_ids:list, pretty_html:bool = False): 206 | diffs = [] 207 | for i in range(0, len(request_ids) - 1): 208 | request_id1 = request_ids[i] 209 | request_id2 = request_ids[i + 1] 210 | md1 = self.get_markdown(request_id1) 211 | md2 = self.get_markdown(request_id2) 212 | 213 | if pretty_html: 214 | sxsdiff_result = DiffCalculator().run(md1, md2) 215 | html_store = StringIO() 216 | GitHubStyledGenerator(file=html_store).run(sxsdiff_result) 217 | 
                html_diff = html_store.getvalue()
                diffs.append(html_diff)
            else:
                dmp = diff_match_patch.diff_match_patch()
                diff = dmp.diff_main(md1, md2)
                dmp.diff_cleanupSemantic(diff)
                diffs.append(diff)
        return diffs
--------------------------------------------------------------------------------
/bin/download_resources.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

from bs4 import BeautifulSoup # type: ignore
from pathlib import Path
from pdfminer.pdfdocument import PDFDocument # type: ignore
from pdfminer.pdfpage import PDFTextExtractionNotAllowed # type: ignore
from pdfminer.pdfparser import PDFParser, PDFSyntaxError # type: ignore
from requests_html import HTMLSession # type: ignore
from tqdm import tqdm # type: ignore
from typing import Any, Dict, List, Tuple
from urllib.parse import urlparse
import argparse
import csv
import json
import logging # type: ignore
import ray # type: ignore
import requests
import sys
import time
import traceback

DEFAULT_LOGGER_FILE = None
DEFAULT_CORPUS_FILE = "corpus.jsonld"
DEFAULT_TODO_FILE = "todo.tsv"
DEFAULT_TODO_LIST = None
DEFAULT_OUTPUT_RESOURCE = "resources/"
DEFAULT_FORCE_DOWNLOAD = False

MAX_DOWNLOAD_TRIAL = 3

PUB_PDF_PATH = "pub/pdf/"
DAT_PAGE_PATH = "dat/"

LOGGER = None # type: ignore


@ray.remote
class Worker (object):
    def __init__ (self):
        self.logger = LOGGER

    def train (self):
        self.logger.warning("print from inside worker")


def setup_logger (args) -> None:
    """ logging is optional: to debug, set the logging level
    """
    global LOGGER
    level = logging.WARNING

    if args.logger:
        logging.basicConfig(filename=args.logger, filemode="w", level=level)
    else:
        logging.basicConfig(stream=sys.stdout, level=level)

    LOGGER = logging.getLogger("RichContext")


def load_corpus (filename: str) -> dict:
    """ Load the corpus file (in JSON-LD format)
    """
    global LOGGER
    corpus = None

    with open(filename, "r") as f:
        jld_corpus = json.load(f)
        corpus = jld_corpus["@graph"]

    LOGGER.warning(f"number of records in the corpus: {len(corpus)}")
    return corpus


def generate_todo (flag: str, todo_pub: list, todo_dat: list) -> None:
    """ Generate a TODO file for downloads
    """
    if not flag:
        with open(DEFAULT_TODO_FILE, "wt") as f:
            writer = csv.writer(f, delimiter="\t")

            for t in todo_pub:
                writer.writerow(t)

            for t in todo_dat:
                writer.writerow(t)


def load_todo (filename: str) -> List[Any]:
    """ load a TSV file for the list of files to be downloaded
    """
    todo = []

    with open(filename, "r") as f:
        reader = csv.reader(f, delimiter="\t")

        for row in reader:
            todo.append(row)

    return todo


def enum_pub_resources (corpus: dict, output_path: Path, force_download: bool) -> Tuple[Path, List[List[Any]]]:
    """ Enumerate all publication PDF files from the corpus data to be
    downloaded, if not downloaded yet. We use the entity id as filename.
    All downloaded files are stored under `output_path` folder.
    Input:
    - corpus: corpus file containing a list of publications.
107 | - output_path: path to store downloaded resources. 108 | - force_download: always download resources. 109 | """ 110 | global LOGGER 111 | pub_path = output_path / PUB_PDF_PATH 112 | 113 | if not pub_path.exists(): 114 | pub_path.mkdir(parents=True) 115 | 116 | pubs = [e for e in corpus if e["@type"] == "ResearchPublication"] 117 | 118 | if force_download: 119 | downloaded_pubs_id = set([]) 120 | else: 121 | downloaded_pubs = list(pub_path.glob("*.pdf")) 122 | downloaded_pubs_id = set([f.stem for f in downloaded_pubs]) 123 | 124 | todo = [] 125 | 126 | for entity in pubs: 127 | e_id = urlparse(entity["@id"]).fragment.split("-")[1] 128 | downloaded_before = e_id in downloaded_pubs_id 129 | 130 | if force_download or not downloaded_before: 131 | if isinstance(entity["openAccess"], list): 132 | ## this only happens in the error case where 133 | ## publications are duplicated in the corpus 134 | res_url = entity["openAccess"][0]["@value"] 135 | LOGGER.warning("duplicate: {} {}".format(e_id, entity["openAccess"])) 136 | else: 137 | res_url = entity["openAccess"]["@value"] 138 | 139 | todo.append(["pdf", e_id, res_url, pub_path]) 140 | 141 | return pub_path, todo 142 | 143 | 144 | def enum_dat_resources (corpus: dict, output_path: Path, force_download: bool) -> Tuple[Path, List[List[Any]]]: 145 | """ Enumerate all dataset "foaf:page" files from corpus data to be 146 | downloaded, if not downloaded yet. Uses the entity id as filename. 147 | All downloaded files are stored under `output_path` folder. 148 | Input: 149 | - corpus: corpus file containing a list of datasets. 150 | - output_path: path to store downloaded resources. 151 | - force_download: always download resources. 152 | """ 153 | global LOGGER 154 | dat_path = output_path / DAT_PAGE_PATH 155 | 156 | if not dat_path.exists(): 157 | dat_path.mkdir(parents=True) 158 | 159 | dats = [e for e in corpus if e["@type"] == "Dataset"] 160 | 161 | if force_download: 162 | downloaded_dat_id = set([]) 163 | else: 164 | downloaded_datasets = list(dat_path.glob("*.*")) 165 | downloaded_dat_id = set([f.stem for f in downloaded_datasets]) 166 | 167 | todo = [] 168 | 169 | for entity in dats: 170 | e_id = urlparse(entity["@id"]).fragment.split("-")[1] 171 | downloaded_before = e_id in downloaded_dat_id 172 | 173 | if force_download or not downloaded_before: 174 | if "foaf:page" in entity: 175 | res_url = entity["foaf:page"]["@value"] 176 | 177 | if res_url.startswith("http://example.com"): 178 | # ignore these placeholder URLs 179 | continue 180 | else: 181 | todo.append(["unknown", e_id, res_url, dat_path]) 182 | 183 | return dat_path, todo 184 | 185 | 186 | def is_valid_pdf_file (filename: str) -> bool: 187 | global LOGGER 188 | 189 | try: 190 | with open(filename, "rb") as f: 191 | parser = PDFParser(f) 192 | document = PDFDocument(parser, "") 193 | 194 | if not document.is_extractable: 195 | raise PDFTextExtractionNotAllowed(filename) 196 | 197 | return True 198 | except: 199 | LOGGER.debug(traceback.format_exc()) 200 | LOGGER.debug(f"not valid PDF file: {filename}") 201 | return False 202 | 203 | 204 | @ray.remote 205 | def _download (res_type: str, e_id: str, url: str, output_path: Path) -> Tuple[bool, str]: 206 | """ Download a resource for the corpus and store it in a file 207 | """ 208 | global LOGGER 209 | 210 | if res_type not in ["pdf", "html", "unknown"]: 211 | raise ValueError(f"Invalid resource type: {res_type}") 212 | 213 | headers = requests.utils.default_headers() 214 | headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) 
217 | 
218 |     trial = 0
219 |     status = None
220 | 
221 |     while trial < MAX_DOWNLOAD_TRIAL:
222 |         try:
223 |             parsed_url = urlparse(url)
224 | 
225 |             if parsed_url.netloc == "www.sciencedirect.com":
226 |                 """ special case: sciencedirect.com auto generates PDF
227 |                 download link in an intermediate page
228 |                 """
229 |                 try:
230 |                     session = HTMLSession()
231 |                     r0 = session.get(url)
232 |                     res = session.get(list(r0.html.absolute_links)[0])
233 |                 except:
234 |                     LOGGER.debug(traceback.format_exc())
235 |                     LOGGER.debug(list(r0.html.absolute_links))
236 | 
237 |                     status = f"session.get failed: {e_id} {url}"
238 |                     LOGGER.warning(status)
239 | 
240 |                     return False, status
241 | 
242 |             elif parsed_url.netloc.endswith("onlinelibrary.wiley.com"):
243 |                 """ special case: wiley.com auto embeds to render PDF
244 |                 """
245 |                 r0 = requests.get(url)
246 |                 soup = BeautifulSoup(r0.content, "html5lib")
247 | 
248 |                 if soup.find("embed") is None:
249 |                     status = f"no embedded PDF: {e_id} {url}"
250 |                     LOGGER.warning(status)
251 | 
252 |                     trial += 1
253 |                     continue
254 | 
255 |                 src = soup.find("embed")["src"]
256 |                 res = requests.get(parsed_url.scheme + "://" + parsed_url.netloc + src)
257 |             else:
258 |                 res = requests.get(url, headers=headers, timeout=(10, 20))
259 | 
260 |             if res_type == "unknown":
261 |                 content_type = res.headers["content-type"]
262 |                 res_type = "html" if "text/html" in content_type else "pdf"
263 | 
264 |             out_file = output_path / (e_id + "." + res_type)
265 |             out_file.write_bytes(res.content)
266 | 
267 |             if res_type == "pdf":
268 |                 filename = out_file.resolve().as_posix()
269 | 
270 |                 try:
271 |                     if is_valid_pdf_file(filename):
272 |                         LOGGER.debug(f"writing: {filename}")
273 |                     else:
274 |                         out_file.unlink()
275 |                         trial += 1
276 |                         continue
277 |                 except:
278 |                     LOGGER.debug(traceback.format_exc())
279 | 
280 |                     status = f"{e_id} {url} not valid PDF: {filename}"
281 |                     LOGGER.warning(status)
282 | 
283 |                     return False, status
284 | 
285 |             return True, status
286 | 
287 |         except KeyboardInterrupt:
288 |             break  # stop retrying this download on interrupt
289 | 
290 |         except requests.exceptions.RequestException as err:
291 |             status = f"{e_id} {url} request exception {err}"
292 |             LOGGER.warning(status)
293 | 
294 |         time.sleep(1)
295 |         trial += 1
296 | 
297 |     if trial == MAX_DOWNLOAD_TRIAL:
298 |         LOGGER.debug(f"aborted {e_id} {url} after {MAX_DOWNLOAD_TRIAL} attempts")
299 | 
300 |     return False, status
301 | 
302 | 
303 | def download_resource_files (todo: list) -> None:
304 |     """ Download all resource files on the TODO list.
305 |     We use the entity id as filename.
306 |     Input:
307 |     - todo: list of URLs to download
308 |     """
309 |     global LOGGER
310 | 
311 |     try:
312 |         for _type, e_id, res_url, path in tqdm(todo, ascii=True, desc="download files"):
313 |             obj_id = _download.remote(_type, e_id, res_url, Path(path))
314 |             success, status = ray.get(obj_id)
315 | 
316 |             if not success:
317 |                 LOGGER.warning(f"failed: {e_id} {res_url}")
318 | 
319 |                 if status:
320 |                     LOGGER.warning(status)
321 | 
322 |             time.sleep(0.1)
323 | 
324 |     except KeyboardInterrupt:
325 |         # this function may run for a long while and is much more
326 |         # likely to get interrupted
327 |         pass
328 | 
329 | 
330 | def get_resources_stats (corpus: list, pub_path: Path, dat_path: Path) -> None:
331 |     global LOGGER
332 | 
333 |     pubs = [e for e in corpus if e["@type"] == "ResearchPublication"]
334 |     missing_pub = set()
335 | 
336 |     downloaded_pubs = list(pub_path.glob("*.pdf"))
337 |     downloaded_pubs_id = set([f.stem for f in downloaded_pubs])
338 | 
339 |     for entity in pubs:
340 |         e_id = urlparse(entity["@id"]).fragment.split("-")[1]
341 | 
342 |         if e_id not in downloaded_pubs_id:
343 |             missing_pub.add(e_id)
344 | 
345 |     LOGGER.warning(f"number of research publications: {len(pubs)}")
346 |     LOGGER.warning(f"successfully downloaded {len(pubs) - len(missing_pub)} PDF files")
347 |     LOGGER.debug(f"missing publication resources: {missing_pub}")
348 | 
349 |     dats = [e for e in corpus if e["@type"] == "Dataset"]
350 |     missing_dat_res = set()
351 | 
352 |     downloaded_dats = list(dat_path.glob("*.*"))
353 |     downloaded_dat_id = set([f.stem for f in downloaded_dats])
354 | 
355 |     for entity in dats:
356 |         e_id = urlparse(entity["@id"]).fragment.split("-")[1]
357 | 
358 |         if e_id not in downloaded_dat_id:
359 |             missing_dat_res.add(e_id)
360 | 
361 |     LOGGER.warning(f"number of datasets: {len(dats)}")
362 |     LOGGER.warning(f"successfully downloaded {len(dats) - len(missing_dat_res)} resource files")
363 |     LOGGER.debug(f"missing dataset resources: {missing_dat_res}")
364 | 
365 | 
366 | def main (args) -> None:
367 |     # load and parse the corpus
368 |     setup_logger(args)
369 |     corpus = load_corpus(args.input)
370 | 
371 |     # enumerate the resource files to download
372 |     output_path = Path(args.output_dir)
373 |     pub_path, todo_pub = enum_pub_resources(corpus, output_path, args.force)
374 |     dat_path, todo_dat = enum_dat_resources(corpus, output_path, args.force)
375 | 
376 |     # manage the TODO file for downloads
377 |     generate_todo(args.todo, todo_pub, todo_dat)
378 | 
379 |     if not args.todo:
380 |         todo = todo_pub + todo_dat
381 |     else:
382 |         todo = load_todo(args.todo)
383 | 
384 |     # NB: when connecting to an existing cluster, instead use
385 |     # ray.init(address=)
386 |     ray.init()
387 | 
388 |     # run the downloads and report
389 |     download_resource_files(todo)
390 |     get_resources_stats(corpus, pub_path, dat_path)
391 | 
392 | 
393 | if __name__ == "__main__":
394 |     # parse the command line arguments, if any
395 |     parser = argparse.ArgumentParser(
396 |         description="download publication PDFs and dataset foaf pages for the rclc corpus"
397 |     )
398 | 
399 |     parser.add_argument(
400 |         "--logger",
401 |         type=str,
402 |         default=DEFAULT_LOGGER_FILE,
403 |         help="logger file"
404 |     )
405 | 
406 |     parser.add_argument(
407 |         "--input",
408 |         type=str,
409 |         default=DEFAULT_CORPUS_FILE,
410 |         help="rclc corpus file"
411 |     )
412 | 
413 |     parser.add_argument(
414 |         "--todo",
415 |         type=str,
416 |         default=DEFAULT_TODO_LIST,
417 |         help="download todo file"
418 |     )
419 | 
420 |     parser.add_argument(
421 |         "--output_dir",
422 |         type=str,
423 |         default=DEFAULT_OUTPUT_RESOURCE,
424 |         help="path to store downloaded resources"
425 |     )
426 | 
427 |     parser.add_argument(
428 |         "--force",
429 |         action="store_true",
430 |         default=DEFAULT_FORCE_DOWNLOAD,
431 |         help="always download resources"
432 |     )
433 | 
434 |     main(parser.parse_args())
--------------------------------------------------------------------------------
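The errors.txt listing that follows appears to be the WARNING-level log captured from a full run of bin/download_resources.py: one "failed:" line per resource that could not be fetched, usually paired with the specific cause that was logged ("no embedded PDF", "request exception", "session.get failed"). A minimal sketch, not part of the repo, for tallying those failure modes; it assumes only the "WARNING:RichContext:" prefix produced by setup_logger() above and the file name errors.txt:

    #!/usr/bin/env python
    # encoding: utf-8

    from collections import Counter

    PREFIX = "WARNING:RichContext:"

    counts = Counter()

    with open("errors.txt", "r") as f:
        for line in f:
            msg = line.strip()

            # skip anything not written by the RichContext logger
            if not msg.startswith(PREFIX):
                continue

            msg = msg[len(PREFIX):]

            # bucket each entry by the failure mode it reports
            if msg.startswith("failed:"):
                counts["failed"] += 1
            elif msg.startswith("no embedded PDF:"):
                counts["no embedded PDF"] += 1
            elif msg.startswith("session.get failed:"):
                counts["session.get failed"] += 1
            elif "request exception" in msg:
                counts["request exception"] += 1

    # print a tab-separated tally, most common first
    for reason, n in counts.most_common():
        print(f"{reason}\t{n}")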
/errors.txt:
--------------------------------------------------------------------------------
1 | WARNING:RichContext:number of records in the corpus: 6712
2 | WARNING:RichContext:failed: 639196e25a9adcadaaf2 https://journals.sagepub.com/doi/pdf/10.1177/0002716216678391
3 | WARNING:RichContext:failed: 847facc16baf543ccece https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/cdev.12753
4 | WARNING:RichContext:no embedded PDF: 847facc16baf543ccece https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/cdev.12753
5 | WARNING:RichContext:failed: be43d18d3aad47195bc4 https://doi.org/10.1016/j.childyouth.2016.10.018
6 | WARNING:RichContext:failed: c1863bb4c766796ef81e http://hdl.handle.net/10.1111/agec.12222
7 | WARNING:RichContext:failed: 8b66f7ad41758568afed http://respec.tamu.edu/zervoufirmsfinance.pdf
8 | WARNING:RichContext:8b66f7ad41758568afed http://respec.tamu.edu/zervoufirmsfinance.pdf request exception HTTPConnectionPool(host='respec.tamu.edu', port=80): Max retries exceeded with url: /zervoufirmsfinance.pdf (Caused by ConnectTimeoutError(, 'Connection to respec.tamu.edu timed out. (connect timeout=10)'))
9 | WARNING:RichContext:failed: df70ec30c4976e90d10b http://jhl.sagepub.com/content/22/1/27.full.pdf
10 | WARNING:RichContext:failed: c19ee91f63a30add42a2 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/qj.3598
11 | WARNING:RichContext:no embedded PDF: c19ee91f63a30add42a2 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/qj.3598
12 | WARNING:RichContext:failed: 296018d551a969ddbabc http://www.soc.jhu.edu/people/DeLuca/documents/SSR%20Mendenhall%2C%20DeLuca%2C%20Duncan.%202006.pdf
13 | WARNING:RichContext:failed: 2c890a18a848d22568f2 https://doi.org/10.1016/j.landurbplan.2018.04.018
14 | WARNING:RichContext:failed: cd9eed5e481bb49e8ada http://pdfs.journals.lww.com/academicmedicine/1997/09000/Profile_of_the_graduate_student_population_in_U_S_.20.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1506389721225;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzd++Xw1xrBvVGm3URNugOVtYfrjQcCVLgbjq4J7SGuNkd0xVDWMKnPVPA0AC2fd0y7;hash|Ze5loqJ0DcYD9lPI8UJDcw==
15 | WARNING:RichContext:failed: 68e25c63ffb13771638e https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/2017JD027629
16 | WARNING:RichContext:no embedded PDF: 68e25c63ffb13771638e https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/2017JD027629
17 | WARNING:RichContext:failed: cd5e96bdcfd516a8c765 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jtsa.12467
18 | WARNING:RichContext:no embedded PDF: cd5e96bdcfd516a8c765 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jtsa.12467
19 | WARNING:RichContext:failed: 61a59fa620c7a33304a7 https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/j.2379-3988.2015.tb00082.x
20 | WARNING:RichContext:no embedded PDF: 61a59fa620c7a33304a7 https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/j.2379-3988.2015.tb00082.x
21 | WARNING:RichContext:failed: 
6d4a21c9f74b2ba41094 https://manuscript.elsevier.com/S1570677X1500060X/pdf/S1570677X1500060X.pdf 22 | WARNING:RichContext:6d4a21c9f74b2ba41094 https://manuscript.elsevier.com/S1570677X1500060X/pdf/S1570677X1500060X.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S1570677X1500060X/pdf/S1570677X1500060X.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 23 | WARNING:RichContext:failed: 948ccb64c9b6a83d9784 https://pdfs.journals.lww.com/topicsinclinicalnutrition/2004/01000/Child_Nutrition_Programs_Legislation__Past_and.3.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1545548711867;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdjMrFmUeNGda7afeiVO1Pd1ATOUOd1u0sL/wUiMcn9MnKKnFNjYZHw94sfJcYNY6Y;hash|N5oAa31wWF2UlO119Unaug== 24 | WARNING:RichContext:failed: 3baebd7472bc23445521 https://manuscript.elsevier.com/S0964569115001076/pdf/S0964569115001076.pdf 25 | WARNING:RichContext:3baebd7472bc23445521 https://manuscript.elsevier.com/S0964569115001076/pdf/S0964569115001076.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S0964569115001076/pdf/S0964569115001076.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 26 | WARNING:RichContext:failed: c27ce9a0ba19a4ef1c7c http://link.springer.com/10.1007/s11113-018-9496-y 27 | WARNING:RichContext:failed: debcdac22bbbccc57da5 https://doi.org/10.1016/j.isprsjprs.2018.03.019 28 | WARNING:RichContext:failed: 9d4491e6ad99b405cdea https://onlinelibrary.wiley.com/doi/pdf/10.1002/oby.22395 29 | WARNING:RichContext:no embedded PDF: 9d4491e6ad99b405cdea https://onlinelibrary.wiley.com/doi/pdf/10.1002/oby.22395 30 | WARNING:RichContext:failed: 251fb8b407821b95d6f7 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/he.20336 31 | WARNING:RichContext:no embedded PDF: 251fb8b407821b95d6f7 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/he.20336 32 | WARNING:RichContext:failed: 4f84b0f40e7664f53590 http://hdl.handle.net/10.1002/pam.21842 33 | WARNING:RichContext:failed: 578e5c1bf1b8a2f93347 https://onlinelibrary.wiley.com/doi/pdf/10.1111/rurd.12057 34 | WARNING:RichContext:no embedded PDF: 578e5c1bf1b8a2f93347 https://onlinelibrary.wiley.com/doi/pdf/10.1111/rurd.12057 35 | WARNING:RichContext:failed: 2dc1f36c8e8c269799f9 https://www.philadelphiafed.org/-/media/research-and-data/publications/working-papers/2017/wp17-19.pdf?utm_campaign=WorkingPapers&utm_source=2017/07/17&utm_medium=E-mail 36 | WARNING:RichContext:2dc1f36c8e8c269799f9 https://www.philadelphiafed.org/-/media/research-and-data/publications/working-papers/2017/wp17-19.pdf?utm_campaign=WorkingPapers&utm_source=2017/07/17&utm_medium=E-mail request exception HTTPSConnectionPool(host='www.philadelphiafed.org', port=443): Max retries exceeded with url: /-/media/research-and-data/publications/working-papers/2017/wp17-19.pdf?utm_campaign=WorkingPapers&utm_source=2017/07/17&utm_medium=E-mail (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 37 | WARNING:RichContext:failed: 053066f2eb3b45d300e2 
https://academic.oup.com/condor/article-pdf/121/2/duz007/28981665/duz007.pdf 38 | WARNING:RichContext:failed: 9c5d244917fe12f7be8d https://doi.org/10.1093/scipol/scz039 39 | WARNING:RichContext:failed: fdc935f99d70620d2a6f https://pediatrics.aappublications.org/content/pediatrics/135/1/e109.full.pdf 40 | WARNING:RichContext:failed: a0241d06c6cf070b947e https://www.tandfonline.com/doi/pdf/10.1080/00220388.2017.1324144?needAccess=true 41 | WARNING:RichContext:failed: 08979188655b9cd921c5 https://academic.oup.com/erae/article-pdf/42/3/499/7003763/jbu033.pdf 42 | WARNING:RichContext:failed: 8a7aa077551d3d9db6fc http://nsgl.gso.uri.edu/flsgp/flsgpm10001.pdf 43 | WARNING:RichContext:failed: 6e8c0b3329601bafa144 http://www.tandfonline.com/doi/pdf/10.1080/13636820.2016.1238837?needAccess=true 44 | WARNING:RichContext:failed: 08a045baa4fd84c47ddb http://europepmc.org/articles/PMC1497475?pdf=render 45 | WARNING:RichContext:failed: fc95db7f7d32cd5f6143 https://doi.org/10.1093/ajcn/nqz064 46 | WARNING:RichContext:failed: d0d445cafcdf2052eaa0 https://journals.sagepub.com/doi/pdf/10.1177/0002716219881628 47 | WARNING:RichContext:failed: a14d6bc056bd36399bef https://doi.org/10.1016/j.crm.2018.03.005 48 | WARNING:RichContext:failed: 6bc42ef8479d897dd5b8 https://manuscript.elsevier.com/S0195666316300952/pdf/S0195666316300952.pdf 49 | WARNING:RichContext:6bc42ef8479d897dd5b8 https://manuscript.elsevier.com/S0195666316300952/pdf/S0195666316300952.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S0195666316300952/pdf/S0195666316300952.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 50 | WARNING:RichContext:failed: 27bb9fe7e9fb3d742e26 https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2019MS001741 51 | WARNING:RichContext:no embedded PDF: 27bb9fe7e9fb3d742e26 https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2019MS001741 52 | WARNING:RichContext:failed: c930ff7ed86374e60faf http://pdfs.semanticscholar.org/dde2/64a68c1b893921be3b2875b861667e4260c4.pdf 53 | WARNING:RichContext:failed: 4e29b682e33b1288ea1a https://onlinelibrary.wiley.com/doi/pdf/10.1111/joca.12158 54 | WARNING:RichContext:no embedded PDF: 4e29b682e33b1288ea1a https://onlinelibrary.wiley.com/doi/pdf/10.1111/joca.12158 55 | WARNING:RichContext:failed: a4f9bb4fd4330c60a42d https://onlinelibrary.wiley.com/doi/pdf/10.1111/bjir.12469 56 | WARNING:RichContext:no embedded PDF: a4f9bb4fd4330c60a42d https://onlinelibrary.wiley.com/doi/pdf/10.1111/bjir.12469 57 | WARNING:RichContext:failed: caf688b9379490044130 https://www.sciencedirect.com/sdfe/reader/pii/S2210784316300973/pdf 58 | WARNING:RichContext:session.get failed: caf688b9379490044130 https://www.sciencedirect.com/sdfe/reader/pii/S2210784316300973/pdf 59 | WARNING:RichContext:failed: 590c800055988048ae2c https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pam.22104 60 | WARNING:RichContext:no embedded PDF: 590c800055988048ae2c https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pam.22104 61 | WARNING:RichContext:failed: 81c8c68f70538befd0f4 https://esajournals.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/ecy.2600 62 | WARNING:RichContext:no embedded PDF: 81c8c68f70538befd0f4 https://esajournals.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/ecy.2600 63 | WARNING:RichContext:failed: 94f7e2a4bb7a663c358b http://pdfs.semanticscholar.org/01cf/d33f38b51a55a75ffd59b30c5813850a77cc.pdf 64 | WARNING:RichContext:failed: 
6cc89963f723ce427738 http://www.degruyter.com/downloadpdf/j/alr.ahead-of-print/applirev-2018-0020/applirev-2018-0020.xml 65 | WARNING:RichContext:failed: d0447f01dcf36d16bc19 https://link.springer.com/content/pdf/10.1007%2F978-3-030-18072-0.pdf 66 | WARNING:RichContext:failed: 8d7664e43439a2a679f8 http://journals.sagepub.com/doi/pdf/10.1177/1049731516630385 67 | WARNING:RichContext:failed: 1d26e90c12bc8cc7fb2e http://jhppl.dukejournals.org/cgi/reprint/32/3/415.pdf 68 | WARNING:RichContext:failed: 3a9f49659008444cd656 http://europepmc.org/articles/PMC2690384?pdf=render 69 | WARNING:RichContext:failed: aaa54f5cf1d57869edfe https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jmcb.12574 70 | WARNING:RichContext:no embedded PDF: aaa54f5cf1d57869edfe https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jmcb.12574 71 | WARNING:RichContext:failed: 6ec3fd89ae20c802af4f https://agupubs.onlinelibrary.wiley.com/doi/pdf/10.1002/2013JC009604 72 | WARNING:RichContext:no embedded PDF: 6ec3fd89ae20c802af4f https://agupubs.onlinelibrary.wiley.com/doi/pdf/10.1002/2013JC009604 73 | WARNING:RichContext:failed: 88e5e048845a9ada8b5d https://doi.org/10.1016/j.rse.2019.03.022 74 | WARNING:RichContext:failed: 0b8c73a4cc256431bd0d https://onlinelibrary.wiley.com/doi/pdf/10.1111/joes.12122 75 | WARNING:RichContext:no embedded PDF: 0b8c73a4cc256431bd0d https://onlinelibrary.wiley.com/doi/pdf/10.1111/joes.12122 76 | WARNING:RichContext:failed: 85bc3a4d4f38ff39baf4 http://www.annualreviews.org/doi/pdf/10.1146/annurev-criminol-032317-091915 77 | WARNING:RichContext:failed: 4a112f8ef5541d2a89a5 https://www.tandfonline.com/doi/pdf/10.1080/19320248.2013.786663?needAccess=true 78 | WARNING:RichContext:failed: 4895cb2001777057a23c https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/pbaf.12244 79 | WARNING:RichContext:no embedded PDF: 4895cb2001777057a23c https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/pbaf.12244 80 | WARNING:RichContext:failed: df4879004c5f8b41da49 https://onlinelibrary.wiley.com/doi/pdf/10.1111/ecoj.12415 81 | WARNING:RichContext:no embedded PDF: df4879004c5f8b41da49 https://onlinelibrary.wiley.com/doi/pdf/10.1111/ecoj.12415 82 | WARNING:RichContext:failed: 0de2b787b9fbf0067276 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ijpo.12562 83 | WARNING:RichContext:no embedded PDF: 0de2b787b9fbf0067276 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ijpo.12562 84 | WARNING:RichContext:failed: 64d397e1e2a4c4b7ca7a https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pam.22093 85 | WARNING:RichContext:no embedded PDF: 64d397e1e2a4c4b7ca7a https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pam.22093 86 | WARNING:RichContext:failed: d670e19e556e0a43fa05 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/coep.12266 87 | WARNING:RichContext:no embedded PDF: d670e19e556e0a43fa05 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/coep.12266 88 | WARNING:RichContext:failed: 0a54d3ff7a70f775f98c https://onlinelibrary.wiley.com/doi/pdf/10.1111/fare.12310 89 | WARNING:RichContext:no embedded PDF: 0a54d3ff7a70f775f98c https://onlinelibrary.wiley.com/doi/pdf/10.1111/fare.12310 90 | WARNING:RichContext:failed: 3519f2665128f8589e19 http://onlinelibrary.wiley.com/doi/10.1002/pop4.171/pdf 91 | WARNING:RichContext:no embedded PDF: 3519f2665128f8589e19 http://onlinelibrary.wiley.com/doi/10.1002/pop4.171/pdf 92 | WARNING:RichContext:failed: 118d90c6757823a66d60 https://doi.org/10.1093/aepp/ppy018 93 | WARNING:RichContext:failed: 18c64ba865ad68d6cf90 Retail oligopoly power, dairy compact, and Boston 
milk prices 94 | WARNING:RichContext:18c64ba865ad68d6cf90 Retail oligopoly power, dairy compact, and Boston milk prices request exception Invalid URL 'Retail oligopoly power, dairy compact, and Boston milk prices': No schema supplied. Perhaps you meant http://Retail oligopoly power, dairy compact, and Boston milk prices? 95 | WARNING:RichContext:failed: 0522d4cdb559dd0b03ee https://doi.org/10.21105/joss.01462 96 | WARNING:RichContext:failed: 8175cb931990b98a72d8 http://dataspace.princeton.edu/jspui/bitstream/88435/dsp01zw12z530b/1/441.pdf 97 | WARNING:RichContext:failed: 830fe95fd71f124a5e68 https://link.springer.com/content/pdf/10.1007%2Fs10896-019-00058-y.pdf 98 | WARNING:RichContext:failed: 95a93b5949b7868fcfe9 https://bmcmusculoskeletdisord.biomedcentral.com/track/pdf/10.1186/1471-2474-12-182 99 | WARNING:RichContext:failed: 756a33253efcecd3b9e5 https://onlinelibrary.wiley.com/doi/pdf/10.1111/mcn.12488 100 | WARNING:RichContext:no embedded PDF: 756a33253efcecd3b9e5 https://onlinelibrary.wiley.com/doi/pdf/10.1111/mcn.12488 101 | WARNING:RichContext:failed: efba7abdb9ccd597a997 https://doi.org/10.1016/j.appet.2014.12.003 102 | WARNING:RichContext:failed: a1d317452d59817cd6ff https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/oby.22540 103 | WARNING:RichContext:no embedded PDF: a1d317452d59817cd6ff https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/oby.22540 104 | WARNING:RichContext:failed: 89f6c098e8444e7a583b https://www.frontiersin.org/articles/10.3389/fenvs.2019.00068/pdf 105 | WARNING:RichContext:failed: e74e590413a621376e10 http://www.umass.edu/resec/faculty/rojas/docs/Taxes.pdf 106 | WARNING:RichContext:failed: 67e991d67225171ce669 http://journals.sagepub.com/doi/pdf/10.1177/0275074018765809 107 | WARNING:RichContext:failed: 571089b7e3a18f69602a https://doi.org/10.3934/environsci.2016.3.509 108 | WARNING:RichContext:failed: 526d34e526fe153f418b https://link.springer.com/content/pdf/10.1007%2F978-3-319-31816-5_2669-1.pdf 109 | WARNING:RichContext:failed: ac5b23a0dbc3312409c2 https://izajolp.springeropen.com/track/pdf/10.1186/s40173-015-0043-8 110 | WARNING:RichContext:ac5b23a0dbc3312409c2 https://izajolp.springeropen.com/track/pdf/10.1186/s40173-015-0043-8 request exception HTTPSConnectionPool(host='izajolp.springeropen.com', port=443): Read timed out. 
(read timeout=20) 111 | WARNING:RichContext:failed: 42660e147ce2722ca8c0 https://onlinelibrary.wiley.com/doi/pdf/10.1002/ajpa.22161 112 | WARNING:RichContext:no embedded PDF: 42660e147ce2722ca8c0 https://onlinelibrary.wiley.com/doi/pdf/10.1002/ajpa.22161 113 | WARNING:RichContext:failed: e3c089f9a76fa109516a http://policy.rutgers.edu/faculty/hetling/Hetling_Diversion_JPP.pdf 114 | WARNING:RichContext:failed: 9f66008fbb6b3f813781 http://dl.acm.org/ft_gateway.cfm?id=3232775&type=pdf 115 | WARNING:RichContext:failed: 28fe310677131a723ac9 https://dergipark.org.tr/tr/download/article-file/855095 116 | WARNING:RichContext:failed: d8f33b8436040b1230d0 https://academic.oup.com/restud/article-pdf/86/3/1170/28529276/rdy021.pdf 117 | WARNING:RichContext:failed: 70846c225e81d8944ef2 https://doi.org/10.7758/rsf.2015.1.1.07 118 | WARNING:RichContext:failed: 30c402dcc869061d60fb http://journals.sagepub.com/doi/pdf/10.1177/0734016818769705 119 | WARNING:RichContext:failed: 87699b623cea5b299453 https://papers.ssrn.com/sol3/Delivery.cfm?abstractid=966252 120 | WARNING:RichContext:failed: 45585929322868e33f9e https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2019JD031232 121 | WARNING:RichContext:no embedded PDF: 45585929322868e33f9e https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2019JD031232 122 | WARNING:RichContext:failed: 3e1934281bcca346b4fb https://doi.org/10.4103/jfmpc.jfmpc_185_17 123 | WARNING:RichContext:failed: 5a6683af271dd94c08bf https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/joc.6367 124 | WARNING:RichContext:no embedded PDF: 5a6683af271dd94c08bf https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/joc.6367 125 | WARNING:RichContext:failed: fe2001372b6842192771 https://onlinelibrary.wiley.com/doi/pdf/10.1111/birt.12394 126 | WARNING:RichContext:no embedded PDF: fe2001372b6842192771 https://onlinelibrary.wiley.com/doi/pdf/10.1111/birt.12394 127 | WARNING:RichContext:failed: 27f9858462c84b19ec3b https://journals.sagepub.com/doi/pdf/10.1177/0002716219884546 128 | WARNING:RichContext:failed: 9502d3e234e31a774907 https://doi.org/10.1016/j.amepre.2016.08.023 129 | WARNING:RichContext:failed: 835616fa5075a00a4948 https://doi.org/10.5530/ami.2015.1.10 130 | WARNING:RichContext:failed: bab9c37e1d71bc2d046a https://onlinelibrary.wiley.com/doi/pdf/10.1111/1745-9133.12359 131 | WARNING:RichContext:no embedded PDF: bab9c37e1d71bc2d046a https://onlinelibrary.wiley.com/doi/pdf/10.1111/1745-9133.12359 132 | WARNING:RichContext:failed: 30f40b6634264fc34af1 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6179934 133 | WARNING:RichContext:30f40b6634264fc34af1 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6179934 request exception ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) 134 | WARNING:RichContext:failed: d11ed61dee9ed87ac448 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/qj.3651 135 | WARNING:RichContext:no embedded PDF: d11ed61dee9ed87ac448 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/qj.3651 136 | WARNING:RichContext:failed: 9ce19e2215388a5a3821 https://www.frontiersin.org/articles/10.3389/fbuil.2019.00105/pdf 137 | WARNING:RichContext:failed: 36bc15e99d84db9278fa https://link.springer.com/content/pdf/10.1007%2Fs12571-017-0733-8.pdf 138 | WARNING:RichContext:failed: 9d2897d8e568f46e0472 https://onlinelibrary.wiley.com/doi/pdfdirect/10.4073/csr.2015.2 139 | WARNING:RichContext:no embedded PDF: 9d2897d8e568f46e0472 https://onlinelibrary.wiley.com/doi/pdfdirect/10.4073/csr.2015.2 140 | 
WARNING:RichContext:failed: 2dad560eb70b1ed8f114 https://onlinelibrary.wiley.com/doi/pdf/10.1002/agr.21537 141 | WARNING:RichContext:no embedded PDF: 2dad560eb70b1ed8f114 https://onlinelibrary.wiley.com/doi/pdf/10.1002/agr.21537 142 | WARNING:RichContext:failed: 136ec8e7c40289836352 http://fmpc.uconn.edu/publications/rr/rr37.pdf 143 | WARNING:RichContext:136ec8e7c40289836352 http://fmpc.uconn.edu/publications/rr/rr37.pdf request exception HTTPConnectionPool(host='fmpc.uconn.edu', port=80): Max retries exceeded with url: /publications/rr/rr37.pdf (Caused by NewConnectionError(': Failed to establish a new connection: [Errno -2] Name or service not known')) 144 | WARNING:RichContext:failed: 95cd3d20b62594528334 http://hdl.handle.net/20.500.11794/10748 145 | WARNING:RichContext:95cd3d20b62594528334 http://hdl.handle.net/20.500.11794/10748 request exception HTTPSConnectionPool(host='corpus.ulaval.ca', port=443): Read timed out. (read timeout=20) 146 | WARNING:RichContext:failed: 0b29b6675fc51a1cc47d https://doi.org/10.1016/j.ijlp.2018.10.004 147 | WARNING:RichContext:failed: c3c44096bf0e3c9ffbf3 https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2018MS001595 148 | WARNING:RichContext:no embedded PDF: c3c44096bf0e3c9ffbf3 https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1029/2018MS001595 149 | WARNING:RichContext:failed: 9aa50a5f06b9166f4715 https://doi.org/10.1016/j.jbusvent.2018.10.005 150 | WARNING:RichContext:failed: b52a4d9ed708d23ab31f https://manuscript.elsevier.com/S027795361730237X/pdf/S027795361730237X.pdf 151 | WARNING:RichContext:b52a4d9ed708d23ab31f https://manuscript.elsevier.com/S027795361730237X/pdf/S027795361730237X.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S027795361730237X/pdf/S027795361730237X.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 152 | WARNING:RichContext:failed: 6b8a515fa898d061c94e https://doi.org/10.1016/j.jrurstud.2016.02.001 153 | WARNING:RichContext:failed: 6cea458960193ca6c4c8 https://doi.org/10.1093/ajae/aay002 154 | WARNING:RichContext:failed: d1f47b6efca463251268 https://nyaspubs.onlinelibrary.wiley.com/doi/pdf/10.1111/nyas.12594 155 | WARNING:RichContext:no embedded PDF: d1f47b6efca463251268 https://nyaspubs.onlinelibrary.wiley.com/doi/pdf/10.1111/nyas.12594 156 | WARNING:RichContext:failed: b14b63bddb9a8e336432 http://journals.sagepub.com/doi/pdf/10.1177/0269215518758483 157 | WARNING:RichContext:failed: d1b427e76cad5fdac247 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/padr.12227 158 | WARNING:RichContext:no embedded PDF: d1b427e76cad5fdac247 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/padr.12227 159 | WARNING:RichContext:failed: 505caa8f96f691006fc6 https://journals.sagepub.com/doi/pdf/10.1177/0890117118786871 160 | WARNING:RichContext:failed: becacc4f9fe1589c4db4 https://manuscript.elsevier.com/S1499404616300318/pdf/S1499404616300318.pdf 161 | WARNING:RichContext:becacc4f9fe1589c4db4 https://manuscript.elsevier.com/S1499404616300318/pdf/S1499404616300318.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S1499404616300318/pdf/S1499404616300318.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 162 | WARNING:RichContext:failed: 21783645b6d33194d0f7 
https://www.tandfonline.com/doi/pdf/10.1080/10511482.2014.1003190?needAccess=true 163 | WARNING:RichContext:failed: d149447ba1b82d8b9ad2 https://onlinelibrary.wiley.com/doi/pdf/10.1111/ecog.04028 164 | WARNING:RichContext:no embedded PDF: d149447ba1b82d8b9ad2 https://onlinelibrary.wiley.com/doi/pdf/10.1111/ecog.04028 165 | WARNING:RichContext:failed: ba8305c710558f122b1d http://www.jneb.org/article/S1499404619300740/pdf 166 | WARNING:RichContext:failed: 86ac8d47e706be6c8219 https://doi.org/10.17000/kspr.25.4.201812.199 167 | WARNING:RichContext:failed: a9ca248fe2f8bdb1d751 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/1745-9125.12199 168 | WARNING:RichContext:no embedded PDF: a9ca248fe2f8bdb1d751 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/1745-9125.12199 169 | WARNING:RichContext:failed: ef8f24963245a184e40f http://crc.nv.gov/docs/forecst_wksp/developmentspaper.pdf 170 | WARNING:RichContext:failed: 8c9ccf399eebc2338418 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/jia2.25306 171 | WARNING:RichContext:no embedded PDF: 8c9ccf399eebc2338418 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/jia2.25306 172 | WARNING:RichContext:failed: 209e57ded73099de4596 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/met.1862 173 | WARNING:RichContext:no embedded PDF: 209e57ded73099de4596 https://rmets.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/met.1862 174 | WARNING:RichContext:failed: 1af374454453aa5d5825 https://manuscript.elsevier.com/S1570677X16300326/pdf/S1570677X16300326.pdf 175 | WARNING:RichContext:1af374454453aa5d5825 https://manuscript.elsevier.com/S1570677X16300326/pdf/S1570677X16300326.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S1570677X16300326/pdf/S1570677X16300326.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 176 | WARNING:RichContext:failed: ac1a920b63e1d7fb783f http://www.degruyter.com/downloadpdf/j/jetl.2019.10.issue-2/jetl-2019-0108/jetl-2019-0108.xml 177 | WARNING:RichContext:failed: 6326552bd07f2ec4aeaf http://pdfs.semanticscholar.org/ca60/d8c02ca0590e15110e1b7b854c34ca9a52f2.pdf 178 | WARNING:RichContext:failed: ec6d8c7e87ae9bfbf142 http://www.tandfonline.com/doi/pdf/10.1080/00036846.2011.568397 179 | WARNING:RichContext:failed: 618e73b7a15fbdd9f814 https://pdfs.journals.lww.com/jaids/2015/07010/The_Association_Between_Food_Insufficiency_and_HIV.11.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1542048624015;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdmXWu5dPlhDubainwsnRl16RkxGril9arViLOJfOXddA=;hash|jfKu6Iv3RW2TNcA9hXUj8A== 180 | WARNING:RichContext:failed: 83a4e67742aabc45f9dc https://doi.org/10.1016/j.ocecoaman.2016.12.014 181 | WARNING:RichContext:failed: 77f7e285b82c924fd01d https://manuscript.elsevier.com/S030691921600018X/pdf/S030691921600018X.pdf 182 | WARNING:RichContext:77f7e285b82c924fd01d https://manuscript.elsevier.com/S030691921600018X/pdf/S030691921600018X.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S030691921600018X/pdf/S030691921600018X.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify 
failed')])"))) 183 | WARNING:RichContext:failed: ada821e909a30a8c6e0d https://doi.org/10.1016/j.childyouth.2014.04.003 184 | WARNING:RichContext:failed: 034349cf00da5e130480 http://archive.nyu.edu/bitstream/2451/31847/2/Wachtel_ProCyclicalCapital_Jul2013.pdf 185 | WARNING:RichContext:034349cf00da5e130480 http://archive.nyu.edu/bitstream/2451/31847/2/Wachtel_ProCyclicalCapital_Jul2013.pdf request exception HTTPSConnectionPool(host='archive.nyu.edu', port=443): Max retries exceeded with url: /bitstream/2451/31847/2/Wachtel_ProCyclicalCapital_Jul2013.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 186 | WARNING:RichContext:failed: 49f839e31fe22701509f http://pdfs.semanticscholar.org/cb1e/f995d26cda189479736850b8c10b6fe3cf79.pdf 187 | WARNING:RichContext:failed: d2207b8ab7654cac902c https://doi.org/10.7554/elife.32822 188 | WARNING:RichContext:failed: 0ff3b036d813af980976 https://doi.org/10.1093/erae/jby026 189 | WARNING:RichContext:failed: 33d8143055f59f2829af https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/hsc.12619 190 | WARNING:RichContext:no embedded PDF: 33d8143055f59f2829af https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/hsc.12619 191 | WARNING:RichContext:failed: 417299798d839a7ef7e8 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4330833/pdf 192 | WARNING:RichContext:417299798d839a7ef7e8 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4330833/pdf request exception ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) 193 | WARNING:RichContext:failed: f4a39744928ec6a912bb https://doi.org/10.3168/jds.s0022-0302(78)94422-3 194 | WARNING:RichContext:failed: f65c9b9345c259191f8c https://academic.oup.com/psychsocgerontology/article-pdf/62/4/S209/1383923/S209.pdf 195 | WARNING:RichContext:failed: e26969208ccf52d42846 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jftr.12302 196 | WARNING:RichContext:no embedded PDF: e26969208ccf52d42846 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/jftr.12302 197 | WARNING:RichContext:failed: 646197a01f532c943874 https://manuscript.elsevier.com/S027795361530280X/pdf/S027795361530280X.pdf 198 | WARNING:RichContext:646197a01f532c943874 https://manuscript.elsevier.com/S027795361530280X/pdf/S027795361530280X.pdf request exception HTTPSConnectionPool(host='manuscript.elsevier.com', port=443): Max retries exceeded with url: /S027795361530280X/pdf/S027795361530280X.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 199 | WARNING:RichContext:failed: c6cbfb4533f33f1f895d https://doi.org/10.1016/j.marpol.2016.04.030 200 | WARNING:RichContext:failed: 1a2665361435e5929d9a https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pop4.203 201 | WARNING:RichContext:no embedded PDF: 1a2665361435e5929d9a https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/pop4.203 202 | WARNING:RichContext:failed: 144fa5560b013a344936 http://repositorio.ul.pt/bitstream/10451/38733/1/ICS_SGCardoso_et_al_School.pdf 203 | WARNING:RichContext:144fa5560b013a344936 http://repositorio.ul.pt/bitstream/10451/38733/1/ICS_SGCardoso_et_al_School.pdf request exception HTTPSConnectionPool(host='repositorio.ul.pt', port=443): Max retries exceeded with url: /bitstream/10451/38733/1/ICS_SGCardoso_et_al_School.pdf (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 204 | 
WARNING:RichContext:failed: 47e1bc82160141ce0352 https://doi.org/10.1093/jssam/smz024 205 | WARNING:RichContext:failed: 1c94ef1e2f327d388feb https://papers.ssrn.com/sol3/Delivery.cfm?abstractid=2056045 206 | WARNING:RichContext:failed: aad7a5f99885cfa59d83 https://onlinelibrary.wiley.com/doi/pdf/10.1111/jora.12272 207 | WARNING:RichContext:no embedded PDF: aad7a5f99885cfa59d83 https://onlinelibrary.wiley.com/doi/pdf/10.1111/jora.12272 208 | WARNING:RichContext:failed: 40f8b8f798ae78fef5b3 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ecin.12842 209 | WARNING:RichContext:no embedded PDF: 40f8b8f798ae78fef5b3 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ecin.12842 210 | WARNING:RichContext:failed: 8f616ddb80faace26d72 https://doi.org/10.3982/qe564 211 | WARNING:RichContext:failed: 778a15d3c9f53050675e https://doi.org/10.1016/j.childyouth.2016.10.018 212 | WARNING:RichContext:failed: 1df1bb6a76cf596bccf8 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ecin.12837 213 | WARNING:RichContext:no embedded PDF: 1df1bb6a76cf596bccf8 https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/ecin.12837 214 | WARNING:RichContext:failed: 70a5a100bf7301a203bf https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/oby.21862 215 | WARNING:RichContext:no embedded PDF: 70a5a100bf7301a203bf https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/oby.21862 216 | WARNING:RichContext:failed: 2d0084671761a8116774 https://nyaspubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/nyas.13396 217 | WARNING:RichContext:no embedded PDF: 2d0084671761a8116774 https://nyaspubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/nyas.13396 218 | WARNING:RichContext:failed: f74c7adc3a94432f1535 https://onlinelibrary.wiley.com/doi/pdf/10.1111/phn.12375 219 | WARNING:RichContext:no embedded PDF: f74c7adc3a94432f1535 https://onlinelibrary.wiley.com/doi/pdf/10.1111/phn.12375 220 | WARNING:RichContext:failed: 436d4a380504abf1cb3c https://doi.org/10.5993/ajhb.42.1.3 221 | WARNING:RichContext:failed: 050f7524a00fccc84e03 https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/cdev.12764 222 | WARNING:RichContext:no embedded PDF: 050f7524a00fccc84e03 https://srcd.onlinelibrary.wiley.com/doi/pdfdirect/10.1111/cdev.12764 223 | WARNING:RichContext:failed: fddf252c8b7f7ab55e69 https://www.dllr.state.md.us/employment/unemployment.shtml 224 | WARNING:RichContext:fddf252c8b7f7ab55e69 https://www.dllr.state.md.us/employment/unemployment.shtml request exception HTTPSConnectionPool(host='www.dllr.state.md.us', port=443): Max retries exceeded with url: /employment/unemployment.shtml (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 225 | WARNING:RichContext:failed: 722701806b8efd709030 https://www.edd.ca.gov/unemployment/ 226 | WARNING:RichContext:722701806b8efd709030 https://www.edd.ca.gov/unemployment/ request exception HTTPSConnectionPool(host='www.edd.ca.gov', port=443): Max retries exceeded with url: /unemployment/ (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 227 | WARNING:RichContext:failed: ae027bea3fce790000b1 https://dew.sc.gov/tools-resources/data-statistics 228 | WARNING:RichContext:ae027bea3fce790000b1 https://dew.sc.gov/tools-resources/data-statistics request exception HTTPSConnectionPool(host='dew.sc.gov', port=443): Max retries exceeded with url: /tools-resources/data-statistics (Caused by SSLError(SSLError("bad handshake: Error([('SSL 
routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 229 | WARNING:RichContext:failed: e84ccf3e64a9f10a03a5 https://otda.ny.gov/resources/caseload/ 230 | WARNING:RichContext:e84ccf3e64a9f10a03a5 https://otda.ny.gov/resources/caseload/ request exception HTTPSConnectionPool(host='otda.ny.gov', port=443): Max retries exceeded with url: /resources/caseload/ (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'tls_process_server_certificate', 'certificate verify failed')])"))) 231 | WARNING:RichContext:failed: 7785c8d5d201ff8ba4da http://www23.statcan.gc.ca/imdb/p2SV.pl?Function=getSurvey&Id=795204 232 | WARNING:RichContext:7785c8d5d201ff8ba4da http://www23.statcan.gc.ca/imdb/p2SV.pl?Function=getSurvey&Id=795204 request exception ('Connection aborted.', OSError("(104, 'ECONNRESET')")) 233 | WARNING:RichContext:number of research publications: 1604 234 | WARNING:RichContext:successfully downloaded 1452 PDF files 235 | WARNING:RichContext:number of datasets: 208 236 | WARNING:RichContext:successfully downloaded 199 resource files 237 | --------------------------------------------------------------------------------