├── .gitignore
├── .isort.cfg
├── .pre-commit-config.yaml
├── Cotools_demo.ipynb
├── LICENSE.txt
├── README.md
├── Your To-Do list
├── cotools
│   ├── __init__.py
│   ├── data.py
│   ├── hopkins.py
│   └── text.py
├── nix
│   └── cotools.nix
├── setup.cfg
├── setup.py
└── test.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.h5
*.mp4
*.gif
*.json
#result
#data/


# tensorboard logs
logs/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/*.npy
augmented/*.npy

#pylint
.pylint.d

.vscode-server/
.bash_history

tmux.conf

data/*
!data/*.sh

--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
[settings]
known_third_party = requests,setuptools,xmltodict

--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
# pre-commit run --all-files
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
      - id: check-ast
      - id: check-byte-order-marker
      - id: check-case-conflict
      - id: check-docstring-first
      - id: check-executables-have-shebangs
      - id: check-json
      - id: check-yaml
      - id: debug-statements
      - id: detect-aws-credentials
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: mixed-line-ending

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.730
    hooks:
      - id: mypy
        args: [--ignore-missing-imports]
  - repo: https://github.com/asottile/seed-isort-config
    rev: v1.9.3
    hooks:
      - id: seed-isort-config
  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v4.3.21
    hooks:
      - id: isort
  - repo: https://github.com/psf/black
    rev: 19.3b0
    hooks:
      - id: black
  - repo: https://github.com/asottile/pyupgrade
    rev: v2.1.0
    hooks:
      - id: pyupgrade
  - repo: https://github.com/asottile/blacken-docs
    rev: v1.6.0
    hooks:
      - id: blacken-docs
        additional_dependencies: [black==19.3b0]

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 YOUR NAME

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![PyPI version](https://badge.fury.io/py/cord-19-tools.svg)](https://badge.fury.io/py/cord-19-tools)
# COVID-19 Data Tools

Tools for making COVID-19 data slightly easier to work with for everyone! If you A) think something would be useful in your research or B) have some helpful code to contribute, make an issue or PR ASAP so we can get your code shared!

## Installation

```
pip install cord-19-tools
```

BE SURE TO HAVE THE MOST RECENT VERSION! I will be constantly updating to make sure users are getting the right data! Semantic Scholar updates the dataset every Friday, so on Fridays and Saturdays be sure to re-download the data!

# Demo

[Demonstration Notebook on colab](https://colab.research.google.com/drive/1al-K7vT3m72EOBduMpN2rQF1bLdGikx_)

## Downloading the data

To download and extract the data, use the `download` function:

```python
import cotools
from pprint import pprint

cotools.download()
```

For now this downloads the data from the [CORD-19 dataset](https://pages.semanticscholar.org/coronavirus-research) (the latest metadata files are included by default), extracts all the tarfiles, and places everything in a single directory.
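
If you do not need everything, the `match`, `regex`, and `metadata` arguments let you narrow the download. A quick sketch (the patterns mirror the examples in the `download` docstring; the exact file names in the bucket change from week to week, so treat them as placeholders):

```python
import cotools

# only grab files whose names contain ".json"
cotools.download(dir="data", match=".json")

# use a regex to grab one day's tarballs, and skip the metadata files
cotools.download(
    dir="data", match="2020-04-10.*.tar.gz", regex=True, metadata=False
)
```
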
## The Paperset class

This is a class for lazily loading papers from the [CORD-19 dataset](https://pages.semanticscholar.org/coronavirus-research).


```python
# no `/` at the end please!
data = cotools.Paperset("data/comm_use_subset")

# indexes with ints
pprint(data[0])
# returns a dict

# and slices!
pprint(data[:2])
# returns a list of dicts


print(len(data))

# takes about 5gb in memory
alldata = data[:]
```

Let's talk for a bit about how it works, and why it doesn't take a gigantic amount of memory. The files are not actually loaded into python ***until the data is indexed***. Upon indexing, the files at those indexes are read into python, resulting in a list of dictionaries. This means you can still contribute while working on a low-resource system.
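
Because indexing is what triggers the file reads, you can stream over the whole corpus one paper at a time and never hold more than a single document in memory. A minimal sketch (the phrase being counted is just an example):

```python
n_hits = 0
for i in range(len(data)):
    paper = data[i]  # only this one file is read from disk
    if "incubation period" in cotools.text(paper).lower():
        n_hits += 1
print(n_hits)
```
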
### Getting text and abstracts

For text, there is the `text` function, which returns the text from a single document, the `texts` function, which returns the text from multiple documents, and the `Paperset.texts()` method, which gets the text from every document in a `Paperset`:

```python
print(cotools.text(data[0]))
print(cotools.texts(data[12:18]))

alltext = data.texts()
# alltext = cotools.texts(alldata)
```

For abstracts, we have a similar API:

```python
print(cotools.abstract(data[0]))
print(cotools.abstracts(data[12:18]))

allabs = data.abstracts()
# allabs = cotools.abstracts(alldata)
```

### Manipulating

You can also manipulate the documents with the `Paperset.apply` method:

```python
keys = data.apply(lambda x: list(x.keys()))
# then let's combine them into a set
print(set(sum(keys, [])))
```
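
Since `apply` pushes the work through a `multiprocessing` pool, it is also a handy way to compute per-paper statistics. A small sketch (a plain named function is used because named functions pickle more reliably than lambdas when work is shipped to pool workers, and the "word count" here is just whitespace splitting):

```python
def word_count(paper):
    return len(cotools.text(paper).split())


word_counts = data.apply(word_count)
print(sum(word_counts) / len(word_counts))  # average paper length, in tokens
```
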
### Searching

You can search with a flat list OR a nested list! See the demo notebook for more examples!

```python
txt = [["covid", "novel coronavirus"], ["ventilator", "cpap", "bipap"]]

x = cotools.search(data, txt)
print(len(x))
print(len(cotools.search(data, txt[0])))
print(len(cotools.search(data, txt[-1])))
```
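
The nesting controls the logic: terms inside an inner list are OR'd together, and the inner lists themselves are AND'ed (this mirrors the `search` docstring). So the `txt` query above keeps only papers that mention *both* a COVID term *and* a ventilation term, while a flat list is a plain OR:

```python
# papers mentioning either phrase
either = cotools.search(data, ["covid", "novel coronavirus"])

# papers mentioning (covid OR novel coronavirus) AND (ventilator OR cpap OR bipap)
both = cotools.search(data, [["covid", "novel coronavirus"], ["ventilator", "cpap", "bipap"]])
```

One gotcha, straight from the docstring: a single-element nested list like `[["string1"]]` gives weird results — use `["string1"]` instead.
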
# TODO

- [x] Metadata
- [ ] Other data, for example data from [this aggregate site](https://www.kiragoldner.com/covid19/) and [this google spreadsheet](https://docs.google.com/spreadsheets/u/1/d/e/2PACX-1vRwAqp96T9sYYq2-i7Tj0pvTf6XVHjDSMIKBdZHXiCGGdNC0ypEU9NbngS8mxea55JuCFuua1MUeOj5/pubhtml#)

--------------------------------------------------------------------------------
/Your To-Do list:
--------------------------------------------------------------------------------
Help on function download in module cotools.data:

download(dir: str = '.', match: str = '.tar.gz', regex: bool = False) -> None
    download:
        Download CORD-19 dataset from ai2's S3 bucket.
    -----------------------------------------------------
    args:
        dir: Directory to download the data into.
        match: A string dictating which files to download. Defaults to match
        all tar files.
        regex: If regex should be used. Otherwise, a `match in x` is used.
    -----------------------------------------------------
    how it works:
        Match all files: `download('data', match='*')`
        Match only JSON files: `download('data', match='.json')`
        Match tar files from April 10: `download('data', match='2020-04-10.*.tar.gz', regex=True)`

--------------------------------------------------------------------------------
/cotools/__init__.py:
--------------------------------------------------------------------------------
from .data import Paperset, download, search
from .hopkins import get_hopkins
from .text import _get_abstract as abstract
from .text import _get_text as text
from .text import get_abstracts as abstracts
from .text import get_texts as texts

--------------------------------------------------------------------------------
/cotools/data.py:
--------------------------------------------------------------------------------
import json
import os
from datetime import datetime
from datetime import timedelta
import shutil
import tarfile
from functools import reduce, partial
from typing import Any, Callable, Dict, List, Optional, Sequence, TypeVar, Union, overload
from urllib.request import urlopen
import requests
import multiprocessing
import glob

import xmltodict as xml
import re

from .text import _get_abstract, _get_text

searchtext = Union[str, List[str]]
searchtexts = Union[searchtext, List[searchtext]]
textlist = List[Dict[str, Any]]
nestedlist = List[List[str]]


class Paperset:
    """
    The Paperset class:
        __init__ args:
            directory: a string, the directory where the jsons are stored

        description:
            lazy loader for cord-19 text files. Data is not actually loaded
            until indexing, until then it just indexes files. Can be
            indexed with both ints and slices.
    """

    def __init__(self, directory: str) -> None:
        self.directory = directory
        # get all of the text files recursively
        file_paths = glob.glob(self.directory + "/**/*.json", recursive=True)
        self.dir_dict = {idx: file_path for idx, file_path in enumerate(file_paths)}
        self.keys = list(self.dir_dict.keys())

    def _load_file(self, path: str) -> dict:
        with open(f"{path}") as handle:
            outdict = json.loads(handle.read())
        return outdict

    def __getitem__(self, indices: Union[int, slice]) -> Union[list, dict]:
        slicedkeys = list(self.dir_dict.keys())[indices]
        if not isinstance(slicedkeys, list):
            slicedkeys = [slicedkeys]
        out = [self._load_file(self.dir_dict[key]) for key in slicedkeys]
        if len(out) == 1:
            return out[0]
        else:
            return out

    def _helper(self, k: int, fun: Callable[..., Any]) -> Any:
        return fun(self._load_file(self.dir_dict[k]))

    def apply(self, fn: Callable[..., Any]) -> List[Any]:
        """
        Paperset.apply:
            Iterate a function through a paperset
        ---------------------------------------------
        args:
            fn: any function! Should work on the structure of paperset[0]

        runs in parallel!
        """
        helper = partial(self._helper, fun=fn)
        with multiprocessing.Pool(None) as p:
            res = p.map(helper, self.dir_dict.keys())
        return list(res)
        # return [fn(self._load_file(self.dir_dict[k])) for k in self.dir_dict.keys()]

    def texts(self) -> List[str]:
        """
        Paperset.texts:
            get all the text of all the papers, in list form!
        """
        return self.apply(_get_text)

    def abstracts(self) -> List[str]:
        """
        Paperset.abstracts:
            get all the abstracts of all the papers, in list form!
        """
        return self.apply(_get_abstract)

    def __len__(self) -> int:
        return len(self.dir_dict.keys())


def _search_helper(x: dict, txt: List[str]) -> Optional[dict]:
    if any(c in _get_text(x).lower() for c in txt) or any(
        c in _get_abstract(x).lower() for c in txt
    ):
        return x
    else:
        return None


def _search(ps: Union[Paperset, textlist], txt: Any) -> textlist:
    # some checkers on txt, to prevent weirdness
    if type(txt[0]) is list:
        raise ValueError("Items of the search cannot be nested lists!")
    if type(txt) is str:
        txt = [txt]
    # the documents are lowercased before comparison, so lowercase the terms too
    txt = [c.lower() for c in txt]

    # If we accept a paperset, which has thousands of papers, we want to operate
    # in parallel
    if type(ps) is Paperset:
        # load the text into the helper function
        helper = partial(_search_helper, txt=txt)
        # apply in parallel!
        out = ps.apply(helper)
        return list(filter(lambda x: x is not None, out))
    else:
        return [
            x
            for x in ps
            if any(c in _get_text(x).lower() for c in txt)
            or any(c in _get_abstract(x).lower() for c in txt)
        ]


def search(
    ps: Union[Paperset, textlist], terms: Union[searchtexts, searchtext, nestedlist],
) -> textlist:
    """
    search:
        search through a paperset or list of paper dicts
    -----------------------------------------------------
    args:
        ps: a paperset or list of paper dicts
        terms: search terms, a list or nested (one layer of nesting only) list
    -----------------------------------------------------
    how it works:
        search(ps, ['string']) will search for all papers containing the phrase
        'string'
        search(ps, ['string1', 'string2']) will search for all papers containing
        either phrase
        search(ps, [['string1'], ['string2']]) will search for all papers
        containing both phrases
        search(ps, [['string1', 'string2'], ['string3', 'string4']]) will
        search for all papers containing both (either string1 or string 2) and
        (either string3 or string4)
    ----------------------------------------------------
    notes:
        search(ps, [['string1']]) will return weird results! Do not do!
        you do not have to worry about case! That is taken care of!

    """
    if type(terms) is not list:
        raise ValueError("search terms must be a list!!")
    types = [type(x) for x in terms]
    nests = len(list(filter(lambda x: x is list, types)))
    if nests != 0:
        return reduce(lambda x, y: _search(x, y), terms, ps)
    else:
        return _search(ps, terms)
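
# A note on the nested-terms branch of `search` above (illustrative only): each inner
# list is one `_search` pass, and `reduce` chains the passes, so
#
#   search(ps, [["covid", "sars-cov-2"], ["ventilator"]])
#
# filters the same way as
#
#   _search(_search(ps, ["covid", "sars-cov-2"]), ["ventilator"])
#
# i.e. OR within an inner list, AND between the inner lists.
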
def download(
    dir: str = ".", match: str = ".tar.gz", regex: bool = False, metadata: bool = True
) -> None:
    """
    download:
        Download CORD-19 dataset from ai2's S3 bucket.
    -----------------------------------------------------
    args:
        dir: Directory to download the data into.
        match: A string dictating which files to download. Defaults to match
        all tar files.
        regex: If regex should be used. Otherwise, a `match in x` is used.
        metadata: Whether the metadata files should be downloaded as well.
        Defaults to True.
    -----------------------------------------------------
    how it works:
        Match all files: `download('data', match='*')`
        Match only JSON files: `download('data', match='.json')`
        Match tar files from April 10: `download('data', match='2020-04-10.*.tar.gz', regex=True)`
    """

    s3bucket_url = "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"
    site = xml.parse(requests.get(s3bucket_url).content)["ListBucketResult"]["Contents"]

    def file_filter(f: str) -> bool:
        if regex:
            return bool(re.search(match, f))
        return not match or (match in f) or (match == "*")

    keys = list(filter(file_filter, (x["Key"] for x in site)))
    if metadata:
        md = list(filter(re.compile("metadata").search, [x["Key"] for x in site]))
        dates = [datetime.strptime(x.split("/")[0], "%Y-%m-%d") for x in md]
        latest_metadata = md[dates.index(max(dates))]
        keys += [latest_metadata]
        mag_md = list(
            filter(
                re.compile("metadata_with_mag_mapping").search, [x["Key"] for x in site]
            )
        )
        dates = [datetime.strptime(x.split("/")[0], "%Y-%m-%d") for x in mag_md]
        latest_metadata_mappings = mag_md[dates.index(max(dates))]
        keys += [latest_metadata_mappings]
    data = dict(((os.path.basename(k), os.path.join(s3bucket_url, k)) for k in keys))
    assert data, "No files matched."

    if not os.path.exists(dir):
        os.mkdir(dir)

    for fp, url in data.items():
        res = requests.get(url, stream=True)
        if res.status_code != 200:
            print(f"Failed to download {url}: Got status {res.status_code}")
            continue

        print(f"Processing {url} ... ", end="")
        if fp.endswith(".tar.gz"):
            shutil.rmtree(
                os.path.join(dir, fp.replace(".tar.gz", "")), ignore_errors=True
            )
            tar = tarfile.open(fileobj=res.raw, mode="r|gz")
            tar.extractall(dir)
        else:
            with open(os.path.join(dir, fp), "wb") as f:
                f.write(res.content)
        print("Done.")


def last_friday() -> str:
    target_dayofweek = 4  # Friday
    current_dayofweek = datetime.now().weekday()  # Today
    if target_dayofweek <= current_dayofweek:
        # target is in the current week
        endDate = datetime.now() - timedelta(current_dayofweek - target_dayofweek)
    else:
        # target is in the previous week
        endDate = (
            datetime.now()
            - timedelta(weeks=1)
            + timedelta(target_dayofweek - current_dayofweek)
        )
    return str(endDate).split()[0]

--------------------------------------------------------------------------------
/cotools/hopkins.py:
--------------------------------------------------------------------------------
import csv
import io
from typing import Any, Dict, Tuple
from urllib.request import urlopen


def _read_data(url: str) -> Dict[Any, Any]:
    response = urlopen(url)
    byts = response.read()
    data = io.StringIO(byts.decode())
    reader = csv.DictReader(data)
    result = {}
    for row in reader:
        for column, value in row.items():
            result.setdefault(column, []).append(value)
    return result


def _convert_data(data: Dict[Any, Any]) -> Dict[Any, Any]:
    out = {}
    for k in data.keys():
        if k in ["Province/State", "Country/Region"]:
            out[k] = data[k]
        elif k in ["Lat", "Long"]:
            out[k] = list(map(float, data[k]))
        else:
            out[k] = list(map(int, data[k]))
    return out


def get_hopkins() -> Tuple[Dict[Any, Any], Dict[Any, Any]]:
    datafiles = [
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
    ]
    print("Warning: Hopkins data is constantly changing!")
    # materialize as a tuple of (confirmed, deaths) dicts to match the annotation
    return tuple(_convert_data(_read_data(dat)) for dat in datafiles)


# url="https://docs.google.com/spreadsheets/d/1ZGol4qZthAc7wiElRYG_36iYT2own_W0QOiD3epGByY/export?gid=0&format=csv"
#
# url = "https://docs.google.com/spreadsheet/ccc?key=1ZGol4qZthAc7wiElRYG_36iYT2own_W0QOiD3epGByY&output=csv"
#
# url="https://drive.google.com/file/d/10Kffl2xAfWxiR_qtkgBSFu1gLuogmBad/export?gid=0&format=csv"
#
# import requests
# io.StringIO(requests.get(url).content)
#
# test = _read_data(url)
#
# test
#
# url2 = "docs.google.com/feeds/download/spreadsheets/Export?key<1ZGol4qZthAc7wiElRYG_36iYT2own_W0QOiD3epGByY>&exportFormat=csv&gid=0"
#
# _read_data(url2)
#
# test
#
# test.keys()

--------------------------------------------------------------------------------
/cotools/text.py:
--------------------------------------------------------------------------------
from typing import List


def _get_text(d: dict) -> str:
    """
    gets the text from a single item
    """
    tdict = d["body_text"]
    return " ".join([x["text"] for x in tdict])


def get_texts(l: list) -> List[str]:
    """
    gets the text for a list of items
    """
    return [_get_text(x) for x in l]


def _get_abstract(d: dict) -> str:
    """
    gets the abstract for a single item
    """
    l = d["abstract"]
    out = "" if len(l) == 0 else l[0]["text"]
    return out


def get_abstracts(l: list) -> List[str]:
    """
    gets the abstract for a list of items
    """
    return [_get_abstract(x) for x in l]

--------------------------------------------------------------------------------
/nix/cotools.nix:
--------------------------------------------------------------------------------
{lib, buildPythonPackage, fetchPypi}:

buildPythonPackage rec {
  pname = "cord-19-tools";
  version = "0.0.7";

  src = fetchPypi {
    inherit pname version;
    sha256 = "aff320bd1e2df2b7a68d5d775a233991bb829ce10035cc085547f43ab3b545d3";
  };
  doCheck = false;
}

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.md

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools


def readme():
    with open("README.md") as readme_file:
        return readme_file.read()


setuptools.setup(
    name="cord-19-tools",
    version="0.3.3",
    description="CORD 19 tools and utilities",
    long_description=readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/josephsdavid/cord-19-tools",
    maintainer="David Josephs",
    maintainer_email="josephsd@smu.edu",
    # $ packages = setuptools.find_packages(exclude = [
"*weights*", "*viz*", "*data*" 20 | # ]), 21 | packages=["cotools"], 22 | license="MIT", 23 | classifiers=[ 24 | "Programming Language :: Python :: 3", 25 | "License :: OSI Approved :: MIT License", 26 | "Operating System :: OS Independent", 27 | ], 28 | install_requires=["xmltodict"], 29 | ) 30 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | import os 3 | 4 | import cotools 5 | 6 | help(cotools.download) 7 | 8 | from datetime import datetime 9 | from datetime import timedelta 10 | 11 | 12 | cotools.download(dir="data") 13 | 14 | import pdb; pdb.set_trace() # XXX BREAKPOINT 15 | 16 | # noncomm = cotools.Paperset("data/noncomm_use_subset") 17 | 18 | data = cotools.Paperset("data/noncomm_use_subset") 19 | 20 | text(data[-1]) 21 | 22 | 23 | # pprint(data[0]) 24 | # print(type(data[0])) 25 | 26 | # get the text for one feature 27 | cotools.text(data[0]) 28 | 29 | cotools.texts(data[:15]) 30 | 31 | 32 | import pdb 33 | 34 | pdb.set_trace() # XXX BREAKPOINT 35 | data.apply(len) 36 | 37 | 38 | # dict 39 | 40 | # pprint(data[:2]) 41 | print(type(data[2:5])) 42 | # list 43 | 44 | print(len(data)) 45 | 46 | # takes about 5gb in memory 47 | # alldata = data[:] 48 | import pdb; pdb.set_trace() # XXX BREAKPOINT 49 | #data[:] 50 | 51 | # len(data) 52 | 53 | # len(alldata) 54 | 55 | txt = [["novel coronavirus"], ["ventilator", "cpap", "bipap"]] 56 | 57 | import pdb 58 | 59 | pdb.set_trace() # XXX BREAKPOINT 60 | x = cotools.search(data, txt) 61 | print(len(x)) 62 | print(len(cotools.search(data, txt[0]))) 63 | print(len(cotools.search(data, txt[-1]))) 64 | --------------------------------------------------------------------------------