├── .gitignore
├── .isort.cfg
├── .pre-commit-config.yaml
├── Cotools_demo.ipynb
├── LICENSE.txt
├── README.md
├── Your To-Do list
├── cotools
│   ├── __init__.py
│   ├── data.py
│   ├── hopkins.py
│   └── text.py
├── nix
│   └── cotools.nix
├── setup.cfg
├── setup.py
└── test.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.h5
*.mp4
*.gif
*.json
#result
#data/


# tensorboard logs
logs/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/*.npy
augmented/*.npy

#pylint
.pylint.d

.vscode-server/
.bash_history

tmux.conf

data/*
!data/*.sh

--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
[settings]
known_third_party = requests,setuptools,xmltodict

--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
# pre-commit run --all-files
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
      - id: check-ast
      - id: check-byte-order-marker
      - id: check-case-conflict
      - id: check-docstring-first
      - id: check-executables-have-shebangs
      - id: check-json
      - id: check-yaml
      - id: debug-statements
      - id: detect-aws-credentials
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: mixed-line-ending

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.730
    hooks:
      - id: mypy
        args: [--ignore-missing-imports]
  - repo: https://github.com/asottile/seed-isort-config
    rev: v1.9.3
    hooks:
      - id: seed-isort-config
  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v4.3.21
    hooks:
      - id: isort
  - repo: https://github.com/psf/black
    rev: 19.3b0
    hooks:
      - id: black
  - repo: https://github.com/asottile/pyupgrade
    rev: v2.1.0
    hooks:
      - id: pyupgrade
  - repo: https://github.com/asottile/blacken-docs
    rev: v1.6.0
    hooks:
      - id: blacken-docs
        additional_dependencies: [black==19.3b0]

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 YOUR NAME

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![PyPI version](https://badge.fury.io/py/cord-19-tools.svg)](https://badge.fury.io/py/cord-19-tools)
# COVID-19 Data Tools

Tools for making COVID-19 data slightly easier to work with for everyone! If you A) think something would be useful in your research or B) have some helpful code to contribute, make an issue or PR ASAP so we can get your code shared!

## Installation

```
pip install cord-19-tools
```

BE SURE TO HAVE THE MOST RECENT VERSION! I will be constantly updating to make sure users are getting the right data! Semantic Scholar updates the dataset every Friday, so on Fridays and Saturdays be sure to re-download the data!

# Demo

[Demonstration Notebook on colab](https://colab.research.google.com/drive/1al-K7vT3m72EOBduMpN2rQF1bLdGikx_)

## Downloading the data

To download and extract the data, use the `download` function:

```python
import cotools
from pprint import pprint

cotools.download()
```

For now this downloads the data from the [CORD-19 dataset](https://pages.semanticscholar.org/coronavirus-research) (the latest metadata files are included by default), extracts all the tarfiles, and places everything in a single directory.
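
If you do not need everything, the `match`, `regex`, and `metadata` arguments let you narrow the download. A quick sketch (the patterns mirror the examples in the `download` docstring; the exact file names in the bucket change from week to week, so treat them as placeholders):

```python
import cotools

# only grab files whose names contain ".json"
cotools.download(dir="data", match=".json")

# use a regex to grab one day's tarballs, and skip the metadata files
cotools.download(
    dir="data", match="2020-04-10.*.tar.gz", regex=True, metadata=False
)
```
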
## The Paperset class

This is a class for lazily loading papers from the [CORD-19 dataset](https://pages.semanticscholar.org/coronavirus-research).


```python
# no `/` at the end please!
data = cotools.Paperset("data/comm_use_subset")

# indexes with ints
pprint(data[0])
# returns a dict

# and slices!
pprint(data[:2])
# returns a list of dicts


print(len(data))

# takes about 5gb in memory
alldata = data[:]
```

Let's talk for a bit about how it works, and why it doesn't take a gigantic amount of memory. The files are not actually loaded into python ***until the data is indexed***. Upon indexing, the files at those indexes are read into python, resulting in a list of dictionaries. This means you can still contribute while working on a low-resource system.
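
Because indexing is what triggers the file reads, you can stream over the whole corpus one paper at a time and never hold more than a single document in memory. A minimal sketch (the phrase being counted is just an example):

```python
n_hits = 0
for i in range(len(data)):
    paper = data[i]  # only this one file is read from disk
    if "incubation period" in cotools.text(paper).lower():
        n_hits += 1
print(n_hits)
```
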
### Getting text and abstracts

For text, there is the `text` function, which returns the text from a single document, the `texts` function, which returns the text from multiple documents, and the `Paperset.texts()` method, which gets the text from every document in a `Paperset`:

```python
print(cotools.text(data[0]))
print(cotools.texts(data[12:18]))

alltext = data.texts()
# alltext = cotools.texts(alldata)
```

For abstracts, we have a similar API:

```python
print(cotools.abstract(data[0]))
print(cotools.abstracts(data[12:18]))

allabs = data.abstracts()
# allabs = cotools.abstracts(alldata)
```

### Manipulating

You can also manipulate the documents with the `Paperset.apply` method:

```python
keys = data.apply(lambda x: list(x.keys()))
# then let's combine them into a set
print(set(sum(keys, [])))
```
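
Since `apply` pushes the work through a `multiprocessing` pool, it is also a handy way to compute per-paper statistics. A small sketch (a plain named function is used because named functions pickle more reliably than lambdas when work is shipped to pool workers, and the "word count" here is just whitespace splitting):

```python
def word_count(paper):
    return len(cotools.text(paper).split())


word_counts = data.apply(word_count)
print(sum(word_counts) / len(word_counts))  # average paper length, in tokens
```
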
### Searching

You can search with a flat list OR a nested list! See the demo notebook for more examples!

```python
txt = [["covid", "novel coronavirus"], ["ventilator", "cpap", "bipap"]]

x = cotools.search(data, txt)
print(len(x))
print(len(cotools.search(data, txt[0])))
print(len(cotools.search(data, txt[-1])))
```
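
The nesting controls the logic: terms inside an inner list are OR'd together, and the inner lists themselves are AND'ed (this mirrors the `search` docstring). So the `txt` query above keeps only papers that mention *both* a COVID term *and* a ventilation term, while a flat list is a plain OR:

```python
# papers mentioning either phrase
either = cotools.search(data, ["covid", "novel coronavirus"])

# papers mentioning (covid OR novel coronavirus) AND (ventilator OR cpap OR bipap)
both = cotools.search(data, [["covid", "novel coronavirus"], ["ventilator", "cpap", "bipap"]])
```

One gotcha, straight from the docstring: a single-element nested list like `[["string1"]]` gives weird results — use `["string1"]` instead.
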
# TODO

- [x] Metadata
- [ ] Other data, for example data from [this aggregate site](https://www.kiragoldner.com/covid19/) and [this google spreadsheet](https://docs.google.com/spreadsheets/u/1/d/e/2PACX-1vRwAqp96T9sYYq2-i7Tj0pvTf6XVHjDSMIKBdZHXiCGGdNC0ypEU9NbngS8mxea55JuCFuua1MUeOj5/pubhtml#)

--------------------------------------------------------------------------------
/Your To-Do list:
--------------------------------------------------------------------------------
Help on function download in module cotools.data:

download(dir: str = '.', match: str = '.tar.gz', regex: bool = False) -> None
    download:
        Download CORD-19 dataset from ai2's S3 bucket.
    -----------------------------------------------------
    args:
        dir: Directory to download the data into.
        match: A string dictating which files to download. Defaults to match
        all tar files.
        regex: If regex should be used. Otherwise, a `match in x` is used.
    -----------------------------------------------------
    how it works:
        Match all files: `download('data', match='*')`
        Match only JSON files: `download('data', match='.json')`
        Match tar files from April 10: `download('data', match='2020-04-10.*.tar.gz', regex=True)`

--------------------------------------------------------------------------------
/cotools/__init__.py:
--------------------------------------------------------------------------------
from .data import Paperset, download, search
from .hopkins import get_hopkins
from .text import _get_abstract as abstract
from .text import _get_text as text
from .text import get_abstracts as abstracts
from .text import get_texts as texts

--------------------------------------------------------------------------------
/cotools/data.py:
--------------------------------------------------------------------------------
import json
import os
from datetime import datetime
from datetime import timedelta
import shutil
import tarfile
from functools import reduce, partial
from typing import Any, Callable, Dict, List, Optional, Sequence, TypeVar, Union, overload
from urllib.request import urlopen
import requests
import multiprocessing
import glob

import xmltodict as xml
import re

from .text import _get_abstract, _get_text

searchtext = Union[str, List[str]]
searchtexts = Union[searchtext, List[searchtext]]
textlist = List[Dict[str, Any]]
nestedlist = List[List[str]]


class Paperset:
    """
    The Paperset class:
        __init__ args:
            directory: a string, the directory where the jsons are stored

        description:
            lazy loader for cord-19 text files. Data is not actually loaded
            until indexing, until then it just indexes files. Can be
            indexed with both ints and slices.
    """

    def __init__(self, directory: str) -> None:
        self.directory = directory
        # get all of the text files recursively
        file_paths = glob.glob(self.directory + "/**/*.json", recursive=True)
        self.dir_dict = {idx: file_path for idx, file_path in enumerate(file_paths)}
        self.keys = list(self.dir_dict.keys())

    def _load_file(self, path: str) -> dict:
        with open(f"{path}") as handle:
            outdict = json.loads(handle.read())
        return outdict

    def __getitem__(self, indices: Union[int, slice]) -> Union[list, dict]:
        slicedkeys = list(self.dir_dict.keys())[indices]
        if not isinstance(slicedkeys, list):
            slicedkeys = [slicedkeys]
        out = [self._load_file(self.dir_dict[key]) for key in slicedkeys]
        if len(out) == 1:
            return out[0]
        else:
            return out

    def _helper(self, k: int, fun: Callable[..., Any]) -> Any:
        return fun(self._load_file(self.dir_dict[k]))

    def apply(self, fn: Callable[..., Any]) -> List[Any]:
        """
        Paperset.apply:
            Iterate a function through a paperset
        ---------------------------------------------
        args:
            fn: any function! Should work on the structure of paperset[0]

        runs in parallel!
        """
        helper = partial(self._helper, fun=fn)
        with multiprocessing.Pool(None) as p:
            res = p.map(helper, self.dir_dict.keys())
        return list(res)
        # return [fn(self._load_file(self.dir_dict[k])) for k in self.dir_dict.keys()]

    def texts(self) -> List[str]:
        """
        Paperset.texts:
            get all the text of all the papers, in list form!
        """
        return self.apply(_get_text)

    def abstracts(self) -> List[str]:
        """
        Paperset.abstracts:
            get all the abstracts of all the papers, in list form!
        """
        return self.apply(_get_abstract)

    def __len__(self) -> int:
        return len(self.dir_dict.keys())


def _search_helper(x: dict, txt: List[str]) -> Optional[dict]:
    if any(c in _get_text(x).lower() for c in txt) or any(
        c in _get_abstract(x).lower() for c in txt
    ):
        return x
    else:
        return None


def _search(ps: Union[Paperset, textlist], txt: Any) -> textlist:
    # some checkers on txt, to prevent weirdness
    if type(txt[0]) is list:
        raise ValueError("Items of the search cannot be nested lists!")
    if type(txt) is str:
        txt = [txt]
    # the documents are lowercased before comparison, so lowercase the terms too
    txt = [c.lower() for c in txt]

    # If we accept a paperset, which has thousands of papers, we want to operate
    # in parallel
    if type(ps) is Paperset:
        # load the text into the helper function
        helper = partial(_search_helper, txt=txt)
        # apply in parallel!
        out = ps.apply(helper)
        return list(filter(lambda x: x is not None, out))
    else:
        return [
            x
            for x in ps
            if any(c in _get_text(x).lower() for c in txt)
            or any(c in _get_abstract(x).lower() for c in txt)
        ]


def search(
    ps: Union[Paperset, textlist], terms: Union[searchtexts, searchtext, nestedlist],
) -> textlist:
    """
    search:
        search through a paperset or list of paper dicts
    -----------------------------------------------------
    args:
        ps: a paperset or list of paper dicts
        terms: search terms, a list or nested (one layer of nesting only) list
    -----------------------------------------------------
    how it works:
        search(ps, ['string']) will search for all papers containing the phrase
        'string'
        search(ps, ['string1', 'string2']) will search for all papers containing
        either phrase
        search(ps, [['string1'], ['string2']]) will search for all papers
        containing both phrases
        search(ps, [['string1', 'string2'], ['string3', 'string4']]) will
        search for all papers containing both (either string1 or string 2) and
        (either string3 or string4)
    ----------------------------------------------------
    notes:
        search(ps, [['string1']]) will return weird results! Do not do!
        you do not have to worry about case! That is taken care of!

    """
    if type(terms) is not list:
        raise ValueError("search terms must be a list!!")
    types = [type(x) for x in terms]
    nests = len(list(filter(lambda x: x is list, types)))
    if nests != 0:
        return reduce(lambda x, y: _search(x, y), terms, ps)
    else:
        return _search(ps, terms)
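
# A note on the nested-terms branch of `search` above (illustrative only): each inner
# list is one `_search` pass, and `reduce` chains the passes, so
#
#   search(ps, [["covid", "sars-cov-2"], ["ventilator"]])
#
# filters the same way as
#
#   _search(_search(ps, ["covid", "sars-cov-2"]), ["ventilator"])
#
# i.e. OR within an inner list, AND between the inner lists.
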
def download(
    dir: str = ".", match: str = ".tar.gz", regex: bool = False, metadata: bool = True
) -> None:
    """
    download:
        Download CORD-19 dataset from ai2's S3 bucket.
    -----------------------------------------------------
    args:
        dir: Directory to download the data into.
        match: A string dictating which files to download. Defaults to match
        all tar files.
        regex: If regex should be used. Otherwise, a `match in x` is used.
        metadata: Whether the metadata files should be downloaded as well.
        Defaults to True.
    -----------------------------------------------------
    how it works:
        Match all files: `download('data', match='*')`
        Match only JSON files: `download('data', match='.json')`
        Match tar files from April 10: `download('data', match='2020-04-10.*.tar.gz', regex=True)`
    """

    s3bucket_url = "https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"
    site = xml.parse(requests.get(s3bucket_url).content)["ListBucketResult"]["Contents"]

    def file_filter(f: str) -> bool:
        if regex:
            return bool(re.search(match, f))
        return not match or (match in f) or (match == "*")

    keys = list(filter(file_filter, (x["Key"] for x in site)))
    if metadata:
        md = list(filter(re.compile("metadata").search, [x["Key"] for x in site]))
        dates = [datetime.strptime(x.split("/")[0], "%Y-%m-%d") for x in md]
        latest_metadata = md[dates.index(max(dates))]
        keys += [latest_metadata]
        mag_md = list(
            filter(
                re.compile("metadata_with_mag_mapping").search, [x["Key"] for x in site]
            )
        )
        dates = [datetime.strptime(x.split("/")[0], "%Y-%m-%d") for x in mag_md]
        latest_metadata_mappings = mag_md[dates.index(max(dates))]
        keys += [latest_metadata_mappings]
    data = dict(((os.path.basename(k), os.path.join(s3bucket_url, k)) for k in keys))
    assert data, "No files matched."

    if not os.path.exists(dir):
        os.mkdir(dir)

    for fp, url in data.items():
        res = requests.get(url, stream=True)
        if res.status_code != 200:
            print(f"Failed to download {url}: Got status {res.status_code}")
            continue

        print(f"Processing {url} ... ", end="")
        if fp.endswith(".tar.gz"):
            shutil.rmtree(
                os.path.join(dir, fp.replace(".tar.gz", "")), ignore_errors=True
            )
            tar = tarfile.open(fileobj=res.raw, mode="r|gz")
            tar.extractall(dir)
        else:
            with open(os.path.join(dir, fp), "wb") as f:
                f.write(res.content)
        print("Done.")


def last_friday() -> str:
    target_dayofweek = 4  # Friday
    current_dayofweek = datetime.now().weekday()  # Today
    if target_dayofweek <= current_dayofweek:
        # target is in the current week
        endDate = datetime.now() - timedelta(current_dayofweek - target_dayofweek)
    else:
        # target is in the previous week
        endDate = (
            datetime.now()
            - timedelta(weeks=1)
            + timedelta(target_dayofweek - current_dayofweek)
        )
    return str(endDate).split()[0]

--------------------------------------------------------------------------------
/cotools/hopkins.py:
--------------------------------------------------------------------------------
import csv
import io
from typing import Any, Dict, Tuple
from urllib.request import urlopen


def _read_data(url: str) -> Dict[Any, Any]:
    response = urlopen(url)
    byts = response.read()
    data = io.StringIO(byts.decode())
    reader = csv.DictReader(data)
    result = {}
    for row in reader:
        for column, value in row.items():
            result.setdefault(column, []).append(value)
    return result


def _convert_data(data: Dict[Any, Any]) -> Dict[Any, Any]:
    out = {}
    for k in data.keys():
        if k in ["Province/State", "Country/Region"]:
            out[k] = data[k]
        elif k in ["Lat", "Long"]:
            out[k] = list(map(float, data[k]))
        else:
            out[k] = list(map(int, data[k]))
    return out


def get_hopkins() -> Tuple[Dict[Any, Any], Dict[Any, Any]]:
    datafiles = [
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
    ]
    print("Warning: Hopkins data is constantly changing!")
    # materialize as a tuple of (confirmed, deaths) dicts to match the annotation
    return tuple(_convert_data(_read_data(dat)) for dat in datafiles)


# url="https://docs.google.com/spreadsheets/d/1ZGol4qZthAc7wiElRYG_36iYT2own_W0QOiD3epGByY/export?gid=0&format=csv"
#
# url = "https://docs.google.com/spreadsheet/ccc?key=1ZGol4qZthAc7wiElRYG_36iYT2own_W0QOiD3epGByY&output=csv"
#
# url="https://drive.google.com/file/d/10Kffl2xAfWxiR_qtkgBSFu1gLuogmBad/export?gid=0&format=csv"
#
# import requests
# io.StringIO(requests.get(url).content)
#
# test = _read_data(url)
#
# test
#
# url2 = "docs.google.com/feeds/download/spreadsheets/Export?key<1ZGol4qZthAc7wiElRYG_36iYT2own_W0QOiD3epGByY>&exportFormat=csv&gid=0"
#
# _read_data(url2)
#
# test
#
# test.keys()

--------------------------------------------------------------------------------
/cotools/text.py:
--------------------------------------------------------------------------------
from typing import List


def _get_text(d: dict) -> str:
    """
    gets the text from a single item
    """
    tdict = d["body_text"]
    return " ".join([x["text"] for x in tdict])


def get_texts(l: list) -> List[str]:
    """
    gets the text for a list of items
    """
    return [_get_text(x) for x in l]


def _get_abstract(d: dict) -> str:
    """
    gets the abstract for a single item
    """
    l = d["abstract"]
    out = "" if len(l) == 0 else l[0]["text"]
    return out


def get_abstracts(l: list) -> List[str]:
    """
    gets the abstract for a list of items
    """
    return [_get_abstract(x) for x in l]

--------------------------------------------------------------------------------
/nix/cotools.nix:
--------------------------------------------------------------------------------
{lib, buildPythonPackage, fetchPypi}:

buildPythonPackage rec {
  pname = "cord-19-tools";
  version = "0.0.7";

  src = fetchPypi {
    inherit pname version;
    sha256 = "aff320bd1e2df2b7a68d5d775a233991bb829ce10035cc085547f43ab3b545d3";
  };
  doCheck = false;
}

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.md

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools


def readme():
    with open("README.md") as readme_file:
        return readme_file.read()


setuptools.setup(
    name="cord-19-tools",
    version="0.3.3",
    description="CORD 19 tools and utilities",
    long_description=readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/josephsdavid/cord-19-tools",
    maintainer="David Josephs",
    maintainer_email="josephsd@smu.edu",
    # $ packages = setuptools.find_packages(exclude = [
"*weights*", "*viz*", "*data*" 20 | # ]), 21 | packages=["cotools"], 22 | license="MIT", 23 | classifiers=[ 24 | "Programming Language :: Python :: 3", 25 | "License :: OSI Approved :: MIT License", 26 | "Operating System :: OS Independent", 27 | ], 28 | install_requires=["xmltodict"], 29 | ) 30 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | import os 3 | 4 | import cotools 5 | 6 | help(cotools.download) 7 | 8 | from datetime import datetime 9 | from datetime import timedelta 10 | 11 | 12 | cotools.download(dir="data") 13 | 14 | import pdb; pdb.set_trace() # XXX BREAKPOINT 15 | 16 | # noncomm = cotools.Paperset("data/noncomm_use_subset") 17 | 18 | data = cotools.Paperset("data/noncomm_use_subset") 19 | 20 | text(data[-1]) 21 | 22 | 23 | # pprint(data[0]) 24 | # print(type(data[0])) 25 | 26 | # get the text for one feature 27 | cotools.text(data[0]) 28 | 29 | cotools.texts(data[:15]) 30 | 31 | 32 | import pdb 33 | 34 | pdb.set_trace() # XXX BREAKPOINT 35 | data.apply(len) 36 | 37 | 38 | # dict 39 | 40 | # pprint(data[:2]) 41 | print(type(data[2:5])) 42 | # list 43 | 44 | print(len(data)) 45 | 46 | # takes about 5gb in memory 47 | # alldata = data[:] 48 | import pdb; pdb.set_trace() # XXX BREAKPOINT 49 | #data[:] 50 | 51 | # len(data) 52 | 53 | # len(alldata) 54 | 55 | txt = [["novel coronavirus"], ["ventilator", "cpap", "bipap"]] 56 | 57 | import pdb 58 | 59 | pdb.set_trace() # XXX BREAKPOINT 60 | x = cotools.search(data, txt) 61 | print(len(x)) 62 | print(len(cotools.search(data, txt[0]))) 63 | print(len(cotools.search(data, txt[-1]))) 64 | --------------------------------------------------------------------------------