├── .github └── workflows │ ├── utests_on_commit.yml │ └── utests_on_pullreq.yml ├── .gitignore ├── LICENSE ├── README.md ├── covid19dh ├── __init__.py ├── _cache.py ├── _cite.py └── main.py ├── doc └── presentation.ipynb ├── publish.sh ├── requirements.txt ├── setup.py └── tests ├── test.py ├── test_cite.py ├── test_covid19_latest.py └── test_covid19_vintage.py /.github/workflows/utests_on_commit.yml: -------------------------------------------------------------------------------- 1 | 2 | name: utests_on_commit # workflow name 3 | on: # trigger 4 | push: 5 | branches: [ master ] 6 | 7 | # parallel jobs to perform 8 | jobs: 9 | # === unittest execution === 10 | unittest: 11 | runs-on: ubuntu-latest # runner 12 | 13 | steps: # tasks 14 | - name: Set Github Workspace # access Github Workspace 15 | uses: actions/checkout@v2 16 | - name: Set up Python 3.x # set architecture and Python3 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: '3.8' 20 | architecture: 'x64' # architecture 21 | 22 | - name: Install dependencies # dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 26 | 27 | - name: Run unittests # run unittests 28 | run: 29 | python tests/test.py -------------------------------------------------------------------------------- /.github/workflows/utests_on_pullreq.yml: -------------------------------------------------------------------------------- 1 | 2 | name: utests_on_pullreq # workflow name 3 | on: # trigger 4 | pull_request: 5 | branches: [ master ] 6 | 7 | # parallel jobs to perform 8 | jobs: 9 | # === unittest execution === 10 | unittest: 11 | runs-on: ubuntu-latest # runner 12 | 13 | steps: # tasks 14 | - name: Set Github Workspace # access Github Workspace 15 | uses: actions/checkout@v2 16 | - name: Set up Python 3.x # set architecture and Python3 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: '3.8' 20 | architecture: 'x64' # 
architecture 21 | 22 | - name: Install dependencies # dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 26 | 27 | - name: Run unittests # run unittests 28 | run: 29 | python tests/test.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # my added py scripts in main directory 132 | /*.py 133 | !setup.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Emanuele Guidotti and David Ardia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Python Interface to COVID-19 Data Hub 4 | 5 | [![](https://img.shields.io/pypi/v/covid19dh.svg?color=brightgreen)](https://pypi.org/pypi/covid19dh/) [![](https://img.shields.io/pypi/dm/covid19dh.svg?color=blue)](https://pypi.org/pypi/covid19dh/) [![DOI](https://joss.theoj.org/papers/10.21105/joss.02376/status.svg)](https://doi.org/10.21105/joss.02376) [![](https://github.com/covid19datahub/Python/workflows/utests_on_commit/badge.svg)](https://github.com/covid19datahub/Python) 6 | 7 | Download COVID-19 data across governmental sources at national, regional, and city level, as described in [Guidotti and Ardia (2020)](https://www.doi.org/10.21105/joss.02376). Includes the time series of vaccines, tests, cases, deaths, recovered, hospitalizations, intensive therapy, and policy measures by [Oxford COVID-19 Government Response Tracker](https://www.bsg.ox.ac.uk/research/research-projects/coronavirus-government-response-tracker). Please agree to the [Terms of Use](https://covid19datahub.io/LICENSE.html) and cite the following reference when using it: 8 | 9 | **Reference** 10 | 11 | Guidotti, E., Ardia, D., (2020). 
COVID-19 Data Hub
_Journal of Open Source Software_, **5**(51):2376
[https://doi.org/10.21105/joss.02376](https://doi.org/10.21105/joss.02376)

## Setup and usage

Install from [pip](https://pypi.org/project/covid19dh/) with

```bash
pip install covid19dh
```

Import the main function `covid19()`:

```python
from covid19dh import covid19
x, src = covid19()
```

The package is regularly updated. Upgrade with

```bash
pip install --upgrade covid19dh
```

## Return values

The function `covid19()` returns 2 pandas dataframes:
* the data and
* references to the data sources.

## Parametrization

### Country

A list of country names (case-insensitive) or ISO codes (alpha-2, alpha-3 or numeric). The list of ISO codes can be found [here](https://github.com/covid19datahub/COVID19/blob/master/inst/extdata/db/ISO.csv).

Fetching data from a particular country:

```python
x, src = covid19("USA") # United States
```

Specify multiple countries at the same time:

```python
x, src = covid19(["ESP","PT","andorra",250])
```

If `country` is omitted, the whole dataset is returned:

```python
x, src = covid19()
```

### Raw data

Logical. Skip data cleaning? Default `True`. If `raw=False`, the raw data are cleaned by filling missing dates with `NaN` values. This ensures that all locations share the same grid of dates and that no single day is skipped. Then, `NaN` values are replaced with the previous non-`NaN` value or `0`.

```python
x, src = covid19(raw = False)
```

### Date filter

Dates can be specified with `datetime.datetime`, `datetime.date` or as a `str` in the format `YYYY-mm-dd`.

```python
from datetime import datetime
x, src = covid19("SWE", start = datetime(2020,4,1), end = "2020-05-01")
```

### Level

Integer. Granularity level of the data:

1. Country level
2. State, region or canton level
3. City or municipality level

```python
from datetime import date
x, src = covid19("USA", level = 2, start = date(2020,5,1))
```

### Cache

Logical. Memory caching? Significantly improves performance on successive calls. By default, using the cached data is enabled.

Caching can be disabled (e.g. for long-running programs) with:

```python
x, src = covid19("FRA", cache = False)
```

### Vintage

Logical. Retrieve the snapshot of the dataset that was generated at the `end` date instead of using the latest version. Default `False`.

To fetch e.g. US data that were accessible on *22nd April 2020*, type

```python
x, src = covid19("USA", end = "2020-04-22", vintage = True)
```

The vintage data are collected at the end of the day, but published with an approximately 48-hour delay,
once the day is completed in all timezones.

Hence if `vintage = True` but `end` is not set, a warning is raised and `None` is returned.

```python
x, src = covid19("USA", vintage = True) # too early to get today's vintage
```

```
UserWarning: vintage data not available yet
```

### Data Sources

The data sources are returned as the second value.

```python
from covid19dh import covid19
x, src = covid19("USA")
print(src)
```

### Additional information

Find out more at https://covid19datahub.io

## Acknowledgements

Developed and maintained by [Martin Benes](https://pypi.org/user/martinbenes1996/).
147 | 148 | ## Cite as 149 | 150 | *Guidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open Source Software 5(51):2376, doi: 10.21105/joss.02376.* 151 | 152 | A BibTeX entry for LaTeX users is 153 | 154 | ```latex 155 | @Article{, 156 | title = {COVID-19 Data Hub}, 157 | year = {2020}, 158 | doi = {10.21105/joss.02376}, 159 | author = {Emanuele Guidotti and David Ardia}, 160 | journal = {Journal of Open Source Software}, 161 | volume = {5}, 162 | number = {51}, 163 | pages = {2376} 164 | } 165 | ``` -------------------------------------------------------------------------------- /covid19dh/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Unified data hub for a better understanding of COVID-19. 3 | 4 | For more information check README.md. 5 | 6 | Reference: https://covid19datahub.io/ 7 | Todo: 8 | * caching 9 | """ 10 | 11 | import pkg_resources 12 | from .main import * 13 | 14 | try: 15 | __version__ = pkg_resources.get_distribution("covid19dh").version 16 | except Exception: 17 | __version__ = None 18 | -------------------------------------------------------------------------------- /covid19dh/_cache.py: -------------------------------------------------------------------------------- 1 | 2 | # ======== data cache ========= 3 | _cache = {} # data 4 | 5 | 6 | def _construct_cache_id(level, dt, raw, vintage): 7 | cache_id = f"{level}" 8 | if raw: 9 | cache_id += "_raw" 10 | if vintage: 11 | cache_id += dt.strftime("%Y-%m-%d") 12 | return cache_id 13 | 14 | 15 | def read_cache(level, dt, raw, vintage): 16 | cache_id = _construct_cache_id(level=level, dt=dt, raw=raw, vintage=vintage) 17 | try: 18 | return _cache[cache_id] 19 | except Exception: 20 | return None 21 | 22 | def write_cache(x, level, dt, raw, vintage): 23 | cache_id = _construct_cache_id(level=level, dt=dt, raw=raw, vintage=vintage) 24 | _cache[cache_id] = x 25 | 26 | 27 | # ========= src cache ========== 28 | 
_cache_src = {} # src 29 | 30 | 31 | def _construct_src_cache_id(dt, vintage): 32 | cache_id = "src" 33 | if vintage: 34 | cache_id += dt.strftime("%Y-%m-%d") 35 | return cache_id 36 | 37 | 38 | def read_src_cache(dt, vintage): 39 | cache_id = _construct_src_cache_id(dt=dt, vintage=vintage) 40 | try: 41 | return _cache_src[cache_id] 42 | except Exception: 43 | return None 44 | 45 | 46 | def write_src_cache(src, dt, vintage): 47 | cache_id = _construct_src_cache_id(dt=dt, vintage=vintage) 48 | _cache_src[cache_id] = src 49 | 50 | 51 | __all__ = ["read_cache", "write_cache", "read_src_cache", "write_src_cache"] 52 | -------------------------------------------------------------------------------- /covid19dh/_cite.py: -------------------------------------------------------------------------------- 1 | 2 | from io import StringIO 3 | import math 4 | import re 5 | import warnings 6 | 7 | import pandas as pd 8 | import requests 9 | 10 | 11 | def get_sources(): 12 | url = 'https://storage.covid19datahub.io/src.csv' 13 | response = requests.get(url) # headers={'User-Agent': 'Mozilla/5.0'} 14 | return pd.read_csv( StringIO(response.text)) 15 | 16 | 17 | def sources_to_citations(sources): 18 | # shorten URL 19 | sources.url = sources.url.apply( 20 | lambda u: re.sub( 21 | r"(http://|https://|www\\.)([^/]+)(.*)", 22 | r"\1\2/", 23 | u) 24 | ) 25 | # remove duplicit 26 | unique_references = sources.groupby(["title","author","institution","url","textVersion","bibtype"]) 27 | 28 | # format 29 | citations = [] 30 | for n, g in unique_references: 31 | for i in range(1): 32 | (title, author, institution, url, textVersion, bibtype) = n 33 | year = g.year.max() 34 | 35 | if textVersion: 36 | citation = textVersion 37 | else: 38 | # pre,post 39 | if author: 40 | pre = author 41 | if title: 42 | post = f"{title}" 43 | elif title: 44 | pre = title 45 | post = "" 46 | # post 47 | if institution: 48 | if post: 49 | post += ", " 50 | post += f"{institution}" 51 | if url: 52 | if post: 53 | 
post += ", " 54 | url = re.sub(r"(http://|https://|www\\.)([^/]+)(.*)", 55 | r"\1\2/", url) 56 | post += f"{url}" 57 | else: 58 | post += "." 59 | citation = f"{pre} ({year}), {post}" 60 | 61 | citations.append(citation) 62 | return citations 63 | 64 | 65 | def cite(x: pd.DataFrame, verbose: bool = True, sources: bool = None): 66 | # all sources if missing 67 | if sources is None: 68 | sources = get_sources() 69 | 70 | # per iso 71 | references = pd.DataFrame(data=None, columns=sources.columns) 72 | for (iso,), country in x.groupby(["iso_alpha_3"]): 73 | # levels 74 | level = country.administrative_area_level.unique()[0] 75 | # empty attributes 76 | empty_params = country.apply(lambda c: c.isnull().all() | (c == 0).all()) 77 | params = x.columns[~empty_params] 78 | 79 | # filter 80 | src = sources[ 81 | (sources.administrative_area_level == level) & # level 82 | (sources.iso_alpha_3 == iso) & # iso 83 | sources.data_type.isin(params) # data type 84 | ] 85 | # fallback for missing 86 | missing = set(params) - set(src.data_type.unique()) 87 | if missing: 88 | src = pd.concat([ 89 | src, 90 | sources[ 91 | sources.data_type.isin(missing) & # data type 92 | sources.iso_alpha_3.isnull() & # empty ISO 93 | sources.administrative_area_level.isnull() # empty level 94 | ] 95 | ]) 96 | 97 | # set iso,level 98 | src.iso_alpha_3 = iso 99 | src.administrative_area_level = level 100 | 101 | # join 102 | references = pd.concat([references, src]) 103 | 104 | references.drop_duplicates(inplace=True) 105 | 106 | return references 107 | 108 | 109 | 110 | 111 | # === 112 | # hash data stats 113 | params = set(x.columns) 114 | isos = set(x["iso_alpha_3"].unique()) 115 | isos.add(math.nan) 116 | # prefilter 117 | sources = sources[ 118 | sources["iso_alpha_3"].isin(isos) & 119 | sources["data_type"].isin(params) ] 120 | sources = sources.fillna("") 121 | 122 | # filter 123 | def is_source_used(ref): 124 | # data type not present 125 | if not ref['data_type'] in params: return False 126 
| # fallbacks 127 | if not ref['iso_alpha_3'] or not ref['administrative_area_level']: return True 128 | 129 | # check both equal 130 | return ((x.iso_alpha_3 == ref.iso_alpha_3) & (x.administrative_area_level == ref.administrative_area_level)).any() 131 | 132 | sources = sources[sources.apply(is_source_used, axis=1)] 133 | 134 | # drop fallback 135 | for p in params: 136 | non_fallback = (sources.data_type == p) & (sources.iso_alpha_3 != '') 137 | no_data = (x[p].isnull() | (x[p] == 0)) 138 | fallback = (sources.data_type == p) & (sources.iso_alpha_3 == '') 139 | if non_fallback.any() or no_data.all(): 140 | sources.drop(fallback.index[fallback].tolist(), inplace=True) 141 | 142 | #citations = sources_to_citations(sources) 143 | 144 | #if verbose: 145 | # print("\033[1mData References:\033[0m\n", end="") 146 | # for ref in citations: 147 | # print("\t" + ref, end="\n\n") 148 | # print("\033[33mTo hide the data sources use 'verbose = False'.\033[0m") 149 | 150 | sources.replace(r'^\s*$', math.nan, regex=True, inplace=True) 151 | return sources 152 | 153 | 154 | __all__ = ["cite", "get_sources"] 155 | -------------------------------------------------------------------------------- /covid19dh/main.py: -------------------------------------------------------------------------------- 1 | 2 | import datetime 3 | from io import StringIO, BytesIO 4 | import math 5 | import sys 6 | import warnings 7 | import zipfile 8 | 9 | import pandas as pd 10 | import requests 11 | 12 | from ._cite import get_sources, cite 13 | from ._cache import * 14 | 15 | 16 | def get_url(level, dt, raw, vintage): 17 | # dataname 18 | rawprefix = "raw" if raw else "" 19 | dataname = f"{rawprefix}data-{level}" 20 | # vintage 21 | if vintage: 22 | # too new 23 | if dt >= datetime.datetime.now() - datetime.timedelta(days=2): 24 | warnings.warn("vintage data not available yet", category=ResourceWarning) 25 | return None, None 26 | dt_str = dt.strftime("%Y-%m-%d") 27 | filename = f"{dt_str}.zip" 28 | # 
def get_url(level, dt, raw, vintage):
    """Compose the download URL and the name of the CSV inside the zip.

    Args:
        level: granularity level (1-3), used in the data file name.
        dt: end date; selects the snapshot when vintage is True.
        raw: request the uncleaned data file.
        vintage: request the archived snapshot for `dt`.

    Returns:
        (url, csv_name), or (None, None) when the vintage snapshot for
        `dt` cannot exist yet (snapshots are published with ~48h delay).
    """
    # dataname
    rawprefix = "raw" if raw else ""
    dataname = f"{rawprefix}data-{level}"
    # vintage
    if vintage:
        # too new
        if dt >= datetime.datetime.now() - datetime.timedelta(days=2):
            warnings.warn("vintage data not available yet", category=ResourceWarning)
            return None, None
        dt_str = dt.strftime("%Y-%m-%d")
        filename = f"{dt_str}.zip"
    # current data
    else:
        filename = f"{dataname}.zip"
    # FIX: interpolate the chosen zip name into the URL — `filename` was
    # previously computed but never used and the URL path was malformed
    return f"https://storage.covid19datahub.io/{filename}", f"{dataname}.csv"


def parseDate(dt):
    """Normalize str/date/datetime input to a datetime.

    datetime inputs are truncated to midnight (datetime.datetime is a
    subclass of datetime.date, so the first branch catches both).

    Raises:
        ValueError: when a string is not in the YYYY-mm-dd format.
    """
    if isinstance(dt, datetime.date):
        return datetime.datetime(dt.year, dt.month, dt.day)
    if isinstance(dt, str):
        try:
            return datetime.datetime.strptime(dt, "%Y-%m-%d")
        except Exception:
            print("Invalid time format.", file=sys.stderr)
            raise
    return dt


def covid19(country=None,
            level=1,
            start=datetime.date(2019, 1, 1),
            end=None,  # defaultly today
            cache=True,
            verbose=True,
            raw=True,
            vintage=False):
    """Main function for module. Fetches data from hub.

    Args:
        country (str, optional): ISO country code, defaultly all countries
        level (int, optional): level of data, default 1
            * country-level (1)
            * state-level (2)
            * city-level (3)
        start (datetime | date | str, optional): start date of data (as str in format [%Y-%m-%d]),
            default 2019-01-01
        end (datetime | date | str, optional): end date of data (as str in format [%Y-%m-%d]),
            default today (sysdate)
        cache (bool, optional): use cached data if available, default yes
        verbose (bool, optional): prints citation message, default true
        raw (bool, optional): download not cleansed data, defaultly using cleansed
        vintage (bool, optional): use hub data (True) or original source, not available in Python covid19dh (only hub)

    Returns:
        (data, sources) pandas data frames, or (None, None) on invalid
        arguments or when no data matches the settings.
    """
    # parse arguments
    if country is not None:
        country = [country] if isinstance(country, str) else country
        country = [c.upper() if isinstance(c, str) else c for c in country]
    end = datetime.datetime.now() if end is None else end
    try:
        end = parseDate(end)
        start = parseDate(start)
    except Exception:
        return None, None
    if level not in {1, 2, 3}:
        warnings.warn("valid options for 'level' are:\n\t1: country-level data\n\t2: state-level data\n\t3: city-level data")
        return None, None
    if start > end:
        warnings.warn("start is later than end")
        return None, None

    # cache lookup (ignored below when cache=False)
    df = read_cache(level, end, raw, vintage)
    src = None

    if cache is False or df is None:
        # get url from level
        try:
            url, filename = get_url(level=level, dt=end, raw=raw, vintage=vintage)
            if url is None:
                return None, None
        except KeyError:
            warnings.warn("invalid level")
            return None, None
        # download
        try:
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        except Exception:
            if vintage:
                warnings.warn("vintage data not available yet")
                return None, None
            else:
                warnings.warn("error to fetch data")
                return None, None
        # parse zip in memory
        with zipfile.ZipFile(BytesIO(response.content)) as zz:
            with zz.open(filename) as fd:
                df = pd.read_csv(fd, low_memory=False)
            # vintage archives ship their own src.csv snapshot
            if vintage:
                with zz.open("src.csv") as fd:
                    src = pd.read_csv(fd, low_memory=False)
                write_src_cache(src, end, vintage)
        # cast columns
        df['date'] = df['date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
        try:
            df['iso_numeric'] = df['iso_numeric'].apply(lambda x: float(x))
        except Exception:
            pass  # column may be absent on sub-national levels

        write_cache(df, level, end, raw, vintage)

    # sources table: vintage snapshot, cache, or fresh download
    if src is None:
        src = read_src_cache(end, vintage)
        if src is None:
            src = get_sources()
            write_src_cache(src, end, vintage)

    # filter by country (name, ISO alpha-2/alpha-3/numeric)
    if country is not None:
        # elementwise comparison works, but throws a FutureWarning;
        # mute it locally
        with warnings.catch_warnings():
            warnings.simplefilter(action='ignore', category=FutureWarning)

            country_filter = df['administrative_area_level_1'].map(lambda s: s.upper()).isin(country)
            for feature in ["iso_alpha_2", "iso_alpha_3", "iso_numeric"]:
                try:
                    country_filter = country_filter | df[feature].isin(country)
                except KeyError:
                    pass
            df = df[country_filter]
    # filter by date range
    if start is not None:
        df = df[df['date'] >= start]
    if end is not None:
        df = df[df['date'] <= end]

    # detect empty
    if df.empty:
        warnings.warn("no data for given settings", category=ResourceWarning)
        return None, None
    # sort
    df = df.sort_values(by=["id", "date"])

    # restrict sources to what the data actually contains
    src = cite(x=df, sources=src, verbose=False)

    if verbose:
        # construct citation message
        message = "We have invested a lot of time and effort in creating COVID-19 Data Hub, please cite the following when using it:\n\n"
        message += "\t\033[1mGuidotti, E., Ardia, D., (2020), \"COVID-19 Data Hub\", Journal of Open Source Software 5(51):2376, doi: 10.21105/joss.02376.\033[0m\n\n"
        message += "A BibTeX entry for LaTeX users is\n\n"
        message += "\t@Article{,\n"
        message += "\t\ttitle = {COVID-19 Data Hub},\n"
        message += "\t\tyear = {2020},\n"
        message += "\t\tdoi = {10.21105/joss.02376},\n"
        message += "\t\tauthor = {Emanuele Guidotti and David Ardia},\n"
        message += "\t\tjournal = {Journal of Open Source Software},\n"
        message += "\t\tvolume = {5},\n"
        message += "\t\tnumber = {51},\n"
        message += "\t\tpages = {2376},\n"
        message += "\t}\n\n"
        message += "\033[33mTo hide this message use 'verbose = False'.\033[0m"
        # print
        print(message)

    return df, src


__all__ = ["covid19"]
-------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # remove previous releases 4 | rm -rf build/ dist/ covid19dh.egg-info/ __pycache__/ 5 | # compile 6 | python setup.py sdist bdist_wheel 7 | # publish 8 | python -m twine upload dist/* 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | requests -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # requirements 4 | try: 5 | with open('requirements.txt') as f: 6 | reqs = f.read().splitlines() 7 | except Exception: 8 | reqs = [] 9 | 10 | import setuptools 11 | with open("README.md", "r", encoding="UTF-8") as fh: 12 | long_description = fh.read() 13 | 14 | setuptools.setup( 15 | name='covid19dh', 16 | version='2.3.1', 17 | author='Martin Beneš', 18 | author_email='martinbenes1996@gmail.com', 19 | description='Unified data hub for a better understanding of COVID-19 https://covid19datahub.io', 20 | long_description=long_description, 21 | long_description_content_type="text/markdown", 22 | packages=setuptools.find_packages(), 23 | url='https://www.covid19datahub.io', 24 | download_url='https://github.com/covid19datahub/Python/archive/2.3.0.tar.gz', 25 | keywords=['2019-nCov', 'coronavirus', 'covid-19', 'covid-data', 'covid19-data'], 26 | install_requires=reqs, 27 | package_dir={'': '.'}, 28 | classifiers=[ 29 | 'Development Status :: 5 - Production/Stable', 30 | 'Intended Audience :: Science/Research', 31 | 'Intended Audience :: Developers', 32 | 'Intended Audience :: Other Audience', 33 | 'Topic :: Database', 34 | 'Topic :: Scientific/Engineering', 35 | 'Topic :: 
Scientific/Engineering :: Information Analysis', 36 | 'Topic :: Software Development :: Libraries', 37 | 'Topic :: Utilities', 38 | 'License :: OSI Approved :: MIT License', 39 | 'Programming Language :: Python :: 3', 40 | 'Programming Language :: Python :: 3.8', 41 | 'Programming Language :: Python :: 3.9', 42 | 'Programming Language :: Python :: 3.10', 43 | ], 44 | ) 45 | -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | 4 | sys.path.append(".") 5 | sys.path.append("tests") 6 | 7 | # === unit tests === 8 | from test_covid19_latest import * 9 | from test_covid19_vintage import * 10 | from test_cite import * 11 | # ================== 12 | 13 | 14 | # logging 15 | if __name__ == "__main__": 16 | import logging 17 | logging.basicConfig(level=logging.WARNING) 18 | 19 | # run unittests 20 | if __name__ == "__main__": 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /tests/test_cite.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime,timedelta 3 | import unittest 4 | 5 | import pandas as pd 6 | 7 | import covid19dh 8 | 9 | class TestCite(unittest.TestCase): 10 | def test_cite_verbose(self): 11 | x,src = covid19dh.covid19("CZE", verbose = False) 12 | # cite 13 | src2 = covid19dh._cite.cite(x, verbose = False) 14 | 15 | __all__ = ["TestCite"] -------------------------------------------------------------------------------- /tests/test_covid19_latest.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime,timedelta 3 | import unittest 4 | 5 | import pandas as pd 6 | 7 | import covid19dh 8 | 9 | class TestCovid19Latest(unittest.TestCase): 10 | _sourceless_attributes = [ 11 | 'id', 'key_google_mobility', 
'key_apple_mobility', 'date', 12 | 'iso_numeric', 'iso_alpha_2', 'iso_alpha_3', 13 | 'administrative_area_level', 'administrative_area_level_3', 14 | 'administrative_area_level_2', 'administrative_area_level_1', 15 | 'gatherings_restrictions', 'stay_home_restrictions', 'iso_currency', 16 | # 17 | 'people_fully_vaccinated', 'people_vaccinated', 18 | 'vaccination_policy', 'elderly_people_protection', 'facial_coverings', 19 | 'containment_health_index', 'economic_support_index', 20 | 'government_response_index', 21 | ] 22 | _numeric_attributes = [ 23 | "tests", "confirmed", "recovered", "deaths", "hosp", "vent", "icu", 24 | ] 25 | _constant_attributes = ["population", "latitude", "longitude"] 26 | _indicator_attributes = [ 27 | "school_closing", "cancel_events", "contact_tracing", "testing_policy", 28 | "transport_closing", "workplace_closing", "information_campaigns", 29 | "stringency_index", "international_movement_restrictions", 30 | "internal_movement_restrictions", 31 | ] 32 | _index_attributes = [ 33 | 34 | ] 35 | _src_attributes = [ 36 | "iso_alpha_3", "administrative_area_level", "data_type", "url", 37 | "title", "year", "bibtype", "author", "institution", "textVersion", 38 | ] 39 | 40 | def _covid19(self, *args, **kw): 41 | x, src = covid19dh.covid19(*args, **kw, verbose=False) # fetch 42 | # test 43 | self.assertIsInstance(x, pd.DataFrame) 44 | cols = ( 45 | set(self._numeric_attributes) | 46 | set(self._constant_attributes) | 47 | set(self._sourceless_attributes) | 48 | set(self._indicator_attributes) | 49 | set(self._index_attributes) 50 | ) 51 | for col in cols: 52 | self.assertIn(col, x.columns) 53 | return x, src 54 | 55 | def _check_level1(self, x): 56 | self.assertTrue((x.administrative_area_level == 1).all()) 57 | self.assertTrue(not x.administrative_area_level_1.isnull().any()) 58 | self.assertTrue(x.administrative_area_level_2.isnull().all()) 59 | self.assertTrue(x.administrative_area_level_3.isnull().all()) 60 | 61 | def _check_level2(self, x): 62 
| self.assertTrue((x.administrative_area_level == 2).all()) 63 | self.assertTrue(not x.administrative_area_level_1.isnull().any()) 64 | self.assertTrue(not x.administrative_area_level_2.isnull().any()) 65 | self.assertTrue(x.administrative_area_level_3.isnull().all()) 66 | 67 | def _check_level3(self, x): 68 | self.assertTrue((x.administrative_area_level == 3).all()) 69 | self.assertTrue(not x.administrative_area_level_1.isnull().any()) 70 | # self.assertTrue(not x.administrative_area_level_2.isnull().any()) # e.g. Colombia have only levels 1,3 71 | self.assertTrue(not x.administrative_area_level_3.isnull().any()) 72 | 73 | def _check_src(self, x, src): 74 | # format 75 | for col in self._src_attributes: 76 | self.assertIn(col, src.columns) 77 | # all data types 78 | data_types = src.data_type.unique() 79 | # all cols 80 | cols = set(x.columns) - set(self._sourceless_attributes) 81 | cols -= set([ # adjust 82 | 'key_alpha_2', 'key_numeric', 'key_jhu_csse', 83 | 'key_nuts', 'key_local', 'key_gadm', 84 | ]) 85 | for col in cols: 86 | # empty columns ignored 87 | if x[col].isnull().all() or (x[col] == 0).all(): 88 | continue 89 | 90 | self.assertIn(col, data_types) # col in sources 91 | 92 | def test_default(self): 93 | x, src = self._covid19() # fetch 94 | self._check_level1(x) 95 | self._check_src(x, src) 96 | 97 | def test_level1(self): 98 | x, src = self._covid19(level=1) # fetch 99 | self._check_level1(x) 100 | self._check_src(x, src) 101 | 102 | def test_level2(self): 103 | x, src = self._covid19(level=2) # fetch 104 | self._check_level2(x) 105 | self._check_src(x, src) 106 | 107 | # def test_level3(self): 108 | # x, src = self._covid19('SE', level=3, start='2023-01-01') # fetch 109 | # self._check_level3(x) 110 | # self._check_src(x, src) 111 | 112 | 113 | __all__ = ["TestCovid19Latest"] 114 | -------------------------------------------------------------------------------- /tests/test_covid19_vintage.py: 
-------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime,timedelta 3 | import unittest 4 | 5 | import pandas as pd 6 | 7 | import covid19dh 8 | 9 | class TestCovid19Vintage(unittest.TestCase): 10 | def _covid19(self, *args, **kw): 11 | x, src = covid19dh.covid19(*args, **kw, vintage=True, verbose=False) # fetch 12 | # test 13 | self.assertIsInstance(x, pd.DataFrame) 14 | for col in ["id", "date", "tests", "confirmed", "recovered", "deaths", "hosp", "vent", "icu"]: 15 | self.assertIn(col, x.columns) 16 | for col in ["population", "latitude", "longitude"]: 17 | self.assertIn(col, x.columns) 18 | for col in ["school_closing", "workplace_closing", "cancel_events", 19 | "gatherings_restrictions", "transport_closing", "testing_policy", 20 | "stay_home_restrictions", "internal_movement_restrictions", 21 | "international_movement_restrictions", "information_campaigns", 22 | "contact_tracing", "stringency_index", "key", "key_apple_mobility", 23 | "key_google_mobility"]: 24 | self.assertIn(col, x.columns) 25 | for col in ["iso_alpha_3", "iso_alpha_2", "iso_numeric", "currency", "administrative_area_level", 26 | "administrative_area_level_1", "administrative_area_level_2", "administrative_area_level_3"]: 27 | self.assertIn(col, x.columns) 28 | return x,src 29 | 30 | def test_vintage(self): 31 | # fetch 32 | _, src1 = self._covid19("DE", end=datetime(2020, 7, 10)) 33 | _, src2 = self._covid19("DE", end=datetime(2020, 7, 20)) 34 | 35 | 36 | __all__ = ["TestCovid19Vintage"] --------------------------------------------------------------------------------