├── .github └── workflows │ ├── utests_on_commit.yml │ └── utests_on_pullreq.yml ├── .gitignore ├── LICENSE ├── README.md ├── covid19dh ├── __init__.py ├── _cache.py ├── _cite.py └── main.py ├── doc └── presentation.ipynb ├── publish.sh ├── requirements.txt ├── setup.py └── tests ├── test.py ├── test_cite.py ├── test_covid19_latest.py └── test_covid19_vintage.py /.github/workflows/utests_on_commit.yml: -------------------------------------------------------------------------------- 1 | 2 | name: utests_on_commit # workflow name 3 | on: # trigger 4 | push: 5 | branches: [ master ] 6 | 7 | # parallel jobs to perform 8 | jobs: 9 | # === unittest execution === 10 | unittest: 11 | runs-on: ubuntu-latest # runner 12 | 13 | steps: # tasks 14 | - name: Set Github Workspace # access Github Workspace 15 | uses: actions/checkout@v2 16 | - name: Set up Python 3.x # set architecture and Python3 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: '3.8' 20 | architecture: 'x64' # architecture 21 | 22 | - name: Install dependencies # dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 26 | 27 | - name: Run unittests # run unittests 28 | run: 29 | python tests/test.py -------------------------------------------------------------------------------- /.github/workflows/utests_on_pullreq.yml: -------------------------------------------------------------------------------- 1 | 2 | name: utests_on_pullreq # workflow name 3 | on: # trigger 4 | pull_request: 5 | branches: [ master ] 6 | 7 | # parallel jobs to perform 8 | jobs: 9 | # === unittest execution === 10 | unittest: 11 | runs-on: ubuntu-latest # runner 12 | 13 | steps: # tasks 14 | - name: Set Github Workspace # access Github Workspace 15 | uses: actions/checkout@v2 16 | - name: Set up Python 3.x # set architecture and Python3 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: '3.8' 20 | architecture: 'x64' # 
architecture 21 | 22 | - name: Install dependencies # dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 26 | 27 | - name: Run unittests # run unittests 28 | run: 29 | python tests/test.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # my added py scripts in main directory 132 | /*.py 133 | !setup.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Emanuele Guidotti and David Ardia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Python Interface to COVID-19 Data Hub 4 | 5 | [![](https://img.shields.io/pypi/v/covid19dh.svg?color=brightgreen)](https://pypi.org/pypi/covid19dh/) [![](https://img.shields.io/pypi/dm/covid19dh.svg?color=blue)](https://pypi.org/pypi/covid19dh/) [![DOI](https://joss.theoj.org/papers/10.21105/joss.02376/status.svg)](https://doi.org/10.21105/joss.02376) [![](https://github.com/covid19datahub/Python/workflows/utests_on_commit/badge.svg)](https://github.com/covid19datahub/Python) 6 | 7 | Download COVID-19 data across governmental sources at national, regional, and city level, as described in [Guidotti and Ardia (2020)](https://www.doi.org/10.21105/joss.02376). Includes the time series of vaccines, tests, cases, deaths, recovered, hospitalizations, intensive therapy, and policy measures by [Oxford COVID-19 Government Response Tracker](https://www.bsg.ox.ac.uk/research/research-projects/coronavirus-government-response-tracker). Please agree to the [Terms of Use](https://covid19datahub.io/LICENSE.html) and cite the following reference when using it: 8 | 9 | **Reference** 10 | 11 | Guidotti, E., Ardia, D., (2020). 
COVID-19 Data Hub
_Journal of Open Source Software_, **5**(51):2376
[https://doi.org/10.21105/joss.02376](https://doi.org/10.21105/joss.02376)

## Setup and usage

Install from [pip](https://pypi.org/project/covid19dh/) with

```bash
pip install covid19dh
```

Import the main function `covid19()`:

```python
from covid19dh import covid19
x, src = covid19()
```

The package is regularly updated. Upgrade with

```bash
pip install --upgrade covid19dh
```

## Return values

The function `covid19()` returns 2 pandas dataframes:
* the data and
* references to the data sources.

## Parametrization

### Country

A list of country names (case-insensitive) or ISO codes (alpha-2, alpha-3 or numeric). The list of ISO codes can be found [here](https://github.com/covid19datahub/COVID19/blob/master/inst/extdata/db/ISO.csv).

Fetching data from a particular country:

```python
x, src = covid19("USA") # United States
```

Specify multiple countries at the same time:

```python
x, src = covid19(["ESP","PT","andorra",250])
```

If `country` is omitted, the whole dataset is returned:

```python
x, src = covid19()
```

### Raw data

Logical. Skip data cleaning? Default `True`. If `raw=False`, the raw data are cleaned by filling missing dates with `NaN` values. This ensures that all locations share the same grid of dates and that no single day is skipped. Then, `NaN` values are replaced with the previous non-`NaN` value or `0`.

```python
x, src = covid19(raw = False)
```

### Date filter

Dates can be specified with `datetime.datetime`, `datetime.date` or as a `str` in the format `YYYY-mm-dd`.

```python
from datetime import datetime
x, src = covid19("SWE", start = datetime(2020,4,1), end = "2020-05-01")
```

### Level

Integer. Granularity level of the data:

1. Country level
2. State, region or canton level
3. City or municipality level

```python
from datetime import date
x, src = covid19("USA", level = 2, start = date(2020,5,1))
```

### Cache

Logical. Memory caching? Significantly improves performance on successive calls. By default, using the cached data is enabled.

Caching can be disabled (e.g. for long-running programs) with:

```python
x, src = covid19("FRA", cache = False)
```

### Vintage

Logical. Retrieve the snapshot of the dataset that was generated at the `end` date instead of using the latest version. Default `False`.

To fetch e.g. US data that were accessible on *22nd April 2020*, type

```python
x, src = covid19("USA", end = "2020-04-22", vintage = True)
```

The vintage data are collected at the end of the day, but published with an approximately 48-hour delay,
once the day is completed in all timezones.

Hence if `vintage = True` but `end` is not set, a warning is raised and `None` is returned.

```python
x, src = covid19("USA", vintage = True) # too early to get today's vintage
```

```
UserWarning: vintage data not available yet
```

### Data Sources

The data sources are returned as the second value.

```python
from covid19dh import covid19
x, src = covid19("USA")
print(src)
```

### Additional information

Find out more at https://covid19datahub.io

## Acknowledgements

Developed and maintained by [Martin Benes](https://pypi.org/user/martinbenes1996/).
147 | 148 | ## Cite as 149 | 150 | *Guidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open Source Software 5(51):2376, doi: 10.21105/joss.02376.* 151 | 152 | A BibTeX entry for LaTeX users is 153 | 154 | ```latex 155 | @Article{, 156 | title = {COVID-19 Data Hub}, 157 | year = {2020}, 158 | doi = {10.21105/joss.02376}, 159 | author = {Emanuele Guidotti and David Ardia}, 160 | journal = {Journal of Open Source Software}, 161 | volume = {5}, 162 | number = {51}, 163 | pages = {2376} 164 | } 165 | ``` -------------------------------------------------------------------------------- /covid19dh/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Unified data hub for a better understanding of COVID-19. 3 | 4 | For more information check README.md. 5 | 6 | Reference: https://covid19datahub.io/ 7 | Todo: 8 | * caching 9 | """ 10 | 11 | import pkg_resources 12 | from .main import * 13 | 14 | try: 15 | __version__ = pkg_resources.get_distribution("covid19dh").version 16 | except Exception: 17 | __version__ = None 18 | -------------------------------------------------------------------------------- /covid19dh/_cache.py: -------------------------------------------------------------------------------- 1 | 2 | # ======== data cache ========= 3 | _cache = {} # data 4 | 5 | 6 | def _construct_cache_id(level, dt, raw, vintage): 7 | cache_id = f"{level}" 8 | if raw: 9 | cache_id += "_raw" 10 | if vintage: 11 | cache_id += dt.strftime("%Y-%m-%d") 12 | return cache_id 13 | 14 | 15 | def read_cache(level, dt, raw, vintage): 16 | cache_id = _construct_cache_id(level=level, dt=dt, raw=raw, vintage=vintage) 17 | try: 18 | return _cache[cache_id] 19 | except Exception: 20 | return None 21 | 22 | def write_cache(x, level, dt, raw, vintage): 23 | cache_id = _construct_cache_id(level=level, dt=dt, raw=raw, vintage=vintage) 24 | _cache[cache_id] = x 25 | 26 | 27 | # ========= src cache ========== 28 | 
_cache_src = {} # src 29 | 30 | 31 | def _construct_src_cache_id(dt, vintage): 32 | cache_id = "src" 33 | if vintage: 34 | cache_id += dt.strftime("%Y-%m-%d") 35 | return cache_id 36 | 37 | 38 | def read_src_cache(dt, vintage): 39 | cache_id = _construct_src_cache_id(dt=dt, vintage=vintage) 40 | try: 41 | return _cache_src[cache_id] 42 | except Exception: 43 | return None 44 | 45 | 46 | def write_src_cache(src, dt, vintage): 47 | cache_id = _construct_src_cache_id(dt=dt, vintage=vintage) 48 | _cache_src[cache_id] = src 49 | 50 | 51 | __all__ = ["read_cache", "write_cache", "read_src_cache", "write_src_cache"] 52 | -------------------------------------------------------------------------------- /covid19dh/_cite.py: -------------------------------------------------------------------------------- 1 | 2 | from io import StringIO 3 | import math 4 | import re 5 | import warnings 6 | 7 | import pandas as pd 8 | import requests 9 | 10 | 11 | def get_sources(): 12 | url = 'https://storage.covid19datahub.io/src.csv' 13 | response = requests.get(url) # headers={'User-Agent': 'Mozilla/5.0'} 14 | return pd.read_csv( StringIO(response.text)) 15 | 16 | 17 | def sources_to_citations(sources): 18 | # shorten URL 19 | sources.url = sources.url.apply( 20 | lambda u: re.sub( 21 | r"(http://|https://|www\\.)([^/]+)(.*)", 22 | r"\1\2/", 23 | u) 24 | ) 25 | # remove duplicit 26 | unique_references = sources.groupby(["title","author","institution","url","textVersion","bibtype"]) 27 | 28 | # format 29 | citations = [] 30 | for n, g in unique_references: 31 | for i in range(1): 32 | (title, author, institution, url, textVersion, bibtype) = n 33 | year = g.year.max() 34 | 35 | if textVersion: 36 | citation = textVersion 37 | else: 38 | # pre,post 39 | if author: 40 | pre = author 41 | if title: 42 | post = f"{title}" 43 | elif title: 44 | pre = title 45 | post = "" 46 | # post 47 | if institution: 48 | if post: 49 | post += ", " 50 | post += f"{institution}" 51 | if url: 52 | if post: 53 | 
post += ", " 54 | url = re.sub(r"(http://|https://|www\\.)([^/]+)(.*)", 55 | r"\1\2/", url) 56 | post += f"{url}" 57 | else: 58 | post += "." 59 | citation = f"{pre} ({year}), {post}" 60 | 61 | citations.append(citation) 62 | return citations 63 | 64 | 65 | def cite(x: pd.DataFrame, verbose: bool = True, sources: bool = None): 66 | # all sources if missing 67 | if sources is None: 68 | sources = get_sources() 69 | 70 | # per iso 71 | references = pd.DataFrame(data=None, columns=sources.columns) 72 | for (iso,), country in x.groupby(["iso_alpha_3"]): 73 | # levels 74 | level = country.administrative_area_level.unique()[0] 75 | # empty attributes 76 | empty_params = country.apply(lambda c: c.isnull().all() | (c == 0).all()) 77 | params = x.columns[~empty_params] 78 | 79 | # filter 80 | src = sources[ 81 | (sources.administrative_area_level == level) & # level 82 | (sources.iso_alpha_3 == iso) & # iso 83 | sources.data_type.isin(params) # data type 84 | ] 85 | # fallback for missing 86 | missing = set(params) - set(src.data_type.unique()) 87 | if missing: 88 | src = pd.concat([ 89 | src, 90 | sources[ 91 | sources.data_type.isin(missing) & # data type 92 | sources.iso_alpha_3.isnull() & # empty ISO 93 | sources.administrative_area_level.isnull() # empty level 94 | ] 95 | ]) 96 | 97 | # set iso,level 98 | src.iso_alpha_3 = iso 99 | src.administrative_area_level = level 100 | 101 | # join 102 | references = pd.concat([references, src]) 103 | 104 | references.drop_duplicates(inplace=True) 105 | 106 | return references 107 | 108 | 109 | 110 | 111 | # === 112 | # hash data stats 113 | params = set(x.columns) 114 | isos = set(x["iso_alpha_3"].unique()) 115 | isos.add(math.nan) 116 | # prefilter 117 | sources = sources[ 118 | sources["iso_alpha_3"].isin(isos) & 119 | sources["data_type"].isin(params) ] 120 | sources = sources.fillna("") 121 | 122 | # filter 123 | def is_source_used(ref): 124 | # data type not present 125 | if not ref['data_type'] in params: return False 126 
| # fallbacks 127 | if not ref['iso_alpha_3'] or not ref['administrative_area_level']: return True 128 | 129 | # check both equal 130 | return ((x.iso_alpha_3 == ref.iso_alpha_3) & (x.administrative_area_level == ref.administrative_area_level)).any() 131 | 132 | sources = sources[sources.apply(is_source_used, axis=1)] 133 | 134 | # drop fallback 135 | for p in params: 136 | non_fallback = (sources.data_type == p) & (sources.iso_alpha_3 != '') 137 | no_data = (x[p].isnull() | (x[p] == 0)) 138 | fallback = (sources.data_type == p) & (sources.iso_alpha_3 == '') 139 | if non_fallback.any() or no_data.all(): 140 | sources.drop(fallback.index[fallback].tolist(), inplace=True) 141 | 142 | #citations = sources_to_citations(sources) 143 | 144 | #if verbose: 145 | # print("\033[1mData References:\033[0m\n", end="") 146 | # for ref in citations: 147 | # print("\t" + ref, end="\n\n") 148 | # print("\033[33mTo hide the data sources use 'verbose = False'.\033[0m") 149 | 150 | sources.replace(r'^\s*$', math.nan, regex=True, inplace=True) 151 | return sources 152 | 153 | 154 | __all__ = ["cite", "get_sources"] 155 | -------------------------------------------------------------------------------- /covid19dh/main.py: -------------------------------------------------------------------------------- 1 | 2 | import datetime 3 | from io import StringIO, BytesIO 4 | import math 5 | import sys 6 | import warnings 7 | import zipfile 8 | 9 | import pandas as pd 10 | import requests 11 | 12 | from ._cite import get_sources, cite 13 | from ._cache import * 14 | 15 | 16 | def get_url(level, dt, raw, vintage): 17 | # dataname 18 | rawprefix = "raw" if raw else "" 19 | dataname = f"{rawprefix}data-{level}" 20 | # vintage 21 | if vintage: 22 | # too new 23 | if dt >= datetime.datetime.now() - datetime.timedelta(days=2): 24 | warnings.warn("vintage data not available yet", category=ResourceWarning) 25 | return None, None 26 | dt_str = dt.strftime("%Y-%m-%d") 27 | filename = f"{dt_str}.zip" 28 | # 
def get_url(level, dt, raw, vintage):
    """Compose the download URL and the name of the CSV inside the zip.

    Args:
        level: granularity level (1-3), used in the data file name.
        dt: end date; selects the snapshot when vintage is True.
        raw: request the uncleaned data file.
        vintage: request the archived snapshot for `dt`.

    Returns:
        (url, csv_name), or (None, None) when the vintage snapshot for
        `dt` cannot exist yet (snapshots are published with ~48h delay).
    """
    # dataname
    rawprefix = "raw" if raw else ""
    dataname = f"{rawprefix}data-{level}"
    # vintage
    if vintage:
        # too new
        if dt >= datetime.datetime.now() - datetime.timedelta(days=2):
            warnings.warn("vintage data not available yet", category=ResourceWarning)
            return None, None
        dt_str = dt.strftime("%Y-%m-%d")
        filename = f"{dt_str}.zip"
    # current data
    else:
        filename = f"{dataname}.zip"
    # FIX: interpolate the chosen zip name into the URL — `filename` was
    # previously computed but never used and the URL path was malformed
    return f"https://storage.covid19datahub.io/{filename}", f"{dataname}.csv"


def parseDate(dt):
    """Normalize str/date/datetime input to a datetime.

    datetime inputs are truncated to midnight (datetime.datetime is a
    subclass of datetime.date, so the first branch catches both).

    Raises:
        ValueError: when a string is not in the YYYY-mm-dd format.
    """
    if isinstance(dt, datetime.date):
        return datetime.datetime(dt.year, dt.month, dt.day)
    if isinstance(dt, str):
        try:
            return datetime.datetime.strptime(dt, "%Y-%m-%d")
        except Exception:
            print("Invalid time format.", file=sys.stderr)
            raise
    return dt


def covid19(country=None,
            level=1,
            start=datetime.date(2019, 1, 1),
            end=None,  # defaultly today
            cache=True,
            verbose=True,
            raw=True,
            vintage=False):
    """Main function for module. Fetches data from hub.

    Args:
        country (str, optional): ISO country code, defaultly all countries
        level (int, optional): level of data, default 1
            * country-level (1)
            * state-level (2)
            * city-level (3)
        start (datetime | date | str, optional): start date of data (as str in format [%Y-%m-%d]),
            default 2019-01-01
        end (datetime | date | str, optional): end date of data (as str in format [%Y-%m-%d]),
            default today (sysdate)
        cache (bool, optional): use cached data if available, default yes
        verbose (bool, optional): prints citation message, default true
        raw (bool, optional): download not cleansed data, defaultly using cleansed
        vintage (bool, optional): use hub data (True) or original source, not available in Python covid19dh (only hub)

    Returns:
        (data, sources) pandas data frames, or (None, None) on invalid
        arguments or when no data matches the settings.
    """
    # parse arguments
    if country is not None:
        country = [country] if isinstance(country, str) else country
        country = [c.upper() if isinstance(c, str) else c for c in country]
    end = datetime.datetime.now() if end is None else end
    try:
        end = parseDate(end)
        start = parseDate(start)
    except Exception:
        return None, None
    if level not in {1, 2, 3}:
        warnings.warn("valid options for 'level' are:\n\t1: country-level data\n\t2: state-level data\n\t3: city-level data")
        return None, None
    if start > end:
        warnings.warn("start is later than end")
        return None, None

    # cache lookup (ignored below when cache=False)
    df = read_cache(level, end, raw, vintage)
    src = None

    if cache is False or df is None:
        # get url from level
        try:
            url, filename = get_url(level=level, dt=end, raw=raw, vintage=vintage)
            if url is None:
                return None, None
        except KeyError:
            warnings.warn("invalid level")
            return None, None
        # download
        try:
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        except Exception:
            if vintage:
                warnings.warn("vintage data not available yet")
                return None, None
            else:
                warnings.warn("error to fetch data")
                return None, None
        # parse zip in memory
        with zipfile.ZipFile(BytesIO(response.content)) as zz:
            with zz.open(filename) as fd:
                df = pd.read_csv(fd, low_memory=False)
            # vintage archives ship their own src.csv snapshot
            if vintage:
                with zz.open("src.csv") as fd:
                    src = pd.read_csv(fd, low_memory=False)
                write_src_cache(src, end, vintage)
        # cast columns
        df['date'] = df['date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
        try:
            df['iso_numeric'] = df['iso_numeric'].apply(lambda x: float(x))
        except Exception:
            pass  # column may be absent on sub-national levels

        write_cache(df, level, end, raw, vintage)

    # sources table: vintage snapshot, cache, or fresh download
    if src is None:
        src = read_src_cache(end, vintage)
        if src is None:
            src = get_sources()
            write_src_cache(src, end, vintage)

    # filter by country (name, ISO alpha-2/alpha-3/numeric)
    if country is not None:
        # elementwise comparison works, but throws a FutureWarning;
        # mute it locally
        with warnings.catch_warnings():
            warnings.simplefilter(action='ignore', category=FutureWarning)

            country_filter = df['administrative_area_level_1'].map(lambda s: s.upper()).isin(country)
            for feature in ["iso_alpha_2", "iso_alpha_3", "iso_numeric"]:
                try:
                    country_filter = country_filter | df[feature].isin(country)
                except KeyError:
                    pass
            df = df[country_filter]
    # filter by date range
    if start is not None:
        df = df[df['date'] >= start]
    if end is not None:
        df = df[df['date'] <= end]

    # detect empty
    if df.empty:
        warnings.warn("no data for given settings", category=ResourceWarning)
        return None, None
    # sort
    df = df.sort_values(by=["id", "date"])

    # restrict sources to what the data actually contains
    src = cite(x=df, sources=src, verbose=False)

    if verbose:
        # construct citation message
        message = "We have invested a lot of time and effort in creating COVID-19 Data Hub, please cite the following when using it:\n\n"
        message += "\t\033[1mGuidotti, E., Ardia, D., (2020), \"COVID-19 Data Hub\", Journal of Open Source Software 5(51):2376, doi: 10.21105/joss.02376.\033[0m\n\n"
        message += "A BibTeX entry for LaTeX users is\n\n"
        message += "\t@Article{,\n"
        message += "\t\ttitle = {COVID-19 Data Hub},\n"
        message += "\t\tyear = {2020},\n"
        message += "\t\tdoi = {10.21105/joss.02376},\n"
        message += "\t\tauthor = {Emanuele Guidotti and David Ardia},\n"
        message += "\t\tjournal = {Journal of Open Source Software},\n"
        message += "\t\tvolume = {5},\n"
        message += "\t\tnumber = {51},\n"
        message += "\t\tpages = {2376},\n"
        message += "\t}\n\n"
        message += "\033[33mTo hide this message use 'verbose = False'.\033[0m"
        # print
        print(message)

    return df, src


__all__ = ["covid19"]
-------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # remove previous releases 4 | rm -rf build/ dist/ covid19dh.egg-info/ __pycache__/ 5 | # compile 6 | python setup.py sdist bdist_wheel 7 | # publish 8 | python -m twine upload dist/* 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | requests -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # requirements 4 | try: 5 | with open('requirements.txt') as f: 6 | reqs = f.read().splitlines() 7 | except Exception: 8 | reqs = [] 9 | 10 | import setuptools 11 | with open("README.md", "r", encoding="UTF-8") as fh: 12 | long_description = fh.read() 13 | 14 | setuptools.setup( 15 | name='covid19dh', 16 | version='2.3.1', 17 | author='Martin Beneš', 18 | author_email='martinbenes1996@gmail.com', 19 | description='Unified data hub for a better understanding of COVID-19 https://covid19datahub.io', 20 | long_description=long_description, 21 | long_description_content_type="text/markdown", 22 | packages=setuptools.find_packages(), 23 | url='https://www.covid19datahub.io', 24 | download_url='https://github.com/covid19datahub/Python/archive/2.3.0.tar.gz', 25 | keywords=['2019-nCov', 'coronavirus', 'covid-19', 'covid-data', 'covid19-data'], 26 | install_requires=reqs, 27 | package_dir={'': '.'}, 28 | classifiers=[ 29 | 'Development Status :: 5 - Production/Stable', 30 | 'Intended Audience :: Science/Research', 31 | 'Intended Audience :: Developers', 32 | 'Intended Audience :: Other Audience', 33 | 'Topic :: Database', 34 | 'Topic :: Scientific/Engineering', 35 | 'Topic :: 
Scientific/Engineering :: Information Analysis', 36 | 'Topic :: Software Development :: Libraries', 37 | 'Topic :: Utilities', 38 | 'License :: OSI Approved :: MIT License', 39 | 'Programming Language :: Python :: 3', 40 | 'Programming Language :: Python :: 3.8', 41 | 'Programming Language :: Python :: 3.9', 42 | 'Programming Language :: Python :: 3.10', 43 | ], 44 | ) 45 | -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | 4 | sys.path.append(".") 5 | sys.path.append("tests") 6 | 7 | # === unit tests === 8 | from test_covid19_latest import * 9 | from test_covid19_vintage import * 10 | from test_cite import * 11 | # ================== 12 | 13 | 14 | # logging 15 | if __name__ == "__main__": 16 | import logging 17 | logging.basicConfig(level=logging.WARNING) 18 | 19 | # run unittests 20 | if __name__ == "__main__": 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /tests/test_cite.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime,timedelta 3 | import unittest 4 | 5 | import pandas as pd 6 | 7 | import covid19dh 8 | 9 | class TestCite(unittest.TestCase): 10 | def test_cite_verbose(self): 11 | x,src = covid19dh.covid19("CZE", verbose = False) 12 | # cite 13 | src2 = covid19dh._cite.cite(x, verbose = False) 14 | 15 | __all__ = ["TestCite"] -------------------------------------------------------------------------------- /tests/test_covid19_latest.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime,timedelta 3 | import unittest 4 | 5 | import pandas as pd 6 | 7 | import covid19dh 8 | 9 | class TestCovid19Latest(unittest.TestCase): 10 | _sourceless_attributes = [ 11 | 'id', 'key_google_mobility', 
'key_apple_mobility', 'date', 12 | 'iso_numeric', 'iso_alpha_2', 'iso_alpha_3', 13 | 'administrative_area_level', 'administrative_area_level_3', 14 | 'administrative_area_level_2', 'administrative_area_level_1', 15 | 'gatherings_restrictions', 'stay_home_restrictions', 'iso_currency', 16 | # 17 | 'people_fully_vaccinated', 'people_vaccinated', 18 | 'vaccination_policy', 'elderly_people_protection', 'facial_coverings', 19 | 'containment_health_index', 'economic_support_index', 20 | 'government_response_index', 21 | ] 22 | _numeric_attributes = [ 23 | "tests", "confirmed", "recovered", "deaths", "hosp", "vent", "icu", 24 | ] 25 | _constant_attributes = ["population", "latitude", "longitude"] 26 | _indicator_attributes = [ 27 | "school_closing", "cancel_events", "contact_tracing", "testing_policy", 28 | "transport_closing", "workplace_closing", "information_campaigns", 29 | "stringency_index", "international_movement_restrictions", 30 | "internal_movement_restrictions", 31 | ] 32 | _index_attributes = [ 33 | 34 | ] 35 | _src_attributes = [ 36 | "iso_alpha_3", "administrative_area_level", "data_type", "url", 37 | "title", "year", "bibtype", "author", "institution", "textVersion", 38 | ] 39 | 40 | def _covid19(self, *args, **kw): 41 | x, src = covid19dh.covid19(*args, **kw, verbose=False) # fetch 42 | # test 43 | self.assertIsInstance(x, pd.DataFrame) 44 | cols = ( 45 | set(self._numeric_attributes) | 46 | set(self._constant_attributes) | 47 | set(self._sourceless_attributes) | 48 | set(self._indicator_attributes) | 49 | set(self._index_attributes) 50 | ) 51 | for col in cols: 52 | self.assertIn(col, x.columns) 53 | return x, src 54 | 55 | def _check_level1(self, x): 56 | self.assertTrue((x.administrative_area_level == 1).all()) 57 | self.assertTrue(not x.administrative_area_level_1.isnull().any()) 58 | self.assertTrue(x.administrative_area_level_2.isnull().all()) 59 | self.assertTrue(x.administrative_area_level_3.isnull().all()) 60 | 61 | def _check_level2(self, x): 62 
| self.assertTrue((x.administrative_area_level == 2).all()) 63 | self.assertTrue(not x.administrative_area_level_1.isnull().any()) 64 | self.assertTrue(not x.administrative_area_level_2.isnull().any()) 65 | self.assertTrue(x.administrative_area_level_3.isnull().all()) 66 | 67 | def _check_level3(self, x): 68 | self.assertTrue((x.administrative_area_level == 3).all()) 69 | self.assertTrue(not x.administrative_area_level_1.isnull().any()) 70 | # self.assertTrue(not x.administrative_area_level_2.isnull().any()) # e.g. Colombia have only levels 1,3 71 | self.assertTrue(not x.administrative_area_level_3.isnull().any()) 72 | 73 | def _check_src(self, x, src): 74 | # format 75 | for col in self._src_attributes: 76 | self.assertIn(col, src.columns) 77 | # all data types 78 | data_types = src.data_type.unique() 79 | # all cols 80 | cols = set(x.columns) - set(self._sourceless_attributes) 81 | cols -= set([ # adjust 82 | 'key_alpha_2', 'key_numeric', 'key_jhu_csse', 83 | 'key_nuts', 'key_local', 'key_gadm', 84 | ]) 85 | for col in cols: 86 | # empty columns ignored 87 | if x[col].isnull().all() or (x[col] == 0).all(): 88 | continue 89 | 90 | self.assertIn(col, data_types) # col in sources 91 | 92 | def test_default(self): 93 | x, src = self._covid19() # fetch 94 | self._check_level1(x) 95 | self._check_src(x, src) 96 | 97 | def test_level1(self): 98 | x, src = self._covid19(level=1) # fetch 99 | self._check_level1(x) 100 | self._check_src(x, src) 101 | 102 | def test_level2(self): 103 | x, src = self._covid19(level=2) # fetch 104 | self._check_level2(x) 105 | self._check_src(x, src) 106 | 107 | # def test_level3(self): 108 | # x, src = self._covid19('SE', level=3, start='2023-01-01') # fetch 109 | # self._check_level3(x) 110 | # self._check_src(x, src) 111 | 112 | 113 | __all__ = ["TestCovid19Latest"] 114 | -------------------------------------------------------------------------------- /tests/test_covid19_vintage.py: 
-------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime,timedelta 3 | import unittest 4 | 5 | import pandas as pd 6 | 7 | import covid19dh 8 | 9 | class TestCovid19Vintage(unittest.TestCase): 10 | def _covid19(self, *args, **kw): 11 | x, src = covid19dh.covid19(*args, **kw, vintage=True, verbose=False) # fetch 12 | # test 13 | self.assertIsInstance(x, pd.DataFrame) 14 | for col in ["id", "date", "tests", "confirmed", "recovered", "deaths", "hosp", "vent", "icu"]: 15 | self.assertIn(col, x.columns) 16 | for col in ["population", "latitude", "longitude"]: 17 | self.assertIn(col, x.columns) 18 | for col in ["school_closing", "workplace_closing", "cancel_events", 19 | "gatherings_restrictions", "transport_closing", "testing_policy", 20 | "stay_home_restrictions", "internal_movement_restrictions", 21 | "international_movement_restrictions", "information_campaigns", 22 | "contact_tracing", "stringency_index", "key", "key_apple_mobility", 23 | "key_google_mobility"]: 24 | self.assertIn(col, x.columns) 25 | for col in ["iso_alpha_3", "iso_alpha_2", "iso_numeric", "currency", "administrative_area_level", 26 | "administrative_area_level_1", "administrative_area_level_2", "administrative_area_level_3"]: 27 | self.assertIn(col, x.columns) 28 | return x,src 29 | 30 | def test_vintage(self): 31 | # fetch 32 | _, src1 = self._covid19("DE", end=datetime(2020, 7, 10)) 33 | _, src2 = self._covid19("DE", end=datetime(2020, 7, 20)) 34 | 35 | 36 | __all__ = ["TestCovid19Vintage"] --------------------------------------------------------------------------------