├── .env.example
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── app
│   ├── __init__.py
│   ├── core
│   │   ├── __init__.py
│   │   └── config.py
│   ├── main.py
│   ├── utils.py
│   └── v1
│       ├── __init__.py
│       ├── data.py
│       ├── lists.py
│       ├── metadata.py
│       ├── schemas.py
│       └── search.py
├── crawler
│   ├── __init__.py
│   ├── crawl.py
│   ├── duckdb_models.py
│   ├── full_text_index.py
│   ├── query_duckdb.ipynb
│   └── utils.py
├── data_api
│   └── __init__.py
├── default.mk
├── demo
│   ├── README.md
│   ├── demo.py
│   ├── requirements.txt
│   └── static
│       └── style.css
├── docker-compose.yaml
├── explore-duckdb.ipynb
├── nbinit.py
├── poetry.lock
├── poetry.toml
├── profiling
│   └── profile_formats.ipynb
├── pyproject.toml
├── setup.cfg
└── tests
    ├── __init__.py
    ├── crawler
    │   └── test_crawl.py
    ├── sample_duck.db
    └── test_v1.py
/.env.example:
--------------------------------------------------------------------------------
1 | PROJECT_NAME=data-api
2 | BACKEND_CORS_ORIGINS=["http://localhost:8000", "https://localhost:8000", "http://localhost", "https://localhost"]
3 |
4 | DUCKDB_PATH=duck.db
5 |
6 | OWID_CATALOG_DIR=/Users/mojmir/projects/etl/data
7 |
8 | # optional
9 | # BUGSNAG_API_KEY
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # pipenv
90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
93 | # install all needed dependencies.
94 | #Pipfile.lock
95 |
96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
97 | __pypackages__/
98 |
99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 |
103 | # SageMath parsed files
104 | *.sage.py
105 |
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 |
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 |
119 | # Rope project settings
120 | .ropeproject
121 |
122 | # mkdocs documentation
123 | /site
124 |
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 |
133 | # pytype static type analyzer
134 | .pytype/
135 |
136 | # Cython debug symbols
137 | cython_debug/
138 |
139 | # Text Editor
140 | .vscode
141 |
142 |
143 | # Custom
144 | .coverage
145 | **/__pycache__
146 | .virtual_documents
147 | **/.ipynb_checkpoints
148 | .submodule-init
149 | .mypy-cache
150 | .pytest-cache
151 | .python-version
152 | .venv
153 | __pycache__
154 | .ipynb_checkpoints
155 | .env
156 | *.dot
157 | *.pdf
158 | pyrightconfig.json
159 | .DS_Store
160 | ign.*
161 | .idea
162 |
163 | api.err
164 | demo.err
165 | api.log
166 | demo.log
167 |
168 | duck.db
169 | duck.db.wal
170 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "vendor/owid-catalog-py"]
2 | path = vendor/owid-catalog-py
3 | url = git@github.com:owid/owid-catalog-py.git
4 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/myint/autoflake
3 | rev: v1.4
4 | hooks:
5 | - id: autoflake
6 | exclude: .*/__init__.py
7 | args:
8 | - --in-place
9 | - --remove-all-unused-imports
10 | - --expand-star-imports
11 | - --remove-duplicate-keys
12 | - --remove-unused-variables
13 | - repo: local
14 | hooks:
15 | - id: flake8
16 | name: flake8
17 | entry: flake8
18 | language: system
19 | types: [python]
20 | - repo: https://github.com/pre-commit/mirrors-isort
21 | rev: v5.4.2
22 | hooks:
23 | - id: isort
24 | args: ["--profile", "black"]
25 | - repo: local
26 | hooks:
27 | - id: mypy
28 | name: mypy
29 | entry: mypy
30 | language: system
31 | types: [python]
32 | - repo: https://github.com/pre-commit/pre-commit-hooks
33 | rev: v3.3.0
34 | hooks:
35 | - id: trailing-whitespace
36 | - id: end-of-file-fixer
37 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9
2 |
3 | ENV PYTHONPATH "${PYTHONPATH}:/"
4 | ENV PORT=8000
5 |
6 | # Install Poetry
7 | RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | POETRY_HOME=/opt/poetry python && \
8 | cd /usr/local/bin && \
9 | ln -s /opt/poetry/bin/poetry && \
10 | poetry config virtualenvs.create false
11 |
12 | # Copy using poetry.lock* in case it doesn't exist yet
13 | COPY ./pyproject.toml ./poetry.lock* /app/
14 |
15 | RUN poetry install --no-root --no-dev
16 |
17 | COPY ./app /app
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Global Change Data Lab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # Makefile
3 | #
4 |
5 | .PHONY: etl
6 |
7 | include default.mk
8 |
9 | SRC = app crawler tests
10 |
11 | help:
12 | @echo 'Available commands:'
13 | @echo
14 | @echo ' make crawl Crawl ETL catalog'
15 | @echo ' make api Run API server'
16 | @echo ' make test Run all linting and unit tests'
17 | @echo ' make testdb Rebuild test DB'
18 | @echo ' make watch Run all tests, watching for changes'
19 | @echo ' make clobber Delete non-reference data and .venv'
20 | @echo ' make run Run API and Catalog in the background'
21 | @echo
22 |
23 |
24 | watch-all:
25 | .venv/bin/watchmedo shell-command -c 'clear; make unittest; (cd vendor/owid-catalog-py && make unittest)' --recursive --drop .
26 |
27 | test-all: test
28 | cd vendor/owid-catalog-py && make test
29 |
30 | watch: .venv
31 | .venv/bin/watchmedo shell-command -c 'clear; make check-formatting lint check-typing coverage' --recursive --drop .
32 |
33 | .submodule-init:
34 | @echo '==> Initialising submodules'
35 | git submodule update --init
36 | touch $@
37 |
38 | .venv: pyproject.toml poetry.toml poetry.lock .submodule-init
39 | @echo '==> Copy .env.example to .env if missing'
40 | cp -n .env.example .env || true
41 | @echo '==> Installing packages'
42 | poetry install
43 | touch $@
44 |
45 | check-typing: .venv
46 | # @echo '==> Checking types'
47 | # .venv/bin/mypy $(SRC)
48 | @echo '==> WARNING: Checking types is disabled!'
49 |
50 | coverage: .venv
51 | @echo '==> Unit testing with coverage'
52 | .venv/bin/pytest --cov=app --cov-report=term-missing tests
53 |
54 | crawl: .venv
55 | @echo '==> Crawl ETL catalog'
56 | poetry run crawl
57 |
58 | crawl-backported: .venv
59 | @echo '==> Crawl backported ETL catalog'
60 | poetry run crawl --include dataset_
61 |
62 | api: .venv
63 | @echo '==> Running API'
64 | .venv/bin/hypercorn app.main:app --reload
65 |
66 | testdb: .venv
67 | @echo '==> Rebuild test DB'
68 | rm -f tests/sample_duck.db
69 | poetry run crawl --include 'dataset_941|ggdc_maddison' --duckdb-path tests/sample_duck.db
70 |
71 | clobber: clean
72 | find . -name .venv | xargs rm -rf
73 | find . -name .mypy_cache | xargs rm -rf
74 |
75 | run: .venv
76 | @echo 'Running API and Catalog in the background:'
77 | -kill $$(lsof -t -i:8000)
78 | -kill $$(lsof -t -i:8001)
79 | nohup make api > api.log 2> api.err < /dev/null &
80 | nohup .venv/bin/python -m demo.demo > demo.log 2> demo.err < /dev/null &
81 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # data-api
2 |
3 | _API for accessing data from our data catalog._
4 |
5 | **Status**: experimental
6 |
7 | ## Overview
8 |
9 | Our World in Data is trying to build a new data layer for our charts and visualisations based on a repeatable and transparent data pipeline, our [etl](https://github.com/owid/etl). The ETL generates and publishes the latest version of our data catalog to S3.
10 |
11 | This project adds two components: a crawler and a web API. The crawler walks the data catalog and generates a local DuckDB database from its contents. The Dynamic API then provides RESTful access to the data, including SQL support thanks to DuckDB.
12 |
13 | ```mermaid
14 | graph TB
15 | ETL -->|generates| catalog[Data catalog]
16 | crawler(Crawler):::here -->|reads| catalog
17 | site[OWID site] -.->|queries| api
18 | api(Dynamic API):::here -->|queries| db[DuckDB cache]
19 | crawler -->|generates| db
20 |
21 | classDef here stroke-width:4px;
22 | ```
23 |
24 | ## Developing
25 |
26 | You need Python 3.10 and `poetry` installed to get started.
27 |
28 | ### Running tests
29 |
30 | To run all the checks and make sure you have everything set up correctly, try
31 |
32 | ```
33 | make test
34 | ```
35 |
36 | ### Crawling the catalog
37 |
38 | The crawler is a script that goes through all backported datasets and replicates them to a local DuckDB database. It creates the tables `meta_datasets`, `meta_tables`, and `meta_variables` in DuckDB with all the metadata, and it also replicates the tables from the ETL catalog there.
39 |
40 | Table names are underscored table paths, e.g. the path `backport/owid/latest/dataset_941_technology_adoption__isard__1942__and_others/dataset_941_technology_adoption__isard__1942__and_others` gets the table name `backport__owid__latest__dataset_941_technology_adoption__isard__1942__and_others__dataset_941_technology_adoption__isard__1942__and_others`. This is unnecessarily verbose, but it doesn't matter for now.
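As an illustration, the naming is just a path-to-identifier conversion; a minimal sketch of the observable convention (not necessarily the crawler's exact code):

```python
def table_name_from_path(path: str) -> str:
    # e.g. "garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp"
    #   -> "garden__ggdc__2020_10_01__ggdc_maddison__maddison_gdp"
    return path.replace("/", "__").replace("-", "_")
```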
41 |
42 | The crawler compares checksums of **datasets** to decide whether a dataset needs to be updated. We cannot do this at the table level because we don't use table checksums.
43 |
44 | We only crawl `garden` and `backport` channels right now.
45 |
46 | Run `make crawl` to crawl the entire catalog (this would take a very long time) or crawl only sample datasets with
47 |
48 | ```
49 | poetry run crawl --include 'dataset_941|ggdc_maddison'
50 | ```
51 |
52 | or just the `garden` channel
53 |
54 | ```
55 | poetry run crawl --include 'garden'
56 | ```
57 |
58 | ### Running the API
59 |
60 | Copy `.env.example` into `.env` and update it as you like. After you build `duck.db` with the crawler, run the API with `hypercorn app.main:app --reload`.
61 |
62 | Docs are available at http://127.0.0.1:8000/v1/docs.
63 |
64 | ### Sample Queries
65 |
66 | Sample queries written in [httpie](https://httpie.io/) syntax:
67 |
68 | ```
69 | http GET http://127.0.0.1:8000/health
70 | http GET http://127.0.0.1:8000/v1/variableById/data/42539
71 | http GET http://127.0.0.1:8000/v1/variableById/metadata/42539
72 | http GET http://127.0.0.1:8000/v1/dataset/data/garden/owid/latest/covid/covid.csv
73 | http GET http://127.0.0.1:8000/v1/dataset/metadata/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp
74 | http GET http://127.0.0.1:8000/v1/dataset/data/backport/owid/latest/dataset_5576_ggdc_maddison__2020_10_01/dataset_5576_ggdc_maddison__2020_10_01.feather
75 | http POST http://127.0.0.1:8000/v1/sql sql=="PRAGMA show_tables;" type==csv
76 | http POST http://127.0.0.1:8000/v1/sql sql=="select * from garden__ggdc__2020_10_01__ggdc_maddison__maddison_gdp limit 10;" type==csv
77 | ```
78 |
79 | ## Tests
80 |
81 | Integration tests work with sample data saved in `tests/sample_duck.db`. Regenerate it with `make testdb`.
82 |
83 | ## Development
84 |
85 | For debugging, it is useful to recreate the sample DB and run the tests against it right away with
86 |
87 | ```
88 | make testdb && pytest -s tests/test_v1.py
89 | ```
90 |
91 | ## Full-text search
92 |
93 | - all variables are given the same weight; we should reconsider that
94 | - negation queries are not supported yet (could be useful for interactive exclusion of datasets)
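A sample search request in the same httpie style as the queries above (parameters follow `app/v1/search.py`; `term` is required, `limit` defaults to 10):

```
http GET http://127.0.0.1:8000/v1/search term=='life expectancy' limit==5
```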
95 |
96 | ## Future considerations
97 |
98 | This project was generated via [manage-fastapi](https://github.com/ycd/manage-fastapi/). We might re-generate the project with a [different template](https://fastapi.tiangolo.com/advanced/templates/) based on our production requirements.
99 |
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/app/__init__.py
--------------------------------------------------------------------------------
/app/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/app/core/__init__.py
--------------------------------------------------------------------------------
/app/core/config.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import List, Optional, Union
3 |
4 | from pydantic import AnyHttpUrl, BaseSettings, validator
5 |
6 |
7 | class Settings(BaseSettings):
8 | PROJECT_NAME: str
9 | BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = []
10 |
11 | DUCKDB_PATH: Path = Path("duck.db")
12 |
13 | DUCKDB_MEMORY_LIMIT = "2GB"
14 |
15 | BUGSNAG_API_KEY: Optional[str] = None
16 |
17 | OWID_CATALOG_DIR: Path
18 |
19 | @validator("BACKEND_CORS_ORIGINS", pre=True)
20 | def assemble_cors_origins(cls, v: Union[str, List[str]]) -> Union[List[str], str]:
21 | if isinstance(v, str) and not v.startswith("["):
22 | return [i.strip() for i in v.split(",")]
23 | elif isinstance(v, (list, str)):
24 | return v
25 | raise ValueError(v)
26 |
27 | class Config:
28 | case_sensitive = True
29 | env_file = ".env"
30 |
31 |
32 | settings = Settings()
33 |
--------------------------------------------------------------------------------
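A quick sketch of how the CORS setting above is parsed (assumes pydantic v1, which this `Settings` class targets; importing `app.core.config` also builds the module-level `settings`, so a populated `.env` copied from `.env.example` must be available):

```python
from app.core.config import Settings

# BACKEND_CORS_ORIGINS accepts either a JSON-style list (as in .env.example),
# which pydantic parses itself, or a plain comma-separated string, which
# `assemble_cors_origins` splits before each item is validated as AnyHttpUrl.
settings = Settings(
    PROJECT_NAME="data-api",
    OWID_CATALOG_DIR="/tmp/etl/data",  # hypothetical path, only for the example
    BACKEND_CORS_ORIGINS="http://localhost:8000, https://localhost:8000",
)
print(settings.BACKEND_CORS_ORIGINS)  # two validated URL entries
```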
/app/main.py:
--------------------------------------------------------------------------------
1 | import threading
2 |
3 | import bugsnag
4 | import structlog
5 | from bugsnag.asgi import BugsnagMiddleware
6 | from fastapi import FastAPI
7 | from fastapi.middleware.cors import CORSMiddleware
8 |
9 | from app.core.config import settings
10 | from app.v1 import v1
11 |
12 | log = structlog.get_logger()
13 |
14 | bugsnag.configure(
15 | api_key=settings.BUGSNAG_API_KEY,
16 | )
17 |
18 |
19 | def get_application():
20 | _app = FastAPI(title=settings.PROJECT_NAME)
21 |
22 | _app.add_middleware(
23 | CORSMiddleware,
24 | allow_origins=[str(origin) for origin in settings.BACKEND_CORS_ORIGINS],
25 | allow_credentials=True,
26 | allow_methods=["*"],
27 | allow_headers=["*"],
28 | )
29 |
30 | _app.add_middleware(
31 | BugsnagMiddleware,
32 | )
33 |
34 | return _app
35 |
36 |
37 | app = get_application()
38 |
39 | # mount subapplications as versions
40 | app.mount("/v1", v1)
41 |
42 |
43 | @app.get("/health")
44 | def health() -> dict:
45 | return {
46 | "status": "ok",
47 | "thread_id": str(threading.get_ident()),
48 | }
49 |
--------------------------------------------------------------------------------
/app/utils.py:
--------------------------------------------------------------------------------
1 | import functools
2 | from typing import Any
3 |
4 | import duckdb
5 | import orjson
6 | import pandas as pd
7 | import structlog
8 | from fastapi.responses import JSONResponse
9 |
10 | from app.core.config import settings
11 |
12 | log = structlog.get_logger()
13 |
14 |
15 | class ORJSONResponse(JSONResponse):
16 | """It serializes dataclass, datetime, numpy, and UUID instances natively."""
17 |
18 | media_type = "application/json"
19 |
20 | def render(self, content: Any) -> bytes:
21 | return orjson.dumps(content)
22 |
23 |
24 | @functools.cache
25 | def get_readonly_connection(thread_id: int) -> duckdb.DuckDBPyConnection:
26 | # duckdb connection is not threadsafe, we have to create one connection per thread
27 | log.info("duckdb.new_connection", thread_id=thread_id)
28 | con = duckdb.connect(
29 | database=settings.DUCKDB_PATH.as_posix(),
30 | read_only=True,
31 | config={"memory_limit": settings.DUCKDB_MEMORY_LIMIT},
32 | )
33 | return con
34 |
35 |
36 | def omit_nullable_values(d: dict) -> dict:
37 | return {k: v for k, v in d.items() if v is not None and not pd.isna(v)}
38 |
--------------------------------------------------------------------------------
/app/v1/__init__.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI
2 |
3 | from app import utils
4 |
5 | from .data import router as data_router
6 | from .lists import router as lists_router
7 | from .metadata import router as metadata_router
8 | from .search import router as search_router
9 |
10 | v1 = FastAPI(default_response_class=utils.ORJSONResponse)
11 |
12 | v1.include_router(metadata_router)
13 | v1.include_router(data_router)
14 | v1.include_router(search_router)
15 | v1.include_router(lists_router)
16 |
--------------------------------------------------------------------------------
/app/v1/data.py:
--------------------------------------------------------------------------------
1 | import io
2 | import threading
3 | from typing import Any, Literal, Optional, cast
4 |
5 | import pandas as pd
6 | import pyarrow as pa
7 | import structlog
8 | from fastapi import APIRouter, Header, HTTPException, Response
9 | from fastapi.responses import StreamingResponse
10 | from pyarrow.feather import write_feather
11 |
12 | from app import utils
13 | from app.main import settings
14 |
15 | from .schemas import VariableDataResponse
16 |
17 | log = structlog.get_logger()
18 |
19 |
20 | DATA_TYPES = Literal["csv", "feather", "feather_direct", "json"]
21 |
22 | router = APIRouter()
23 |
24 |
25 | def _read_sql_bytes(con, sql: str, parameters) -> io.BytesIO:
26 | """Execute SQL and return BytesIO object with byte data."""
27 | sink = io.BytesIO()
28 |
29 | batch_iterator = con.execute(sql, parameters=parameters).fetch_record_batch(
30 | chunk_size=1000
31 | )
32 | with pa.ipc.new_file(sink, batch_iterator.schema) as writer:
33 | for rb in batch_iterator:
34 | writer.write_batch(rb)
35 |
36 | sink.seek(0)
37 |
38 | return sink
39 |
40 |
41 | def _bytes_to_response(bytes_io: io.BytesIO) -> StreamingResponse:
42 | # NOTE: using raw `bytes_io` should be in theory faster than `iter([bytes_io.getvalue()])`, yet
43 | # it is much slower for unknown reasons
44 | # response = StreamingResponse(bytes_io, media_type="application/octet-stream")
45 | response = StreamingResponse(
46 | iter([bytes_io.getvalue()]), media_type="application/octet-stream"
47 | )
48 | response.headers["Content-Disposition"] = "attachment; filename=owid.feather"
49 | return response
50 |
51 |
52 | def _sql_to_response(
53 | con, sql: str, type: DATA_TYPES, parameters: list[Any] = []
54 | ) -> Any:
55 | # read data in feather format and return it directly in response
56 | # NOTE: should be the fastest in theory, but is really slow for unknown reasons
57 | if type == "feather_direct":
58 | # WARNING: this does not support categorical variables and raises `pyarrow.lib.ArrowTypeError`
59 | # when you try to read it with pandas (see https://github.com/duckdb/duckdb/issues/4130)
60 | # we'd have to either convert categoricals into strings or change format while still in arrow format
61 | bytes_io = _read_sql_bytes(con, sql, parameters=parameters)
62 | return _bytes_to_response(bytes_io)
63 |
64 | # read data into dataframe and then convert to feather
65 | elif type == "feather":
66 | bytes_io = io.BytesIO()
67 | df = con.execute(sql, parameters=parameters).fetch_df()
68 | write_feather(df, bytes_io)
69 | return _bytes_to_response(bytes_io)
70 |
71 | # read data into dataframe and then convert to csv
72 | elif type == "csv":
73 | df = con.execute(sql, parameters=parameters).fetch_df()
74 |
75 | str_stream = io.StringIO()
76 | df.to_csv(str_stream, index=False)
77 |
78 | return StreamingResponse(iter([str_stream.getvalue()]), media_type="text/csv")
79 |
80 | # read data into dataframe and then convert to json
81 | elif type == "json":
82 | # NOTE: we could also do this directly from pyarrow, but it is slower than to pandas
83 | # for some reason
84 | # return con.execute(sql, parameters=parameters).fetch_arrow_table().to_pydict()
85 |
86 | df = con.execute(sql, parameters=parameters).fetch_df()
87 |
88 | # TODO: converting to lists and then ormjson is slow, we could instead
89 | # convert to numpy arrays on which ormjson is super fast
90 | return df.to_dict(orient="list")
91 |
92 | else:
93 | raise HTTPException(status_code=400, detail=f"unknown type {type}")
94 |
95 |
96 | @router.post("/sql")
97 | def sql_query(sql: str, type: DATA_TYPES = "csv"):
98 | """Run arbitrary query on top of our database."""
99 | con = utils.get_readonly_connection(threading.get_ident())
100 | return _sql_to_response(con, sql, type)
101 |
102 |
103 | # QUESTION: how about /variable/{variable_id}/data?
104 | @router.get(
105 | "/variableById/data/{variable_id}",
106 | response_model=VariableDataResponse,
107 | response_model_exclude_unset=True,
108 | )
109 | def data_for_backported_variable(
110 | response: Response,
111 | variable_id: int,
112 | limit: Optional[int] = None,
113 | if_none_match: Optional[str] = Header(default=None),
114 | ):
115 | """Fetch data for a single variable."""
116 |
117 | con = utils.get_readonly_connection(threading.get_ident())
118 |
119 | # get meta about variable
120 | q = """
121 | select
122 | v.variable_id as variable_id,
123 | v.short_name as short_name,
124 | v.table_path as table_path,
125 | d.checksum as checksum
126 | from meta_variables v
127 | join meta_tables t on v.table_path = t.path
128 | join meta_datasets d on d.path = t.dataset_path
129 | where variable_id = (?)
130 | """
131 | df = cast(pd.DataFrame, con.execute(q, parameters=[variable_id]).fetch_df())
132 | _assert_single_variable(df.shape[0], variable_id)
133 | r = dict(df.iloc[0])
134 |
135 | # this is the dataset level checksum which is the best we have
136 | # at the moment
137 | checksum = r["checksum"]
138 |
139 | # if the client sent a IF-NONE-MATCH header, check if it matches the checksum
140 | if if_none_match == checksum:
141 | response.status_code = 304
142 | return
143 |
144 | # Send the checksum as the etag header and set cache-control to cache with
145 | # max-age of 0 (which makes the client validate with the if-none-match header)
146 | response.headers["ETag"] = checksum
147 | response.headers[
148 | "Cache-Control"
149 | ] = "max-age=0" # We could consider allowing a certain time window
150 |
151 | parquet_path = (settings.OWID_CATALOG_DIR / r["table_path"]).with_suffix(".parquet")
152 |
153 | # TODO: DuckDB / SQLite doesn't allow parameterized table or column names, how do we escape it properly?
154 | # is it even needed if we get them from our DB and it is read-only?
155 | q = f"""
156 | select
157 | year as years,
158 | entity_name as entity_names,
159 | entity_id as entities,
160 | entity_code as entity_codes,
161 | {r["short_name"]} as values
162 | from read_parquet('{parquet_path}')
163 | where {r["short_name"]} is not null
164 | """
165 | parameters = []
166 | if limit:
167 | q += "limit (?)"
168 | parameters.append(limit)
169 | df = cast(pd.DataFrame, con.execute(q, parameters=parameters).fetch_df())
170 |
171 | return df.to_dict(orient="list")
172 |
173 |
174 | # NOTE: it might be more intuitive to have paths like this
175 | # /dataset/{channel}/{namespace}/{version}/{dataset}/{table}/data.{type}
176 | # and
177 | # /dataset/{channel}/{namespace}/{version}/{dataset}/{table}/metadata
178 | # especially for browsing catalog tree in `lists.py`
179 | @router.get(
180 | "/dataset/data/{channel}/{namespace}/{version}/{dataset}/{table}.{type}",
181 | )
182 | def data_for_etl_table(
183 | channel: str,
184 | namespace: str,
185 | version: str,
186 | dataset: str,
187 | table: str,
188 | columns: str = "*",
189 | limit: int = 1000000000,
190 | type: DATA_TYPES = "csv",
191 | ):
192 | """Fetch data for a table."""
193 |
194 | con = utils.get_readonly_connection(threading.get_ident())
195 | table_path = (
196 | settings.OWID_CATALOG_DIR
197 | / f"{channel}/{namespace}/{version}/{dataset}/{table}.parquet"
198 | )
199 |
200 | sql = f"""
201 | select
202 | {columns}
203 | from read_parquet('{table_path}')
204 | limit (?)
205 | """
206 |
207 | con = utils.get_readonly_connection(threading.get_ident())
208 | return _sql_to_response(con, sql, type, [limit])
209 |
210 |
211 | def _assert_single_variable(n, variable_id):
212 | if n == 0:
213 | raise HTTPException(
214 | status_code=404, detail=f"variable_id {variable_id} not found"
215 | )
216 | elif n > 1:
217 | # raise internal error
218 | raise Exception(
219 | f"multiple variables found for variable_id {variable_id}, this should not happen"
220 | )
221 |
--------------------------------------------------------------------------------
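The ETag handling above can be exercised with httpie (variable id 42539 comes from the README's sample queries; the actual checksum value depends on your local `duck.db`):

```
# first response carries the dataset checksum in the ETag header
http -h GET http://127.0.0.1:8000/v1/variableById/data/42539
# replaying it as If-None-Match yields `304 Not Modified` with an empty body
http GET http://127.0.0.1:8000/v1/variableById/data/42539 If-None-Match:<etag-from-previous-response>
```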
/app/v1/lists.py:
--------------------------------------------------------------------------------
1 | import threading
2 |
3 | from fastapi import APIRouter
4 |
5 | from app import utils
6 |
7 |
8 | router = APIRouter()
9 |
10 |
11 | @router.get(
12 | "/datasets",
13 | )
14 | def list_all_datasets():
15 | con = utils.get_readonly_connection(threading.get_ident())
16 | sql = """
17 | select title from meta_datasets
18 | """
19 | df = con.execute(sql).fetch_df()
20 | return {"datasets": list(df.title)}
21 |
22 |
23 | @router.get(
24 | "/dataset/data",
25 | )
26 | def list_channels():
27 | """List all available channels."""
28 |
29 | con = utils.get_readonly_connection(threading.get_ident())
30 | sql = """
31 | select distinct channel from meta_tables
32 | """
33 | df = con.execute(sql).fetch_df()
34 | return {"channels": list(df.channel)}
35 |
36 |
37 | @router.get(
38 | "/dataset/data/{channel}",
39 | )
40 | def list_namespaces(channel: str):
41 | """List all available namespaces."""
42 |
43 | con = utils.get_readonly_connection(threading.get_ident())
44 | sql = """
45 | select distinct namespace from meta_tables
46 | where channel = (?)
47 | """
48 | df = con.execute(sql, parameters=[channel]).fetch_df()
49 | return {"namespaces": list(df.namespace)}
50 |
51 |
52 | @router.get(
53 | "/dataset/data/{channel}/{namespace}",
54 | )
55 | def list_versions(channel: str, namespace: str):
56 | """List all available versions."""
57 |
58 | con = utils.get_readonly_connection(threading.get_ident())
59 | sql = """
60 | select distinct version from meta_tables
61 | where channel = (?) and namespace = (?)
62 | """
63 | df = con.execute(sql, parameters=[channel, namespace]).fetch_df()
64 | return {"versions": list(df.version)}
65 |
66 |
67 | @router.get(
68 | "/dataset/data/{channel}/{namespace}/{version}",
69 | )
70 | def list_datasets(channel: str, namespace: str, version: str):
71 | """List all available datasets."""
72 |
73 | con = utils.get_readonly_connection(threading.get_ident())
74 | sql = """
75 | select distinct dataset_name from meta_tables
76 | where channel = (?) and namespace = (?) and version = (?)
77 | """
78 | df = con.execute(sql, parameters=[channel, namespace, version]).fetch_df()
79 | return {"datasets": list(df.dataset_name)}
80 |
81 |
82 | @router.get(
83 | "/dataset/data/{channel}/{namespace}/{version}/{dataset}",
84 | )
85 | def list_tables(channel: str, namespace: str, version: str, dataset: str):
86 | """List all available tables."""
87 |
88 | con = utils.get_readonly_connection(threading.get_ident())
89 | sql = """
90 | select distinct table_name from meta_tables
91 | where channel = (?) and namespace = (?) and version = (?) and dataset_name = (?)
92 | """
93 | df = con.execute(sql, parameters=[channel, namespace, version, dataset]).fetch_df()
94 | return {"tables": list(df.table_name)}
95 |
--------------------------------------------------------------------------------
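These routes form a browsable catalog tree, e.g. with httpie (the listed values depend on what you have crawled into `duck.db`):

```
http GET http://127.0.0.1:8000/v1/dataset/data                         # channels
http GET http://127.0.0.1:8000/v1/dataset/data/garden                  # namespaces in `garden`
http GET http://127.0.0.1:8000/v1/dataset/data/garden/ggdc             # versions
http GET http://127.0.0.1:8000/v1/dataset/data/garden/ggdc/2020-10-01  # datasets
```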
/app/v1/metadata.py:
--------------------------------------------------------------------------------
1 | import json
2 | import threading
3 | from typing import Any, Dict, Optional, cast
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import structlog
8 | from fastapi import APIRouter, Header, HTTPException, Response
9 |
10 | from app import utils
11 |
12 | from .schemas import (
13 | Dimension,
14 | DimensionProperties,
15 | VariableMetadataResponse,
16 | VariableSource,
17 | )
18 |
19 | log = structlog.get_logger()
20 |
21 |
22 | router = APIRouter()
23 |
24 | # NOTE: duckdb also supports python relations, would it be helpful?
25 | # https://github.com/duckdb/duckdb/blob/master/examples/python/duckdb-python.py
26 |
27 |
28 | @router.get(
29 | "/dataset/metadata/{channel}/{namespace}/{version}/{dataset}/{table}",
30 | # response_model=VariableMetadataResponse,
31 | # response_model_exclude_unset=True,
32 | )
33 | def metadata_for_etl_variable(
34 | channel: str,
35 | namespace: str,
36 | version: str,
37 | dataset: str,
38 | table: str,
39 | ):
40 | table_path = f"{channel}/{namespace}/{version}/{dataset}/{table}"
41 |
42 | con = utils.get_readonly_connection(threading.get_ident())
43 |
44 | vf = _metadata_etl_variables(con, table_path)
45 | tf = _metadata_etl_table(con, table_path)
46 | df = _metadata_etl_dataset(con, channel, namespace, version, dataset)
47 |
48 | if df.empty:
49 | raise HTTPException(status_code=404, detail=f"table `{table_path}` not found")
50 |
51 | return {
52 | "dataset": df.iloc[0].to_dict(),
53 | "table": tf.iloc[0].to_dict(),
54 | "variables": vf.to_dict(orient="records"),
55 | }
56 |
57 |
58 | # QUESTION: how about `/variable/{variable_id}/metadata` naming?
59 | @router.get(
60 | "/variableById/metadata/{variable_id}",
61 | response_model=VariableMetadataResponse,
62 | response_model_exclude_unset=True,
63 | )
64 | def metadata_for_backported_variable(
65 | response: Response,
66 | variable_id: int,
67 | if_none_match: Optional[str] = Header(default=None),
68 | ):
69 | """Fetch metadata for a single variable from database.
70 | This function is identical to Variables.getVariableData in owid-grapher repository
71 | """
72 | q = """
73 | SELECT
74 | -- variables
75 | v.grapher_meta->>'$.name' as name,
76 | v.grapher_meta->>'$.unit' as unit,
77 | v.grapher_meta->>'$.description' as description,
78 | v.grapher_meta->>'$.createdAt' as createdAt,
79 | v.grapher_meta->>'$.updatedAt' as updatedAt,
80 | v.grapher_meta->>'$.code' as code,
81 | v.grapher_meta->>'$.coverage' as coverage,
82 | v.grapher_meta->>'$.timespan' as timespan,
83 | (v.grapher_meta->>'$.datasetId')::integer as datasetId,
84 | (v.grapher_meta->>'$.sourceId')::integer as sourceId,
85 | v.grapher_meta->>'$.shortUnit' as shortUnit,
86 | v.grapher_meta->>'$.display' as display,
87 | (v.grapher_meta->>'$.columnOrder')::integer as columnOrder,
88 | v.grapher_meta->>'$.originalMetadata' as originalMetadata,
89 | v.grapher_meta->>'$.grapherConfig' as grapherConfig,
90 | -- dataset
91 | d.grapher_meta->>'$.name' as datasetName,
92 | IF(d.grapher_meta->>'$.nonRedistributable' = 'true', true, false) as nonRedistributable,
93 | -- there should be always only one source for variable
94 | -- this is inverse of `convert_grapher_source`
95 | v.sources->>'$[0].name' as sourceName,
96 | v.sources->>'$[0].description' as sourceAdditionalInfo,
97 | v.sources->>'$[0].date_accessed' as sourceRetrievedDate,
98 | v.sources->>'$[0].url' as sourceLink,
99 | v.sources->>'$[0].publisher_source' as sourceDataPublisherSource,
100 | v.sources->>'$[0].published_by' as sourceDataPublishedBy,
101 | d.checksum as checksum,
102 | FROM meta_variables as v
103 | JOIN meta_datasets as d ON d.short_name = v.dataset_short_name
104 | join meta_tables t on v.table_path = t.path
105 | WHERE v.variable_id = (?)
106 | """
107 | con = utils.get_readonly_connection(threading.get_ident())
108 |
109 | # TODO: this is a hacky and slow way to do it, use ORM or proper dataclass instead
110 | df = cast(pd.DataFrame, con.execute(q, parameters=[variable_id]).fetch_df())
111 |
112 | if df.empty:
113 | raise HTTPException(
114 | status_code=404, detail=f"variableId `{variable_id}` not found"
115 | )
116 |
117 | # null values in JSON string functions end up as "null" string, fix that
118 | df = df.replace("null", np.nan)
119 | row = df.iloc[0].to_dict()
120 |
121 | source = VariableSource(
122 | id=row.pop("sourceId"),
123 | name=row.pop("sourceName"),
124 | dataPublishedBy=row.pop("sourceDataPublishedBy", ""),
125 | dataPublisherSource=row.pop("sourceDataPublisherSource", ""),
126 | link=row.pop("sourceLink", ""),
127 | retrievedDate=row.pop("sourceRetrievedDate", ""),
128 | additionalInfo=row.pop("sourceAdditionalInfo", ""),
129 | )
130 |
131 | nonRedistributable = row.pop("nonRedistributable")
132 | displayJson = row.pop("display")
133 |
134 | # this is the dataset level checksum which is the best we have
135 | # at the moment
136 | checksum = row.pop("checksum")
137 |
138 | variable = utils.omit_nullable_values(row)
139 |
140 | # if the client sent a IF-NONE-MATCH header, check if it matches the checksum
141 | if if_none_match == checksum:
142 | response.status_code = 304
143 | return
144 |
145 | # Send the checksum as the etag header and set cache-control to cache with
146 | # max-age of 0 (which makes the client validate with the if-none-match header)
147 | response.headers["ETag"] = checksum
148 | response.headers[
149 | "Cache-Control"
150 | ] = "max-age=0" # We could consider allowing a certain time window
151 |
152 | # get variable types from duckdb (all metadata would be eventually retrieved in duckdb)
153 | # NOTE: getting these is a bit of a pain, we have a lot of duplicate information
154 | # in our DB
155 | q = """
156 | select
157 | v.variable_type,
158 | t.dimension_values
159 | from meta_variables as v
160 | join meta_tables as t on t.path = v.table_path
161 | where variable_id = (?)
162 | """
163 | variable_type, dimension_values = con.execute(q, parameters=[variable_id]).fetchone() # type: ignore
164 |
165 | dimensions = _parse_dimension_values(json.loads(dimension_values))
166 |
167 | return VariableMetadataResponse(
168 | nonRedistributable=bool(nonRedistributable),
169 | display=json.loads(displayJson),
170 | source=source,
171 | type=variable_type,
172 | dimensions=dimensions,
173 | **variable,
174 | )
175 |
176 |
177 | def _parse_dimension_values(dimension_values: Any) -> Dict[str, Dimension]:
178 | dimensions = {}
179 |
180 | # NOTE: we have inconsistency with plurals - even though the dimension name is
181 | # singular, we use plural in the API (but not for custom dimensions)
182 | if "year" in dimension_values:
183 | dimensions["years"] = Dimension(
184 | type="int",
185 | values=[DimensionProperties(id=y) for y in dimension_values.pop("year")],
186 | )
187 |
188 | if "entity_zip" in dimension_values:
189 | dimensions["entities"] = Dimension(
190 | type="int",
191 | values=[
192 | DimensionProperties(id=int(e[0]), name=e[1], code=e[2])
193 | for e in map(lambda x: x.split("|"), dimension_values.pop("entity_zip"))
194 | ],
195 | )
196 |
197 | assert not dimension_values, (
198 | "This currently works only for backported datasets with dimensions "
199 | '{"year", "entity_id", "entity_name", "entity_code"}'
200 | )
201 |
202 | return dimensions
203 |
204 |
205 | def _metadata_etl_variables(con, table_path):
206 | q = """
207 | SELECT
208 | -- variables (commented columns are not relevant for ETL tables)
209 | v.title,
210 | v.description,
211 | v.licenses,
212 | v.sources,
213 | v.unit,
214 | v.short_unit,
215 | -- conversion factor from display is needed for CO2 datasets, but honestly it would be
216 | -- better to hide it or do the calculation implicitly
217 | v.display,
218 | -- v.grapher_meta,
219 | -- v.variable_id,
220 | v.short_name,
221 | v.table_path,
222 | v.dataset_short_name,
223 | v.variable_type,
224 | -- TODO: should we include `dimension_values` in response or do we only need it for backported variables?
225 | -- v.dimension_values,
226 | FROM meta_variables as v
227 | WHERE v.table_path = (?)
228 | """
229 |
230 | # TODO: this is a hacky and slow way to do it, use ORM or proper dataclass instead
231 | vf = cast(pd.DataFrame, con.execute(q, parameters=[table_path]).fetch_df())
232 |
233 | # convert JSON to dict (should be done automatically once we switch to ORM)
234 | for col in ("licenses", "sources", "display"):
235 | vf[col] = vf[col].apply(json.loads)
236 | return vf
237 |
238 |
239 | def _metadata_etl_table(con, table_path):
240 | q = """
241 | SELECT
242 | table_name,
243 | dataset_name,
244 | version,
245 | namespace,
246 | channel,
247 | dimensions,
248 | path,
249 | format,
250 | is_public,
251 | FROM meta_tables as t
252 | WHERE path = (?)
253 | """
254 |
255 | # TODO: this is a hacky and slow way to do it, use ORM or proper dataclass instead
256 | tf = cast(pd.DataFrame, con.execute(q, parameters=[table_path]).fetch_df())
257 |
258 | for col in ("dimensions",):
259 | tf[col] = tf[col].apply(json.loads)
260 | return tf
261 |
262 |
263 | def _metadata_etl_dataset(con, channel, namespace, version, dataset):
264 | q = """
265 | SELECT
266 | channel,
267 | namespace,
268 | short_name,
269 | title,
270 | description,
271 | sources,
272 | licenses,
273 | is_public,
274 | checksum,
275 | version,
276 | -- grapher_meta
277 | FROM meta_datasets as d
278 | -- TODO: we might want to use path instead of separate columns
279 | WHERE channel = (?) and namespace = (?) and version = (?) and short_name = (?)
280 | """
281 |
282 | df = cast(
283 | pd.DataFrame,
284 | con.execute(
285 | q,
286 | parameters=[
287 | channel,
288 | namespace,
289 | version,
290 | dataset,
291 | ],
292 | ).fetch_df(),
293 | )
294 |
295 | for col in ("sources", "licenses"):
296 | df[col] = df[col].apply(json.loads)
297 |
298 | return df
299 |
--------------------------------------------------------------------------------
/app/v1/schemas.py:
--------------------------------------------------------------------------------
1 | import datetime as dt
2 | from typing import Any, Dict, List, Optional
3 |
4 | from pydantic import BaseModel, Extra
5 |
6 |
7 | class VariableDataResponse(BaseModel):
8 | years: List[int]
9 | entity_names: List[str]
10 | entities: List[int]
11 | entity_codes: List[str]
12 | values: List[Any]
13 |
14 | class Config:
15 | extra = Extra.forbid
16 |
17 |
18 | class VariableDisplayDataTableConfig(BaseModel):
19 | hideAbsoluteChange: Optional[bool]
20 | hideRelativeChange: Optional[bool]
21 |
22 |
23 | class VariableDisplay(BaseModel):
24 | name: Optional[str]
25 | unit: Optional[str]
26 | shortUnit: Optional[str]
27 | isProjection: Optional[bool]
28 | includeInTable: Optional[bool]
29 | conversionFactor: Optional[float]
30 | numDecimalPlaces: Optional[int]
31 | tolerance: Optional[float]
32 | yearIsDay: Optional[bool]
33 | zeroDay: Optional[str]
34 | entityAnnotationsMap: Optional[str]
35 | tableDisplay: Optional[VariableDisplayDataTableConfig]
36 | color: Optional[str]
37 |
38 | class Config:
39 | extra = Extra.forbid
40 |
41 |
42 | class VariableSource(BaseModel):
43 | id: int
44 | name: str
45 | dataPublishedBy: str
46 | dataPublisherSource: str
47 | link: str
48 | retrievedDate: str
49 | additionalInfo: str
50 |
51 | class Config:
52 | extra = Extra.forbid
53 |
54 |
55 | class DimensionProperties(BaseModel):
56 | id: int
57 | name: Optional[str] = None
58 | code: Optional[str] = None
59 |
60 | class Config:
61 | extra = Extra.forbid
62 |
63 |
64 | class Dimension(BaseModel):
65 | type: str
66 | values: List[DimensionProperties]
67 |
68 | class Config:
69 | extra = Extra.forbid
70 |
71 |
72 | class VariableMetadataResponse(BaseModel):
73 | name: str
74 | unit: str
75 | shortUnit: Optional[str]
76 | code: Optional[str]
77 | description: Optional[str]
78 | createdAt: dt.datetime
79 | updatedAt: dt.datetime
80 | coverage: str
81 | timespan: str
82 | datasetId: int
83 | columnOrder: int
84 | datasetName: str
85 | nonRedistributable: bool
86 | display: VariableDisplay
87 | originalMetadata: Optional[str]
88 | grapherConfig: Optional[str]
89 | # MAYBE CHANGE - this should be turned into an array
90 | source: VariableSource
91 | type: str
92 | dimensions: Dict[str, Dimension]
93 |
94 | class Config:
95 | extra = Extra.forbid
96 |
97 |
98 | class SearchResponse(BaseModel):
99 | variable_name: str
100 | variable_title: str
101 | variable_description: str
102 | variable_unit: str
103 | table_name: str
104 | dataset_title: str
105 | channel: str
106 | metadata_url: str
107 | data_url: str
108 | match: float
109 |
110 | class Config:
111 | extra = Extra.forbid
112 |
113 |
114 | class SearchResponseList(BaseModel):
115 |
116 | results: List[SearchResponse]
117 |
118 | class Config:
119 | extra = Extra.forbid
120 |
--------------------------------------------------------------------------------
/app/v1/search.py:
--------------------------------------------------------------------------------
1 | import threading
2 | from enum import Enum
3 | from typing import Optional
4 |
5 | import structlog
6 | from fastapi import APIRouter, Query
7 |
8 | from app import utils
9 |
10 | from .schemas import SearchResponseList
11 |
12 | log = structlog.get_logger()
13 |
14 |
15 | router = APIRouter()
16 |
17 |
18 | class SearchType(str, Enum):
19 | table = "meta_tables"
20 | variable = "meta_variables"
21 | dataset = "meta_datasets"
22 |
23 |
24 | @router.get(
25 | "/search",
26 | response_model=SearchResponseList,
27 | response_model_exclude_unset=True,
28 | )
29 | def search(
30 | term: str,
31 | channels: Optional[list[str]] = Query(default=None),
32 | type: SearchType = SearchType.variable,
33 | limit: int = 10,
34 | ):
35 | con = utils.get_readonly_connection(threading.get_ident())
36 |
37 | # TODO: implement search on other tables too? not sure whether we'll need it yet
38 | if type != SearchType.variable:
39 | raise NotImplementedError(
40 | f"Invalid search type {type}, only searching variables is currently supported"
41 | )
42 |
43 | if channels:
44 | # `parameters` do not support lists (maybe `multiple_parameter_sets` would do?)
45 | channels_str = ",".join([f"'{c}'" for c in channels])
46 | where = f"and d.channel in ({channels_str})"
47 | else:
48 | where = ""
49 |
50 | # sample search
51 | q = f"""
52 | SELECT
53 | v.short_name as variable_name,
54 | v.title as variable_title,
55 | v.unit as variable_unit,
56 | v.description as variable_description,
57 | t.table_name,
58 | t.path as table_path,
59 | d.title as dataset_title,
60 | d.channel as channel,
61 | fts_main_meta_variables.match_bm25(v.path, ?) AS match
62 | FROM meta_variables as v
63 | JOIN meta_tables as t ON t.path = v.table_path
64 | JOIN meta_datasets as d ON d.path = t.dataset_path
65 | where match is not null
66 | {where}
67 | order by match desc
68 | limit (?)
69 | """
70 | matches = con.execute(q, parameters=[term, limit]).fetch_df()
71 |
72 | matches["metadata_url"] = "/v1/dataset/metadata/" + matches["table_path"]
73 | matches["data_url"] = "/v1/dataset/data/" + matches["table_path"]
74 |
75 | matches = matches.drop(columns=["table_path"])
76 |
77 | return {"results": matches.to_dict(orient="records")}
78 |
--------------------------------------------------------------------------------
/crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/crawler/__init__.py
--------------------------------------------------------------------------------
/crawler/crawl.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from collections.abc import Generator
4 | from contextlib import contextmanager
5 | from pathlib import Path
6 | from typing import Any, Optional, Set, Tuple, cast
7 |
8 | import pandas as pd
9 | import pyarrow.parquet as pq
10 | import structlog
11 | import typer
12 | from owid.catalog import RemoteCatalog, TableMeta, VariableMeta
13 | from owid.catalog.catalogs import CatalogFrame, CatalogSeries
14 | from sqlalchemy.engine import Engine
15 | from sqlalchemy.orm.session import Session
16 |
17 | from crawler.duckdb_models import (
18 | MetaDatasetModel,
19 | MetaTableModel,
20 | MetaVariableModel,
21 | db_init,
22 | )
23 | from crawler.full_text_index import main as create_full_text_index
24 |
25 | log = structlog.get_logger()
26 |
27 |
28 | # duckdb does not support NaN in categories, use a special symbol instead
29 | CATEGORY_NAN = "-"
30 |
31 |
32 | def _load_catalog_frame(channels=()) -> CatalogFrame:
33 | frame = RemoteCatalog(channels=channels).frame
34 |
35 | # only public data
36 | frame = frame.loc[frame["is_public"]]
37 |
38 | # add dataset path
39 | frame["dataset_path"] = frame.path.map(os.path.dirname)
40 |
41 | # TODO: exclude large datasets (we need to improve their performance)
42 | frame = frame[~frame.path.str.contains("garden/faostat/2022-05-17")]
43 | # frame = frame[~frame.path.str.contains("garden/un_sdg/2022-07-07/un_sdg")]
44 |
45 | # TODO: exclude datasets with missing versions
46 | frame = frame[~frame.path.str.contains("garden/faostat/2021-04-09")]
47 | frame = frame[~frame.path.str.contains("garden/owid/latest/key_indicators")]
48 | frame = frame[~frame.path.str.contains("garden/owid/latest/population_density")]
49 | frame = frame[
50 | ~frame.path.str.contains("garden/sdg/latest/sdg/sustainable_development_goal")
51 | ]
52 | frame = frame[~frame.path.str.contains("garden/worldbank_wdi/2022-05-26/wdi/wdi")]
53 |
54 | # TODO: weird error
55 | frame = frame[
56 | ~frame.path.str.contains("garden/shift/2022-07-18/fossil_fuel_production")
57 | ]
58 |
59 | # TODO: exclude special datasets for now
60 | frame = frame[~frame.path.str.contains("garden/reference/")]
61 |
62 | return frame
63 |
64 |
65 | def _variable_types(con, parquet_path) -> dict:
66 | q = f"""
67 | select
68 | name,
69 | type
70 | from parquet_schema('{parquet_path}')
71 | """
72 | mf = pd.read_sql(q, con)
73 | return mf.set_index("name")["type"].to_dict()
74 |
75 |
76 | def _dataset_sync_actions(
77 | engine: Engine, ds_path_to_checksum: dict[str, str]
78 | ) -> Tuple[Set[str], Set[str]]:
79 | q = """
80 | select
81 | path,
82 | checksum
83 | from meta_datasets
84 | """
85 | try:
86 | df = pd.read_sql(q, engine)
87 | except RuntimeError as e:
88 | if e.args[0].startswith(
89 | "Catalog Error: Table with name meta_datasets does not exist"
90 | ):
91 | df = pd.DataFrame(columns=["path", "checksum"])
92 | else:
93 | raise e
94 |
95 | # compute ids consisting of checksum and table name to know which ones to delete
96 | db_ids = {r.path: r.checksum for r in df.itertuples()}
97 |
98 | dataset_paths_to_delete = {
99 | path
100 | for path, checksum in db_ids.items()
101 | if checksum != ds_path_to_checksum.get(path)
102 | }
103 | dataset_paths_to_create = {
104 | path
105 | for path, checksum in ds_path_to_checksum.items()
106 | if checksum != db_ids.get(path)
107 | }
108 |
109 | return dataset_paths_to_delete, dataset_paths_to_create
110 |
111 |
112 | def _parse_meta_variable(
113 | var_meta: VariableMeta,
114 | m: MetaTableModel,
115 | short_name: str,
116 | variable_type: str,
117 | dataset_short_name: str,
118 | dataset_path: str,
119 | ) -> MetaVariableModel:
120 | # sometimes `unit` is missing, but there is display.unit
121 | if (var_meta.unit == "") or pd.isnull(var_meta.unit):
122 | var_meta.unit = (var_meta.display or {}).get("unit")
123 |
124 | # if there is backported variable in non-backported dataset, remove its grapher
125 | # metadata to make sure we don't have duplicate variable ids in DB
126 | channel = dataset_path.split("/")[0]
127 | if channel != "backport" and var_meta.additional_info:
128 | var_meta.additional_info.pop("grapher_meta", None)
129 |
130 | return MetaVariableModel(
131 | title=var_meta.title,
132 | description=var_meta.description,
133 | licenses=[license.to_dict() for license in var_meta.licenses],
134 | sources=[source.to_dict() for source in var_meta.sources],
135 | unit=var_meta.unit,
136 | short_unit=var_meta.short_unit,
137 | display=var_meta.display,
138 | grapher_meta=var_meta.additional_info["grapher_meta"]
139 | if var_meta.additional_info
140 | else None,
141 | variable_id=var_meta.additional_info["grapher_meta"]["id"]
142 | if var_meta.additional_info
143 | else None,
144 | short_name=short_name,
145 | table_path=m.path,
146 | variable_type=variable_type,
147 | dataset_short_name=dataset_short_name,
148 | dataset_path=dataset_path,
149 | )
150 |
151 |
152 | def _delete_dataset(path: str, session: Session) -> None:
153 | session.query(MetaDatasetModel).filter_by(path=path).delete()
154 | session.query(MetaTableModel).filter_by(dataset_path=path).delete()
155 | session.query(MetaVariableModel).filter_by(dataset_path=path).delete()
156 |
157 |
158 | def _datasets_updates(
159 | engine: Engine, frame: CatalogFrame, force: bool, include: Optional[str]
160 | ) -> Tuple[Set[str], Set[str]]:
161 | # dataset path to checksum from frame
162 | ds_path_to_checksum = {r.dataset_path: r.checksum for r in frame.itertuples()}
163 |
164 | if force:
165 | dataset_paths_to_delete = dataset_paths_to_create = set(
166 | ds_path_to_checksum.keys()
167 | )
168 | else:
169 | # which tables to delete and which to create
170 | dataset_paths_to_delete, dataset_paths_to_create = _dataset_sync_actions(
171 | engine, ds_path_to_checksum
172 | )
173 |
174 | # if using specific include pattern, don't delete any other datasets
175 | if include:
176 | dataset_paths_to_delete = dataset_paths_to_delete & dataset_paths_to_create
177 |
178 | return dataset_paths_to_delete, dataset_paths_to_create
179 |
180 |
181 | def _extract_dimension_values(
182 | parquet_path: str, dims_to_process: Set[str], engine
183 | ) -> dict[str, Any]:
184 | dimension_values = {}
185 |
186 | if not dims_to_process:
187 | return {}
188 |
189 | # entities belong together and has to be stored as tuple `entity_id|entity_name|entity_code`
190 | # NOTE: this might be generalized to any column name with `_id`, `_name`, `_code` suffix
191 | if {"entity_id", "entity_name", "entity_code"} <= dims_to_process:
192 | dims_to_process = dims_to_process - {"entity_id", "entity_name", "entity_code"}
193 |
194 | q = f"""
195 | select distinct
196 | entity_id, entity_name, entity_code
197 | from read_parquet('{parquet_path}')
198 | """
199 | df = pd.read_sql(q, engine)
200 |
201 | index_vals = sorted(
202 | set(
203 | zip(
204 | df["entity_id"],
205 | df["entity_name"],
206 | df["entity_code"],
207 | )
208 | )
209 | )
210 |
211 | dimension_values = {
212 | "entity_zip": sorted(["|".join(map(str, x)) for x in index_vals]),
213 | }
214 |
215 | for dim in dims_to_process:
216 | q = f"""
217 | select distinct {dim}
218 | from read_parquet('{parquet_path}')
219 | """
220 | df = pd.read_sql(q, engine)
221 |
222 | dimension_values[dim] = sorted(set(df[dim].dropna()))
223 |
224 | return dimension_values
225 |
226 |
227 | def _read_parquet_metadata(
228 | parquet_path: Path,
229 | ) -> tuple[TableMeta, dict[str, VariableMeta]]:
230 | meta = pq.read_metadata(parquet_path)
231 | table_meta = TableMeta.from_json(meta.metadata[b"owid_table"]) # type: ignore
232 |
233 | owid_fields = json.loads(meta.metadata[b"owid_fields"])
234 | fields_meta = {f: VariableMeta.from_dict(v) for f, v in owid_fields.items()}
235 |
236 | return table_meta, fields_meta
237 |
238 |
239 | def main(
240 | duckdb_path: Path = Path("duck.db"),
241 | owid_catalog_dir: Path = Path("../etl/data"),
242 | include: Optional[str] = typer.Option(
243 | None, help="Include datasets matching this regex"
244 | ),
245 | force: bool = False,
246 | full_text_search: bool = True,
247 | ) -> None:
248 | """Bake ETL catalog into DuckDB."""
249 | engine = db_init(duckdb_path)
250 |
251 | frame = _load_catalog_frame(channels=("backport", "garden"))
252 |
253 | if include:
254 | frame = frame.loc[frame.dataset_path.str.contains(include)]
255 |
256 | dataset_paths_to_delete, dataset_paths_to_create = _datasets_updates(
257 | engine, frame, force, include
258 | )
259 | log.info(
260 | "duckdb.actions",
261 | delete_datasets=len(dataset_paths_to_delete),
262 | create_datasets=len(dataset_paths_to_create),
263 | )
264 |
265 | frame = frame.loc[frame.dataset_path.isin(dataset_paths_to_create)]
266 |
267 | for i, (dataset_path, dataset_frame) in enumerate(frame.groupby("dataset_path")):
268 | log.info(
269 | "dataset.create",
270 | path=dataset_path,
271 | progress=f"{i + 1}/{len(frame)}",
272 | )
273 | if dataset_path in dataset_paths_to_delete:
274 | # delete everything related to a dataset before recreating them
275 | with new_session(engine) as session:
276 | _delete_dataset(dataset_path, session)
277 | dataset_paths_to_delete.remove(dataset_path)
278 |
279 | # NOTE: we need to grab from the first table we load, only insert the dataset
280 | # when we process the first table
281 | dataset_inserted = False
282 |
283 | for i, (_, catalog_row) in enumerate(dataset_frame.iterrows()):
284 |
285 | catalog_row = cast(CatalogSeries, catalog_row)
286 |
287 | log.info(
288 | "table.read_parquet_metadata",
289 | path=catalog_row.path,
290 | )
291 |
292 | parquet_path = (owid_catalog_dir / catalog_row.path).with_suffix(".parquet")
293 |
294 | table_meta, fields_meta = _read_parquet_metadata(parquet_path)
295 |
296 | log.info(
297 | "table.extract_dimension_values",
298 | path=catalog_row.path,
299 | )
300 |
301 | # NOTE: this requires reading parquet file, which could be slow. We could instead write
302 | # dimensions values to metadata when generating the parquet file.
303 | dimension_values = _extract_dimension_values(
304 | parquet_path, set(catalog_row.dimensions), engine
305 | )
306 |
307 | t = MetaTableModel.from_CatalogSeries(catalog_row, dimension_values)
308 |
309 | log.info(
310 | "table.create",
311 | path=t.path,
312 | )
313 |
314 | with new_session(engine) as session:
315 | # save dataset metadata alongside table, we could also create a separate table for datasets
316 | ds = table_meta.dataset
317 | assert ds is not None
318 |
319 | # exceptions for backported channel
320 | if catalog_row.channel == "backport":
321 | # backported datasets are missing version
322 | ds.version = "latest"
323 | # all backported datasets are currently saved under the `owid` namespace; we could save them in
324 | # their real namespaces, but that would require non-trivial changes to the backporting code in ETL
325 | ds.namespace = "owid"
326 |
327 | assert ds.short_name
328 | if not ds.version:
329 | log.error("missing.version", path=catalog_row["path"])
330 | continue
331 |
332 | # add table
333 | session.add(t)
334 |
335 | # create dataset
336 | # TODO: channel should ideally be a property of DatasetMeta
337 | if not dataset_inserted:
338 | session.add(
339 | MetaDatasetModel.from_DatasetMeta(
340 | ds, dataset_path, dataset_checksum=catalog_row.checksum
341 | )
342 | )
343 | dataset_inserted = True
344 |
345 | # get variable types from DB
346 | assert t.path
347 | variable_types = _variable_types(engine, parquet_path)
348 |
349 | # table with variables
350 | variables = []
351 | for variable_short_name, variable_meta in fields_meta.items():
352 | if variable_short_name in t.dimensions:
353 | continue
354 |
355 | variables.append(
356 | _parse_meta_variable(
357 | variable_meta,
358 | t,
359 | variable_short_name,
360 | variable_types[variable_short_name],
361 | ds.short_name,
362 | dataset_path,
363 | )
364 | )
365 | log.info(
366 | "table.variable.create",
367 | variable=variable_short_name,
368 | )
369 |
370 | session.add_all(variables)
371 |
372 | # delete the rest of the datasets
373 | if dataset_paths_to_delete:
374 | log.info("dataset.delete_datasets", n=len(dataset_paths_to_delete))
375 | with new_session(engine) as session:
376 | for dataset_path in dataset_paths_to_delete:
377 | _delete_dataset(dataset_path, session)
378 |
379 | if full_text_search:
380 | # recreate full-text search index (this has to be run on every new dataset)
381 | create_full_text_index(duckdb_path)
382 |
383 |
384 | @contextmanager
385 | def new_session(engine) -> Generator[Session, None, None]:
386 | """Open new session and commit at the end without expiring objects.
387 |
388 | I couldn't make this work with creating only one session per table (tables did not have data for
389 | unknown reasons), so I'm creating new session for each operation which works. Feel free to fix
390 | this and make it transactional or switch to a different ORM.
391 | """
392 | # NOTE: should I do it with transaction, i.e. `with session.begin():`?
393 | # there would be problems with commits in _upsert_dataset
394 | with Session(engine, expire_on_commit=False) as session:
395 | yield session
396 | session.commit()
397 |
398 |
399 | def main_cli():
400 | return typer.run(main)
401 |
402 |
403 | if __name__ == "__main__":
404 | main_cli()
405 |
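A minimal usage sketch for the crawler above: besides the `crawl` poetry script, `main` can be called directly as a function. The paths and the `include` regex below are illustrative values only.

```python
# Usage sketch (illustrative values): bake a subset of the ETL catalog into DuckDB,
# roughly equivalent to `poetry run crawl --include 'ggdc_maddison'` on the command line.
from pathlib import Path

from crawler.crawl import main

main(
    duckdb_path=Path("duck.db"),
    owid_catalog_dir=Path("../etl/data"),
    include="ggdc_maddison",  # regex over dataset paths; only matching datasets are (re)created
    force=False,              # passed to _datasets_updates (presumably forces recreation when True)
    full_text_search=True,    # rebuild the full-text index after crawling
)
```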
--------------------------------------------------------------------------------
/crawler/duckdb_models.py:
--------------------------------------------------------------------------------
1 | import json
2 | from pathlib import Path
3 | from typing import Any
4 |
5 | import pandas as pd
6 | import structlog
7 | from owid.catalog import DatasetMeta
8 | from owid.catalog.catalogs import CatalogSeries
9 | from sqlalchemy import JSON, Boolean, Column, Integer, String, create_engine
10 | from sqlalchemy.engine import Engine
11 | from sqlalchemy.ext.declarative import declarative_base
12 |
13 | Base = declarative_base()
14 |
15 |
16 | log = structlog.get_logger()
17 |
18 |
19 | # NOTE: not having type hints is quite limiting; ideally we would make this work with sqlmodel
20 | class MetaDatasetModel(Base): # type: ignore
21 | """
22 | Almost identical copy of DatasetMeta from owid-catalog-py
23 | """
24 |
25 | __tablename__ = "meta_datasets"
26 |
27 | # TODO: what should we use as the primary key? Either autoincremented ids or
28 | # paths (e.g. `garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp` is the address of a table).
29 | # What are the pros and cons of each?
30 | path = Column(String, primary_key=True)
31 |
32 | channel = Column(String)
33 | namespace = Column(String)
34 | short_name = Column(String)
35 | title = Column(String)
36 | description = Column(String)
37 | sources = Column(JSON)
38 | licenses = Column(JSON)
39 | is_public = Column(Boolean)
40 | checksum = Column(String)
41 | source_checksum = Column(String)
42 | version = Column(String)
43 |
44 | # this comes from additional_info['grapher_meta']
45 | grapher_meta = Column(JSON)
46 |
47 | @classmethod
48 | def from_DatasetMeta(
49 | cls, ds: DatasetMeta, dataset_path: str, dataset_checksum: str
50 | ) -> "MetaDatasetModel":
51 | return MetaDatasetModel(
52 | path=dataset_path,
53 | channel=dataset_path.split("/")[0],
54 | short_name=ds.short_name,
55 | namespace=ds.namespace,
56 | title=ds.title,
57 | description=ds.description,
58 | sources=[source.to_dict() for source in ds.sources],
59 | licenses=[license.to_dict() for license in ds.licenses],
60 | is_public=ds.is_public,
61 | checksum=dataset_checksum,
62 | source_checksum=ds.source_checksum,
63 | grapher_meta=ds.additional_info["grapher_meta"]
64 | if ds.additional_info
65 | else None,
66 | version=ds.version,
67 | )
68 |
69 |
70 | class MetaTableModel(Base): # type: ignore
71 | __tablename__ = "meta_tables"
72 |
73 | path = Column(String, primary_key=True)
74 | dataset_path = Column(String)
75 |
76 | table_name = Column(String)
77 | dataset_name = Column(String)
78 |
79 | # columns from catalog
80 | version = Column(String)
81 | namespace = Column(String)
82 | channel = Column(String)
83 | dimensions = Column(JSON)
84 | format = Column(String)
85 | is_public = Column(Boolean)
86 |
87 | # distinct values of years and entities encoded as JSON
88 | dimension_values = Column(JSON)
89 |
90 | def __init__(self, *args, **kwargs):
91 | # TODO: "format" was changed to "formats", we'd have to rebuild the entire database, so just
92 | # hotfix it for now
93 | assert "feather" in kwargs["formats"]
94 | kwargs["format"] = "feather"
95 | del kwargs["formats"]
96 | super().__init__(*args, **kwargs)
97 |
98 | @classmethod
99 | def from_CatalogSeries(
100 | cls, catalog_row: CatalogSeries, dimension_values: dict[str, Any]
101 | ) -> "MetaTableModel":
102 | d = catalog_row.to_dict()
103 |
104 | d["dimension_values"] = dimension_values
105 |
106 | # the checksum from the catalog is actually the checksum of the dataset, not the table!
107 | del d["checksum"]
108 |
109 | d["dimensions"] = list(d["dimensions"])
110 |
111 | # rename to adhere to DuckDB schema
112 | d["table_name"] = d.pop("table")
113 | d["dataset_name"] = d.pop("dataset")
114 |
115 | t = cls(**d)
116 |
117 | is_backport = t.channel == "backport"
118 |
119 | if is_backport:
120 | missing_dims = {"year", "entity_name", "entity_code", "entity_id"} - set(
121 | t.dimensions
122 | )
123 | assert not missing_dims, f"Missing dimensions: {missing_dims}"
124 |
125 | return t
126 |
127 |
128 | class MetaVariableModel(Base): # type: ignore
129 | __tablename__ = "meta_variables"
130 |
131 | path = Column(String, primary_key=True)
132 |
133 | # columns from VariableMeta
134 | title = Column(String)
135 | description = Column(String)
136 | licenses = Column(JSON)
137 | sources = Column(JSON)
138 | unit = Column(String)
139 | short_unit = Column(String)
140 | display = Column(JSON)
141 |
142 | # this comes from additional_info['grapher_meta']
143 | grapher_meta = Column(JSON)
144 |
145 | variable_id = Column(Integer)
146 |
147 | # columns inferred by the crawler
148 | short_name = Column(String)
149 | table_path = Column(String)
150 | dataset_path = Column(String)
151 | dataset_short_name = Column(String)
152 | variable_type = Column(String)
153 |
154 | def __init__(self, *args, **kwargs):
155 | kwargs["path"] = f"{kwargs['table_path']}/{kwargs['short_name']}"
156 | super().__init__(*args, **kwargs)
157 |
158 |
159 | class PdEncoder(json.JSONEncoder):
160 | """Serialize non-native JSON objects."""
161 |
162 | def default(self, obj):
163 | if isinstance(obj, pd.Timestamp):
164 | return str(obj)
165 | return json.JSONEncoder.default(self, obj)
166 |
167 |
168 | def db_init(path: Path) -> Engine:
169 | eng = create_engine(
170 | f"duckdb:///{path}",
171 | json_serializer=lambda obj: json.dumps(obj, cls=PdEncoder),
172 | )
173 | Base.metadata.create_all(eng)
174 | return eng
175 |
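A quick usage sketch for the models above, assuming a `duck.db` already baked by the crawler: `db_init` returns a SQLAlchemy engine bound to DuckDB, and the declarative models can be queried through an ordinary session.

```python
# Usage sketch, assuming duck.db exists in the working directory.
from pathlib import Path

from sqlalchemy.orm import Session

from crawler.duckdb_models import MetaDatasetModel, MetaTableModel, db_init

engine = db_init(Path("duck.db"))  # also creates the meta_* tables if they don't exist yet

with Session(engine) as session:
    n_datasets = session.query(MetaDatasetModel).count()
    n_tables = session.query(MetaTableModel).count()
    print(f"{n_datasets} datasets, {n_tables} tables in the catalog")
```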
--------------------------------------------------------------------------------
/crawler/full_text_index.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import duckdb
4 | import structlog
5 | import typer
6 |
7 | log = structlog.get_logger()
8 |
9 |
10 | def main(
11 | duckdb_path: Path = Path("duck.db"),
12 | ) -> None:
13 | assert duckdb_path.exists(), "DuckDB database path does not exist"
14 |
15 | log.info("table.full_text_index.start")
16 |
17 | con = duckdb.connect(duckdb_path.as_posix())
18 | cols = [
19 | "title",
20 | "description",
21 | "path",
22 | "unit",
23 | "short_name",
24 | ]
25 | _create_full_text_search_index(con, "meta_variables", "path", cols)
26 | log.info("table.full_text_index.end")
27 |
28 |
29 | def _create_full_text_search_index(
30 | con, table_name: str, primary_key: str, columns: list[str] = ["*"]
31 | ):
32 | # NOTE: path is a unique identifier (primary key probably)
33 | # NOTE: we include numbers (for SDG goals for instance)
34 | cols_to_index = ",".join([f"'{c}'" for c in columns])
35 | con.execute(
36 | f"""PRAGMA create_fts_index(
37 | '{table_name}',
38 | '{primary_key}',
39 | {cols_to_index},
40 | stopwords='english',
41 | overwrite=1,
42 | ignore='(\\.|[^a-z0-9])+')"""
43 | )
44 |
45 |
46 | if __name__ == "__main__":
47 | typer.run(main)
48 |
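To show what the index created above is for, here is a hedged query sketch: DuckDB's FTS extension exposes an `fts_main_<table>` schema with a `match_bm25` macro, so after running this script a search over `meta_variables` might look like the following ('gdp' is just an example term).

```python
# Query sketch against the full-text index created by _create_full_text_search_index.
# Assumes duck.db has already been baked and indexed.
import duckdb

con = duckdb.connect("duck.db", read_only=True)
df = con.execute(
    """
    select path, title, score
    from (
        select path, title,
               fts_main_meta_variables.match_bm25(path, 'gdp') as score
        from meta_variables
    )
    where score is not null
    order by score desc
    limit 10
    """
).fetch_df()
print(df)
```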
--------------------------------------------------------------------------------
/crawler/query_duckdb.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/html": [
11 | "
\n",
12 | "\n",
25 | "
\n",
26 | " \n",
27 | " \n",
28 | " | \n",
29 | " cid | \n",
30 | " name | \n",
31 | " type | \n",
32 | " notnull | \n",
33 | " dflt_value | \n",
34 | " pk | \n",
35 | "
\n",
36 | " \n",
37 | " \n",
38 | " \n",
39 | " 0 | \n",
40 | " 0 | \n",
41 | " table_name | \n",
42 | " VARCHAR | \n",
43 | " True | \n",
44 | " NaN | \n",
45 | " True | \n",
46 | "
\n",
47 | " \n",
48 | " 1 | \n",
49 | " 1 | \n",
50 | " dataset_name | \n",
51 | " VARCHAR | \n",
52 | " False | \n",
53 | " NaN | \n",
54 | " False | \n",
55 | "
\n",
56 | " \n",
57 | " 2 | \n",
58 | " 2 | \n",
59 | " table_db_name | \n",
60 | " VARCHAR | \n",
61 | " False | \n",
62 | " NaN | \n",
63 | " False | \n",
64 | "
\n",
65 | " \n",
66 | " 3 | \n",
67 | " 3 | \n",
68 | " version | \n",
69 | " VARCHAR | \n",
70 | " False | \n",
71 | " NaN | \n",
72 | " False | \n",
73 | "
\n",
74 | " \n",
75 | " 4 | \n",
76 | " 4 | \n",
77 | " namespace | \n",
78 | " VARCHAR | \n",
79 | " False | \n",
80 | " NaN | \n",
81 | " False | \n",
82 | "
\n",
83 | " \n",
84 | " 5 | \n",
85 | " 5 | \n",
86 | " channel | \n",
87 | " VARCHAR | \n",
88 | " False | \n",
89 | " NaN | \n",
90 | " False | \n",
91 | "
\n",
92 | " \n",
93 | " 6 | \n",
94 | " 6 | \n",
95 | " checksum | \n",
96 | " VARCHAR | \n",
97 | " False | \n",
98 | " NaN | \n",
99 | " False | \n",
100 | "
\n",
101 | " \n",
102 | " 7 | \n",
103 | " 7 | \n",
104 | " dimensions | \n",
105 | " JSON | \n",
106 | " False | \n",
107 | " NaN | \n",
108 | " False | \n",
109 | "
\n",
110 | " \n",
111 | " 8 | \n",
112 | " 8 | \n",
113 | " path | \n",
114 | " VARCHAR | \n",
115 | " False | \n",
116 | " NaN | \n",
117 | " False | \n",
118 | "
\n",
119 | " \n",
120 | " 9 | \n",
121 | " 9 | \n",
122 | " format | \n",
123 | " VARCHAR | \n",
124 | " False | \n",
125 | " NaN | \n",
126 | " False | \n",
127 | "
\n",
128 | " \n",
129 | " 10 | \n",
130 | " 10 | \n",
131 | " is_public | \n",
132 | " BOOLEAN | \n",
133 | " False | \n",
134 | " NaN | \n",
135 | " False | \n",
136 | "
\n",
137 | " \n",
138 | "
\n",
139 | "
"
140 | ],
141 | "text/plain": [
142 | " cid name type notnull dflt_value pk\n",
143 | "0 0 table_name VARCHAR True NaN True\n",
144 | "1 1 dataset_name VARCHAR False NaN False\n",
145 | "2 2 table_db_name VARCHAR False NaN False\n",
146 | "3 3 version VARCHAR False NaN False\n",
147 | "4 4 namespace VARCHAR False NaN False\n",
148 | "5 5 channel VARCHAR False NaN False\n",
149 | "6 6 checksum VARCHAR False NaN False\n",
150 | "7 7 dimensions JSON False NaN False\n",
151 | "8 8 path VARCHAR False NaN False\n",
152 | "9 9 format VARCHAR False NaN False\n",
153 | "10 10 is_public BOOLEAN False NaN False"
154 | ]
155 | },
156 | "execution_count": 1,
157 | "metadata": {},
158 | "output_type": "execute_result"
159 | }
160 | ],
161 | "source": [
162 | "import duckdb\n",
163 | "\n",
164 | "con = duckdb.connect(\"../duck.db\", read_only=True)\n",
165 | "\n",
166 | "con.execute(\"PRAGMA table_info('meta_tables');\").fetch_df()"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 2,
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "data": {
176 | "text/html": [
177 | "\n",
178 | "\n",
191 | "
\n",
192 | " \n",
193 | " \n",
194 | " | \n",
195 | " table_name | \n",
196 | " dataset_name | \n",
197 | " table_db_name | \n",
198 | " version | \n",
199 | " namespace | \n",
200 | " channel | \n",
201 | " checksum | \n",
202 | " dimensions | \n",
203 | " path | \n",
204 | " format | \n",
205 | " is_public | \n",
206 | "
\n",
207 | " \n",
208 | " \n",
209 | " \n",
210 | " 0 | \n",
211 | " dataset_941_technology_adoption__isard__1942__... | \n",
212 | " dataset_941_technology_adoption__isard__1942__... | \n",
213 | " backport__owid__latest__dataset_941_technology... | \n",
214 | " NaN | \n",
215 | " owid | \n",
216 | " backport | \n",
217 | " 76c24e0b3af5621506abb1cd3971faf0 | \n",
218 | " [\"year\", \"entity_name\", \"entity_id\", \"entity_c... | \n",
219 | " backport/owid/latest/dataset_941_technology_ad... | \n",
220 | " feather | \n",
221 | " True | \n",
222 | "
\n",
223 | " \n",
224 | " 1 | \n",
225 | " maddison_gdp | \n",
226 | " ggdc_maddison | \n",
227 | " garden__ggdc__2020_10_01__ggdc_maddison__maddi... | \n",
228 | " 2020-10-01 | \n",
229 | " ggdc | \n",
230 | " garden | \n",
231 | " 7236fb37ff655adc0d9924a9e79937ed | \n",
232 | " [\"country\", \"year\"] | \n",
233 | " garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp | \n",
234 | " feather | \n",
235 | " True | \n",
236 | "
\n",
237 | " \n",
238 | "
\n",
239 | "
"
240 | ],
241 | "text/plain": [
242 | " table_name \\\n",
243 | "0 dataset_941_technology_adoption__isard__1942__... \n",
244 | "1 maddison_gdp \n",
245 | "\n",
246 | " dataset_name \\\n",
247 | "0 dataset_941_technology_adoption__isard__1942__... \n",
248 | "1 ggdc_maddison \n",
249 | "\n",
250 | " table_db_name version namespace \\\n",
251 | "0 backport__owid__latest__dataset_941_technology... NaN owid \n",
252 | "1 garden__ggdc__2020_10_01__ggdc_maddison__maddi... 2020-10-01 ggdc \n",
253 | "\n",
254 | " channel checksum \\\n",
255 | "0 backport 76c24e0b3af5621506abb1cd3971faf0 \n",
256 | "1 garden 7236fb37ff655adc0d9924a9e79937ed \n",
257 | "\n",
258 | " dimensions \\\n",
259 | "0 [\"year\", \"entity_name\", \"entity_id\", \"entity_c... \n",
260 | "1 [\"country\", \"year\"] \n",
261 | "\n",
262 | " path format is_public \n",
263 | "0 backport/owid/latest/dataset_941_technology_ad... feather True \n",
264 | "1 garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp feather True "
265 | ]
266 | },
267 | "execution_count": 2,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "q = \"\"\"\n",
274 | "select\n",
275 | " *\n",
276 | "from meta_tables\n",
277 | "limit 5\n",
278 | "\"\"\"\n",
279 | "con.execute(q).fetch_df()"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 3,
285 | "metadata": {},
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/html": [
290 | "\n",
291 | "\n",
304 | "
\n",
305 | " \n",
306 | " \n",
307 | " | \n",
308 | " title | \n",
309 | " description | \n",
310 | " licenses | \n",
311 | " sources | \n",
312 | " unit | \n",
313 | " short_unit | \n",
314 | " display | \n",
315 | " grapher_meta | \n",
316 | " variable_path | \n",
317 | " variable_id | \n",
318 | " short_name | \n",
319 | " table_path | \n",
320 | " table_db_name | \n",
321 | " dataset_short_name | \n",
322 | " variable_type | \n",
323 | " dimension_values | \n",
324 | "
\n",
325 | " \n",
326 | " \n",
327 | " \n",
328 | " 0 | \n",
329 | " ATM (Comin and Hobijn (2004)) | \n",
330 | " Number of electro-mechanical devices that perm... | \n",
331 | " [] | \n",
332 | " [{\"name\": \"Isard (1942) and others\", \"descript... | \n",
333 | " | \n",
334 | " NaN | \n",
335 | " null | \n",
336 | " {\"id\": 42539, \"name\": \"ATM (Comin and Hobijn (... | \n",
337 | " backport/owid/latest/dataset_941_technology_ad... | \n",
338 | " 42539 | \n",
339 | " atm__comin_and_hobijn__2004 | \n",
340 | " backport/owid/latest/dataset_941_technology_ad... | \n",
341 | " backport__owid__latest__dataset_941_technology... | \n",
342 | " dataset_941_technology_adoption__isard__1942__... | \n",
343 | " FLOAT | \n",
344 | " {\"year\": [\"1988\", \"1989\", \"1990\", \"1991\", \"199... | \n",
345 | "
\n",
346 | " \n",
347 | " 1 | \n",
348 | " Agricultural tractor (Comin and Hobijn (2004)) | \n",
349 | " Number of wheel and crawler tractors (excl. ga... | \n",
350 | " [] | \n",
351 | " [{\"name\": \"Isard (1942) and others\", \"descript... | \n",
352 | " | \n",
353 | " NaN | \n",
354 | " null | \n",
355 | " {\"id\": 42538, \"name\": \"Agricultural tractor (C... | \n",
356 | " backport/owid/latest/dataset_941_technology_ad... | \n",
357 | " 42538 | \n",
358 | " agricultural_tractor__comin_and_hobijn__2004 | \n",
359 | " backport/owid/latest/dataset_941_technology_ad... | \n",
360 | " backport__owid__latest__dataset_941_technology... | \n",
361 | " dataset_941_technology_adoption__isard__1942__... | \n",
362 | " BIGINT | \n",
363 | " {\"year\": [\"1961\", \"1962\", \"1963\", \"1964\", \"196... | \n",
364 | "
\n",
365 | " \n",
366 | " 2 | \n",
367 | " Aviation passenger-km (Comin and Hobijn (2004)) | \n",
368 | " Civil aviation passenger‐km travelled on sched... | \n",
369 | " [] | \n",
370 | " [{\"name\": \"Isard (1942) and others\", \"descript... | \n",
371 | " passenger-km | \n",
372 | " NaN | \n",
373 | " null | \n",
374 | " {\"id\": 42540, \"name\": \"Aviation passenger-km (... | \n",
375 | " backport/owid/latest/dataset_941_technology_ad... | \n",
376 | " 42540 | \n",
377 | " aviation_passenger_km__comin_and_hobijn__2004 | \n",
378 | " backport/owid/latest/dataset_941_technology_ad... | \n",
379 | " backport__owid__latest__dataset_941_technology... | \n",
380 | " dataset_941_technology_adoption__isard__1942__... | \n",
381 | " BIGINT | \n",
382 | " {\"year\": [\"1930\", \"1931\", \"1932\", \"1933\", \"193... | \n",
383 | "
\n",
384 | " \n",
385 | " 3 | \n",
386 | " Canals (Isard (1942)) | \n",
387 | " Measures the mileage of completed canals. | \n",
388 | " [] | \n",
389 | " [{\"name\": \"Isard (1942) and others\", \"descript... | \n",
390 | " mileage | \n",
391 | " NaN | \n",
392 | " null | \n",
393 | " {\"id\": 42535, \"name\": \"Canals (Isard (1942))\",... | \n",
394 | " backport/owid/latest/dataset_941_technology_ad... | \n",
395 | " 42535 | \n",
396 | " canals__isard__1942 | \n",
397 | " backport/owid/latest/dataset_941_technology_ad... | \n",
398 | " backport__owid__latest__dataset_941_technology... | \n",
399 | " dataset_941_technology_adoption__isard__1942__... | \n",
400 | " FLOAT | \n",
401 | " {\"year\": [\"1800\", \"1803\", \"1805\", \"1807\", \"180... | \n",
402 | "
\n",
403 | " \n",
404 | " 4 | \n",
405 | " Card payments (Comin and Hobijn (2004)) | \n",
406 | " Number of transactions using payment cards at ... | \n",
407 | " [] | \n",
408 | " [{\"name\": \"Isard (1942) and others\", \"descript... | \n",
409 | " | \n",
410 | " NaN | \n",
411 | " null | \n",
412 | " {\"id\": 42542, \"name\": \"Card payments (Comin an... | \n",
413 | " backport/owid/latest/dataset_941_technology_ad... | \n",
414 | " 42542 | \n",
415 | " card_payments__comin_and_hobijn__2004 | \n",
416 | " backport/owid/latest/dataset_941_technology_ad... | \n",
417 | " backport__owid__latest__dataset_941_technology... | \n",
418 | " dataset_941_technology_adoption__isard__1942__... | \n",
419 | " BIGINT | \n",
420 | " {\"year\": [\"1988\", \"1989\", \"1990\", \"1991\", \"199... | \n",
421 | "
\n",
422 | " \n",
423 | "
\n",
424 | "
"
425 | ],
426 | "text/plain": [
427 | " title \\\n",
428 | "0 ATM (Comin and Hobijn (2004)) \n",
429 | "1 Agricultural tractor (Comin and Hobijn (2004)) \n",
430 | "2 Aviation passenger-km (Comin and Hobijn (2004)) \n",
431 | "3 Canals (Isard (1942)) \n",
432 | "4 Card payments (Comin and Hobijn (2004)) \n",
433 | "\n",
434 | " description licenses \\\n",
435 | "0 Number of electro-mechanical devices that perm... [] \n",
436 | "1 Number of wheel and crawler tractors (excl. ga... [] \n",
437 | "2 Civil aviation passenger‐km travelled on sched... [] \n",
438 | "3 Measures the mileage of completed canals. [] \n",
439 | "4 Number of transactions using payment cards at ... [] \n",
440 | "\n",
441 | " sources unit short_unit \\\n",
442 | "0 [{\"name\": \"Isard (1942) and others\", \"descript... NaN \n",
443 | "1 [{\"name\": \"Isard (1942) and others\", \"descript... NaN \n",
444 | "2 [{\"name\": \"Isard (1942) and others\", \"descript... passenger-km NaN \n",
445 | "3 [{\"name\": \"Isard (1942) and others\", \"descript... mileage NaN \n",
446 | "4 [{\"name\": \"Isard (1942) and others\", \"descript... NaN \n",
447 | "\n",
448 | " display grapher_meta \\\n",
449 | "0 null {\"id\": 42539, \"name\": \"ATM (Comin and Hobijn (... \n",
450 | "1 null {\"id\": 42538, \"name\": \"Agricultural tractor (C... \n",
451 | "2 null {\"id\": 42540, \"name\": \"Aviation passenger-km (... \n",
452 | "3 null {\"id\": 42535, \"name\": \"Canals (Isard (1942))\",... \n",
453 | "4 null {\"id\": 42542, \"name\": \"Card payments (Comin an... \n",
454 | "\n",
455 | " variable_path variable_id \\\n",
456 | "0 backport/owid/latest/dataset_941_technology_ad... 42539 \n",
457 | "1 backport/owid/latest/dataset_941_technology_ad... 42538 \n",
458 | "2 backport/owid/latest/dataset_941_technology_ad... 42540 \n",
459 | "3 backport/owid/latest/dataset_941_technology_ad... 42535 \n",
460 | "4 backport/owid/latest/dataset_941_technology_ad... 42542 \n",
461 | "\n",
462 | " short_name \\\n",
463 | "0 atm__comin_and_hobijn__2004 \n",
464 | "1 agricultural_tractor__comin_and_hobijn__2004 \n",
465 | "2 aviation_passenger_km__comin_and_hobijn__2004 \n",
466 | "3 canals__isard__1942 \n",
467 | "4 card_payments__comin_and_hobijn__2004 \n",
468 | "\n",
469 | " table_path \\\n",
470 | "0 backport/owid/latest/dataset_941_technology_ad... \n",
471 | "1 backport/owid/latest/dataset_941_technology_ad... \n",
472 | "2 backport/owid/latest/dataset_941_technology_ad... \n",
473 | "3 backport/owid/latest/dataset_941_technology_ad... \n",
474 | "4 backport/owid/latest/dataset_941_technology_ad... \n",
475 | "\n",
476 | " table_db_name \\\n",
477 | "0 backport__owid__latest__dataset_941_technology... \n",
478 | "1 backport__owid__latest__dataset_941_technology... \n",
479 | "2 backport__owid__latest__dataset_941_technology... \n",
480 | "3 backport__owid__latest__dataset_941_technology... \n",
481 | "4 backport__owid__latest__dataset_941_technology... \n",
482 | "\n",
483 | " dataset_short_name variable_type \\\n",
484 | "0 dataset_941_technology_adoption__isard__1942__... FLOAT \n",
485 | "1 dataset_941_technology_adoption__isard__1942__... BIGINT \n",
486 | "2 dataset_941_technology_adoption__isard__1942__... BIGINT \n",
487 | "3 dataset_941_technology_adoption__isard__1942__... FLOAT \n",
488 | "4 dataset_941_technology_adoption__isard__1942__... BIGINT \n",
489 | "\n",
490 | " dimension_values \n",
491 | "0 {\"year\": [\"1988\", \"1989\", \"1990\", \"1991\", \"199... \n",
492 | "1 {\"year\": [\"1961\", \"1962\", \"1963\", \"1964\", \"196... \n",
493 | "2 {\"year\": [\"1930\", \"1931\", \"1932\", \"1933\", \"193... \n",
494 | "3 {\"year\": [\"1800\", \"1803\", \"1805\", \"1807\", \"180... \n",
495 | "4 {\"year\": [\"1988\", \"1989\", \"1990\", \"1991\", \"199... "
496 | ]
497 | },
498 | "execution_count": 3,
499 | "metadata": {},
500 | "output_type": "execute_result"
501 | }
502 | ],
503 | "source": [
504 | "q = \"\"\"\n",
505 | "select\n",
506 | " *\n",
507 | "from meta_variables\n",
508 | "limit 5\n",
509 | "\"\"\"\n",
510 | "con.execute(q).fetch_df()"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 4,
516 | "metadata": {},
517 | "outputs": [
518 | {
519 | "data": {
520 | "text/html": [
521 | "\n",
522 | "\n",
535 | "
\n",
536 | " \n",
537 | " \n",
538 | " | \n",
539 | " cid | \n",
540 | " name | \n",
541 | " type | \n",
542 | " notnull | \n",
543 | " dflt_value | \n",
544 | " pk | \n",
545 | "
\n",
546 | " \n",
547 | " \n",
548 | " \n",
549 | " 0 | \n",
550 | " 0 | \n",
551 | " year | \n",
552 | " UBIGINT | \n",
553 | " False | \n",
554 | " NaN | \n",
555 | " False | \n",
556 | "
\n",
557 | " \n",
558 | " 1 | \n",
559 | " 1 | \n",
560 | " entity_name | \n",
561 | " entity_name | \n",
562 | " False | \n",
563 | " NaN | \n",
564 | " False | \n",
565 | "
\n",
566 | " \n",
567 | " 2 | \n",
568 | " 2 | \n",
569 | " entity_id | \n",
570 | " BIGINT | \n",
571 | " False | \n",
572 | " NaN | \n",
573 | " False | \n",
574 | "
\n",
575 | " \n",
576 | " 3 | \n",
577 | " 3 | \n",
578 | " entity_code | \n",
579 | " entity_code | \n",
580 | " False | \n",
581 | " NaN | \n",
582 | " False | \n",
583 | "
\n",
584 | " \n",
585 | " 4 | \n",
586 | " 4 | \n",
587 | " atm__comin_and_hobijn__2004 | \n",
588 | " FLOAT | \n",
589 | " False | \n",
590 | " NaN | \n",
591 | " False | \n",
592 | "
\n",
593 | " \n",
594 | " 5 | \n",
595 | " 5 | \n",
596 | " agricultural_tractor__comin_and_hobijn__2004 | \n",
597 | " BIGINT | \n",
598 | " False | \n",
599 | " NaN | \n",
600 | " False | \n",
601 | "
\n",
602 | " \n",
603 | " 6 | \n",
604 | " 6 | \n",
605 | " aviation_passenger_km__comin_and_hobijn__2004 | \n",
606 | " BIGINT | \n",
607 | " False | \n",
608 | " NaN | \n",
609 | " False | \n",
610 | "
\n",
611 | " \n",
612 | " 7 | \n",
613 | " 7 | \n",
614 | " canals__isard__1942 | \n",
615 | " FLOAT | \n",
616 | " False | \n",
617 | " NaN | \n",
618 | " False | \n",
619 | "
\n",
620 | " \n",
621 | " 8 | \n",
622 | " 8 | \n",
623 | " card_payments__comin_and_hobijn__2004 | \n",
624 | " BIGINT | \n",
625 | " False | \n",
626 | " NaN | \n",
627 | " False | \n",
628 | "
\n",
629 | " \n",
630 | " 9 | \n",
631 | " 9 | \n",
632 | " commercial_vehicles__comin_and_hobijn__2004 | \n",
633 | " FLOAT | \n",
634 | " False | \n",
635 | " NaN | \n",
636 | " False | \n",
637 | "
\n",
638 | " \n",
639 | " 10 | \n",
640 | " 10 | \n",
641 | " credit_and_debit_payments__comin_and_hobijn__2004 | \n",
642 | " FLOAT | \n",
643 | " False | \n",
644 | " NaN | \n",
645 | " False | \n",
646 | "
\n",
647 | " \n",
648 | " 11 | \n",
649 | " 11 | \n",
650 | " crude_steel_production__blast_oxygen_furnaces_... | \n",
651 | " BIGINT | \n",
652 | " False | \n",
653 | " NaN | \n",
654 | " False | \n",
655 | "
\n",
656 | " \n",
657 | " 12 | \n",
658 | " 12 | \n",
659 | " crude_steel_production__electric_furnaces__com... | \n",
660 | " INTEGER | \n",
661 | " False | \n",
662 | " NaN | \n",
663 | " False | \n",
664 | "
\n",
665 | " \n",
666 | " 13 | \n",
667 | " 13 | \n",
668 | " diesel_locomotives_in_service__us_census_burea... | \n",
669 | " INTEGER | \n",
670 | " False | \n",
671 | " NaN | \n",
672 | " False | \n",
673 | "
\n",
674 | " \n",
675 | " 14 | \n",
676 | " 14 | \n",
677 | " mri_units__comin_and_hobijn__2004 | \n",
678 | " INTEGER | \n",
679 | " False | \n",
680 | " NaN | \n",
681 | " False | \n",
682 | "
\n",
683 | " \n",
684 | " 15 | \n",
685 | " 15 | \n",
686 | " mail__mitchell__1998 | \n",
687 | " BIGINT | \n",
688 | " False | \n",
689 | " NaN | \n",
690 | " False | \n",
691 | "
\n",
692 | " \n",
693 | " 16 | \n",
694 | " 16 | \n",
695 | " newspapers__comin_and_hobijn__2004 | \n",
696 | " BIGINT | \n",
697 | " False | \n",
698 | " NaN | \n",
699 | " False | \n",
700 | "
\n",
701 | " \n",
702 | " 17 | \n",
703 | " 17 | \n",
704 | " rail_passenger_km__comin_and_hobijn__2004 | \n",
705 | " BIGINT | \n",
706 | " False | \n",
707 | " NaN | \n",
708 | " False | \n",
709 | "
\n",
710 | " \n",
711 | " 18 | \n",
712 | " 18 | \n",
713 | " retail_locations_accepting_card__comin_and_hob... | \n",
714 | " FLOAT | \n",
715 | " False | \n",
716 | " NaN | \n",
717 | " False | \n",
718 | "
\n",
719 | " \n",
720 | " 19 | \n",
721 | " 19 | \n",
722 | " roads__us_census_bureau__2017 | \n",
723 | " INTEGER | \n",
724 | " False | \n",
725 | " NaN | \n",
726 | " False | \n",
727 | "
\n",
728 | " \n",
729 | " 20 | \n",
730 | " 20 | \n",
731 | " steamships__tons__comin_and_hobijn__2004 | \n",
732 | " INTEGER | \n",
733 | " False | \n",
734 | " NaN | \n",
735 | " False | \n",
736 | "
\n",
737 | " \n",
738 | " 21 | \n",
739 | " 21 | \n",
740 | " synthetic__non_cellulosic__fibres__comin_and_h... | \n",
741 | " BIGINT | \n",
742 | " False | \n",
743 | " NaN | \n",
744 | " False | \n",
745 | "
\n",
746 | " \n",
747 | " 22 | \n",
748 | " 22 | \n",
749 | " telegrams__mitchell__1998 | \n",
750 | " FLOAT | \n",
751 | " False | \n",
752 | " NaN | \n",
753 | " False | \n",
754 | "
\n",
755 | " \n",
756 | "
\n",
757 | "
"
758 | ],
759 | "text/plain": [
760 | " cid name type \\\n",
761 | "0 0 year UBIGINT \n",
762 | "1 1 entity_name entity_name \n",
763 | "2 2 entity_id BIGINT \n",
764 | "3 3 entity_code entity_code \n",
765 | "4 4 atm__comin_and_hobijn__2004 FLOAT \n",
766 | "5 5 agricultural_tractor__comin_and_hobijn__2004 BIGINT \n",
767 | "6 6 aviation_passenger_km__comin_and_hobijn__2004 BIGINT \n",
768 | "7 7 canals__isard__1942 FLOAT \n",
769 | "8 8 card_payments__comin_and_hobijn__2004 BIGINT \n",
770 | "9 9 commercial_vehicles__comin_and_hobijn__2004 FLOAT \n",
771 | "10 10 credit_and_debit_payments__comin_and_hobijn__2004 FLOAT \n",
772 | "11 11 crude_steel_production__blast_oxygen_furnaces_... BIGINT \n",
773 | "12 12 crude_steel_production__electric_furnaces__com... INTEGER \n",
774 | "13 13 diesel_locomotives_in_service__us_census_burea... INTEGER \n",
775 | "14 14 mri_units__comin_and_hobijn__2004 INTEGER \n",
776 | "15 15 mail__mitchell__1998 BIGINT \n",
777 | "16 16 newspapers__comin_and_hobijn__2004 BIGINT \n",
778 | "17 17 rail_passenger_km__comin_and_hobijn__2004 BIGINT \n",
779 | "18 18 retail_locations_accepting_card__comin_and_hob... FLOAT \n",
780 | "19 19 roads__us_census_bureau__2017 INTEGER \n",
781 | "20 20 steamships__tons__comin_and_hobijn__2004 INTEGER \n",
782 | "21 21 synthetic__non_cellulosic__fibres__comin_and_h... BIGINT \n",
783 | "22 22 telegrams__mitchell__1998 FLOAT \n",
784 | "\n",
785 | " notnull dflt_value pk \n",
786 | "0 False NaN False \n",
787 | "1 False NaN False \n",
788 | "2 False NaN False \n",
789 | "3 False NaN False \n",
790 | "4 False NaN False \n",
791 | "5 False NaN False \n",
792 | "6 False NaN False \n",
793 | "7 False NaN False \n",
794 | "8 False NaN False \n",
795 | "9 False NaN False \n",
796 | "10 False NaN False \n",
797 | "11 False NaN False \n",
798 | "12 False NaN False \n",
799 | "13 False NaN False \n",
800 | "14 False NaN False \n",
801 | "15 False NaN False \n",
802 | "16 False NaN False \n",
803 | "17 False NaN False \n",
804 | "18 False NaN False \n",
805 | "19 False NaN False \n",
806 | "20 False NaN False \n",
807 | "21 False NaN False \n",
808 | "22 False NaN False "
809 | ]
810 | },
811 | "execution_count": 4,
812 | "metadata": {},
813 | "output_type": "execute_result"
814 | }
815 | ],
816 | "source": [
817 | "table_name = \"backport__owid__latest__dataset_941_technology_adoption__isard__1942__and_others__dataset_941_technology_adoption__isard__1942__and_others\"\n",
818 | "con.execute(f\"PRAGMA table_info('{table_name}');\").fetch_df()"
819 | ]
820 | },
821 | {
822 | "cell_type": "code",
823 | "execution_count": 8,
824 | "metadata": {},
825 | "outputs": [
826 | {
827 | "data": {
828 | "text/plain": [
829 | "short_name atm__comin_and_hobijn__2004\n",
830 | "table_path backport/owid/latest/dataset_941_technology_ad...\n",
831 | "table_db_name backport__owid__latest__dataset_941_technology...\n",
832 | "variable_type FLOAT\n",
833 | "years_values [1988, 1989, 1990, 1991, 1992, 1993, 1994, 199...\n",
834 | "entities_values {'entity_id': [13], 'entity_code': ['USA'], 'e...\n",
835 | "title ATM (Comin and Hobijn (2004))\n",
836 | "description Number of electro-mechanical devices that perm...\n",
837 | "sources [{'name': 'Isard (1942) and others', 'descript...\n",
838 | "grapher_meta {'id': 42539, 'name': 'ATM (Comin and Hobijn (...\n",
839 | "variable_id 42539\n",
840 | "unit NaN\n",
841 | "short_unit NaN\n",
842 | "Name: 0, dtype: object"
843 | ]
844 | },
845 | "execution_count": 8,
846 | "metadata": {},
847 | "output_type": "execute_result"
848 | }
849 | ],
850 | "source": [
851 | "q = f\"\"\"\n",
852 | "select * from meta_variables\n",
853 | "where variable_id = 42539\n",
854 | "\"\"\"\n",
855 | "r = con.execute(q).fetch_df()\n",
856 | "r.iloc[0]"
857 | ]
858 | }
859 | ],
860 | "metadata": {
861 | "interpreter": {
862 | "hash": "7cea4047479146d1310ae40921f620e4d325b759c497d12e215f27b54afd0461"
863 | },
864 | "kernelspec": {
865 | "display_name": "Python 3.9.12 ('.venv': poetry)",
866 | "language": "python",
867 | "name": "python3"
868 | },
869 | "language_info": {
870 | "codemirror_mode": {
871 | "name": "ipython",
872 | "version": 3
873 | },
874 | "file_extension": ".py",
875 | "mimetype": "text/x-python",
876 | "name": "python",
877 | "nbconvert_exporter": "python",
878 | "pygments_lexer": "ipython3",
879 | "version": "3.10.0"
880 | },
881 | "orig_nbformat": 4
882 | },
883 | "nbformat": 4,
884 | "nbformat_minor": 2
885 | }
886 |
--------------------------------------------------------------------------------
/crawler/utils.py:
--------------------------------------------------------------------------------
1 | def sanitize_table_path(path):
2 | # NOTE: version can contain - in dates (e.g. 2020-10-01)
3 | return path.replace("/", "__").replace("-", "_")
4 |
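For illustration, this is how the helper maps a catalog path to the table name used inside DuckDB, matching the `table_db_name` values visible in the notebook above:

```python
from crawler.utils import sanitize_table_path

print(sanitize_table_path("garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp"))
# -> garden__ggdc__2020_10_01__ggdc_maddison__maddison_gdp
```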
--------------------------------------------------------------------------------
/data_api/__init__.py:
--------------------------------------------------------------------------------
1 | # this file is needed by poetry in order to run scripts without errors
2 |
--------------------------------------------------------------------------------
/default.mk:
--------------------------------------------------------------------------------
1 | #
2 | # default.mk
3 | #
4 |
5 | SRC = src test
6 |
7 | default: help
8 |
9 | help-default:
10 | @echo 'Available commands:'
11 | @echo
12 | @echo ' make test Run all linting and unit tests'
13 | @echo ' make watch Run all tests, watching for changes'
14 | @echo
15 |
16 | # check formatting before lint, since an autoformat might fix linting issues
17 | test-default: check-formatting lint check-typing unittest
18 |
19 | .venv-default:
20 | @echo '==> Installing packages'
21 | git submodule update --init
22 | poetry install
23 | touch $@
24 |
25 | lint-default: .venv
26 | @echo '==> Linting'
27 | @.venv/bin/flake8 $(SRC)
28 |
29 | check-formatting-default: .venv
30 | @echo '==> Checking formatting'
31 | @.venv/bin/black --check $(SRC)
32 |
33 | check-typing-default: .venv
34 | @echo '==> Checking types'
35 | .venv/bin/mypy $(SRC)
36 |
37 | unittest-default: .venv
38 | @echo '==> Running unit tests'
39 | .venv/bin/pytest $(SRC)
40 |
41 | format-default: .venv
42 | @echo '==> Reformatting files'
43 | @.venv/bin/black $(SRC)
44 |
45 | watch-default: .venv
46 | .venv/bin/watchmedo shell-command -c 'clear; make test' --recursive --drop .
47 |
48 | # allow you to override a command, e.g. "watch", but if you do not, then use
49 | # the default
50 | %: %-default
51 | @true
52 |
--------------------------------------------------------------------------------
/demo/README.md:
--------------------------------------------------------------------------------
1 | # Demo
2 |
3 | Start the API with `make api`, then run the demo with
4 |
5 | ```
6 | python demo/demo.py
7 | ```
8 |
9 | ## Installation
10 |
11 | Install the additional requirements for the demo (pywebio and owid-grapher-py)
12 |
13 | ```
14 | pip install -r demo/requirements.txt
15 | ```
16 |
--------------------------------------------------------------------------------
/demo/demo.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 | from pathlib import Path
3 | from typing import Any, Literal
4 |
5 | import pandas as pd
6 | import time
7 | import requests
8 | import yaml
9 | from pywebio import config
10 | from pywebio import input as pi
11 | from pywebio import output as po
12 | from pywebio import pin as pn
13 | from pywebio import start_server
14 | from pywebio.pin import pin
15 | from pywebio.session import set_env
16 |
17 | from app.v1.schemas import SearchResponse
18 |
19 | API_URL = "http://127.0.0.1:8000"
20 |
21 | CURRENT_DIR = Path(__file__).parent
22 |
23 |
24 | def _df_to_array(df: pd.DataFrame) -> list[list[Any]]:
25 | return [df.columns] + df.to_numpy().tolist()
26 |
27 |
28 | def _api_search(term, channels) -> pd.DataFrame:
29 | url = f"{API_URL}/v1/search"
30 | resp = requests.get(url, params={"term": term, "channels": channels})
31 | print(f"Searching for {term}...")
32 | return pd.DataFrame(resp.json()["results"])
33 |
34 |
35 | def _api_etl_data(data_url, limit: int) -> pd.DataFrame:
36 | url = f"{API_URL}{data_url}.feather?limit={limit}"
37 | return pd.read_feather(url)
38 |
39 |
40 | def _style_truncate(max_width="300px", max_lines=3):
41 | return f"""
42 | display: -webkit-box;
43 | max-width: {max_width};
44 | -webkit-line-clamp: {max_lines};
45 | -webkit-box-orient: vertical;
46 | overflow: hidden;
47 | """
48 |
49 |
50 | def _list_channels() -> list[str]:
51 | url = f"{API_URL}/v1/dataset/data"
52 | return requests.get(url).json()["channels"]
53 |
54 |
55 | def _list_datasets() -> list[str]:
56 | url = f"{API_URL}/v1/datasets"
57 | return requests.get(url).json()["datasets"]
58 |
59 |
60 | def _put_table_preview(r: SearchResponse) -> None:
61 | """Show data of search result in a table preview"""
62 | t = time.time()
63 | df = _api_etl_data(r.data_url, limit=20)
64 | duration = time.time() - t
65 |
66 | # limit number of columns
67 | df = df.iloc[:, :10]
68 |
69 | po.put_markdown(
70 | f"""## Table {r.table_name} preview
71 |
72 | Dataframe shape: {df.shape}
73 | Dataframe size: {df.memory_usage().sum() / 1024 / 1024:.2f} MB
74 | Latency of pd.read_feather: {duration:.3f} s
75 | """
76 | )
77 | po.put_table(_df_to_array(df))
78 |
79 |
80 | ACTION_BUTTONS = Literal["Variable", "Table", "Code"]
81 |
82 |
83 | def _put_search_results_table(sf: pd.DataFrame) -> None:
84 | sf["actions"] = sf.apply(
85 | lambda row: po.put_buttons(
86 | ACTION_BUTTONS.__args__,
87 | onclick=partial(_open_popup, result=SearchResponse(**row.to_dict())),
88 | ).style("min-width: 250px"),
89 | axis=1,
90 | )
91 |
92 | # if title is missing, use short name
93 | ix = sf["variable_title"] == "nan"
94 | sf.loc[ix, "variable_title"] = sf.loc[ix, "variable_name"]
95 |
96 | sf["variable_description"] = sf["variable_description"].map(
97 | lambda s: po.put_text(s).style(_style_truncate())
98 | )
99 |
100 | sf["variable_title"] = sf["variable_title"].map(
101 | lambda s: po.put_text(s).style(_style_truncate())
102 | )
103 |
104 | sf["match"] = sf["match"].round(3)
105 |
106 | po.put_table(
107 | _df_to_array(
108 | sf[
109 | [
110 | "variable_title",
111 | "variable_description",
112 | "variable_unit",
113 | "dataset_title",
114 | "channel",
115 | "match",
116 | "actions",
117 | ]
118 | ]
119 | )
120 | ).style("font-size: 14px;")
121 |
122 |
123 | def _popup_variable_details(result: SearchResponse):
124 | url = f"{API_URL}{result.metadata_url}"
125 | resp = requests.get(url)
126 | assert resp.ok
127 | js = resp.json()
128 |
129 | meta = [v for v in js["variables"] if v["short_name"] == result.variable_name][0]
130 |
131 | po.popup(
132 | "Variable details",
133 | [po.put_code(yaml.dump(meta), language="yaml")],
134 | size=po.PopupSize.LARGE,
135 | )
136 |
137 |
138 | def _popup_table_details(result: SearchResponse) -> None:
139 | url = f"{API_URL}{result.metadata_url}"
140 | resp = requests.get(url)
141 | assert resp.ok
142 | js = resp.json()
143 |
144 | cols = ["title", "description", "unit"]
145 | df = []
146 | for v in js["variables"][:100]:
147 | df.append({c: v[c] for c in cols})
148 |
149 | del js["variables"]
150 |
151 | po.popup(
152 | "Table details",
153 | [
154 | po.put_code(yaml.dump(js), language="yaml"),
155 | po.put_markdown("### Variables"),
156 | po.put_table(_df_to_array(pd.DataFrame(df))).style("font-size: 14px;"),
157 | ],
158 | size=po.PopupSize.LARGE,
159 | )
160 |
161 |
162 | def _popup_code_snippets(result: SearchResponse) -> None:
163 | (
164 | _,
165 | _,
166 | _,
167 | _,
168 | channel,
169 | namespace,
170 | version,
171 | dataset,
172 | table,
173 | ) = result.metadata_url.split("/")
174 | if channel == "backport":
175 | catalog_snippet = f"""
176 | table = catalog.find_one(
177 | table="{table}",
178 | dataset="{dataset}",
179 | channels=["backport"],
180 | )""".strip()
181 | else:
182 | catalog_snippet = f"""
183 | table = catalog.find_one(
184 | table="{table}",
185 | namespace="{namespace}",
186 | dataset="{dataset}",
187 | channels=["{channel}"],
188 | )""".strip()
189 |
190 | po.popup(
191 | "Code snippets",
192 | [
193 | po.put_markdown(
194 | f"""
195 | ### Fetch metadata from API
196 | ```python
197 | r = requests.get("{API_URL}{result.metadata_url}")
198 | assert r.ok
199 | metadata = r.json()
200 | ```
201 |
202 | ### Fetch data from API
203 | ```python
204 | df = pd.read_feather("{API_URL}{result.data_url}.feather")
205 | df.head()
206 | ```
207 |
208 | ### Get table from Python API
209 | ```python
210 | from owid import catalog
211 | {catalog_snippet}
212 | table.head()
213 | ```
214 | """
215 | )
216 | ],
217 | size=po.PopupSize.LARGE,
218 | )
219 |
220 |
221 | def _open_popup(choice: ACTION_BUTTONS, result: SearchResponse):
222 | if choice == "Variable":
223 | _popup_variable_details(result)
224 | elif choice == "Table":
225 | _popup_table_details(result)
226 | elif choice == "Code":
227 | _popup_code_snippets(result)
228 | else:
229 | raise NotImplementedError()
230 |
231 |
232 | INIT_VALUES = {
233 | "search_term": "gdp",
234 | "channels": ["garden", "backport"],
235 | }
236 |
237 |
238 | @config(css_file="static/style.css")
239 | def app():
240 | channels = _list_channels()
241 | datasets = _list_datasets()
242 |
243 | set_env(output_animation=False)
244 | po.put_markdown("""# OWID Data Catalog""")
245 | pn.put_input("search_term", value=INIT_VALUES["search_term"], label="Search term")
246 | pn.put_select(
247 | "channels",
248 | label="Channels",
249 | multiple=True,
250 | options=channels,
251 | value=INIT_VALUES["channels"],
252 | )
253 | pn.put_select(
254 | "datasets",
255 | label="Datasets",
256 | multiple=True,
257 | options=datasets,
258 | )
259 |
260 | po.put_markdown("## Results")
261 |
262 | last_search_term = None
263 | while True:
264 | # get search term or channel inputs
265 | # NOTE: we need `timeout` together with last_search_term in case the user types too quickly
266 | # and we haven't received the input yet
267 | pn.pin_wait_change("search_term", "channels", timeout=0.1)
268 |
269 | if last_search_term == pin.search_term:
270 | continue
271 | else:
272 | last_search_term = pin.search_term
273 |
274 | with po.use_scope("md", clear=True):
275 | search_term = pin.search_term
276 | channels = pin.channels
277 | sf = _api_search(search_term, channels=channels)
278 |
279 | if not sf.empty:
280 | _put_search_results_table(sf)
281 |
282 | # NOTE: not very useful, is going away soon
283 | _put_table_preview(sf.iloc[0])
284 |
285 |
286 | if __name__ == "__main__":
287 | start_server(app, port=8001, debug=True, static_dir="demo/static")
288 |
--------------------------------------------------------------------------------
/demo/requirements.txt:
--------------------------------------------------------------------------------
1 | pywebio
2 | git+https://github.com/owid/owid-grapher-py
3 |
--------------------------------------------------------------------------------
/demo/static/style.css:
--------------------------------------------------------------------------------
1 | .container {
2 | /* width: 100%; */
3 | /* max-width: unset; */
4 | max-width: 1200px;
5 | }
6 |
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | # NOTE: docker is not functional yet!
2 | version: "3.8"
3 |
4 | services:
5 | app:
6 | build: .
7 | env_file:
8 | - .env
9 | ports:
10 | - "8000:8000"
11 |
12 |
13 | database:
14 | image: mysql:8.0
15 | env_file:
16 | - .env
17 | ports:
18 | - "3306:3306"
19 |
20 |
--------------------------------------------------------------------------------
/nbinit.py:
--------------------------------------------------------------------------------
1 | import os
2 | # import matplotlib.pyplot as plt
3 | import pandas as pd
4 | import numpy as np
5 | # import seaborn as sns
6 | import json
7 | import sys
8 | import datetime
9 | from IPython import get_ipython
10 | import duckdb
11 | import sqlalchemy
12 |
13 |
14 |
15 | ipython = get_ipython()
16 | ipython.magic("load_ext rich")
17 | ipython.magic("load_ext sql")
18 | ipython.magic("load_ext autoreload")
19 | ipython.magic("autoreload 2")
20 | # ipython.magic("matplotlib inline")
21 | # ipython.magic("config InlineBackend.figure_format = 'svg'")
22 | ipython.magic("config SqlMagic.autopandas = True")
23 | ipython.magic("config SqlMagic.feedback = False")
24 | ipython.magic("config SqlMagic.displaycon = False")
25 |
26 | # nice / large graphs
27 | # sns.set_context("notebook")
28 | # plt.rcParams["figure.figsize"] = (6, 3)
--------------------------------------------------------------------------------
/poetry.toml:
--------------------------------------------------------------------------------
1 | [virtualenvs]
2 | in-project = true
3 |
--------------------------------------------------------------------------------
/profiling/profile_formats.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Profile Formats\n",
8 | "\n",
9 | "Before running this test add COVID dataset into `duck.db` with\n",
10 | "\n",
11 | "```python\n",
12 | "python crawler/crawl.py --include 'covid19'\n",
13 | "```\n",
14 | "\n",
15 | "you might also want to remove the database to start from scratch `rm duck.db`. Then run the API with `make api`.\n",
16 | "\n",
17 | "### COVID dataset:\n",
18 | "\n",
19 | "- shape: ?\n",
20 | "- dataframe size: ?\n",
21 | "- S3 size: ?\n",
22 | "- DuckDB size: ?"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 8,
28 | "metadata": {},
29 | "outputs": [
30 | {
31 | "name": "stdout",
32 | "output_type": "stream",
33 | "text": [
34 | "JSON format: 30.86s\n",
35 | "CSV format: 9.28s\n",
36 | "Feather format: 0.55s\n",
37 | "Feather format (direct): 0.47s\n"
38 | ]
39 | }
40 | ],
41 | "source": [
42 | "import time\n",
43 | "import pandas as pd\n",
44 | "import requests\n",
45 | "\n",
46 | "url_wo_format = 'http://127.0.0.1:8000/v1/dataset/data/garden/owid/latest/covid/covid'\n",
47 | "\n",
48 | "t = time.time()\n",
49 | "r = requests.get(url_wo_format + '.json')\n",
50 | "assert r.ok\n",
51 | "print(f'JSON format: {time.time() - t:.2f}s')\n",
52 | "\n",
53 | "t = time.time()\n",
54 | "r = requests.get(url_wo_format + '.csv')\n",
55 | "assert r.ok\n",
56 | "print(f'CSV format: {time.time() - t:.2f}s')\n",
57 | "\n",
58 | "t = time.time()\n",
59 | "r = requests.get(url_wo_format + '.feather')\n",
60 | "assert r.ok\n",
61 | "print(f'Feather format: {time.time() - t:.2f}s')\n",
62 | "\n",
63 | "t = time.time()\n",
64 | "r = requests.get(url_wo_format + '.feather_direct')\n",
65 | "assert r.ok\n",
66 | "print(f'Feather format (direct): {time.time() - t:.2f}s')"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 9,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "Size in MB 69.274788\n",
78 | "Shape (202415, 67)\n",
79 | "dtype: object"
80 | ]
81 | },
82 | "execution_count": 9,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "# dataset info\n",
89 | "df = pd.read_feather(url_wo_format + '.feather')\n",
90 | "pd.Series({\n",
91 | " \"Size in MB\": df.memory_usage(deep=True).sum() / 1e6,\n",
92 | " \"Shape\": df.shape,\n",
93 | "})"
94 | ]
95 | }
96 | ],
97 | "metadata": {
98 | "kernelspec": {
99 | "display_name": "Python 3.10.0 ('.venv': poetry)",
100 | "language": "python",
101 | "name": "python3"
102 | },
103 | "language_info": {
104 | "codemirror_mode": {
105 | "name": "ipython",
106 | "version": 3
107 | },
108 | "file_extension": ".py",
109 | "mimetype": "text/x-python",
110 | "name": "python",
111 | "nbconvert_exporter": "python",
112 | "pygments_lexer": "ipython3",
113 | "version": "3.10.0"
114 | },
115 | "orig_nbformat": 4,
116 | "vscode": {
117 | "interpreter": {
118 | "hash": "7cea4047479146d1310ae40921f620e4d325b759c497d12e215f27b54afd0461"
119 | }
120 | }
121 | },
122 | "nbformat": 4,
123 | "nbformat_minor": 2
124 | }
125 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "data-api"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Our World In Data "]
6 |
7 | [tool.poetry.scripts]
8 | crawl = "crawler.crawl:main_cli"
9 |
10 | [tool.poetry.dependencies]
11 | python = "^3.9"
12 | owid-catalog = {path = "vendor/owid-catalog-py", develop = true}
13 | fastapi = "^0.67.0"
14 | pydantic = {extras = ["dotenv"], version = "^1.9.1"}
15 | pandas = "^1.4.2"
16 | SQLAlchemy = {extras = ["mypy"], version = "^1.4.39"}
17 | mysqlclient = "^2.1.0"
18 | rich = "^12.4.4"
19 | typer = "^0.4.1"
20 | duckdb = "^0.4.0"
21 | duckdb-engine = "^0.1.11"
22 | structlog = "^21.5.0"
23 | hypercorn = "^0.13.2"
24 | orjson = "^3.7.11"
25 | bugsnag = "^4.2.1"
26 |
27 | [tool.poetry.dev-dependencies]
28 | pytest = "^7.1.2"
29 | pytest-cov = "^2.10.1"
30 | autoflake = "^1.4"
31 | flake8 = "^3.8.4"
32 | mypy = "^0.961"
33 | isort = "^5.0"
34 | pre-commit = "^2.8.2"
35 | black = {version = "^22.3.0", extras = ["jupyter"]}
36 | ipykernel = "^6.13.1"
37 | types-PyYAML = "^6.0.11"
38 | types-requests = "^2.28.3"
39 |
40 | [build-system]
41 | requires = ["poetry-core>=1.1.14"]
42 | build-backend = "poetry.core.masonry.api"
43 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | profile = black
3 | known_first_party = app
4 |
5 | [flake8]
6 | # Ignore some errors, since we autoformat them away already wherever possible
7 | # from https://github.com/psf/black/blob/main/.flake8
8 | # E302 is ignored to support jupytext files
9 | ignore = E203, E266, E501, W503, E302
10 | exclude = .ipynb_checkpoints
11 |
12 | [mypy]
13 | plugins = pydantic.mypy, sqlalchemy.ext.mypy.plugin
14 | ignore_missing_imports = True
15 | follow_imports = skip
16 | strict_optional = True
17 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/tests/__init__.py
--------------------------------------------------------------------------------
/tests/crawler/test_crawl.py:
--------------------------------------------------------------------------------
1 | from app.v1 import metadata
2 |
3 | # def test_extract_dimension_values():
4 | # df = pd.DataFrame(
5 | # {
6 | # "entity_id": [1, 2, 3],
7 | # "entity_name": ["A", "B", "C"],
8 | # "entity_code": ["c1", "c2", "c3"],
9 | # "year": [2000, 2001, 2002],
10 | # # "value": [1, 2, 3],
11 | # }
12 | # ).set_index(["year", "entity_id", "entity_code", "entity_name"])
13 |
14 | # dim_values = crawl._extract_dimension_values(df.index)
15 | # assert dim_values == {
16 | # "entity_zip": ["1|A|c1", "2|B|c2", "3|C|c3"],
17 | # "year": [2000, 2001, 2002],
18 | # }
19 |
20 |
21 | def test_parse_dimension_values():
22 | dim_values = {
23 | "entity_zip": ["1|A|c1", "2|B|c2", "3|C|c3"],
24 | "year": [2000, 2001, 2002],
25 | }
26 | dims = metadata._parse_dimension_values(dim_values)
27 | assert dims == {
28 | "years": metadata.Dimension(
29 | type="int",
30 | values=[
31 | metadata.DimensionProperties(id=2000, name=None, code=None),
32 | metadata.DimensionProperties(id=2001, name=None, code=None),
33 | metadata.DimensionProperties(id=2002, name=None, code=None),
34 | ],
35 | ),
36 | "entities": metadata.Dimension(
37 | type="int",
38 | values=[
39 | metadata.DimensionProperties(id=1, name="A", code="c1"),
40 | metadata.DimensionProperties(id=2, name="B", code="c2"),
41 | metadata.DimensionProperties(id=3, name="C", code="c3"),
42 | ],
43 | ),
44 | }
45 |
--------------------------------------------------------------------------------
/tests/sample_duck.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/tests/sample_duck.db
--------------------------------------------------------------------------------
/tests/test_v1.py:
--------------------------------------------------------------------------------
1 | import io
2 | from pathlib import Path
3 |
4 | import pandas as pd
5 | from fastapi.testclient import TestClient
6 |
7 | from app.main import app, settings
8 |
9 | client = TestClient(app)
10 |
11 | # mock settings
12 | settings.DUCKDB_PATH = Path("tests/sample_duck.db")
13 |
14 |
15 | def test_health():
16 | response = client.get("/health")
17 | assert response.status_code == 200
18 | assert response.json()["status"] == "ok"
19 |
20 |
21 | def test_variableById_data_for_variable():
22 | response = client.get("/v1/variableById/data/42539")
23 | assert response.status_code == 200
24 | assert set(response.json().keys()) == {
25 | "years",
26 | "entity_names",
27 | "entities",
28 | "entity_codes",
29 | "values",
30 | }
31 |
32 |
33 | def test_variableById_metadata_for_backported_variable():
34 | # this test requires connection to the database, this is only temporary and will change once we start getting
35 | # metadata from the catalog instead of the database
36 | response = client.get("/v1/variableById/metadata/42539")
37 | assert response.status_code == 200
38 | assert response.json() == {
39 | "name": "ATM (Comin and Hobijn (2004))",
40 | "unit": "",
41 | "description": "Number of electro-mechanical devices that permit authorized users, typically using machine readable plastic cards, to withdraw cash from their accounts and/or access other services",
42 | "createdAt": "2017-09-30T19:53:00",
43 | "updatedAt": "2018-02-28T08:58:52",
44 | "coverage": "",
45 | "timespan": "",
46 | "datasetId": 941,
47 | "columnOrder": 0,
48 | "datasetName": "Technology Adoption - Isard (1942) and others",
49 | "nonRedistributable": False,
50 | "display": {},
51 | "source": {
52 | "id": 6800,
53 | "name": "Isard (1942) and others",
54 | "dataPublishedBy": "Isard (1942) and others",
55 | "dataPublisherSource": "Scholarly work",
56 | "link": "http://www.jstor.org/stable/1927670",
57 | "retrievedDate": "28/09/2017",
58 | "additionalInfo": "Roads - Historical Statistics of the United States, Colonial Times to 1970, Volume 1 and 2. Bureau of the Census, Washington D.C. see Chapter Q - Transportation, Q50-63. Link: https://www2.census.gov/library/publications/1975/compendia/hist_stats_colonial-1970/hist_stats_colonial-1970p2-chQ.pdf;\nDiesel locomotives - Historical Statistics of the United States, Colonial Times to 1970, Volume 1 and 2. Bureau of the Census, Washington D.C. see Chapter Q - Transportation, Series Q284-312: Railroad mileage, equipment, and passenger traffic and revenue: 1890 to 1970. Link: https://www2.census.gov/library/publications/1975/compendia/hist_stats_colonial-1970/hist_stats_colonial-1970p2-chQ.pdf;\nAgricultural tractor, ATM, Aviation passenger-km, Credit and debit payments, Card payments, MRI units, Newspapers, Retail locations accepting card, Rail passenger-km, Steamships (tons), Crude steel production (blast oxygen furnaces)/(electric furnaces), Synthetic (non-cellulosic) fibres, Commercial vehicles - Comin and Hobijn (2004). Link: http://www.nber.org/data/chat/;\nMail and telegrams - Mitchell (1998) International Historical Statistics: the Americas, 1970-2000, 5th Ed",
59 | },
60 | "type": "FLOAT",
61 | "dimensions": {
62 | "years": {
63 | "type": "int",
64 | "values": [
65 | {"id": 1800},
66 | {"id": 1803},
67 | {"id": 1805},
68 | {"id": 1807},
69 | {"id": 1808},
70 | {"id": 1809},
71 | {"id": 1810},
72 | {"id": 1811},
73 | {"id": 1812},
74 | {"id": 1813},
75 | {"id": 1814},
76 | {"id": 1815},
77 | {"id": 1816},
78 | {"id": 1817},
79 | {"id": 1818},
80 | {"id": 1819},
81 | {"id": 1820},
82 | {"id": 1821},
83 | {"id": 1822},
84 | {"id": 1823},
85 | {"id": 1824},
86 | {"id": 1825},
87 | {"id": 1826},
88 | {"id": 1827},
89 | {"id": 1828},
90 | {"id": 1829},
91 | {"id": 1830},
92 | {"id": 1831},
93 | {"id": 1832},
94 | {"id": 1833},
95 | {"id": 1834},
96 | {"id": 1835},
97 | {"id": 1836},
98 | {"id": 1837},
99 | {"id": 1838},
100 | {"id": 1839},
101 | {"id": 1840},
102 | {"id": 1841},
103 | {"id": 1842},
104 | {"id": 1843},
105 | {"id": 1844},
106 | {"id": 1845},
107 | {"id": 1846},
108 | {"id": 1847},
109 | {"id": 1848},
110 | {"id": 1849},
111 | {"id": 1850},
112 | {"id": 1851},
113 | {"id": 1852},
114 | {"id": 1853},
115 | {"id": 1854},
116 | {"id": 1855},
117 | {"id": 1856},
118 | {"id": 1857},
119 | {"id": 1858},
120 | {"id": 1859},
121 | {"id": 1860},
122 | {"id": 1861},
123 | {"id": 1862},
124 | {"id": 1863},
125 | {"id": 1864},
126 | {"id": 1865},
127 | {"id": 1866},
128 | {"id": 1867},
129 | {"id": 1868},
130 | {"id": 1869},
131 | {"id": 1870},
132 | {"id": 1871},
133 | {"id": 1872},
134 | {"id": 1873},
135 | {"id": 1874},
136 | {"id": 1875},
137 | {"id": 1876},
138 | {"id": 1877},
139 | {"id": 1878},
140 | {"id": 1879},
141 | {"id": 1880},
142 | {"id": 1881},
143 | {"id": 1882},
144 | {"id": 1883},
145 | {"id": 1884},
146 | {"id": 1885},
147 | {"id": 1886},
148 | {"id": 1887},
149 | {"id": 1888},
150 | {"id": 1889},
151 | {"id": 1890},
152 | {"id": 1891},
153 | {"id": 1892},
154 | {"id": 1893},
155 | {"id": 1894},
156 | {"id": 1895},
157 | {"id": 1896},
158 | {"id": 1897},
159 | {"id": 1898},
160 | {"id": 1899},
161 | {"id": 1900},
162 | {"id": 1901},
163 | {"id": 1902},
164 | {"id": 1903},
165 | {"id": 1904},
166 | {"id": 1905},
167 | {"id": 1906},
168 | {"id": 1907},
169 | {"id": 1908},
170 | {"id": 1909},
171 | {"id": 1910},
172 | {"id": 1911},
173 | {"id": 1912},
174 | {"id": 1913},
175 | {"id": 1914},
176 | {"id": 1915},
177 | {"id": 1916},
178 | {"id": 1917},
179 | {"id": 1918},
180 | {"id": 1919},
181 | {"id": 1920},
182 | {"id": 1921},
183 | {"id": 1922},
184 | {"id": 1923},
185 | {"id": 1924},
186 | {"id": 1925},
187 | {"id": 1926},
188 | {"id": 1927},
189 | {"id": 1928},
190 | {"id": 1929},
191 | {"id": 1930},
192 | {"id": 1931},
193 | {"id": 1932},
194 | {"id": 1933},
195 | {"id": 1934},
196 | {"id": 1935},
197 | {"id": 1936},
198 | {"id": 1937},
199 | {"id": 1938},
200 | {"id": 1939},
201 | {"id": 1940},
202 | {"id": 1941},
203 | {"id": 1942},
204 | {"id": 1943},
205 | {"id": 1944},
206 | {"id": 1945},
207 | {"id": 1946},
208 | {"id": 1947},
209 | {"id": 1948},
210 | {"id": 1949},
211 | {"id": 1950},
212 | {"id": 1951},
213 | {"id": 1952},
214 | {"id": 1953},
215 | {"id": 1954},
216 | {"id": 1955},
217 | {"id": 1956},
218 | {"id": 1957},
219 | {"id": 1958},
220 | {"id": 1959},
221 | {"id": 1960},
222 | {"id": 1961},
223 | {"id": 1962},
224 | {"id": 1963},
225 | {"id": 1964},
226 | {"id": 1965},
227 | {"id": 1966},
228 | {"id": 1967},
229 | {"id": 1968},
230 | {"id": 1969},
231 | {"id": 1970},
232 | {"id": 1971},
233 | {"id": 1972},
234 | {"id": 1973},
235 | {"id": 1974},
236 | {"id": 1975},
237 | {"id": 1976},
238 | {"id": 1977},
239 | {"id": 1978},
240 | {"id": 1979},
241 | {"id": 1980},
242 | {"id": 1981},
243 | {"id": 1982},
244 | {"id": 1983},
245 | {"id": 1984},
246 | {"id": 1985},
247 | {"id": 1986},
248 | {"id": 1987},
249 | {"id": 1988},
250 | {"id": 1989},
251 | {"id": 1990},
252 | {"id": 1991},
253 | {"id": 1992},
254 | {"id": 1993},
255 | {"id": 1994},
256 | {"id": 1995},
257 | {"id": 1996},
258 | {"id": 1997},
259 | {"id": 1998},
260 | {"id": 1999},
261 | {"id": 2000},
262 | {"id": 2001},
263 | {"id": 2002},
264 | {"id": 2003},
265 | ],
266 | },
267 | "entities": {
268 | "type": "int",
269 | "values": [{"id": 13, "name": "United States", "code": "USA"}],
270 | },
271 | },
272 | }
273 |
274 |
275 | TEST_RESPONSE_JSON = {
276 | "country": ["Afghanistan", "Afghanistan"],
277 | "population": [3280000.0, 4207000.0],
278 | "year": [1820, 1870],
279 | }
280 |
281 |
282 | def test_dataset_data_for_etl_table_json_format():
283 |     # this test requires a connection to the database; this is only temporary and will change once we
284 |     # start getting metadata from the catalog instead of the database
285 | response = client.get(
286 | "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp.json",
287 | params={"limit": 2, "columns": "year,country,population"},
288 | )
289 | assert response.status_code == 200
290 | assert response.json() == TEST_RESPONSE_JSON
291 |
292 |
293 | def test_dataset_data_for_etl_table_csv_format():
294 | response = client.get(
295 | "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp.csv",
296 | params={"limit": 2, "columns": "year,country,population"},
297 | )
298 | assert response.status_code == 200
299 | df = pd.read_csv(io.StringIO(response.text))
300 | assert df.to_dict(orient="list") == TEST_RESPONSE_JSON
301 |
302 |
303 | def test_dataset_data_for_etl_table_feather_format():
304 | response = client.get(
305 | "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp.feather",
306 | params={"limit": 2, "columns": "year,country,population"},
307 | )
308 | assert response.status_code == 200
309 | df = pd.read_feather(io.BytesIO(response.content))
310 | assert df.to_dict(orient="list") == TEST_RESPONSE_JSON
311 |
312 |
313 | def test_dataset_metadata_for_etl_table():
314 | response = client.get(
315 | "/v1/dataset/metadata/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp",
316 | params={"limit": 2},
317 | )
318 | assert response.status_code == 200
319 | js = response.json()
320 |
321 | # trim long fields
322 | js["dataset"]["description"] = js["dataset"]["description"][:20]
323 |
324 | assert js == {
325 | "dataset": {
326 | "channel": "garden",
327 | "namespace": "ggdc",
328 | "short_name": "ggdc_maddison",
329 | "title": "Maddison Project Database (GGDC, 2020)",
330 | "description": "Notes:\n- Tanzania re",
331 | "sources": [
332 | {
333 | "name": "Maddison Project Database 2020 (Bolt and van Zanden, 2020)",
334 | "url": "https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020",
335 | "source_data_url": "https://www.rug.nl/ggdc/historicaldevelopment/maddison/data/mpd2020.xlsx",
336 | "owid_data_url": "https://walden.nyc3.digitaloceanspaces.com/ggdc/2020-10-01/ggdc_maddison.xlsx",
337 | "date_accessed": "2022-04-12",
338 | "publication_date": "2020-10-01",
339 | "publication_year": 2020,
340 | "published_by": "Bolt, Jutta and Jan Luiten van Zanden (2020), “Maddison style estimates of the evolution of the world economy. A new 2020 update“.",
341 | "publisher_source": "The Maddison Project Database is based on the work of many researchers that have produced estimates of\neconomic growth for individual countries. The full list of sources for this historical data is given for each country below.\n",
342 | }
343 | ],
344 | "licenses": [
345 | {
346 | "name": "Creative Commons BY 4.0",
347 | "url": "https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020",
348 | }
349 | ],
350 | "is_public": True,
351 | "checksum": "cc6d7a0cf74c962c4f0ac9d1d019a747",
352 | "version": "2020-10-01",
353 | },
354 | "table": {
355 | "table_name": "maddison_gdp",
356 | "dataset_name": "ggdc_maddison",
357 | "version": "2020-10-01",
358 | "namespace": "ggdc",
359 | "channel": "garden",
360 | "dimensions": ["country", "year"],
361 | "path": "garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp",
362 | "format": "feather",
363 | "is_public": True,
364 | },
365 | "variables": [
366 | {
367 | "title": "GDP per capita",
368 | "description": None,
369 | "licenses": [],
370 | "sources": [],
371 | "unit": "2011 int-$",
372 | "short_unit": "$",
373 | "display": {
374 | "entityAnnotationsMap": "Western Offshoots: United States, Canada, Australia and New Zealand",
375 | "numDecimalPlaces": 0,
376 | },
377 | "short_name": "gdp_per_capita",
378 | "table_path": "garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp",
379 | "dataset_short_name": "ggdc_maddison",
380 | "variable_type": "FLOAT",
381 | },
382 | {
383 | "title": "Population",
384 | "description": None,
385 | "licenses": [],
386 | "sources": [],
387 | "unit": "people",
388 | "short_unit": None,
389 | "display": {
390 | "entityAnnotationsMap": "Western Offshoots: United States, Canada, Australia and New Zealand"
391 | },
392 | "short_name": "population",
393 | "table_path": "garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp",
394 | "dataset_short_name": "ggdc_maddison",
395 | "variable_type": "FLOAT",
396 | },
397 | {
398 | "title": "GDP",
399 | "description": "Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) and price differences between countries. Calculated by multiplying GDP per capita with population.",
400 | "licenses": [],
401 | "sources": [],
402 | "unit": "2011 int-$",
403 | "short_unit": "$",
404 | "display": {
405 | "entityAnnotationsMap": "Western Offshoots: United States, Canada, Australia and New Zealand",
406 | "numDecimalPlaces": 0,
407 | },
408 | "short_name": "gdp",
409 | "table_path": "garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp",
410 | "dataset_short_name": "ggdc_maddison",
411 | "variable_type": "FLOAT",
412 | },
413 | ],
414 | }
415 |
416 |
417 | def test_dataset_metadata_for_backported_table():
418 | response = client.get(
419 | "/v1/dataset/metadata/backport/owid/latest/dataset_941_technology_adoption__isard__1942__and_others/dataset_941_technology_adoption__isard__1942__and_others",
420 | )
421 | assert response.status_code == 200
422 |     response.json()  # smoke test: just check that the body parses as JSON
423 |
424 |
425 | def test_search():
426 | response = client.get(
427 | "/v1/search",
428 | params={"term": "population"},
429 | )
430 | assert response.status_code == 200
431 | js = response.json()
432 | assert js == {
433 | "results": [
434 | {
435 | "variable_name": "population",
436 | "variable_title": "Population",
437 | "variable_description": "nan",
438 | "variable_unit": "people",
439 | "table_name": "maddison_gdp",
440 | "dataset_title": "Maddison Project Database (GGDC, 2020)",
441 | "channel": "garden",
442 | "metadata_url": "/v1/dataset/metadata/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp",
443 | "data_url": "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp",
444 | "match": 1.8276277047674334,
445 | },
446 | {
447 | "variable_name": "gdp",
448 | "variable_title": "GDP",
449 | "variable_description": "Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) and price differences between countries. Calculated by multiplying GDP per capita with population.",
450 | "variable_unit": "2011 int-$",
451 | "table_name": "maddison_gdp",
452 | "dataset_title": "Maddison Project Database (GGDC, 2020)",
453 | "channel": "garden",
454 | "metadata_url": "/v1/dataset/metadata/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp",
455 | "data_url": "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp",
456 | "match": 1.5464542117262898,
457 | },
458 | ]
459 | }
460 |
--------------------------------------------------------------------------------
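
tests/test_v1.py exercises the API in-process through FastAPI's TestClient after pointing settings.DUCKDB_PATH at the sample database, so no running server is required. A minimal sketch of reusing that pattern for an additional, hypothetical check (not part of the repository): the CSV and Feather exports of the same table should decode to the same frame.

import io
from pathlib import Path

import pandas as pd
from fastapi.testclient import TestClient

from app.main import app, settings

settings.DUCKDB_PATH = Path("tests/sample_duck.db")  # reuse the sample fixture
client = TestClient(app)


def test_csv_and_feather_agree():
    # Hypothetical extra check: both export formats should yield the same data.
    base = "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp"
    params = {"limit": 2, "columns": "year,country,population"}
    csv_df = pd.read_csv(io.StringIO(client.get(f"{base}.csv", params=params).text))
    feather_df = pd.read_feather(io.BytesIO(client.get(f"{base}.feather", params=params).content))
    # CSV round-trips lose dtype fidelity, so compare values rather than dtypes.
    pd.testing.assert_frame_equal(csv_df, feather_df, check_dtype=False)
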