├── .env.example ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── app ├── __init__.py ├── core │ ├── __init__.py │ └── config.py ├── main.py ├── utils.py └── v1 │ ├── __init__.py │ ├── data.py │ ├── lists.py │ ├── metadata.py │ ├── schemas.py │ └── search.py ├── crawler ├── __init__.py ├── crawl.py ├── duckdb_models.py ├── full_text_index.py ├── query_duckdb.ipynb └── utils.py ├── data_api └── __init__.py ├── default.mk ├── demo ├── README.md ├── demo.py ├── requirements.txt └── static │ └── style.css ├── docker-compose.yaml ├── explore-duckdb.ipynb ├── nbinit.py ├── poetry.lock ├── poetry.toml ├── profiling └── profile_formats.ipynb ├── pyproject.toml ├── setup.cfg └── tests ├── __init__.py ├── crawler └── test_crawl.py ├── sample_duck.db └── test_v1.py /.env.example: -------------------------------------------------------------------------------- 1 | PROJECT_NAME=data-api 2 | BACKEND_CORS_ORIGINS=["http://localhost:8000", "https://localhost:8000", "http://localhost", "https://localhost"] 3 | 4 | DUCKDB_PATH=duck.db 5 | 6 | OWID_CATALOG_DIR=/Users/mojmir/projects/etl/data 7 | 8 | # optional 9 | # BUGSNAG_API_KEY 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # pytype static type analyzer 134 | .pytype/ 135 | 136 | # Cython debug symbols 137 | cython_debug/ 138 | 139 | # Text Editor 140 | .vscode 141 | 142 | 143 | # Custom 144 | .coverage 145 | **/__pycache__ 146 | .virtual_documents 147 | **/.ipynb_checkpoints 148 | .submodule-init 149 | .mypy-cache 150 | .pytest-cache 151 | .python-version 152 | .venv 153 | __pycache__ 154 | .ipynb_checkpoints 155 | .env 156 | *.dot 157 | *.pdf 158 | pyrightconfig.json 159 | .DS_Store 160 | ign.* 161 | .idea 162 | 163 | api.err 164 | demo.err 165 | api.log 166 | demo.log 167 | 168 | duck.db 169 | duck.db.wal 170 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vendor/owid-catalog-py"] 2 | path = vendor/owid-catalog-py 3 | url = git@github.com:owid/owid-catalog-py.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/myint/autoflake 3 | rev: v1.4 4 | hooks: 5 | - id: autoflake 6 | exclude: .*/__init__.py 7 | args: 8 | - --in-place 9 | - --remove-all-unused-imports 10 | - --expand-star-imports 11 | - --remove-duplicate-keys 12 | - --remove-unused-variables 13 | - repo: local 14 | hooks: 15 | - id: flake8 16 | name: flake8 17 | entry: flake8 18 | language: system 19 | types: [python] 20 | - repo: https://github.com/pre-commit/mirrors-isort 21 | rev: v5.4.2 22 | hooks: 23 | - id: isort 24 | args: ["--profile", "black"] 25 | - repo: local 26 | hooks: 27 | - id: mypy 28 | name: mypy 29 | entry: mypy 30 | language: system 31 | types: [python] 32 | - repo: https://github.com/pre-commit/pre-commit-hooks 33 | rev: v3.3.0 34 | hooks: 35 | - id: trailing-whitespace 36 | - id: end-of-file-fixer 37 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tiangolo/uvicorn-gunicorn-fastapi:python3.9 2 | 3 | ENV PYTHONPATH "${PYTHONPATH}:/" 4 | ENV PORT=8000 5 | 6 | # Install Poetry 7 | RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | POETRY_HOME=/opt/poetry python && \ 8 | cd /usr/local/bin && \ 9 | ln -s /opt/poetry/bin/poetry && \ 10 | poetry config virtualenvs.create false 11 | 12 | # Copy using poetry.lock* in case it doesn't exist yet 13 | COPY ./pyproject.toml ./poetry.lock* /app/ 14 | 15 | RUN poetry install --no-root --no-dev 16 | 17 | COPY ./app /app 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Global Change Data Lab 4 | 5 | Permission is hereby granted, free of 
charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Makefile 3 | # 4 | 5 | .PHONY: etl 6 | 7 | include default.mk 8 | 9 | SRC = app crawler tests 10 | 11 | help: 12 | @echo 'Available commands:' 13 | @echo 14 | @echo ' make crawl Crawl ETL catalog' 15 | @echo ' make api Run API server' 16 | @echo ' make test Run all linting and unit tests' 17 | @echo ' make testdb Rebuild test DB' 18 | @echo ' make watch Run all tests, watching for changes' 19 | @echo ' make clobber Delete non-reference data and .venv' 20 | @echo ' make run Run API and Catalog in the background' 21 | @echo 22 | 23 | 24 | watch-all: 25 | .venv/bin/watchmedo shell-command -c 'clear; make unittest; (cd vendor/owid-catalog-py && make unittest)' --recursive --drop . 26 | 27 | test-all: test 28 | cd vendor/owid-catalog-py && make test 29 | 30 | watch: .venv 31 | .venv/bin/watchmedo shell-command -c 'clear; make check-formatting lint check-typing coverage' --recursive --drop . 32 | 33 | .submodule-init: 34 | @echo '==> Initialising submodules' 35 | git submodule update --init 36 | touch $@ 37 | 38 | .venv: pyproject.toml poetry.toml poetry.lock .submodule-init 39 | @echo '==> Copy .env.example to .env if missing' 40 | cp -n .env.example .env || true 41 | @echo '==> Installing packages' 42 | poetry install 43 | touch $@ 44 | 45 | check-typing: .venv 46 | # @echo '==> Checking types' 47 | # .venv/bin/mypy $(SRC) 48 | @echo '==> WARNING: Checking types is disabled!' 49 | 50 | coverage: .venv 51 | @echo '==> Unit testing with coverage' 52 | .venv/bin/pytest --cov=app --cov-report=term-missing tests 53 | 54 | crawl: .venv 55 | @echo '==> Crawl ETL catalog' 56 | poetry run crawl 57 | 58 | crawl-backported: .venv 59 | @echo '==> Crawl backported ETL catalog' 60 | poetry run crawl --include dataset_ 61 | 62 | api: .venv 63 | @echo '==> Running API' 64 | .venv/bin/hypercorn app.main:app --reload 65 | 66 | testdb: .venv 67 | @echo '==> Rebuild test DB' 68 | rm -f tests/sample_duck.db 69 | poetry run crawl --include 'dataset_941|ggdc_maddison' --duckdb-path tests/sample_duck.db 70 | 71 | clobber: clean 72 | find . -name .venv | xargs rm -rf 73 | find . 
-name .mypy_cache | xargs rm -rf 74 | 75 | run: .venv 76 | @echo 'Running API and Catalog in the background:' 77 | -kill $$(lsof -t -i:8000) 78 | -kill $$(lsof -t -i:8001) 79 | nohup make api > api.log 2> api.err < /dev/null & 80 | nohup .venv/bin/python -m demo.demo > demo.log 2> demo.err < /dev/null & 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data-api 2 | 3 | _API for accessing data from our data catalog._ 4 | 5 | **Status**: experimental 6 | 7 | ## Overview 8 | 9 | Our World in Data is trying to build a new data layer for our charts and visualisations based on a repeatable and transparent data pipeline, our [etl](https://github.com/owid/etl). The ETL generates and publishes the latest version of our data catalog to S3. 10 | 11 | This project adds two components: a crawler and a web API. The crawler walks the data catalog and generates a local DuckDB database with the contents. The Dynamic API then provides RESTful access to the data, including SQL support thanks to DuckDB. 12 | 13 | ```mermaid 14 | graph TB 15 | ETL -->|generates| catalog[Data catalog] 16 | crawler(Crawler):::here -->|reads| catalog 17 | site[OWID site] -.->|queries| api 18 | api(Dynamic API):::here -->|queries| db[DuckDB cache] 19 | crawler -->|generates| db 20 | 21 | classDef here stroke-width:4px; 22 | ``` 23 | 24 | ## Developing 25 | 26 | You need Python 3.10 and `poetry` installed to get started. 27 | 28 | ### Running tests 29 | 30 | To run all the checks and make sure you have everything set up correctly, try 31 | 32 | ``` 33 | make test 34 | ``` 35 | 36 | ### Crawling the catalog 37 | 38 | The crawler is a script that goes through all backported datasets and replicates them to a local DuckDB database. It creates tables `meta_datasets`, `meta_tables`, and `meta_variables` in DuckDB with all metadata, and it also replicates the tables from the ETL catalog there. 39 | 40 | Table names are underscored table paths, e.g. path `backport/owid/latest/dataset_941_technology_adoption__isard__1942__and_others/dataset_941_technology_adoption__isard__1942__and_others` gets table name `backport__owid__latest__dataset_941_technology_adoption__isard__1942__and_others__dataset_941_technology_adoption__isard__1942__and_others`. This is unnecessarily verbose, but it doesn't matter for now. 41 | 42 | The crawler compares checksums of **datasets** to decide whether a dataset needs to be updated. We cannot do it at the table level because we don't use table checksums. 43 | 44 | We only crawl the `garden` and `backport` channels right now. 45 | 46 | Run `make crawl` to crawl the entire catalog (this would take a very long time) or crawl only sample datasets with 47 | 48 | ``` 49 | poetry run crawl --include 'dataset_941|ggdc_maddison' 50 | ``` 51 | 52 | or just the garden channel 53 | 54 | ``` 55 | poetry run crawl --include 'garden' 56 | ``` 57 | 58 | ### Running the API 59 | 60 | Copy `.env.example` into `.env` and update it as you like. After you build `duck.db` with the crawler, run the API with `hypercorn app.main:app --reload`. 61 | 62 | Docs are available at http://127.0.0.1:8000/v1/docs. 
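If you prefer to poke at the running API from Python instead of the command line, here is a minimal client sketch. It assumes the server is running locally on port 8000, that variable `42539` is present in your crawled `duck.db` (it is the variable used in the sample queries below), and that `requests` and `pandas` are available in your environment (they are not declared as project dependencies).

```python
import pandas as pd
import requests

BASE = "http://127.0.0.1:8000"

# quick health check
print(requests.get(f"{BASE}/health").json())

# data and metadata for a single backported variable
data = requests.get(f"{BASE}/v1/variableById/data/42539").json()
meta = requests.get(f"{BASE}/v1/variableById/metadata/42539").json()

# the data endpoint returns columnar lists: years, entity_names, entities, entity_codes, values
df = pd.DataFrame(data)
print(meta["name"], df.shape)
```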
63 | 64 | ### Sample Queries 65 | 66 | Sample queries written in [httpie](https://httpie.io/) 67 | 68 | ``` 69 | http GET http://127.0.0.1:8000/health 70 | http GET http://127.0.0.1:8000/v1/variableById/data/42539 71 | http GET http://127.0.0.1:8000/v1/variableById/metadata/42539 72 | http GET http://127.0.0.1:8000/v1/dataset/data/garden/owid/latest/covid/covid.csv 73 | http GET http://127.0.0.1:8000/v1/dataset/metadata/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp 74 | http GET http://127.0.0.1:8000/v1/dataset/data/backport/owid/latest/dataset_5576_ggdc_maddison__2020_10_01/dataset_5576_ggdc_maddison__2020_10_01.feather 75 | http POST http://127.0.0.1:8000/v1/sql sql=="PRAGMA show_tables;" type==csv 76 | http POST http://127.0.0.1:8000/v1/sql sql=="select * from garden__ggdc__2020_10_01__ggdc_maddison__maddison_gdp limit 10;" type==csv 77 | ``` 78 | 79 | ## Tests 80 | 81 | Integration tests work with sample data saved in `tests/sample_duck.db`. Regenerate it with `make testdb`. 82 | 83 | ## Development 84 | 85 | It is useful to recreate sample DB for testing and run tests right after that for debugging with 86 | 87 | ``` 88 | make testdb && pytest -s tests/test_v1.py 89 | ``` 90 | 91 | ## Full-text search 92 | 93 | - all variables are given the same weight, we should reconsider that 94 | - negation queries are not supported yet (could be useful for interactive exclusion of datasets) 95 | 96 | ## Future considerations 97 | 98 | This project was generated via [manage-fastapi](https://github.com/ycd/manage-fastapi/). We might re-generate the project with a [different template](https://fastapi.tiangolo.com/advanced/templates/) based on our production requirements. 99 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/app/__init__.py -------------------------------------------------------------------------------- /app/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/app/core/__init__.py -------------------------------------------------------------------------------- /app/core/config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List, Optional, Union 3 | 4 | from pydantic import AnyHttpUrl, BaseSettings, validator 5 | 6 | 7 | class Settings(BaseSettings): 8 | PROJECT_NAME: str 9 | BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = [] 10 | 11 | DUCKDB_PATH: Path = Path("duck.db") 12 | 13 | DUCKDB_MEMORY_LIMIT = "2GB" 14 | 15 | BUGSNAG_API_KEY: Optional[str] = None 16 | 17 | OWID_CATALOG_DIR: Path 18 | 19 | @validator("BACKEND_CORS_ORIGINS", pre=True) 20 | def assemble_cors_origins(cls, v: Union[str, List[str]]) -> Union[List[str], str]: 21 | if isinstance(v, str) and not v.startswith("["): 22 | return [i.strip() for i in v.split(",")] 23 | elif isinstance(v, (list, str)): 24 | return v 25 | raise ValueError(v) 26 | 27 | class Config: 28 | case_sensitive = True 29 | env_file = ".env" 30 | 31 | 32 | settings = Settings() 33 | -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import bugsnag 4 | import structlog 
5 | from bugsnag.asgi import BugsnagMiddleware 6 | from fastapi import FastAPI 7 | from fastapi.middleware.cors import CORSMiddleware 8 | 9 | from app.core.config import settings 10 | from app.v1 import v1 11 | 12 | log = structlog.get_logger() 13 | 14 | bugsnag.configure( 15 | api_key=settings.BUGSNAG_API_KEY, 16 | ) 17 | 18 | 19 | def get_application(): 20 | _app = FastAPI(title=settings.PROJECT_NAME) 21 | 22 | _app.add_middleware( 23 | CORSMiddleware, 24 | allow_origins=[str(origin) for origin in settings.BACKEND_CORS_ORIGINS], 25 | allow_credentials=True, 26 | allow_methods=["*"], 27 | allow_headers=["*"], 28 | ) 29 | 30 | _app.add_middleware( 31 | BugsnagMiddleware, 32 | ) 33 | 34 | return _app 35 | 36 | 37 | app = get_application() 38 | 39 | # mount subapplications as versions 40 | app.mount("/v1", v1) 41 | 42 | 43 | @app.get("/health") 44 | def health() -> dict: 45 | return { 46 | "status": "ok", 47 | "thread_id": str(threading.get_ident()), 48 | } 49 | -------------------------------------------------------------------------------- /app/utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from typing import Any 3 | 4 | import duckdb 5 | import orjson 6 | import pandas as pd 7 | import structlog 8 | from fastapi.responses import JSONResponse 9 | 10 | from app.core.config import settings 11 | 12 | log = structlog.get_logger() 13 | 14 | 15 | class ORJSONResponse(JSONResponse): 16 | """It serializes dataclass, datetime, numpy, and UUID instances natively.""" 17 | 18 | media_type = "application/json" 19 | 20 | def render(self, content: Any) -> bytes: 21 | return orjson.dumps(content) 22 | 23 | 24 | @functools.cache 25 | def get_readonly_connection(thread_id: int) -> duckdb.DuckDBPyConnection: 26 | # duckdb connection is not threadsafe, we have to create one connection per thread 27 | log.info("duckdb.new_connection", thread_id=thread_id) 28 | con = duckdb.connect( 29 | database=settings.DUCKDB_PATH.as_posix(), 30 | read_only=True, 31 | config={"memory_limit": settings.DUCKDB_MEMORY_LIMIT}, 32 | ) 33 | return con 34 | 35 | 36 | def omit_nullable_values(d: dict) -> dict: 37 | return {k: v for k, v in d.items() if v is not None and not pd.isna(v)} 38 | -------------------------------------------------------------------------------- /app/v1/__init__.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | 3 | from app import utils 4 | 5 | from .data import router as data_router 6 | from .lists import router as lists_router 7 | from .metadata import router as metadata_router 8 | from .search import router as search_router 9 | 10 | v1 = FastAPI(default_response_class=utils.ORJSONResponse) 11 | 12 | v1.include_router(metadata_router) 13 | v1.include_router(data_router) 14 | v1.include_router(search_router) 15 | v1.include_router(lists_router) 16 | -------------------------------------------------------------------------------- /app/v1/data.py: -------------------------------------------------------------------------------- 1 | import io 2 | import threading 3 | from typing import Any, Literal, Optional, cast 4 | 5 | import pandas as pd 6 | import pyarrow as pa 7 | import structlog 8 | from fastapi import APIRouter, Header, HTTPException, Response 9 | from fastapi.responses import StreamingResponse 10 | from pyarrow.feather import write_feather 11 | 12 | from app import utils 13 | from app.main import settings 14 | 15 | from .schemas import VariableDataResponse 16 | 17 
| log = structlog.get_logger() 18 | 19 | 20 | DATA_TYPES = Literal["csv", "feather", "feather_direct", "json"] 21 | 22 | router = APIRouter() 23 | 24 | 25 | def _read_sql_bytes(con, sql: str, parameters) -> io.BytesIO: 26 | """Execute SQL and return BytesIO object with byte data.""" 27 | sink = io.BytesIO() 28 | 29 | batch_iterator = con.execute(sql, parameters=parameters).fetch_record_batch( 30 | chunk_size=1000 31 | ) 32 | with pa.ipc.new_file(sink, batch_iterator.schema) as writer: 33 | for rb in batch_iterator: 34 | writer.write_batch(rb) 35 | 36 | sink.seek(0) 37 | 38 | return sink 39 | 40 | 41 | def _bytes_to_response(bytes_io: io.BytesIO) -> StreamingResponse: 42 | # NOTE: using raw `bytes_io` should be in theory faster than `iter([bytes_io.getvalue()])`, yet 43 | # it is much slower for unknown reasons 44 | # response = StreamingResponse(bytes_io, media_type="application/octet-stream") 45 | response = StreamingResponse( 46 | iter([bytes_io.getvalue()]), media_type="application/octet-stream" 47 | ) 48 | response.headers["Content-Disposition"] = "attachment; filename=owid.feather" 49 | return response 50 | 51 | 52 | def _sql_to_response( 53 | con, sql: str, type: DATA_TYPES, parameters: list[Any] = [] 54 | ) -> Any: 55 | # read data in feather format and return it directly in response 56 | # NOTE: should be the fastest in theory, but is really slow for unknown reasons 57 | if type == "feather_direct": 58 | # WARNING: this does not support categorical variables and raises `pyarrow.lib.ArrowTypeError` 59 | # when you try to read it with pandas (see https://github.com/duckdb/duckdb/issues/4130) 60 | # we'd have to either convert categoricals into strings or change format while still in arrow format 61 | bytes_io = _read_sql_bytes(con, sql, parameters=parameters) 62 | return _bytes_to_response(bytes_io) 63 | 64 | # read data into dataframe and then convert to feather 65 | elif type == "feather": 66 | bytes_io = io.BytesIO() 67 | df = con.execute(sql, parameters=parameters).fetch_df() 68 | write_feather(df, bytes_io) 69 | return _bytes_to_response(bytes_io) 70 | 71 | # read data into dataframe and then convert to csv 72 | elif type == "csv": 73 | df = con.execute(sql, parameters=parameters).fetch_df() 74 | 75 | str_stream = io.StringIO() 76 | df.to_csv(str_stream, index=False) 77 | 78 | return StreamingResponse(iter([str_stream.getvalue()]), media_type="text/csv") 79 | 80 | # read data into dataframe and then convert to json 81 | elif type == "json": 82 | # NOTE: we could also do this directly from pyarrow, but it is slower than to pandas 83 | # for some reason 84 | # return con.execute(sql, parameters=parameters).fetch_arrow_table().to_pydict() 85 | 86 | df = con.execute(sql, parameters=parameters).fetch_df() 87 | 88 | # TODO: converting to lists and then ormjson is slow, we could instead 89 | # convert to numpy arrays on which ormjson is super fast 90 | return df.to_dict(orient="list") 91 | 92 | else: 93 | raise HTTPException(status_code=400, detail=f"unknown type {type}") 94 | 95 | 96 | @router.post("/sql") 97 | def sql_query(sql: str, type: DATA_TYPES = "csv"): 98 | """Run arbitrary query on top of our database.""" 99 | con = utils.get_readonly_connection(threading.get_ident()) 100 | return _sql_to_response(con, sql, type) 101 | 102 | 103 | # QUESTION: how about /variable/{variable_id}/data? 
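# NOTE (illustrative sketch, not used by the API itself): the endpoint below sets
# an `ETag` header to the dataset checksum and honours `If-None-Match`, so clients
# can cache responses and revalidate them cheaply. A hypothetical client using
# `requests` (the base URL and variable id here are assumptions, not project code):
#
#   import requests
#
#   BASE = "http://127.0.0.1:8000"
#   first = requests.get(f"{BASE}/v1/variableById/data/42539")
#   etag = first.headers["ETag"]
#
#   # replay the checksum; if the dataset is unchanged, the server answers 304
#   again = requests.get(
#       f"{BASE}/v1/variableById/data/42539",
#       headers={"If-None-Match": etag},
#   )
#   assert again.status_code == 304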
104 | @router.get( 105 | "/variableById/data/{variable_id}", 106 | response_model=VariableDataResponse, 107 | response_model_exclude_unset=True, 108 | ) 109 | def data_for_backported_variable( 110 | response: Response, 111 | variable_id: int, 112 | limit: Optional[int] = None, 113 | if_none_match: Optional[str] = Header(default=None), 114 | ): 115 | """Fetch data for a single variable.""" 116 | 117 | con = utils.get_readonly_connection(threading.get_ident()) 118 | 119 | # get meta about variable 120 | q = """ 121 | select 122 | v.variable_id as variable_id, 123 | v.short_name as short_name, 124 | v.table_path as table_path, 125 | d.checksum as checksum 126 | from meta_variables v 127 | join meta_tables t on v.table_path = t.path 128 | join meta_datasets d on d.path = t.dataset_path 129 | where variable_id = (?) 130 | """ 131 | df = cast(pd.DataFrame, con.execute(q, parameters=[variable_id]).fetch_df()) 132 | _assert_single_variable(df.shape[0], variable_id) 133 | r = dict(df.iloc[0]) 134 | 135 | # this is the dataset level checksum which is the best we have 136 | # at the moment 137 | checksum = r["checksum"] 138 | 139 | # if the client sent a IF-NONE-MATCH header, check if it matches the checksum 140 | if if_none_match == checksum: 141 | response.status_code = 304 142 | return 143 | 144 | # Send the checksum as the etag header and set cache-control to cache with 145 | # max-age of 0 (which makes the client validate with the if-none-match header) 146 | response.headers["ETag"] = checksum 147 | response.headers[ 148 | "Cache-Control" 149 | ] = "max-age=0" # We could consider allowing a certain time window 150 | 151 | parquet_path = (settings.OWID_CATALOG_DIR / r["table_path"]).with_suffix(".parquet") 152 | 153 | # TODO: DuckDB / SQLite doesn't allow parameterized table or column names, how do we escape it properly? 154 | # is it even needed if we get them from our DB and it is read-only? 155 | q = f""" 156 | select 157 | year as years, 158 | entity_name as entity_names, 159 | entity_id as entities, 160 | entity_code as entity_codes, 161 | {r["short_name"]} as values 162 | from read_parquet('{parquet_path}') 163 | where {r["short_name"]} is not null 164 | """ 165 | parameters = [] 166 | if limit: 167 | q += "limit (?)" 168 | parameters.append(limit) 169 | df = cast(pd.DataFrame, con.execute(q, parameters=parameters).fetch_df()) 170 | 171 | return df.to_dict(orient="list") 172 | 173 | 174 | # NOTE: it might be more intuitive to have paths like this 175 | # /dataset/{channel}/{namespace}/{version}/{dataset}/{table}/data.{type} 176 | # and 177 | # /dataset/{channel}/{namespace}/{version}/{dataset}/{table}/metadata 178 | # especially for browsing catalog tree in `lists.py` 179 | @router.get( 180 | "/dataset/data/{channel}/{namespace}/{version}/{dataset}/{table}.{type}", 181 | ) 182 | def data_for_etl_table( 183 | channel: str, 184 | namespace: str, 185 | version: str, 186 | dataset: str, 187 | table: str, 188 | columns: str = "*", 189 | limit: int = 1000000000, 190 | type: DATA_TYPES = "csv", 191 | ): 192 | """Fetch data for a table.""" 193 | 194 | con = utils.get_readonly_connection(threading.get_ident()) 195 | table_path = ( 196 | settings.OWID_CATALOG_DIR 197 | / f"{channel}/{namespace}/{version}/{dataset}/{table}.parquet" 198 | ) 199 | 200 | sql = f""" 201 | select 202 | {columns} 203 | from read_parquet('{table_path}') 204 | limit (?) 
205 | """ 206 | 207 | con = utils.get_readonly_connection(threading.get_ident()) 208 | return _sql_to_response(con, sql, type, [limit]) 209 | 210 | 211 | def _assert_single_variable(n, variable_id): 212 | if n == 0: 213 | raise HTTPException( 214 | status_code=404, detail=f"variable_id {variable_id} not found" 215 | ) 216 | elif n > 1: 217 | # raise internal error 218 | raise Exception( 219 | f"multiple variables found for variable_id {variable_id}, this should not happen" 220 | ) 221 | -------------------------------------------------------------------------------- /app/v1/lists.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | from fastapi import APIRouter 4 | 5 | from app import utils 6 | 7 | 8 | router = APIRouter() 9 | 10 | 11 | @router.get( 12 | "/datasets", 13 | ) 14 | def list_all_datasets(): 15 | con = utils.get_readonly_connection(threading.get_ident()) 16 | sql = """ 17 | select title from meta_datasets 18 | """ 19 | df = con.execute(sql).fetch_df() 20 | return {"datasets": list(df.title)} 21 | 22 | 23 | @router.get( 24 | "/dataset/data", 25 | ) 26 | def list_channels(): 27 | """List all available channels.""" 28 | 29 | con = utils.get_readonly_connection(threading.get_ident()) 30 | sql = """ 31 | select distinct channel from meta_tables 32 | """ 33 | df = con.execute(sql).fetch_df() 34 | return {"channels": list(df.channel)} 35 | 36 | 37 | @router.get( 38 | "/dataset/data/{channel}", 39 | ) 40 | def list_namespaces(channel: str): 41 | """List all available namespaces.""" 42 | 43 | con = utils.get_readonly_connection(threading.get_ident()) 44 | sql = """ 45 | select distinct namespace from meta_tables 46 | where channel = (?) 47 | """ 48 | df = con.execute(sql, parameters=[channel]).fetch_df() 49 | return {"namespaces": list(df.namespace)} 50 | 51 | 52 | @router.get( 53 | "/dataset/data/{channel}/{namespace}", 54 | ) 55 | def list_versions(channel: str, namespace: str): 56 | """List all available versions.""" 57 | 58 | con = utils.get_readonly_connection(threading.get_ident()) 59 | sql = """ 60 | select distinct version from meta_tables 61 | where channel = (?) and namespace = (?) 62 | """ 63 | df = con.execute(sql, parameters=[channel, namespace]).fetch_df() 64 | return {"versions": list(df.version)} 65 | 66 | 67 | @router.get( 68 | "/dataset/data/{channel}/{namespace}/{version}", 69 | ) 70 | def list_datasets(channel: str, namespace: str, version: str): 71 | """List all available datasets.""" 72 | 73 | con = utils.get_readonly_connection(threading.get_ident()) 74 | sql = """ 75 | select distinct dataset_name from meta_tables 76 | where channel = (?) and namespace = (?) and version = (?) 77 | """ 78 | df = con.execute(sql, parameters=[channel, namespace, version]).fetch_df() 79 | return {"datasets": list(df.dataset_name)} 80 | 81 | 82 | @router.get( 83 | "/dataset/data/{channel}/{namespace}/{version}/{dataset}", 84 | ) 85 | def list_tables(channel: str, namespace: str, version: str, dataset: str): 86 | """List all available tables.""" 87 | 88 | con = utils.get_readonly_connection(threading.get_ident()) 89 | sql = """ 90 | select distinct table_name from meta_tables 91 | where channel = (?) and namespace = (?) and version = (?) and dataset_name = (?) 
92 | """ 93 | df = con.execute(sql, parameters=[channel, namespace, version, dataset]).fetch_df() 94 | return {"tables": list(df.table_name)} 95 | -------------------------------------------------------------------------------- /app/v1/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import threading 3 | from typing import Any, Dict, Optional, cast 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import structlog 8 | from fastapi import APIRouter, Header, HTTPException, Response 9 | 10 | from app import utils 11 | 12 | from .schemas import ( 13 | Dimension, 14 | DimensionProperties, 15 | VariableMetadataResponse, 16 | VariableSource, 17 | ) 18 | 19 | log = structlog.get_logger() 20 | 21 | 22 | router = APIRouter() 23 | 24 | # NOTE: duckdb also supports python relations, would it be helpful? 25 | # https://github.com/duckdb/duckdb/blob/master/examples/python/duckdb-python.py 26 | 27 | 28 | @router.get( 29 | "/dataset/metadata/{channel}/{namespace}/{version}/{dataset}/{table}", 30 | # response_model=VariableMetadataResponse, 31 | # response_model_exclude_unset=True, 32 | ) 33 | def metadata_for_etl_variable( 34 | channel: str, 35 | namespace: str, 36 | version: str, 37 | dataset: str, 38 | table: str, 39 | ): 40 | table_path = f"{channel}/{namespace}/{version}/{dataset}/{table}" 41 | 42 | con = utils.get_readonly_connection(threading.get_ident()) 43 | 44 | vf = _metadata_etl_variables(con, table_path) 45 | tf = _metadata_etl_table(con, table_path) 46 | df = _metadata_etl_dataset(con, channel, namespace, version, dataset) 47 | 48 | if df.empty: 49 | raise HTTPException(status_code=404, detail=f"table `{table_path}` not found") 50 | 51 | return { 52 | "dataset": df.iloc[0].to_dict(), 53 | "table": tf.iloc[0].to_dict(), 54 | "variables": vf.to_dict(orient="records"), 55 | } 56 | 57 | 58 | # QUESTION: how about `/variable/{variable_id}/metadata` naming? 59 | @router.get( 60 | "/variableById/metadata/{variable_id}", 61 | response_model=VariableMetadataResponse, 62 | response_model_exclude_unset=True, 63 | ) 64 | def metadata_for_backported_variable( 65 | response: Response, 66 | variable_id: int, 67 | if_none_match: Optional[str] = Header(default=None), 68 | ): 69 | """Fetch metadata for a single variable from database. 
70 | This function is identical to Variables.getVariableData in owid-grapher repository 71 | """ 72 | q = """ 73 | SELECT 74 | -- variables 75 | v.grapher_meta->>'$.name' as name, 76 | v.grapher_meta->>'$.unit' as unit, 77 | v.grapher_meta->>'$.description' as description, 78 | v.grapher_meta->>'$.createdAt' as createdAt, 79 | v.grapher_meta->>'$.updatedAt' as updatedAt, 80 | v.grapher_meta->>'$.code' as code, 81 | v.grapher_meta->>'$.coverage' as coverage, 82 | v.grapher_meta->>'$.timespan' as timespan, 83 | (v.grapher_meta->>'$.datasetId')::integer as datasetId, 84 | (v.grapher_meta->>'$.sourceId')::integer as sourceId, 85 | v.grapher_meta->>'$.shortUnit' as shortUnit, 86 | v.grapher_meta->>'$.display' as display, 87 | (v.grapher_meta->>'$.columnOrder')::integer as columnOrder, 88 | v.grapher_meta->>'$.originalMetadata' as originalMetadata, 89 | v.grapher_meta->>'$.grapherConfig' as grapherConfig, 90 | -- dataset 91 | d.grapher_meta->>'$.name' as datasetName, 92 | IF(d.grapher_meta->>'$.nonRedistributable' = 'true', true, false) as nonRedistributable, 93 | -- there should be always only one source for variable 94 | -- this is inverse of `convert_grapher_source` 95 | v.sources->>'$[0].name' as sourceName, 96 | v.sources->>'$[0].description' as sourceAdditionalInfo, 97 | v.sources->>'$[0].date_accessed' as sourceRetrievedDate, 98 | v.sources->>'$[0].url' as sourceLink, 99 | v.sources->>'$[0].publisher_source' as sourceDataPublisherSource, 100 | v.sources->>'$[0].published_by' as sourceDataPublishedBy, 101 | d.checksum as checksum, 102 | FROM meta_variables as v 103 | JOIN meta_datasets as d ON d.short_name = v.dataset_short_name 104 | join meta_tables t on v.table_path = t.path 105 | WHERE v.variable_id = (?) 106 | """ 107 | con = utils.get_readonly_connection(threading.get_ident()) 108 | 109 | # TODO: this is a hacky and slow way to do it, use ORM or proper dataclass instead 110 | df = cast(pd.DataFrame, con.execute(q, parameters=[variable_id]).fetch_df()) 111 | 112 | if df.empty: 113 | raise HTTPException( 114 | status_code=404, detail=f"variableId `{variable_id}` not found" 115 | ) 116 | 117 | # null values in JSON string functions end up as "null" string, fix that 118 | df = df.replace("null", np.nan) 119 | row = df.iloc[0].to_dict() 120 | 121 | source = VariableSource( 122 | id=row.pop("sourceId"), 123 | name=row.pop("sourceName"), 124 | dataPublishedBy=row.pop("sourceDataPublishedBy", ""), 125 | dataPublisherSource=row.pop("sourceDataPublisherSource", ""), 126 | link=row.pop("sourceLink", ""), 127 | retrievedDate=row.pop("sourceRetrievedDate", ""), 128 | additionalInfo=row.pop("sourceAdditionalInfo", ""), 129 | ) 130 | 131 | nonRedistributable = row.pop("nonRedistributable") 132 | displayJson = row.pop("display") 133 | 134 | # this is the dataset level checksum which is the best we have 135 | # at the moment 136 | checksum = row.pop("checksum") 137 | 138 | variable = utils.omit_nullable_values(row) 139 | 140 | # if the client sent a IF-NONE-MATCH header, check if it matches the checksum 141 | if if_none_match == checksum: 142 | response.status_code = 304 143 | return 144 | 145 | # Send the checksum as the etag header and set cache-control to cache with 146 | # max-age of 0 (which makes the client validate with the if-none-match header) 147 | response.headers["ETag"] = checksum 148 | response.headers[ 149 | "Cache-Control" 150 | ] = "max-age=0" # We could consider allowing a certain time window 151 | 152 | # get variable types from duckdb (all metadata would be eventually retrieved in 
duckdb) 153 | # NOTE: getting these is a bit of a pain, we have a lot of duplicate information 154 | # in our DB 155 | q = """ 156 | select 157 | v.variable_type, 158 | t.dimension_values 159 | from meta_variables as v 160 | join meta_tables as t on t.path = v.table_path 161 | where variable_id = (?) 162 | """ 163 | variable_type, dimension_values = con.execute(q, parameters=[variable_id]).fetchone() # type: ignore 164 | 165 | dimensions = _parse_dimension_values(json.loads(dimension_values)) 166 | 167 | return VariableMetadataResponse( 168 | nonRedistributable=bool(nonRedistributable), 169 | display=json.loads(displayJson), 170 | source=source, 171 | type=variable_type, 172 | dimensions=dimensions, 173 | **variable, 174 | ) 175 | 176 | 177 | def _parse_dimension_values(dimension_values: Any) -> Dict[str, Dimension]: 178 | dimensions = {} 179 | 180 | # NOTE: we have inconsistency with plurals - even though the dimension name is 181 | # singular, we use plural in the API (but not for custom dimensions) 182 | if "year" in dimension_values: 183 | dimensions["years"] = Dimension( 184 | type="int", 185 | values=[DimensionProperties(id=y) for y in dimension_values.pop("year")], 186 | ) 187 | 188 | if "entity_zip" in dimension_values: 189 | dimensions["entities"] = Dimension( 190 | type="int", 191 | values=[ 192 | DimensionProperties(id=int(e[0]), name=e[1], code=e[2]) 193 | for e in map(lambda x: x.split("|"), dimension_values.pop("entity_zip")) 194 | ], 195 | ) 196 | 197 | assert not dimension_values, ( 198 | "This currently works only for backported datasets with dimensions " 199 | '{"year", "entity_id", "entity_name", "entity_code"}' 200 | ) 201 | 202 | return dimensions 203 | 204 | 205 | def _metadata_etl_variables(con, table_path): 206 | q = """ 207 | SELECT 208 | -- variables (commented columns are not relevant for ETL tables) 209 | v.title, 210 | v.description, 211 | v.licenses, 212 | v.sources, 213 | v.unit, 214 | v.short_unit, 215 | -- conversion factor from display is needed for CO2 datasets, but honestly it would be 216 | -- better to hide it or do the calculation implicitly 217 | v.display, 218 | -- v.grapher_meta, 219 | -- v.variable_id, 220 | v.short_name, 221 | v.table_path, 222 | v.dataset_short_name, 223 | v.variable_type, 224 | -- TODO: should we include `dimension_values` in response or do we only need it for backported variables? 225 | -- v.dimension_values, 226 | FROM meta_variables as v 227 | WHERE v.table_path = (?) 228 | """ 229 | 230 | # TODO: this is a hacky and slow way to do it, use ORM or proper dataclass instead 231 | vf = cast(pd.DataFrame, con.execute(q, parameters=[table_path]).fetch_df()) 232 | 233 | # convert JSON to dict (should be done automatically once we switch to ORM) 234 | for col in ("licenses", "sources", "display"): 235 | vf[col] = vf[col].apply(json.loads) 236 | return vf 237 | 238 | 239 | def _metadata_etl_table(con, table_path): 240 | q = """ 241 | SELECT 242 | table_name, 243 | dataset_name, 244 | version, 245 | namespace, 246 | channel, 247 | dimensions, 248 | path, 249 | format, 250 | is_public, 251 | FROM meta_tables as t 252 | WHERE path = (?) 
253 | """ 254 | 255 | # TODO: this is a hacky and slow way to do it, use ORM or proper dataclass instead 256 | tf = cast(pd.DataFrame, con.execute(q, parameters=[table_path]).fetch_df()) 257 | 258 | for col in ("dimensions",): 259 | tf[col] = tf[col].apply(json.loads) 260 | return tf 261 | 262 | 263 | def _metadata_etl_dataset(con, channel, namespace, version, dataset): 264 | q = """ 265 | SELECT 266 | channel, 267 | namespace, 268 | short_name, 269 | title, 270 | description, 271 | sources, 272 | licenses, 273 | is_public, 274 | checksum, 275 | version, 276 | -- grapher_meta 277 | FROM meta_datasets as d 278 | -- TODO: we might want to use path instead of separate columns 279 | WHERE channel = (?) and namespace = (?) and version = (?) and short_name = (?) 280 | """ 281 | 282 | df = cast( 283 | pd.DataFrame, 284 | con.execute( 285 | q, 286 | parameters=[ 287 | channel, 288 | namespace, 289 | version, 290 | dataset, 291 | ], 292 | ).fetch_df(), 293 | ) 294 | 295 | for col in ("sources", "licenses"): 296 | df[col] = df[col].apply(json.loads) 297 | 298 | return df 299 | -------------------------------------------------------------------------------- /app/v1/schemas.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from typing import Any, Dict, List, Optional 3 | 4 | from pydantic import BaseModel, Extra 5 | 6 | 7 | class VariableDataResponse(BaseModel): 8 | years: List[int] 9 | entity_names: List[str] 10 | entities: List[int] 11 | entity_codes: List[str] 12 | values: List[Any] 13 | 14 | class Config: 15 | extra = Extra.forbid 16 | 17 | 18 | class VariableDisplayDataTableConfig(BaseModel): 19 | hideAbsoluteChange: Optional[bool] 20 | hideRelativeChange: Optional[bool] 21 | 22 | 23 | class VariableDisplay(BaseModel): 24 | name: Optional[str] 25 | unit: Optional[str] 26 | shortUnit: Optional[str] 27 | isProjection: Optional[bool] 28 | includeInTable: Optional[bool] 29 | conversionFactor: Optional[float] 30 | numDecimalPlaces: Optional[int] 31 | tolerance: Optional[float] 32 | yearIsDay: Optional[bool] 33 | zeroDay: Optional[str] 34 | entityAnnotationsMap: Optional[str] 35 | tableDisplay: Optional[VariableDisplayDataTableConfig] 36 | color: Optional[str] 37 | 38 | class Config: 39 | extra = Extra.forbid 40 | 41 | 42 | class VariableSource(BaseModel): 43 | id: int 44 | name: str 45 | dataPublishedBy: str 46 | dataPublisherSource: str 47 | link: str 48 | retrievedDate: str 49 | additionalInfo: str 50 | 51 | class Config: 52 | extra = Extra.forbid 53 | 54 | 55 | class DimensionProperties(BaseModel): 56 | id: int 57 | name: Optional[str] = None 58 | code: Optional[str] = None 59 | 60 | class Config: 61 | extra = Extra.forbid 62 | 63 | 64 | class Dimension(BaseModel): 65 | type: str 66 | values: List[DimensionProperties] 67 | 68 | class Config: 69 | extra = Extra.forbid 70 | 71 | 72 | class VariableMetadataResponse(BaseModel): 73 | name: str 74 | unit: str 75 | shortUnit: Optional[str] 76 | code: Optional[str] 77 | description: Optional[str] 78 | createdAt: dt.datetime 79 | updatedAt: dt.datetime 80 | coverage: str 81 | timespan: str 82 | datasetId: int 83 | columnOrder: int 84 | datasetName: str 85 | nonRedistributable: bool 86 | display: VariableDisplay 87 | originalMetadata: Optional[str] 88 | grapherConfig: Optional[str] 89 | # MAYBE CHANGE - this should be turned into an array 90 | source: VariableSource 91 | type: str 92 | dimensions: Dict[str, Dimension] 93 | 94 | class Config: 95 | extra = Extra.forbid 96 | 97 | 98 | class 
SearchResponse(BaseModel): 99 | variable_name: str 100 | variable_title: str 101 | variable_description: str 102 | variable_unit: str 103 | table_name: str 104 | dataset_title: str 105 | channel: str 106 | metadata_url: str 107 | data_url: str 108 | match: float 109 | 110 | class Config: 111 | extra = Extra.forbid 112 | 113 | 114 | class SearchResponseList(BaseModel): 115 | 116 | results: List[SearchResponse] 117 | 118 | class Config: 119 | extra = Extra.forbid 120 | -------------------------------------------------------------------------------- /app/v1/search.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from enum import Enum 3 | from typing import Optional 4 | 5 | import structlog 6 | from fastapi import APIRouter, Query 7 | 8 | from app import utils 9 | 10 | from .schemas import SearchResponseList 11 | 12 | log = structlog.get_logger() 13 | 14 | 15 | router = APIRouter() 16 | 17 | 18 | class SearchType(str, Enum): 19 | table = "meta_tables" 20 | variable = "meta_variables" 21 | dataset = "meta_datasets" 22 | 23 | 24 | @router.get( 25 | "/search", 26 | response_model=SearchResponseList, 27 | response_model_exclude_unset=True, 28 | ) 29 | def search( 30 | term: str, 31 | channels: Optional[list[str]] = Query(default=None), 32 | type: SearchType = SearchType.variable, 33 | limit: int = 10, 34 | ): 35 | con = utils.get_readonly_connection(threading.get_ident()) 36 | 37 | # TODO: implement search on other tables too? not sure whether we'll need it yet 38 | if type != SearchType.variable: 39 | raise NotImplementedError( 40 | f"Invalid search type {type}, only searching variables is currently supported" 41 | ) 42 | 43 | if channels: 44 | # `parameters` do not support lists (maybe `multiple_parameter_sets` would do?) 45 | channels_str = ",".join([f"'{c}'" for c in channels]) 46 | where = f"and d.channel in ({channels_str})" 47 | else: 48 | where = "" 49 | 50 | # sample search 51 | q = f""" 52 | SELECT 53 | v.short_name as variable_name, 54 | v.title as variable_title, 55 | v.unit as variable_unit, 56 | v.description as variable_description, 57 | t.table_name, 58 | t.path as table_path, 59 | d.title as dataset_title, 60 | d.channel as channel, 61 | fts_main_meta_variables.match_bm25(v.path, ?) AS match 62 | FROM meta_variables as v 63 | JOIN meta_tables as t ON t.path = v.table_path 64 | JOIN meta_datasets as d ON d.path = t.dataset_path 65 | where match is not null 66 | {where} 67 | order by match desc 68 | limit (?) 
69 | """ 70 | matches = con.execute(q, parameters=[term, limit]).fetch_df() 71 | 72 | matches["metadata_url"] = "/v1/dataset/metadata/" + matches["table_path"] 73 | matches["data_url"] = "/v1/dataset/data/" + matches["table_path"] 74 | 75 | matches = matches.drop(columns=["table_path"]) 76 | 77 | return {"results": matches.to_dict(orient="records")} 78 | -------------------------------------------------------------------------------- /crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/crawler/__init__.py -------------------------------------------------------------------------------- /crawler/crawl.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from collections.abc import Generator 4 | from contextlib import contextmanager 5 | from pathlib import Path 6 | from typing import Any, Optional, Set, Tuple, cast 7 | 8 | import pandas as pd 9 | import pyarrow.parquet as pq 10 | import structlog 11 | import typer 12 | from owid.catalog import RemoteCatalog, TableMeta, VariableMeta 13 | from owid.catalog.catalogs import CatalogFrame, CatalogSeries 14 | from sqlalchemy.engine import Engine 15 | from sqlalchemy.orm.session import Session 16 | 17 | from crawler.duckdb_models import ( 18 | MetaDatasetModel, 19 | MetaTableModel, 20 | MetaVariableModel, 21 | db_init, 22 | ) 23 | from crawler.full_text_index import main as create_full_text_index 24 | 25 | log = structlog.get_logger() 26 | 27 | 28 | # duckdb does not support NaN in categories, use a special symbol instead 29 | CATEGORY_NAN = "-" 30 | 31 | 32 | def _load_catalog_frame(channels=()) -> CatalogFrame: 33 | frame = RemoteCatalog(channels=channels).frame 34 | 35 | # only public data 36 | frame = frame.loc[frame["is_public"]] 37 | 38 | # add dataset path 39 | frame["dataset_path"] = frame.path.map(os.path.dirname) 40 | 41 | # TODO: exclude large datasets (we need to improve their performance) 42 | frame = frame[~frame.path.str.contains("garden/faostat/2022-05-17")] 43 | # frame = frame[~frame.path.str.contains("garden/un_sdg/2022-07-07/un_sdg")] 44 | 45 | # TODO: exclude datasets with missing versions 46 | frame = frame[~frame.path.str.contains("garden/faostat/2021-04-09")] 47 | frame = frame[~frame.path.str.contains("garden/owid/latest/key_indicators")] 48 | frame = frame[~frame.path.str.contains("garden/owid/latest/population_density")] 49 | frame = frame[ 50 | ~frame.path.str.contains("garden/sdg/latest/sdg/sustainable_development_goal") 51 | ] 52 | frame = frame[~frame.path.str.contains("garden/worldbank_wdi/2022-05-26/wdi/wdi")] 53 | 54 | # TODO: weird error 55 | frame = frame[ 56 | ~frame.path.str.contains("garden/shift/2022-07-18/fossil_fuel_production") 57 | ] 58 | 59 | # TODO: exclude special datasets for now 60 | frame = frame[~frame.path.str.contains("garden/reference/")] 61 | 62 | return frame 63 | 64 | 65 | def _variable_types(con, parquet_path) -> dict: 66 | q = f""" 67 | select 68 | name, 69 | type 70 | from parquet_schema('{parquet_path}') 71 | """ 72 | mf = pd.read_sql(q, con) 73 | return mf.set_index("name")["type"].to_dict() 74 | 75 | 76 | def _dataset_sync_actions( 77 | engine: Engine, ds_path_to_checksum: dict[str, str] 78 | ) -> Tuple[Set[str], Set[str]]: 79 | q = """ 80 | select 81 | path, 82 | checksum 83 | from meta_datasets 84 | """ 85 | try: 86 | df = pd.read_sql(q, engine) 87 | except RuntimeError as e: 88 | if 
e.args[0].startswith( 89 | "Catalog Error: Table with name meta_datasets does not exist" 90 | ): 91 | df = pd.DataFrame(columns=["path", "checksum"]) 92 | else: 93 | raise e 94 | 95 | # compute ids consisting of checksum and table name to know which ones to delete 96 | db_ids = {r.path: r.checksum for r in df.itertuples()} 97 | 98 | dataset_paths_to_delete = { 99 | path 100 | for path, checksum in db_ids.items() 101 | if checksum != ds_path_to_checksum.get(path) 102 | } 103 | dataset_paths_to_create = { 104 | path 105 | for path, checksum in ds_path_to_checksum.items() 106 | if checksum != db_ids.get(path) 107 | } 108 | 109 | return dataset_paths_to_delete, dataset_paths_to_create 110 | 111 | 112 | def _parse_meta_variable( 113 | var_meta: VariableMeta, 114 | m: MetaTableModel, 115 | short_name: str, 116 | variable_type: str, 117 | dataset_short_name: str, 118 | dataset_path: str, 119 | ) -> MetaVariableModel: 120 | # sometimes `unit` is missing, but there is display.unit 121 | if (var_meta.unit == "") or pd.isnull(var_meta): 122 | var_meta.unit = (var_meta.display or {}).get("unit") 123 | 124 | # if there is backported variable in non-backported dataset, remove its grapher 125 | # metadata to make sure we don't have duplicate variable ids in DB 126 | channel = dataset_path.split("/")[0] 127 | if channel != "backport" and var_meta.additional_info: 128 | var_meta.additional_info.pop("grapher_meta", None) 129 | 130 | return MetaVariableModel( 131 | title=var_meta.title, 132 | description=var_meta.description, 133 | licenses=[license.to_dict() for license in var_meta.licenses], 134 | sources=[source.to_dict() for source in var_meta.sources], 135 | unit=var_meta.unit, 136 | short_unit=var_meta.short_unit, 137 | display=var_meta.display, 138 | grapher_meta=var_meta.additional_info["grapher_meta"] 139 | if var_meta.additional_info 140 | else None, 141 | variable_id=var_meta.additional_info["grapher_meta"]["id"] 142 | if var_meta.additional_info 143 | else None, 144 | short_name=short_name, 145 | table_path=m.path, 146 | variable_type=variable_type, 147 | dataset_short_name=dataset_short_name, 148 | dataset_path=dataset_path, 149 | ) 150 | 151 | 152 | def _delete_dataset(path: str, session: Session) -> None: 153 | session.query(MetaDatasetModel).filter_by(path=path).delete() 154 | session.query(MetaTableModel).filter_by(dataset_path=path).delete() 155 | session.query(MetaVariableModel).filter_by(dataset_path=path).delete() 156 | 157 | 158 | def _datasets_updates( 159 | engine: Engine, frame: CatalogFrame, force: bool, include: Optional[str] 160 | ) -> Tuple[Set[str], Set[str]]: 161 | # dataset path to checksum from frame 162 | ds_path_to_checksum = {r.dataset_path: r.checksum for r in frame.itertuples()} 163 | 164 | if force: 165 | dataset_paths_to_delete = dataset_paths_to_create = set( 166 | ds_path_to_checksum.keys() 167 | ) 168 | else: 169 | # which tables to delete and which to create 170 | dataset_paths_to_delete, dataset_paths_to_create = _dataset_sync_actions( 171 | engine, ds_path_to_checksum 172 | ) 173 | 174 | # if using specific include pattern, don't delete any other datasets 175 | if include: 176 | dataset_paths_to_delete = dataset_paths_to_delete & dataset_paths_to_create 177 | 178 | return dataset_paths_to_delete, dataset_paths_to_create 179 | 180 | 181 | def _extract_dimension_values( 182 | parquet_path: str, dims_to_process: Set[str], engine 183 | ) -> dict[str, Any]: 184 | dimension_values = {} 185 | 186 | if not dims_to_process: 187 | return {} 188 | 189 | # entities belong 
together and has to be stored as tuple `entity_id|entity_name|entity_code` 190 | # NOTE: this might be generalized to any column name with `_id`, `_name`, `_code` suffix 191 | if {"entity_id", "entity_name", "entity_code"} <= dims_to_process: 192 | dims_to_process = dims_to_process - {"entity_id", "entity_name", "entity_code"} 193 | 194 | q = f""" 195 | select distinct 196 | entity_id, entity_name, entity_code 197 | from read_parquet('{parquet_path}') 198 | """ 199 | df = pd.read_sql(q, engine) 200 | 201 | index_vals = sorted( 202 | set( 203 | zip( 204 | df["entity_id"], 205 | df["entity_name"], 206 | df["entity_code"], 207 | ) 208 | ) 209 | ) 210 | 211 | dimension_values = { 212 | "entity_zip": sorted(["|".join(map(str, x)) for x in index_vals]), 213 | } 214 | 215 | for dim in dims_to_process: 216 | q = f""" 217 | select distinct {dim} 218 | from read_parquet('{parquet_path}') 219 | """ 220 | df = pd.read_sql(q, engine) 221 | 222 | dimension_values[dim] = sorted(set(df[dim].dropna())) 223 | 224 | return dimension_values 225 | 226 | 227 | def _read_parquet_metadata( 228 | parquet_path: Path, 229 | ) -> tuple[TableMeta, dict[str, VariableMeta]]: 230 | meta = pq.read_metadata(parquet_path) 231 | table_meta = TableMeta.from_json(meta.metadata[b"owid_table"]) # type: ignore 232 | 233 | owid_fields = json.loads(meta.metadata[b"owid_fields"]) 234 | fields_meta = {f: VariableMeta.from_dict(v) for f, v in owid_fields.items()} 235 | 236 | return table_meta, fields_meta 237 | 238 | 239 | def main( 240 | duckdb_path: Path = Path("duck.db"), 241 | owid_catalog_dir: Path = Path("../etl/data"), 242 | include: Optional[str] = typer.Option( 243 | None, help="Include datasets matching this regex" 244 | ), 245 | force: bool = False, 246 | full_text_search: bool = True, 247 | ) -> None: 248 | """Bake ETL catalog into DuckDB.""" 249 | engine = db_init(duckdb_path) 250 | 251 | frame = _load_catalog_frame(channels=("backport", "garden")) 252 | 253 | if include: 254 | frame = frame.loc[frame.dataset_path.str.contains(include)] 255 | 256 | dataset_paths_to_delete, dataset_paths_to_create = _datasets_updates( 257 | engine, frame, force, include 258 | ) 259 | log.info( 260 | "duckdb.actions", 261 | delete_datasets=len(dataset_paths_to_delete), 262 | create_datasets=len(dataset_paths_to_create), 263 | ) 264 | 265 | frame = frame.loc[frame.dataset_path.isin(dataset_paths_to_create)] 266 | 267 | for i, (dataset_path, dataset_frame) in enumerate(frame.groupby("dataset_path")): 268 | log.info( 269 | "dataset.create", 270 | path=dataset_path, 271 | progress=f"{i + 1}/{len(frame)}", 272 | ) 273 | if dataset_path in dataset_paths_to_delete: 274 | # delete everything related to a dataset before recreating them 275 | with new_session(engine) as session: 276 | _delete_dataset(dataset_path, session) 277 | dataset_paths_to_delete.remove(dataset_path) 278 | 279 | # NOTE: we need to grab from the first table we load, only insert the dataset 280 | # when we process the first table 281 | dataset_inserted = False 282 | 283 | for i, (_, catalog_row) in enumerate(dataset_frame.iterrows()): 284 | 285 | catalog_row = cast(CatalogSeries, catalog_row) 286 | 287 | log.info( 288 | "table.read_parquet_metadata", 289 | path=catalog_row.path, 290 | ) 291 | 292 | parquet_path = (owid_catalog_dir / catalog_row.path).with_suffix(".parquet") 293 | 294 | table_meta, fields_meta = _read_parquet_metadata(parquet_path) 295 | 296 | log.info( 297 | "table.extract_dimension_values", 298 | path=catalog_row.path, 299 | ) 300 | 301 | # NOTE: this requires 
reading parquet file, which could be slow. We could instead write 302 | # dimensions values to metadata when generating the parquet file. 303 | dimension_values = _extract_dimension_values( 304 | parquet_path, set(catalog_row.dimensions), engine 305 | ) 306 | 307 | t = MetaTableModel.from_CatalogSeries(catalog_row, dimension_values) 308 | 309 | log.info( 310 | "table.create", 311 | path=t.path, 312 | ) 313 | 314 | with new_session(engine) as session: 315 | # save dataset metadata alongside table, we could also create a separate table for datasets 316 | ds = table_meta.dataset 317 | assert ds is not None 318 | 319 | # exceptions for backported channel 320 | if catalog_row.channel == "backport": 321 | # backported datasets are missing version 322 | ds.version = "latest" 323 | # all backported datasets are currently saved under `owid` namespace, we could be saving them in their 324 | # real namespaces, but that would imply non-trivial changes to backporting code in ETL 325 | ds.namespace = "owid" 326 | 327 | assert ds.short_name 328 | if not ds.version: 329 | log.error("missing.version", path=catalog_row["path"]) 330 | continue 331 | 332 | # add table 333 | session.add(t) 334 | 335 | # create dataset 336 | # TODO: channel should be ideally property of DatasetMeta 337 | if not dataset_inserted: 338 | session.add( 339 | MetaDatasetModel.from_DatasetMeta( 340 | ds, dataset_path, dataset_checksum=catalog_row.checksum 341 | ) 342 | ) 343 | dataset_inserted = True 344 | 345 | # get variable types from DB 346 | assert t.path 347 | variable_types = _variable_types(engine, parquet_path) 348 | 349 | # table with variables 350 | variables = [] 351 | for variable_short_name, variable_meta in fields_meta.items(): 352 | if variable_short_name in t.dimensions: 353 | continue 354 | 355 | variables.append( 356 | _parse_meta_variable( 357 | variable_meta, 358 | t, 359 | variable_short_name, 360 | variable_types[variable_short_name], 361 | ds.short_name, 362 | dataset_path, 363 | ) 364 | ) 365 | log.info( 366 | "table.variable.create", 367 | variable=variable_short_name, 368 | ) 369 | 370 | session.add_all(variables) 371 | 372 | # delete the rest of the datasets 373 | if dataset_paths_to_delete: 374 | log.info("dataset.delete_datasets", n=len(dataset_paths_to_delete)) 375 | with new_session(engine) as session: 376 | for dataset_path in dataset_paths_to_delete: 377 | _delete_dataset(dataset_path, session) 378 | 379 | if full_text_search: 380 | # recreate full-text search index (this has to be run on every new dataset) 381 | create_full_text_index(duckdb_path) 382 | 383 | 384 | @contextmanager 385 | def new_session(engine) -> Generator[Session, None, None]: 386 | """Open new session and commit at the end without expiring objects. 387 | 388 | I couldn't make this work with creating only one session per table (tables did not have data for 389 | unknown reasons), so I'm creating new session for each operation which works. Feel free to fix 390 | this and make it transactional or switch to a different ORM. 391 | """ 392 | # NOTE: should I do it with transaction, i.e. `with session.begin():`? 
393 | # there would be problems with commits in _upsert_dataset 394 | with Session(engine, expire_on_commit=False) as session: 395 | yield session 396 | session.commit() 397 | 398 | 399 | def main_cli(): 400 | return typer.run(main) 401 | 402 | 403 | if __name__ == "__main__": 404 | main_cli() 405 | -------------------------------------------------------------------------------- /crawler/duckdb_models.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import Any 4 | 5 | import pandas as pd 6 | import structlog 7 | from owid.catalog import DatasetMeta 8 | from owid.catalog.catalogs import CatalogSeries 9 | from sqlalchemy import JSON, Boolean, Column, Integer, String, create_engine 10 | from sqlalchemy.engine import Engine 11 | from sqlalchemy.ext.declarative import declarative_base 12 | 13 | Base = declarative_base() 14 | 15 | 16 | log = structlog.get_logger() 17 | 18 | 19 | # NOTE: not having type hints is quite limiting, ideally we would make this work with sqlmodel 20 | class MetaDatasetModel(Base): # type: ignore 21 | """ 22 | Almost identical copy of DatasetMeta from owid-catalog-py 23 | """ 24 | 25 | __tablename__ = "meta_datasets" 26 | 27 | # TODO: what should we use as the primary key? either we use autoincremented ids or we 28 | # use paths (e.g. `garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp` is an address to table) 29 | # what are the pros and cons of each? 30 | path = Column(String, primary_key=True) 31 | 32 | channel = Column(String) 33 | namespace = Column(String) 34 | short_name = Column(String) 35 | title = Column(String) 36 | description = Column(String) 37 | sources = Column(JSON) 38 | licenses = Column(JSON) 39 | is_public = Column(Boolean) 40 | checksum = Column(String) 41 | source_checksum = Column(String) 42 | version = Column(String) 43 | 44 | # this is an attribute of additional_info['grapher_meta'] 45 | grapher_meta = Column(JSON) 46 | 47 | @classmethod 48 | def from_DatasetMeta( 49 | cls, ds: DatasetMeta, dataset_path: str, dataset_checksum: str 50 | ) -> "MetaDatasetModel": 51 | return MetaDatasetModel( 52 | path=dataset_path, 53 | channel=dataset_path.split("/")[0], 54 | short_name=ds.short_name, 55 | namespace=ds.namespace, 56 | title=ds.title, 57 | description=ds.description, 58 | sources=[source.to_dict() for source in ds.sources], 59 | licenses=[license.to_dict() for license in ds.licenses], 60 | is_public=ds.is_public, 61 | checksum=dataset_checksum, 62 | source_checksum=ds.source_checksum, 63 | grapher_meta=ds.additional_info["grapher_meta"] 64 | if ds.additional_info 65 | else None, 66 | version=ds.version, 67 | ) 68 | 69 | 70 | class MetaTableModel(Base): # type: ignore 71 | __tablename__ = "meta_tables" 72 | 73 | path = Column(String, primary_key=True) 74 | dataset_path = Column(String) 75 | 76 | table_name = Column(String) 77 | dataset_name = Column(String) 78 | 79 | # columns from catalog 80 | version = Column(String) 81 | namespace = Column(String) 82 | channel = Column(String) 83 | dimensions = Column(JSON) 84 | format = Column(String) 85 | is_public = Column(Boolean) 86 | 87 | # distinct values of years and entities encoded as JSON 88 | dimension_values = Column(JSON) 89 | 90 | def __init__(self, *args, **kwargs): 91 | # TODO: "format" was changed to "formats", we'd have to rebuild the entire database, so just 92 | # hotfix it for now 93 | assert "feather" in kwargs["formats"] 94 | kwargs["format"] = "feather" 95 | del kwargs["formats"] 96 | 
super().__init__(*args, **kwargs) 97 | 98 | @classmethod 99 | def from_CatalogSeries( 100 | cls, catalog_row: CatalogSeries, dimension_values: dict[str, Any] 101 | ) -> "MetaTableModel": 102 | d = catalog_row.to_dict() 103 | 104 | d["dimension_values"] = dimension_values 105 | 106 | # checksum from catalog is actually checksum of a dataset, not table! 107 | del d["checksum"] 108 | 109 | d["dimensions"] = list(d["dimensions"]) 110 | 111 | # rename to adhere to DuckDB schema 112 | d["table_name"] = d.pop("table") 113 | d["dataset_name"] = d.pop("dataset") 114 | 115 | t = cls(**d) 116 | 117 | is_backport = t.channel == "backport" 118 | 119 | if is_backport: 120 | missing_dims = {"year", "entity_name", "entity_code", "entity_id"} - set( 121 | t.dimensions 122 | ) 123 | assert not missing_dims, f"Missing dimensions: {missing_dims}" 124 | 125 | return t 126 | 127 | 128 | class MetaVariableModel(Base): # type: ignore 129 | __tablename__ = "meta_variables" 130 | 131 | path = Column(String, primary_key=True) 132 | 133 | # columns from VariableMeta 134 | title = Column(String) 135 | description = Column(String) 136 | licenses = Column(JSON) 137 | sources = Column(JSON) 138 | unit = Column(String) 139 | short_unit = Column(String) 140 | display = Column(JSON) 141 | 142 | # this is an attribute of additional_info['grapher_meta'] 143 | grapher_meta = Column(JSON) 144 | 145 | variable_id = Column(Integer) 146 | 147 | # inferred columns by crawler 148 | short_name = Column(String) 149 | table_path = Column(String) 150 | dataset_path = Column(String) 151 | dataset_short_name = Column(String) 152 | variable_type = Column(String) 153 | 154 | def __init__(self, *args, **kwargs): 155 | kwargs["path"] = f"{kwargs['table_path']}/{kwargs['short_name']}" 156 | super().__init__(*args, **kwargs) 157 | 158 | 159 | class PdEncoder(json.JSONEncoder): 160 | """Serialize non-native JSON objects.""" 161 | 162 | def default(self, obj): 163 | if isinstance(obj, pd.Timestamp): 164 | return str(obj) 165 | return json.JSONEncoder.default(self, obj) 166 | 167 | 168 | def db_init(path: Path) -> Engine: 169 | eng = create_engine( 170 | f"duckdb:///{path}", 171 | json_serializer=lambda obj: json.dumps(obj, cls=PdEncoder), 172 | ) 173 | Base.metadata.create_all(eng) 174 | return eng 175 | -------------------------------------------------------------------------------- /crawler/full_text_index.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import duckdb 4 | import structlog 5 | import typer 6 | 7 | log = structlog.get_logger() 8 | 9 | 10 | def main( 11 | duckdb_path: Path = Path("duck.db"), 12 | ) -> None: 13 | assert duckdb_path.exists(), "DuckDB database path does not exist" 14 | 15 | log.info("table.full_text_index.start") 16 | 17 | con = duckdb.connect(duckdb_path.as_posix()) 18 | cols = [ 19 | "title", 20 | "description", 21 | "path", 22 | "unit", 23 | "short_name", 24 | ] 25 | _create_full_text_search_index(con, "meta_variables", "path", cols) 26 | log.info("table.full_text_index.end") 27 | 28 | 29 | def _create_full_text_search_index( 30 | con, table_name: str, primary_key: str, columns: list[str] = ["*"] 31 | ): 32 | # NOTE: path is a unique identifier (primary key probably) 33 | # NOTE: we include numbers (for SDG goals for instance) 34 | cols_to_index = ",".join([f"'{c}'" for c in columns]) 35 | con.execute( 36 | f"""PRAGMA create_fts_index( 37 | '{table_name}', 38 | '{primary_key}', 39 | {cols_to_index}, 40 | stopwords='english', 41 | overwrite=1, 
42 | ignore='(\\.|[^a-z0-9])+')""" 43 | ) 44 | 45 | 46 | if __name__ == "__main__": 47 | typer.run(main) 48 | -------------------------------------------------------------------------------- /crawler/query_duckdb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | "
cidnametypenotnulldflt_valuepk
00table_nameVARCHARTrueNaNTrue
11dataset_nameVARCHARFalseNaNFalse
22table_db_nameVARCHARFalseNaNFalse
33versionVARCHARFalseNaNFalse
44namespaceVARCHARFalseNaNFalse
55channelVARCHARFalseNaNFalse
66checksumVARCHARFalseNaNFalse
77dimensionsJSONFalseNaNFalse
88pathVARCHARFalseNaNFalse
99formatVARCHARFalseNaNFalse
1010is_publicBOOLEANFalseNaNFalse
\n", 139 | "
" 140 | ], 141 | "text/plain": [ 142 | " cid name type notnull dflt_value pk\n", 143 | "0 0 table_name VARCHAR True NaN True\n", 144 | "1 1 dataset_name VARCHAR False NaN False\n", 145 | "2 2 table_db_name VARCHAR False NaN False\n", 146 | "3 3 version VARCHAR False NaN False\n", 147 | "4 4 namespace VARCHAR False NaN False\n", 148 | "5 5 channel VARCHAR False NaN False\n", 149 | "6 6 checksum VARCHAR False NaN False\n", 150 | "7 7 dimensions JSON False NaN False\n", 151 | "8 8 path VARCHAR False NaN False\n", 152 | "9 9 format VARCHAR False NaN False\n", 153 | "10 10 is_public BOOLEAN False NaN False" 154 | ] 155 | }, 156 | "execution_count": 1, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "import duckdb\n", 163 | "\n", 164 | "con = duckdb.connect(\"../duck.db\", read_only=True)\n", 165 | "\n", 166 | "con.execute(\"PRAGMA table_info('meta_tables');\").fetch_df()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 2, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/html": [ 177 | "
\n", 178 | "\n", 191 | "\n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | "
table_namedataset_nametable_db_nameversionnamespacechannelchecksumdimensionspathformatis_public
0dataset_941_technology_adoption__isard__1942__...dataset_941_technology_adoption__isard__1942__...backport__owid__latest__dataset_941_technology...NaNowidbackport76c24e0b3af5621506abb1cd3971faf0[\"year\", \"entity_name\", \"entity_id\", \"entity_c...backport/owid/latest/dataset_941_technology_ad...featherTrue
1maddison_gdpggdc_maddisongarden__ggdc__2020_10_01__ggdc_maddison__maddi...2020-10-01ggdcgarden7236fb37ff655adc0d9924a9e79937ed[\"country\", \"year\"]garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdpfeatherTrue
\n", 239 | "
" 240 | ], 241 | "text/plain": [ 242 | " table_name \\\n", 243 | "0 dataset_941_technology_adoption__isard__1942__... \n", 244 | "1 maddison_gdp \n", 245 | "\n", 246 | " dataset_name \\\n", 247 | "0 dataset_941_technology_adoption__isard__1942__... \n", 248 | "1 ggdc_maddison \n", 249 | "\n", 250 | " table_db_name version namespace \\\n", 251 | "0 backport__owid__latest__dataset_941_technology... NaN owid \n", 252 | "1 garden__ggdc__2020_10_01__ggdc_maddison__maddi... 2020-10-01 ggdc \n", 253 | "\n", 254 | " channel checksum \\\n", 255 | "0 backport 76c24e0b3af5621506abb1cd3971faf0 \n", 256 | "1 garden 7236fb37ff655adc0d9924a9e79937ed \n", 257 | "\n", 258 | " dimensions \\\n", 259 | "0 [\"year\", \"entity_name\", \"entity_id\", \"entity_c... \n", 260 | "1 [\"country\", \"year\"] \n", 261 | "\n", 262 | " path format is_public \n", 263 | "0 backport/owid/latest/dataset_941_technology_ad... feather True \n", 264 | "1 garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp feather True " 265 | ] 266 | }, 267 | "execution_count": 2, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "q = \"\"\"\n", 274 | "select\n", 275 | " *\n", 276 | "from meta_tables\n", 277 | "limit 5\n", 278 | "\"\"\"\n", 279 | "con.execute(q).fetch_df()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 3, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/html": [ 290 | "
\n", 291 | "\n", 304 | "\n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | "
titledescriptionlicensessourcesunitshort_unitdisplaygrapher_metavariable_pathvariable_idshort_nametable_pathtable_db_namedataset_short_namevariable_typedimension_values
0ATM (Comin and Hobijn (2004))Number of electro-mechanical devices that perm...[][{\"name\": \"Isard (1942) and others\", \"descript...NaNnull{\"id\": 42539, \"name\": \"ATM (Comin and Hobijn (...backport/owid/latest/dataset_941_technology_ad...42539atm__comin_and_hobijn__2004backport/owid/latest/dataset_941_technology_ad...backport__owid__latest__dataset_941_technology...dataset_941_technology_adoption__isard__1942__...FLOAT{\"year\": [\"1988\", \"1989\", \"1990\", \"1991\", \"199...
1Agricultural tractor (Comin and Hobijn (2004))Number of wheel and crawler tractors (excl. ga...[][{\"name\": \"Isard (1942) and others\", \"descript...NaNnull{\"id\": 42538, \"name\": \"Agricultural tractor (C...backport/owid/latest/dataset_941_technology_ad...42538agricultural_tractor__comin_and_hobijn__2004backport/owid/latest/dataset_941_technology_ad...backport__owid__latest__dataset_941_technology...dataset_941_technology_adoption__isard__1942__...BIGINT{\"year\": [\"1961\", \"1962\", \"1963\", \"1964\", \"196...
2Aviation passenger-km (Comin and Hobijn (2004))Civil aviation passenger‐km travelled on sched...[][{\"name\": \"Isard (1942) and others\", \"descript...passenger-kmNaNnull{\"id\": 42540, \"name\": \"Aviation passenger-km (...backport/owid/latest/dataset_941_technology_ad...42540aviation_passenger_km__comin_and_hobijn__2004backport/owid/latest/dataset_941_technology_ad...backport__owid__latest__dataset_941_technology...dataset_941_technology_adoption__isard__1942__...BIGINT{\"year\": [\"1930\", \"1931\", \"1932\", \"1933\", \"193...
3Canals (Isard (1942))Measures the mileage of completed canals.[][{\"name\": \"Isard (1942) and others\", \"descript...mileageNaNnull{\"id\": 42535, \"name\": \"Canals (Isard (1942))\",...backport/owid/latest/dataset_941_technology_ad...42535canals__isard__1942backport/owid/latest/dataset_941_technology_ad...backport__owid__latest__dataset_941_technology...dataset_941_technology_adoption__isard__1942__...FLOAT{\"year\": [\"1800\", \"1803\", \"1805\", \"1807\", \"180...
4Card payments (Comin and Hobijn (2004))Number of transactions using payment cards at ...[][{\"name\": \"Isard (1942) and others\", \"descript...NaNnull{\"id\": 42542, \"name\": \"Card payments (Comin an...backport/owid/latest/dataset_941_technology_ad...42542card_payments__comin_and_hobijn__2004backport/owid/latest/dataset_941_technology_ad...backport__owid__latest__dataset_941_technology...dataset_941_technology_adoption__isard__1942__...BIGINT{\"year\": [\"1988\", \"1989\", \"1990\", \"1991\", \"199...
\n", 424 | "
" 425 | ], 426 | "text/plain": [ 427 | " title \\\n", 428 | "0 ATM (Comin and Hobijn (2004)) \n", 429 | "1 Agricultural tractor (Comin and Hobijn (2004)) \n", 430 | "2 Aviation passenger-km (Comin and Hobijn (2004)) \n", 431 | "3 Canals (Isard (1942)) \n", 432 | "4 Card payments (Comin and Hobijn (2004)) \n", 433 | "\n", 434 | " description licenses \\\n", 435 | "0 Number of electro-mechanical devices that perm... [] \n", 436 | "1 Number of wheel and crawler tractors (excl. ga... [] \n", 437 | "2 Civil aviation passenger‐km travelled on sched... [] \n", 438 | "3 Measures the mileage of completed canals. [] \n", 439 | "4 Number of transactions using payment cards at ... [] \n", 440 | "\n", 441 | " sources unit short_unit \\\n", 442 | "0 [{\"name\": \"Isard (1942) and others\", \"descript... NaN \n", 443 | "1 [{\"name\": \"Isard (1942) and others\", \"descript... NaN \n", 444 | "2 [{\"name\": \"Isard (1942) and others\", \"descript... passenger-km NaN \n", 445 | "3 [{\"name\": \"Isard (1942) and others\", \"descript... mileage NaN \n", 446 | "4 [{\"name\": \"Isard (1942) and others\", \"descript... NaN \n", 447 | "\n", 448 | " display grapher_meta \\\n", 449 | "0 null {\"id\": 42539, \"name\": \"ATM (Comin and Hobijn (... \n", 450 | "1 null {\"id\": 42538, \"name\": \"Agricultural tractor (C... \n", 451 | "2 null {\"id\": 42540, \"name\": \"Aviation passenger-km (... \n", 452 | "3 null {\"id\": 42535, \"name\": \"Canals (Isard (1942))\",... \n", 453 | "4 null {\"id\": 42542, \"name\": \"Card payments (Comin an... \n", 454 | "\n", 455 | " variable_path variable_id \\\n", 456 | "0 backport/owid/latest/dataset_941_technology_ad... 42539 \n", 457 | "1 backport/owid/latest/dataset_941_technology_ad... 42538 \n", 458 | "2 backport/owid/latest/dataset_941_technology_ad... 42540 \n", 459 | "3 backport/owid/latest/dataset_941_technology_ad... 42535 \n", 460 | "4 backport/owid/latest/dataset_941_technology_ad... 42542 \n", 461 | "\n", 462 | " short_name \\\n", 463 | "0 atm__comin_and_hobijn__2004 \n", 464 | "1 agricultural_tractor__comin_and_hobijn__2004 \n", 465 | "2 aviation_passenger_km__comin_and_hobijn__2004 \n", 466 | "3 canals__isard__1942 \n", 467 | "4 card_payments__comin_and_hobijn__2004 \n", 468 | "\n", 469 | " table_path \\\n", 470 | "0 backport/owid/latest/dataset_941_technology_ad... \n", 471 | "1 backport/owid/latest/dataset_941_technology_ad... \n", 472 | "2 backport/owid/latest/dataset_941_technology_ad... \n", 473 | "3 backport/owid/latest/dataset_941_technology_ad... \n", 474 | "4 backport/owid/latest/dataset_941_technology_ad... \n", 475 | "\n", 476 | " table_db_name \\\n", 477 | "0 backport__owid__latest__dataset_941_technology... \n", 478 | "1 backport__owid__latest__dataset_941_technology... \n", 479 | "2 backport__owid__latest__dataset_941_technology... \n", 480 | "3 backport__owid__latest__dataset_941_technology... \n", 481 | "4 backport__owid__latest__dataset_941_technology... \n", 482 | "\n", 483 | " dataset_short_name variable_type \\\n", 484 | "0 dataset_941_technology_adoption__isard__1942__... FLOAT \n", 485 | "1 dataset_941_technology_adoption__isard__1942__... BIGINT \n", 486 | "2 dataset_941_technology_adoption__isard__1942__... BIGINT \n", 487 | "3 dataset_941_technology_adoption__isard__1942__... FLOAT \n", 488 | "4 dataset_941_technology_adoption__isard__1942__... BIGINT \n", 489 | "\n", 490 | " dimension_values \n", 491 | "0 {\"year\": [\"1988\", \"1989\", \"1990\", \"1991\", \"199... \n", 492 | "1 {\"year\": [\"1961\", \"1962\", \"1963\", \"1964\", \"196... 
\n", 493 | "2 {\"year\": [\"1930\", \"1931\", \"1932\", \"1933\", \"193... \n", 494 | "3 {\"year\": [\"1800\", \"1803\", \"1805\", \"1807\", \"180... \n", 495 | "4 {\"year\": [\"1988\", \"1989\", \"1990\", \"1991\", \"199... " 496 | ] 497 | }, 498 | "execution_count": 3, 499 | "metadata": {}, 500 | "output_type": "execute_result" 501 | } 502 | ], 503 | "source": [ 504 | "q = \"\"\"\n", 505 | "select\n", 506 | " *\n", 507 | "from meta_variables\n", 508 | "limit 5\n", 509 | "\"\"\"\n", 510 | "con.execute(q).fetch_df()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 4, 516 | "metadata": {}, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/html": [ 521 | "
\n", 522 | "\n", 535 | "\n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | "
cidnametypenotnulldflt_valuepk
00yearUBIGINTFalseNaNFalse
11entity_nameentity_nameFalseNaNFalse
22entity_idBIGINTFalseNaNFalse
33entity_codeentity_codeFalseNaNFalse
44atm__comin_and_hobijn__2004FLOATFalseNaNFalse
55agricultural_tractor__comin_and_hobijn__2004BIGINTFalseNaNFalse
66aviation_passenger_km__comin_and_hobijn__2004BIGINTFalseNaNFalse
77canals__isard__1942FLOATFalseNaNFalse
88card_payments__comin_and_hobijn__2004BIGINTFalseNaNFalse
99commercial_vehicles__comin_and_hobijn__2004FLOATFalseNaNFalse
1010credit_and_debit_payments__comin_and_hobijn__2004FLOATFalseNaNFalse
1111crude_steel_production__blast_oxygen_furnaces_...BIGINTFalseNaNFalse
1212crude_steel_production__electric_furnaces__com...INTEGERFalseNaNFalse
1313diesel_locomotives_in_service__us_census_burea...INTEGERFalseNaNFalse
1414mri_units__comin_and_hobijn__2004INTEGERFalseNaNFalse
1515mail__mitchell__1998BIGINTFalseNaNFalse
1616newspapers__comin_and_hobijn__2004BIGINTFalseNaNFalse
1717rail_passenger_km__comin_and_hobijn__2004BIGINTFalseNaNFalse
1818retail_locations_accepting_card__comin_and_hob...FLOATFalseNaNFalse
1919roads__us_census_bureau__2017INTEGERFalseNaNFalse
2020steamships__tons__comin_and_hobijn__2004INTEGERFalseNaNFalse
2121synthetic__non_cellulosic__fibres__comin_and_h...BIGINTFalseNaNFalse
2222telegrams__mitchell__1998FLOATFalseNaNFalse
\n", 757 | "
" 758 | ], 759 | "text/plain": [ 760 | " cid name type \\\n", 761 | "0 0 year UBIGINT \n", 762 | "1 1 entity_name entity_name \n", 763 | "2 2 entity_id BIGINT \n", 764 | "3 3 entity_code entity_code \n", 765 | "4 4 atm__comin_and_hobijn__2004 FLOAT \n", 766 | "5 5 agricultural_tractor__comin_and_hobijn__2004 BIGINT \n", 767 | "6 6 aviation_passenger_km__comin_and_hobijn__2004 BIGINT \n", 768 | "7 7 canals__isard__1942 FLOAT \n", 769 | "8 8 card_payments__comin_and_hobijn__2004 BIGINT \n", 770 | "9 9 commercial_vehicles__comin_and_hobijn__2004 FLOAT \n", 771 | "10 10 credit_and_debit_payments__comin_and_hobijn__2004 FLOAT \n", 772 | "11 11 crude_steel_production__blast_oxygen_furnaces_... BIGINT \n", 773 | "12 12 crude_steel_production__electric_furnaces__com... INTEGER \n", 774 | "13 13 diesel_locomotives_in_service__us_census_burea... INTEGER \n", 775 | "14 14 mri_units__comin_and_hobijn__2004 INTEGER \n", 776 | "15 15 mail__mitchell__1998 BIGINT \n", 777 | "16 16 newspapers__comin_and_hobijn__2004 BIGINT \n", 778 | "17 17 rail_passenger_km__comin_and_hobijn__2004 BIGINT \n", 779 | "18 18 retail_locations_accepting_card__comin_and_hob... FLOAT \n", 780 | "19 19 roads__us_census_bureau__2017 INTEGER \n", 781 | "20 20 steamships__tons__comin_and_hobijn__2004 INTEGER \n", 782 | "21 21 synthetic__non_cellulosic__fibres__comin_and_h... BIGINT \n", 783 | "22 22 telegrams__mitchell__1998 FLOAT \n", 784 | "\n", 785 | " notnull dflt_value pk \n", 786 | "0 False NaN False \n", 787 | "1 False NaN False \n", 788 | "2 False NaN False \n", 789 | "3 False NaN False \n", 790 | "4 False NaN False \n", 791 | "5 False NaN False \n", 792 | "6 False NaN False \n", 793 | "7 False NaN False \n", 794 | "8 False NaN False \n", 795 | "9 False NaN False \n", 796 | "10 False NaN False \n", 797 | "11 False NaN False \n", 798 | "12 False NaN False \n", 799 | "13 False NaN False \n", 800 | "14 False NaN False \n", 801 | "15 False NaN False \n", 802 | "16 False NaN False \n", 803 | "17 False NaN False \n", 804 | "18 False NaN False \n", 805 | "19 False NaN False \n", 806 | "20 False NaN False \n", 807 | "21 False NaN False \n", 808 | "22 False NaN False " 809 | ] 810 | }, 811 | "execution_count": 4, 812 | "metadata": {}, 813 | "output_type": "execute_result" 814 | } 815 | ], 816 | "source": [ 817 | "table_name = \"backport__owid__latest__dataset_941_technology_adoption__isard__1942__and_others__dataset_941_technology_adoption__isard__1942__and_others\"\n", 818 | "con.execute(f\"PRAGMA table_info('{table_name}');\").fetch_df()" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 8, 824 | "metadata": {}, 825 | "outputs": [ 826 | { 827 | "data": { 828 | "text/plain": [ 829 | "short_name atm__comin_and_hobijn__2004\n", 830 | "table_path backport/owid/latest/dataset_941_technology_ad...\n", 831 | "table_db_name backport__owid__latest__dataset_941_technology...\n", 832 | "variable_type FLOAT\n", 833 | "years_values [1988, 1989, 1990, 1991, 1992, 1993, 1994, 199...\n", 834 | "entities_values {'entity_id': [13], 'entity_code': ['USA'], 'e...\n", 835 | "title ATM (Comin and Hobijn (2004))\n", 836 | "description Number of electro-mechanical devices that perm...\n", 837 | "sources [{'name': 'Isard (1942) and others', 'descript...\n", 838 | "grapher_meta {'id': 42539, 'name': 'ATM (Comin and Hobijn (...\n", 839 | "variable_id 42539\n", 840 | "unit NaN\n", 841 | "short_unit NaN\n", 842 | "Name: 0, dtype: object" 843 | ] 844 | }, 845 | "execution_count": 8, 846 | "metadata": {}, 847 | "output_type": 
"execute_result" 848 | } 849 | ], 850 | "source": [ 851 | "q = f\"\"\"\n", 852 | "select * from meta_variables\n", 853 | "where variable_id = 42539\n", 854 | "\"\"\"\n", 855 | "r = con.execute(q).fetch_df()\n", 856 | "r.iloc[0]" 857 | ] 858 | } 859 | ], 860 | "metadata": { 861 | "interpreter": { 862 | "hash": "7cea4047479146d1310ae40921f620e4d325b759c497d12e215f27b54afd0461" 863 | }, 864 | "kernelspec": { 865 | "display_name": "Python 3.9.12 ('.venv': poetry)", 866 | "language": "python", 867 | "name": "python3" 868 | }, 869 | "language_info": { 870 | "codemirror_mode": { 871 | "name": "ipython", 872 | "version": 3 873 | }, 874 | "file_extension": ".py", 875 | "mimetype": "text/x-python", 876 | "name": "python", 877 | "nbconvert_exporter": "python", 878 | "pygments_lexer": "ipython3", 879 | "version": "3.10.0" 880 | }, 881 | "orig_nbformat": 4 882 | }, 883 | "nbformat": 4, 884 | "nbformat_minor": 2 885 | } 886 | -------------------------------------------------------------------------------- /crawler/utils.py: -------------------------------------------------------------------------------- 1 | def sanitize_table_path(path): 2 | # NOTE: version can contain - in dates (e.g. 2020-10-01) 3 | return path.replace("/", "__").replace("-", "_") 4 | -------------------------------------------------------------------------------- /data_api/__init__.py: -------------------------------------------------------------------------------- 1 | # this file is needed by poetry in order to run scripts without errors 2 | -------------------------------------------------------------------------------- /default.mk: -------------------------------------------------------------------------------- 1 | # 2 | # default.mk 3 | # 4 | 5 | SRC = src test 6 | 7 | default: help 8 | 9 | help-default: 10 | @echo 'Available commands:' 11 | @echo 12 | @echo ' make test Run all linting and unit tests' 13 | @echo ' make watch Run all tests, watching for changes' 14 | @echo 15 | 16 | # check formatting before lint, since an autoformat might fix linting issues 17 | test-default: check-formatting lint check-typing unittest 18 | 19 | .venv-default: 20 | @echo '==> Installing packages' 21 | git submodule update --init 22 | poetry install 23 | touch $@ 24 | 25 | lint-default: .venv 26 | @echo '==> Linting' 27 | @.venv/bin/flake8 $(SRC) 28 | 29 | check-formatting-default: .venv 30 | @echo '==> Checking formatting' 31 | @.venv/bin/black --check $(SRC) 32 | 33 | check-typing-default: .venv 34 | @echo '==> Checking types' 35 | .venv/bin/mypy $(SRC) 36 | 37 | unittest-default: .venv 38 | @echo '==> Running unit tests' 39 | .venv/bin/pytest $(SRC) 40 | 41 | format-default: .venv 42 | @echo '==> Reformatting files' 43 | @.venv/bin/black $(SRC) 44 | 45 | watch-default: .venv 46 | .venv/bin/watchmedo shell-command -c 'clear; make test' --recursive --drop . 47 | 48 | # allow you to override a command, e.g. 
"watch", but if you do not, then use 49 | # the default 50 | %: %-default 51 | @true 52 | -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Demo 2 | 3 | Start API with `make api` and then run demo with 4 | 5 | ``` 6 | python -m demo/demo.py 7 | ``` 8 | 9 | ## Installation 10 | 11 | Install additional requirements for pywebio 12 | 13 | ``` 14 | pip install -r demo/requirements.txt 15 | ``` 16 | -------------------------------------------------------------------------------- /demo/demo.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from pathlib import Path 3 | from typing import Any, Literal 4 | 5 | import pandas as pd 6 | import time 7 | import requests 8 | import yaml 9 | from pywebio import config 10 | from pywebio import input as pi 11 | from pywebio import output as po 12 | from pywebio import pin as pn 13 | from pywebio import start_server 14 | from pywebio.pin import pin 15 | from pywebio.session import set_env 16 | 17 | from app.v1.schemas import SearchResponse 18 | 19 | API_URL = "http://127.0.0.1:8000" 20 | 21 | CURRENT_DIR = Path(__file__).parent 22 | 23 | 24 | def _df_to_array(df: pd.DataFrame) -> list[list[Any]]: 25 | return [df.columns] + df.to_numpy().tolist() 26 | 27 | 28 | def _api_search(term, channels) -> pd.DataFrame: 29 | url = f"{API_URL}/v1/search" 30 | resp = requests.get(url, params={"term": term, "channels": channels}) 31 | print(f"Searching for {term}...") 32 | return pd.DataFrame(resp.json()["results"]) 33 | 34 | 35 | def _api_etl_data(data_url, limit: int) -> pd.DataFrame: 36 | url = f"{API_URL}{data_url}.feather?limit={limit}" 37 | return pd.read_feather(url) 38 | 39 | 40 | def _style_truncate(max_width="300px", max_lines=3): 41 | return f""" 42 | display: -webkit-box; 43 | max-width: {max_width}; 44 | -webkit-line-clamp: {max_lines}; 45 | -webkit-box-orient: vertical; 46 | overflow: hidden; 47 | """ 48 | 49 | 50 | def _list_channels() -> list[str]: 51 | url = f"{API_URL}/v1/dataset/data" 52 | return requests.get(url).json()["channels"] 53 | 54 | 55 | def _list_datasets() -> list[str]: 56 | url = f"{API_URL}/v1/datasets" 57 | return requests.get(url).json()["datasets"] 58 | 59 | 60 | def _put_table_preview(r: SearchResponse) -> None: 61 | """Show data of search result in a table preview""" 62 | t = time.time() 63 | df = _api_etl_data(r.data_url, limit=20) 64 | duration = time.time() - t 65 | 66 | # limit number of columns 67 | df = df.iloc[:, :10] 68 | 69 | po.put_markdown( 70 | f"""## Table {r.table_name} preview 71 | 72 | Dataframe shape: {df.shape} 73 | Dataframe size: {df.memory_usage().sum() / 1024 / 1024:.2f} MB 74 | Latency of pd.read_feather: {duration:.3f} s 75 | """ 76 | ) 77 | po.put_table(_df_to_array(df)) 78 | 79 | 80 | ACTION_BUTTONS = Literal["Variable", "Table", "Code"] 81 | 82 | 83 | def _put_search_results_table(sf: pd.DataFrame) -> None: 84 | sf["actions"] = sf.apply( 85 | lambda row: po.put_buttons( 86 | ACTION_BUTTONS.__args__, 87 | onclick=partial(_open_popup, result=SearchResponse(**row.to_dict())), 88 | ).style("min-width: 250px"), 89 | axis=1, 90 | ) 91 | 92 | # if title is missing, use short name 93 | ix = sf["variable_title"] == "nan" 94 | sf.loc[ix, "variable_title"] = sf.loc[ix, "variable_name"] 95 | 96 | sf["variable_description"] = sf["variable_description"].map( 97 | lambda s: po.put_text(s).style(_style_truncate()) 98 | ) 
99 | 100 | sf["variable_title"] = sf["variable_title"].map( 101 | lambda s: po.put_text(s).style(_style_truncate()) 102 | ) 103 | 104 | sf["match"] = sf["match"].round(3) 105 | 106 | po.put_table( 107 | _df_to_array( 108 | sf[ 109 | [ 110 | "variable_title", 111 | "variable_description", 112 | "variable_unit", 113 | "dataset_title", 114 | "channel", 115 | "match", 116 | "actions", 117 | ] 118 | ] 119 | ) 120 | ).style("font-size: 14px;") 121 | 122 | 123 | def _popup_variable_details(result: SearchResponse): 124 | url = f"{API_URL}{result.metadata_url}" 125 | resp = requests.get(url) 126 | assert resp.ok 127 | js = resp.json() 128 | 129 | meta = [v for v in js["variables"] if v["short_name"] == result.variable_name][0] 130 | 131 | po.popup( 132 | "Variable details", 133 | [po.put_code(yaml.dump(meta), language="yaml")], 134 | size=po.PopupSize.LARGE, 135 | ) 136 | 137 | 138 | def _popup_table_details(result: SearchResponse) -> None: 139 | url = f"{API_URL}{result.metadata_url}" 140 | resp = requests.get(url) 141 | assert resp.ok 142 | js = resp.json() 143 | 144 | cols = ["title", "description", "unit"] 145 | df = [] 146 | for v in js["variables"][:100]: 147 | df.append({c: v[c] for c in cols}) 148 | 149 | del js["variables"] 150 | 151 | po.popup( 152 | "Table details", 153 | [ 154 | po.put_code(yaml.dump(js), language="yaml"), 155 | po.put_markdown("### Variables"), 156 | po.put_table(_df_to_array(pd.DataFrame(df))).style("font-size: 14px;"), 157 | ], 158 | size=po.PopupSize.LARGE, 159 | ) 160 | 161 | 162 | def _popup_code_snippets(result: SearchResponse) -> None: 163 | ( 164 | _, 165 | _, 166 | _, 167 | _, 168 | channel, 169 | namespace, 170 | version, 171 | dataset, 172 | table, 173 | ) = result.metadata_url.split("/") 174 | if channel == "backport": 175 | catalog_snippet = f""" 176 | table = catalog.find_one( 177 | table="{table}", 178 | dataset="{dataset}", 179 | channels=["backport"], 180 | )""".strip() 181 | else: 182 | catalog_snippet = f""" 183 | table = catalog.find_one( 184 | table="{table}", 185 | namespace="{namespace}", 186 | dataset="{dataset}", 187 | channels=["{channel}"], 188 | )""".strip() 189 | 190 | po.popup( 191 | "Code snippets", 192 | [ 193 | po.put_markdown( 194 | f""" 195 | ### Fetch metadata from API 196 | ```python 197 | r = requests.get("{API_URL}{result.metadata_url}") 198 | assert r.ok 199 | metadata = r.json() 200 | ``` 201 | 202 | ### Fetch data from API 203 | ```python 204 | df = pd.read_feather("{API_URL}{result.data_url}.feather") 205 | df.head() 206 | ``` 207 | 208 | ### Get table from Python API 209 | ```python 210 | from owid import catalog 211 | {catalog_snippet} 212 | table.head() 213 | ``` 214 | """ 215 | ) 216 | ], 217 | size=po.PopupSize.LARGE, 218 | ) 219 | 220 | 221 | def _open_popup(choice: ACTION_BUTTONS, result: SearchResponse): 222 | if choice == "Variable": 223 | _popup_variable_details(result) 224 | elif choice == "Table": 225 | _popup_table_details(result) 226 | elif choice == "Code": 227 | _popup_code_snippets(result) 228 | else: 229 | raise NotImplementedError() 230 | 231 | 232 | INIT_VALUES = { 233 | "search_term": "gdp", 234 | "channels": ["garden", "backport"], 235 | } 236 | 237 | 238 | @config(css_file="static/style.css") 239 | def app(): 240 | channels = _list_channels() 241 | datasets = _list_datasets() 242 | 243 | set_env(output_animation=False) 244 | po.put_markdown("""# OWID Data Catalog""") 245 | pn.put_input("search_term", value=INIT_VALUES["search_term"], label="Search term") 246 | pn.put_select( 247 | "channels", 248 | 
label="Channels", 249 | multiple=True, 250 | options=channels, 251 | value=INIT_VALUES["channels"], 252 | ) 253 | pn.put_select( 254 | "datasets", 255 | label="Datasets", 256 | multiple=True, 257 | options=datasets, 258 | ) 259 | 260 | po.put_markdown("## Results") 261 | 262 | last_search_term = None 263 | while True: 264 | # get search term inputs or channel 265 | # NOTE: we need `timeout` together with last_search_term if user types too quickly 266 | # and we don't get the input yet 267 | pn.pin_wait_change("search_term", "channels", timeout=0.1) 268 | 269 | if last_search_term == pin.search_term: 270 | continue 271 | else: 272 | last_search_term = pin.search_term 273 | 274 | with po.use_scope("md", clear=True): 275 | search_term = pin.search_term 276 | channels = pin.channels 277 | sf = _api_search(search_term, channels=channels) 278 | 279 | if not sf.empty: 280 | _put_search_results_table(sf) 281 | 282 | # NOTE: not very useful, is going away soon 283 | _put_table_preview(sf.iloc[0]) 284 | 285 | 286 | if __name__ == "__main__": 287 | start_server(app, port=8001, debug=True, static_dir="demo/static") 288 | -------------------------------------------------------------------------------- /demo/requirements.txt: -------------------------------------------------------------------------------- 1 | pywebio 2 | git+https://github.com/owid/owid-grapher-py 3 | -------------------------------------------------------------------------------- /demo/static/style.css: -------------------------------------------------------------------------------- 1 | .container { 2 | /* width: 100%; */ 3 | /* max-width: unset; */ 4 | max-width: 1200px; 5 | } 6 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: docker is not functional yet! 2 | version: "3.8" 3 | 4 | services: 5 | app: 6 | build: . 
7 | env_file: 8 | - .env 9 | ports: 10 | - "8000:8000" 11 | 12 | 13 | database: 14 | image: mysql:8.0 15 | env_file: 16 | - .env 17 | ports: 18 | - "3306:3306" 19 | 20 | -------------------------------------------------------------------------------- /nbinit.py: -------------------------------------------------------------------------------- 1 | import os 2 | # import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import numpy as np 5 | # import seaborn as sns 6 | import json 7 | import sys 8 | import datetime 9 | from IPython import get_ipython 10 | import duckdb 11 | import sqlalchemy 12 | 13 | 14 | 15 | ipython = get_ipython() 16 | ipython.magic("load_ext rich") 17 | ipython.magic("load_ext sql") 18 | ipython.magic("load_ext autoreload") 19 | ipython.magic("autoreload 2") 20 | # ipython.magic("matplotlib inline") 21 | # ipython.magic("config InlineBackend.figure_format = 'svg'") 22 | ipython.magic("config SqlMagic.autopandas = True") 23 | ipython.magic("config SqlMagic.feedback = False") 24 | ipython.magic("config SqlMagic.displaycon = False") 25 | 26 | # nice / large graphs 27 | # sns.set_context("notebook") 28 | # plt.rcParams["figure.figsize"] = (6, 3) -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /profiling/profile_formats.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Profile Formats\n", 8 | "\n", 9 | "Before running this test, add the COVID dataset into `duck.db` with\n", 10 | "\n", 11 | "```bash\n", 12 | "python crawler/crawl.py --include 'covid19'\n", 13 | "```\n", 14 | "\n", 15 | "You might also want to remove the database with `rm duck.db` to start from scratch. Then run the API with `make api`.\n", 16 | "\n", 17 | "### COVID dataset:\n", 18 | "\n", 19 | "- shape: ?\n", 20 | "- dataframe size: ?\n", 21 | "- S3 size: ?\n", 22 | "- DuckDB size: ?"
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 8, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "JSON format: 30.86s\n", 35 | "CSV format: 9.28s\n", 36 | "Feather format: 0.55s\n", 37 | "Feather format (direct): 0.47s\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "import time\n", 43 | "import pandas as pd\n", 44 | "import requests\n", 45 | "\n", 46 | "url_wo_format = 'http://127.0.0.1:8000/v1/dataset/data/garden/owid/latest/covid/covid'\n", 47 | "\n", 48 | "t = time.time()\n", 49 | "r = requests.get(url_wo_format + '.json')\n", 50 | "assert r.ok\n", 51 | "print(f'JSON format: {time.time() - t:.2f}s')\n", 52 | "\n", 53 | "t = time.time()\n", 54 | "r = requests.get(url_wo_format + '.csv')\n", 55 | "assert r.ok\n", 56 | "print(f'CSV format: {time.time() - t:.2f}s')\n", 57 | "\n", 58 | "t = time.time()\n", 59 | "r = requests.get(url_wo_format + '.feather')\n", 60 | "assert r.ok\n", 61 | "print(f'Feather format: {time.time() - t:.2f}s')\n", 62 | "\n", 63 | "t = time.time()\n", 64 | "r = requests.get(url_wo_format + '.feather_direct')\n", 65 | "assert r.ok\n", 66 | "print(f'Feather format (direct): {time.time() - t:.2f}s')" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 9, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "Size in MB 69.274788\n", 78 | "Shape (202415, 67)\n", 79 | "dtype: object" 80 | ] 81 | }, 82 | "execution_count": 9, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# dataset info\n", 89 | "df = pd.read_feather(url_wo_format + '.feather')\n", 90 | "pd.Series({\n", 91 | " \"Size in MB\": df.memory_usage(deep=True).sum() / 1e6,\n", 92 | " \"Shape\": df.shape,\n", 93 | "})" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3.10.0 ('.venv': poetry)", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.10.0" 114 | }, 115 | "orig_nbformat": 4, 116 | "vscode": { 117 | "interpreter": { 118 | "hash": "7cea4047479146d1310ae40921f620e4d325b759c497d12e215f27b54afd0461" 119 | } 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 2 124 | } 125 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "data-api" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Our World In Data "] 6 | 7 | [tool.poetry.scripts] 8 | crawl = "crawler.crawl:main_cli" 9 | 10 | [tool.poetry.dependencies] 11 | python = "^3.9" 12 | owid-catalog = {path = "vendor/owid-catalog-py", develop = true} 13 | fastapi = "^0.67.0" 14 | pydantic = {extras = ["dotenv"], version = "^1.9.1"} 15 | pandas = "^1.4.2" 16 | SQLAlchemy = {extras = ["mypy"], version = "^1.4.39"} 17 | mysqlclient = "^2.1.0" 18 | rich = "^12.4.4" 19 | typer = "^0.4.1" 20 | duckdb = "^0.4.0" 21 | duckdb-engine = "^0.1.11" 22 | structlog = "^21.5.0" 23 | hypercorn = "^0.13.2" 24 | orjson = "^3.7.11" 25 | bugsnag = "^4.2.1" 26 | 27 | [tool.poetry.dev-dependencies] 28 | pytest = "^7.1.2" 29 | pytest-cov = "^2.10.1" 30 | autoflake = "^1.4" 31 | flake8 = 
"^3.8.4" 32 | mypy = "^0.961" 33 | isort = "^5.0" 34 | pre-commit = "^2.8.2" 35 | black = {version = "^22.3.0", extras = ["jupyter"]} 36 | ipykernel = "^6.13.1" 37 | types-PyYAML = "^6.0.11" 38 | types-requests = "^2.28.3" 39 | 40 | [build-system] 41 | requires = ["poetry-core>=1.1.14"] 42 | build-backend = "poetry.core.masonry.api" 43 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | profile = black 3 | known_first_party = app 4 | 5 | [flake8] 6 | # Ignore some errors, since we autoformat them away already wherever possible 7 | # from https://github.com/psf/black/blob/main/.flake8 8 | # E302 is ignored to support jupytext files 9 | ignore = E203, E266, E501, W503, E302 10 | exclude = .ipynb_checkpoints 11 | 12 | [mypy] 13 | plugins = pydantic.mypy, sqlalchemy.ext.mypy.plugin 14 | ignore_missing_imports = True 15 | follow_imports = skip 16 | strict_optional = True 17 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/tests/__init__.py -------------------------------------------------------------------------------- /tests/crawler/test_crawl.py: -------------------------------------------------------------------------------- 1 | from app.v1 import metadata 2 | 3 | # def test_extract_dimension_values(): 4 | # df = pd.DataFrame( 5 | # { 6 | # "entity_id": [1, 2, 3], 7 | # "entity_name": ["A", "B", "C"], 8 | # "entity_code": ["c1", "c2", "c3"], 9 | # "year": [2000, 2001, 2002], 10 | # # "value": [1, 2, 3], 11 | # } 12 | # ).set_index(["year", "entity_id", "entity_code", "entity_name"]) 13 | 14 | # dim_values = crawl._extract_dimension_values(df.index) 15 | # assert dim_values == { 16 | # "entity_zip": ["1|A|c1", "2|B|c2", "3|C|c3"], 17 | # "year": [2000, 2001, 2002], 18 | # } 19 | 20 | 21 | def test_parse_dimension_values(): 22 | dim_values = { 23 | "entity_zip": ["1|A|c1", "2|B|c2", "3|C|c3"], 24 | "year": [2000, 2001, 2002], 25 | } 26 | dims = metadata._parse_dimension_values(dim_values) 27 | assert dims == { 28 | "years": metadata.Dimension( 29 | type="int", 30 | values=[ 31 | metadata.DimensionProperties(id=2000, name=None, code=None), 32 | metadata.DimensionProperties(id=2001, name=None, code=None), 33 | metadata.DimensionProperties(id=2002, name=None, code=None), 34 | ], 35 | ), 36 | "entities": metadata.Dimension( 37 | type="int", 38 | values=[ 39 | metadata.DimensionProperties(id=1, name="A", code="c1"), 40 | metadata.DimensionProperties(id=2, name="B", code="c2"), 41 | metadata.DimensionProperties(id=3, name="C", code="c3"), 42 | ], 43 | ), 44 | } 45 | -------------------------------------------------------------------------------- /tests/sample_duck.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/owid/data-api/51c64b08605417f7a5ca5949ea425884fc9801f2/tests/sample_duck.db -------------------------------------------------------------------------------- /tests/test_v1.py: -------------------------------------------------------------------------------- 1 | import io 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | from fastapi.testclient import TestClient 6 | 7 | from app.main import app, settings 8 | 9 | client = TestClient(app) 10 | 11 | # mock 
settings 12 | settings.DUCKDB_PATH = Path("tests/sample_duck.db") 13 | 14 | 15 | def test_health(): 16 | response = client.get("/health") 17 | assert response.status_code == 200 18 | assert response.json()["status"] == "ok" 19 | 20 | 21 | def test_variableById_data_for_variable(): 22 | response = client.get("/v1/variableById/data/42539") 23 | assert response.status_code == 200 24 | assert set(response.json().keys()) == { 25 | "years", 26 | "entity_names", 27 | "entities", 28 | "entity_codes", 29 | "values", 30 | } 31 | 32 | 33 | def test_variableById_metadata_for_backported_variable(): 34 | # this test requires connection to the database, this is only temporary and will change once we start getting 35 | # metadata from the catalog instead of the database 36 | response = client.get("/v1/variableById/metadata/42539") 37 | assert response.status_code == 200 38 | assert response.json() == { 39 | "name": "ATM (Comin and Hobijn (2004))", 40 | "unit": "", 41 | "description": "Number of electro-mechanical devices that permit authorized users, typically using machine readable plastic cards, to withdraw cash from their accounts and/or access other services", 42 | "createdAt": "2017-09-30T19:53:00", 43 | "updatedAt": "2018-02-28T08:58:52", 44 | "coverage": "", 45 | "timespan": "", 46 | "datasetId": 941, 47 | "columnOrder": 0, 48 | "datasetName": "Technology Adoption - Isard (1942) and others", 49 | "nonRedistributable": False, 50 | "display": {}, 51 | "source": { 52 | "id": 6800, 53 | "name": "Isard (1942) and others", 54 | "dataPublishedBy": "Isard (1942) and others", 55 | "dataPublisherSource": "Scholarly work", 56 | "link": "http://www.jstor.org/stable/1927670", 57 | "retrievedDate": "28/09/2017", 58 | "additionalInfo": "Roads - Historical Statistics of the United States, Colonial Times to 1970, Volume 1 and 2. Bureau of the Census, Washington D.C. see Chapter Q - Transportation, Q50-63. Link: https://www2.census.gov/library/publications/1975/compendia/hist_stats_colonial-1970/hist_stats_colonial-1970p2-chQ.pdf;\nDiesel locomotives - Historical Statistics of the United States, Colonial Times to 1970, Volume 1 and 2. Bureau of the Census, Washington D.C. see Chapter Q - Transportation, Series Q284-312: Railroad mileage, equipment, and passenger traffic and revenue: 1890 to 1970. Link: https://www2.census.gov/library/publications/1975/compendia/hist_stats_colonial-1970/hist_stats_colonial-1970p2-chQ.pdf;\nAgricultural tractor, ATM, Aviation passenger-km, Credit and debit payments, Card payments, MRI units, Newspapers, Retail locations accepting card, Rail passenger-km, Steamships (tons), Crude steel production (blast oxygen furnaces)/(electric furnaces), Synthetic (non-cellulosic) fibres, Commercial vehicles - Comin and Hobijn (2004). 
Link: http://www.nber.org/data/chat/;\nMail and telegrams - Mitchell (1998) International Historical Statistics: the Americas, 1970-2000, 5th Ed", 59 | }, 60 | "type": "FLOAT", 61 | "dimensions": { 62 | "years": { 63 | "type": "int", 64 | "values": [ 65 | {"id": 1800}, 66 | {"id": 1803}, 67 | {"id": 1805}, 68 | {"id": 1807}, 69 | {"id": 1808}, 70 | {"id": 1809}, 71 | {"id": 1810}, 72 | {"id": 1811}, 73 | {"id": 1812}, 74 | {"id": 1813}, 75 | {"id": 1814}, 76 | {"id": 1815}, 77 | {"id": 1816}, 78 | {"id": 1817}, 79 | {"id": 1818}, 80 | {"id": 1819}, 81 | {"id": 1820}, 82 | {"id": 1821}, 83 | {"id": 1822}, 84 | {"id": 1823}, 85 | {"id": 1824}, 86 | {"id": 1825}, 87 | {"id": 1826}, 88 | {"id": 1827}, 89 | {"id": 1828}, 90 | {"id": 1829}, 91 | {"id": 1830}, 92 | {"id": 1831}, 93 | {"id": 1832}, 94 | {"id": 1833}, 95 | {"id": 1834}, 96 | {"id": 1835}, 97 | {"id": 1836}, 98 | {"id": 1837}, 99 | {"id": 1838}, 100 | {"id": 1839}, 101 | {"id": 1840}, 102 | {"id": 1841}, 103 | {"id": 1842}, 104 | {"id": 1843}, 105 | {"id": 1844}, 106 | {"id": 1845}, 107 | {"id": 1846}, 108 | {"id": 1847}, 109 | {"id": 1848}, 110 | {"id": 1849}, 111 | {"id": 1850}, 112 | {"id": 1851}, 113 | {"id": 1852}, 114 | {"id": 1853}, 115 | {"id": 1854}, 116 | {"id": 1855}, 117 | {"id": 1856}, 118 | {"id": 1857}, 119 | {"id": 1858}, 120 | {"id": 1859}, 121 | {"id": 1860}, 122 | {"id": 1861}, 123 | {"id": 1862}, 124 | {"id": 1863}, 125 | {"id": 1864}, 126 | {"id": 1865}, 127 | {"id": 1866}, 128 | {"id": 1867}, 129 | {"id": 1868}, 130 | {"id": 1869}, 131 | {"id": 1870}, 132 | {"id": 1871}, 133 | {"id": 1872}, 134 | {"id": 1873}, 135 | {"id": 1874}, 136 | {"id": 1875}, 137 | {"id": 1876}, 138 | {"id": 1877}, 139 | {"id": 1878}, 140 | {"id": 1879}, 141 | {"id": 1880}, 142 | {"id": 1881}, 143 | {"id": 1882}, 144 | {"id": 1883}, 145 | {"id": 1884}, 146 | {"id": 1885}, 147 | {"id": 1886}, 148 | {"id": 1887}, 149 | {"id": 1888}, 150 | {"id": 1889}, 151 | {"id": 1890}, 152 | {"id": 1891}, 153 | {"id": 1892}, 154 | {"id": 1893}, 155 | {"id": 1894}, 156 | {"id": 1895}, 157 | {"id": 1896}, 158 | {"id": 1897}, 159 | {"id": 1898}, 160 | {"id": 1899}, 161 | {"id": 1900}, 162 | {"id": 1901}, 163 | {"id": 1902}, 164 | {"id": 1903}, 165 | {"id": 1904}, 166 | {"id": 1905}, 167 | {"id": 1906}, 168 | {"id": 1907}, 169 | {"id": 1908}, 170 | {"id": 1909}, 171 | {"id": 1910}, 172 | {"id": 1911}, 173 | {"id": 1912}, 174 | {"id": 1913}, 175 | {"id": 1914}, 176 | {"id": 1915}, 177 | {"id": 1916}, 178 | {"id": 1917}, 179 | {"id": 1918}, 180 | {"id": 1919}, 181 | {"id": 1920}, 182 | {"id": 1921}, 183 | {"id": 1922}, 184 | {"id": 1923}, 185 | {"id": 1924}, 186 | {"id": 1925}, 187 | {"id": 1926}, 188 | {"id": 1927}, 189 | {"id": 1928}, 190 | {"id": 1929}, 191 | {"id": 1930}, 192 | {"id": 1931}, 193 | {"id": 1932}, 194 | {"id": 1933}, 195 | {"id": 1934}, 196 | {"id": 1935}, 197 | {"id": 1936}, 198 | {"id": 1937}, 199 | {"id": 1938}, 200 | {"id": 1939}, 201 | {"id": 1940}, 202 | {"id": 1941}, 203 | {"id": 1942}, 204 | {"id": 1943}, 205 | {"id": 1944}, 206 | {"id": 1945}, 207 | {"id": 1946}, 208 | {"id": 1947}, 209 | {"id": 1948}, 210 | {"id": 1949}, 211 | {"id": 1950}, 212 | {"id": 1951}, 213 | {"id": 1952}, 214 | {"id": 1953}, 215 | {"id": 1954}, 216 | {"id": 1955}, 217 | {"id": 1956}, 218 | {"id": 1957}, 219 | {"id": 1958}, 220 | {"id": 1959}, 221 | {"id": 1960}, 222 | {"id": 1961}, 223 | {"id": 1962}, 224 | {"id": 1963}, 225 | {"id": 1964}, 226 | {"id": 1965}, 227 | {"id": 1966}, 228 | {"id": 1967}, 229 | {"id": 1968}, 230 | {"id": 1969}, 231 | {"id": 
1970}, 232 | {"id": 1971}, 233 | {"id": 1972}, 234 | {"id": 1973}, 235 | {"id": 1974}, 236 | {"id": 1975}, 237 | {"id": 1976}, 238 | {"id": 1977}, 239 | {"id": 1978}, 240 | {"id": 1979}, 241 | {"id": 1980}, 242 | {"id": 1981}, 243 | {"id": 1982}, 244 | {"id": 1983}, 245 | {"id": 1984}, 246 | {"id": 1985}, 247 | {"id": 1986}, 248 | {"id": 1987}, 249 | {"id": 1988}, 250 | {"id": 1989}, 251 | {"id": 1990}, 252 | {"id": 1991}, 253 | {"id": 1992}, 254 | {"id": 1993}, 255 | {"id": 1994}, 256 | {"id": 1995}, 257 | {"id": 1996}, 258 | {"id": 1997}, 259 | {"id": 1998}, 260 | {"id": 1999}, 261 | {"id": 2000}, 262 | {"id": 2001}, 263 | {"id": 2002}, 264 | {"id": 2003}, 265 | ], 266 | }, 267 | "entities": { 268 | "type": "int", 269 | "values": [{"id": 13, "name": "United States", "code": "USA"}], 270 | }, 271 | }, 272 | } 273 | 274 | 275 | TEST_RESPONSE_JSON = { 276 | "country": ["Afghanistan", "Afghanistan"], 277 | "population": [3280000.0, 4207000.0], 278 | "year": [1820, 1870], 279 | } 280 | 281 | 282 | def test_dataset_data_for_etl_table_json_format(): 283 | # this test requires a connection to the database; this is only temporary and will change once we start getting 284 | # metadata from the catalog instead of the database 285 | response = client.get( 286 | "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp.json", 287 | params={"limit": 2, "columns": "year,country,population"}, 288 | ) 289 | assert response.status_code == 200 290 | assert response.json() == TEST_RESPONSE_JSON 291 | 292 | 293 | def test_dataset_data_for_etl_table_csv_format(): 294 | response = client.get( 295 | "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp.csv", 296 | params={"limit": 2, "columns": "year,country,population"}, 297 | ) 298 | assert response.status_code == 200 299 | df = pd.read_csv(io.StringIO(response.text)) 300 | assert df.to_dict(orient="list") == TEST_RESPONSE_JSON 301 | 302 | 303 | def test_dataset_data_for_etl_table_feather_format(): 304 | response = client.get( 305 | "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp.feather", 306 | params={"limit": 2, "columns": "year,country,population"}, 307 | ) 308 | assert response.status_code == 200 309 | df = pd.read_feather(io.BytesIO(response.content)) 310 | assert df.to_dict(orient="list") == TEST_RESPONSE_JSON 311 | 312 | 313 | def test_dataset_metadata_for_etl_table(): 314 | response = client.get( 315 | "/v1/dataset/metadata/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp", 316 | params={"limit": 2}, 317 | ) 318 | assert response.status_code == 200 319 | js = response.json() 320 | 321 | # trim long fields 322 | js["dataset"]["description"] = js["dataset"]["description"][:20] 323 | 324 | assert js == { 325 | "dataset": { 326 | "channel": "garden", 327 | "namespace": "ggdc", 328 | "short_name": "ggdc_maddison", 329 | "title": "Maddison Project Database (GGDC, 2020)", 330 | "description": "Notes:\n- Tanzania re", 331 | "sources": [ 332 | { 333 | "name": "Maddison Project Database 2020 (Bolt and van Zanden, 2020)", 334 | "url": "https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020", 335 | "source_data_url": "https://www.rug.nl/ggdc/historicaldevelopment/maddison/data/mpd2020.xlsx", 336 | "owid_data_url": "https://walden.nyc3.digitaloceanspaces.com/ggdc/2020-10-01/ggdc_maddison.xlsx", 337 | "date_accessed": "2022-04-12", 338 | "publication_date": "2020-10-01", 339 | "publication_year": 2020, 340 | "published_by": "Bolt, Jutta and Jan Luiten van Zanden (2020), “Maddison 
style estimates of the evolution of the world economy. A new 2020 update“.", 341 | "publisher_source": "The Maddison Project Database is based on the work of many researchers that have produced estimates of\neconomic growth for individual countries. The full list of sources for this historical data is given for each country below.\n", 342 | } 343 | ], 344 | "licenses": [ 345 | { 346 | "name": "Creative Commons BY 4.0", 347 | "url": "https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2020", 348 | } 349 | ], 350 | "is_public": True, 351 | "checksum": "cc6d7a0cf74c962c4f0ac9d1d019a747", 352 | "version": "2020-10-01", 353 | }, 354 | "table": { 355 | "table_name": "maddison_gdp", 356 | "dataset_name": "ggdc_maddison", 357 | "version": "2020-10-01", 358 | "namespace": "ggdc", 359 | "channel": "garden", 360 | "dimensions": ["country", "year"], 361 | "path": "garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp", 362 | "format": "feather", 363 | "is_public": True, 364 | }, 365 | "variables": [ 366 | { 367 | "title": "GDP per capita", 368 | "description": None, 369 | "licenses": [], 370 | "sources": [], 371 | "unit": "2011 int-$", 372 | "short_unit": "$", 373 | "display": { 374 | "entityAnnotationsMap": "Western Offshoots: United States, Canada, Australia and New Zealand", 375 | "numDecimalPlaces": 0, 376 | }, 377 | "short_name": "gdp_per_capita", 378 | "table_path": "garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp", 379 | "dataset_short_name": "ggdc_maddison", 380 | "variable_type": "FLOAT", 381 | }, 382 | { 383 | "title": "Population", 384 | "description": None, 385 | "licenses": [], 386 | "sources": [], 387 | "unit": "people", 388 | "short_unit": None, 389 | "display": { 390 | "entityAnnotationsMap": "Western Offshoots: United States, Canada, Australia and New Zealand" 391 | }, 392 | "short_name": "population", 393 | "table_path": "garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp", 394 | "dataset_short_name": "ggdc_maddison", 395 | "variable_type": "FLOAT", 396 | }, 397 | { 398 | "title": "GDP", 399 | "description": "Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) and price differences between countries. 
Calculated by multiplying GDP per capita with population.", 400 | "licenses": [], 401 | "sources": [], 402 | "unit": "2011 int-$", 403 | "short_unit": "$", 404 | "display": { 405 | "entityAnnotationsMap": "Western Offshoots: United States, Canada, Australia and New Zealand", 406 | "numDecimalPlaces": 0, 407 | }, 408 | "short_name": "gdp", 409 | "table_path": "garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp", 410 | "dataset_short_name": "ggdc_maddison", 411 | "variable_type": "FLOAT", 412 | }, 413 | ], 414 | } 415 | 416 | 417 | def test_dataset_metadata_for_backported_table(): 418 | response = client.get( 419 | "/v1/dataset/metadata/backport/owid/latest/dataset_941_technology_adoption__isard__1942__and_others/dataset_941_technology_adoption__isard__1942__and_others", 420 | ) 421 | assert response.status_code == 200 422 | response.json() 423 | 424 | 425 | def test_search(): 426 | response = client.get( 427 | "/v1/search", 428 | params={"term": "population"}, 429 | ) 430 | assert response.status_code == 200 431 | js = response.json() 432 | assert js == { 433 | "results": [ 434 | { 435 | "variable_name": "population", 436 | "variable_title": "Population", 437 | "variable_description": "nan", 438 | "variable_unit": "people", 439 | "table_name": "maddison_gdp", 440 | "dataset_title": "Maddison Project Database (GGDC, 2020)", 441 | "channel": "garden", 442 | "metadata_url": "/v1/dataset/metadata/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp", 443 | "data_url": "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp", 444 | "match": 1.8276277047674334, 445 | }, 446 | { 447 | "variable_name": "gdp", 448 | "variable_title": "GDP", 449 | "variable_description": "Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) and price differences between countries. Calculated by multiplying GDP per capita with population.", 450 | "variable_unit": "2011 int-$", 451 | "table_name": "maddison_gdp", 452 | "dataset_title": "Maddison Project Database (GGDC, 2020)", 453 | "channel": "garden", 454 | "metadata_url": "/v1/dataset/metadata/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp", 455 | "data_url": "/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp", 456 | "match": 1.5464542117262898, 457 | }, 458 | ] 459 | } 460 | --------------------------------------------------------------------------------
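The test suite above effectively documents the v1 API surface (/health, /v1/variableById/data/{id}, /v1/variableById/metadata/{id}, /v1/dataset/data/... in json, csv and feather formats, /v1/dataset/metadata/..., and /v1/search). As a rough illustrative sketch only, not part of the repository: the same endpoints could be exercised from outside the test client roughly as below. The base URL, the use of the requests library, and a locally running API instance are assumptions; the paths and query parameters are taken from the tests themselves.

import io

import pandas as pd
import requests

# Assumption: the API is served locally on port 8000; adjust to your deployment.
BASE_URL = "http://localhost:8000"

# Data for a single backported variable, as in test_variableById_data_for_variable.
variable_data = requests.get(f"{BASE_URL}/v1/variableById/data/42539").json()

# An ETL table slice as CSV, mirroring test_dataset_data_for_etl_table_csv_format.
csv_resp = requests.get(
    f"{BASE_URL}/v1/dataset/data/garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp.csv",
    params={"limit": 2, "columns": "year,country,population"},
)
df = pd.read_csv(io.StringIO(csv_resp.text))

# Full-text search across indexed variables, mirroring test_search.
results = requests.get(f"{BASE_URL}/v1/search", params={"term": "population"}).json()["results"]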