├── datacommons_client
│   ├── models
│   │   ├── __init__.py
│   │   ├── resolve.py
│   │   ├── node.py
│   │   ├── base.py
│   │   └── observation.py
│   ├── tests
│   │   ├── README.MD
│   │   ├── test_utils.py
│   │   ├── test_decorators.py
│   │   ├── models
│   │   │   ├── test_resolve_models.py
│   │   │   ├── test_node_models.py
│   │   │   └── test_observation_models.py
│   │   ├── test_names.py
│   │   ├── endpoints
│   │   │   ├── test_error_handling.py
│   │   │   ├── test_observation_endpoint.py
│   │   │   ├── test_payloads.py
│   │   │   └── test_resolve_endpoint.py
│   │   ├── test_dataframes.py
│   │   └── utils
│   │       └── test_graph.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── decorators.py
│   │   ├── names.py
│   │   ├── error_handling.py
│   │   ├── dataframes.py
│   │   └── data_processing.py
│   ├── endpoints
│   │   ├── __init__.py
│   │   ├── payloads.py
│   │   ├── resolve.py
│   │   ├── base.py
│   │   └── observation.py
│   ├── __init__.py
│   ├── README.md
│   └── client.py
├── datacommons_pandas
│   ├── core.py
│   ├── key.py
│   ├── node.py
│   ├── places.py
│   ├── sparql.py
│   ├── utils.py
│   ├── requests.py
│   ├── stat_vars.py
│   ├── test
│   │   └── __init__.py
│   ├── examples
│   │   ├── __init__.py
│   │   └── df_builder.py
│   ├── README.md
│   ├── __init__.py
│   ├── setup.py
│   └── CHANGELOG.md
├── requirements.txt
├── notebooks
│   ├── intro_data_science
│   │   └── README.md
│   └── README.md
├── cloudbuild.yaml
├── .github
│   └── ISSUE_TEMPLATE
│       ├── default-template.md
│       └── bug_report.md
├── datacommons
│   ├── examples
│   │   ├── __init__.py
│   │   ├── query.py
│   │   ├── core.py
│   │   └── places.py
│   ├── test
│   │   ├── __init__.py
│   │   ├── set_api_key_test.py
│   │   ├── sparql_test.py
│   │   └── node_test.py
│   ├── key.py
│   ├── README.md
│   ├── requests.py
│   ├── __init__.py
│   ├── setup.py
│   ├── node.py
│   ├── sparql.py
│   ├── utils.py
│   ├── core.py
│   └── stat_vars.py
├── CONTRIBUTING.md
├── docs
│   ├── development.md
│   └── release.md
├── .gitignore
├── pyproject.toml
└── run_test.sh

--------------------------------------------------------------------------------
/datacommons_client/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/datacommons_client/tests/README.MD:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/datacommons_client/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/datacommons_client/endpoints/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/datacommons_pandas/core.py:
--------------------------------------------------------------------------------
../datacommons/core.py

--------------------------------------------------------------------------------
/datacommons_pandas/key.py:
--------------------------------------------------------------------------------
../datacommons/key.py

--------------------------------------------------------------------------------
/datacommons_pandas/node.py:
--------------------------------------------------------------------------------
../datacommons/node.py

--------------------------------------------------------------------------------
/datacommons_pandas/places.py:
--------------------------------------------------------------------------------
../datacommons/places.py

--------------------------------------------------------------------------------
/datacommons_pandas/sparql.py:
--------------------------------------------------------------------------------
../datacommons/sparql.py
--------------------------------------------------------------------------------
/datacommons_pandas/utils.py:
--------------------------------------------------------------------------------
../datacommons/utils.py

--------------------------------------------------------------------------------
/datacommons_pandas/requests.py:
--------------------------------------------------------------------------------
../datacommons/requests.py

--------------------------------------------------------------------------------
/datacommons_pandas/stat_vars.py:
--------------------------------------------------------------------------------
../datacommons/stat_vars.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
isort==5.13.2
mock
pandas
pytest
requests==2.32.0
typing_extensions==4.12.2
yapf==0.40.2
pydantic>=2.11

--------------------------------------------------------------------------------
/notebooks/intro_data_science/README.md:
--------------------------------------------------------------------------------
All notebooks have been updated to use the V2 Python APIs and are found in the `v2/intro_data_science` directory.

--------------------------------------------------------------------------------
/cloudbuild.yaml:
--------------------------------------------------------------------------------
steps:
- id: api_python
  name: python:3.10-slim
  entrypoint: /bin/bash
  args:
  - -c
  - "./run_test.sh -s && hatch run test:all"

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/default-template.md:
--------------------------------------------------------------------------------
---
name: Default template
about: 'Create an issue for all other questions about the API'
title: ''
labels: ''
assignees: ''

---

--------------------------------------------------------------------------------
/datacommons_client/utils/decorators.py:
--------------------------------------------------------------------------------
from functools import wraps

try:
  import pandas as pd
except ImportError:
  pd = None


def requires_pandas(func):
  """Decorator to check if Pandas is available before executing a method."""

  @wraps(func)
  def wrapper(*args, **kwargs):
    if pd is None:
      raise ImportError("Pandas is required for this method")
    return func(*args, **kwargs)

  return wrapper

--------------------------------------------------------------------------------
/datacommons/examples/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

--------------------------------------------------------------------------------
/datacommons/test/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

--------------------------------------------------------------------------------
/datacommons_pandas/test/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2020 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

--------------------------------------------------------------------------------
/datacommons_pandas/examples/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2020 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

--------------------------------------------------------------------------------
/datacommons_client/__init__.py:
--------------------------------------------------------------------------------
__version__ = "2.1.4"
"""
Data Commons Client Package

This package provides a Python client for interacting with the Data Commons API.
6 | """ 7 | 8 | from datacommons_client.client import DataCommonsClient 9 | from datacommons_client.endpoints.base import API 10 | from datacommons_client.endpoints.node import NodeEndpoint 11 | from datacommons_client.endpoints.observation import ObservationEndpoint 12 | from datacommons_client.endpoints.resolve import ResolveEndpoint 13 | 14 | __all__ = [ 15 | "DataCommonsClient", 16 | "API", 17 | "NodeEndpoint", 18 | "ObservationEndpoint", 19 | "ResolveEndpoint", 20 | ] 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve the API 4 | title: "[BUG] Description of bug" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /datacommons/key.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ API key related functions. 15 | """ 16 | 17 | import os 18 | 19 | # Environment variable for API key. 20 | _KEY_ENV = 'DC_API_KEY' 21 | 22 | 23 | def set_api_key(api_key): 24 | os.environ[_KEY_ENV] = api_key 25 | 26 | 27 | def get_api_key(): 28 | return os.environ.get(_KEY_ENV, '') 29 | -------------------------------------------------------------------------------- /datacommons/test/set_api_key_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ Data Commons Python API unit tests. 15 | 16 | Unit tests setting the API Key. 
17 | """ 18 | import unittest 19 | 20 | import datacommons.key as key 21 | 22 | _KEY = "test-api-key" 23 | 24 | 25 | class TestApiKey(unittest.TestCase): 26 | """Unit test for setting or not setting the API Key.""" 27 | 28 | def test_set_api_key(self): 29 | key.set_api_key(_KEY) 30 | self.assertEqual(key.get_api_key(), _KEY) 31 | 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /datacommons_client/README.md: -------------------------------------------------------------------------------- 1 | # Data Commons Python API 2 | 3 | This is a Python library for accessing data in the Data Commons Graph. 4 | 5 | To get started, install this package from pip. 6 | 7 | ```bash 8 | pip install datacommons-client 9 | ``` 10 | 11 | To get additional functionality to work with Pandas DataFrames, install the package 12 | with the optional Pandas dependency. 13 | 14 | ```bash 15 | pip install "datacommons-client[Pandas]" 16 | ``` 17 | 18 | Once the package is installed, import `datacommons_client`. 19 | 20 | ```python 21 | import datacommons_client as dc 22 | ``` 23 | 24 | For more detail on getting started with the API, please visit . 25 | 26 | ## About Data Commons 27 | 28 | [Data Commons](https://datacommons.org/) is an open knowledge repository that 29 | provides a unified view across multiple public data sets and statistics. You can 30 | view what [datasets](https://datacommons.org/datasets) are currently ingested 31 | and browse the graph using our [browser](https://datacommons.org/browser). 32 | 33 | ## License 34 | 35 | Apache 2.0 36 | 37 | ## Support 38 | 39 | For questions, please send an email to `support@datacommons.org`. 40 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # Python API Development 2 | 3 | This client library supports `python>=3.10`. 4 | 5 | ## Set up 6 | If you haven't already, clone this repository. 

```bash
git clone https://github.com/datacommonsorg/api-python.git
cd api-python
```

To set up the Python environment for development, run:

```bash
./run_test.sh -s
```

This will install `hatch`, which is the main tool used to manage the
environment, dependencies, and development tools. You can also manually install
`hatch` and create a virtual environment.

```bash
pip install hatch
hatch env create
```

## Code style and linting
We use `isort` and `yapf` for code formatting. Check formatting with:

```bash
hatch run lint:check
```

To automatically fix formatting, run:

```bash
hatch run lint:format
```

## Running tests

To run the tests:

```bash
hatch run test:all
```

To debug the continuous integration tests, run:

```bash
gcloud builds submit . --project=datcom-ci --config=cloudbuild.yaml
```

Both commands will run the same set of tests.

--------------------------------------------------------------------------------
/datacommons_client/tests/test_utils.py:
--------------------------------------------------------------------------------
from datacommons_client.utils.data_processing import group_variables_by_entity


def test_group_variables_by_entity_basic():
  """Test grouping with simple variable-entity mapping."""
  input_data = {
      "var1": ["ent1", "ent2"],
      "var2": ["ent2", "ent3"],
      "var3": ["ent1"],
  }
  expected_output = {
      "ent1": ["var1", "var3"],
      "ent2": ["var1", "var2"],
      "ent3": ["var2"],
  }

  result = group_variables_by_entity(input_data)
  assert result == expected_output


def test_group_variables_by_entity_duplicate_entities():
  """Test grouping when a variable has duplicate entities."""
  input_data = {
      "var1": ["ent1", "ent1", "ent2"],
  }
  result = group_variables_by_entity(input_data)
  assert result["ent1"].count("var1") == 2  # duplicates are preserved
  assert "ent2" in result
  assert result["ent2"] == ["var1"]


def test_group_variables_by_entity_preserves_order():
  """Test if the order of variables is preserved in the resulting entity lists."""
  input_data = {
      "var1": ["ent1"],
      "var2": ["ent1"],
      "var3": ["ent1"],
  }
  result = group_variables_by_entity(input_data)
  assert result["ent1"] == ["var1", "var2", "var3"]

--------------------------------------------------------------------------------
/notebooks/README.md:
--------------------------------------------------------------------------------
# Python API Notebooks

This directory contains Colab notebooks that use the V1 Python API. For current notebooks, see the `v2` directory.

Notebook | Description
-------- | -----------
[`Place Similarity with Data Commons.ipynb`](https://colab.research.google.com/drive/1t7dFDSpCT16QDkNuD933QgLUL9BOdCAS) | A notebook that identifies similar places given a place and one or more statistical variables from Data Commons.
[`Missing Data Imputation Tutorial.ipynb`](https://colab.research.google.com/drive/1S_rMCyRsgygd8sV-r8aLRPcKwZPFcEGb) | A notebook that analyzes the different types of time series holes and different methods of imputing those holes.
[`analyzing_genomic_data.ipynb`](https://colab.research.google.com/drive/1Io7EDr4LjfPLl_l2JYY8__WbfitfNlOf) | A notebook that analyzes genetic variants within RUNX1 (provided by multiple datasets from UCSC Genome Browser, NCBI/gene, and ClinVar).
[`Drug_Discovery_With_Data_Commons.ipynb`](https://colab.research.google.com/drive/1dSKYiRMn3mbDsInorQzYM0yk7sqv6fIV) | A notebook performing drug discovery by identifying novel applications of previously approved drugs using Biomedical Data Commons.
[`protein-charts.ipynb`](https://colab.research.google.com/drive/1Kh-ufqobdChZ2qQgEY0rdPA2_DBmOiSG) | A notebook summarizing various protein properties and interactions using graphical visualizations.

--------------------------------------------------------------------------------
/datacommons_pandas/README.md:
--------------------------------------------------------------------------------
# Data Commons Pandas API

This is a Python library for creating pandas objects with data in the
Data Commons Graph.

To get started, install this package from pip.

```bash
pip install datacommons_pandas
```

Once the package is installed, import `datacommons_pandas`.

```python
import datacommons_pandas as dcpd
```

For more detail on getting started with the API, please visit our
[API Overview](https://docs.datacommons.org/api/pandas/).

When you are ready to use the API, you can refer to `examples` for
examples on how to use this package to perform various tasks. More tutorials and
documentation can be found on our [tutorials page](https://docs.datacommons.org/tutorials/)!

## About Data Commons

[Data Commons](https://datacommons.org/) is an open knowledge repository that
provides a unified view across multiple public data sets and statistics. You can
view what [datasets](https://datacommons.org/datasets) are currently ingested
and browse the graph using our [browser](https://datacommons.org/browser).

## License

Apache 2.0

## Support

For general questions or issues about the API, please open an issue on our
[issues](https://github.com/datacommonsorg/api-python/issues) page. For all other
questions, please send an email to `support@datacommons.org`.

--------------------------------------------------------------------------------
/datacommons/README.md:
--------------------------------------------------------------------------------
# Data Commons Python API

This is a Python library for accessing data in the Data Commons Graph.

> See also: [Data Commons Pandas API](../datacommons_pandas/README.md).

To get started, install this package from pip.

```bash
pip install datacommons
```

Once the package is installed, import `datacommons`.

```python
import datacommons as dc
```

For more detail on getting started with the API, please visit our
[API Overview](https://docs.datacommons.org/api/).

When you are ready to use the API, you can refer to `examples` for
examples on how to use this package to perform various tasks. More tutorials and
documentation can be found on our [tutorials page](https://docs.datacommons.org/tutorials/)!
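
For example, fetching the names of a couple of places takes one call (a
minimal sketch; the place DCIDs are the same illustrative counties used in
`examples/`, and the printed output is indicative only):

```python
import datacommons as dc

# Map each place DCID to the values of its `name` property.
names = dc.get_property_values(['geoId/06085', 'geoId/24031'], 'name')
print(names)
# e.g. {'geoId/06085': ['Santa Clara County'], 'geoId/24031': ['Montgomery County']}
```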

## About Data Commons

[Data Commons](https://datacommons.org/) is an open knowledge repository that
provides a unified view across multiple public data sets and statistics. You can
view what [datasets](https://datacommons.org/datasets) are currently ingested
and browse the graph using our [browser](https://datacommons.org/browser).

## License

Apache 2.0

## Support

For general questions or issues about the API, please open an issue on our
[issues](https://github.com/datacommonsorg/api-python/issues) page. For all other
questions, please send an email to `support@datacommons.org`.

--------------------------------------------------------------------------------
/datacommons/examples/query.py:
--------------------------------------------------------------------------------
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Data Commons Python API examples.

Example on how to use the Client API SPARQL query wrapper.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datacommons as dc


def main():
  # Create a SPARQL query querying for the name of some states
  query = ('''
SELECT ?name ?dcid
WHERE {
  ?a typeOf Place .
  ?a name ?name .
  ?a dcid ("geoId/06" "geoId/21" "geoId/24") .
  ?a dcid ?dcid
}
''')
  print('> Issuing query.\n{}'.format(query))

  # Iterate through all the rows in the results.
  print('> Printing results.\n')
  for row in dc.query(query_string=query):
    print('  {}'.format(row))


if __name__ == '__main__':
  main()

--------------------------------------------------------------------------------
/datacommons_client/models/resolve.py:
--------------------------------------------------------------------------------
from typing import List, Optional

from pydantic import Field

from datacommons_client.models.base import BaseDCModel
from datacommons_client.models.base import DictLikeRootModel
from datacommons_client.models.base import DominantType
from datacommons_client.models.base import NodeDCID
from datacommons_client.models.base import Query


class Candidate(BaseDCModel):
  """Represents a candidate in the resolution response.

  Attributes:
      dcid (NodeDCID): The Data Commons ID for the candidate.
      dominantType (Optional[DominantType]): The dominant type of the candidate,
          if available. This represents the primary type associated with the DCID.
  """

  dcid: NodeDCID = Field(default_factory=str)
  dominantType: Optional[DominantType] = None


class Entity(BaseDCModel):
  """Represents an entity with its resolution candidates.

  Attributes:
      node (Query): The query string or node being resolved.
      candidates (List[Candidate]): A list of candidates that match the query.
  """

  node: Query
  candidates: list[Candidate] = Field(default_factory=list)


class FlatCandidateMapping(BaseDCModel,
                           DictLikeRootModel[dict[Query,
                                                  list[NodeDCID] | NodeDCID]]):
  """A model to represent a mapping of queries to candidates."""

--------------------------------------------------------------------------------
/datacommons/requests.py:
--------------------------------------------------------------------------------
# Copyright 2022 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Send http requests to Data Commons REST API endpoints.
"""

from typing import Dict

import requests

import datacommons.key as key

# REST API endpoint root
_API_ROOT = "https://api.datacommons.org"


def _post(path: str, data={}) -> Dict:
  url = _API_ROOT + path
  headers = {'Content-Type': 'application/json'}
  api_key = key.get_api_key()
  if api_key:
    headers['x-api-key'] = api_key
  try:
    resp = requests.post(url, json=data, headers=headers)
    if resp.status_code != 200:
      raise Exception(
          f'{resp.status_code}: {resp.reason}\n{resp.json()["message"]}')
    return resp.json()
  except requests.exceptions.Timeout:
    raise Exception('Data request timed out, please try again.')
  except requests.exceptions.RequestException as e:
    raise e

--------------------------------------------------------------------------------
/datacommons_client/tests/test_decorators.py:
--------------------------------------------------------------------------------
from unittest import mock

import pytest

from datacommons_client.utils.decorators import requires_pandas

try:
  import pandas as pd

  PANDAS_AVAILABLE = True
except ImportError:
  PANDAS_AVAILABLE = False


@requires_pandas
def function_requiring_pandas():
  return "Pandas is available"


def test_requires_pandas_with_pandas():
  """Test that the function executes normally when Pandas is available."""
  if PANDAS_AVAILABLE:
    assert function_requiring_pandas() == "Pandas is available"


def test_requires_pandas_without_pandas(monkeypatch):
  """Test that the decorator raises ImportError when Pandas is not available."""
  # Simulate Pandas being unavailable
  monkeypatch.setattr("datacommons_client.utils.decorators.pd", None)
  with pytest.raises(ImportError, match="Pandas is required for this method"):
    function_requiring_pandas()


def test_importerror_handling(monkeypatch):
  """Test that the ImportError block is executed when Pandas is not installed."""

  # Simulate pandas not being available
  with mock.patch.dict("sys.modules", {"pandas": None}):
    import importlib

    # Reload the module so that a new check of Pandas is performed
    import datacommons_client.utils.decorators
    importlib.reload(datacommons_client.utils.decorators)

    # Ensure pd is set to None
    assert datacommons_client.utils.decorators.pd is None

--------------------------------------------------------------------------------
/datacommons/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# isort: skip_file

################################## IMPORTANT #################################
# All user-facing functions in this package must be symlinked to the        #
# datacommons_pandas pkg. This is so that users do not need to import both  #
# libraries for pd support. Please keep the below imports in sync with the  #
# __init__.py in the datacommons_pandas/ dir, and add a symlink when        #
# creating a new file.                                                      #
# TODO: https://github.com/datacommonsorg/api-python/issues/149             #
##############################################################################

# Data Commons SPARQL query support
from datacommons.sparql import query

# Data Commons Python API
from datacommons.core import get_property_labels, get_property_values, get_triples
from datacommons.places import get_places_in, get_related_places, get_stats
from datacommons.stat_vars import get_stat_value, get_stat_series, get_stat_all

from datacommons.key import set_api_key
from datacommons.node import properties, property_values, triples

--------------------------------------------------------------------------------
/datacommons_pandas/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2020 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# isort: skip_file

from datacommons_pandas.df_builder import build_time_series, build_time_series_dataframe, build_multivariate_dataframe

################################ SYMLINK FILES ################################
# We include symlinks to all user-facing functions from the datacommons pkg. #
# This is so that users do not need to import both libraries for pd support. #
# Please keep the below in sync with the __init__.py in the datacommons/ dir #
# TODO: enforce this. https://github.com/datacommonsorg/api-python/issues/149 #
################################################################################
# Data Commons SPARQL query support
from datacommons_pandas.sparql import query

# Data Commons Python API
from datacommons_pandas.core import get_property_labels, get_property_values, get_triples
from datacommons_pandas.places import get_places_in, get_related_places, get_stats
from datacommons_pandas.stat_vars import get_stat_value, get_stat_series, get_stat_all

from datacommons_pandas.key import set_api_key
from datacommons_pandas.node import properties, property_values, triples

--------------------------------------------------------------------------------
/datacommons_client/tests/models/test_resolve_models.py:
--------------------------------------------------------------------------------
from datacommons_client.models.resolve import Candidate
from datacommons_client.models.resolve import Entity


def test_candidate_model_validation():
  """Test that Candidate.model_validate parses full data correctly."""
  json_data = {"dcid": "dcid123", "dominantType": "Place"}
  candidate = Candidate.model_validate(json_data)
  assert candidate.dcid == "dcid123"
  assert candidate.dominantType == "Place"


def test_candidate_model_validation_partial():
  """Test Candidate.model_validate with missing optional dominantType."""
  json_data = {"dcid": "dcid456"}
  candidate = Candidate.model_validate(json_data)
  assert candidate.dcid == "dcid456"
  assert candidate.dominantType is None


def test_entity_model_validation():
  """Test that Entity.model_validate handles multiple candidates."""
  json_data = {
      "node":
          "test_query",
      "candidates": [
          {
              "dcid": "dcid123",
              "dominantType": "Place"
          },
          {
              "dcid": "dcid456",
              "dominantType": "Event"
          },
      ],
  }
  entity = Entity.model_validate(json_data)
  assert entity.node == "test_query"
  assert len(entity.candidates) == 2
  assert entity.candidates[0].dcid == "dcid123"
  assert entity.candidates[0].dominantType == "Place"
  assert entity.candidates[1].dcid == "dcid456"
  assert entity.candidates[1].dominantType == "Event"


def test_entity_model_validation_empty_candidates():
  """Test Entity.model_validate with no candidates."""
  json_data = {"node": "test_query", "candidates": []}
  entity = Entity.model_validate(json_data)
  assert entity.node == "test_query"
  assert len(entity.candidates) == 0

--------------------------------------------------------------------------------
/datacommons/setup.py:
--------------------------------------------------------------------------------
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 | """Build and distribute the datacommons package to PyPI.""" 15 | import os 16 | 17 | from setuptools import setup 18 | 19 | dir_path = os.path.dirname(os.path.realpath(__file__)) 20 | with open(os.path.join(dir_path, 'README.md'), 'r') as fh: 21 | long_description = fh.read() 22 | 23 | # Package metadata. 24 | NAME = 'datacommons' 25 | DESCRIPTION = 'A library to access Data Commons Python API.' 26 | URL = 'https://github.com/datacommonsorg/api-python' 27 | EMAIL = 'support@datacommons.org' 28 | AUTHOR = 'datacommons.org' 29 | REQUIRES_PYTHON = '>=3.7' 30 | VERSION = '1.4.3' 31 | REQUIRED = ['six', 'requests'] 32 | PACKAGES = ['datacommons'] 33 | 34 | setup( 35 | name=NAME, 36 | version=VERSION, 37 | description=DESCRIPTION, 38 | long_description=long_description, 39 | long_description_content_type='text/markdown', 40 | author=AUTHOR, 41 | author_email=EMAIL, 42 | maintainer=AUTHOR, 43 | maintainer_email=EMAIL, 44 | python_requires=REQUIRES_PYTHON, 45 | url=URL, 46 | packages=PACKAGES, 47 | install_requires=REQUIRED, 48 | include_package_data=True, 49 | license='Apache 2.0', 50 | classifiers=[ 51 | 'Intended Audience :: Developers', 52 | 'License :: OSI Approved :: Apache Software License', 53 | 'Programming Language :: Python', 54 | 'Programming Language :: Python :: 3.7', 55 | 'Programming Language :: Python :: Implementation :: CPython', 56 | 'Topic :: Software Development', 57 | ], 58 | ) 59 | -------------------------------------------------------------------------------- /datacommons_pandas/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Build and distribute the datacommons_pandas package to PyPI.""" 15 | import os 16 | 17 | from setuptools import setup 18 | 19 | dir_path = os.path.dirname(os.path.realpath(__file__)) 20 | with open(os.path.join(dir_path, 'README.md'), 'r') as fh: 21 | long_description = fh.read() 22 | 23 | # Package metadata. 24 | NAME = 'datacommons_pandas' 25 | DESCRIPTION = 'A library to create pandas objects using the Data Commons Python API.' 
URL = 'https://github.com/datacommonsorg/api-python'
EMAIL = 'support@datacommons.org'
AUTHOR = 'datacommons.org'
REQUIRES_PYTHON = '>=3.7'
VERSION = '0.0.3'
REQUIRED = ['pandas', 'six', 'requests']
PACKAGES = ['datacommons_pandas']

setup(
    name=NAME,
    version=VERSION,
    description=DESCRIPTION,
    long_description=long_description,
    long_description_content_type='text/markdown',
    author=AUTHOR,
    author_email=EMAIL,
    maintainer=AUTHOR,
    maintainer_email=EMAIL,
    python_requires=REQUIRES_PYTHON,
    url=URL,
    packages=PACKAGES,
    install_requires=REQUIRED,
    include_package_data=True,
    license='Apache 2.0',
    classifiers=[
        'Intended Audience :: Developers',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: Implementation :: CPython',
        'Topic :: Software Development',
    ],
)

--------------------------------------------------------------------------------
/datacommons_pandas/CHANGELOG.md:
--------------------------------------------------------------------------------
# Changelog

## 0.0.3

**Date** - 11/10/2020

**Release Tag** - [pd.0.0.3](https://github.com/datacommonsorg/api-python/releases/tag/pd0.0.3)

**Release Status** - Current head of branch [`master`](https://github.com/datacommonsorg/api-python/tree/master)

Update to use datacommons Python API 1.4.3, which returns empty data structures instead of erroring when no data is available.

## 0.0.2

**Date** - 09/16/2020

**Release Tag** - [pd.0.0.2](https://github.com/datacommonsorg/api-python/releases/tag/pd0.0.2)

**Release Status** - Current head of branch [`master`](https://github.com/datacommonsorg/api-python/tree/master)

Update to use datacommons Python API 1.4.2, which adds batching to the get_stat_all function used by build_time_series_dataframe and build_multivariate_dataframe.

## 0.0.1

**Date** - 08/25/2020

**Release Tag** - [pd.0.0.1](https://github.com/datacommonsorg/api-python/releases/tag/pd0.0.1)

**Release Status** - Current head of branch [`master`](https://github.com/datacommonsorg/api-python/tree/master)

Added pandas wrapper functions.

- `build_time_series` constructs a pd.Series for a given StatisticalVariable and Place, where the time series are indexed by date.
- `build_time_series_dataframe` constructs a pd.DataFrame for a given StatisticalVariable and a set of Places. The DataFrame will have Places as the index and dates as the columns.
- `build_multivariate_dataframe` constructs a pd.DataFrame for a set of StatisticalVariables and a set of Places. The DataFrame will have Places as index and StatisticalVariables as the columns. The values are the most recent values for the chosen StatVarObservation options.

For multi-place functions, when a StatisticalVariable has multiple StatVarObservation options,
Data Commons chooses a set of StatVarObservation options that covers the most places. This
ensures that the data fetched for a StatisticalVariable is comparable across places.
When there is a tie, we select the StatVarObservation options set with the latest date
data is available for any place.
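
As an illustration of these helpers (a minimal sketch, not part of the release
itself; the place DCIDs and StatisticalVariables below are examples):

```python
import datacommons_pandas as dcpd

# pd.Series of observations for one variable and one place, indexed by date.
series = dcpd.build_time_series("geoId/06085", "Count_Person")

# pd.DataFrame with places as the index and dates as the columns.
df = dcpd.build_time_series_dataframe(["geoId/06085", "geoId/24031"],
                                      "Count_Person")

# pd.DataFrame with places as the index and variables as the columns,
# holding the most recent comparable values.
latest = dcpd.build_multivariate_dataframe(
    ["geoId/06085", "geoId/24031"], ["Count_Person", "Median_Age_Person"])
```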

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
.dat

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

### Ignore MAC OS System files ###
# General
.DS_Store
.AppleDouble
.LSOverride
.profraw

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### Ignore BAZEL BUILD System files ###
/bazel-*

### R and RStudio ###
.Rproj.user
.Rhistory
.RData
.Ruserdata
datacommons.RCheck
*tar.gz

## VSCode
.vscode/

## JetBrains
.idea/

# Gemini
GEMINI.md
.gemini/

# Temp files
tmp/

--------------------------------------------------------------------------------
/datacommons_client/utils/names.py:
--------------------------------------------------------------------------------
from typing import Optional

from datacommons_client.models.node import Node

DEFAULT_NAME_PROPERTY: str = "name"
NAME_WITH_LANGUAGE_PROPERTY: str = "nameWithLanguage"
DEFAULT_NAME_LANGUAGE: str = "en"


def extract_name_from_english_name_property(properties: list | Node) -> str:
  """
  Extracts the name from a list of properties with English names.
  Args:
      properties (list): A list of properties with English names.
  Returns:
      str: The extracted name.
  """
  if not properties:
    return ''

  if isinstance(properties, Node):
    properties = [properties]

  return properties[0].value


def extract_name_from_property_with_language(
    properties: list,
    language: str,
    fallback_language: Optional[str] = None) -> tuple[str | None, str | None]:
  """
  Extracts the name from a list of properties with language tags.
  Args:
      properties (list): A list of properties with language tags.
      language (str): The desired language code.
      fallback_language: If provided, this language will be used as a fallback if the requested
          language is not available. If not provided, no fallback will be used.

  Returns:
      tuple[str, str]: A tuple containing the extracted name and its language.
  """
  # Track a fallback name in case the requested language is not found.
  fallback_name = None

  # Iterate through the properties to find the name in the specified language
  for candidate in properties:
    # If no language is specified, skip the candidate
    if "@" not in candidate.value:
      continue

    # Split the candidate value into name and language
    name, lang = candidate.value.rsplit("@", 1)

    # If the language matches, return the name and its language.
    if lang == language:
      return name, lang
    # If the language matches the fallback language, store the name as a fallback
    if fallback_language and (lang == fallback_language):
      fallback_name = name

  # If no name was found in the specified language, use the fallback name (if available)
  return fallback_name, fallback_language if fallback_language else None

--------------------------------------------------------------------------------
/datacommons/examples/core.py:
--------------------------------------------------------------------------------
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Data Commons Python API examples.

Basic demo for get_property_labels, get_property_values, and get_triples.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datacommons as dc


def main():
  # Set the dcids to be those of Santa Clara County and a StatisticalPopulation.
  dcids = ['geoId/06085', 'dc/p/zsb968m3v1f97']

  # Print all incoming and outgoing properties from Santa Clara County.
  print('Property Labels for Santa Clara County')
  in_labels = dc.get_property_labels(dcids, out=False)
  out_labels = dc.get_property_labels(dcids)
  print('> Printing properties for {}'.format(dcids))
  print('> Incoming properties: {}'.format(in_labels))
  print('> Outgoing properties: {}'.format(out_labels))

  # Print all property values for "containedInPlace" of Santa Clara County.
  print('Property Values for "containedInPlace" of Santa Clara County')
  prop_vals = dc.get_property_values(dcids,
                                     'containedInPlace',
                                     out=False,
                                     value_type='City')
  print('> Cities contained in {}'.format(dcids))
  for dcid in dcids:
    for city_dcid in prop_vals[dcid]:
      print('  - {}'.format(city_dcid))

  # Print the first 5 triples associated with Santa Clara County
  print('Triples for Santa Clara County')
  triples = dc.get_triples(dcids)
  for dcid in dcids:
    print('> Triples for {}'.format(dcid))
    for s, p, o in triples[dcid][:5]:
      print('  - ("{}", {}, "{}")'.format(s, p, o))


if __name__ == '__main__':
  main()

--------------------------------------------------------------------------------
/datacommons_client/tests/test_names.py:
--------------------------------------------------------------------------------
from datacommons_client.models.node import Node
from datacommons_client.utils.names import extract_name_from_english_name_property
from datacommons_client.utils.names import extract_name_from_property_with_language


def test_extract_name_from_english_name_property_with_list():
  """Test extracting name from a list of Nodes."""
  properties = [Node(value="Test Name")]
  result = extract_name_from_english_name_property(properties)
  assert result == "Test Name"


def test_extract_name_from_english_empty_list():
  """Test extracting name from an empty list."""
  result = extract_name_from_english_name_property([])
  assert result == ""


def test_extract_name_from_english_not_list():
  """Test extracting name from a single Node (not in a list)."""
  property_node = Node(value="Single Node Name")
  result = extract_name_from_english_name_property(property_node)
  assert result == "Single Node Name"


def test_extract_name_from_property_with_language_match():
  """Test extracting name when desired language is present."""
  properties = [
      Node(value="Nombre@es"),
      Node(value="Name@en"),
  ]
  result = extract_name_from_property_with_language(properties,
                                                    language="es",
                                                    fallback_language="en")
  assert result[0] == "Nombre"
  assert result[1] == "es"


def test_extract_name_from_property_with_language_fallback():
  """Test fallback to the fallback language when the desired language is not found."""
  properties = [
      Node(value="Name@en"),
      Node(value="Nom@fr"),
      Node(value="Nome@it"),
  ]
  result = extract_name_from_property_with_language(properties,
                                                    language="de",
                                                    fallback_language="it")
  assert result[0] == "Nome"
  assert result[1] == "it"


def test_extract_name_from_property_with_language_no_fallback():
  """Test no result when language is not found and fallback is disabled."""
  properties = [
      Node(value="Name@en"),
      Node(value="Nom@fr"),
  ]
  result = extract_name_from_property_with_language(properties, language="de")
  assert result[0] is None
  assert result[1] is None


def test_extract_name_from_property_without_language_tags():
  """Test that properties without language tags are skipped."""
  properties = [
      Node(value="Plain str"),
      Node(value="Name@en"),
  ]
  result = extract_name_from_property_with_language(properties, language="en")
  assert result[0] == "Name"
  assert result[1] == "en"
--------------------------------------------------------------------------------
/docs/release.md:
--------------------------------------------------------------------------------
# Python API Release

## Releasing the `datacommons_client` package
Support for V2 of the Data Commons API is being released as a new client library
called `datacommons_client`.

To release:
1. Update [CHANGELOG.md](../CHANGELOG.md) with relevant changes.
2. Bump the version by running `hatch version` followed by `patch`, `minor`,
   `major`, an explicit version number, or a pre-release flag such as
   `--pre beta`.
3. Build the package:
```bash
hatch build
```
4. (Optionally) test the deployment process locally:
```bash
hatch run release:localtest
```
5. Test the deployment process on Test PyPI:
```bash
hatch run release:testpypi
```

6. Once verified, upload to PyPI:
```bash
hatch run release:pypi
```

7. Create a version tag on Git:
```bash
hatch run release:tag
```

---

## Releasing the legacy packages

Note: Always release `datacommons_pandas` when `datacommons` is released.

**If this is your first time releasing to PyPI**, please review the PyPI guide
starting from the
[setup section](https://packaging.python.org/tutorials/packaging-projects/#creating-setup-py).

## Prepare release tools

```bash
python3 -m venv .env
source .env/bin/activate
python3 -m pip install --upgrade setuptools wheel
python3 -m pip install --upgrade twine
```

## Release to Test PyPI

1. In [datacommons/setup.py](../datacommons/setup.py) and [datacommons_pandas/setup.py](../datacommons_pandas/setup.py):

   - Append "-USERNAME" to the package "NAME". For example,
     `NAME = 'foo_package-janedoe123'`.
   - Increment the "VERSION" codes to something that has not been used in your
     test project. This will not affect the production PyPI versioning.

1. In the repo root directory, build the dists and release to TestPyPI:

   ```bash
   rm dist/*
   python3 datacommons/setup.py sdist bdist_wheel
   python3 datacommons_pandas/setup.py sdist bdist_wheel
   python3 -m twine upload --repository testpypi dist/*
   ```

## Release to Production PyPI

1. In [datacommons/setup.py](../datacommons/setup.py) and
   [datacommons_pandas/setup.py](../datacommons_pandas/setup.py):

   - Revert the package name to `datacommons` and `datacommons_pandas`
   - Update and double check "VERSION"

1. Update [datacommons/CHANGELOG.md](../datacommons/CHANGELOG.md) and [datacommons_pandas/CHANGELOG.md](../datacommons_pandas/CHANGELOG.md)

1. Build the dists and release to PyPI:

   ```bash
   rm dist/*
   python3 datacommons/setup.py sdist bdist_wheel
   python3 datacommons_pandas/setup.py sdist bdist_wheel
   python3 -m twine upload dist/*
   ```

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "datacommons-client"
dynamic = ["version"]
description = "A library to access Data Commons Python API."
5 | readme = "datacommons_client/README.md" 6 | authors = [ 7 | { name = "datacommons.org", email = "support@datacommons.org" }, 8 | { name = "one.org", email= "data@one.org"} 9 | ] 10 | maintainers = [ 11 | { name = "datacommons.org", email = "support@datacommons.org" } 12 | ] 13 | license = { file = "LICENSE" } 14 | dependencies = [ 15 | "requests>=2.32", 16 | "typing_extensions", 17 | "pydantic>=2.11" 18 | ] 19 | requires-python = ">=3.10" 20 | keywords = ["data commons", "api", "data", "development"] 21 | classifiers = [ 22 | "Intended Audience :: Developers", 23 | "License :: OSI Approved :: Apache Software License", 24 | "Programming Language :: Python", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Programming Language :: Python :: 3.13", 29 | "Programming Language :: Python :: Implementation :: CPython", 30 | "Topic :: Software Development" 31 | ] 32 | urls = { "Homepage" = "https://github.com/datacommonsorg/api-python" } 33 | 34 | [project.optional-dependencies] 35 | pandas = ["pandas"] 36 | dev = [ 37 | "pytest", 38 | "isort", 39 | "yapf", 40 | "mock", 41 | "hatch" 42 | ] 43 | 44 | [tool.hatch.version] 45 | path = "datacommons_client/__init__.py" 46 | 47 | 48 | [tool.hatch.build.targets.sdist] 49 | include = [ 50 | "datacommons_client", 51 | "README.md", 52 | "LICENSE", 53 | "CHANGELOG.md" 54 | ] 55 | 56 | [tool.hatch.build.targets.wheel] 57 | include = [ 58 | "datacommons_client" 59 | ] 60 | 61 | [tool.hatch.envs.default] 62 | dependencies = [ 63 | "pytest", 64 | "isort", 65 | "yapf", 66 | "hatch", 67 | ] 68 | 69 | [tool.hatch.envs.test] 70 | dependencies = [ 71 | "pytest", 72 | "mock", 73 | "pandas", 74 | "isort", 75 | "yapf" 76 | ] 77 | 78 | 79 | [tool.hatch.envs.test.scripts] 80 | setup = "./run_test.sh -s" 81 | all = "./run_test.sh -a" 82 | python = "./run_test.sh -p" 83 | lint = "./run_test.sh -l" 84 | 85 | [tool.hatch.envs.lint] 86 | dependencies = [ 87 | "isort", 88 | "yapf" 89 | ] 90 | 91 | [tool.hatch.envs.lint.scripts] 92 | check = "./run_test.sh -l" 93 | format = "./run_test.sh -f" 94 | 95 | [tool.hatch.envs.release] 96 | dependencies = [ 97 | "twine" 98 | ] 99 | 100 | [tool.hatch.envs.release.scripts] 101 | localtest = "hatch build && twine check dist/*" 102 | testpypi = "hatch build && twine upload --repository testpypi dist/*" 103 | pypi = "hatch build && twine upload dist/*" 104 | tag = "git commit -am 'Bump version to {version}' && git tag v{version}" 105 | 106 | 107 | [build-system] 108 | requires = ["hatchling"] 109 | build-backend = "hatchling.build" 110 | -------------------------------------------------------------------------------- /datacommons_client/tests/endpoints/test_error_handling.py: -------------------------------------------------------------------------------- 1 | from requests import Request 2 | from requests import Response 3 | 4 | from datacommons_client.utils.error_handling import APIError 5 | from datacommons_client.utils.error_handling import DataCommonsError 6 | from datacommons_client.utils.error_handling import DCAuthenticationError 7 | from datacommons_client.utils.error_handling import DCConnectionError 8 | from datacommons_client.utils.error_handling import DCStatusError 9 | from datacommons_client.utils.error_handling import InvalidDCInstanceError 10 | from datacommons_client.utils.error_handling import NoDataForPropertyError 11 | 12 | 13 | def test_data_commons_error_default_message(): 14 | """Tests that DataCommonsError uses the 
default message.""" 15 | error = DataCommonsError() 16 | assert str(error) == DataCommonsError.default_message 17 | 18 | 19 | def test_data_commons_error_custom_message(): 20 | """Tests that DataCommonsError uses a custom message when provided.""" 21 | error = DataCommonsError("Custom message") 22 | assert str(error) == "Custom message" 23 | 24 | 25 | def test_api_error_without_response(): 26 | """Tests APIError initialization without a Response object.""" 27 | error = APIError() 28 | assert str(error) == f"\n{APIError.default_message}" 29 | 30 | 31 | def test_api_error_with_response(): 32 | """Tests APIError initialization with a mocked Response object. 33 | 34 | Verifies that the string representation includes status code, 35 | request URL, and response text. 36 | """ 37 | mock_request = Request("GET", "http://example.com").prepare() 38 | mock_response = Response() 39 | mock_response.request = mock_request 40 | mock_response.status_code = 404 41 | mock_response._content = b"Not Found" 42 | 43 | error = APIError(response=mock_response) 44 | assert "Status Code: 404" in str(error) 45 | assert "Request URL: http://example.com" in str(error) 46 | assert "Not Found" in str(error) 47 | 48 | 49 | def test_subclass_default_messages(): 50 | """Tests that subclasses use their default messages.""" 51 | connection_error = DCConnectionError() 52 | assert DCConnectionError.default_message in str(connection_error) 53 | 54 | status_error = DCStatusError() 55 | assert DCStatusError.default_message in str(status_error) 56 | 57 | auth_error = DCAuthenticationError() 58 | assert DCAuthenticationError.default_message in str(auth_error) 59 | 60 | instance_error = InvalidDCInstanceError() 61 | assert InvalidDCInstanceError.default_message in str(instance_error) 62 | 63 | filter_error = NoDataForPropertyError() 64 | assert NoDataForPropertyError.default_message in str(filter_error) 65 | 66 | 67 | def test_subclass_custom_message(): 68 | """Tests that subclasses use custom messages when provided.""" 69 | error = DCAuthenticationError(response=Response(), 70 | message="Custom auth error") 71 | assert str(error) == "\nCustom auth error" 72 | -------------------------------------------------------------------------------- /datacommons/examples/places.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ Data Commons Python API examples. 15 | 16 | Basic demo for get_places_in 17 | """ 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import datacommons as dc 24 | 25 | 26 | def main(): 27 | # Create a list of dcids for Santa Clara and Montgomery County. 28 | sc, mc = 'geoId/06085', 'geoId/24031' 29 | dcids = [sc, mc] 30 | 31 | # Get all CensusTracts in these two counties. 
32 | print('Get Census Tracts') 33 | tracts = dc.get_places_in(dcids, 'CensusTract') 34 | if sc in tracts: 35 | print('> 10 CensusTracts in Santa Clara County') 36 | for dcid in tracts[sc][:10]: 37 | print(' - {}'.format(dcid)) 38 | if mc in tracts: 39 | print('> 10 CensusTracts in Montgomery County') 40 | for dcid in tracts[mc][:10]: 41 | print(' - {}'.format(dcid)) 42 | 43 | # Get place stats. 44 | print('Get place stats -- all') 45 | stats = dc.get_stats(['geoId/05', 'geoId/06', 'dc/madDcid'], 46 | 'dc/0hyp6tkn18vcb', 47 | obs_dates='all') 48 | print(stats) 49 | 50 | print('Get place stats -- latest') 51 | stats = dc.get_stats(['geoId/05', 'geoId/06', 'dc/madDcid'], 52 | 'dc/0hyp6tkn18vcb') 53 | print(stats) 54 | 55 | print('Get place stats -- 2014') 56 | stats = dc.get_stats(['geoId/05', 'geoId/06', 'dc/madDcid'], 57 | 'dc/0hyp6tkn18vcb', 58 | obs_dates=['2014']) 59 | print(stats) 60 | 61 | print('Get place stats -- 2014 badly formatted') 62 | stats = dc.get_stats(['geoId/05', 'geoId/06', 'dc/madDcid'], 63 | 'dc/0hyp6tkn18vcb', 64 | obs_dates='2014') 65 | print(stats) 66 | 67 | print('Get place stats -- 2015-2016') 68 | stats = dc.get_stats(['geoId/05', 'geoId/06', 'dc/madDcid'], 69 | 'dc/0hyp6tkn18vcb', 70 | obs_dates=['2015', '2016']) 71 | print(stats) 72 | 73 | # Get related places. 74 | 75 | 76 | # TODO(*): Fix the related places example. 77 | # print('Get related places') 78 | # related_places = dc.get_related_places(['geoId/06085'], 'Person', 'count', 79 | # 'CensusACS5yrSurvey', "measuredValue", {"gender": "Female"}) 80 | # print(related_places) 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /datacommons_client/utils/error_handling.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from requests import Response 4 | 5 | 6 | class DataCommonsError(Exception): 7 | """Base exception for all Data Commons-related errors.""" 8 | 9 | default_message = "An error occurred getting data from Data Commons API." 10 | 11 | def __init__(self, message: Optional[str] = None): 12 | """Initializes a DataCommonsError with a default or custom message.""" 13 | super().__init__(message or self.default_message) 14 | 15 | 16 | class APIError(DataCommonsError): 17 | """Represents an error interacting with Data Commons API.""" 18 | 19 | default_message = "An API error occurred." 20 | 21 | def __init__( 22 | self, 23 | response: Optional[Response] = None, 24 | message: Optional[str] = None, 25 | ): 26 | """Initializes an APIError. 27 | 28 | Args: 29 | response (Optional[Response]): The response, if available. 30 | message (Optional[str]): A descriptive error message. 31 | """ 32 | super().__init__(message or self.default_message) 33 | self.response = response 34 | self.request = getattr(response, "request", None) 35 | self.status_code = getattr(response, "status_code", None) 36 | 37 | def __str__(self) -> str: 38 | """Returns a detailed string representation of the error. 39 | 40 | Returns: 41 | str: A string describing the error, including the request URL if available. 
42 | """ 43 | 44 | details = f"\n{self.args[0]}" 45 | if self.status_code: 46 | details += f"\nStatus Code: {self.status_code}" 47 | if getattr(self.request, "url", None): 48 | details += f"\nRequest URL: {self.request.url}" 49 | if getattr(self.response, "text", None): 50 | details += f"\nResponse: {self.response.text}" 51 | 52 | return details 53 | 54 | 55 | class DCConnectionError(APIError): 56 | """Raised for network-related errors in the Data Commons API.""" 57 | 58 | default_message = ( 59 | "A network error occurred while connecting to the Data Commons API.") 60 | 61 | 62 | class DCStatusError(APIError): 63 | """Raised for non-2xx HTTP status code errors in the Data Commons API.""" 64 | 65 | default_message = "The Data Commons API returned a non-2xx status code." 66 | 67 | 68 | class DCAuthenticationError(APIError): 69 | """Raised for 401 Unauthorized errors in the Data Commons API.""" 70 | 71 | default_message = "Authentication failed. Please check your API key." 72 | 73 | 74 | class InvalidDCInstanceError(DataCommonsError): 75 | """Raised when an invalid Data Commons instance is provided.""" 76 | 77 | default_message = "The specified Data Commons instance is invalid." 78 | 79 | 80 | class InvalidObservationSelectError(DataCommonsError): 81 | """Raised when an invalid ObservationSelect field is provided.""" 82 | 83 | default_message = "The ObservationSelect field is invalid." 84 | 85 | 86 | class NoDataForPropertyError(DataCommonsError): 87 | """Raised when there is no data that meets the specified property filters.""" 88 | 89 | default_message = "No available data for the specified property filters." 90 | -------------------------------------------------------------------------------- /run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2020 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e # Immediately exit with failure if any command fails. 17 | 18 | YAPF_STYLE='{based_on_style: google, indent_width: 2}' 19 | FORMAT_INCLUDE_PATHS="datacommons/ datacommons_client/ datacommons_pandas/" 20 | FORMAT_EXCLUDE_PATH="**/.env/**" 21 | 22 | function setup_python { 23 | python3 -m pip install --upgrade pip hatch 24 | # here temporarily while there is an incompatibility with hatch and the newest click version 25 | # see https://github.com/pypa/hatch/pull/2051 for status updates from Hatch 26 | python3 -m pip uninstall uninstall click -y 27 | python3 -m pip install click==8.2.1 28 | hatch env create 29 | } 30 | 31 | function run_py_test { 32 | pytest -vv 33 | } 34 | 35 | function run_yapf { 36 | EXTRA_ARGS=$@ 37 | yapf $EXTRA_ARGS --recursive --parallel --style="$YAPF_STYLE" \ 38 | --exclude="$FORMAT_EXCLUDE_PATH" $FORMAT_INCLUDE_PATHS 39 | } 40 | 41 | function run_isort { 42 | EXTRA_ARGS=$@ 43 | isort $EXTRA_ARGS --profile=google --skip-glob="$FORMAT_EXCLUDE_PATH" \ 44 | $FORMAT_INCLUDE_PATHS 45 | } 46 | 47 | function run_lint_test { 48 | if ! 
49 |     echo "Fix lint errors by running: ./run_test.sh -f"
50 |     exit 1
51 |   fi
52 |   if ! run_isort --check-only; then
53 |     echo "Fix Python import order by running: ./run_test.sh -f"
54 |     exit 1
55 |   fi
56 |   echo "Python style checks passed."
57 | }
58 | 
59 | function run_lint_fix {
60 |   run_yapf --in-place
61 |   run_isort
62 | }
63 | 
64 | function run_all_tests {
65 |   run_py_test
66 |   run_lint_test
67 | }
68 | 
69 | function help {
70 |   echo "Usage: $0 -asplf"
71 |   echo "-a Run all tests"
72 |   echo "-s Set up python environment"
73 |   echo "-p Run python tests"
74 |   echo "-l Run lint tests"
75 |   echo "-f Fix lint"
76 |   exit 1
77 | }
78 | 
79 | while getopts asplf OPTION; do
80 |   case $OPTION in
81 |     a)
82 |       echo -e "### Running all tests"
83 |       run_all_tests
84 |       ;;
85 |     s)
86 |       echo -e "### Setting up python environment"
87 |       setup_python
88 |       ;;
89 |     p)
90 |       echo -e "### Running python tests"
91 |       run_py_test
92 |       ;;
93 |     l)
94 |       echo -e "### Running lint tests"
95 |       run_lint_test
96 |       ;;
97 |     f)
98 |       echo -e "### Fix lint errors"
99 |       run_lint_fix
100 |       ;;
101 |     *)
102 |       help
103 |   esac
104 | done
105 | 
106 | if [ $OPTIND -eq 1 ]
107 | then
108 |   help
109 | fi
110 | 
--------------------------------------------------------------------------------
/datacommons/node.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #      http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """ API to request node information.
15 | """
16 | 
17 | from typing import Dict, List
18 | 
19 | from datacommons.requests import _post
20 | from datacommons.utils import _get_arrow
21 | from datacommons.utils import _get_direction
22 | 
23 | 
24 | def properties(nodes: List[str], is_out: bool = True) -> Dict[str, List[str]]:
25 |   """Retrieves all the properties for a list of nodes.
26 | 
27 |   Note this only returns the property labels, not the values.
28 |   Args:
29 |     nodes: List of DCIDs.
30 |     is_out: Whether to return outgoing properties.
31 |   Returns:
32 |     A dict keyed by node DCID, with the values being a list of properties
33 |     for the queried node.
34 |   """
35 |   resp = _post('/v2/node', {'nodes': nodes, 'property': _get_arrow(is_out)})
36 |   result = {}
37 |   for node, item in resp.get('data', {}).items():
38 |     properties = item.get('properties', [])
39 |     result[node] = properties
40 |   return result
41 | 
42 | 
43 | def property_values(nodes: List[str],
44 |                     property: str,
45 |                     is_out: bool = True) -> Dict[str, List[str]]:
46 |   """Retrieves the property values for a list of nodes.
47 |   Args:
48 |     nodes: List of DCIDs.
49 |     property: The property label to query for.
50 |     is_out: Whether the property is outgoing.
51 |   Returns:
52 |     A dict keyed by node DCID, with the values being a list of values
53 |     for the queried property.
54 |   """
55 |   resp = _post(f'/v1/bulk/property/values/{_get_direction(is_out)}', {
56 |       'nodes': nodes,
57 |       'property': property,
58 |   })
59 |   result = {}
60 |   for item in resp.get('data', []):
61 |     node, values = item['node'], item.get('values', [])
62 |     result[node] = []
63 |     for v in values:
64 |       if 'dcid' in v:
65 |         result[node].append(v['dcid'])
66 |       else:
67 |         result[node].append(v['value'])
68 |   return result
69 | 
70 | 
71 | def triples(nodes: List[str],
72 |             is_out: bool = True) -> Dict[str, Dict[str, List[object]]]:
73 |   """Retrieves the triples for a list of nodes.
74 |   Args:
75 |     nodes: List of DCIDs.
76 |     is_out: Whether the returned property is outgoing for the queried
77 |       nodes.
78 |   Returns:
79 |     A two level dict keyed by node DCID, then by the arc property, with
80 |     a list of values or DCIDs.
81 |   """
82 |   resp = _post(f'/v1/bulk/triples/{_get_direction(is_out)}',
83 |                data={'nodes': nodes})
84 |   result = {}
85 |   for item in resp.get('data', []):
86 |     node, triples = item['node'], item.get('triples', {})
87 |     result[node] = {}
88 |     for property, other_nodes in triples.items():
89 |       result[node][property] = other_nodes.get('nodes', [])
90 |   return result
91 | 
--------------------------------------------------------------------------------
/datacommons_client/tests/test_dataframes.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock
2 | 
3 | import pandas as pd
4 | 
5 | from datacommons_client.endpoints.node import NodeEndpoint
6 | from datacommons_client.models.node import StatVarConstraint
7 | from datacommons_client.models.node import StatVarConstraints
8 | from datacommons_client.utils.dataframes import add_property_constraints_to_observations_dataframe
9 | 
10 | 
11 | def test_add_property_constraints_to_observations_dataframe_adds_columns():
12 |   """Adds constraint id and name columns based on statvar metadata."""
13 |   # Input observations
14 |   df = pd.DataFrame([
15 |       {
16 |           "date": "2020",
17 |           "entity": "geo/1",
18 |           "variable": "sv/A",
19 |           "value": 10,
20 |           "unit": "Count",
21 |       },
22 |       {
23 |           "date": "2020",
24 |           "entity": "geo/2",
25 |           "variable": "sv/B",
26 |           "value": 20,
27 |           "unit": "Count",
28 |       },
29 |   ])
30 | 
31 |   endpoint = MagicMock(spec=NodeEndpoint)
32 | 
33 |   endpoint.fetch_statvar_constraints.return_value = StatVarConstraints.model_validate(
34 |       {
35 |           "sv/A": [
36 |               StatVarConstraint(
37 |                   constraintId="DevelopmentFinanceScheme",
38 |                   constraintName="Development Finance Scheme",
39 |                   valueId="ODAGrants",
40 |                   valueName="Official Development Assistance Grants",
41 |               ),
42 |               StatVarConstraint(
43 |                   constraintId="DevelopmentFinanceRecipient",
44 |                   constraintName="Development Finance Recipient",
45 |                   valueId="country/GTM",
46 |                   valueName="Guatemala",
47 |               ),
48 |           ],
49 |           "sv/B": [
50 |               StatVarConstraint(
51 |                   constraintId="sex",
52 |                   constraintName="Sex",
53 |                   valueId="Female",
54 |                   valueName="Female",
55 |               )
56 |           ],
57 |       })
58 | 
59 |   out = add_property_constraints_to_observations_dataframe(endpoint=endpoint,
60 |                                                            observations_df=df)
61 | 
62 |   # Columns for constraints should be present and filled per variable
63 |   assert "DevelopmentFinanceScheme" in out.columns
64 |   assert "DevelopmentFinanceScheme_name" in out.columns
65 |   assert ("DevelopmentFinanceRecipient" in out.columns and
66 |           "DevelopmentFinanceRecipient_name" in out.columns)
67 |   assert "sex" in out.columns and "sex_name" in out.columns
68 | 
69 |   # Row-wise checks
70 |   row_a = out[out["variable"] == "sv/A"].iloc[0]
71 |   assert row_a["DevelopmentFinanceScheme"] == "ODAGrants"
row_a["DevelopmentFinanceScheme"] == "ODAGrants" 72 | assert row_a[ 73 | "DevelopmentFinanceScheme_name"] == "Official Development Assistance Grants" 74 | assert row_a["DevelopmentFinanceRecipient"] == "country/GTM" 75 | assert row_a["DevelopmentFinanceRecipient_name"] == "Guatemala" 76 | 77 | row_b = out[out["variable"] == "sv/B"].iloc[0] 78 | assert row_b["sex"] == "Female" 79 | assert row_b["sex_name"] == "Female" 80 | 81 | 82 | def test_add_property_constraints_to_observations_dataframe_empty(): 83 | """Empty DataFrame returns unchanged.""" 84 | endpoint = MagicMock(spec=NodeEndpoint) 85 | empty_df = pd.DataFrame([]) 86 | out = add_property_constraints_to_observations_dataframe( 87 | endpoint=endpoint, observations_df=empty_df) 88 | assert out.empty 89 | -------------------------------------------------------------------------------- /datacommons_client/utils/dataframes.py: -------------------------------------------------------------------------------- 1 | from datacommons_client.endpoints.node import NodeEndpoint 2 | from datacommons_client.utils.data_processing import flatten_names_dictionary 3 | 4 | try: 5 | import pandas as pd 6 | except ImportError: 7 | pd = None 8 | 9 | from datacommons_client.utils.decorators import requires_pandas 10 | 11 | 12 | @requires_pandas 13 | def add_entity_names_to_observations_dataframe( 14 | endpoint: NodeEndpoint, 15 | observations_df: "pd.DataFrame", # type: ignore[reportInvalidTypeForm] 16 | entity_columns: str | list[str], 17 | ) -> "pd.DataFrame": # type: ignore[reportInvalidTypeForm] 18 | """ 19 | Adds entity names to the observations DataFrame. 20 | 21 | Args: 22 | endpoint (NodeEndpoint): The NodeEndpoint instance for fetching entity names. 23 | observations_df (dict): The DataFrame containing observations. 24 | entity_columns (str | list[str]): The column(s) containing entity DCIDs. 25 | """ 26 | 27 | # Guard against empty DataFrame 28 | if observations_df.empty: 29 | return observations_df 30 | 31 | if not isinstance(entity_columns, list): 32 | entity_columns = [entity_columns] 33 | 34 | for entity_column in entity_columns: 35 | if entity_column not in observations_df.columns: 36 | raise ValueError( 37 | "The specified entity column does not exist in the DataFrame.") 38 | 39 | # Get unique entity DCIDs from the DataFrame 40 | unique_values = observations_df[entity_column].dropna().unique().tolist() 41 | 42 | # Guard against empty unique values 43 | if not unique_values: 44 | continue 45 | 46 | # Fetch entity names from the endpoint 47 | response = endpoint.fetch_entity_names(entity_dcids=unique_values) 48 | 49 | # Flatten the response to get a dictionary of names 50 | names = flatten_names_dictionary(response) 51 | 52 | # Insert the names into a column next to the entity column 53 | name_column = f"{entity_column}_name" 54 | if name_column not in observations_df.columns: 55 | observations_df.insert( 56 | loc=observations_df.columns.get_loc(entity_column) + 1, 57 | column=name_column, 58 | value=observations_df[entity_column].map(names), 59 | ) 60 | 61 | return observations_df 62 | 63 | 64 | @requires_pandas 65 | def add_property_constraints_to_observations_dataframe( 66 | endpoint: NodeEndpoint, 67 | observations_df: "pd.DataFrame", # type: ignore[reportInvalidTypeForm] 68 | ) -> "pd.DataFrame": # type: ignore[reportInvalidTypeForm] 69 | """ 70 | Adds property constraint dcids and names to the observations DataFrame. 71 | 72 | Args: 73 | endpoint (NodeEndpoint): The NodeEndpoint instance for fetching entity names. 
74 |     observations_df (pd.DataFrame): The DataFrame containing observations.
75 |   """
76 | 
77 |   # Guard against empty DataFrame
78 |   if observations_df.empty:
79 |     return observations_df
80 | 
81 |   # Get constraints
82 |   constraints_data = endpoint.fetch_statvar_constraints(
83 |       variable_dcids=observations_df.variable.unique().tolist())
84 | 
85 |   for statvar, constraints in constraints_data.items():
86 |     for constraint in constraints:
87 |       # Fill the columns with the corresponding values
88 |       observations_df.loc[observations_df.variable == statvar,
89 |                           constraint.constraintId] = constraint.valueId
90 | 
91 |       observations_df.loc[observations_df.variable == statvar,
92 |                           constraint.constraintId +
93 |                           "_name"] = constraint.valueName
94 | 
95 |   return observations_df
--------------------------------------------------------------------------------
/datacommons/test/sparql_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #      http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """ Data Commons Python API unit tests.
15 | 
16 | Unit tests for the SPARQL query wrapper.
17 | """
18 | 
19 | import unittest
20 | from unittest.mock import patch
21 | 
22 | import datacommons
23 | 
24 | _QUERY1 = ('''
25 | SELECT ?name ?dcid
26 | WHERE {
27 |   ?a typeOf Place .
28 |   ?a name ?name .
29 |   ?a dcid ("geoId/06" "geoId/21" "geoId/24") .
30 |   ?a dcid ?dcid
31 | }
32 | ''')
33 | 
34 | _QUERY2 = ('''
35 | SELECT ?name ?dcid
36 | WHERE {
37 |   ?a typeOf Place .
38 |   ?a name ?name .
39 |   ?a dcid ("geoId/DNE") .
40 |   ?a dcid ?dcid
41 | }
42 | ''')
43 | 
44 | 
45 | def _post_mock(path, data):
46 |   """ A mock function for _post. """
47 |   if path == "/query" and data['sparql'] == _QUERY1:
48 |     return {
49 |         'header': ['?name', '?dcid'],
50 |         'rows': [{
51 |             'cells': [{
52 |                 'value': 'California'
53 |             }, {
54 |                 'value': 'geoId/06'
55 |             }]
56 |         }, {
57 |             'cells': [{
58 |                 'value': 'Kentucky'
59 |             }, {
60 |                 'value': 'geoId/21'
61 |             }]
62 |         }, {
63 |             'cells': [{
64 |                 'value': 'Maryland'
65 |             }, {
66 |                 'value': 'geoId/24'
67 |             }]
68 |         }]
69 |     }
70 |   if path == "/query" and data['sparql'] == _QUERY2:
71 |     return {
72 |         'header': ['?name', '?dcid'],
73 |     }
74 | 
75 |   # Otherwise, return a mock exception object.
76 |   return Exception('mock exception')
77 | 
78 | 
79 | class TestQuery(unittest.TestCase):
80 |   """ Unit tests for the Query object. """
81 | 
82 |   @patch('datacommons.sparql._post')
83 |   def test_rows(self, _post):
84 |     """ Sending a valid query returns the correct response. """
85 |     _post.side_effect = _post_mock
86 |     # Create a selector that filters out California.
87 |     selector = lambda row: row['?name'] != 'California'
88 |     # Issue the queries.
89 |     results = datacommons.query(_QUERY1)
90 |     selected_results = datacommons.query(_QUERY1, select=selector)
91 |     # Execute the query and iterate through the results.
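    # query() returns a list of dicts, each mapping a query variable (e.g.
    # '?name') to its cell value, in the same order as the mocked 'rows'
    # payload above, so the enumerate() indices line up with that payload.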
92 | for idx, row in enumerate(results): 93 | if idx == 0: 94 | self.assertDictEqual(row, {'?name': 'California', '?dcid': 'geoId/06'}) 95 | if idx == 1: 96 | self.assertDictEqual(row, {'?name': 'Kentucky', '?dcid': 'geoId/21'}) 97 | if idx == 2: 98 | self.assertDictEqual(row, {'?name': 'Maryland', '?dcid': 'geoId/24'}) 99 | 100 | # Verify that the select function works. 101 | for idx, row in enumerate(selected_results): 102 | if idx == 0: 103 | self.assertDictEqual(row, {'?name': 'Kentucky', '?dcid': 'geoId/21'}) 104 | if idx == 1: 105 | self.assertDictEqual(row, {'?name': 'Maryland', '?dcid': 'geoId/24'}) 106 | 107 | @patch('datacommons.sparql._post') 108 | def test_no_rows(self, _post): 109 | """ Handles row-less response. """ 110 | _post.side_effect = _post_mock 111 | # Issue the query 112 | self.assertEqual(datacommons.query(_QUERY2), []) 113 | 114 | 115 | if __name__ == '__main__': 116 | unittest.main() 117 | -------------------------------------------------------------------------------- /datacommons_client/models/node.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import Field 4 | 5 | from datacommons_client.models.base import ArcLabel 6 | from datacommons_client.models.base import BaseDCModel 7 | from datacommons_client.models.base import DictLikeRootModel 8 | from datacommons_client.models.base import ListLikeRootModel 9 | from datacommons_client.models.base import NodeDCID 10 | from datacommons_client.models.base import Property 11 | from datacommons_client.models.base import PropertyList 12 | 13 | 14 | class Node(BaseDCModel): 15 | """Represents an individual node in the Data Commons knowledge graph. 16 | 17 | Attributes: 18 | dcid: The unique identifier for the node. 19 | name: The name of the node. 20 | provenanceId: The provenance ID for the node. 21 | types: The types associated with the node. 22 | value: The value of the node. 23 | """ 24 | dcid: Optional[str] = None 25 | name: Optional[str] = None 26 | provenanceId: Optional[str | list[str]] = None 27 | types: Optional[list[str]] = None 28 | value: Optional[str] = None 29 | 30 | 31 | class Name(BaseDCModel): 32 | """Represents a name associated with an Entity (node). 33 | 34 | Attributes: 35 | value: The name of the Entity 36 | language: The language of the name 37 | property: The property used to get the name 38 | """ 39 | 40 | value: str 41 | language: str 42 | property: str 43 | 44 | 45 | class NodeGroup(BaseDCModel): 46 | """Represents a group of nodes in the Data Commons knowledge graph. 47 | 48 | Attributes: 49 | nodes: A list of Node objects in the group. 50 | """ 51 | 52 | nodes: list[Node] = Field(default_factory=list) 53 | 54 | 55 | class Arcs(BaseDCModel): 56 | """Represents arcs in the Data Commons knowledge graph. 57 | 58 | Attributes: 59 | arcs: A dictionary mapping arc labels to NodeGroup objects. 60 | """ 61 | 62 | arcs: dict[ArcLabel, NodeGroup] = Field(default_factory=dict) 63 | 64 | 65 | class Properties(BaseDCModel): 66 | """Represents a group of properties in the Data Commons knowledge graph. 67 | 68 | Attributes: 69 | properties: A list of property strings. 
70 | """ 71 | 72 | properties: Optional[PropertyList] = None 73 | 74 | 75 | class FlattenedPropertiesMapping(BaseDCModel, 76 | DictLikeRootModel[dict[NodeDCID, 77 | PropertyList]]): 78 | """A model to represent a mapping of node DCIDs to their properties.""" 79 | 80 | 81 | class FlattenedArcsMapping(BaseDCModel, 82 | DictLikeRootModel[dict[NodeDCID, dict[Property, 83 | list[Node]]]]): 84 | """A model to represent a mapping of node DCIDs to their arcs.""" 85 | 86 | 87 | class NodeList(BaseDCModel, ListLikeRootModel[list[Node]]): 88 | """A root model whose value is a list of Node objects.""" 89 | 90 | 91 | class NodeDCIDList(BaseDCModel, ListLikeRootModel[list[NodeDCID]]): 92 | """A root model whose value is a list of NodeDCID strings.""" 93 | 94 | 95 | class StatVarConstraint(BaseDCModel): 96 | """Represents a constraint for a statistical variable.""" 97 | 98 | constraintId: NodeDCID 99 | constraintName: Optional[str] = None 100 | valueId: NodeDCID 101 | valueName: Optional[str] = None 102 | 103 | 104 | class StatVarConstraints(BaseDCModel, 105 | DictLikeRootModel[dict[NodeDCID, 106 | list[StatVarConstraint]]]): 107 | """A root model whose value is a dictionary of statvar ids - a list of StatVarConstraint objects. 108 | This model is used to represent constraints associated with statistical variables. 109 | """ 110 | -------------------------------------------------------------------------------- /datacommons/sparql.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ Data Commons Python API Query Module. 15 | 16 | Implements functions for sending graph queries to the Data Commons Graph. 17 | """ 18 | 19 | from datacommons.requests import _post 20 | 21 | 22 | def query(query_string, select=None): 23 | """ Returns the results of executing a SPARQL query on the Data Commons graph. 24 | 25 | Args: 26 | query_string (:obj:`str`): The SPARQL query string. 27 | select (:obj:`func` accepting a row of the query result): A function that 28 | selects rows to be returned by :code:`query`. This function accepts a row 29 | on the results of executing :code:`query_string` and returns True if and 30 | only if the row is to be returned by :code:`query`. The row passed in as 31 | an argument is represented as a :obj:`dict` that maps a query variable in 32 | :code:`query_string` to its value in the given row. 33 | 34 | Returns: 35 | A table, represented as a :obj:`list` of rows, resulting from executing the 36 | given SPARQL query. Each row is a :obj:`dict` mapping query variable to its 37 | value in the row. If `select` is not `None`, then a row is included in the 38 | returned :obj:`list` if and only if `select` returns :obj:`True` for that 39 | row. 40 | 41 | Raises: 42 | ValueError: If the payload returned by the Data Commons REST API is 43 | malformed. 
44 | 
45 |   Examples:
46 |     We would like to query for the name associated with three states identified
47 |     by their dcids
48 |     `California `_,
49 |     `Kentucky `_, and
50 |     `Maryland `_.
51 | 
52 |     >>> query_str = '''
53 |     ... SELECT ?name ?dcid
54 |     ... WHERE {
55 |     ...   ?a typeOf Place .
56 |     ...   ?a name ?name .
57 |     ...   ?a dcid ("geoId/06" "geoId/21" "geoId/24") .
58 |     ...   ?a dcid ?dcid
59 |     ... }
60 |     ... '''
61 |     >>> result = query(query_str)
62 |     >>> for r in result:
63 |     ...   print(r)
64 |     {"?name": "Maryland", "?dcid": "geoId/24"}
65 |     {"?name": "Kentucky", "?dcid": "geoId/21"}
66 |     {"?name": "California", "?dcid": "geoId/06"}
67 | 
68 |     Optionally, we can specify which rows are returned by setting :code:`select`
69 |     like so. The following returns all rows where the name is "Maryland".
70 | 
71 |     >>> selector = lambda row: row['?name'] == 'Maryland'
72 |     >>> result = query(query_str, select=selector)
73 |     >>> for r in result:
74 |     ...   print(r)
75 |     {"?name": "Maryland", "?dcid": "geoId/24"}
76 |   """
77 |   resp = _post('/query', {'sparql': query_string})
78 |   # Iterate through the query results
79 |   header = resp.get('header')
80 |   if header is None:
81 |     raise ValueError('Ill-formatted response: does not contain a header.')
82 |   result_rows = []
83 |   for row in resp.get('rows', []):
84 |     # Construct the map from query variable to cell value.
85 |     row_map = {}
86 |     for idx, cell in enumerate(row.get('cells', [])):
87 |       if idx >= len(header):
88 |         raise ValueError('Query error: unexpected cell {}'.format(cell))
89 |       if 'value' not in cell:
90 |         raise ValueError('Query error: cell missing value {}'.format(cell))
91 |       cell_var = header[idx]
92 |       row_map[cell_var] = cell['value']
93 |     # Add the row to the result rows if it is selected
94 |     if select is None or select(row_map):
95 |       result_rows.append(row_map)
96 |   return result_rows
--------------------------------------------------------------------------------
/datacommons_client/models/base.py:
--------------------------------------------------------------------------------
1 | from collections.abc import Mapping, MutableSequence
2 | from pprint import pformat
3 | from typing import Annotated, Any, Iterable, Optional, TypeAlias
4 | 
5 | from pydantic import BaseModel
6 | from pydantic import BeforeValidator
7 | from pydantic import ConfigDict
8 | from pydantic import RootModel
9 | 
10 | 
11 | def listify(v: Any) -> list[str]:
12 |   if isinstance(v, (str, bytes)):
13 |     return [v]
14 |   if not isinstance(v, Iterable):
15 |     return [v]
16 |   return list(v)
17 | 
18 | 
19 | variableDCID: TypeAlias = str
20 | entityDCID: TypeAlias = str
21 | facetID: TypeAlias = str
22 | ListOrStr = Annotated[list[str] | str, BeforeValidator(listify)]
23 | NextToken: TypeAlias = Optional[str]
24 | NodeDCID: TypeAlias = str
25 | ArcLabel: TypeAlias = str
26 | Property: TypeAlias = str
27 | PropertyList: TypeAlias = list[Property]
28 | Query: TypeAlias = str
29 | DominantType: TypeAlias = str
30 | 
31 | 
32 | class BaseDCModel(BaseModel):
33 |   """Provides serialization methods for the Pydantic models used by the client."""
34 | 
35 |   model_config = ConfigDict(validate_by_name=True,
36 |                             validate_default=True,
37 |                             validate_by_alias=True,
38 |                             use_enum_values=True,
39 |                             serialize_by_alias=True)
40 | 
41 |   def __str__(self) -> str:
42 |     """Returns a string representation of the instance."""
43 |     return self.to_json()
44 | 
45 |   def to_dict(self, exclude_none: bool = True) -> dict[str, Any]:
46 |     """Converts the instance to a dictionary.
47 | 48 | Args: 49 | exclude_none: If True, only include non-empty values in the response. 50 | 51 | Returns: 52 | Dict[str, Any]: The dictionary representation of the instance. 53 | """ 54 | 55 | return self.model_dump(mode="python", exclude_none=exclude_none) 56 | 57 | def to_json(self, exclude_none: bool = True) -> str: 58 | """Converts the instance to a JSON string. 59 | 60 | Args: 61 | exclude_none: If True, only include non-empty values in the response. 62 | 63 | Returns: 64 | str: The JSON string representation of the instance. 65 | """ 66 | return self.model_dump_json(exclude_none=exclude_none, indent=2) 67 | 68 | 69 | class DictLikeRootModel(RootModel, Mapping): 70 | """A base class for models that can be treated as dictionaries.""" 71 | 72 | def __repr__(self) -> str: 73 | return f"{self.__class__.__name__}({self.root})" 74 | 75 | def __str__(self) -> str: 76 | return pformat(self.root, compact=True, width=80) 77 | 78 | def __getitem__(self, key: str) -> Any: 79 | return self.root[key] 80 | 81 | def __iter__(self) -> Iterable[Any]: 82 | return iter(self.root) 83 | 84 | def __len__(self) -> int: 85 | return len(self.root) 86 | 87 | def __eq__(self, other: Any) -> bool: 88 | if isinstance(other, DictLikeRootModel): 89 | return self.root == other.root 90 | else: 91 | return self.root == other 92 | 93 | 94 | class ListLikeRootModel(MutableSequence, RootModel): 95 | """A base class for models that can be treated as lists.""" 96 | 97 | def __repr__(self) -> str: 98 | return f"{self.__class__.__name__}({self.root})" 99 | 100 | def __str__(self) -> str: 101 | return pformat(self.root, compact=True, width=80) 102 | 103 | def __getitem__(self, index: int) -> Any: 104 | return self.root[index] 105 | 106 | def __setitem__(self, index: int, value: Any) -> None: 107 | self.root[index] = value 108 | 109 | def __delitem__(self, index: int) -> None: 110 | del self.root[index] 111 | 112 | def __len__(self) -> int: 113 | return len(self.root) 114 | 115 | def __eq__(self, other: Any) -> bool: 116 | if isinstance(other, ListLikeRootModel): 117 | return self.root == other.root 118 | else: 119 | return self.root == other 120 | 121 | def insert(self, index: int, item: Any) -> None: 122 | """Inserts an item at a specified index in the root list.""" 123 | self.root.insert(index, item) 124 | -------------------------------------------------------------------------------- /datacommons/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ Data Commons Utilities Library. 15 | 16 | Various functions that can aid in the extension of the Data Commons API. 
17 | """ 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import base64 24 | from collections import defaultdict 25 | import json 26 | import os 27 | import zlib 28 | 29 | import six.moves.urllib.error 30 | import six.moves.urllib.request 31 | 32 | # --------------------------------- CONSTANTS --------------------------------- 33 | 34 | # REST API endpoint root 35 | _API_ROOT = "https://api.datacommons.org" 36 | 37 | # REST API endpoint paths 38 | _API_ENDPOINTS = { 39 | 'query': '/query', 40 | 'get_property_labels': '/node/property-labels', 41 | 'get_property_values': '/node/property-values', 42 | 'get_triples': '/node/triples', 43 | 'get_places_in': '/node/places-in', 44 | 'get_related_places': '/node/related-places', 45 | 'get_stats': '/bulk/stats', 46 | 'get_stat_value': '/stat/value', 47 | 'get_stat_series': '/stat/series', 48 | 'get_stat_all': '/stat/all', 49 | } 50 | 51 | # The default value to limit to 52 | _MAX_LIMIT = 100 53 | 54 | # Batch size for heavyweight queries. 55 | _QUERY_BATCH_SIZE = 500 56 | 57 | # Environment variable names used by the package 58 | _ENV_VAR_API_KEY = 'DC_API_KEY' 59 | 60 | # ------------------------- INTERNAL HELPER FUNCTIONS ------------------------- 61 | 62 | 63 | def _send_request(req_url, 64 | req_json={}, 65 | compress=False, 66 | post=True, 67 | use_payload=True): 68 | """ Sends a POST/GET request to req_url with req_json, default to POST. 69 | 70 | Returns: 71 | The payload returned by sending the POST/GET request formatted as a dict. 72 | """ 73 | headers = {'Content-Type': 'application/json'} 74 | 75 | # Pass along API key if provided 76 | if os.environ.get(_ENV_VAR_API_KEY): 77 | headers['x-api-key'] = os.environ[_ENV_VAR_API_KEY] 78 | 79 | # Send the request and verify the request succeeded 80 | if post: 81 | req = six.moves.urllib.request.Request( 82 | req_url, data=json.dumps(req_json).encode('utf-8'), headers=headers) 83 | else: 84 | req = six.moves.urllib.request.Request(req_url, headers=headers) 85 | try: 86 | res = six.moves.urllib.request.urlopen(req) 87 | except six.moves.urllib.error.HTTPError as e: 88 | raise ValueError( 89 | 'Response error: An HTTP {} code was returned by the REST API. ' 90 | 'Printing response\n\n{}'.format(e.code, e.read())) 91 | if isinstance(res, six.moves.urllib.error.HTTPError): 92 | raise ValueError( 93 | 'Response error: An HTTP {} code was returned by the REST API. ' 94 | 'Printing response\n\n{}'.format(res.code, res.reason)) 95 | # Get the JSON 96 | res_json = json.loads(res.read()) 97 | if not use_payload: 98 | return res_json 99 | if 'payload' not in res_json: 100 | raise ValueError('Response error: Payload not found. Printing response\n\n' 101 | '{}'.format(res.text)) 102 | 103 | # If the payload is compressed, decompress and decode it 104 | payload = res_json['payload'] 105 | if compress: 106 | payload = zlib.decompress(base64.b64decode(payload), zlib.MAX_WBITS | 32) 107 | return json.loads(payload) 108 | 109 | 110 | def _format_expand_payload(payload, new_key, must_exist=[]): 111 | """ Formats expand type payloads into dicts from dcids to lists of values. """ 112 | # Create the results dictionary from payload 113 | results = defaultdict(set) 114 | for entry in payload: 115 | if 'dcid' in entry and new_key in entry: 116 | dcid = entry['dcid'] 117 | results[dcid].add(entry[new_key]) 118 | 119 | # Ensure all dcids in must_exist have some entry in results. 
120 | for dcid in must_exist: 121 | results[dcid] 122 | return {k: sorted(list(v)) for k, v in results.items()} 123 | 124 | 125 | def _get_direction(out: bool): 126 | return "out" if out else "in" 127 | 128 | 129 | def _get_arrow(out: bool): 130 | """Returns the arrow syntax for an arc direction. 131 | 132 | Args: 133 | out: Whether the arc direction is out. 134 | Returns: 135 | The corresponding arrow syntax. 136 | """ 137 | return "->" if out else "<-" 138 | -------------------------------------------------------------------------------- /datacommons_pandas/examples/df_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Basic examples for building pandas objects using the Data Commons Pandas API.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import datacommons_pandas as dcpd 21 | 22 | 23 | def build_time_series_example(): 24 | 25 | print(""" 26 | # Build a pd.Series of time series for one variable and one place. 27 | $ dcpd.build_time_series('country/CAN', 'Count_WildlandFireEvent') 28 | {}""".format(dcpd.build_time_series('country/CAN', 'Count_WildlandFireEvent'))) 29 | 30 | print(""" 31 | # Build a pd.Series of time series for one variable and one place and optional args. 
32 | $ dcpd.build_time_series('country/USA', 'Count_Person', 'CensusPEPSurvey') 33 | {}""".format( 34 | dcpd.build_time_series('country/USA', 'Count_Person', 'CensusPEPSurvey'))) 35 | 36 | 37 | def build_time_series_dataframe_example(): 38 | 39 | def demonstrate_build_time_series_dataframe(intro_str, 40 | places, 41 | stat_var, 42 | desc_col=False): 43 | arg_str = "{}, '{}'".format(places, stat_var) 44 | if desc_col: 45 | arg_str += ", desc_col=True" 46 | print(""" 47 | # {} 48 | $ dcpd.build_time_series_dataframe({}) 49 | {}""".format(intro_str, arg_str, 50 | dcpd.build_time_series_dataframe(places, stat_var, desc_col))) 51 | 52 | build_time_series_dataframe_params = [{ 53 | 'intro_str': 54 | 'Build a DataFrame of time series for one variable in multiple places.', 55 | 'places': ['geoId/33', 'geoId/29', 'country/USA'], 56 | 'stat_var': 57 | 'Median_Income_Person' 58 | }, { 59 | 'intro_str': 60 | 'Build a DataFrame of time series with columns sorted in descending order.', 61 | 'places': ['country/USA'], 62 | 'stat_var': 63 | 'Median_Income_Person', 64 | 'desc_col': 65 | True 66 | }] 67 | 68 | for param_set in build_time_series_dataframe_params: 69 | demonstrate_build_time_series_dataframe(**param_set) 70 | 71 | 72 | def build_multivariate_dataframe_example(): 73 | 74 | def demonstrate_build_multivariate_dataframe(intro_str, places, stat_vars): 75 | print(""" 76 | # {} 77 | $ dcpd.build_multivariate_dataframe({}, {}) 78 | {}""".format(intro_str, places, stat_vars, 79 | dcpd.build_multivariate_dataframe(places, stat_vars))) 80 | 81 | build_multivariate_dataframe_params = [{ 82 | 'intro_str': 83 | 'Build a DataFrame of latest observations for multiple variables in multiple places.', 84 | 'places': ['geoId/06', 'country/FRA'], 85 | 'stat_vars': ['Median_Age_Person', 'Count_Person', 'Count_Household'] 86 | }] 87 | 88 | for param_set in build_multivariate_dataframe_params: 89 | demonstrate_build_multivariate_dataframe(**param_set) 90 | 91 | 92 | def expect_err_examples(): 93 | 94 | print("\n\nExpect 6 errors, starting HERE:") 95 | try: 96 | dcpd.build_time_series_dataframe(['geoId/33'], 97 | ['Median_Income_Person', 'Count_Person']) 98 | except ValueError as e: 99 | print("Successfully errored on: ", e) 100 | try: 101 | dcpd.build_time_series_dataframe(24, ['Median_Income_Person']) 102 | except ValueError as e: 103 | print("Successfully errored on: ", e) 104 | try: 105 | dcpd.build_multivariate_dataframe([3], 106 | ['Median_Income_Person', 'Count_Person']) 107 | except ValueError as e: 108 | print("Successfully errored on: ", e) 109 | try: 110 | dcpd.build_multivariate_dataframe('country/USA', True) 111 | except ValueError as e: 112 | print("Successfully errored on: ", e) 113 | # If the following two do not error due to the addition of 114 | # Median_Income_Person statistics for NUTS geos, then please 115 | # replace either the places or the StatVar. 
116 | try: 117 | dcpd.build_time_series_dataframe(['nuts/HU2', 'nuts/HU22'], 118 | 'Median_Income_Person') 119 | except ValueError as e: 120 | print("Successfully errored on: ", e) 121 | try: 122 | dcpd.build_multivariate_dataframe(['nuts/HU2', 'nuts/HU22'], 123 | ['Median_Income_Person']) 124 | except ValueError as e: 125 | print("Successfully errored on: ", e) 126 | print("until HERE.") 127 | 128 | 129 | def main(): 130 | build_time_series_example() 131 | build_time_series_dataframe_example() 132 | build_multivariate_dataframe_example() 133 | expect_err_examples() 134 | 135 | 136 | if __name__ == '__main__': 137 | main() 138 | -------------------------------------------------------------------------------- /datacommons_client/endpoints/payloads.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from pydantic import Field 4 | from pydantic import field_serializer 5 | from pydantic import field_validator 6 | from pydantic import model_serializer 7 | from pydantic import model_validator 8 | 9 | from datacommons_client.models.base import BaseDCModel 10 | from datacommons_client.models.base import ListOrStr 11 | from datacommons_client.models.observation import ObservationDate 12 | from datacommons_client.models.observation import ObservationSelect 13 | from datacommons_client.models.observation import ObservationSelectList 14 | 15 | 16 | def normalize_list_to_string(value: str | list[str]) -> str: 17 | """Converts a list of properties to a string.""" 18 | 19 | if isinstance(value, list): 20 | return f"[{', '.join(value)}]" 21 | 22 | return value 23 | 24 | 25 | class NodeRequestPayload(BaseDCModel): 26 | """ 27 | A Pydantic model to structure, normalize, and validate the payload for a Node V2 API request. 28 | 29 | Attributes: 30 | node_dcids (str | list[str]): The DCID(s) of the nodes to query. 31 | expression (str): The property or relation expression(s) to query. 32 | """ 33 | 34 | node_dcids: ListOrStr = Field(..., serialization_alias="nodes") 35 | expression: list | str = Field(..., serialization_alias="property") 36 | 37 | 38 | class ObservationRequestPayload(BaseDCModel): 39 | """ 40 | A Pydantic model to structure, normalize, and validate the payload for an Observation V2 API request. 41 | 42 | Attributes: 43 | date (str): The date for which data is being requested. 44 | variable_dcids (str | list[str]): One or more variable IDs for the data. 45 | select (list[ObservationSelect]): Fields to include in the response. 46 | Defaults to ["date", "variable", "entity", "value"]. 47 | entity_dcids (Optional[str | list[str]]): One or more entity IDs to filter the data. 48 | entity_expression (Optional[str]): A string expression to filter entities. 49 | filter_facet_domains (Optional[str | list[str]]): One or more domain names to filter the data. 50 | filter_facet_ids (Optional[str | list[str]]): One or more facet IDs to filter the data. 
51 | """ 52 | 53 | date: ObservationDate | str = Field(default_factory=str, 54 | validate_default=True) 55 | variable_dcids: Optional[ListOrStr] = Field(default=None, 56 | serialization_alias="variable") 57 | select: Optional[list[str]] = None 58 | entity_dcids: Optional[ListOrStr] = None 59 | entity_expression: Optional[str | list[str]] = None 60 | filter_facet_domains: Optional[ListOrStr] = None 61 | filter_facet_ids: Optional[ListOrStr] = None 62 | 63 | @field_validator("date", mode="before") 64 | def _validate_date(cls, v): 65 | try: 66 | return ObservationDate(v) 67 | except ValueError: 68 | return v 69 | 70 | @field_validator("select", mode="before") 71 | def _coerce_select(cls, v): 72 | return ObservationSelectList.model_validate(v).select 73 | 74 | @field_validator("entity_expression", mode="before") 75 | def _coerce_expr(cls, v): 76 | if v is None: 77 | return v 78 | if isinstance(v, list): 79 | return normalize_list_to_string(v) 80 | if isinstance(v, str): 81 | return v 82 | raise TypeError("expression must be a string or list[str]") 83 | 84 | @field_serializer("variable_dcids", "entity_dcids", when_used="unless-none") 85 | def _serialise_dcids_fields(self, v): 86 | return {"dcids": v} 87 | 88 | @field_serializer("entity_expression", when_used="unless-none") 89 | def _serialise_expression_field(self, v): 90 | return {"expression": v} 91 | 92 | @model_validator(mode="after") 93 | def _check_one(self): 94 | if bool(self.entity_dcids) == bool(self.entity_expression): 95 | raise ValueError("Exactly one of dcids or expression must be set") 96 | return self 97 | 98 | @model_serializer(mode="wrap") 99 | def _wrap_filter(self, handler): 100 | # Normal dump 101 | data = handler(self) 102 | 103 | # pull out entity dcid or expression 104 | entity = data.pop("entity_dcids", None) or data.pop("entity_expression", 105 | None) 106 | 107 | # add entity to the data dictionary 108 | data["entity"] = entity 109 | 110 | # pull out the two filter keys if present 111 | domains = data.pop("filter_facet_domains", None) 112 | ids = data.pop("filter_facet_ids", None) 113 | 114 | # only add "filter" if at least one is set 115 | if domains or ids: 116 | filter_dict = {} 117 | if domains is not None: 118 | filter_dict["domains"] = domains 119 | if ids is not None: 120 | filter_dict["facet_ids"] = ids 121 | data["filter"] = filter_dict 122 | 123 | return data 124 | 125 | 126 | class ResolveRequestPayload(BaseDCModel): 127 | """ 128 | A Pydantic model to structure, normalize, and validate the payload for a Resolve V2 API request. 129 | 130 | Attributes: 131 | node_dcids (str | list[str]): The DCID(s) of the nodes to query. 132 | expression (str): The relation expression to query. 
133 | """ 134 | 135 | node_dcids: ListOrStr = Field(..., serialization_alias="nodes") 136 | expression: str | list[str] = Field(..., serialization_alias="property") 137 | -------------------------------------------------------------------------------- /datacommons_client/tests/models/test_node_models.py: -------------------------------------------------------------------------------- 1 | from datacommons_client.models.node import Arcs 2 | from datacommons_client.models.node import Node 3 | from datacommons_client.models.node import NodeGroup 4 | from datacommons_client.models.node import Properties 5 | from datacommons_client.models.node import StatVarConstraint 6 | from datacommons_client.models.node import StatVarConstraints 7 | 8 | 9 | def test_node_model_validation(): 10 | """Test that Node.model_validate parses data correctly.""" 11 | json_data = { 12 | "dcid": "node123", 13 | "name": "Test Node", 14 | "provenanceId": "prov123", 15 | "types": ["TypeA", "TypeB"], 16 | "value": "42", 17 | } 18 | node = Node.model_validate(json_data) 19 | assert node.dcid == "node123" 20 | assert node.name == "Test Node" 21 | assert node.provenanceId == "prov123" 22 | assert node.types == ["TypeA", "TypeB"] 23 | assert node.value == "42" 24 | 25 | 26 | def test_node_model_validation_partial(): 27 | """Test Node.model_validate with partial data.""" 28 | json_data = { 29 | "dcid": "node123", 30 | } 31 | node = Node.model_validate(json_data) 32 | assert node.dcid == "node123" 33 | assert node.name is None 34 | assert node.provenanceId is None 35 | assert node.types is None 36 | assert node.value is None 37 | 38 | 39 | def test_nodegroup_model_validation(): 40 | """Test that NodeGroup.model_validate parses data correctly.""" 41 | json_data = { 42 | "nodes": [ 43 | { 44 | "dcid": "node1", 45 | "name": "Node 1" 46 | }, 47 | { 48 | "dcid": "node2", 49 | "name": "Node 2" 50 | }, 51 | ] 52 | } 53 | node_group = NodeGroup.model_validate(json_data) 54 | assert len(node_group.nodes) == 2 55 | assert node_group.nodes[0].dcid == "node1" 56 | assert node_group.nodes[1].name == "Node 2" 57 | 58 | 59 | def test_nodegroup_model_validation_empty(): 60 | """Test NodeGroup.model_validate with empty data.""" 61 | json_data = {} 62 | node_group = NodeGroup.model_validate(json_data) 63 | assert len(node_group.nodes) == 0 64 | 65 | 66 | def test_arcs_model_validation(): 67 | """Test that Arcs.model_validate parses data correctly.""" 68 | json_data = { 69 | "arcs": { 70 | "label1": { 71 | "nodes": [{ 72 | "dcid": "node1" 73 | }, { 74 | "dcid": "node2" 75 | }] 76 | }, 77 | "label2": { 78 | "nodes": [{ 79 | "dcid": "node3" 80 | }] 81 | }, 82 | } 83 | } 84 | arcs = Arcs.model_validate(json_data) 85 | assert len(arcs.arcs) == 2 86 | assert "label1" in arcs.arcs 87 | assert len(arcs.arcs["label1"].nodes) == 2 88 | assert arcs.arcs["label1"].nodes[0].dcid == "node1" 89 | assert len(arcs.arcs["label2"].nodes) == 1 90 | assert arcs.arcs["label2"].nodes[0].dcid == "node3" 91 | 92 | 93 | def test_arcs_model_validation_empty(): 94 | """Test Arcs.model_validate with empty data.""" 95 | json_data = {} 96 | arcs = Arcs.model_validate(json_data) 97 | assert len(arcs.arcs) == 0 98 | 99 | 100 | def test_properties_model_validation(): 101 | """Test that Properties.model_validate parses data correctly.""" 102 | json_data = {"properties": ["prop1", "prop2", "prop3"]} 103 | properties = Properties.model_validate(json_data) 104 | assert len(properties.properties) == 3 105 | assert properties.properties == ["prop1", "prop2", "prop3"] 106 | 107 | 108 | 
def test_properties_model_validation_empty(): 109 | """Test Properties.model_validate with empty data.""" 110 | json_data = {} 111 | properties = Properties.model_validate(json_data) 112 | assert properties.properties is None 113 | 114 | 115 | def test_statvarconstraint_model_validation(): 116 | """Test StatVarConstraint.model_validate parses data correctly.""" 117 | data = { 118 | "constraintId": "DevelopmentFinanceScheme", 119 | "constraintName": "Development Finance Scheme", 120 | "valueId": "ODAGrants", 121 | "valueName": "Official Development Assistance Grants", 122 | } 123 | constraint = StatVarConstraint.model_validate(data) 124 | 125 | assert constraint.constraintId == "DevelopmentFinanceScheme" 126 | assert constraint.constraintName == "Development Finance Scheme" 127 | assert constraint.valueId == "ODAGrants" 128 | assert constraint.valueName == "Official Development Assistance Grants" 129 | 130 | 131 | def test_statvarconstraints_model_validation(): 132 | """Test StatVarConstraints root model validates mapping properly.""" 133 | constraints = StatVarConstraints.model_validate({ 134 | "sv/1": [ 135 | { 136 | "constraintId": "DevelopmentFinanceScheme", 137 | "constraintName": "Development Finance Scheme", 138 | "valueId": "ODAGrants", 139 | "valueName": "Official Development Assistance Grants", 140 | }, 141 | { 142 | "constraintId": "DevelopmentFinanceRecipient", 143 | "constraintName": "Development Finance Recipient", 144 | "valueId": "country/GTM", 145 | "valueName": "Guatemala", 146 | }, 147 | ], 148 | "sv/2": [], 149 | }) 150 | 151 | assert "sv/1" in constraints and "sv/2" in constraints 152 | assert len(constraints["sv/1"]) == 2 153 | assert constraints["sv/2"] == [] 154 | -------------------------------------------------------------------------------- /datacommons_client/tests/endpoints/test_observation_endpoint.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | from datacommons_client.endpoints.base import API 4 | from datacommons_client.endpoints.observation import ObservationEndpoint 5 | from datacommons_client.endpoints.response import ObservationResponse 6 | from datacommons_client.models.observation import ByVariable 7 | from datacommons_client.models.observation import ObservationDate 8 | from datacommons_client.models.observation import ObservationSelect 9 | 10 | 11 | def test_fetch(): 12 | """Tests the fetch method of ObservationEndpoint.""" 13 | api_mock = MagicMock(spec=API) 14 | api_mock.post.return_value = {"byVariable": {}} 15 | endpoint = ObservationEndpoint(api=api_mock) 16 | 17 | response = endpoint.fetch(variable_dcids="dcid/variableID", 18 | date=ObservationDate.LATEST, 19 | select=["date", "variable", "entity", "value"], 20 | entity_dcids="dc/EntityID", 21 | filter_facet_domains="domain1", 22 | filter_facet_ids="facet1") 23 | 24 | # Check the response 25 | assert isinstance(response, ObservationResponse) 26 | 27 | # Check the post request 28 | api_mock.post.assert_called_once_with(payload={ 29 | "date": ObservationDate.LATEST, 30 | "variable": { 31 | "dcids": ["dcid/variableID"] 32 | }, 33 | "entity": { 34 | "dcids": ["dc/EntityID"], 35 | }, 36 | "select": ["date", "variable", "entity", "value"], 37 | "filter": { 38 | "domains": ["domain1"], 39 | "facet_ids": ["facet1"] 40 | } 41 | }, 42 | endpoint="observation", 43 | all_pages=True, 44 | next_token=None) 45 | 46 | 47 | def test_fetch_observations_by_entity_type(): 48 | """Tests the fetch_observations_by_entity_type 
method.""" 49 | api_mock = MagicMock(spec=API) 50 | api_mock.post.return_value = {"byVariable": {}} 51 | endpoint = ObservationEndpoint(api=api_mock) 52 | 53 | response = endpoint.fetch_observations_by_entity_type( 54 | date="2023", 55 | parent_entity="Earth", 56 | entity_type="Country", 57 | select=["variable", "entity", "facet"], 58 | variable_dcids="dc/VariableID") 59 | 60 | # Check the response 61 | assert isinstance(response, ObservationResponse) 62 | 63 | # Check the post request 64 | api_mock.post.assert_called_once_with(payload={ 65 | "date": "2023", 66 | "variable": { 67 | "dcids": ["dc/VariableID"] 68 | }, 69 | "entity": { 70 | "expression": "Earth<-containedInPlace+{typeOf:Country}" 71 | }, 72 | "select": ["variable", "entity", "facet"], 73 | }, 74 | endpoint="observation", 75 | all_pages=True, 76 | next_token=None) 77 | 78 | 79 | def test_fetch_observations_facets_by_entity_type(): 80 | """Tests the fetch_observations_by_entity_type method.""" 81 | api_mock = MagicMock(spec=API) 82 | api_mock.post.return_value = {"byVariable": {}} 83 | endpoint = ObservationEndpoint(api=api_mock) 84 | 85 | response = endpoint.fetch_observations_by_entity_type( 86 | date="2023", 87 | parent_entity="Earth", 88 | entity_type="Country", 89 | variable_dcids="dc/VariableID", 90 | select=["variable", "entity", "facet"], 91 | ) 92 | 93 | # Check the response 94 | assert isinstance(response, ObservationResponse) 95 | 96 | # Check the post request 97 | api_mock.post.assert_called_once_with(payload={ 98 | "date": "2023", 99 | "variable": { 100 | "dcids": ["dc/VariableID"] 101 | }, 102 | "entity": { 103 | "expression": "Earth<-containedInPlace+{typeOf:Country}" 104 | }, 105 | "select": ["variable", "entity", "facet"], 106 | }, 107 | endpoint="observation", 108 | all_pages=True, 109 | next_token=None) 110 | 111 | 112 | def test_fetch_available_statistical_variables_single_entity(): 113 | """Test fetching variables for a single entity.""" 114 | mock_data = { 115 | "var1": ["ent1"], 116 | "var2": ["ent1"], 117 | } 118 | 119 | # Mock the fetch method on the ObservationEndpoint instance 120 | endpoint = ObservationEndpoint(api=MagicMock()) 121 | endpoint.fetch = MagicMock() 122 | endpoint.fetch.return_value.get_data_by_entity = MagicMock( 123 | return_value=mock_data) 124 | 125 | result = endpoint.fetch_available_statistical_variables("ent1") 126 | 127 | expected = { 128 | "ent1": ["var1", "var2"], 129 | } 130 | assert result == expected 131 | 132 | endpoint.fetch.assert_called_once_with( 133 | entity_dcids="ent1", 134 | select=[ObservationSelect.VARIABLE, ObservationSelect.ENTITY], 135 | variable_dcids=[]) 136 | 137 | 138 | def test_fetch_available_statistical_variables_multiple_entities(): 139 | """Test fetching variables for multiple entities.""" 140 | mock_data = { 141 | "var1": ["ent1", "ent2"], 142 | "var2": ["ent2"], 143 | } 144 | 145 | endpoint = ObservationEndpoint(api=MagicMock()) 146 | endpoint.fetch = MagicMock() 147 | endpoint.fetch.return_value.get_data_by_entity = MagicMock( 148 | return_value=mock_data) 149 | 150 | result = endpoint.fetch_available_statistical_variables(["ent1", "ent2"]) 151 | 152 | expected = { 153 | "ent1": ["var1"], 154 | "ent2": ["var1", "var2"], 155 | } 156 | assert result == expected 157 | -------------------------------------------------------------------------------- /datacommons_client/tests/endpoints/test_payloads.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datacommons_client.endpoints.payloads 
import NodeRequestPayload 4 | from datacommons_client.endpoints.payloads import ObservationRequestPayload 5 | from datacommons_client.endpoints.payloads import ResolveRequestPayload 6 | from datacommons_client.models.observation import ObservationDate 7 | from datacommons_client.models.observation import ObservationSelect 8 | from datacommons_client.utils.error_handling import InvalidObservationSelectError 9 | 10 | 11 | def test_node_payload_normalize(): 12 | """Tests that NodeRequestPayload correctly normalizes single and multiple node_dcids.""" 13 | payload = NodeRequestPayload(node_dcids="node1", expression="prop1") 14 | assert payload.node_dcids == ["node1"] 15 | 16 | payload = NodeRequestPayload(node_dcids=["node1", "node2"], 17 | expression="prop1") 18 | assert payload.node_dcids == ["node1", "node2"] 19 | 20 | 21 | def test_node_payload_validate(): 22 | """Tests that NodeRequestPayload validates its inputs correctly.""" 23 | with pytest.raises(ValueError): 24 | NodeRequestPayload(node_dcids="node1", 25 | expression=123) # `expression` must be a string 26 | 27 | 28 | def test_node_payload_to_dict(): 29 | """Tests NodeRequestPayload conversion to dictionary.""" 30 | payload = NodeRequestPayload(node_dcids="node1", expression="prop1") 31 | assert payload.to_dict() == {"nodes": ["node1"], "property": "prop1"} 32 | 33 | 34 | def test_observation_payload_normalize(): 35 | """Tests that ObservationRequestPayload normalizes inputs correctly.""" 36 | payload = ObservationRequestPayload( 37 | date="LATEST", 38 | variable_dcids="var1", 39 | select=["variable", "entity"], 40 | entity_dcids="ent1", 41 | filter_facet_domains="domain1", 42 | filter_facet_ids="facets1", 43 | ) 44 | assert payload.variable_dcids == ["var1"] 45 | assert payload.entity_dcids == ["ent1"] 46 | assert payload.filter_facet_domains == ["domain1"] 47 | assert payload.filter_facet_ids == ["facets1"] 48 | assert payload.date == ObservationDate.LATEST 49 | 50 | assert "filter" in payload.to_dict() 51 | assert "facet_ids" in payload.to_dict()["filter"] 52 | assert "domains" in payload.to_dict()["filter"] 53 | 54 | # Check that when domain and facets are not included, they are not in the payload 55 | payload = ObservationRequestPayload( 56 | date="all", 57 | variable_dcids=["var1"], 58 | select=["variable", "entity"], 59 | entity_dcids=["ent1"], 60 | ) 61 | assert payload.date == ObservationDate.ALL 62 | assert payload.variable_dcids == ["var1"] 63 | assert payload.entity_dcids == ["ent1"] 64 | assert "filter" not in payload.to_dict() 65 | 66 | 67 | def test_observation_select_invalid_value(): 68 | """Tests that an invalid ObservationSelect value raises InvalidObservationSelectError.""" 69 | with pytest.raises(InvalidObservationSelectError): 70 | ObservationSelect("invalid") 71 | 72 | 73 | def test_observation_payload_validate(): 74 | """Tests that ObservationRequestPayload validates its inputs.""" 75 | with pytest.raises(InvalidObservationSelectError): 76 | ObservationRequestPayload( 77 | date="LATEST", 78 | variable_dcids="var1", 79 | select=["variable"], 80 | entity_dcids=None, 81 | entity_expression=None, 82 | ) # Requires either `entity_dcids` or `entity_expression` 83 | 84 | with pytest.raises(InvalidObservationSelectError): 85 | ObservationRequestPayload( 86 | date="LATEST", 87 | variable_dcids="var1", 88 | select=["value"], # Missing required "variable" and "entity" 89 | entity_expression="expression", 90 | ) 91 | 92 | with pytest.raises(ValueError): 93 | ObservationRequestPayload( 94 | date="LATEST", 95 | 
variable_dcids="var1", 96 | select=["variable", "entity"], 97 | entity_dcids="ent1", 98 | entity_expression= 99 | "expression", # Both `entity_dcids` and `entity_expression` set 100 | ) 101 | 102 | 103 | def test_observation_payload_to_dict(): 104 | """Tests ObservationRequestPayload conversion to dictionary.""" 105 | payload = ObservationRequestPayload( 106 | date="LATEST", 107 | variable_dcids="var1", 108 | select=["variable", "entity"], 109 | entity_dcids="ent1", 110 | filter_facet_ids="facets1", 111 | ) 112 | assert payload.to_dict() == { 113 | "date": ObservationDate.LATEST, 114 | "variable": { 115 | "dcids": ["var1"] 116 | }, 117 | "entity": { 118 | "dcids": ["ent1"] 119 | }, 120 | "select": ["variable", "entity"], 121 | "filter": { 122 | "facet_ids": ["facets1"] 123 | } 124 | } 125 | 126 | 127 | def test_resolve_payload_normalize(): 128 | """Tests that ResolveRequestPayload normalizes single and multiple node_dcids.""" 129 | payload = ResolveRequestPayload(node_dcids="node1", expression="expr1") 130 | assert payload.node_dcids == ["node1"] 131 | 132 | payload = ResolveRequestPayload(node_dcids=["node1", "node2"], 133 | expression="expr1") 134 | assert payload.node_dcids == ["node1", "node2"] 135 | 136 | 137 | def test_resolve_payload_validate(): 138 | """Tests that ResolveRequestPayload validates its inputs correctly.""" 139 | with pytest.raises(ValueError): 140 | ResolveRequestPayload(node_dcids="node1", 141 | expression=123) # `expression` must be a string 142 | 143 | 144 | def test_resolve_payload_to_dict(): 145 | """Tests ResolveRequestPayload conversion to dictionary.""" 146 | payload = ResolveRequestPayload(node_dcids="node1", expression="expr1") 147 | assert payload.to_dict() == {"nodes": ["node1"], "property": "expr1"} 148 | -------------------------------------------------------------------------------- /datacommons_client/tests/models/test_observation_models.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from datacommons_client.models.observation import Facet 4 | from datacommons_client.models.observation import Observation 5 | from datacommons_client.models.observation import ObservationSelectList 6 | from datacommons_client.models.observation import OrderedFacet 7 | from datacommons_client.models.observation import Variable 8 | from datacommons_client.utils.error_handling import InvalidObservationSelectError 9 | 10 | 11 | def test_observation_model_validation(): 12 | """Test that Observation.model_validate parses data correctly.""" 13 | json_data = {"date": "2024-01-01", "value": 123.45} 14 | observation = Observation.model_validate(json_data) 15 | assert observation.date == "2024-01-01" 16 | assert observation.value == 123.45 17 | assert isinstance(observation.value, float) 18 | 19 | 20 | def test_observation_model_validation_partial(): 21 | """Test Observation.model_validate with missing data.""" 22 | json_data = {"date": "2024-01-01"} 23 | observation = Observation.model_validate(json_data) 24 | assert observation.date == "2024-01-01" 25 | assert observation.value is None 26 | 27 | 28 | def test_ordered_facets_model_validation(): 29 | """Test that OrderedFacet.model_validate parses data correctly.""" 30 | json_data = { 31 | "earliestDate": 32 | "2023-01-01", 33 | "facetId": 34 | "facet123", 35 | "latestDate": 36 | "2024-01-01", 37 | "obsCount": 38 | 2, 39 | "observations": [ 40 | { 41 | "date": "2023-01-01", 42 | "value": 100.0 43 | }, 44 | { 45 | "date": "2024-01-01", 46 | "value": 200.0 47 | }, 48 | ], 
49 | } 50 | ordered_facets = OrderedFacet.model_validate(json_data) 51 | assert ordered_facets.earliestDate == "2023-01-01" 52 | assert ordered_facets.facetId == "facet123" 53 | assert ordered_facets.latestDate == "2024-01-01" 54 | assert ordered_facets.obsCount == 2 55 | assert len(ordered_facets.observations) == 2 56 | assert ordered_facets.observations[0].value == 100.0 57 | 58 | 59 | def test_ordered_facets_model_validation_empty_observations(): 60 | """Test OrderedFacet.model_validate with empty observations.""" 61 | json_data = { 62 | "earliestDate": "2023-01-01", 63 | "facetId": "facet123", 64 | "latestDate": "2024-01-01", 65 | "obsCount": 0, 66 | "observations": [], 67 | } 68 | ordered_facets = OrderedFacet.model_validate(json_data) 69 | assert len(ordered_facets.observations) == 0 70 | 71 | 72 | def test_variable_model_validation(): 73 | """Test that Variable.model_validate parses data correctly.""" 74 | json_data = { 75 | "byEntity": { 76 | "entity1": { 77 | "orderedFacets": [{ 78 | "earliestDate": 79 | "2023-01-01", 80 | "facetId": 81 | "facet1", 82 | "latestDate": 83 | "2023-12-31", 84 | "obsCount": 85 | 2, 86 | "observations": [ 87 | { 88 | "date": "2023-01-01", 89 | "value": 50.0 90 | }, 91 | { 92 | "date": "2023-12-31", 93 | "value": 75.0 94 | }, 95 | ], 96 | }] 97 | } 98 | } 99 | } 100 | variable = Variable.model_validate(json_data) 101 | assert "entity1" in variable.byEntity 102 | facets = variable.byEntity["entity1"].orderedFacets 103 | assert len(facets) == 1 104 | assert facets[0].facetId == "facet1" 105 | assert facets[0].observations[0].value == 50.0 106 | 107 | 108 | def test_variable_model_validation_empty(): 109 | """Test Variable.model_validate with empty byEntity.""" 110 | json_data = {"byEntity": {}} 111 | variable = Variable.model_validate(json_data) 112 | assert len(variable.byEntity) == 0 113 | 114 | 115 | def test_facet_model_validation(): 116 | """Test that Facet.model_validate parses data correctly.""" 117 | json_data = { 118 | "importName": "Import 1", 119 | "measurementMethod": "Method A", 120 | "observationPeriod": "2023", 121 | "provenanceUrl": "http://example.com", 122 | "unit": "usd", 123 | } 124 | facet = Facet.model_validate(json_data) 125 | assert facet.importName == "Import 1" 126 | assert facet.measurementMethod == "Method A" 127 | assert facet.observationPeriod == "2023" 128 | assert facet.provenanceUrl == "http://example.com" 129 | assert facet.unit == "usd" 130 | 131 | 132 | def test_facet_model_validation_partial(): 133 | """Test Facet.model_validate with missing data.""" 134 | json_data = {"importName": "Import 1", "unit": "GTQ"} 135 | facet = Facet.model_validate(json_data) 136 | assert facet.importName == "Import 1" 137 | assert facet.measurementMethod is None 138 | assert facet.unit == "GTQ" 139 | assert facet.provenanceUrl is None 140 | 141 | 142 | def test_observation_select_list_defaults(): 143 | """ObservationSelectList returns default selects when none provided.""" 144 | osl = ObservationSelectList.model_validate(None) 145 | assert osl.select == ["date", "variable", "entity", "value"] 146 | 147 | 148 | def test_observation_select_list_custom(): 149 | """ObservationSelectList accepts custom select lists.""" 150 | osl = ObservationSelectList.model_validate(["variable", "entity", "facet"]) 151 | assert osl.select == ["variable", "entity", "facet"] 152 | 153 | 154 | def test_observation_select_list_missing_required(): 155 | """Missing required select entries raises InvalidObservationSelectError.""" 156 | with 
pytest.raises(InvalidObservationSelectError): 157 | ObservationSelectList.model_validate(["date", "value"]) 158 | -------------------------------------------------------------------------------- /datacommons_client/endpoints/resolve.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from datacommons_client.endpoints.base import API 4 | from datacommons_client.endpoints.base import Endpoint 5 | from datacommons_client.endpoints.payloads import ResolveRequestPayload 6 | from datacommons_client.endpoints.response import ResolveResponse 7 | 8 | 9 | def _resolve_correspondence_expression(from_type: str, 10 | to_type: str, 11 | entity_type: str | None = None) -> str: 12 | """ 13 | Constructs a relation expression for fetching correspondence between entities of two types. 14 | 15 | Args: 16 | from_type (str): The source entity type. 17 | to_type (str): The target entity type. 18 | entity_type (Optional[str]): Optional type of the entities. 19 | 20 | Returns: 21 | str: The relation expression to fetch correspondence between entities of the given types. 22 | """ 23 | return (f"<-{from_type}{{typeOf:{entity_type}}}->{to_type}" 24 | if entity_type else f"<-{from_type}->{to_type}") 25 | 26 | 27 | class ResolveEndpoint(Endpoint): 28 | """ 29 | A class to interact with the resolve API endpoint. 30 | 31 | Args: 32 | api (API): The API instance providing the environment configuration 33 | (base URL, headers, authentication) to be used for requests. 34 | """ 35 | 36 | def __init__(self, api: API): 37 | """Initializes the ResolveEndpoint instance.""" 38 | super().__init__(endpoint="resolve", api=api) 39 | 40 | def fetch(self, node_ids: str | list[str], 41 | expression: str | list[str]) -> ResolveResponse: 42 | """ 43 | Fetches resolved data for the given nodes and expressions, identified by name, 44 | coordinates, or wiki ID. 45 | 46 | Args: 47 | node_ids (str | list[str]): One or more node IDs to resolve. 48 | expression (str): The relation expression to query. 49 | 50 | Returns: 51 | ResolveResponse: The response object containing the resolved data. 52 | """ 53 | # Check if the node_ids is a single string. If so, convert it to a list. 54 | if isinstance(node_ids, str): 55 | node_ids = [node_ids] 56 | 57 | # Construct the payload 58 | payload = ResolveRequestPayload(node_dcids=node_ids, 59 | expression=expression).to_dict() 60 | 61 | # Send the request and return the response 62 | return ResolveResponse.model_validate(self.post(payload)) 63 | 64 | def fetch_dcids_by_name(self, 65 | names: str | list[str], 66 | entity_type: Optional[str] = None) -> ResolveResponse: 67 | """ 68 | Fetches DCIDs for entities by their names. 69 | 70 | Args: 71 | names (str | list[str]): One or more entity names to resolve. 72 | entity_type (Optional[str]): Optional type of the entities. 73 | 74 | Returns: 75 | ResolveResponse: The response object containing the resolved DCIDs. 76 | """ 77 | 78 | expression = _resolve_correspondence_expression(from_type="description", 79 | to_type="dcid", 80 | entity_type=entity_type) 81 | 82 | return self.fetch(node_ids=names, expression=expression) 83 | 84 | def fetch_dcids_by_wikidata_id( 85 | self, 86 | wikidata_ids: str | list[str], 87 | entity_type: Optional[str] = None) -> ResolveResponse: 88 | """ 89 | Fetches DCIDs for entities by their Wikidata IDs. 90 | 91 | Args: 92 | wikidata_ids (str | list[str]): One or more Wikidata IDs to resolve. 93 | entity_type (Optional[str]): Optional type of the entities. 
94 | 95 | Returns: 96 | ResolveResponse: The response object containing the resolved DCIDs. 97 | """ 98 | expression = _resolve_correspondence_expression(from_type="wikidataId", 99 | to_type="dcid", 100 | entity_type=entity_type) 101 | 102 | return self.fetch(node_ids=wikidata_ids, expression=expression) 103 | 104 | def fetch_dcid_by_coordinates( 105 | self, 106 | latitude: str, 107 | longitude: str, 108 | entity_type: Optional[str] = None) -> ResolveResponse: 109 | """ 110 | Fetches DCIDs for entities by their geographic coordinates. 111 | 112 | Args: 113 | latitude (str): Latitude of the entity. 114 | longitude (str): Longitude of the entity. 115 | entity_type (Optional[str]): Optional type of the entities to refine results 116 | (e.g., "City", "State", "Country"). 117 | 118 | Returns: 119 | ResolveResponse: The response object containing the resolved DCIDs. 120 | 121 | Example: 122 | To find the DCID for "Mountain View" using its latitude and longitude: 123 | ```python 124 | latitude = "37.42" 125 | longitude = "-122.08" 126 | response = client.fetch_dcid_by_coordinates(latitude=latitude, longitude=longitude) 127 | print(response.entities) 128 | ``` 129 | Note: 130 | - For ambiguous results, providing an entity type (e.g., "City") can help disambiguate. 131 | - The coordinates should be passed as strings in decimal format (e.g., "37.42", "-122.08"). 132 | 133 | 134 | """ 135 | expression = _resolve_correspondence_expression(from_type="geoCoordinate", 136 | to_type="dcid", 137 | entity_type=entity_type) 138 | coordinates = f"{latitude}#{longitude}" 139 | return self.fetch(node_ids=coordinates, expression=expression) 140 | -------------------------------------------------------------------------------- /datacommons_client/tests/endpoints/test_resolve_endpoint.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | from datacommons_client.endpoints.base import API 4 | from datacommons_client.endpoints.resolve import _resolve_correspondence_expression 5 | from datacommons_client.endpoints.resolve import ResolveEndpoint 6 | from datacommons_client.endpoints.response import ResolveResponse 7 | from datacommons_client.models.resolve import Candidate 8 | from datacommons_client.models.resolve import Entity 9 | 10 | 11 | def test_fetch(): 12 | """Tests the fetch method of ResolveEndpoint.""" 13 | api_mock = MagicMock(spec=API) 14 | api_mock.post = MagicMock(return_value={}) 15 | endpoint = ResolveEndpoint(api=api_mock) 16 | 17 | response = endpoint.fetch(node_ids="Node1", expression="some_expression") 18 | 19 | # Check the response 20 | assert isinstance(response, ResolveResponse) 21 | 22 | # Check the post request 23 | api_mock.post.assert_called_once_with(payload={ 24 | "nodes": ["Node1"], 25 | "property": "some_expression", 26 | }, 27 | endpoint="resolve", 28 | all_pages=True, 29 | next_token=None) 30 | 31 | 32 | def test_fetch_dcid_by_name(): 33 | """Tests the fetch_dcid_by_name method.""" 34 | api_mock = MagicMock(spec=API) 35 | api_mock.post = MagicMock(return_value={}) 36 | endpoint = ResolveEndpoint(api=api_mock) 37 | 38 | response = endpoint.fetch_dcids_by_name(names=["Entity1"], 39 | entity_type="Place") 40 | 41 | # Check the response 42 | assert isinstance(response, ResolveResponse) 43 | 44 | # Check the post request 45 | api_mock.post.assert_called_once_with(payload={ 46 | "nodes": ["Entity1"], 47 | "property": "<-description{typeOf:Place}->dcid" 48 | }, 49 | endpoint="resolve", 50 | all_pages=True, 51 | 
next_token=None) 52 | 53 | 54 | def test_fetch_dcid_by_wikidata_id(): 55 | """Tests the fetch_dcid_by_wikidata_id method.""" 56 | api_mock = MagicMock(spec=API) 57 | api_mock.post = MagicMock(return_value={}) 58 | endpoint = ResolveEndpoint(api=api_mock) 59 | 60 | response = endpoint.fetch_dcids_by_wikidata_id(wikidata_ids="Q12345", 61 | entity_type="Country") 62 | 63 | # Check the response 64 | assert isinstance(response, ResolveResponse) 65 | 66 | # Check the post request 67 | api_mock.post.assert_called_once_with(payload={ 68 | "nodes": ["Q12345"], 69 | "property": "<-wikidataId{typeOf:Country}->dcid", 70 | }, 71 | endpoint="resolve", 72 | all_pages=True, 73 | next_token=None) 74 | 75 | 76 | def test_fetch_dcids_list_by_wikidata_id(): 77 | """Tests the fetch_dcid_by_wikidata_id method.""" 78 | api_mock = MagicMock(spec=API) 79 | api_mock.post = MagicMock(return_value={}) 80 | endpoint = ResolveEndpoint(api=api_mock) 81 | 82 | response = endpoint.fetch_dcids_by_wikidata_id( 83 | wikidata_ids=["Q12345", "Q695660"]) 84 | 85 | # Check the response 86 | assert isinstance(response, ResolveResponse) 87 | 88 | # Check the post request 89 | api_mock.post.assert_called_once_with(payload={ 90 | "nodes": ["Q12345", "Q695660"], 91 | "property": "<-wikidataId->dcid", 92 | }, 93 | endpoint="resolve", 94 | all_pages=True, 95 | next_token=None) 96 | 97 | 98 | def test_fetch_dcid_by_coordinates(): 99 | """Tests the fetch_dcid_by_coordinates method.""" 100 | api_mock = MagicMock(spec=API) 101 | api_mock.post = MagicMock(return_value={}) 102 | endpoint = ResolveEndpoint(api=api_mock) 103 | 104 | response = endpoint.fetch_dcid_by_coordinates(latitude="37.7749", 105 | longitude="-122.4194", 106 | entity_type="City") 107 | 108 | # Check the response 109 | assert isinstance(response, ResolveResponse) 110 | 111 | # Check the post request 112 | api_mock.post.assert_called_once_with(payload={ 113 | "nodes": ["37.7749#-122.4194"], 114 | "property": "<-geoCoordinate{typeOf:City}->dcid", 115 | }, 116 | endpoint="resolve", 117 | all_pages=True, 118 | next_token=None) 119 | 120 | 121 | def test_resolve_correspondence_expression(): 122 | """Tests the resolve_correspondence_expression function.""" 123 | expression = _resolve_correspondence_expression(from_type="description", 124 | to_type="dcid", 125 | entity_type="Place") 126 | assert expression == "<-description{typeOf:Place}->dcid" 127 | 128 | expression_no_entity_type = _resolve_correspondence_expression( 129 | from_type="description", to_type="dcid") 130 | assert expression_no_entity_type == "<-description->dcid" 131 | 132 | 133 | def test_flatten_resolve_response(): 134 | """Tests the flatten_resolve_response function.""" 135 | # Mock ResolveResponse with multiple entities 136 | mock_data = ResolveResponse(entities=[ 137 | Entity(node="Node1", candidates=[Candidate(dcid="Candidate1")]), 138 | Entity(node="Node2", 139 | candidates=[ 140 | Candidate(dcid="Candidate2"), 141 | Candidate(dcid="Candidate3") 142 | ]), 143 | Entity(node="Node3", candidates=[]) # No candidates 144 | ]) 145 | 146 | # Call the function 147 | result = mock_data.to_flat_dict() 148 | 149 | # Expected output 150 | expected = { 151 | "Node1": "Candidate1", # Single candidate 152 | "Node2": ["Candidate2", "Candidate3"], # Multiple candidates 153 | "Node3": [], # No candidates 154 | } 155 | 156 | # Assertions 157 | assert result == expected 158 | -------------------------------------------------------------------------------- /datacommons_client/models/observation.py: 
-------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Optional 3 | 4 | from pydantic import Field 5 | from pydantic import field_validator 6 | from pydantic import model_serializer 7 | from pydantic import RootModel 8 | 9 | from datacommons_client.models.base import BaseDCModel 10 | from datacommons_client.models.base import DictLikeRootModel 11 | from datacommons_client.models.base import entityDCID 12 | from datacommons_client.models.base import facetID 13 | from datacommons_client.models.base import ListLikeRootModel 14 | from datacommons_client.models.base import variableDCID 15 | from datacommons_client.utils.error_handling import InvalidObservationSelectError 16 | 17 | 18 | class ObservationDate(str, Enum): 19 | LATEST = "LATEST" 20 | ALL = "" 21 | 22 | @classmethod 23 | def _missing_(cls, value): 24 | if isinstance(value, str): 25 | u = value.strip().upper() 26 | if u == "LATEST": 27 | return cls.LATEST 28 | if u in ("ALL", ""): 29 | return cls.ALL 30 | raise ValueError(f"Invalid date value: '{value}'. Only 'LATEST' or" 31 | f" '' (empty string) are allowed.") 32 | 33 | 34 | class ObservationSelect(str, Enum): 35 | DATE = "date" 36 | VARIABLE = "variable" 37 | ENTITY = "entity" 38 | VALUE = "value" 39 | FACET = "facet" 40 | 41 | @classmethod 42 | def valid_values(cls): 43 | """Returns a list of valid enum values.""" 44 | return sorted(cls._value2member_map_.keys()) 45 | 46 | @classmethod 47 | def _missing_(cls, value): 48 | """Handle missing enum values by raising a custom error.""" 49 | message = f"Invalid `select` Field: '{value}'. Only {', '.join(cls.valid_values())} are allowed." 50 | raise InvalidObservationSelectError(message=message) 51 | 52 | 53 | class ObservationSelectList(RootModel[list[ObservationSelect]]): 54 | """A model to represent a list of ObservationSelect values. 55 | 56 | Attributes: 57 | select (List[ObservationSelect]): A list of ObservationSelect enum values. 58 | """ 59 | 60 | root: Optional[list[ObservationSelect | str]] = None 61 | 62 | @field_validator("root", mode="before") 63 | def _validate_select(cls, v): 64 | if v is None: 65 | select = [ 66 | ObservationSelect.DATE, 67 | ObservationSelect.VARIABLE, 68 | ObservationSelect.ENTITY, 69 | ObservationSelect.VALUE, 70 | ] 71 | else: 72 | select = v 73 | 74 | select = [ObservationSelect(s).value for s in select] 75 | 76 | required_select = {"variable", "entity"} 77 | 78 | missing_fields = required_select - set(select) 79 | if missing_fields: 80 | raise InvalidObservationSelectError(message=( 81 | f"The 'select' field must include at least the following: {', '.join(required_select)} " 82 | f"(missing: {', '.join(missing_fields)})")) 83 | 84 | return select 85 | 86 | @property 87 | def select(self) -> list[str]: 88 | """Return select values directly as list""" 89 | return self.root or [] 90 | 91 | 92 | class Observation(BaseDCModel): 93 | """Represents an observation with a date and value. 94 | 95 | Attributes: 96 | date (str): The date of the observation. 97 | value (float): Optional. The value of the observation. 98 | """ 99 | 100 | date: Optional[str] = None 101 | value: Optional[float] = None 102 | 103 | 104 | class OrderedFacet(BaseDCModel): 105 | """Represents ordered facets of observations. 106 | 107 | Attributes: 108 | earliestDate (str): The earliest date in the observations. 109 | facetId (str): The identifier for the facet. 110 | latestDate (str): The latest date in the observations. 
111 | obsCount (int): The total number of observations. 112 | observations (List[Observation]): A list of observations associated with the facet. 113 | """ 114 | 115 | earliestDate: Optional[str] = None 116 | facetId: Optional[str] = None 117 | latestDate: Optional[str] = None 118 | obsCount: Optional[int] = None 119 | observations: list[Observation] = Field(default_factory=list) 120 | 121 | 122 | class OrderedFacets(BaseDCModel): 123 | """Represents a list of ordered facets. 124 | """ 125 | orderedFacets: list[OrderedFacet] = Field(default_factory=list) 126 | 127 | 128 | class Variable(BaseDCModel): 129 | """Represents a variable with data grouped by entity. 130 | 131 | Attributes: 132 | byEntity (dict[entityDCID, OrderedFacets]): A dictionary mapping 133 | entities to their ordered facets. 134 | """ 135 | 136 | byEntity: dict[entityDCID, OrderedFacets] = Field(default_factory=dict) 137 | 138 | 139 | class Facet(BaseDCModel): 140 | """Represents metadata for a facet. 141 | 142 | Attributes: 143 | importName (str): The name of the data import. 144 | measurementMethod (str): The method used to measure the data. 145 | observationPeriod (str): The period over which the observations were made. 146 | provenanceUrl (str): The URL of the data's provenance. 147 | unit (str): The unit of the observations. 148 | """ 149 | 150 | importName: Optional[str] = None 151 | measurementMethod: Optional[str] = None 152 | observationPeriod: Optional[str] = None 153 | provenanceUrl: Optional[str] = None 154 | unit: Optional[str] = None 155 | 156 | 157 | class ByVariable(BaseDCModel, DictLikeRootModel[dict[variableDCID, Variable]]): 158 | """A root model whose value is a dict mapping variableDCID to Variable.""" 159 | 160 | 161 | class VariableByEntity(BaseDCModel, 162 | DictLikeRootModel[dict[variableDCID, 163 | dict[entityDCID, 164 | OrderedFacets]]]): 165 | """A root model whose value is a dict mapping entityDCID to Variable.""" 166 | 167 | 168 | class ObservationRecord(Observation, Facet): 169 | """Represents a record of observations for a specific variable and entity. 170 | 171 | Attributes: 172 | date (str): The date of the observation. 173 | value (float): The value of the observation. 
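entity (entityDCID): The entity the observation is about. variable (variableDCID): The variable being observed. facetId (facetID): The facet the observation came from. Facet metadata fields (importName, unit, etc.) are inherited from Facet.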
174 | """ 175 | 176 | entity: Optional[entityDCID] = None 177 | variable: Optional[variableDCID] = None 178 | facetId: Optional[facetID] = None 179 | 180 | _order = [ 181 | "date", "entity", "variable", "facetId", "importName", 182 | "measurementMethod", "observationPeriod", "provenanceUrl", "unit", "value" 183 | ] 184 | 185 | @model_serializer(mode="wrap") 186 | def _reorder(self, helper): 187 | """Reorders the fields for serialization.""" 188 | data = helper(self) 189 | ordered = {} 190 | 191 | # Ensure the order of fields matches the specified order 192 | for key in self._order: 193 | if key in data: 194 | ordered[key] = data.pop(key) 195 | 196 | # Add any remaining fields that were not in the order list 197 | ordered.update(data) 198 | 199 | # Ensure the 'value' field is always at the end 200 | if "value" in ordered: 201 | ordered["value"] = ordered.pop("value") 202 | 203 | return ordered 204 | 205 | 206 | class ObservationRecords(BaseDCModel, 207 | ListLikeRootModel[list[ObservationRecord]]): 208 | """A root model whose value is a list of ObservationRecord.""" 209 | -------------------------------------------------------------------------------- /datacommons_client/endpoints/base.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Any, Dict, Optional 3 | 4 | from datacommons_client.utils.request_handling import check_instance_is_valid 5 | from datacommons_client.utils.request_handling import post_request 6 | from datacommons_client.utils.request_handling import resolve_instance_url 7 | 8 | 9 | class API: 10 | """Represents a configured API interface to the Data Commons API. 11 | 12 | This class handles environment setup, resolving the base URL, building headers, 13 | or optionally using a fully qualified URL directly. It can be used standalone 14 | to interact with the API or in combination with Endpoint classes. 15 | """ 16 | 17 | def __init__( 18 | self, 19 | api_key: Optional[str] = None, 20 | dc_instance: Optional[str] = None, 21 | url: Optional[str] = None, 22 | surface_header_value: Optional[str] = None, 23 | ): 24 | """ 25 | Initializes the API instance. 26 | 27 | Args: 28 | api_key: The API key for authentication. Defaults to None. 29 | dc_instance: The Data Commons instance domain. Ignored if `url` is provided. 30 | Defaults to 'datacommons.org' if both `url` and `dc_instance` are None. 31 | url: A fully qualified URL for the base API. This may be useful if more granular control 32 | of the API is required (for local development, for example). If provided, dc_instance` 33 | should not be provided. 34 | surface_header_value: indicates which DC surface (MCP server, etc.) makes a call to the python library. 35 | If the call originated internally, this is null and we pass in "clientlib-python" as the surface header 36 | 37 | Raises: 38 | ValueError: If both `dc_instance` and `url` are provided. 
39 | """ 40 | if dc_instance and url: 41 | raise ValueError("Cannot provide both `dc_instance` and `url`.") 42 | 43 | if not dc_instance and not url: 44 | dc_instance = "datacommons.org" 45 | 46 | if url is not None: 47 | # Use the given URL directly (strip trailing slash) 48 | self.base_url = check_instance_is_valid(url.rstrip("/"), api_key=api_key) 49 | else: 50 | # Resolve from dc_instance 51 | self.base_url = resolve_instance_url(dc_instance) 52 | 53 | self.headers = self.build_headers(surface_header_value=surface_header_value, 54 | api_key=api_key) 55 | 56 | def __repr__(self) -> str: 57 | """Returns a readable representation of the API object. 58 | 59 | Indicates the base URL and if it's authenticated. 60 | 61 | Returns: 62 | str: A string representation of the API object. 63 | """ 64 | has_auth = " (Authenticated)" if "X-API-Key" in self.headers else "" 65 | return f"" 66 | 67 | def post(self, 68 | payload: dict[str, Any], 69 | endpoint: Optional[str] = None, 70 | *, 71 | all_pages: bool = True, 72 | next_token: Optional[str] = None) -> Dict[str, Any]: 73 | """Makes a POST request using the configured API environment. 74 | 75 | If `endpoint` is provided, it will be appended to the base_url. Otherwise, 76 | it will just POST to the base URL. 77 | 78 | Args: 79 | payload: The JSON payload for the POST request. 80 | endpoint: An optional endpoint path to append to the base URL. 81 | all_pages: If True, fetch all pages of the response. If False, fetch only the first page. 82 | Defaults to True. Set to False to only fetch the first page. In that case, a 83 | `next_token` key in the response will indicate if more pages are available. 84 | That token can be used to fetch the next page. 85 | 86 | Returns: 87 | A dictionary containing the merged response data. 88 | 89 | Raises: 90 | ValueError: If the payload is not a valid dictionary. 91 | """ 92 | if not isinstance(payload, dict): 93 | raise ValueError("Payload must be a dictionary.") 94 | 95 | url = (self.base_url if endpoint is None else f"{self.base_url}/{endpoint}") 96 | 97 | return post_request(url=url, 98 | payload=payload, 99 | headers=self.headers, 100 | all_pages=all_pages, 101 | next_token=next_token) 102 | 103 | def build_headers(self, 104 | surface_header_value: Optional[str], 105 | api_key: Optional[str] = None) -> dict[str, str]: 106 | """Build request headers for API requests. 107 | 108 | Includes JSON content type. If an API key is provided, add it as `X-API-Key`. 109 | 110 | Args: 111 | self: the API, which includes API key and surface header if available 112 | 113 | Returns: 114 | A dictionary of headers for the request. 115 | """ 116 | headers = { 117 | "Content-Type": "application/json", 118 | "x-surface": "clientlib-python" 119 | } 120 | if api_key: 121 | headers["X-API-Key"] = api_key 122 | 123 | if surface_header_value: 124 | headers["x-surface"] = surface_header_value 125 | 126 | return headers 127 | 128 | 129 | class Endpoint: 130 | """Represents a specific endpoint within the Data Commons API. 131 | 132 | This class leverages an API instance to make requests. It does not 133 | handle instance resolution or headers directly; that is delegated to the API instance. 134 | 135 | Attributes: 136 | endpoint (str): The endpoint path (e.g., 'node'). 137 | api (API): The API instance providing configuration and the `post` method. 138 | """ 139 | 140 | def __init__(self, endpoint: str, api: API): 141 | """ 142 | Initializes the Endpoint instance. 143 | 144 | Args: 145 | endpoint: The endpoint path (e.g., 'node'). 
146 | api: An API instance that provides the environment configuration. 147 | """ 148 | self.endpoint = endpoint 149 | self.api = api 150 | 151 | def __repr__(self) -> str: 152 | """Returns a readable representation of the Endpoint object. 153 | 154 | Shows the endpoint and underlying API configuration. 155 | 156 | Returns: 157 | str: A string representation of the Endpoint object. 158 | """ 159 | return f"<{self.endpoint.title()} Endpoint using {repr(self.api)}>" 160 | 161 | def post(self, 162 | payload: dict[str, Any], 163 | all_pages: bool = True, 164 | next_token: Optional[str] = None) -> Dict[str, Any]: 165 | """Makes a POST request to the specified endpoint using the API instance. 166 | 167 | Args: 168 | payload: The JSON payload for the POST request. 169 | all_pages: If True, fetch all pages of the response. If False, fetch only the first page. 170 | Defaults to True. Set to False to only fetch the first page. In that case, a 171 | `next_token` key in the response will indicate if more pages are available. 172 | That token can be used to fetch the next page. 173 | next_token: Optionally, the token to fetch the next page of results. Defaults to None. 174 | 175 | Returns: 176 | A dictionary with the merged API response data. 177 | 178 | Raises: 179 | ValueError: If the payload is not a valid dictionary. 180 | """ 181 | return self.api.post(payload=payload, 182 | endpoint=self.endpoint, 183 | all_pages=all_pages, 184 | next_token=next_token) 185 | -------------------------------------------------------------------------------- /datacommons_client/utils/data_processing.py: -------------------------------------------------------------------------------- 1 | from dataclasses import asdict 2 | import json 3 | from typing import Any, Dict, List 4 | 5 | from datacommons_client.models.base import ArcLabel 6 | from datacommons_client.models.base import facetID 7 | from datacommons_client.models.base import NodeDCID 8 | from datacommons_client.models.base import Property 9 | from datacommons_client.models.node import Arcs 10 | from datacommons_client.models.node import FlattenedArcsMapping 11 | from datacommons_client.models.node import FlattenedPropertiesMapping 12 | from datacommons_client.models.node import Name 13 | from datacommons_client.models.node import Node 14 | from datacommons_client.models.node import NodeGroup 15 | from datacommons_client.models.node import Properties 16 | from datacommons_client.models.observation import Facet 17 | from datacommons_client.models.observation import ObservationRecord 18 | from datacommons_client.models.observation import ObservationRecords 19 | from datacommons_client.models.observation import OrderedFacets 20 | from datacommons_client.models.observation import VariableByEntity 21 | 22 | 23 | def unpack_arcs(arcs: Dict[ArcLabel, NodeGroup]) -> dict[Property, list[Node]]: 24 | """Simplify the 'arcs' structure.""" 25 | # Return dictionary of property nodes 26 | return { 27 | prop: getattr(arc_data, "nodes", []) for prop, arc_data in arcs.items() 28 | } 29 | 30 | 31 | def flatten_properties( 32 | data: Dict[NodeDCID, Arcs | Properties] 33 | ) -> FlattenedPropertiesMapping | FlattenedArcsMapping: 34 | """ 35 | Flatten the properties of a node response. 36 | 37 | Processes a dictionary of node responses, extracting and 38 | simplifying their properties and arcs into a flattened dictionary. 39 | 40 | Args: 41 | data (Dict[NodeDCID, Arcs | Properties]): 42 | The input dictionary containing node responses. 
Each node maps to 43 | a dictionary with potential "arcs" and "properties" keys. 44 | 45 | Returns: 46 | FlattenedPropertiesMapping | FlattenedArcsMapping: 47 | A flattened dictionary where keys are node identifiers, and values 48 | are the simplified properties or nodes. 49 | """ 50 | if not data: 51 | return FlattenedPropertiesMapping.model_validate({}) 52 | 53 | first_node = next(iter(data.values())) 54 | is_properties = isinstance(first_node, Properties) 55 | mapping_cls = FlattenedPropertiesMapping if is_properties else FlattenedArcsMapping 56 | 57 | # Store simplified properties 58 | items = {} 59 | for node_id, node_data in data.items(): 60 | if is_properties: 61 | props = getattr(node_data, "properties", None) 62 | if props: 63 | items[node_id] = props 64 | else: 65 | arcs = getattr(node_data, "arcs", None) 66 | if arcs: 67 | items[node_id] = unpack_arcs(arcs) 68 | 69 | return mapping_cls.model_validate(items) 70 | 71 | 72 | def extract_observations( 73 | variable: str, entity: str, entity_data: OrderedFacets, 74 | facet_metadata: dict[facetID, Facet]) -> list[ObservationRecord]: 75 | """ 76 | Extracts observations for a given variable, entity, and its data. 77 | 78 | Args: 79 | variable (str): The variable name. 80 | entity (str): The entity name. 81 | entity_data (OrderedFacets): Data for the entity, including ordered facets. 82 | facet_metadata (dict[facetID, Facet]): Metadata for facets. 83 | 84 | Returns: 85 | list[dict]: A list of observation records. 86 | """ 87 | observations = [] 88 | for facet in entity_data.orderedFacets: 89 | for observation in facet.observations: 90 | observations.append( 91 | ObservationRecord.model_validate({ 92 | "date": observation.date, 93 | "entity": entity, 94 | "variable": variable, 95 | "value": observation.value, 96 | "facetId": facet.facetId, 97 | **facet_metadata.get(facet.facetId, Facet()).to_dict(), 98 | })) 99 | 100 | return observations 101 | 102 | 103 | def observations_as_records(data: VariableByEntity, 104 | facets: dict[facetID, Facet]) -> ObservationRecords: 105 | """ 106 | Converts observation data into a list of records. 107 | 108 | Args: 109 | data (VariableByEntity): A mapping of variables to entities and their data. 110 | facets (dict): Facet metadata for the observations. 111 | 112 | Returns: 113 | ObservationRecords: A flattened list of observation records. 114 | """ 115 | 116 | records = [] 117 | for variable, entities in data.items(): 118 | for entity, entity_data in entities.items(): 119 | for record in extract_observations( 120 | variable=variable, 121 | entity=entity, 122 | entity_data=entity_data, 123 | facet_metadata=facets, 124 | ): 125 | records.append(record) 126 | 127 | return ObservationRecords.model_validate(records) 128 | 129 | 130 | def group_variables_by_entity( 131 | data: dict[str, list[str]]) -> dict[str, list[str]]: 132 | """Groups variables by the entities they are associated with. 133 | Takes a dictionary mapping statistical variable DCIDs to a list of entity DCIDs, 134 | and returns a new dictionary mapping each entity DCID to a list of statistical 135 | variables available for that entity. 136 | Args: 137 | data: A dictionary where each key is a variable DCID and the value is a list 138 | of entity DCIDs that have observations for that variable. 139 | Returns: 140 | A dictionary where each key is an entity DCID and the value is a list of 141 | variable DCIDs available for that entity. 
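Example:
    A minimal sketch of the inversion; the input shape mirrors the unit tests:

    ```python
    group_variables_by_entity({"var1": ["ent1", "ent2"], "var2": ["ent2"]})
    # -> {"ent1": ["var1"], "ent2": ["var1", "var2"]}
    ```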
142 | """ 143 | result: dict[str, list[str]] = {} 144 | for variable, entities in data.items(): 145 | for entity in entities: 146 | result.setdefault(entity, []).append(variable) 147 | return result 148 | 149 | 150 | class SerializableMixin: 151 | """Provides serialization methods for the Response dataclasses.""" 152 | 153 | def to_dict(self, exclude_none: bool = True) -> Dict[str, Any]: 154 | """Converts the instance to a dictionary. 155 | 156 | Args: 157 | exclude_none: If True, only include non-empty values in the response. 158 | 159 | Returns: 160 | Dict[str, Any]: The dictionary representation of the instance. 161 | """ 162 | 163 | def _remove_none(data: Any) -> Any: 164 | """Recursively removes None or empty values from a dictionary or list.""" 165 | if isinstance(data, dict): 166 | return {k: _remove_none(v) for k, v in data.items() if v is not None} 167 | elif isinstance(data, list): 168 | return [_remove_none(item) for item in data] 169 | return data 170 | 171 | result = asdict(self) 172 | return _remove_none(result) if exclude_none else result 173 | 174 | def to_json(self, exclude_none: bool = True) -> str: 175 | """Converts the instance to a JSON string. 176 | 177 | Args: 178 | exclude_none: If True, only include non-empty values in the response. 179 | 180 | Returns: 181 | str: The JSON string representation of the instance. 182 | """ 183 | return json.dumps(self.to_dict(exclude_none=exclude_none), indent=2) 184 | 185 | 186 | def flatten_names_dictionary(names_dict: dict[str, Name]) -> dict[str, str]: 187 | """ 188 | Flattens a dictionary which contains Name objects into a flattened dictionary 189 | with DCIDs as keys and names as values. 190 | 191 | Args: 192 | names_dict (dict[str, Name]): The input dictionary to flatten. 193 | 194 | Returns: 195 | dict[str, str]: A flattened dictionary with DCIDs as keys and names as values. 196 | """ 197 | 198 | return {dcid: name.to_dict()['value'] for dcid, name in names_dict.items()} 199 | -------------------------------------------------------------------------------- /datacommons_client/endpoints/observation.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from datacommons_client.endpoints.base import API 4 | from datacommons_client.endpoints.base import Endpoint 5 | from datacommons_client.endpoints.payloads import ObservationRequestPayload 6 | from datacommons_client.endpoints.response import ObservationResponse 7 | from datacommons_client.models.observation import ObservationDate 8 | from datacommons_client.models.observation import ObservationSelect 9 | from datacommons_client.utils.data_processing import group_variables_by_entity 10 | 11 | 12 | class ObservationEndpoint(Endpoint): 13 | """ 14 | A class to interact with the observation API endpoint. 15 | 16 | Args: 17 | api (API): The API instance providing the environment configuration 18 | (base URL, headers, authentication) to be used for requests. 
19 | """ 20 | 21 | def __init__(self, api: API): 22 | """Initializes the ObservationEndpoint instance.""" 23 | super().__init__(endpoint="observation", api=api) 24 | 25 | def fetch( 26 | self, 27 | variable_dcids: str | list[str], 28 | date: ObservationDate | str = ObservationDate.LATEST, 29 | select: Optional[list[ObservationSelect | str]] = None, 30 | entity_dcids: Optional[str | list[str]] = None, 31 | entity_expression: Optional[str] = None, 32 | filter_facet_domains: Optional[str | list[str]] = None, 33 | filter_facet_ids: Optional[str | list[str]] = None 34 | ) -> ObservationResponse: 35 | """ 36 | Fetches data from the observation endpoint. 37 | 38 | Args: 39 | variable_dcids (str | list[str]): One or more variable IDs for the data. 40 | date (str | ObservationDate): The date for which data is being requested. 41 | Defaults to the latest observation. 42 | select (list[ObservationSelect]): Fields to include in the response. 43 | Defaults to ["date", "variable", "entity", "value"]. 44 | entity_dcids (Optional[str | list[str]]): One or more entity IDs to filter the data. 45 | entity_expression (Optional[str]): A string expression to filter entities. 46 | filter_facet_domains (Optional[str | list[str]]): One or more domain names to filter the data. 47 | filter_facet_ids (Optional[str | list[str]]): One or more facet IDs to filter the data. 48 | 49 | Returns: 50 | ObservationResponse: The response object containing observations for the specified query. 51 | """ 52 | # Construct the payload 53 | payload = ObservationRequestPayload( 54 | date=date, 55 | variable_dcids=variable_dcids, 56 | select=select, 57 | entity_dcids=entity_dcids, 58 | entity_expression=entity_expression, 59 | filter_facet_domains=filter_facet_domains, 60 | filter_facet_ids=filter_facet_ids, 61 | ).to_dict() 62 | 63 | response = self.post(payload=payload) 64 | 65 | # Send the request 66 | return ObservationResponse.model_validate(response) 67 | 68 | def fetch_observations_by_entity_type( 69 | self, 70 | date: ObservationDate | str, 71 | parent_entity: str, 72 | entity_type: str, 73 | variable_dcids: str | list[str], 74 | *, 75 | select: Optional[list[ObservationSelect | str]] = None, 76 | filter_facet_domains: Optional[str | list[str]] = None, 77 | filter_facet_ids: Optional[str | list[str]] = None 78 | ) -> ObservationResponse: 79 | """ 80 | Fetches all observations for a given entity type. 81 | 82 | Args: 83 | date (ObservationDate | str): The date option for the observations. 84 | Use 'all' for all dates, 'latest' for the most recent data, 85 | or provide a date as a string (e.g., "2024"). 86 | parent_entity (str): The parent entity under which the target entities fall. 87 | For example, "africa" for African countries, or "Earth" for all countries. 88 | entity_type (str): The type of entities for which to fetch observations. 89 | For example, "Country" or "Region". 90 | variable_dcids (str | list[str]): The variable(s) to fetch observations for. 91 | This can be a single variable ID or a list of IDs. 92 | select (Optional[list[ObservationSelect | str]]): Fields to include in the response. 93 | If not provided, defaults to ["date", "variable", "entity", "value"]. 94 | filter_facet_domains: Optional[str | list[str]: One or more domain names to filter the data. 95 | filter_facet_ids: Optional[str | list[str]: One or more facet IDs to filter the data. 96 | 97 | Returns: 98 | ObservationResponse: The response object containing observations for the specified entity type. 
99 | 100 | Example: 101 | To fetch all observations for African countries for a specific variable: 102 | 103 | ```python 104 | api = API() 105 | ObservationEndpoint(api).fetch_observations_by_entity_type( 106 | date="all", 107 | parent_entity="africa", 108 | entity_type="Country", 109 | variable_dcids="sdg/SI_POV_DAY1" 110 | ) 111 | ``` 112 | """ 113 | 114 | return self.fetch( 115 | variable_dcids=variable_dcids, 116 | date=date, 117 | select=[s for s in ObservationSelect] if not select else select, 118 | entity_expression= 119 | f"{parent_entity}<-containedInPlace+{{typeOf:{entity_type}}}", 120 | filter_facet_domains=filter_facet_domains, 121 | filter_facet_ids=filter_facet_ids) 122 | 123 | def fetch_observations_by_entity_dcid( 124 | self, 125 | date: ObservationDate | str, 126 | entity_dcids: str | list[str], 127 | variable_dcids: str | list[str], 128 | *, 129 | select: Optional[list[ObservationSelect | str]] = None, 130 | filter_facet_domains: Optional[str | list[str]] = None, 131 | filter_facet_ids: Optional[str | list[str]] = None 132 | ) -> ObservationResponse: 133 | """ 134 | Fetches all observations for the given entity DCIDs. 135 | 136 | Args: 137 | date (ObservationDate | str): The date option for the observations. 138 | Use 'all' for all dates, 'latest' for the most recent data, 139 | or provide a date as a string (e.g., "2024"). 140 | entity_dcids (str | list[str]): One or more entity IDs to filter the data. 141 | variable_dcids (str | list[str]): The variable(s) to fetch observations for. 142 | This can be a single variable ID or a list of IDs. 143 | select (Optional[list[ObservationSelect | str]]): Fields to include in the response. 144 | If not provided, defaults to ["date", "variable", "entity", "value"]. 145 | filter_facet_domains (Optional[str | list[str]]): One or more domain names to filter the data. 146 | filter_facet_ids (Optional[str | list[str]]): One or more facet IDs to filter the data. 147 | 148 | Returns: 149 | ObservationResponse: The response object containing observations for the specified entities. 150 | 151 | Example: 152 | To fetch all observations for Nigeria for a specific variable: 153 | 154 | ```python 155 | api = API() 156 | ObservationEndpoint(api).fetch_observations_by_entity_dcid( 157 | date="all", 158 | entity_dcids="country/NGA", 159 | variable_dcids="sdg/SI_POV_DAY1" 160 | ) 161 | ``` 162 | """ 163 | 164 | return self.fetch( 165 | variable_dcids=variable_dcids, 166 | date=date, 167 | select=[s for s in ObservationSelect] if not select else select, 168 | entity_dcids=entity_dcids, 169 | filter_facet_domains=filter_facet_domains, 170 | filter_facet_ids=filter_facet_ids) 171 | 172 | def fetch_available_statistical_variables( 173 | self, 174 | entity_dcids: str | list[str], 175 | ) -> dict[str, list[str]]: 176 | """ 177 | Fetches available statistical variables (which have observations) for given entities. 178 | Args: 179 | entity_dcids (str | list[str]): One or more entity DCID(s) to fetch variables for. 180 | Returns: 181 | dict[str, list[str]]: A dictionary mapping entity DCIDs to their available statistical variables. 182 | """ 183 | 184 | # Fetch observations for the given entity DCIDs. If the variable list is empty, 185 | # all available variables are retrieved.
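# (Illustrative shape, mirroring the unit tests: `get_data_by_entity()` below
# yields a variable -> entities mapping such as {"var1": ["ent1", "ent2"]},
# which `group_variables_by_entity` inverts to {"ent1": ["var1"], ...}.)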
186 | data = self.fetch( 187 | entity_dcids=entity_dcids, 188 | select=[ObservationSelect.VARIABLE, ObservationSelect.ENTITY], 189 | variable_dcids=[]).get_data_by_entity() 190 | 191 | return group_variables_by_entity(data=data) 192 | -------------------------------------------------------------------------------- /datacommons/test/node_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import unittest 16 | from unittest.mock import patch 17 | 18 | import datacommons 19 | 20 | 21 | class TestProperties(unittest.TestCase): 22 | 23 | @patch("datacommons.node._post") 24 | def test_with_data(self, _post): 25 | 26 | def side_effect(path, data): 27 | if path == "/v2/node" and data == { 28 | "nodes": ["City", "Count_Person", "foo"], 29 | "property": "->" 30 | }: 31 | return { 32 | "data": { 33 | "City": { 34 | "properties": [ 35 | "name", "provenance", "subClassOf", "typeOf" 36 | ] 37 | }, 38 | "Count_Person": { 39 | "properties": [ 40 | "description", "measuredProperty", "memberOf", "name", 41 | "populationType", "provenance", "statType", "typeOf" 42 | ] 43 | }, 44 | "foo": {} 45 | } 46 | } 47 | 48 | _post.side_effect = side_effect 49 | response = datacommons.properties(["City", "Count_Person", "foo"]) 50 | assert response == { 51 | "City": ["name", "provenance", "subClassOf", "typeOf"], 52 | "Count_Person": [ 53 | "description", "measuredProperty", "memberOf", "name", 54 | "populationType", "provenance", "statType", "typeOf" 55 | ], 56 | "foo": [] 57 | } 58 | 59 | @patch("datacommons.node._post") 60 | def test_with_direction(self, _post): 61 | 62 | def side_effect(path, data): 63 | if path == "/v2/node" and data == { 64 | "nodes": ["City", "Count_Person", "foo"], 65 | "property": "<-" 66 | }: 67 | return { 68 | "data": { 69 | "City": { 70 | "properties": [ 71 | "placeType", "rangeIncludes", "schoolLocationType", 72 | "typeOf" 73 | ] 74 | }, 75 | "Count_Person": { 76 | "properties": [ 77 | "measurementDenominator", "outputProperty", 78 | "relevantVariable" 79 | ] 80 | }, 81 | "foo": {} 82 | } 83 | } 84 | 85 | _post.side_effect = side_effect 86 | response = datacommons.properties(["City", "Count_Person", "foo"], 87 | is_out=False) 88 | assert response == { 89 | "City": ["placeType", "rangeIncludes", "schoolLocationType", "typeOf"], 90 | "Count_Person": [ 91 | "measurementDenominator", "outputProperty", "relevantVariable" 92 | ], 93 | "foo": [] 94 | } 95 | 96 | 97 | class TestPropertyValues(unittest.TestCase): 98 | 99 | @patch("datacommons.node._post") 100 | def test_with_data(self, _post): 101 | 102 | def side_effect(path, data): 103 | print(path) 104 | if path == "/v1/bulk/property/values/out" and data == { 105 | "nodes": ["geoId/06"], 106 | "property": "name", 107 | }: 108 | return { 109 | "data": [{ 110 | "node": 111 | "geoId/06", 112 | "values": [{ 113 | "provenanceId": "dc/5n63hr1", 114 | "value": "California" 115 | }] 116 | }] 
117 | } 118 | 119 | _post.side_effect = side_effect 120 | response = datacommons.property_values(["geoId/06"], "name") 121 | assert response == {"geoId/06": ["California"]} 122 | 123 | @patch("datacommons.node._post") 124 | def test_multiple_values(self, _post): 125 | 126 | def side_effect(path, data): 127 | print(path) 128 | if path == "/v1/bulk/property/values/out" and data == { 129 | "nodes": ["geoId/06"], 130 | "property": "geoOverlaps", 131 | }: 132 | return { 133 | "data": [{ 134 | "node": 135 | "geoId/06", 136 | "values": [{ 137 | "provenanceId": "dc/5n63hr1", 138 | "value": "geoId/05" 139 | }, { 140 | "provenanceId": "dc/5n63hr1", 141 | "value": "geoId/07" 142 | }] 143 | }] 144 | } 145 | 146 | _post.side_effect = side_effect 147 | response = datacommons.property_values(["geoId/06"], "geoOverlaps") 148 | assert response == {"geoId/06": ["geoId/05", "geoId/07"]} 149 | 150 | 151 | class TestTriples(unittest.TestCase): 152 | 153 | @patch("datacommons.node._post") 154 | def test_with_data(self, _post): 155 | 156 | def side_effect(path, data): 157 | print(path) 158 | if path == "/v1/bulk/triples/out" and data == { 159 | "nodes": ["Class"], 160 | }: 161 | return { 162 | "data": [{ 163 | "node": "Class", 164 | "triples": { 165 | "typeOf": { 166 | "nodes": [{ 167 | "name": "Class", 168 | "types": ["Class"], 169 | "dcid": "Class", 170 | "provenanceId": "dc/5l5zxr1" 171 | }, { 172 | "name": "Class", 173 | "types": ["Class"], 174 | "dcid": "Class", 175 | "provenanceId": "dc/5l5zxr1" 176 | }] 177 | }, 178 | "isPartOf": { 179 | "nodes": [{ 180 | "provenanceId": "dc/5l5zxr1", 181 | "value": "http://meta.schema.org" 182 | }] 183 | }, 184 | "name": { 185 | "nodes": [{ 186 | "provenanceId": "dc/5l5zxr1", 187 | "value": "Class" 188 | }] 189 | }, 190 | "provenance": { 191 | "nodes": [{ 192 | "name": "BaseSchema", 193 | "types": ["Provenance"], 194 | "dcid": "dc/5l5zxr1", 195 | "provenanceId": "dc/5l5zxr1" 196 | }] 197 | }, 198 | "sameAs": { 199 | "nodes": [{ 200 | "provenanceId": "dc/5l5zxr1", 201 | "value": "http://www.w3.org/2000/01/rdf-schema" 202 | }] 203 | }, 204 | "subClassOf": { 205 | "nodes": [{ 206 | "name": "Intangible", 207 | "types": ["Class"], 208 | "dcid": "Intangible", 209 | "provenanceId": "dc/5l5zxr1" 210 | }] 211 | } 212 | } 213 | }] 214 | } 215 | 216 | _post.side_effect = side_effect 217 | response = datacommons.triples(["Class"]) 218 | assert response == { 219 | "Class": { 220 | 'isPartOf': [{ 221 | 'provenanceId': 'dc/5l5zxr1', 222 | 'value': 'http://meta.schema.org' 223 | }], 224 | 'name': [{ 225 | 'provenanceId': 'dc/5l5zxr1', 226 | 'value': 'Class' 227 | }], 228 | 'provenance': [{ 229 | 'dcid': 'dc/5l5zxr1', 230 | 'name': 'BaseSchema', 231 | 'provenanceId': 'dc/5l5zxr1', 232 | 'types': ['Provenance'] 233 | }], 234 | 'sameAs': [{ 235 | 'provenanceId': 'dc/5l5zxr1', 236 | 'value': 'http://www.w3.org/2000/01/rdf-schema' 237 | }], 238 | 'subClassOf': [{ 239 | 'dcid': 'Intangible', 240 | 'name': 'Intangible', 241 | 'provenanceId': 'dc/5l5zxr1', 242 | 'types': ['Class'] 243 | }], 244 | 'typeOf': [{ 245 | 'dcid': 'Class', 246 | 'name': 'Class', 247 | 'provenanceId': 'dc/5l5zxr1', 248 | 'types': ['Class'] 249 | }, { 250 | 'dcid': 'Class', 251 | 'name': 'Class', 252 | 'provenanceId': 'dc/5l5zxr1', 253 | 'types': ['Class'] 254 | }] 255 | }, 256 | } 257 | -------------------------------------------------------------------------------- /datacommons/core.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """ Data Commons Python API Core.
15 | 
16 | Provides primitive operations for working with collections of nodes. For a
17 | collection of nodes identified by their dcids, this submodule implements the
18 | following:
19 | 
20 | - Getting all property labels
21 | - Getting all property values
22 | - Getting all triples
23 | """
24 | 
25 | from __future__ import absolute_import
26 | from __future__ import division
27 | from __future__ import print_function
28 | 
29 | from collections import defaultdict
30 | 
31 | import datacommons.utils as utils
32 | 
33 | # ----------------------------- WRAPPER FUNCTIONS -----------------------------
34 | 
35 | 
36 | def get_property_labels(dcids, out=True):
37 |   """ Returns the labels of properties defined for the given :code:`dcids`.
38 | 
39 |   Args:
40 |     dcids (:obj:`iterable` of :obj:`str`): A list of nodes identified by their
41 |       dcids.
42 |     out (:obj:`bool`, optional): Whether or not the property points away from
43 |       the given list of nodes.
44 | 
45 |   Returns:
46 |     A :obj:`dict` mapping dcids to lists of property labels. If `out` is `True`,
47 |     then property labels correspond to edges directed away from given nodes.
48 |     Otherwise, they correspond to edges directed towards the given nodes.
49 | 
50 |   Raises:
51 |     ValueError: If the payload returned by the Data Commons REST API is
52 |       malformed.
53 | 
54 |   Examples:
55 |     To get all outgoing property labels for
56 |     California (`geoId/06`) and
57 |     Colorado (`geoId/08`), we can write
58 |     the following.
59 | 
60 |     >>> get_property_labels(['geoId/06', 'geoId/08'])
61 |     {
62 |       "geoId/06": [
63 |         "containedInPlace",
64 |         "geoId",
65 |         "kmlCoordinates",
66 |         "name",
67 |         "provenance",
68 |         "typeOf"
69 |       ],
70 |       "geoId/08": [
71 |         "containedInPlace",
72 |         "geoId",
73 |         "kmlCoordinates",
74 |         "name",
75 |         "provenance",
76 |         "typeOf"
77 |       ]
78 |     }
79 | 
80 |     We can also get incoming property labels by setting `out=False`.
81 | 
82 |     >>> get_property_labels(['geoId/06', 'geoId/08'], out=False)
83 |     {
84 |       "geoId/06": [
85 |         "addressRegion",
86 |         "containedInPlace",
87 |         "location",
88 |         "overlapsWith"
89 |       ],
90 |       "geoId/08": [
91 |         "addressRegion",
92 |         "containedInPlace",
93 |         "location",
94 |         "overlapsWith"
95 |       ]
96 |     }
97 |   """
98 |   # Generate the GetProperty query and send the request
99 |   dcids = filter(lambda v: v == v, dcids)  # Filter out NaN values
100 |   dcids = list(dcids)
101 |   url = utils._API_ROOT + utils._API_ENDPOINTS['get_property_labels']
102 |   payload = utils._send_request(url, req_json={'dcids': dcids})
103 | 
104 |   # Return the results based on the orientation
105 |   results = {}
106 |   for dcid in dcids:
107 |     if out:
108 |       results[dcid] = payload[dcid]['outLabels']
109 |     else:
110 |       results[dcid] = payload[dcid]['inLabels']
111 |   return results
112 | 
113 | 
114 | def get_property_values(dcids,
115 |                         prop,
116 |                         out=True,
117 |                         value_type=None,
118 |                         limit=utils._MAX_LIMIT):
119 |   """ Returns property values of given :code:`dcids` along the given property.
120 | 
121 |   Args:
122 |     dcids (:obj:`iterable` of :obj:`str`): dcids to get property values for.
123 |     prop (:obj:`str`): The property to get property values for.
124 |     out (:obj:`bool`, optional): A flag that indicates the property is directed
125 |       away from the given nodes when set to true.
126 |     value_type (:obj:`str`, optional): A type to filter returned property values
127 |       by.
128 |     limit (:obj:`int`, optional): The maximum number of property values returned
129 |       aggregated over all given nodes.
130 | 
131 |   Returns:
132 |     Returned property values are formatted as a :obj:`dict` from a given dcid
133 |     to a list of its property values.
134 | 
135 |   Raises:
136 |     ValueError: If the payload returned by the Data Commons REST API is
137 |       malformed.
138 | 
139 |   Examples:
140 |     We would like to get the `name` of a list of states specified by their dcid:
141 |     `geoId/06` (California),
142 |     `geoId/21` (Kentucky), and
143 |     `geoId/24` (Maryland).
144 | 
145 |     First, let's try specifying the :code:`dcids` as a :obj:`list` of
146 |     :obj:`str`.
147 | 
148 |     >>> get_property_values(["geoId/06", "geoId/21", "geoId/24"], "name")
149 |     {
150 |       "geoId/06": ["California"],
151 |       "geoId/21": ["Kentucky"],
152 |       "geoId/24": ["Maryland"],
153 |     }
154 |   """
155 |   # Convert the dcids field and format the request to GetPropertyValue
156 |   dcids = filter(lambda v: v == v, dcids)  # Filter out NaN values
157 |   dcids = list(dcids)
158 |   if out:
159 |     direction = 'out'
160 |   else:
161 |     direction = 'in'
162 | 
163 |   req_json = {
164 |       'dcids': dcids,
165 |       'property': prop,
166 |       'limit': limit,
167 |       'direction': direction
168 |   }
169 |   if value_type:
170 |     req_json['value_type'] = value_type
171 | 
172 |   # Send the request
173 |   url = utils._API_ROOT + utils._API_ENDPOINTS['get_property_values']
174 |   payload = utils._send_request(url, req_json=req_json)
175 | 
176 |   # Create the result format for when dcids is provided as a list.
177 |   unique_results = defaultdict(set)
178 |   for dcid in dcids:
179 |     # Get the list of nodes based on the direction given.
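    # (The payload maps each dcid to a dict keyed by direction, 'out' or 'in'.)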
180 |     nodes = []
181 |     if out:
182 |       if dcid in payload and 'out' in payload[dcid]:
183 |         nodes = payload[dcid]['out']
184 |     else:
185 |       if dcid in payload and 'in' in payload[dcid]:
186 |         nodes = payload[dcid]['in']
187 | 
188 |     # Add each node's dcid or value to unique_results
189 |     for node in nodes:
190 |       if 'dcid' in node:
191 |         unique_results[dcid].add(node['dcid'])
192 |       elif 'value' in node:
193 |         unique_results[dcid].add(node['value'])
194 | 
195 |   # Make sure each dcid is in the results dict, and convert all sets to lists.
196 |   results = {dcid: sorted(list(unique_results[dcid])) for dcid in dcids}
197 | 
198 |   return results
199 | 
200 | 
201 | def get_triples(dcids, limit=utils._MAX_LIMIT):
202 |   """ Returns all triples associated with the given :code:`dcids`.
203 | 
204 |   A knowledge graph can be described as a collection of `triples` which are
205 |   3-tuples that take the form `(s, p, o)`. Here `s` and `o` are nodes in the
206 |   graph called the *subject* and *object* respectively while `p` is the property
207 |   label of a directed edge from `s` to `o` (sometimes also called the
208 |   *predicate*).
209 | 
210 |   Args:
211 |     dcids (:obj:`iterable` of :obj:`str`): A list of dcids to get triples for.
212 |     limit (:obj:`int`, optional): The maximum total number of triples to get.
213 | 
214 |   Returns:
215 |     A :obj:`dict` mapping dcids to a :obj:`list` of triples `(s, p, o)` where
216 |     `s`, `p`, and `o` are instances of :obj:`str` and either the subject
217 |     or object is the mapped dcid.
218 | 
219 |   Raises:
220 |     ValueError: If the payload returned by the Data Commons REST API is
221 |       malformed.
222 | 
223 |   Examples:
224 |     We would like to get five triples associated with
225 |     California (`geoId/06`).
226 | 
227 |     >>> get_triples(["geoId/06"], limit=5)
228 |     {
229 |       "geoId/06": [
230 |         ("geoId/06", "name", "California"),
231 |         ("geoId/06", "typeOf", "State"),
232 |         ("geoId/06", "geoId", "06"),
233 |         ("geoId/0687056", "containedInPlace", "geoId/06"),
234 |         ("geoId/0686440", "containedInPlace", "geoId/06")
235 |       ]
236 |     }
237 |   """
238 |   # Generate the GetTriple query and send the request.
239 |   dcids = filter(lambda v: v == v, dcids)  # Filter out NaN values
240 |   dcids = list(dcids)
241 |   url = utils._API_ROOT + utils._API_ENDPOINTS['get_triples']
242 |   payload = utils._send_request(url, req_json={'dcids': dcids, 'limit': limit})
243 | 
244 |   # Create a map from dcid to list of triples.
245 |   results = defaultdict(list)
246 |   for dcid in dcids:
247 |     # Make sure each dcid is mapped to an empty list.
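    # (The bare lookup below inserts the key into the defaultdict, so dcids
    # with no triples still appear in the returned dict.)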
248 | results[dcid] 249 | 250 | # Add triples as appropriate 251 | for t in payload[dcid]: 252 | if 'objectId' in t: 253 | results[dcid].append((t['subjectId'], t['predicate'], t['objectId'])) 254 | elif 'objectValue' in t: 255 | results[dcid].append((t['subjectId'], t['predicate'], t['objectValue'])) 256 | return dict(results) 257 | -------------------------------------------------------------------------------- /datacommons_client/client.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Optional 2 | 3 | from datacommons_client.endpoints.base import API 4 | from datacommons_client.endpoints.node import NodeEndpoint 5 | from datacommons_client.endpoints.observation import ObservationEndpoint 6 | from datacommons_client.endpoints.resolve import ResolveEndpoint 7 | from datacommons_client.models.observation import ObservationDate 8 | from datacommons_client.utils.dataframes import add_entity_names_to_observations_dataframe 9 | from datacommons_client.utils.dataframes import add_property_constraints_to_observations_dataframe 10 | from datacommons_client.utils.decorators import requires_pandas 11 | from datacommons_client.utils.error_handling import NoDataForPropertyError 12 | 13 | try: 14 | import pandas as pd 15 | except ImportError: 16 | pd = None 17 | 18 | 19 | class DataCommonsClient: 20 | """ 21 | A client for interacting with the Data Commons API. 22 | 23 | This class provides convenient access to the V2 Data Commons API endpoints. 24 | 25 | Attributes: 26 | api (API): An instance of the API class that handles requests. 27 | node (NodeEndpoint): Provides access to node-related queries, such as fetching property labels 28 | and values for individual or multiple nodes in the Data Commons knowledge graph. 29 | observation (ObservationEndpoint): Handles observation-related queries, allowing retrieval of 30 | statistical observations associated with entities, variables, and dates (e.g., GDP of California in 2010). 31 | resolve (ResolveEndpoint): Manages resolution queries to find different DCIDs for entities. 32 | 33 | """ 34 | 35 | def __init__(self, 36 | api_key: Optional[str] = None, 37 | *, 38 | dc_instance: Optional[str] = "datacommons.org", 39 | url: Optional[str] = None, 40 | surface_header_value: Optional[str] = None): 41 | """ 42 | Initializes the DataCommonsClient. 43 | 44 | Args: 45 | api_key (Optional[str]): The API key for authentication. Defaults to None. Note that 46 | custom DC instances do not currently require an API key. 47 | dc_instance (Optional[str]): The Data Commons instance to use. Defaults to "datacommons.org". 48 | url (Optional[str]): A custom, fully resolved URL for the Data Commons API. Defaults to None. 
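            surface_header_value (Optional[str]): An optional value forwarded to the
                underlying API instance, typically used to identify the calling surface
                in request headers. Defaults to None.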
49 |     """
50 |     # If a fully resolved URL is provided, and the default dc_instance is used,
51 |     # ignore that default value
52 |     if dc_instance == "datacommons.org" and url:
53 |       dc_instance = None
54 | 
55 |     # Create an instance of the API class which will be injected into the endpoints
56 |     self.api = API(api_key=api_key,
57 |                    dc_instance=dc_instance,
58 |                    url=url,
59 |                    surface_header_value=surface_header_value)
60 | 
61 |     # Create instances of the endpoints
62 |     self.node = NodeEndpoint(api=self.api)
63 |     self.observation = ObservationEndpoint(api=self.api)
64 |     self.resolve = ResolveEndpoint(api=self.api)
65 | 
66 |   def _find_filter_facet_ids(
67 |       self,
68 |       fetch_by: Literal["entity", "entity_type"],
69 |       date: ObservationDate | str,
70 |       variable_dcids: str | list[str],
71 |       entity_dcids: Literal["all"] | list[str] = "all",
72 |       entity_type: Optional[str] = None,
73 |       parent_entity: Optional[str] = None,
74 |       property_filters: Optional[dict[str, str | list[str]]] = None,
75 |   ) -> list[str] | None:
76 |     """Finds matching facet IDs for property filters.
77 | 
78 |     Args:
79 |         fetch_by (Literal["entity", "entity_type"]): Determines whether to fetch by entity or entity type.
80 |         variable_dcids (str | list[str]): The variable DCIDs for which to retrieve facet IDs.
81 |         entity_dcids (Literal["all"] | list[str], optional): The entity DCIDs, or "all" if filtering by entity type.
82 |         entity_type (Optional[str]): The entity type, required if fetching by entity type.
83 |         parent_entity (Optional[str]): The parent entity, used when fetching by entity type.
84 |         property_filters (Optional[dict[str, str | list[str]]]): A dictionary of properties to match facets against.
85 | 
86 |     Returns:
87 |         list[str] | None: A list of matching facet IDs, or None if no filters are applied.
88 |     """
89 | 
90 |     if not property_filters:
91 |       return None
92 | 
93 |     if fetch_by == "entity":
94 |       observations = self.observation.fetch_observations_by_entity_dcid(
95 |           date=date,
96 |           entity_dcids=entity_dcids,
97 |           variable_dcids=variable_dcids,
98 |           select=["variable", "entity", "facet"],
99 |       )
100 |     else:
101 |       observations = self.observation.fetch_observations_by_entity_type(
102 |           date=date,
103 |           entity_type=entity_type,
104 |           parent_entity=parent_entity,
105 |           variable_dcids=variable_dcids,
106 |           select=["variable", "entity", "facet"],
107 |       )
108 | 
109 |     facet_sets = [
110 |         observations.find_matching_facet_id(property_name=p, value=v)
111 |         for p, v in property_filters.items()
112 |     ]
113 | 
114 |     facet_ids = list({facet for facets in facet_sets for facet in facets})
115 | 
116 |     return facet_ids
117 | 
118 |   @requires_pandas
119 |   def observations_dataframe(
120 |       self,
121 |       variable_dcids: str | list[str],
122 |       date: ObservationDate | str,
123 |       entity_dcids: Literal["all"] | list[str] = "all",
124 |       entity_type: Optional[str] = None,
125 |       parent_entity: Optional[str] = None,
126 |       property_filters: Optional[dict[str, str | list[str]]] = None,
127 |       include_constraints_metadata: bool = False,
128 |   ):
129 |     """
130 |     Fetches statistical observations and returns them as a Pandas DataFrame.
131 | 
132 |     The Observation API fetches statistical observations linked to entities and variables
133 |     at a particular date (e.g., "population of USA in 2020", "GDP of California in 2010").
134 | 
135 |     Args:
136 |         variable_dcids (str | list[str]): One or more variable DCIDs for the observation.
137 |         date (ObservationDate | str): The date for which observations are requested.
It can be
138 |             a specific date, "all" to retrieve all observations, or "latest" to get the most recent observations.
139 |         entity_dcids (Literal["all"] | list[str], optional): The entity DCIDs for which to retrieve data.
140 |             Defaults to "all".
141 |         entity_type (Optional[str]): The type of entities to filter by when `entity_dcids="all"`.
142 |             Required if `entity_dcids="all"`. Defaults to None.
143 |         parent_entity (Optional[str]): The parent entity under which the target entities fall.
144 |             Required if `entity_dcids="all"`. Defaults to None.
145 |         property_filters (Optional[dict[str, str | list[str]]]): An optional dictionary used to filter
146 |             the data by using observation properties like `measurementMethod`, `unit`, or `observationPeriod`.
147 |         include_constraints_metadata (bool): If True, includes the dcid and name of any constraint
148 |             properties associated with the variable DCIDs (based on the `constraintProperties` property)
149 |             in the returned DataFrame. Defaults to False.
150 | 
151 |     Returns:
152 |         pd.DataFrame: A DataFrame containing the requested observations.
153 |     """
154 | 
155 |     if entity_dcids == "all" and not (entity_type and parent_entity):
156 |       raise ValueError(
157 |           "When 'entity_dcids' is 'all', both 'parent_entity' and 'entity_type' must be specified."
158 |       )
159 | 
160 |     if entity_dcids != "all" and (entity_type or parent_entity):
161 |       raise ValueError(
162 |           "Specify 'entity_type' and 'parent_entity' only when 'entity_dcids' is 'all'."
163 |       )
164 | 
165 |     # If property filters are provided, fetch the required facet IDs. Otherwise, set to None.
166 |     facets = self._find_filter_facet_ids(
167 |         fetch_by="entity" if entity_dcids != "all" else "entity_type",
168 |         date=date,
169 |         variable_dcids=variable_dcids,
170 |         entity_dcids=entity_dcids,
171 |         entity_type=entity_type,
172 |         parent_entity=parent_entity,
173 |         property_filters=property_filters,
174 |     )
175 | 
176 |     if not facets and property_filters:
177 |       raise NoDataForPropertyError
178 | 
179 |     if entity_dcids == "all":
180 |       observations = self.observation.fetch_observations_by_entity_type(
181 |           date=date,
182 |           parent_entity=parent_entity,
183 |           entity_type=entity_type,
184 |           variable_dcids=variable_dcids,
185 |           filter_facet_ids=facets,
186 |       )
187 |     else:
188 |       observations = self.observation.fetch_observations_by_entity_dcid(
189 |           date=date,
190 |           entity_dcids=entity_dcids,
191 |           variable_dcids=variable_dcids,
192 |           filter_facet_ids=facets,
193 |       )
194 | 
195 |     # Convert the observations to a DataFrame
196 |     df = pd.DataFrame(observations.to_observation_records().model_dump())
197 | 
198 |     # Add entity names to the DataFrame
199 |     df = add_entity_names_to_observations_dataframe(
200 |         endpoint=self.node,
201 |         observations_df=df,
202 |         entity_columns=["entity", "variable"],
203 |     )
204 | 
205 |     if include_constraints_metadata:
206 |       df = add_property_constraints_to_observations_dataframe(
207 |           endpoint=self.node,
208 |           observations_df=df,
209 |       )
210 | 
211 |     return df
212 | 
--------------------------------------------------------------------------------
/datacommons/stat_vars.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Data Commons Python API Stat Module.
15 | 
16 | Provides functions for getting data on StatisticalVariables from the Data Commons Graph.
17 | """
18 | 
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 | 
23 | import collections
24 | 
25 | import six
26 | 
27 | import datacommons.utils as utils
28 | 
29 | # stat_var specific batch size.
30 | _STAT_BATCH_SIZE = 2000
31 | 
32 | 
33 | def get_stat_value(place,
34 |                    stat_var,
35 |                    date=None,
36 |                    measurement_method=None,
37 |                    observation_period=None,
38 |                    unit=None,
39 |                    scaling_factor=None):
40 |   """Returns a value for `place` based on the `stat_var`.
41 | 
42 |   Args:
43 |     place (`str`): The dcid of Place to query for.
44 |     stat_var (`str`): The dcid of the StatisticalVariable.
45 |     date (`str`): Optional, the preferred date of observation
46 |       in ISO 8601 format. If not specified, returns the latest observation.
47 |     measurement_method (`str`): Optional, the dcid of the preferred
48 |       `measurementMethod` value.
49 |     observation_period (`str`): Optional, the preferred
50 |       `observationPeriod` value.
51 |     unit (`str`): Optional, the dcid of the preferred `unit` value.
52 |     scaling_factor (`int`): Optional, the preferred `scalingFactor` value.
53 |   Returns:
54 |     A `float`, the value of `stat_var` for `place`, filtered
55 |     by optional args. If there is no data, returns NaN.
56 | 
57 |   Raises:
58 |     ValueError: If the payload returned by the Data Commons REST API is
59 |       malformed.
60 | 
61 |   Examples:
62 |     >>> get_stat_value("geoId/05", "Count_Person")
63 |     366331
64 |   """
65 |   url = utils._API_ROOT + utils._API_ENDPOINTS['get_stat_value']
66 |   url += '?place={}&stat_var={}'.format(place, stat_var)
67 |   if date:
68 |     url += '&date={}'.format(date)
69 |   if measurement_method:
70 |     url += '&measurement_method={}'.format(measurement_method)
71 |   if observation_period:
72 |     url += '&observation_period={}'.format(observation_period)
73 |   if unit:
74 |     url += '&unit={}'.format(unit)
75 |   if scaling_factor:
76 |     url += '&scaling_factor={}'.format(scaling_factor)
77 | 
78 |   try:
79 |     res_json = utils._send_request(url, post=False, use_payload=False)
80 |   except ValueError:
81 |     return float('nan')
82 |   if 'value' not in res_json:
83 |     return float('nan')
84 |   return res_json['value']
85 | 
86 | 
87 | def get_stat_series(place,
88 |                     stat_var,
89 |                     measurement_method=None,
90 |                     observation_period=None,
91 |                     unit=None,
92 |                     scaling_factor=None):
93 |   """Returns a `dict` mapping dates to value of `stat_var` for `place`.
94 | 
95 |   Args:
96 |     place (`str`): The dcid of Place to query for.
97 |     stat_var (`str`): The dcid of the StatisticalVariable.
98 |     measurement_method (`str`): Optional, the dcid of the preferred
99 |       `measurementMethod` value.
100 |     observation_period (`str`): Optional, the preferred
101 |       `observationPeriod` value.
102 |     unit (`str`): Optional, the dcid of the preferred `unit` value.
103 |     scaling_factor (`int`): Optional, the preferred `scalingFactor` value.
104 |   Returns:
105 |     A `dict` mapping dates to value of `stat_var` for `place`,
106 |     representing a time series that satisfies all input parameters.
107 | 
108 |   Raises:
109 |     ValueError: If the payload returned by the Data Commons REST API is
110 |       malformed.
111 | 
112 |   Examples:
113 |     >>> get_stat_series("geoId/05", "Count_Person")
114 |     {"1962":17072000,"2009":36887615,"1929":5531000,"1930":5711000}
115 |   """
116 |   url = utils._API_ROOT + utils._API_ENDPOINTS['get_stat_series']
117 |   url += '?place={}&stat_var={}'.format(place, stat_var)
118 |   if measurement_method:
119 |     url += '&measurement_method={}'.format(measurement_method)
120 |   if observation_period:
121 |     url += '&observation_period={}'.format(observation_period)
122 |   if unit:
123 |     url += '&unit={}'.format(unit)
124 |   if scaling_factor:
125 |     url += '&scaling_factor={}'.format(scaling_factor)
126 | 
127 |   try:
128 |     res_json = utils._send_request(url, post=False, use_payload=False)
129 |   except ValueError:
130 |     return {}
131 | 
132 |   if 'series' not in res_json:
133 |     return {}
134 |   return res_json['series']
135 | 
136 | 
137 | def get_stat_all(places, stat_vars):
138 |   """Returns a nested `dict` of all time series for `places` and `stat_vars`.
139 | 
140 |   Args:
141 |     places (`Iterable` of `str`): The dcids of Places to query for.
142 |     stat_vars (`Iterable` of `str`): The dcids of the StatisticalVariables.
143 |   Returns:
144 |     A nested `dict` mapping Places to StatisticalVariables and all available
145 |     time series for each Place and StatisticalVariable pair.
146 | 
147 |   Raises:
148 |     ValueError: If the payload returned by the Data Commons REST API is
149 |       malformed.
150 | 
151 |   Examples:
152 |     >>> get_stat_all(["geoId/05", "geoId/06"], ["Count_Person", "Count_Person_Male"])
153 |     {
154 |       "geoId/05": {
155 |         "Count_Person": {
156 |           "sourceSeries": [
157 |             {
158 |               "val": {
159 |                 "2010": 1633,
160 |                 "2011": 1509,
161 |                 "2012": 1581,
162 |               },
163 |               "observationPeriod": "P1Y",
164 |               "importName": "Wikidata",
165 |               "provenanceDomain": "wikidata.org"
166 |             },
167 |             {
168 |               "val": {
169 |                 "2010": 1333,
170 |                 "2011": 1309,
171 |                 "2012": 131,
172 |               },
173 |               "observationPeriod": "P1Y",
174 |               "importName": "CensusPEPSurvey",
175 |               "provenanceDomain": "census.gov"
176 |             }
177 |           ],
178 |         },
179 | 
180 |         "Count_Person_Male": {
181 |           "sourceSeries": [
182 |             {
183 |               "val": {
184 |                 "2010": 1633,
185 |                 "2011": 1509,
186 |                 "2012": 1581,
187 |               },
188 |               "observationPeriod": "P1Y",
189 |               "importName": "CensusPEPSurvey",
190 |               "provenanceDomain": "census.gov"
191 |             }
192 |           ],
193 |         }
194 |       },
195 |       "geoId/06": {
196 |         "Count_Person": {},
197 |         "Count_Person_Male": {
198 |           "sourceSeries": [
199 |             {
200 |               "val": {
201 |                 "2010": 13,
202 |                 "2011": 13,
203 |                 "2012": 322,
204 |               },
205 |               "observationPeriod": "P1Y",
206 |               "importName": "CensusPEPSurvey",
207 |               "provenanceDomain": "census.gov"
208 |             }
209 |           ]
210 |         }
211 |       }
212 |     }
213 |   """
214 |   url = utils._API_ROOT + utils._API_ENDPOINTS['get_stat_all']
215 |   # Cast iterable-like to list.
216 |   places = list(places)
217 |   stat_vars = list(stat_vars)
218 | 
219 |   # Aiming for _STAT_BATCH_SIZE entries total.
220 |   # _STAT_BATCH_SIZE = num places x num stat_vars, so aim for
221 |   # _STAT_BATCH_SIZE/len(stat_vars) places per batch.
222 |   places_per_batch = _STAT_BATCH_SIZE // len(stat_vars)
223 |   # Get number of batches via an arithmetic ceiling trick:
224 |   # 11//10 rounds down to 1.
225 |   # -11//10 rounds down to -2.
226 |   # We can divide the negated numerator, then negate the result to get the ceiling.
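  # For example, with 11 places and batches of 10: -(-11 // 10) == -(-2) == 2.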
227 | batches = -(-len(places) // places_per_batch) 228 | res = {} 229 | for i in range(batches): 230 | req_json = { 231 | 'stat_vars': stat_vars, 232 | 'places': places[i * places_per_batch:(i + 1) * places_per_batch] 233 | } 234 | # Send the request 235 | res_json = utils._send_request(url, req_json=req_json, use_payload=False) 236 | if 'placeData' not in res_json: 237 | # The REST API spec will always return a dictionary under 238 | # placeData, even if no places exist or have no 239 | # data. If no Places are provided, REST will return an 240 | # error, which will have been caught and passed on in 241 | # _send_request. 242 | raise ValueError("Unexpected response from REST stat/all API.") 243 | 244 | # Unnest the REST response for keys that have single-element values. 245 | place_statvar_series = collections.defaultdict(dict) 246 | for place_dcid, place in res_json['placeData'].items(): 247 | stat_var_data = place.get('statVarData') 248 | if not stat_var_data: 249 | # The REST API spec will always return a dictionary under 250 | # statVarData, even if no StatVars exist or have no 251 | # data. If no StatVars are provided, REST will return an 252 | # error, which will have been caught and passed on in 253 | # _send_request. 254 | raise ValueError("Unexpected response from REST stat/all API.") 255 | for stat_var_dcid, stat_var in stat_var_data.items(): 256 | place_statvar_series[place_dcid][stat_var_dcid] = stat_var 257 | res.update(dict(place_statvar_series)) 258 | 259 | return res 260 | -------------------------------------------------------------------------------- /datacommons_client/tests/utils/test_graph.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from unittest.mock import MagicMock 3 | 4 | from datacommons_client.models.node import Node 5 | from datacommons_client.utils.graph import _assemble_tree 6 | from datacommons_client.utils.graph import _fetch_relationship_uncached 7 | from datacommons_client.utils.graph import _postorder_nodes 8 | from datacommons_client.utils.graph import build_graph_map 9 | from datacommons_client.utils.graph import build_relationship_tree 10 | from datacommons_client.utils.graph import fetch_relationship_lru 11 | from datacommons_client.utils.graph import flatten_relationship 12 | 13 | 14 | def test_fetch_parents_uncached_returns_data(): 15 | """Test _fetch_parents_uncached delegates to endpoint correctly.""" 16 | endpoint = MagicMock() 17 | endpoint.fetch_place_parents.return_value.get.return_value = [ 18 | Node(dcid="parent1", name="Parent 1", types=["Country"]) 19 | ] 20 | 21 | result = _fetch_relationship_uncached(endpoint, 22 | "test_dcid", 23 | contained_type=None, 24 | relationship="parents") 25 | assert isinstance(result, list) 26 | assert result[0].dcid == "parent1" 27 | endpoint.fetch_place_parents.assert_called_once_with( 28 | "test_dcid", 29 | as_dict=False, 30 | ) 31 | 32 | 33 | def test_fetch_relationship_lru_caches_results(): 34 | """Test fetch_relationship_lru uses LRU cache and returns list.""" 35 | endpoint = MagicMock() 36 | endpoint.fetch_place_parents.return_value.get.return_value = [ 37 | Node(dcid="parentX", name="Parent X", types=["Region"]) 38 | ] 39 | 40 | result1 = fetch_relationship_lru(endpoint, 41 | "nodeA", 42 | contained_type=None, 43 | relationship="parents") 44 | result2 = fetch_relationship_lru(endpoint, 45 | "nodeA", 46 | contained_type=None, 47 | relationship="parents") 48 | fetch_relationship_lru(endpoint, 49 | "nodeA", 50 | 
contained_type=None, 51 | relationship="parents") 52 | 53 | assert isinstance(result1, list) 54 | assert result1[0].dcid == "parentX" 55 | assert result1 == result2 56 | assert endpoint.fetch_place_parents.call_count == 1 57 | 58 | 59 | def test_build_ancestry_map_linear_tree(): 60 | """A -> B -> C""" 61 | 62 | def fetch_mock(dcid): 63 | return { 64 | "C": [Node(dcid="B", name="Node B", types=["Type"])], 65 | "B": [Node(dcid="A", name="Node A", types=["Type"])], 66 | "A": [], 67 | }.get(dcid, []) 68 | 69 | root, graph = build_graph_map("C", fetch_mock, max_workers=2) 70 | 71 | assert root == "C" 72 | assert set(graph.keys()) == {"C", "B", "A"} 73 | assert graph["C"][0].dcid == "B" 74 | assert graph["B"][0].dcid == "A" 75 | assert graph["A"] == [] 76 | 77 | 78 | def test_build_ancestry_map_branching_graph(): 79 | r""" 80 | Graph: 81 | F 82 | / \ 83 | D E 84 | / \ / 85 | B C 86 | \/ 87 | A 88 | """ 89 | 90 | def fetch_mock(dcid): 91 | return { 92 | "A": (Node(dcid="B", name="Node B", 93 | types=["Type"]), Node(dcid="C", 94 | name="Node C", 95 | types=["Type"])), 96 | "B": (Node(dcid="D", name="Node D", types=["Type"]),), 97 | "C": (Node(dcid="D", name="Node D", 98 | types=["Type"]), Node(dcid="E", 99 | name="Node E", 100 | types=["Type"])), 101 | "D": (Node(dcid="F", name="Node F", types=["Type"]),), 102 | "E": (Node(dcid="F", name="Node F", types=["Type"]),), 103 | "F": tuple(), 104 | }.get(dcid, tuple()) 105 | 106 | root, ancestry = build_graph_map("A", fetch_mock, max_workers=4) 107 | 108 | assert root == "A" 109 | assert set(ancestry.keys()) == {"A", "B", "C", "D", "E", "F"} 110 | assert [p.dcid for p in ancestry["A"]] == ["B", "C"] # A has two parents 111 | assert [p.dcid for p in ancestry["B"]] == ["D"] # B has one parent 112 | assert [p.dcid for p in ancestry["C"]] == ["D", "E"] # C has two parents 113 | assert [p.dcid for p in ancestry["D"]] == ["F"] # D has one parent 114 | assert [p.dcid for p in ancestry["E"]] == ["F"] # E has one parent 115 | assert ancestry["F"] == [] # F has no parents 116 | 117 | 118 | def test_build_ancestry_map_cycle_detection(): 119 | """ 120 | Graph with a cycle: 121 | A -> B -> C -> A 122 | (Should not loop infinitely) 123 | """ 124 | 125 | call_count = defaultdict(int) 126 | 127 | def fetch_mock(dcid): 128 | call_count[dcid] += 1 129 | return { 130 | "A": (Node(dcid="B", name="B", types=["Type"]),), 131 | "B": (Node(dcid="C", name="C", types=["Type"]),), 132 | "C": (Node(dcid="A", name="A", types=["Type"]),), # Cycle back to A 133 | }.get(dcid, tuple()) 134 | 135 | root, ancestry = build_graph_map("A", fetch_mock, max_workers=2) 136 | 137 | assert root == "A" # Since we start from A 138 | assert set(ancestry.keys()) == {"A", "B", "C"} 139 | assert [p.dcid for p in ancestry["A"]] == ["B"] # A points to B 140 | assert [p.dcid for p in ancestry["B"]] == ["C"] # B points to C 141 | assert [p.dcid for p in ancestry["C"]] == ["A" 142 | ] # C points back to A but it's ok 143 | 144 | # Check that each node was fetched only once (particularly for A to avoid infinite loop) 145 | assert call_count["A"] == 1 146 | assert call_count["B"] == 1 147 | assert call_count["C"] == 1 148 | 149 | 150 | def test_postorder_nodes_simple_graph(): 151 | """Test postorder traversal on a simple graph.""" 152 | ancestry = { 153 | "C": [Node(dcid="B", name="B", types=["Type"])], 154 | "B": [Node(dcid="A", name="A", types=["Type"])], 155 | "A": [], 156 | } 157 | 158 | order = _postorder_nodes("C", ancestry) 159 | assert order == ["A", "B", "C"] 160 | 161 | new_order = 
_postorder_nodes("B", ancestry) 162 | assert new_order == ["A", "B"] 163 | 164 | 165 | def test_postorder_nodes_ignores_disconnected(): 166 | """ 167 | Graph: 168 | A <- B <- C 169 | D (disconnected) 170 | """ 171 | graph = { 172 | "A": [Node(dcid="B", name="B", types=["Type"])], 173 | "B": [Node(dcid="C", name="C", types=["Type"])], 174 | "C": [], 175 | "D": [Node(dcid="Z", name="Z", types=["Type"])], 176 | } 177 | order = _postorder_nodes("A", graph) 178 | assert order == ["C", "B", "A"] 179 | assert "D" not in order 180 | 181 | 182 | def test_assemble_tree_creates_nested_structure(): 183 | """Test _assemble_tree creates a nested structure.""" 184 | ancestry = { 185 | "C": [Node(dcid="B", name="Node B", types=["Type"])], 186 | "B": [Node(dcid="A", name="Node A", types=["Type"])], 187 | "A": [], 188 | } 189 | postorder = ["A", "B", "C"] 190 | tree = _assemble_tree(postorder, ancestry, relationship_key="parents") 191 | 192 | assert tree["dcid"] == "C" 193 | assert tree["parents"][0]["dcid"] == "B" 194 | assert tree["parents"][0]["parents"][0]["dcid"] == "A" 195 | 196 | 197 | def test_postorder_nodes_ignores_unreachable_nodes(): 198 | """ 199 | Graph: 200 | A → B → C 201 | Ancestry map also includes D (unconnected) 202 | """ 203 | ancestry = { 204 | "A": [Node(dcid="B", name="B", types=["Type"])], 205 | "B": [Node(dcid="C", name="C", types=["Type"])], 206 | "C": [], 207 | "D": [Node(dcid="X", name="X", types=["Type"])], 208 | } 209 | 210 | postorder = _postorder_nodes("A", ancestry) 211 | 212 | # Only nodes reachable from A should be included 213 | assert postorder == ["C", "B", "A"] 214 | assert "D" not in postorder 215 | 216 | 217 | def test_assemble_tree_shared_parent_not_duplicated(): 218 | """ 219 | Structure: 220 | A → C 221 | B → C 222 | Both A and B have same parent C 223 | """ 224 | 225 | ancestry = { 226 | "A": [Node(dcid="C", name="C name", types=["City"])], 227 | "B": [Node(dcid="C", name="C name", types=["City"])], 228 | "C": [], 229 | } 230 | 231 | postorder = ["C", "A", "B"] # C first to allow bottom-up build 232 | tree = _assemble_tree(postorder, ancestry, relationship_key="parents") 233 | 234 | assert tree["dcid"] == "B" 235 | assert len(tree["parents"]) == 1 236 | assert tree["parents"][0]["dcid"] == "C" 237 | 238 | # Confirm C only appears once 239 | assert tree["parents"][0] is not None 240 | assert tree["parents"][0]["name"] == "C name" 241 | 242 | 243 | def test_build_ancestry_tree_nested_output(): 244 | """Test build_ancestry_tree creates a nested structure.""" 245 | ancestry = { 246 | "C": [Node(dcid="B", name="B", types=["Type"])], 247 | "B": [Node(dcid="A", name="A", types=["Type"])], 248 | "A": [], 249 | } 250 | 251 | tree = build_relationship_tree("C", ancestry, relationship_key="parents") 252 | 253 | assert tree["dcid"] == "C" 254 | assert tree["parents"][0]["dcid"] == "B" 255 | assert tree["parents"][0]["parents"][0]["dcid"] == "A" 256 | 257 | 258 | def test_flatten_ancestry_deduplicates(): 259 | """Test flatten_ancestry deduplicates parents.""" 260 | 261 | ancestry = { 262 | "X": [Node(dcid="A", name="A", types=["Country"])], 263 | "Y": [ 264 | Node(dcid="A", name="A", types=["Country"]), 265 | Node(dcid="B", name="B", types=["City"]) 266 | ], 267 | } 268 | 269 | flat = flatten_relationship(ancestry) 270 | 271 | assert {"dcid": "A", "name": "A", "types": ["Country"]} in flat 272 | assert {"dcid": "B", "name": "B", "types": ["City"]} in flat 273 | assert len(flat) == 2 274 | --------------------------------------------------------------------------------
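A minimal end-to-end sketch of the V2 client above, for orientation only (not part
of the source tree). It assumes the `datacommons_client` package is installed along
with pandas, and that you have an API key for datacommons.org; the variable and
entity DCIDs are the same ones used in the docstrings above.

```python
from datacommons_client.client import DataCommonsClient

# Instantiate the client (custom DC instances currently need no API key).
client = DataCommonsClient(api_key="YOUR_API_KEY")

# All dated observations of the SDG poverty variable for Nigeria, returned
# as a pandas DataFrame with entity and variable names attached.
df = client.observations_dataframe(
    variable_dcids="sdg/SI_POV_DAY1",
    date="all",
    entity_dcids=["country/NGA"],
)
print(df.head())
```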