├── .github ├── dependabot.yaml └── workflows │ ├── CI.yaml │ └── pypi.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .prettierignore ├── .prettierrc.json ├── LICENSE ├── README.md ├── TERMS_OF_DATA_ACCESS ├── docs ├── TERMS-OF-DATA-ACCESS.md ├── _static │ ├── monogram-dark-cropped.png │ └── monogram-light-cropped.png ├── api.md ├── conf.py ├── data-access.md ├── data-processing.md ├── glossary.md ├── index.md └── install-offsets-db-data.md ├── offsets_db_data ├── __init__.py ├── apx.py ├── arb.py ├── catalog.yaml ├── common.py ├── configs │ ├── all-protocol-mapping.json │ ├── beneficiary-mappings.json │ ├── berkeley-project-types.json │ ├── credits-raw-columns-mapping.json │ ├── projects-raw-columns-mapping.json │ └── type-category-mapping.json ├── credits.py ├── data.py ├── gld.py ├── models.py ├── openrefine.py ├── pipeline_utils.py ├── projects.py ├── py.typed ├── registry.py └── vcs.py ├── pyproject.toml ├── readthedocs.yml ├── requirements-dev.txt ├── requirements-docs.txt ├── requirements.txt ├── scripts ├── check-beneficiary-coverage.py └── extract-berkeley-project-types.py └── tests ├── __init__.py ├── test_integration.py ├── test_pipeline_utils.py └── test_vcs.py /.github/dependabot.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: 'github-actions' 4 | directory: '/' 5 | schedule: 6 | interval: monthly 7 | -------------------------------------------------------------------------------- /.github/workflows/CI.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | workflow_dispatch: 10 | 11 | schedule: 12 | - cron: '0 5 * * *' # At 05:00 13 | 14 | concurrency: 15 | group: ${{ github.workflow }}-${{ github.ref }} 16 | cancel-in-progress: true 17 | 18 | permissions: 19 | id-token: write # This is required for requesting the JWT 20 | contents: read # This is required for actions/checkout 21 | 22 | env: 23 | AWS_DEFAULT_REGION: us-west-2 24 | 25 | jobs: 26 | test: 27 | runs-on: ubuntu-latest 28 | timeout-minutes: 120 29 | defaults: 30 | run: 31 | shell: bash -l {0} 32 | services: 33 | openrefine: 34 | image: abesesr/openrefine:3.8.7 35 | ports: 36 | - 3333:3333 37 | options: --name openrefine 38 | steps: 39 | - uses: actions/checkout@v4 40 | - name: configure aws credentials 41 | uses: aws-actions/configure-aws-credentials@v4 42 | with: 43 | role-to-assume: arn:aws:iam::631969445205:role/github-action-role 44 | role-session-name: offsets-db-etl-role-session 45 | aws-region: ${{ env.AWS_DEFAULT_REGION }} 46 | - uses: actions/setup-python@v5 47 | with: 48 | python-version: '3.10' 49 | cache: 'pip' 50 | 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | python -m pip install -r requirements.txt 55 | python -m pip install -r requirements-dev.txt 56 | 57 | - name: Install package 58 | run: | 59 | python -m pip install . 
60 | 61 | - name: List packages 62 | run: | 63 | python -m pip list 64 | 65 | - name: Install CLI 66 | run: | 67 | offsets-db-data-orcli install --destination /usr/local/bin 68 | offsets-db-data-orcli --help 69 | 70 | - name: Run tests 71 | run: | 72 | python -m pytest -s 73 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Build distribution 2 | on: 3 | release: 4 | types: 5 | - published 6 | push: 7 | 8 | jobs: 9 | build-artifacts: 10 | runs-on: ubuntu-latest 11 | if: github.repository == 'carbonplan/offsets-db-data' 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | - uses: actions/setup-python@v5 17 | name: Install Python 18 | with: 19 | python-version: '3.10' 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install setuptools setuptools-scm wheel twine check-manifest 25 | 26 | - name: Build tarball and wheels 27 | run: | 28 | git clean -xdf 29 | git restore -SW . 30 | python -m build --sdist --wheel . 31 | 32 | - name: Check built artifacts 33 | run: | 34 | python -m twine check dist/* 35 | pwd 36 | if [ -f dist/offsets-db-data-unknown.tar.gz ]; then 37 | echo "❌ INVALID VERSION NUMBER" 38 | exit 1 39 | else 40 | echo "✅ Looks good" 41 | fi 42 | - uses: actions/upload-artifact@v4 43 | with: 44 | name: releases 45 | path: dist 46 | 47 | test-built-dist: 48 | needs: build-artifacts 49 | runs-on: ubuntu-latest 50 | steps: 51 | - uses: actions/setup-python@v5 52 | name: Install Python 53 | with: 54 | python-version: '3.10' 55 | - uses: actions/download-artifact@v4 56 | with: 57 | name: releases 58 | path: dist 59 | - name: List contents of built dist 60 | run: | 61 | ls -ltrh 62 | ls -ltrh dist 63 | 64 | - name: Verify the built dist/wheel is valid 65 | if: github.event_name == 'push' 66 | run: | 67 | python -m pip install --upgrade pip 68 | python -m pip install dist/offsets_db_data*.whl 69 | python -c "import offsets_db_data; print(offsets_db_data.__version__)" 70 | 71 | upload-to-pypi: 72 | needs: test-built-dist 73 | if: github.event_name == 'release' 74 | runs-on: ubuntu-latest 75 | permissions: 76 | id-token: write 77 | steps: 78 | - uses: actions/download-artifact@v4 79 | with: 80 | name: releases 81 | path: dist 82 | - name: Publish package to PyPI 83 | uses: pypa/gh-action-pypi-publish@v1.12.4 84 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | offsets_db_data/_version.py 162 | 163 | .DS_Store 164 | .idea 165 | .jupyter_cache/ 166 | jupyter_execute/ 167 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autoupdate_schedule: monthly 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v5.0.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-docstring-first 10 | - id: check-json 11 | - id: check-yaml 12 | - id: double-quote-string-fixer 13 | - id: debug-statements 14 | - id: mixed-line-ending 15 | 16 | - repo: https://github.com/astral-sh/ruff-pre-commit 17 | rev: 'v0.11.8' 18 | hooks: 19 | - id: ruff 20 | args: ['--fix'] 21 | - id: ruff-format 22 | 23 | - repo: https://github.com/pre-commit/mirrors-prettier 24 | rev: v4.0.0-alpha.8 25 | hooks: 26 | - id: prettier 27 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/.prettierignore -------------------------------------------------------------------------------- /.prettierrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "jsxSingleQuote": true, 3 | "printWidth": 80, 4 | "quoteProps": "as-needed", 5 | "semi": false, 6 | "singleQuote": true, 7 | "tabWidth": 2 8 | } 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 carbonplan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 7 | 8 | 9 | 13 | 14 |

15 | 16 | [![CI](https://github.com/carbonplan/offsets-db-data/actions/workflows/CI.yaml/badge.svg)](https://github.com/carbonplan/offsets-db-data/actions/workflows/CI.yaml) 17 | [![PyPI](https://github.com/carbonplan/offsets-db-data/actions/workflows/pypi.yaml/badge.svg)](https://github.com/carbonplan/offsets-db-data/actions/workflows/pypi.yaml) 18 | [![PyPI][pypi-badge]][pypi-link] 19 | [![Documentation Status][rtd-badge]][rtd-link] 20 | 21 | # carbonplan / offsets-db-data 22 | 23 | Utilities for cleaning, and processing data for the [OffsetsDB web tool](https://carbonplan.org/research/offsets-db/) 24 | 25 | ## installation 26 | 27 | To install the package, you can use pip: 28 | 29 | ```bash 30 | python -m pip install git+https://github.com/carbonplan/offsets-db-data.git 31 | ``` 32 | 33 | You can also install the package locally by cloning the repository and running: 34 | 35 | ```bash 36 | git clone https://github.com/carbonplan/offsets-db-data.git 37 | cd offsets-db-data 38 | python -m pip install -e . 39 | ``` 40 | 41 | To install the dependencies for development, you can use pip: 42 | 43 | ```bash 44 | python -m pip install -e ".[all]" 45 | 46 | # or 47 | 48 | python -m pip install -e ".[dev]" 49 | 50 | ``` 51 | 52 | ## building the documentation 53 | 54 | To build the documentation locally, you can use [sphinx](https://www.sphinx-doc.org/en/master/). You can install the documentation dependencies by running: 55 | 56 | ```bash 57 | python -m pip install -e ".[docs]" 58 | ``` 59 | 60 | Then, you can build the documentation by running: 61 | 62 | ```bash 63 | sphinx-build docs docs/_build 64 | ``` 65 | 66 | You can view the documentation by opening `docs/_build/index.html` in your browser. 67 | 68 | ## license 69 | 70 | All the code in this repository is [MIT](https://choosealicense.com/licenses/mit/) licensed. 71 | 72 | > [!IMPORTANT] 73 | > Data associated with this repository are subject to additional [terms of data access](https://github.com/carbonplan/offsets-db-data/blob/main/TERMS_OF_DATA_ACCESS). 74 | 75 | ## about us 76 | 77 | CarbonPlan is a non-profit organization that uses data and science for climate action. We aim to improve the transparency and scientific integrity of carbon removal and climate solutions through open data and tools. Find out more at [carbonplan.org](https://carbonplan.org/) or get in touch by [opening an issue](https://github.com/carbonplan/offsets-db/issues/new) or [sending us an email](mailto:hello@carbonplan.org). 
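## example usage

Once installed, you can load the processed OffsetsDB tables through the package's data catalog. The snippet below is a minimal sketch based on the [documentation](https://offsets-db-data.readthedocs.io); the `credits` and `projects` dataset names follow the docs, and reading them requires network access to the public data archives.

```python
from offsets_db_data.data import catalog

# read the processed credit transactions and project records into pandas DataFrames
credits_df = catalog['credits'].read()
projects_df = catalog['projects'].read()

print(credits_df.head())
```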
78 | 79 | [pypi-badge]: https://img.shields.io/pypi/v/offsets-db-data?logo=pypi 80 | [pypi-link]: https://pypi.org/project/offsets-db-data 81 | [rtd-badge]: https://readthedocs.org/projects/offsets-db-data/badge/?version=latest 82 | [rtd-link]: https://offsets-db-data.readthedocs.io/en/latest/?badge=latest 83 | -------------------------------------------------------------------------------- /TERMS_OF_DATA_ACCESS: -------------------------------------------------------------------------------- 1 | # TERMS OF DATA ACCESS 2 | 3 | ## OffsetsDB 4 | 5 | OffsetsDB, created by CarbonPlan (https://carbonplan.org) is a 6 | regularly-updated snapshot of carbon offset projects, credit issuances, and 7 | credit retirements published by the following carbon offset registries: 8 | 9 | - American Carbon Registry (ACR) 10 | - ART TREES (ART) 11 | - Climate Action Reserve (CAR) 12 | - Gold Standard (GLD) 13 | - Verra (VCS) 14 | 15 | Carbon offset information has historically been scattered across multiple 16 | locations in formats that are not particularly useful to researchers. This 17 | database is meant to increase transparency, accountability, and reliability of 18 | the carbon offset market, and to provide researchers with a robust tool for 19 | visualizing, validating, and cross-checking offsets. We hope you find it useful! 20 | 21 | ## Our Terms of Use Apply To OffsetsDB 22 | 23 | By downloading, copying, or using this project, and/or any associated content 24 | or data, you agree to CarbonPlan’s Terms Of Use, which can be found here: 25 | [https://carbonplan.org/terms](https://carbonplan.org/terms). As further 26 | explained in the Terms of Use, CarbonPlan makes its projects — including 27 | OffsetsDB — available strictly on an “as-is” and “as-available” basis, without 28 | warranty of any kind, including without limitation the warranties of 29 | merchantability, fitness for a particular purpose, and noninfringement. 30 | 31 | ## Intellectual Property Rights 32 | 33 | Because OffsetsDB consists of purely factual information concerning carbon 34 | offsets that has been made publicly available by the above-referenced 35 | registries, CarbonPlan does not claim copyright in this data. 36 | 37 | However, please note that CarbonPlan does not make any representation as to 38 | whether any of the above-referenced registries may claim any rights in the data 39 | they have published. If you have any questions or concerns about this, please 40 | reach out to the registries directly. 41 | -------------------------------------------------------------------------------- /docs/TERMS-OF-DATA-ACCESS.md: -------------------------------------------------------------------------------- 1 | # TERMS OF DATA ACCESS 2 | 3 | ## OffsetsDB 4 | 5 | OffsetsDB, created by CarbonPlan (https://carbonplan.org) is a regularly-updated snapshot of carbon offset projects, credit issuances, and credit retirements published by the following carbon offset registries: 6 | 7 | - American Carbon Registry (ACR) 8 | - ART TREES (ART) 9 | - Climate Action Reserve (CAR) 10 | - Gold Standard (GLD) 11 | - Verra (VCS) 12 | 13 | Carbon offset information has historically been scattered across multiple locations in formats that are not particularly useful to researchers. This database is meant to increase transparency, accountability, and reliability of the carbon offset market, and to provide researchers with a robust tool for visualizing, validating, and cross-checking offsets. We hope you find it useful! 
14 | 15 | ## Our Terms of Use Apply To OffsetsDB 16 | 17 | By downloading, copying, or using this project, and/or any associated content or data, you agree to CarbonPlan’s Terms Of Use, which can be found here: [https://carbonplan.org/terms](https://carbonplan.org/terms). As further explained in the Terms of Use, CarbonPlan makes its projects — including OffsetsDB — available strictly on an “as-is” and “as-available” basis, without warranty of any kind, including without limitation the warranties of merchantability, fitness for a particular purpose, and noninfringement. 18 | 19 | ## Intellectual Property Rights 20 | 21 | Because OffsetsDB consists of purely factual information concerning carbon offsets that has been made publicly available by the above-referenced registries, CarbonPlan does not claim copyright in this data. 22 | 23 | However, please note that CarbonPlan does not make any representation as to whether any of the above-referenced registries may claim any rights in the data they have published. If you have any questions or concerns about this, please reach out to the registries directly. 24 | -------------------------------------------------------------------------------- /docs/_static/monogram-dark-cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/docs/_static/monogram-dark-cropped.png -------------------------------------------------------------------------------- /docs/_static/monogram-light-cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/docs/_static/monogram-light-cropped.png -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | This page provides an autogenerated summary of offsets-db-data's API. For more details and examples, refer to the relevant chapters in the main part of the documentation. 4 | 5 | ## Registry Specific Functions 6 | 7 | The following functions are specific to a given registry and are grouped under each registry's module. We currently support the following registries: 8 | 9 | - [verra](https://registry.verra.org/) 10 | - [gold-standard](https://www.goldstandard.org) 11 | - APX registries 12 | - [art-trees](https://art.apx.com/) 13 | - [climate action reserve](https://thereserve2.apx.com) 14 | - [american carbon registry](https://acr2.apx.com/) 15 | 16 | ### Verra 17 | 18 | ```{eval-rst} 19 | .. automodule:: offsets_db_data.vcs 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | ``` 24 | 25 | ### Gold Standard 26 | 27 | ```{eval-rst} 28 | .. automodule:: offsets_db_data.gld 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | ``` 33 | 34 | ### APX Registries 35 | 36 | Functionality for APX registries is currently grouped under the `apx` module. 37 | 38 | ```{eval-rst} 39 | .. automodule:: offsets_db_data.apx 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | ``` 44 | 45 | ## ARB Data Functions 46 | 47 | The following functions are specific to the [ARB data](https://ww2.arb.ca.gov/our-work/programs/compliance-offset-program/arb-offset-credit-issuance). 48 | 49 | ```{eval-rst} 50 | .. 
automodule:: offsets_db_data.arb 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | ``` 55 | 56 | ## Common Functions 57 | 58 | The following functions are common to all registries. 59 | 60 | ```{eval-rst} 61 | .. automodule:: offsets_db_data.common 62 | :members: 63 | :undoc-members: 64 | :show-inheritance: 65 | 66 | .. automodule:: offsets_db_data.credits 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | .. automodule:: offsets_db_data.projects 72 | :members: 73 | :undoc-members: 74 | :show-inheritance: 75 | 76 | .. automodule:: offsets_db_data.models 77 | :members: 78 | :undoc-members: 79 | :show-inheritance: 80 | 81 | .. automodule:: offsets_db_data.registry 82 | :members: 83 | :undoc-members: 84 | :show-inheritance: 85 | 86 | ``` 87 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | 10 | import datetime 11 | import sys 12 | 13 | import offsets_db_data 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # sys.path.insert(0, os.path.abspath('.')) 19 | # sys.path.insert(os.path.abspath('..')) 20 | 21 | print('python exec:', sys.executable) 22 | print('sys.path:', sys.path) 23 | 24 | 25 | project = 'offsets-db-data' 26 | copyright = f'{datetime.datetime.now().date().year}, carbonplan' 27 | author = 'carbonplan' 28 | release = f'v{offsets_db_data.__version__}' 29 | 30 | # -- General configuration --------------------------------------------------- 31 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 32 | 33 | extensions = [ 34 | 'myst_nb', 35 | # 'sphinxext.opengraph', 36 | 'sphinx_copybutton', 37 | 'sphinx_design', 38 | 'sphinx.ext.autodoc', 39 | 'sphinx.ext.viewcode', 40 | 'sphinx.ext.autosummary', 41 | 'sphinx.ext.doctest', 42 | 'sphinx.ext.intersphinx', 43 | 'sphinx.ext.extlinks', 44 | 'sphinx.ext.intersphinx', 45 | 'sphinx.ext.napoleon', 46 | 'sphinx_togglebutton', 47 | ] 48 | 49 | # MyST config 50 | myst_enable_extensions = ['amsmath', 'colon_fence', 'deflist', 'html_image'] 51 | myst_url_schemes = ['http', 'https', 'mailto'] 52 | 53 | # sphinx-copybutton configurations 54 | copybutton_prompt_text = r'>>> |\.\.\. 
|\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: ' 55 | copybutton_prompt_is_regexp = True 56 | 57 | nb_execution_mode = 'auto' 58 | nb_execution_timeout = 600 59 | nb_execution_raise_on_error = True 60 | autosummary_generate = True 61 | 62 | 63 | templates_path = ['_templates'] 64 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 65 | # Sphinx project configuration 66 | source_suffix = ['.rst', '.md'] 67 | 68 | 69 | # -- Options for HTML output ------------------------------------------------- 70 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 71 | 72 | 73 | html_theme = 'sphinx_book_theme' 74 | 75 | 76 | html_last_updated_fmt = '%b %d, %Y' 77 | 78 | html_title = 'offsets-db-data' 79 | 80 | 81 | html_theme_options = { 82 | 'repository_url': 'https://github.com/carbonplan/offsets-db-data', 83 | 'repository_branch': 'main', 84 | 'use_repository_button': True, 85 | 'path_to_docs': 'docs', 86 | 'use_edit_page_button': True, 87 | 'use_source_button': True, 88 | 'logo': { 89 | 'image_dark': 'monogram-light-cropped.png', 90 | 'image_light': 'monogram-dark-cropped.png', 91 | }, 92 | } 93 | html_static_path = ['_static'] 94 | 95 | intersphinx_mapping = { 96 | 'python': ('https://docs.python.org/3/', None), 97 | 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), 98 | } 99 | -------------------------------------------------------------------------------- /docs/data-access.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | format_name: myst 5 | kernelspec: 6 | display_name: Python 3 7 | name: python3 8 | --- 9 | 10 | # Access OffsetsDB Data 11 | 12 | OffsetsDB provides a detailed view of carbon offset credits and projects. 13 | You can access the data in various formats or directly through Python using our data package. 14 | 15 | ```{important} 16 | By downloading or accessing the OffsetsDB data archives, you agree to the [Terms of Data Access](TERMS-OF-DATA-ACCESS.md). 17 | ``` 18 | 19 | ## CSV & Parquet Zipped Files 20 | 21 | Download the latest version of OffsetsDB in CSV: 22 | 23 | - [Download Credits & Projects](https://carbonplan-offsets-db.s3.us-west-2.amazonaws.com/production/latest/offsets-db.csv.zip) 24 | 25 | Download the latest version of OffsetsDB in [Parquet](https://parquet.apache.org/): 26 | 27 | - [Download Credits & Projects](https://carbonplan-offsets-db.s3.us-west-2.amazonaws.com/production/latest/offsets-db.parquet.zip) 28 | 29 | ## Citation 30 | 31 | Please cite OffsetsDB as: 32 | 33 | CarbonPlan (2024) “OffsetsDB” [https://carbonplan.org/research/offsets-db](https://carbonplan.org/research/offsets-db) 34 | 35 | ## Accessing The Full Data Archive Through Python 36 | 37 | For more dynamic and programmatic access to OffsetsDB, you can use our Python data package. This package allows you to load and interact with the data directly in your Python environment. With the data package, you can access the data in a variety of formats including CSV (for raw data) and Parquet (for processed data). 38 | 39 | ### Installation 40 | 41 | To get started, install the offsets_db_data package. Ensure you have Python installed on your system, and then run: 42 | 43 | ```bash 44 | python -m pip install offsets-db-data 45 | ``` 46 | 47 | ### Using the Data Catalog 48 | 49 | Once installed, you can access the data through an Intake catalog. This catalog provides a high-level interface to the OffsetsDB datasets. 
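If you prefer to work with the zipped archives linked above, rather than the catalog described in this section, they can also be read directly with pandas. The sketch below makes no assumptions about the file names inside the archive; it simply extracts the archive and reads every parquet file it finds.

```python
import glob
import urllib.request
import zipfile

import pandas as pd

url = 'https://carbonplan-offsets-db.s3.us-west-2.amazonaws.com/production/latest/offsets-db.parquet.zip'

# download the archive and extract it into a local directory
urllib.request.urlretrieve(url, 'offsets-db.parquet.zip')
with zipfile.ZipFile('offsets-db.parquet.zip') as archive:
    archive.extractall('offsets-db')

# read every parquet file found in the extracted archive into a DataFrame
tables = {path: pd.read_parquet(path) for path in glob.glob('offsets-db/**/*.parquet', recursive=True)}
print(list(tables))
```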
50 | 51 | #### Loading the Catalog 52 | 53 | ```{code-cell} ipython3 54 | import pandas as pd 55 | pd.options.display.max_columns = 5 56 | from offsets_db_data.data import catalog 57 | 58 | # Display the catalog 59 | print(catalog) 60 | ``` 61 | 62 | #### Available Data 63 | 64 | The catalog includes different datasets, such as credits and projects. 65 | 66 | #### Getting Descriptive Information About a Dataset 67 | 68 | You can get information about a dataset using the `describe()` method. For example, to get information about the 'credits' dataset: 69 | 70 | ```{code-cell} ipython3 71 | catalog['credits'].describe() 72 | ``` 73 | 74 | #### Accessing Specific Datasets 75 | 76 | You can access individual datasets within the catalog. For example, to access the 'credits' dataset: 77 | 78 | ```{code-cell} ipython3 79 | # Access the 'credits' dataset 80 | credits = catalog['credits'] 81 | 82 | # Read the data into a pandas DataFrame 83 | credits_df = credits.read() 84 | credits_df.head() 85 | 86 | ``` 87 | 88 | Similarly, to access the 'projects' dataset: 89 | 90 | ```{code-cell} ipython3 91 | # Access the 'projects' dataset 92 | projects = catalog['projects'] 93 | 94 | # Read the data into a pandas DataFrame 95 | projects_df = projects.read() 96 | projects_df.head() 97 | ``` 98 | 99 | Calling `projects.read()` and `credits.read()` without specifying a date will return the data downloaded and processed on `2024-02-13`. 100 | 101 | To load data for a specific date, you can specify the date as a string in the format `YYYY-MM-DD`. For example: 102 | 103 | ```{code-cell} ipython3 104 | projects_df = catalog['projects'](date='2024-02-07').read() 105 | projects_df.head() 106 | ``` 107 | 108 | ```{note} 109 | If you specify a date for which the data is not available, the package will raise a `PermissionError: Access Denied`. 110 | ``` 111 | -------------------------------------------------------------------------------- /docs/data-processing.md: -------------------------------------------------------------------------------- 1 | # Data Processing 2 | 3 | ## Order of Operations 4 | 5 | OffsetsDB follows a typical extract-transform-load (ETL) workflow. 6 | Extraction involves querying and downloading raw credit and project data hosted by offset registries. 7 | Transformation involves executing the functions contained within this repo, `offsets-db-data`. 8 | Load involves uploading the resulting data to S3 and the Postgres backend that powers the OffsetsDB database tool. 9 | 10 | ## Downloading Raw Data 11 | 12 | We download a fresh copy of project and transaction data on a daily basis. 13 | While downloading, we make no changes to the raw data provided by the registries. 14 | We've fully automated downloading of registry data, with the exception of Gold Standard. 15 | Gold Standard's [terms and conditions](https://www.goldstandard.org/articles/terms-and-conditions) require that downloads occur through the interfaces provided by the Gold Standard site. 16 | Those interfaces, as provided, do not allow automated downloads. 17 | 18 | We have no plans to release the code that directly interacts with registries. 19 | We decided to keep this part of OffsetsDB private in an effort to limit download requests to the registries. 20 | Other technical aspects of OffsetsDB, like the database and API that power the [database tool](https://carbonplan.org/research/offsets-db), are similarly closed. 21 | We made this decision to ensure that the OffsetsDB database tool remains performant. 
22 | Critically, the production database represents an identical clone of the data generated by the code contained within `offsets-db-data`. 23 | No additional processing or inferences should occur outside the context of this repository. 24 | 25 | ## Transforming Raw Data 26 | 27 | Nearly the entirety of the code contained within `offsets-db-data` involves registry-specific logic for transforming raw registry data into a common, shared schema. 28 | The logic for transforming the data of each registry is contained within a single file, with the filename denoting which registry the transformations apply to. 29 | For example, the logic involved in transforming Verra data is contained within {py:obj}`offsets_db_data.vcs`. 30 | 31 | Each registry-specific file contains at least two functions: `process_{registry_abbreviation}_credits` and `process_{registry_abbreviation}_projects`. 32 | Those functions, in turn, call a series of additional transformation functions that produce the normalized project and credit data which combine to form OffsetsDB. 33 | These transformation functions tend to be quite small and operate on one or two properties of the raw data. 34 | To continue with the Verra example, `vcs.py` contains functions with names like {py:obj}`offsets_db_data.vcs.set_vcs_vintage_year` and {py:obj}`offsets_db_data.vcs.add_vcs_project_id`. 35 | These functions contain the registry-specific logic needed to map Verra's raw data to the common data schema of OffsetsDB. 36 | 37 | ### An Example 38 | 39 | In practice, replicating the behavior of OffsetsDB should be simple. 40 | Here's an example of using `offsets_db_data` to transform the raw transactions data from Verra into a normalized, analysis-ready file: 41 | 42 | ```python 43 | import pandas as pd 44 | pd.options.display.max_columns = 5 45 | from offsets_db_data import vcs 46 | 47 | archive_fname = {{ path to local copy of Verra transaction data }} 48 | raw_credits = pd.read_csv(archive_fname) 49 | processed_credits = vcs.process_vcs_credits(raw_credits) 50 | ``` 51 | 52 | ```{note} 53 | Running the above example requires first downloading [a copy of Verra's transaction data](https://registry.verra.org/app/search/VCS) and changing the above code to reference the location of that data on your local machine. 54 | ``` 55 | 56 | Invoking single transformation functions, like {py:obj}`offsets_db_data.vcs.set_vcs_vintage_year`, is even more straightforward. 57 | Let's say you want to understand more about how OffsetsDB assigns Verra credits a vintage year. 58 | You can explore the behavior of this single transformation function by calling: 59 | 60 | ```python 61 | raw_credits.set_vcs_vintage_year(date_column='Vintage End').head() 62 | ``` 63 | 64 | It's worth noting that we've wrapped all transformation functions using the [`pandas_flavor.register_dataframe_method`](https://github.com/pyjanitor-devs/pandas_flavor) decorator. 65 | That means that after importing a registry module from `offsets_db_data`, the transformation functions of that module are directly callable on any Pandas dataframe. 66 | 67 | ## Initial Column Mapping 68 | 69 | The initial and perhaps most mundane transformation of OffsetsDB involves mapping properties in the raw data to a common schema. 70 | This step requires constructing a map from the names of properties as they appear in the raw data to the corresponding property in OffsetsDB. 71 | For example, the Climate Action Reserve data refers to the property, `project_id`, as `Project ID`. 
72 | The ART registry, however, refers to the same property as `Program ID`. 73 | 74 | These column mapping files are stored in [`offsets_db_data/configs`](https://github.com/carbonplan/offsets-db-data/tree/main/offsets_db_data/configs). 75 | There is a separate mapping file for `projects` data and `credits` data. 76 | Some properties either aren't included in the raw data or inferring their value requires special processing. 77 | In these cases, a `null` value is recorded in the column mapping files and the property is populated using registry-specific logic. 78 | 79 | ## Protocol Mapping 80 | 81 | Offset projects are developed by following a set of rules, known as a protocol. 82 | These rules specify things like when measurements must be made and what paperwork must be submitted in order for a project to receive credits. 83 | Unfortunately, there is no standardized way of referring to the exact protocol (or protocol version) used to develop an offset project. 84 | Even within the domain of a single registry, references to the exact protocol used to develop a project are often inconsistent. 85 | Take for example the Clean Development Mechanism protocol AMS-III.D., "Methane recovery in animal manure management systems". 86 | Across all five registries included in OffsetsDB, we identified twenty-two unique ways of referring to this one protocol. 87 | 88 | OffsetsDB addresses this problem by manually assigning every known protocol string to a common schema. 89 | Continuing with the AMS-III.D. example, we map all twenty-two "known strings" that describe the same protocol to a single, unified reference, `ams-iii-d`. 90 | We repeat this manual unification of dissimilar strings for all protocols across all registries. 91 | The results of the mapping are contained within [`offsets-db-data/configs/all-protocol-mapping.json`](https://github.com/carbonplan/offsets-db-data/blob/main/offsets_db_data/configs/all-protocol-mapping.json). 92 | 93 | ## Project Type & Categorization 94 | 95 | In addition to unifying protocol mapping, we also assign two levels of classification to projects: `category` and `type`. 96 | Categories represent broad classes of offset approaches, while types provide more specific information about the mitigation strategy. 97 | 98 | ### Category Assignment 99 | 100 | Projects are assigned to one of these broad categories 101 | 102 | - agriculture: offsets derived from changes in the management of agricultural systems, including livestock 103 | - forest: offsets derived from the management of forests 104 | - ghg-management: offsets derived from the destruction or elimination (e.g., substitution) of greenhouse gases 105 | - renewable-energy: offsets derived from expanding renewable energy capacity 106 | - energy-efficiency: offsets derived from decreasing the amount of energy required to complete a task 107 | - fuel-switching: offsets derived from generating energy using a fuel source that produces fewer greenhouse gasses 108 | - carbon-capture: offsets derived from technologies that capture and store carbon 109 | - land-use: offsets derived from land management changes outside of forests 110 | - biochar: offsets derived from biochar production and application 111 | 112 | Category assignment is primarily determined by project type through the mapping defined in [`offsets-db-data/configs/type-category-mapping.json`](https://github.com/carbonplan/offsets-db-data/blob/main/offsets_db_data/configs/type-category-mapping.json). 
113 | This mapping connects specific project types (like "improved forest management" or "cookstoves") to their appropriate category. 114 | 115 | ### Project Type Assignment 116 | 117 | Project types represent more specific offset approaches. 118 | For example, within the category of "forest", projects might be classified as "improved forest management", "afforestation/reforestation", or "avoided forest conversion". 119 | 120 | Project types are determined through a multi-step process: 121 | 122 | 1. First, we attempt to infer the project type from protocol information (via {py:obj}`offsets_db_data.projects.infer_project_type`). 123 | 2. We apply manual overrides from curated data sources (via {py:obj}`offsets_db_data.projects.override_project_types`). 124 | Currently, the [Berkeley Carbon Trading Project](https://gspp.berkeley.edu/research-and-impact/centers/cepp/projects/berkeley-carbon-trading-project) data in [`offsets-db-data/configs/berkeley-project-types.json`](https://github.com/carbonplan/offsets-db-data/blob/main/offsets_db_data/configs/berkeley-project-types.json) serves as the authoritative source for project types. 125 | The project types from the Berkeley Carbon Trading Project's Voluntary Registry Offsets Database are licensed under a [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) license. 126 | 127 | ## Retirement User Harmonization 128 | 129 | Carbon offset credits are often retired on behalf of a specific entity or organization. 130 | However, the names of these retirement users are recorded inconsistently across registry data, making it difficult to analyze retirement patterns. 131 | The following section describes our approach for identifying and harmonizing information about the end-users ("retirement users") of specific offset credits. 132 | 133 | ### Harmonization Process 134 | 135 | The harmonization process attempts to identify specific "retirement users" from publicly disclosed retirement beneficiary information. 136 | We try to standardize retirement user information across registries using the following steps: 137 | 138 | 1. **Data merging**: we combine information from four sources into a single _temporary_ field: 139 | 140 | - `retirement_beneficiary`: note specifically designating the entity claiming a credit’s environmental benefits 141 | - `retirement_account`: name on account from which credits were retired 142 | - `retirement_note`: short-form text accompanying credit retirement 143 | - `retirement_reason`: short form note specifying why credits were retired (e.g., compliance purposes). Sometimes similar to a retirement note 144 | 145 | We refer to these fields as "retirement beneficiary data." 146 | Any one of these fields might contain information useful for relating a transaction to a retirement user. 147 | 148 | 2. **Standardization via OpenRefine**: we process this merged information through [OpenRefine](https://openrefine.org/) using a detailed set of transformation rules define in [`offsets-db-data/configs/beneficiary-mappings.json`](https://github.com/carbonplan/offsets-db-data/blob/main/offsets_db_data/configs/beneficiary-mappings.json). This includes: 149 | - text transformations that standardize common company names and entities 150 | - pattern matching to identify the same entities despite different formatting 151 | 152 | Only confident matches are included in the harmonized beneficiary field, `retirement_beneficiary_harmonized`. 153 | 154 | The retirement user harmonization process runs daily, along with the rest of OffsetsDB. 
155 | However, the underlying standardization rules (implemented via OpenRefine) are only irregularly updated. 156 | This means that there might be new retirement data that _could_ be mapped to a known entity but, because that mapping has not previously been described, that relationship is not reflected in OffsetsDB. 157 | To account for this, all searches via the database tool return matches across _all available_ retirement beneficiary fields: `retirement_beneficiary`, `retirement_account`, `retirement_note`, `retirement_reason`, _and_ `retirement_beneficiary_harmonized`. 158 | Thus, searching for known retirement users, like `Delta`, will return all records that contain the substring `delta` anywhere within their retirement beneficiary data. 159 | Users should carefully examine these unmapped transactions to determine whether or not these unmapped records are relevant to their specific search. 160 | 161 | ### Implementation Details 162 | 163 | Retirement user harmonization is implemented in the function {py:obj}`offsets_db_data.credits.harmonize_beneficiary_data`. 164 | This function runs a temporary OpenRefine project using the `offsets-db-data-orcli` command-line tool (which is a wrapper around [`orcli`](https://github.com/opencultureconsulting/orcli), a command-line interface for OpenRefine) to apply the transformations defined in our mapping file. 165 | The result is a new column, `retirement_beneficiary_harmonized`, that contains the standardized user names. 166 | 167 | ### Examples of Standardization 168 | 169 | Our harmonization process unifies many common variations: 170 | 171 | - "Delta Air Lines", "Delta Airlines" → "Delta Airlines" 172 | - "Terpel", "Organizacion Terpel", "Terpel S.A." → "Terpel" 173 | - "Retired on behalf of Sydney Opera House" → "Sydney Opera House" 174 | 175 | ### Why This Matters 176 | 177 | Without harmonization, the same entity might appear under multiple names, making it difficult to accurately analyze which entities are retiring the most credits. 178 | This harmonization allows for more accurate aggregation of retirement data by user. 179 | 180 | ```{note} 181 | The harmonization process can be toggled on or off via the `harmonize_beneficiary_info` parameter of the `process_{registry_abbreviation}_credits` functions. 182 | ``` 183 | 184 | ## Registry Specific Transformations 185 | 186 | Some transformations involved in producing OffsetsDB require special knowledge or assumptions about the underlying data. 187 | This section highlights special cases. 188 | 189 | ```{note} 190 | For additional context, consult specific function docstrings in the [API reference](api.md) or [reach out on GitHub](https://github.com/carbonplan/offsets-db-data/issues) if something doesn't make sense. 191 | ``` 192 | 193 | ### American Carbon Registry 194 | 195 | Project status: When processing ACR projects, we combine two status properties present in the raw data: `Compliance Program Status (ARB or Ecology)` and `Voluntary Status`. 196 | For compliance projects, we report compliance program status. 197 | For voluntary projects, we report voluntary status. 198 | 199 | ### Gold Standard 200 | 201 | #### Planned Emission Reductions 202 | 203 | Some Gold Standard protocols allow for the issuance of "[planned emission reductions](https://goldstandardhelp.freshdesk.com/support/solutions/articles/44001989672-what-is-a-planned-emission-reduction-per-)" (PERs). 204 | These credits represent anticipated climate benefits that are expected to occur in the future. 
205 | PERs are issued and can be traded, but cannot be retired. 206 | OffsetsDB includes all issued PERs when reporting vintage- and project-level credit totals. 207 | 208 | ### Verra 209 | 210 | #### Issuance 211 | 212 | Verra allows "rolling" credit issuance. This allows projects to complete the paperwork and verification processes for credit issuance, but delay the actual issuance event. 213 | This results in ambiguities around the precise timing of credit issuance events, as credits that are eligible to be issued but have not yet been issued are not publicly reported in the Verra crediting data. 214 | We handle this ambiguity by assuming that the first crediting event, be it an issuance, retirement, or cancellation, on a per-project, per-vintage basis results in issuance of 100 percent of credits eligible to be issued for that project-vintage. 215 | 216 | #### Retirements vs. Cancellations 217 | 218 | Verra's publicly available data does not distinguish between retirement events and cancellation events. 219 | We report all Verra retirements and cancellations as `retirement`. 220 | We originally contemplated tagging every Verra retirement or cancellation as `retirement/cancellation`. 221 | This made our processed Verra data slightly incompatible with data from other registries. 222 | Simple queries, like "give me all the retirements", suddenly required writing code that looks like this: 223 | 224 | ```python 225 | credits[(credits['transaction_type'] == 'retirement') | (credits['transaction_type'] == 'retirement/cancellation')] 226 | ``` 227 | 228 | ```{warning} 229 | Because we know the majority of Verra `retirement/cancellation` events are in fact `retirement`, we opted for this more ergonomic representation of the data. 230 | Any analysis involving Verra retirement data should clearly specify that Verra's raw data does not currently distinguish between retirement and cancellation events. 231 | ``` 232 | 233 | Vintage Date: Verra allows for the simultaneous issuance of multiple vintages. 234 | We assign all credits from these multi-vintage issuances to the earliest reported vintage year. 235 | 236 | ### California Compliance Projects 237 | 238 | We treat the California Air Resources Board's [issuance table](https://ww2.arb.ca.gov/resources/documents/arb-offset-credit-issuance-table) as the source of truth for all credits issued and retired by any project developed under an ARB-approved protocol. 239 | When a project appears in the issuance table, we drop all crediting data reported from the project's host registry and _only_ report the issuance and retirement values contained within the ARB issuance table. 240 | This methodological decision introduces a small error when it comes to "Early Action" projects. 241 | These projects were developed during the very first phase of California's offsets program, which had slightly different rules. 242 | After the early action phase, some projects transitioned into the full compliance program, while others did not. 243 | Fully accounting for how these early projects retired credits, both reported by CARB's issuance table and the registries, likely requires more careful treatment. 244 | 245 | Retirement Dates: Offsets retired for compliance with California's cap-and-trade program occur on fixed dates that correspond with the program's reporting deadlines. 246 | These deadlines come in two forms: partial (annual) and full (triennial) compliance events. 
247 | For simplicity, the current version of OffsetsDB uses the date of the full (triennial) compliance event as the retirement date for all compliance offsets. 248 | This means some retirement dates go unrecorded. 249 | Specifically, compliance credits retired for _voluntary_ purposes (i.e., not to satisfy requirements under AB32) and credits retired in linked markets (e.g., Quebec) are unknown and reported as `NaT`. 250 | 251 | ## Other Normalizations 252 | 253 | ### Country 254 | 255 | We use the Python package [coutnry_convertor](https://github.com/IndEcol/country_converter) to harmonize country names. 256 | 257 | ### Project Status 258 | 259 | OffsetsDB unifies and simplifies project status information reported by the registries. 260 | OffsetsDB contains use the following status codes: `listed`, `registered`, and `completed`. 261 | Listed typically refers to the stage during which a project has been formally announced and is undergoing development, but has yet to receive credits. 262 | Registered refers to projects that have received credits and are eligible to receive additional credits in the future. 263 | Completed means a project previously received credits and is not currently able to receive additional credits in the future. 264 | Many registries have far more detailed project status information, often reflecting the specific stages of the registry's validation and verification process. 265 | Future work might focus on normalizing these additional project states across the registries. 266 | 267 | ## Schema 268 | 269 | ### Projects 270 | 271 | Project data conform to the following schema: 272 | 273 | ```json 274 | { 275 | "properties": { 276 | "category": { 277 | "anyOf": [ 278 | { 279 | "type": "string" 280 | }, 281 | { 282 | "type": "null" 283 | } 284 | ], 285 | "description": "Category of the project", 286 | "title": "Category" 287 | }, 288 | "country": { 289 | "anyOf": [ 290 | { 291 | "type": "string" 292 | }, 293 | { 294 | "type": "null" 295 | } 296 | ], 297 | "title": "Country" 298 | }, 299 | "first_issuance_at": { 300 | "anyOf": [ 301 | { 302 | "format": "date", 303 | "type": "string" 304 | }, 305 | { 306 | "type": "null" 307 | } 308 | ], 309 | "description": "Date of first issuance of credits", 310 | "title": "First Issuance At" 311 | }, 312 | "first_retirement_at": { 313 | "anyOf": [ 314 | { 315 | "format": "date", 316 | "type": "string" 317 | }, 318 | { 319 | "type": "null" 320 | } 321 | ], 322 | "description": "Date of first retirement of credits", 323 | "title": "First Retirement At" 324 | }, 325 | "is_compliance": { 326 | "anyOf": [ 327 | { 328 | "type": "boolean" 329 | }, 330 | { 331 | "type": "null" 332 | } 333 | ], 334 | "description": "Whether project is compliance project", 335 | "title": "Is Compliance" 336 | }, 337 | "issued": { 338 | "anyOf": [ 339 | { 340 | "type": "integer" 341 | }, 342 | { 343 | "type": "null" 344 | } 345 | ], 346 | "default": 0, 347 | "description": "Total of issued credits", 348 | "title": "Issued" 349 | }, 350 | "listed_at": { 351 | "anyOf": [ 352 | { 353 | "format": "date", 354 | "type": "string" 355 | }, 356 | { 357 | "type": "null" 358 | } 359 | ], 360 | "description": "Date project was listed", 361 | "title": "Listed At" 362 | }, 363 | "name": { 364 | "anyOf": [ 365 | { 366 | "type": "string" 367 | }, 368 | { 369 | "type": "null" 370 | } 371 | ], 372 | "description": "Name of the project", 373 | "title": "Name" 374 | }, 375 | "project_id": { 376 | "description": "Project id used by registry system", 377 | "title": "Project Id", 378 
| "type": "string" 379 | }, 380 | "project_url": { 381 | "anyOf": [ 382 | { 383 | "type": "string" 384 | }, 385 | { 386 | "type": "null" 387 | } 388 | ], 389 | "description": "URL to project details", 390 | "title": "Project Url" 391 | }, 392 | "proponent": { 393 | "anyOf": [ 394 | { 395 | "type": "string" 396 | }, 397 | { 398 | "type": "null" 399 | } 400 | ], 401 | "title": "Proponent" 402 | }, 403 | "protocol": { 404 | "anyOf": [ 405 | { 406 | "items": { 407 | "type": "string" 408 | }, 409 | "type": "array" 410 | }, 411 | { 412 | "type": "null" 413 | } 414 | ], 415 | "default": null, 416 | "description": "List of protocols", 417 | "title": "Protocol" 418 | }, 419 | "registry": { 420 | "description": "Name of the registry", 421 | "title": "Registry", 422 | "type": "string" 423 | }, 424 | "retired": { 425 | "anyOf": [ 426 | { 427 | "type": "integer" 428 | }, 429 | { 430 | "type": "null" 431 | } 432 | ], 433 | "default": 0, 434 | "description": "Total of retired credits", 435 | "title": "Retired" 436 | }, 437 | "status": { 438 | "anyOf": [ 439 | { 440 | "type": "string" 441 | }, 442 | { 443 | "type": "null" 444 | } 445 | ], 446 | "title": "Status" 447 | }, 448 | "project_type": { 449 | "anyOf": [ 450 | { 451 | "type": "string" 452 | }, 453 | { 454 | "type": "null" 455 | } 456 | ], 457 | "description": "Type of project", 458 | "title": "Project Type" 459 | }, 460 | "project_type_source": { 461 | "anyOf": [ 462 | { 463 | "type": "string" 464 | }, 465 | { 466 | "type": "null" 467 | } 468 | ], 469 | "description": "Source of project type information", 470 | "title": "Project Type Source" 471 | } 472 | }, 473 | "required": [ 474 | "project_id", 475 | "name", 476 | "registry", 477 | "proponent", 478 | "category", 479 | "status", 480 | "country", 481 | "listed_at", 482 | "is_compliance", 483 | "first_issuance_at", 484 | "first_retirement_at", 485 | "project_url", 486 | "project_type", 487 | "project_type_source" 488 | ], 489 | "title": "Project", 490 | "type": "object" 491 | } 492 | ``` 493 | 494 | The majority of project attributes are directly taken from the project data downloaded from each registry. 495 | Table 1 provides the mapping from the raw column names found in downloaded registry data to the OffsetsDB project schema. 
496 | 497 | | | **verra** | **climate-action-reserve** | **american-carbon-registry** | **gold-standard** | **art-trees** | 498 | | ---------------- | ------------------------------ | ------------------------------ | -------------------------------------- | ------------------------------ | ------------------------------ | 499 | | **project_id** | ID | Project ID | Project ID | id | Program ID | 500 | | **name** | Name | Project Name | Project Name | name | Program Name | 501 | | **protocol** | Methodology | Project Type | Project Methodology/Protocol | methodology | \- | 502 | | **category** | inferred from protocol or type | inferred from protocol or type | inferred from protocol | inferred from protocol or type | inferred from protocol or type | 503 | | **proejct_type** | manually assigned | manually assigned | manually assigned | manually assigned | manually assigned | 504 | | **proponent** | Proponent | Project Owner | Project Developer | project_developer | Sovereign Program Developer | 505 | | **country** | Country/Area | Project Site Country | Project Site Country | country | Program Country | 506 | | **status** | Status | Status | Derived: voluntary + compliance status | status | Status | 507 | | **listed_at** | Project Listed Date | \- | \- | \- | \- | 508 | 509 | ### Credits 510 | 511 | Credit data conform to the following schema: 512 | 513 | ```json 514 | { 515 | "properties": { 516 | "id": { 517 | "default": null, 518 | "title": "Id", 519 | "type": "integer" 520 | }, 521 | "project_id": { 522 | "anyOf": [ 523 | { 524 | "type": "string" 525 | }, 526 | { 527 | "type": "null" 528 | } 529 | ], 530 | "description": "Project id used by registry system", 531 | "title": "Project Id" 532 | }, 533 | "quantity": { 534 | "description": "Tons of carbon dioxide equivalent (each ton is one carbon credit", 535 | "title": "Quantity", 536 | "type": "integer" 537 | }, 538 | "retirement_account": { 539 | "anyOf": [ 540 | { 541 | "type": "string" 542 | }, 543 | { 544 | "type": "null" 545 | } 546 | ], 547 | "description": "Name on account from which credits were retired", 548 | "title": "Retirement Account" 549 | }, 550 | "retirement_beneficiary": { 551 | "anyOf": [ 552 | { 553 | "type": "string" 554 | }, 555 | { 556 | "type": "null" 557 | } 558 | ], 559 | "description": "Note specifically designating the entity claiming a credit’s environmental benefits", 560 | "title": "Retirement Beneficiary" 561 | }, 562 | "retirement_beneficiary_harmonized": { 563 | "anyOf": [ 564 | { 565 | "type": "string" 566 | }, 567 | { 568 | "type": "null" 569 | } 570 | ], 571 | "description": "Harmonized beneficiary of credits", 572 | "title": "Retirement Beneficiary Harmonized" 573 | }, 574 | "retirement_note": { 575 | "anyOf": [ 576 | { 577 | "type": "string" 578 | }, 579 | { 580 | "type": "null" 581 | } 582 | ], 583 | "description": "Short-form text accompanying credit retirement", 584 | "title": "Retirement Note" 585 | }, 586 | "retirement_reason": { 587 | "anyOf": [ 588 | { 589 | "type": "string" 590 | }, 591 | { 592 | "type": "null" 593 | } 594 | ], 595 | "description": "Short form note specifying why credits were retired (e.g., compliance purposes). Sometimes similar to a retirement note. 
", 596 | "title": "Retirement Reason" 597 | }, 598 | "transaction_date": { 599 | "anyOf": [ 600 | { 601 | "format": "date", 602 | "type": "string" 603 | }, 604 | { 605 | "type": "null" 606 | } 607 | ], 608 | "description": "Date of transaction", 609 | "title": "Transaction Date" 610 | }, 611 | "transaction_type": { 612 | "anyOf": [ 613 | { 614 | "type": "string" 615 | }, 616 | { 617 | "type": "null" 618 | } 619 | ], 620 | "description": "Type of transaction", 621 | "title": "Transaction Type" 622 | }, 623 | "vintage": { 624 | "anyOf": [ 625 | { 626 | "type": "integer" 627 | }, 628 | { 629 | "type": "null" 630 | } 631 | ], 632 | "description": "Year when carbon avoidance/removal occurred", 633 | "title": "Vintage" 634 | } 635 | }, 636 | "required": [ 637 | "quantity", 638 | "vintage", 639 | "transaction_date", 640 | "transaction_type", 641 | "retirement_account", 642 | "retirement_beneficiary", 643 | "retirement_reason", 644 | "retirement_note", 645 | "retirement_beneficiary_harmonized", 646 | "project_id" 647 | ], 648 | "title": "Credit", 649 | "type": "object" 650 | } 651 | ``` 652 | 653 | The majority of credit attributes are taken directly taken from the credit data downloaded from each registry. 654 | The raw attribute names of crediting data tends to vary depending on the transaction type. 655 | For example, ART TREES refers to retirement dates as `Status Effective`, while it reports issuances as `Date Approved`. 656 | Rather than produce a table of each of these mappings here, please refer to [credits-raw-columns-mappings.json](https://github.com/carbonplan/offsets-db/blob/main/data/credits-raw-columns-mappings.json). 657 | -------------------------------------------------------------------------------- /docs/glossary.md: -------------------------------------------------------------------------------- 1 | # Glossary 2 | 3 | ## Terms that apply to offset projects 4 | 5 | - **Protocol**: 6 | The rules used to quantify the number of offset credits awarded to a project. 7 | Protocols outline requirements (e.g., deadlines, verification) and disclosures that projects must satisfy to receive credits. 8 | Protocols are enforced by registries. 9 | 10 | - **Registry**: 11 | An organization (often a non-profit) responsible for creating the rules that govern offset projects and the generation of offset credits. 12 | Registries maintain public records of all credits that have been issued, which ensures that credits are not used more than once ("double counting"). 13 | These public records serve as the raw data behind OffsetsDB. 14 | 15 | ## Terms that apply to offset credits 16 | 17 | - **Carbon Credit / Offset**: 18 | A tradable certificate representing the climate benefit of removing or avoiding the emission of greenhouse gases. 19 | Individual credits are typically denominated in terms of one tonne of carbon dioxide equivalent (tCO₂e). 20 | The terms "carbon credit" and "carbon offset" and "offset credit" are often used interchangeably. 21 | 22 | - **Credit Cancellation**: 23 | Invalidation of an offset credit, often for administrative purposes. 24 | For example, if an error in paperwork is determined to have resulted in the issuance of 1,000 extra credits, a registry would use cancellation to correct the error. 25 | 26 | - **Credit Issuance**: 27 | The creation of new offset credits, which can subsequently be sold, traded, and used to make environmental claims. 28 | Credits are issued by registries to projects. 
29 | Issuance occurs only after a project has satisfied all the rules laid out in the specific offset protocol used by the project. 30 | 31 | - **Credit Retirement**: 32 | The use of a carbon offset to make an offsetting claim. 33 | Retirement occurs when the current owner of an offset credit requests that a registry formally retire the credit. 34 | Retiring a credit means no other entity can take credit for the environmental benefit embodied by the retired credit. 35 | 36 | ## More resources 37 | 38 | Carbon Brief has a [nice glossary](https://interactive.carbonbrief.org/carbon-offsets-2023/glossary.html) of carbon market terminology. 39 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | ```{rubric} Offsets-DB Data 4 | 5 | ``` 6 | 7 | Welcome to OffsetsDB! This documentation provides an overview of offsets-db-data, a Python package with utilities for cleaning and processing data for [OffsetsDB web tool](https://carbonplan.org/research/offsets-db/) 8 | 9 | ```{button-ref} install-offsets-db-data 10 | :ref-type: doc 11 | :color: primary 12 | :class: sd-rounded-pill 13 | 14 | Get Started 15 | ``` 16 | 17 | ::: 18 | 19 | :::: 20 | 21 | --- 22 | 23 | ## Get in touch 24 | 25 | - If you encounter any errors or problems with **offsets-db-data**, please open an issue at the GitHub [main repository](http://github.com/carbonplan/offsets-db-data/issues). 26 | - If you have a question like “How do I find x?”, ask on [GitHub discussions](https://github.com/carbonplan/offsets-db-data/discussions). Please include a self-contained reproducible example if possible. 27 | 28 | --- 29 | 30 | ```{toctree} 31 | --- 32 | maxdepth: 1 33 | caption: How to guides and examples 34 | hidden: 35 | --- 36 | install-offsets-db-data.md 37 | data-access.md 38 | ``` 39 | 40 | ```{toctree} 41 | --- 42 | maxdepth: 2 43 | caption: Reference 44 | hidden: 45 | --- 46 | 47 | data-processing.md 48 | api.md 49 | glossary.md 50 | TERMS-OF-DATA-ACCESS.md 51 | ``` 52 | 53 | ```{toctree} 54 | --- 55 | maxdepth: 2 56 | caption: Project links 57 | hidden: 58 | --- 59 | 60 | 61 | GitHub Repo 62 | GitHub discussions 63 | Database Web Tool 64 | Methods 65 | Explainer 66 | 67 | ``` 68 | -------------------------------------------------------------------------------- /docs/install-offsets-db-data.md: -------------------------------------------------------------------------------- 1 | # Install offsets-db-data 2 | 3 | offsets-db-data Python package can be installed in two ways: 4 | 5 | ```{eval-rst} 6 | 7 | .. tab-set:: 8 | 9 | .. tab-item:: pip 10 | 11 | Using the `pip `__ package manager: 12 | 13 | .. code:: bash 14 | 15 | $ python -m pip install offsets-db-data 16 | 17 | 18 | .. tab-item:: Development version 19 | 20 | To install a development version from source: 21 | 22 | .. code:: bash 23 | 24 | $ git clone https://github.com/carbonplan/offsets-db-data 25 | $ cd offsets-db-data 26 | $ python -m pip install -e . 
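            # optional: verify the editable install by importing the package (sketch; indentation follows the surrounding code block)
            $ python -c "import offsets_db_data; print(offsets_db_data.__version__)"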
27 | ``` 28 | -------------------------------------------------------------------------------- /offsets_db_data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from ._version import __version__ 3 | -------------------------------------------------------------------------------- /offsets_db_data/apx.py: -------------------------------------------------------------------------------- 1 | import numpy as np # noqa: F401 2 | import pandas as pd 3 | import pandas_flavor as pf 4 | 5 | from offsets_db_data.common import ( 6 | BERKELEY_PROJECT_TYPE_UPATH, 7 | CREDIT_SCHEMA_UPATH, 8 | PROJECT_SCHEMA_UPATH, 9 | load_column_mapping, 10 | load_inverted_protocol_mapping, 11 | load_registry_project_column_mapping, 12 | load_type_category_mapping, 13 | ) 14 | from offsets_db_data.credits import * # noqa: F403 15 | from offsets_db_data.credits import harmonize_beneficiary_data 16 | from offsets_db_data.models import credit_without_id_schema, project_schema 17 | from offsets_db_data.projects import * # noqa: F403 18 | 19 | 20 | @pf.register_dataframe_method 21 | def determine_transaction_type(df: pd.DataFrame, *, download_type: str) -> pd.DataFrame: 22 | """ 23 | Assign a transaction type to each record in the DataFrame based on the download type. 24 | 25 | Parameters 26 | ---------- 27 | df : pd.DataFrame 28 | Input DataFrame containing transaction data. 29 | download_type : str 30 | Type of transaction ('issuances', 'retirements', 'cancellations') to determine the transaction type. 31 | 32 | Returns 33 | ------- 34 | pd.DataFrame 35 | DataFrame with a new 'transaction_type' column, containing assigned transaction types based on download_type. 36 | """ 37 | 38 | transaction_type_mapping = { 39 | 'issuances': 'issuance', 40 | 'retirements': 'retirement', 41 | 'cancellations': 'cancellation', 42 | } 43 | df['transaction_type'] = transaction_type_mapping[download_type] 44 | return df 45 | 46 | 47 | @pf.register_dataframe_method 48 | def process_apx_credits( 49 | df: pd.DataFrame, 50 | *, 51 | download_type: str, 52 | registry_name: str, 53 | arb: pd.DataFrame | None = None, 54 | harmonize_beneficiary_info: bool = False, 55 | ) -> pd.DataFrame: 56 | """ 57 | Process APX credits data by setting registry, determining transaction types, renaming columns, 58 | converting date columns, aggregating issuances (if applicable), and validating the schema. 59 | 60 | Parameters 61 | ---------- 62 | df : pd.DataFrame 63 | Input DataFrame with raw APX credits data. 64 | download_type : str 65 | Type of download ('issuances', 'retirements', etc.). 66 | registry_name : str 67 | Name of the registry for setting and mapping columns. 68 | arb : pd.DataFrame | None, optional 69 | Additional DataFrame for data merging (default is None). 70 | 71 | Returns 72 | ------- 73 | pd.DataFrame 74 | Processed DataFrame with APX credits data. 75 | """ 76 | 77 | df = df.copy() 78 | 79 | column_mapping = load_column_mapping( 80 | registry_name=registry_name, download_type=download_type, mapping_path=CREDIT_SCHEMA_UPATH 81 | ) 82 | 83 | columns = {v: k for k, v in column_mapping.items()} 84 | 85 | data = ( 86 | df.set_registry(registry_name=registry_name) 87 | .determine_transaction_type(download_type=download_type) 88 | .rename(columns=columns) 89 | ) 90 | 91 | # split the date and time and keeping only the date. 
this helps with the inconsistency in the date format 92 | data['transaction_date'] = data['transaction_date'].str.split().str[0] 93 | 94 | data = data.convert_to_datetime(columns=['transaction_date']) 95 | 96 | if download_type == 'issuances': 97 | data = data.aggregate_issuance_transactions() 98 | 99 | data = data.add_missing_columns(schema=credit_without_id_schema).validate( 100 | schema=credit_without_id_schema 101 | ) 102 | if arb is not None and not arb.empty: 103 | data = data.merge_with_arb(arb=arb) 104 | 105 | if harmonize_beneficiary_info: 106 | data = data.pipe( 107 | harmonize_beneficiary_data, registry_name=registry_name, download_type=download_type 108 | ) 109 | 110 | data = ( 111 | data.add_missing_columns(schema=credit_without_id_schema) 112 | .convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d') 113 | .validate(schema=credit_without_id_schema) 114 | ) 115 | return data 116 | 117 | 118 | def harmonize_acr_status(row: pd.Series) -> str: 119 | """Derive single project status for CAR and ACR projects 120 | 121 | Raw CAR and ACR data has two status columns -- one for compliance status, one for voluntary. 122 | Handle and harmonize. 123 | 124 | Parameters 125 | ---------- 126 | row : pd.Series 127 | A row from a pandas DataFrame 128 | 129 | Returns 130 | ------- 131 | value : str 132 | The status of the project 133 | """ 134 | if row['Compliance Program Status (ARB or Ecology)'] == 'Not ARB or Ecology Eligible': 135 | return row['Voluntary Status'].lower() 136 | ACR_COMPLIANCE_STATE_MAP = { 137 | 'Listed - Active ARB Project': 'active', 138 | 'ARB Completed': 'completed', 139 | 'ARB Inactive': 'completed', 140 | 'Listed - Proposed Project': 'listed', 141 | 'Listed - Active Registry Project': 'listed', 142 | 'ARB Terminated': 'completed', 143 | 'Submitted': 'listed', 144 | 'Transferred ARB or Ecology Project': 'active', 145 | 'Listed – Active ARB Project': 'active', 146 | } 147 | 148 | return ACR_COMPLIANCE_STATE_MAP.get( 149 | row['Compliance Program Status (ARB or Ecology)'], 'unknown' 150 | ) 151 | 152 | 153 | @pf.register_dataframe_method 154 | def add_project_url(df: pd.DataFrame, *, registry_name: str) -> pd.DataFrame: 155 | """ 156 | Add a project URL to each record in the DataFrame based on the registry name and project ID. 157 | 158 | Parameters 159 | ---------- 160 | df : pd.DataFrame 161 | Input DataFrame containing project data. 162 | registry_name : str 163 | Name of the registry ('american-carbon-registry', 'climate-action-reserve', 'art-trees'). 164 | 165 | Returns 166 | ------- 167 | pd.DataFrame 168 | DataFrame with a new 'project_url' column, containing URLs for each project. 169 | """ 170 | 171 | if registry_name == 'american-carbon-registry': 172 | base = 'https://acr2.apx.com/mymodule/reg/prjView.asp?id1=' 173 | elif registry_name == 'climate-action-reserve': 174 | base = 'https://thereserve2.apx.com/mymodule/reg/prjView.asp?id1=' 175 | elif registry_name == 'art-trees': 176 | base = 'https://art.apx.com/mymodule/reg/prjView.asp?id1=' 177 | 178 | else: 179 | raise ValueError(f'Unknown registry name: {registry_name}') 180 | 181 | df['project_url'] = base + df['project_id'].str[3:] 182 | return df 183 | 184 | 185 | @pf.register_dataframe_method 186 | def process_apx_projects( 187 | df: pd.DataFrame, *, credits: pd.DataFrame, registry_name: str 188 | ) -> pd.DataFrame: 189 | """ 190 | Process APX projects data, including renaming, adding, and validating columns, harmonizing statuses, 191 | and merging with credits data. 
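
    The full processing chain (see the function body) renames raw registry
    columns, maps protocols (ART TREES projects are assigned the fixed
    ``['art-trees']`` protocol), harmonizes status codes (ACR statuses are
    derived from the compliance/voluntary status pair), and then adds the
    registry name, project URL, harmonized country names, project types and
    categories, the compliance flag, credit totals, and first
    issuance/retirement dates before validating against the project schema.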
192 | 193 | Parameters 194 | ---------- 195 | df : pd.DataFrame 196 | Input DataFrame with raw projects data. 197 | credits : pd.DataFrame 198 | DataFrame containing credits data for merging. 199 | registry_name : str 200 | Name of the registry for specific processing steps. 201 | 202 | Returns 203 | ------- 204 | pd.DataFrame 205 | Processed DataFrame with harmonized and validated APX projects data. 206 | """ 207 | 208 | df = df.copy() 209 | credits = credits.copy() 210 | registry_project_column_mapping = load_registry_project_column_mapping( 211 | registry_name=registry_name, file_path=PROJECT_SCHEMA_UPATH 212 | ) 213 | inverted_column_mapping = {value: key for key, value in registry_project_column_mapping.items()} 214 | inverted_protocol_mapping = load_inverted_protocol_mapping() 215 | type_category_mapping = load_type_category_mapping() 216 | data = df.rename(columns=inverted_column_mapping) 217 | if registry_name == 'art-trees': 218 | data['protocol'] = [['art-trees']] * len(data) 219 | 220 | else: 221 | data = data.map_protocol(inverted_protocol_mapping=inverted_protocol_mapping) 222 | 223 | if registry_name == 'american-carbon-registry': 224 | data['status'] = data.apply(harmonize_acr_status, axis=1) 225 | else: 226 | data = data.harmonize_status_codes() 227 | 228 | data = ( 229 | data.set_registry(registry_name=registry_name) 230 | .add_project_url(registry_name=registry_name) 231 | .harmonize_country_names() 232 | .infer_project_type() 233 | .override_project_types( 234 | override_data_path=BERKELEY_PROJECT_TYPE_UPATH, source_str='berkeley' 235 | ) 236 | .add_category( 237 | type_category_mapping=type_category_mapping 238 | ) # must come after types; type -> category 239 | .map_project_type_to_display_name(type_category_mapping=type_category_mapping) 240 | .add_is_compliance_flag() 241 | .add_retired_and_issued_totals(credits=credits) 242 | .add_first_issuance_and_retirement_dates(credits=credits) 243 | .add_missing_columns(schema=project_schema) 244 | .convert_to_datetime(columns=['listed_at']) 245 | .validate(schema=project_schema) 246 | ) 247 | return data 248 | -------------------------------------------------------------------------------- /offsets_db_data/arb.py: -------------------------------------------------------------------------------- 1 | import janitor # noqa: F401 2 | import numpy as np 3 | import pandas as pd 4 | import pandas_flavor as pf 5 | 6 | from offsets_db_data.common import convert_to_datetime # noqa: F401 7 | from offsets_db_data.models import credit_without_id_schema 8 | 9 | 10 | def _get_registry(item): 11 | registry_map = { 12 | 'CAR': 'climate-action-reserve', 13 | 'ACR': 'american-carbon-registry', 14 | 'VCS': 'verra', 15 | 'ART': 'art-trees', 16 | } 17 | prefix = item[:3] 18 | return registry_map.get(prefix) 19 | 20 | 21 | @pf.register_dataframe_method 22 | def process_arb(df: pd.DataFrame) -> pd.DataFrame: 23 | """ 24 | Process ARB (Air Resources Board) data by renaming columns, handling nulls, interpolating vintages, 25 | and transforming the data structure for transactions. 26 | 27 | Parameters 28 | ---------- 29 | df : pd.DataFrame 30 | Input DataFrame containing raw ARB data. 31 | 32 | Returns 33 | ------- 34 | data : pd.DataFrame 35 | Processed DataFrame with ARB data. Columns include 'opr_id', 'vintage', 'issued_at' (interpolated), 36 | various credit transaction types, and quantities. The DataFrame is also validated against 37 | a predefined schema for credit data. 
38 | 39 | Notes 40 | ----- 41 | - The function renames columns for readability and standardization. 42 | - It interpolates missing vintage values and handles NaNs in 'issuance' column. 43 | - Retirement transactions are derived based on compliance period dates. 44 | - The DataFrame is melted to restructure credit data. 45 | - Zero retirement events are dropped as they are considered artifacts. 46 | - A prefix is added to 'project_id' to indicate the source. 47 | - The 'registry' column is derived based on the project_id prefix. 48 | - The 'vintage' column is converted to integer type. 49 | - Finally, the data is converted to datetime where necessary and validated against a predefined schema. 50 | """ 51 | 52 | df = df.copy() 53 | 54 | rename_d = { 55 | 'OPR Project ID': 'opr_id', 56 | 'ARB Offset Credits Issued': 'issuance', 57 | 'Project Type': 'project_type', 58 | 'Issuance Date': 'issued_at', 59 | 'Vintage': 'vintage', 60 | 'Retired Voluntarily': 'vcm_retirement', 61 | 'Retired 1st Compliance Period (CA)': 'first_compliance_ca', 62 | 'Retired 2nd Compliance Period (CA)': 'second_compliance_ca', 63 | 'Retired 3rd Compliance Period (CA)': 'third_compliance_ca', 64 | 'Retired 4th Compliance Period (CA)': 'fourth_compliance_ca', 65 | 'Retired for Compliance in Quebec': 'qc_compliance', 66 | } 67 | 68 | df = df.rename(columns=rename_d) 69 | df['vintage'] = df[ 70 | 'vintage' 71 | ].interpolate() # data is ordered; fills na vintage for zero issuance reporting periods 72 | 73 | df['project_type'] = df['project_type'].str.lower() 74 | 75 | # can be multiple issuance in single RP -- grab issuance ID so can aggregate later 76 | 77 | df = df.replace('reforest defer', np.nan) 78 | df.loc[pd.isna(df['issuance']), 'issuance'] = 0 79 | 80 | print(f'Loaded {len(df)} rows from ARB issuance table') 81 | df = df[rename_d.values()] 82 | 83 | compliance_period_dates = { 84 | 'vcm_retirement': np.datetime64('NaT'), 85 | 'qc_compliance': np.datetime64('NaT'), 86 | 'first_compliance_ca': np.datetime64('2016-03-21'), 87 | 'second_compliance_ca': np.datetime64('2018-11-01'), 88 | 'third_compliance_ca': np.datetime64('2021-11-01'), 89 | 'fourth_compliance_ca': np.datetime64('2022-11-01'), 90 | } 91 | # rename columns to what we want `transaction_type` to be in the end. then call melt 92 | # which casts to (opr_id, vintage, issued_at, transaction_type, quantity) 93 | credit_cols = [ 94 | 'issuance', 95 | 'vcm_retirement', 96 | 'first_compliance_ca', 97 | 'second_compliance_ca', 98 | 'third_compliance_ca', 99 | 'fourth_compliance_ca', 100 | 'qc_compliance', 101 | ] 102 | melted = df.melt( 103 | id_vars=['opr_id', 'vintage', 'issued_at'], 104 | value_vars=credit_cols, 105 | var_name='transaction_type', 106 | value_name='quantity', 107 | ) 108 | melted.loc[melted['transaction_type'].isin(compliance_period_dates.keys()), 'issued_at'] = ( 109 | melted['transaction_type'].map(compliance_period_dates) 110 | ) 111 | melted = melted.rename(columns={'issued_at': 'transaction_date'}).to_datetime( 112 | 'transaction_date', format='mixed', utc=True 113 | ) 114 | melted['transaction_type'] = melted.transaction_type.apply( 115 | lambda x: 'retirement' if x in compliance_period_dates else x 116 | ) 117 | 118 | # handle missing in retirement cols (i.e. 
ACR570 2022) 119 | melted.loc[pd.isna(melted['quantity']), 'quantity'] = 0 120 | 121 | # drop all th zero retirement events, as they're artifacts of processing steps 122 | data = melted[ 123 | ~((melted['transaction_type'] == 'retirement') & (melted['quantity'] == 0)) 124 | ].copy() 125 | # add a prefix to the project_id to indicate the source 126 | data['project_id'] = data.opr_id.apply( 127 | lambda item: item 128 | if isinstance(item, str) 129 | and (item.startswith('CAR') or item.startswith('ACR') or item.startswith('VCS')) 130 | else f'VCS{item}' 131 | ) 132 | data['registry'] = data.project_id.apply(_get_registry) 133 | data['vintage'] = data['vintage'].astype(int) 134 | 135 | data = ( 136 | data.add_missing_columns(schema=credit_without_id_schema) 137 | .convert_to_datetime(columns=['transaction_date']) 138 | .validate(schema=credit_without_id_schema) 139 | ) 140 | 141 | return data 142 | -------------------------------------------------------------------------------- /offsets_db_data/catalog.yaml: -------------------------------------------------------------------------------- 1 | metadata: 2 | description: https://carbonplan.org/research/offsets-db-explainer 3 | TERMS_OF_DATA_ACCESS: | 4 | # OffsetsDB 5 | 6 | OffsetsDB, created by CarbonPlan (https://carbonplan.org) is a regularly-updated snapshot of carbon offset projects, credit issuances, and credit retirements published by the following carbon offset registries: 7 | 8 | American Carbon Registry (ACR) 9 | ART TREES (ART) 10 | Climate Action Reserve (CAR) 11 | Gold Standard (GLD) 12 | Verra (VCS) 13 | 14 | Carbon offset information has historically been scattered across multiple locations in formats that are not particularly useful to researchers. This database is meant to increase transparency, accountability, and reliability of the carbon offset market, and to provide researchers with a robust tool for visualizing, validating, and cross-checking offsets. We hope you find it useful! 15 | 16 | ## Our Terms of Use Apply To OffsetsDB 17 | 18 | By downloading, copying, or using this project, and/or any associated content or data, you agree to CarbonPlan’s Terms Of Use, which can be found here: https://carbonplan.org/terms. As further explained in the Terms of Use, CarbonPlan makes its projects — including OffsetsDB — available strictly on an “as-is” and “as-available” basis, without warranty of any kind, including without limitation the warranties of merchantability, fitness for a particular purpose, and noninfringement. 19 | 20 | ## Intellectual Property Rights 21 | 22 | Because OffsetsDB consists of purely factual information concerning carbon offsets that has been made publicly available by the above-referenced registries, CarbonPlan does not claim copyright in this data. 23 | 24 | However, please note that CarbonPlan does not make any representation as to whether any of the above-referenced registries may claim any rights in the data they have published. If you have any questions or concerns about this, please reach out to the registries directly. 
25 | 26 | version: 1.0.0 27 | sources: 28 | credits: 29 | description: OffsetsDB processed and transformed data 30 | driver: parquet 31 | parameters: 32 | date: 33 | description: date of the data to load 34 | type: str 35 | default: '2024-02-13' 36 | args: 37 | urlpath: 's3://carbonplan-offsets-db/final/{{ date }}/credits-augmented.parquet' 38 | storage_options: { 'anon': True } 39 | engine: 'fastparquet' 40 | 41 | projects: 42 | description: OffsetsDB processed and transformed data 43 | driver: parquet 44 | parameters: 45 | date: 46 | description: date of the data to load 47 | type: str 48 | default: '2024-02-13' 49 | args: 50 | urlpath: 's3://carbonplan-offsets-db/final/{{ date }}/projects-augmented.parquet' 51 | storage_options: { 'anon': True } 52 | engine: 'fastparquet' 53 | 54 | raw_projects: 55 | description: Raw projects data downloaded from the registries on a daily basis 56 | driver: csv 57 | parameters: 58 | registry: 59 | description: registry name 60 | type: str 61 | default: verra 62 | allowed: 63 | - verra 64 | - art-trees 65 | - gold-standard 66 | - american-carbon-registry 67 | - climate-action-reserve 68 | 69 | date: 70 | description: date of the data to load 71 | type: str 72 | default: '2024-02-13' 73 | args: 74 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/{{ registry }}/projects.csv.gz' 75 | storage_options: { 'anon': True } 76 | 77 | raw_verra_transactions: 78 | description: Raw Verra transactions data downloaded from the registries on a daily basis 79 | driver: csv 80 | parameters: 81 | date: 82 | description: date of the data to load 83 | type: str 84 | default: '2024-02-13' 85 | args: 86 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/verra/transactions.csv.gz' 87 | storage_options: { 'anon': True } 88 | 89 | raw_gold_standard_transactions: 90 | description: Raw Gold Standard transactions data downloaded from the registries on a daily basis 91 | driver: csv 92 | parameters: 93 | date: 94 | description: date of the data to load 95 | type: str 96 | default: '2024-02-13' 97 | transaction_type: 98 | description: transaction type 99 | type: str 100 | default: 'issuances' 101 | allowed: 102 | - issuances 103 | - retirements 104 | - cancellations 105 | args: 106 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/gold-standard/{{ transaction_type }}.csv.gz' 107 | storage_options: { 'anon': True } 108 | 109 | raw_art_trees_transactions: 110 | description: Raw Art Trees transactions data downloaded from the registries on a daily basis 111 | driver: csv 112 | parameters: 113 | date: 114 | description: date of the data to load 115 | type: str 116 | default: '2024-02-13' 117 | transaction_type: 118 | description: transaction type 119 | type: str 120 | default: 'issuances' 121 | allowed: 122 | - issuances 123 | - retirements 124 | - cancellations 125 | args: 126 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/art-trees/{{ transaction_type }}.csv.gz' 127 | storage_options: { 'anon': True } 128 | 129 | raw_american_carbon_registry_transactions: 130 | description: Raw American Carbon Registry transactions data downloaded from the registries on a daily basis 131 | driver: csv 132 | parameters: 133 | date: 134 | description: date of the data to load 135 | type: str 136 | default: '2024-02-13' 137 | transaction_type: 138 | description: transaction type 139 | type: str 140 | default: 'issuances' 141 | allowed: 142 | - issuances 143 | - retirements 144 | - cancellations 145 | args: 146 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date 
}}/american-carbon-registry/{{ transaction_type }}.csv.gz' 147 | storage_options: { 'anon': True } 148 | 149 | raw_climate_action_reserve_transactions: 150 | description: Raw Climate Action Reserve transactions data downloaded from the registries on a daily basis 151 | driver: csv 152 | parameters: 153 | date: 154 | description: date of the data to load 155 | type: str 156 | default: '2024-02-13' 157 | transaction_type: 158 | description: transaction type 159 | type: str 160 | default: 'issuances' 161 | allowed: 162 | - issuances 163 | - retirements 164 | - cancellations 165 | args: 166 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/climate-action-reserve/{{ transaction_type }}.csv.gz' 167 | storage_options: { 'anon': True } 168 | -------------------------------------------------------------------------------- /offsets_db_data/common.py: -------------------------------------------------------------------------------- 1 | import json 2 | import typing 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pandas_flavor as pf 8 | import pandera as pa 9 | import upath 10 | 11 | CREDIT_SCHEMA_UPATH = ( 12 | upath.UPath(__file__).parents[0] / 'configs' / 'credits-raw-columns-mapping.json' 13 | ) 14 | PROTOCOL_MAPPING_UPATH = upath.UPath(__file__).parents[0] / 'configs' / 'all-protocol-mapping.json' 15 | PROJECT_SCHEMA_UPATH = ( 16 | upath.UPath(__file__).parents[0] / 'configs' / 'projects-raw-columns-mapping.json' 17 | ) 18 | TYPE_CATEGORY_MAPPING_UPATH = ( 19 | upath.UPath(__file__).parents[0] / 'configs' / 'type-category-mapping.json' 20 | ) 21 | 22 | BERKELEY_PROJECT_TYPE_UPATH = ( 23 | upath.UPath(__file__).parents[0] / 'configs' / 'berkeley-project-types.json' 24 | ) 25 | 26 | 27 | def load_registry_project_column_mapping( 28 | *, registry_name: str, file_path: upath.UPath = PROJECT_SCHEMA_UPATH 29 | ) -> dict: 30 | with open(file_path) as file: 31 | data = json.load(file) 32 | 33 | mapping: dict = {} 34 | for key1, value_dict in data.items(): 35 | for key2, value in value_dict.items(): 36 | if key2 not in mapping: 37 | mapping[key2] = {} 38 | if value: 39 | mapping[key2][key1] = value 40 | return mapping[registry_name] 41 | 42 | 43 | def load_protocol_mapping(path: upath.UPath = PROTOCOL_MAPPING_UPATH) -> dict: 44 | return json.loads(path.read_text()) 45 | 46 | 47 | def load_inverted_protocol_mapping() -> dict: 48 | protocol_mapping = load_protocol_mapping() 49 | store = defaultdict(list) 50 | for protocol_str, metadata in protocol_mapping.items(): 51 | for known_string in metadata.get('known-strings', []): 52 | store[known_string].append(protocol_str) 53 | 54 | return store 55 | 56 | 57 | def load_column_mapping(*, registry_name: str, download_type: str, mapping_path: str) -> dict: 58 | with open(mapping_path) as f: 59 | registry_credit_column_mapping = json.load(f) 60 | return registry_credit_column_mapping[registry_name][download_type] 61 | 62 | 63 | def load_type_category_mapping(path: upath.UPath = TYPE_CATEGORY_MAPPING_UPATH) -> dict: 64 | return json.loads(path.read_text()) 65 | 66 | 67 | @pf.register_dataframe_method 68 | def set_registry(df: pd.DataFrame, registry_name: str) -> pd.DataFrame: 69 | """ 70 | Set the registry name for each record in the DataFrame. 71 | 72 | Parameters 73 | ---------- 74 | df : pd.DataFrame 75 | Input DataFrame. 76 | registry_name : str 77 | Name of the registry to set. 
78 | 79 | Returns 80 | ------- 81 | pd.DataFrame 82 | DataFrame with a new 'registry' column set to the specified registry name.""" 83 | 84 | df['registry'] = registry_name 85 | return df 86 | 87 | 88 | @pf.register_dataframe_method 89 | def convert_to_datetime( 90 | df: pd.DataFrame, *, columns: list, utc: bool = True, **kwargs: typing.Any 91 | ) -> pd.DataFrame: 92 | """ 93 | Convert specified columns in the DataFrame to datetime format. 94 | 95 | Parameters 96 | ---------- 97 | df : pd.DataFrame 98 | Input DataFrame. 99 | columns : list 100 | List of column names to convert to datetime. 101 | utc : bool, optional 102 | Whether to convert to UTC (default is True). 103 | **kwargs : typing.Any 104 | Additional keyword arguments passed to pd.to_datetime. 105 | 106 | Returns 107 | ------- 108 | pd.DataFrame 109 | DataFrame with specified columns converted to datetime format. 110 | """ 111 | 112 | for column in columns: 113 | if column not in df.columns: 114 | raise KeyError(f"The column '{column}' is missing.") 115 | try: 116 | df[column] = pd.to_datetime(df[column], utc=utc, **kwargs).dt.normalize() 117 | except ValueError: 118 | df[column] = pd.to_datetime(df[column], utc=utc).dt.normalize() 119 | return df 120 | 121 | 122 | @pf.register_dataframe_method 123 | def add_missing_columns(df: pd.DataFrame, *, schema: pa.DataFrameSchema) -> pd.DataFrame: 124 | """ 125 | Add any missing columns to the DataFrame and initialize them with None. 126 | 127 | Parameters 128 | ---------- 129 | df : pd.DataFrame 130 | Input DataFrame. 131 | schema : pa.DataFrameSchema 132 | Pandera schema to validate against. 133 | 134 | 135 | Returns 136 | ------- 137 | pd.DataFrame 138 | DataFrame with all specified columns, adding missing ones initialized to None. 139 | """ 140 | 141 | default_values = { 142 | np.dtype('int64'): 0, 143 | np.dtype('int32'): 0, 144 | np.dtype('float64'): 0.0, 145 | np.dtype('float32'): 0.0, 146 | np.dtype('O'): None, 147 | np.dtype(' pd.DataFrame: 163 | """ 164 | Validate the DataFrame against a given Pandera schema. 165 | 166 | Parameters 167 | ---------- 168 | df : pd.DataFrame 169 | Input DataFrame. 170 | schema : pa.DataFrameSchema 171 | Pandera schema to validate against. 172 | 173 | Returns 174 | ------- 175 | pd.DataFrame 176 | DataFrame with columns sorted according to the schema and validated against it. 177 | """ 178 | 179 | results = schema.validate(df) 180 | keys = sorted(list(schema.columns.keys())) 181 | results = results[keys] 182 | 183 | return results 184 | 185 | 186 | @pf.register_dataframe_method 187 | def clean_and_convert_numeric_columns(df: pd.DataFrame, *, columns: list[str]) -> pd.DataFrame: 188 | """ 189 | Clean and convert specified columns to numeric format in the DataFrame. 190 | 191 | Parameters 192 | ---------- 193 | df : pd.DataFrame 194 | Input DataFrame. 195 | columns : list[str] 196 | List of column names to clean and convert to numeric format. 197 | 198 | Returns 199 | ------- 200 | pd.DataFrame 201 | DataFrame with specified columns cleaned (removing commas) and converted to numeric format. 
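
    Examples
    --------
    A minimal sketch (values are illustrative; the accessor is available once
    this module has been imported)::

        df = pd.DataFrame({'quantity': ['1,234', 'n/a']})
        df = df.clean_and_convert_numeric_columns(columns=['quantity'])
        # 'quantity' is now [1234.0, NaN]; non-numeric strings are coerced to NaN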
202 | """ 203 | 204 | for column in columns: 205 | df[column] = df[column].str.replace(',', '', regex=True) 206 | df[column] = pd.to_numeric(df[column], errors='coerce') 207 | return df 208 | -------------------------------------------------------------------------------- /offsets_db_data/configs/credits-raw-columns-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "american-carbon-registry": { 3 | "cancellations": { 4 | "project_id": "Project ID", 5 | "quantity": "Quantity of Credits", 6 | "retirement_account": null, 7 | "retirement_beneficiary": null, 8 | "retirement_note": null, 9 | "retirement_reason": null, 10 | "transaction_date": "Status Effective (GMT)", 11 | "vintage": "Vintage" 12 | }, 13 | "issuances": { 14 | "project_id": "Project ID", 15 | "quantity": "Total Credits Issued", 16 | "retirement_account": null, 17 | "retirement_beneficiary": null, 18 | "retirement_note": null, 19 | "retirement_reason": null, 20 | "transaction_date": "Date Issued (GMT)", 21 | "vintage": "Vintage" 22 | }, 23 | "retirements": { 24 | "project_id": "Project ID", 25 | "quantity": "Quantity of Credits", 26 | "retirement_account": "Account Holder", 27 | "retirement_beneficiary": null, 28 | "retirement_note": "Purpose of Retirement", 29 | "retirement_reason": "Retirement Reason", 30 | "transaction_date": "Status Effective (GMT)", 31 | "vintage": "Vintage" 32 | } 33 | }, 34 | "art-trees": { 35 | "cancellations": { 36 | "project_id": "Program ID", 37 | "quantity": "Quantity of Credits", 38 | "retirement_account": null, 39 | "retirement_beneficiary": null, 40 | "retirement_note": null, 41 | "retirement_reason": null, 42 | "transaction_date": "Status Effective", 43 | "vintage": "Vintage" 44 | }, 45 | "issuances": { 46 | "project_id": "Program ID", 47 | "quantity": "Credits Verified", 48 | "retirement_account": null, 49 | "retirement_beneficiary": null, 50 | "retirement_note": null, 51 | "retirement_reason": null, 52 | "transaction_date": "Date Approved", 53 | "vintage": "Vintage" 54 | }, 55 | "retirements": { 56 | "project_id": "Program ID", 57 | "quantity": "Quantity of Credits", 58 | "retirement_account": "Account Holder", 59 | "retirement_beneficiary": null, 60 | "retirement_note": "Retirement Reason Details", 61 | "retirement_reason": "Retirement Reason", 62 | "transaction_date": "Status Effective", 63 | "vintage": "Vintage" 64 | } 65 | }, 66 | "climate-action-reserve": { 67 | "cancellations": { 68 | "project_id": "Project ID", 69 | "quantity": "Quantity of Offset Credits", 70 | "retirement_account": null, 71 | "retirement_beneficiary": null, 72 | "retirement_note": null, 73 | "retirement_reason": null, 74 | "transaction_date": "Status Effective", 75 | "vintage": "Vintage" 76 | }, 77 | "issuances": { 78 | "project_id": "Project ID", 79 | "quantity": "Total Offset Credits Issued", 80 | "retirement_account": null, 81 | "retirement_beneficiary": null, 82 | "retirement_note": null, 83 | "retirement_reason": null, 84 | "transaction_date": "Date Issued", 85 | "vintage": "Vintage" 86 | }, 87 | "retirements": { 88 | "project_id": "Project ID", 89 | "quantity": "Quantity of Offset Credits", 90 | "retirement_account": "Account Holder", 91 | "retirement_beneficiary": null, 92 | "retirement_note": "Retirement Reason Details", 93 | "retirement_reason": "Retirement Reason", 94 | "transaction_date": "Status Effective", 95 | "vintage": "Vintage" 96 | } 97 | }, 98 | "gold-standard": { 99 | "issuances": { 100 | "project_id": "GSID", 101 | "quantity": "Quantity", 102 | 
"retirement_account": null, 103 | "retirement_beneficiary": null, 104 | "retirement_note": null, 105 | "retirement_reason": null, 106 | "transaction_date": "Issuance Date", 107 | "vintage": "Vintage" 108 | }, 109 | "retirements": { 110 | "project_id": "GSID", 111 | "quantity": "Quantity", 112 | "retirement_account": null, 113 | "retirement_beneficiary": "Using Entity", 114 | "retirement_note": "Note", 115 | "retirement_reason": null, 116 | "transaction_date": "Retirement Date", 117 | "vintage": "Vintage" 118 | } 119 | }, 120 | "verra": { 121 | "transactions": { 122 | "project_id": null, 123 | "quantity": null, 124 | "retirement_account": null, 125 | "retirement_beneficiary": "Retirement Beneficiary", 126 | "retirement_note": "Retirement Details", 127 | "retirement_reason": "Retirement Reason", 128 | "transaction_date": null, 129 | "vintage": null 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /offsets_db_data/configs/projects-raw-columns-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "country": { 3 | "american-carbon-registry": "Project Site Country", 4 | "art-trees": "Program Country", 5 | "climate-action-reserve": "Project Site Country", 6 | "gold-standard": "Country", 7 | "verra": "Country/Area" 8 | }, 9 | "listed_at": { 10 | "american-carbon-registry": null, 11 | "art-trees": null, 12 | "climate-action-reserve": "Project Listed Date", 13 | "gold-standard": null, 14 | "verra": null 15 | }, 16 | "name": { 17 | "american-carbon-registry": "Project Name", 18 | "art-trees": "Program Name", 19 | "climate-action-reserve": "Project Name", 20 | "gold-standard": "Project Name", 21 | "verra": "Name" 22 | }, 23 | "original_protocol": { 24 | "american-carbon-registry": "Project Methodology/Protocol", 25 | "art-trees": null, 26 | "climate-action-reserve": "Project Type", 27 | "gold-standard": "Methodology", 28 | "verra": "Methodology" 29 | }, 30 | "project_id": { 31 | "american-carbon-registry": "Project ID", 32 | "art-trees": "Program ID", 33 | "climate-action-reserve": "Project ID", 34 | "gold-standard": "GSID", 35 | "verra": "ID" 36 | }, 37 | "proponent": { 38 | "american-carbon-registry": null, 39 | "art-trees": "Sovereign Program Developer", 40 | "climate-action-reserve": "Project Owner", 41 | "gold-standard": "Project Developer Name", 42 | "verra": "Proponent" 43 | }, 44 | "status": { 45 | "american-carbon-registry": null, 46 | "art-trees": "Status", 47 | "climate-action-reserve": "Status", 48 | "gold-standard": "Status", 49 | "verra": "Status" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /offsets_db_data/configs/type-category-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "advanced refrigerants": { 3 | "category": "ghg-management", 4 | "project-type-display-name": "Advanced Refrigerants" 5 | }, 6 | "afforestation/reforestation": { 7 | "category": "forest", 8 | "project-type-display-name": "Afforestation + Reforestation" 9 | }, 10 | "aluminum smelters emission reductions": { 11 | "category": "fuel-switching", 12 | "project-type-display-name": "Aluminum Smelter" 13 | }, 14 | "avoided forest conversion": { 15 | "category": "forest", 16 | "project-type-display-name": "Avoided Forest Conversion" 17 | }, 18 | "avoided grassland conversion": { 19 | "category": "land-use", 20 | "project-type-display-name": "Avoided Grassland Conversion" 21 | }, 22 | "bicycles": { 23 | 
"category": "fuel-switching", 24 | "project-type-display-name": "Bicycle" 25 | }, 26 | "biochar": { 27 | "category": "biochar", 28 | "project-type-display-name": "Biochar" 29 | }, 30 | "biodigesters": { 31 | "category": "ghg-management", 32 | "project-type-display-name": "Methane Biodigester" 33 | }, 34 | "biomass": { 35 | "category": "fuel-switching", 36 | "project-type-display-name": "Biomass" 37 | }, 38 | "brick manufacturing emission reductions": { 39 | "category": "energy-efficiency", 40 | "project-type-display-name": "Brick Manufacturing" 41 | }, 42 | "bundled compost production and soil application": { 43 | "category": "ghg-management", 44 | "project-type-display-name": "Compost" 45 | }, 46 | "bundled energy efficiency": { 47 | "category": "energy-efficiency", 48 | "project-type-display-name": "Energy Efficiency" 49 | }, 50 | "carbon capture & enhanced oil recovery": { 51 | "category": "ghg-management", 52 | "project-type-display-name": "Enhanced Oil Recovery" 53 | }, 54 | "carbon capture in concrete": { 55 | "category": "carbon-capture", 56 | "project-type-display-name": "Concrete CCS" 57 | }, 58 | "carbon capture in plastic": { 59 | "category": "carbon-capture", 60 | "project-type-display-name": "Plastic CCS" 61 | }, 62 | "carbon-absorbing concrete": { 63 | "category": "carbon-capture", 64 | "project-type-display-name": "Concrete CCS" 65 | }, 66 | "clean water": { 67 | "category": "energy-efficiency", 68 | "project-type-display-name": "Clean Water" 69 | }, 70 | "community boreholes": { 71 | "category": "energy-efficiency", 72 | "project-type-display-name": "Borehole" 73 | }, 74 | "composting": { 75 | "category": "ghg-management", 76 | "project-type-display-name": "Compost" 77 | }, 78 | "cookstoves": { 79 | "category": "fuel-switching", 80 | "project-type-display-name": "Cookstove" 81 | }, 82 | "electric vehicles & charging": { 83 | "category": "fuel-switching", 84 | "project-type-display-name": "Electric Vehicle" 85 | }, 86 | "energy efficiency": { 87 | "category": "energy-efficiency", 88 | "project-type-display-name": "Energy Efficiency" 89 | }, 90 | "feed additives": { 91 | "category": "ghg-management", 92 | "project-type-display-name": "Feed Additive" 93 | }, 94 | "fleet efficiency": { 95 | "category": "energy-efficiency", 96 | "project-type-display-name": "Fleet Efficiency" 97 | }, 98 | "fuel switching": { 99 | "category": "fuel-switching", 100 | "project-type-display-name": "Fuel Switching" 101 | }, 102 | "fuel transport": { 103 | "category": "fuel-switching", 104 | "project-type-display-name": "Fuel Transport" 105 | }, 106 | "geothermal": { 107 | "category": "renewable-energy", 108 | "project-type-display-name": "Geothermal" 109 | }, 110 | "grid expansion & mini-grids": { 111 | "category": "fuel-switching", 112 | "project-type-display-name": "Grid Improvements" 113 | }, 114 | "hfc refrigerant reclamation": { 115 | "category": "ghg-management", 116 | "project-type-display-name": "HFC Reclamation" 117 | }, 118 | "hfc replacement in foam production": { 119 | "category": "ghg-management", 120 | "project-type-display-name": "HFC Replacement" 121 | }, 122 | "hfc23 destruction": { 123 | "category": "ghg-management", 124 | "project-type-display-name": "HFC Destruction" 125 | }, 126 | "hydropower": { 127 | "category": "renewable-energy", 128 | "project-type-display-name": "Hydropower" 129 | }, 130 | "improved forest management": { 131 | "category": "forest", 132 | "project-type-display-name": "Improved Forest Management" 133 | }, 134 | "improved irrigation management": { 135 | 
"category": "agriculture", 136 | "project-type-display-name": "Improved Irrigation Management" 137 | }, 138 | "landfill methane": { 139 | "category": "ghg-management", 140 | "project-type-display-name": "Landfill" 141 | }, 142 | "leak detection & repair in gas systems": { 143 | "category": "ghg-management", 144 | "project-type-display-name": "Gas Leak Repair" 145 | }, 146 | "lighting": { 147 | "category": "energy-efficiency", 148 | "project-type-display-name": "Lighting" 149 | }, 150 | "lower carbon cement & concrete": { 151 | "category": "energy-efficiency", 152 | "project-type-display-name": "Low Carbon Concrete" 153 | }, 154 | "manure methane digester": { 155 | "category": "ghg-management", 156 | "project-type-display-name": "Manure Biodigester" 157 | }, 158 | "mass transit": { 159 | "category": "energy-efficiency", 160 | "project-type-display-name": "Mass Transit" 161 | }, 162 | "methane recovery in wastewater": { 163 | "category": "ghg-management", 164 | "project-type-display-name": "Wastewater Methane" 165 | }, 166 | "mine methane capture": { 167 | "category": "ghg-management", 168 | "project-type-display-name": "Mine Methane" 169 | }, 170 | "n2o destruction in adipic acid production": { 171 | "category": "ghg-management", 172 | "project-type-display-name": "N\\u2082O Destruction (Adipic Acid)" 173 | }, 174 | "n2o destruction in nitric acid production": { 175 | "category": "ghg-management", 176 | "project-type-display-name": "N\\u2082O Destruction (Nitric Acid)" 177 | }, 178 | "natural gas electricity generation": { 179 | "category": "fuel-switching", 180 | "project-type-display-name": "Natural Gas" 181 | }, 182 | "nitrogen management": { 183 | "category": "agriculture", 184 | "project-type-display-name": "Nitrogen Management" 185 | }, 186 | "oil recycling": { 187 | "category": "energy-efficiency", 188 | "project-type-display-name": "Oil Recycling" 189 | }, 190 | "ozone depleting substances recovery & destruction": { 191 | "category": "ghg-management", 192 | "project-type-display-name": "Ozone Depleting Substances" 193 | }, 194 | "plugging oil & gas wells": { 195 | "category": "ghg-management", 196 | "project-type-display-name": "Oil + Gas Well" 197 | }, 198 | "pneumatic retrofit": { 199 | "category": "ghg-management", 200 | "project-type-display-name": "Pneumatic Retrofit" 201 | }, 202 | "propylene oxide production": { 203 | "category": "ghg-management", 204 | "project-type-display-name": "Propylene Oxide" 205 | }, 206 | "re bundled": { 207 | "category": "renewable-energy", 208 | "project-type-display-name": "Renewable Energy" 209 | }, 210 | "redd+": { 211 | "category": "forest", 212 | "project-type-display-name": "REDD+" 213 | }, 214 | "refrigerant leak detection": { 215 | "category": "ghg-management", 216 | "project-type-display-name": "Refrigerant Leak" 217 | }, 218 | "rice emission reductions": { 219 | "category": "agriculture", 220 | "project-type-display-name": "Rice Emission" 221 | }, 222 | "road construction emission reductions": { 223 | "category": "energy-efficiency", 224 | "project-type-display-name": "Road Construction" 225 | }, 226 | "sf6 replacement": { 227 | "category": "ghg-management", 228 | "project-type-display-name": "SF\\u2086 Replacement" 229 | }, 230 | "shipping": { 231 | "category": "energy-efficiency", 232 | "project-type-display-name": "Shipping" 233 | }, 234 | "solar - centralized": { 235 | "category": "renewable-energy", 236 | "project-type-display-name": "Centralized Solar" 237 | }, 238 | "solar - distributed": { 239 | "category": "renewable-energy", 240 
| "project-type-display-name": "Distributed Solar" 241 | }, 242 | "solar lighting": { 243 | "category": "renewable-energy", 244 | "project-type-display-name": "Lighting" 245 | }, 246 | "solar water heaters": { 247 | "category": "renewable-energy", 248 | "project-type-display-name": "Solar Water Heater" 249 | }, 250 | "solid waste separation": { 251 | "category": "ghg-management", 252 | "project-type-display-name": "Solid Waste Separation" 253 | }, 254 | "sustainable agriculture": { 255 | "category": "agriculture", 256 | "project-type-display-name": "Sustainable Agriculture" 257 | }, 258 | "sustainable grassland management": { 259 | "category": "land-use", 260 | "project-type-display-name": "Grassland Management" 261 | }, 262 | "truck stop electrification": { 263 | "category": "energy-efficiency", 264 | "project-type-display-name": "Truck Stop" 265 | }, 266 | "university campus emission reductions": { 267 | "category": "energy-efficiency", 268 | "project-type-display-name": "University" 269 | }, 270 | "waste diversion": { 271 | "category": "ghg-management", 272 | "project-type-display-name": "Waste Diversion" 273 | }, 274 | "waste gas recovery": { 275 | "category": "ghg-management", 276 | "project-type-display-name": "Waste Gas Recovery" 277 | }, 278 | "waste heat recovery": { 279 | "category": "energy-efficiency", 280 | "project-type-display-name": "Waste Heat Recovery" 281 | }, 282 | "waste incineration": { 283 | "category": "fuel-switching", 284 | "project-type-display-name": "Waste Incineration" 285 | }, 286 | "waste recycling": { 287 | "category": "energy-efficiency", 288 | "project-type-display-name": "Recycling" 289 | }, 290 | "waste reduction": { 291 | "category": "ghg-management", 292 | "project-type-display-name": "Waste Reduction" 293 | }, 294 | "weatherization": { 295 | "category": "energy-efficiency", 296 | "project-type-display-name": "Weatherization" 297 | }, 298 | "wetland restoration": { 299 | "category": "land-use", 300 | "project-type-display-name": "Wetland" 301 | }, 302 | "wind": { 303 | "category": "renewable-energy", 304 | "project-type-display-name": "Wind" 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /offsets_db_data/credits.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import pathlib 3 | import subprocess 4 | import tempfile 5 | import uuid 6 | 7 | import janitor # noqa: F401 8 | import numpy as np 9 | import pandas as pd 10 | import pandas_flavor as pf 11 | import upath 12 | 13 | BENEFICIARY_MAPPING_UPATH = ( 14 | upath.UPath(__file__).parents[0] / 'configs' / 'beneficiary-mappings.json' 15 | ) 16 | 17 | 18 | @pf.register_dataframe_method 19 | def aggregate_issuance_transactions(df: pd.DataFrame) -> pd.DataFrame: 20 | """ 21 | Aggregate issuance transactions by summing the quantity for each combination of project ID, transaction date, and vintage. 22 | 23 | Parameters 24 | ---------- 25 | df : pd.DataFrame 26 | Input DataFrame containing issuance transaction data. 27 | 28 | Returns 29 | ------- 30 | pd.DataFrame 31 | DataFrame with aggregated issuance transactions, filtered to include only those with a positive quantity. 
32 | """ 33 | 34 | # Check if 'transaction_type' exists in DataFrame columns 35 | if 'transaction_type' not in df.columns: 36 | raise KeyError("The column 'transaction_type' is missing.") 37 | 38 | # Initialize df_issuance_agg to an empty DataFrame 39 | df_issuance_agg = pd.DataFrame() 40 | df_issuance = df[df['transaction_type'] == 'issuance'] 41 | 42 | if not df_issuance.empty: 43 | df_issuance_agg = ( 44 | df_issuance.groupby(['project_id', 'transaction_date', 'vintage']) 45 | .agg( 46 | { 47 | 'quantity': 'sum', 48 | 'registry': 'first', 49 | 'transaction_type': 'first', 50 | } 51 | ) 52 | .reset_index() 53 | ) 54 | df_issuance_agg = df_issuance_agg[df_issuance_agg['quantity'] > 0] 55 | return df_issuance_agg 56 | 57 | 58 | @pf.register_dataframe_method 59 | def filter_and_merge_transactions( 60 | df: pd.DataFrame, arb_data: pd.DataFrame, project_id_column: str = 'project_id' 61 | ) -> pd.DataFrame: 62 | """ 63 | Filter transactions based on project ID intersection with ARB data and merge the filtered transactions. 64 | 65 | Parameters 66 | ---------- 67 | df : pd.DataFrame 68 | Input DataFrame with transaction data. 69 | arb_data : pd.DataFrame 70 | DataFrame containing ARB issuance data. 71 | project_id_column : str, optional 72 | The name of the column containing project IDs (default is 'project_id'). 73 | 74 | Returns 75 | ------- 76 | pd.DataFrame 77 | DataFrame with transactions from the input DataFrame, excluding those present in ARB data, merged with relevant ARB transactions. 78 | """ 79 | 80 | if intersection_values := list( 81 | set(df[project_id_column]).intersection(set(arb_data[project_id_column])) 82 | ): 83 | df = df[~df[project_id_column].isin(intersection_values)] 84 | df = pd.concat( 85 | [df, arb_data[arb_data[project_id_column].isin(intersection_values)]], ignore_index=True 86 | ) 87 | return df 88 | 89 | 90 | @pf.register_dataframe_method 91 | def handle_non_issuance_transactions(df: pd.DataFrame) -> pd.DataFrame: 92 | """ 93 | Filter the DataFrame to include only non-issuance transactions. 94 | 95 | Parameters 96 | ---------- 97 | df : pd.DataFrame 98 | Input DataFrame containing transaction data. 99 | 100 | Returns 101 | ------- 102 | pd.DataFrame 103 | DataFrame containing only transactions where 'transaction_type' is not 'issuance'. 104 | """ 105 | 106 | df_non_issuance = df[df['transaction_type'] != 'issuance'] 107 | return df_non_issuance 108 | 109 | 110 | @pf.register_dataframe_method 111 | def merge_with_arb(credits: pd.DataFrame, *, arb: pd.DataFrame) -> pd.DataFrame: 112 | """ 113 | ARB issuance table contains the authorative version of all credit transactions for ARB projects. 114 | This function drops all registry crediting data and, isntead, patches in data from the ARB issuance table. 
115 | 116 | Parameters 117 | ---------- 118 | credits: pd.DataFrame 119 | Pandas dataframe containing registry credit data 120 | arb: pd.DataFrame 121 | Pandas dataframe containing ARB issuance data 122 | 123 | Returns 124 | ------- 125 | pd.DataFrame 126 | Pandas dataframe containing merged credit and ARB data 127 | """ 128 | df = credits 129 | project_id_column = 'project_id' 130 | if intersection_values := list( 131 | set(df[project_id_column]).intersection(set(arb[project_id_column])) 132 | ): 133 | df = df[~df[project_id_column].isin(intersection_values)] 134 | 135 | df = pd.concat([df, arb], ignore_index=True) 136 | return df 137 | 138 | 139 | def harmonize_beneficiary_data( 140 | credits: pd.DataFrame, registry_name: str, download_type: str 141 | ) -> pd.DataFrame: 142 | """ 143 | Harmonize the beneficiary information via OpenRefine. 144 | 145 | Parameters 146 | ---------- 147 | credits : pd.DataFrame 148 | Input DataFrame containing credit data. 149 | """ 150 | 151 | tempdir = tempfile.gettempdir() 152 | temp_path = pathlib.Path(tempdir) / f'{registry_name}-{download_type}-credits.csv' 153 | 154 | if len(credits) == 0: 155 | print( 156 | f'Empty dataframe with shape={credits.shape} - columns:{credits.columns.tolist()}. No credits to harmonize' 157 | ) 158 | data = credits.copy() 159 | data['retirement_beneficiary_harmonized'] = pd.Series(dtype='str') 160 | return data 161 | credits.to_csv(temp_path, index=False) 162 | 163 | project_name = f'{registry_name}-{download_type}-beneficiary-harmonization-{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}-{uuid.uuid4()}' 164 | output_path = pathlib.Path(tempdir) / f'{project_name}.csv' 165 | 166 | try: 167 | return _extract_harmonized_beneficiary_data_via_openrefine( 168 | temp_path, project_name, str(BENEFICIARY_MAPPING_UPATH), str(output_path) 169 | ) 170 | 171 | except subprocess.CalledProcessError as e: 172 | raise ValueError( 173 | f'Commad failed with return code: {e.returncode}\nOutput: {e.output}\nError output: {e.stderr}' 174 | ) from e 175 | 176 | 177 | def _extract_harmonized_beneficiary_data_via_openrefine( 178 | temp_path, project_name, beneficiary_mapping_path, output_path 179 | ): 180 | result = subprocess.run( 181 | [ 182 | 'offsets-db-data-orcli', 183 | 'run', 184 | '--', 185 | 'import', 186 | 'csv', 187 | str(temp_path), 188 | '--projectName', 189 | f'{project_name}', 190 | ], 191 | capture_output=True, 192 | text=True, 193 | check=True, 194 | ) 195 | 196 | result = subprocess.run( 197 | ['offsets-db-data-orcli', 'run', '--', 'info', project_name], 198 | capture_output=True, 199 | text=True, 200 | check=True, 201 | ) 202 | 203 | result = subprocess.run( 204 | [ 205 | 'offsets-db-data-orcli', 206 | 'run', 207 | '--', 208 | 'transform', 209 | project_name, 210 | beneficiary_mapping_path, 211 | ], 212 | capture_output=True, 213 | text=True, 214 | check=True, 215 | ) 216 | 217 | result = subprocess.run( 218 | [ 219 | 'offsets-db-data-orcli', 220 | 'run', 221 | '--', 222 | 'export', 223 | 'csv', 224 | project_name, 225 | '--output', 226 | output_path, 227 | ], 228 | capture_output=True, 229 | text=True, 230 | check=True, 231 | ) 232 | 233 | result = subprocess.run( 234 | ['offsets-db-data-orcli', 'run', '--', 'delete', project_name], 235 | capture_output=True, 236 | text=True, 237 | check=True, 238 | ) 239 | 240 | print(result.stdout) 241 | 242 | data = pd.read_csv(output_path) 243 | data['merged_beneficiary'] = data['merged_beneficiary'].fillna('').astype(str) 244 | data['retirement_beneficiary_harmonized'] = np.where( 245 | 
data['merged_beneficiary'].notnull() & (~data['merged_beneficiary'].str.contains(';%')), 246 | data['merged_beneficiary'], 247 | np.nan, 248 | ) 249 | return data 250 | -------------------------------------------------------------------------------- /offsets_db_data/data.py: -------------------------------------------------------------------------------- 1 | import intake 2 | import pkg_resources 3 | 4 | catalog_file = pkg_resources.resource_filename('offsets_db_data', 'catalog.yaml') 5 | catalog = intake.open_catalog(catalog_file) 6 | -------------------------------------------------------------------------------- /offsets_db_data/gld.py: -------------------------------------------------------------------------------- 1 | import numpy as np # noqa: F401, I001 2 | import pandas as pd 3 | import pandas_flavor as pf 4 | 5 | from offsets_db_data.common import ( 6 | BERKELEY_PROJECT_TYPE_UPATH, 7 | CREDIT_SCHEMA_UPATH, 8 | PROJECT_SCHEMA_UPATH, 9 | load_column_mapping, 10 | load_inverted_protocol_mapping, 11 | load_registry_project_column_mapping, 12 | load_type_category_mapping, 13 | ) 14 | from offsets_db_data.credits import aggregate_issuance_transactions # noqa: F401 15 | from offsets_db_data.credits import filter_and_merge_transactions # noqa: F401 16 | from offsets_db_data.credits import merge_with_arb # noqa: F401 17 | from offsets_db_data.credits import harmonize_beneficiary_data 18 | from offsets_db_data.models import credit_without_id_schema, project_schema 19 | from offsets_db_data.projects import add_category # noqa: F401 20 | from offsets_db_data.projects import add_first_issuance_and_retirement_dates # noqa: F401 21 | from offsets_db_data.projects import add_is_compliance_flag # noqa: F401 22 | from offsets_db_data.projects import add_retired_and_issued_totals # noqa: F401 23 | from offsets_db_data.projects import harmonize_country_names # noqa: F401 24 | from offsets_db_data.projects import harmonize_status_codes # noqa: F401 25 | from offsets_db_data.projects import map_protocol # noqa: F401 26 | 27 | 28 | @pf.register_dataframe_method 29 | def determine_gld_transaction_type(df: pd.DataFrame, *, download_type: str) -> pd.DataFrame: 30 | """ 31 | Assign a transaction type to each record in the DataFrame based on the download type for Gold Standard transactions. 32 | 33 | Parameters 34 | ---------- 35 | df : pd.DataFrame 36 | Input DataFrame containing transaction data. 37 | download_type : str 38 | Type of transaction ('issuances', 'retirements') to determine the transaction type. 39 | 40 | Returns 41 | ------- 42 | pd.DataFrame 43 | DataFrame with a new 'transaction_type' column, containing assigned transaction types based on download_type. 44 | """ 45 | 46 | transaction_type_mapping = {'issuances': 'issuance', 'retirements': 'retirement'} 47 | df['transaction_type'] = transaction_type_mapping[download_type] 48 | return df 49 | 50 | 51 | @pf.register_dataframe_method 52 | def add_gld_project_id(df: pd.DataFrame, *, prefix: str) -> pd.DataFrame: 53 | """ 54 | Add Gold Standard project IDs to the DataFrame 55 | 56 | Parameters 57 | ---------- 58 | df : pd.DataFrame 59 | Input DataFrame containing credits data. 60 | prefix : str 61 | Prefix string to prepend to each project ID. 62 | 63 | Returns 64 | ------- 65 | pd.DataFrame 66 | DataFrame with a new 'project_id' column, containing the generated project IDs. 
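    Examples
    --------
    A minimal sketch with toy IDs (values are illustrative, not real Gold Standard records);
    importing ``offsets_db_data.gld`` registers the DataFrame accessor:

    >>> import pandas as pd
    >>> from offsets_db_data import gld  # noqa: F401  (registers the accessor)
    >>> df = pd.DataFrame({'project_id': [1234, 5678]})
    >>> df.add_gld_project_id(prefix='GLD')['project_id'].tolist()
    ['GLD1234', 'GLD5678']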
67 | """ 68 | 69 | df['project_id'] = prefix + df['project_id'].astype(str) 70 | return df 71 | 72 | 73 | @pf.register_dataframe_method 74 | def process_gld_credits( 75 | df: pd.DataFrame, 76 | *, 77 | download_type: str, 78 | registry_name: str = 'gold-standard', 79 | prefix: str = 'GLD', 80 | arb: pd.DataFrame | None = None, 81 | harmonize_beneficiary_info: bool = False, 82 | ) -> pd.DataFrame: 83 | """ 84 | Process Gold Standard credits data by renaming columns, setting registry, determining transaction types, 85 | adding project IDs, converting date columns, aggregating issuances (if applicable), and validating the schema. 86 | 87 | Parameters 88 | ---------- 89 | df : pd.DataFrame 90 | Input DataFrame with raw Gold Standard credits data. 91 | download_type : str 92 | Type of download ('issuances' or 'retirements'). 93 | registry_name : str, optional 94 | Name of the registry for setting and mapping columns (default is 'gold-standard'). 95 | prefix : str, optional 96 | Prefix for generating project IDs (default is 'GLD'). 97 | arb : pd.DataFrame | None, optional 98 | Additional DataFrame for data merging (default is None). 99 | 100 | Returns 101 | ------- 102 | pd.DataFrame 103 | Processed DataFrame with Gold Standard credits data. 104 | """ 105 | 106 | column_mapping = load_column_mapping( 107 | registry_name=registry_name, download_type=download_type, mapping_path=CREDIT_SCHEMA_UPATH 108 | ) 109 | 110 | columns = {v: k for k, v in column_mapping.items()} 111 | 112 | df = df.copy() 113 | 114 | if not df.empty: 115 | data = ( 116 | df.rename(columns=columns) 117 | .set_registry(registry_name=registry_name) 118 | .determine_gld_transaction_type(download_type=download_type) 119 | .add_gld_project_id(prefix=prefix) 120 | ) 121 | # split on T and discard the microseconds for consistency 122 | data['transaction_date'] = data['transaction_date'].str.split('T').str[0] 123 | data = data.convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d') 124 | 125 | if download_type == 'issuances': 126 | data = data.aggregate_issuance_transactions() 127 | 128 | data = data.add_missing_columns(schema=credit_without_id_schema).validate( 129 | schema=credit_without_id_schema 130 | ) 131 | 132 | if arb is not None and not arb.empty: 133 | data = data.merge_with_arb(arb=arb) 134 | 135 | else: 136 | data = ( 137 | pd.DataFrame(columns=credit_without_id_schema.columns.keys()) 138 | .add_missing_columns(schema=credit_without_id_schema) 139 | .convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d') 140 | .add_missing_columns(schema=credit_without_id_schema) 141 | .validate(schema=credit_without_id_schema) 142 | ) 143 | 144 | if harmonize_beneficiary_info: 145 | data = data.pipe( 146 | harmonize_beneficiary_data, registry_name=registry_name, download_type=download_type 147 | ) 148 | 149 | data = ( 150 | data.add_missing_columns(schema=credit_without_id_schema) 151 | .convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d') 152 | .validate(schema=credit_without_id_schema) 153 | ) 154 | 155 | return data 156 | 157 | 158 | @pf.register_dataframe_method 159 | def add_gld_project_url(df: pd.DataFrame) -> pd.DataFrame: 160 | """Add url for gold standard projects 161 | 162 | gs project ids are different from the id used in gold standard urls. 163 | 164 | Parameters 165 | ---------- 166 | df : pd.DataFrame 167 | Input DataFrame containing Gold Standard project data. 
168 | 169 | Returns 170 | ------- 171 | pd.DataFrame 172 | DataFrame with a new 'project_url' column, containing URLs for each project. 173 | """ 174 | df['project_url'] = 'https://registry.goldstandard.org/projects?q=gs' + df['project_id'].apply( 175 | str 176 | ) 177 | return df 178 | 179 | 180 | @pf.register_dataframe_method 181 | def process_gld_projects( 182 | df: pd.DataFrame, 183 | *, 184 | credits: pd.DataFrame, 185 | registry_name: str = 'gold-standard', 186 | prefix: str = 'GLD', 187 | ) -> pd.DataFrame: 188 | """ 189 | Process Gold Standard projects data, including renaming, adding, and validating columns, harmonizing statuses, 190 | and merging with credits data. 191 | 192 | Parameters 193 | ---------- 194 | df : pd.DataFrame 195 | Input DataFrame with raw Gold Standard projects data. 196 | credits : pd.DataFrame 197 | DataFrame containing credits data for merging. 198 | registry_name : str, optional 199 | Name of the registry for specific processing steps (default is 'gold-standard'). 200 | prefix : str, optional 201 | Prefix for generating project IDs (default is 'GLD'). 202 | 203 | Returns 204 | ------- 205 | pd.DataFrame 206 | Processed DataFrame with harmonized and validated Gold Standard projects data. 207 | """ 208 | 209 | registry_project_column_mapping = load_registry_project_column_mapping( 210 | registry_name=registry_name, file_path=PROJECT_SCHEMA_UPATH 211 | ) 212 | inverted_column_mapping = {value: key for key, value in registry_project_column_mapping.items()} 213 | type_category_mapping = load_type_category_mapping() 214 | inverted_protocol_mapping = load_inverted_protocol_mapping() 215 | 216 | df = df.copy() 217 | credits = credits.copy() 218 | 219 | if not df.empty and not credits.empty: 220 | data = ( 221 | df.rename(columns=inverted_column_mapping) 222 | .set_registry(registry_name=registry_name) 223 | .add_gld_project_url() 224 | .add_gld_project_id(prefix=prefix) 225 | .harmonize_country_names() 226 | .harmonize_status_codes() 227 | .map_protocol(inverted_protocol_mapping=inverted_protocol_mapping) 228 | .infer_project_type() 229 | .override_project_types( 230 | override_data_path=BERKELEY_PROJECT_TYPE_UPATH, source_str='berkeley' 231 | ) 232 | .add_category( 233 | type_category_mapping=type_category_mapping 234 | ) # must come after types; type -> category 235 | .map_project_type_to_display_name(type_category_mapping=type_category_mapping) 236 | .add_is_compliance_flag() 237 | .add_retired_and_issued_totals(credits=credits) 238 | .add_first_issuance_and_retirement_dates(credits=credits) 239 | .add_missing_columns(schema=project_schema) 240 | .convert_to_datetime(columns=['listed_at', 'first_issuance_at', 'first_retirement_at']) 241 | .validate(schema=project_schema) 242 | ) 243 | return data 244 | 245 | elif not df.empty and credits.empty: 246 | data = ( 247 | df.rename(columns=inverted_column_mapping) 248 | .set_registry(registry_name=registry_name) 249 | .add_gld_project_url() 250 | .add_gld_project_id(prefix=prefix) 251 | .harmonize_country_names() 252 | .harmonize_status_codes() 253 | .map_protocol(inverted_protocol_mapping=inverted_protocol_mapping) 254 | .infer_project_type() 255 | .override_project_types( 256 | override_data_path=BERKELEY_PROJECT_TYPE_UPATH, source_str='berkeley' 257 | ) 258 | .add_category( 259 | type_category_mapping=type_category_mapping 260 | ) # must come after types; type -> category 261 | .map_project_type_to_display_name(type_category_mapping=type_category_mapping) 262 | .add_is_compliance_flag() 263 | 
.add_missing_columns(schema=project_schema) 264 | .convert_to_datetime(columns=['listed_at', 'first_issuance_at', 'first_retirement_at']) 265 | .validate(schema=project_schema) 266 | ) 267 | return data 268 | elif df.empty: 269 | data = ( 270 | pd.DataFrame(columns=project_schema.columns.keys()) 271 | .add_missing_columns(schema=project_schema) 272 | .convert_to_datetime(columns=['listed_at', 'first_issuance_at', 'first_retirement_at']) 273 | ) 274 | 275 | data['is_compliance'] = data['is_compliance'].astype(bool) 276 | data = data.validate(schema=project_schema) 277 | return data 278 | -------------------------------------------------------------------------------- /offsets_db_data/models.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | import janitor # noqa: F401 4 | import pandas as pd 5 | import pandera as pa 6 | 7 | RegistryType = typing.Literal[ 8 | 'verra', 9 | 'gold-standard', 10 | 'art-trees', 11 | 'american-carbon-registry', 12 | 'climate-action-reserve', 13 | 'none', 14 | ] 15 | 16 | 17 | project_schema = pa.DataFrameSchema( 18 | { 19 | 'protocol': pa.Column(pa.Object, nullable=True), # Array of strings 20 | 'category': pa.Column(pa.String, nullable=True), 21 | 'project_type': pa.Column(pa.String, nullable=False), 22 | 'project_type_source': pa.Column(pa.String, nullable=False), 23 | 'retired': pa.Column( 24 | pa.Int, pa.Check.greater_than_or_equal_to(0), nullable=True, coerce=True 25 | ), 26 | 'issued': pa.Column( 27 | pa.Int, pa.Check.greater_than_or_equal_to(0), nullable=True, coerce=True 28 | ), 29 | 'project_id': pa.Column(pa.String, nullable=False), 30 | 'name': pa.Column(pa.String, nullable=True), 31 | 'registry': pa.Column(pa.String, nullable=False), 32 | 'proponent': pa.Column(pa.String, nullable=True), 33 | 'status': pa.Column(pa.String, nullable=True), 34 | 'country': pa.Column(pa.String, nullable=True), 35 | 'listed_at': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True), 36 | 'first_issuance_at': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True), 37 | 'first_retirement_at': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True), 38 | 'is_compliance': pa.Column(pa.Bool, nullable=True), 39 | 'project_url': pa.Column(pa.String, nullable=True), 40 | } 41 | ) 42 | 43 | 44 | credit_without_id_schema = pa.DataFrameSchema( 45 | { 46 | 'quantity': pa.Column( 47 | pa.Int, pa.Check.greater_than_or_equal_to(0), nullable=True, coerce=True 48 | ), 49 | 'project_id': pa.Column(pa.String, nullable=False), 50 | 'vintage': pa.Column(pa.Int, nullable=True, coerce=True), 51 | 'transaction_date': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True), 52 | 'transaction_type': pa.Column(pa.String, nullable=True), 53 | 'retirement_account': pa.Column(pa.String, nullable=True), 54 | 'retirement_reason': pa.Column(pa.String, nullable=True), 55 | 'retirement_note': pa.Column(pa.String, nullable=True), 56 | 'retirement_beneficiary': pa.Column(pa.String, nullable=True), 57 | 'retirement_beneficiary_harmonized': pa.Column(pa.String, nullable=True), 58 | } 59 | ) 60 | 61 | credit_schema = credit_without_id_schema.add_columns({'id': pa.Column(pa.Int, nullable=False)}) 62 | 63 | 64 | clip_schema = pa.DataFrameSchema( 65 | { 66 | 'id': pa.Column(pa.Int, nullable=False), 67 | 'date': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True), 68 | 'title': pa.Column(pa.String, nullable=True), 69 | 'url': pa.Column(pa.String, nullable=True), 70 | 'source': pa.Column(pa.String, nullable=True), 71 | 'tags': pa.Column(pa.Object, 
nullable=True), 72 | 'notes': pa.Column(pa.String, nullable=True), 73 | 'is_waybacked': pa.Column(pa.Bool, nullable=True), 74 | 'type': pa.Column(pa.String, nullable=True), 75 | } 76 | ) 77 | -------------------------------------------------------------------------------- /offsets_db_data/openrefine.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import shutil 3 | import subprocess 4 | import tempfile 5 | import traceback 6 | 7 | import requests 8 | import rich.console 9 | import typer 10 | 11 | app = typer.Typer(help='offsets-db-data-orcli') 12 | console = rich.console.Console() 13 | 14 | 15 | @app.command() 16 | def install( 17 | url: str = typer.Option( 18 | 'https://github.com/opencultureconsulting/orcli/raw/main/orcli', 19 | help='The URL to download orcli from.', 20 | show_default=True, 21 | ), 22 | destination: str = typer.Option( 23 | './', 24 | help='The destination path to move the downloaded file to.', 25 | show_default=True, 26 | ), 27 | ): 28 | """ 29 | Install orcli from GitHub. 30 | """ 31 | 32 | try: 33 | tempfile_path = (pathlib.Path(tempfile.gettempdir()) / 'orcli').as_posix() 34 | 35 | file_path = f'{destination}/orcli' if destination else 'orcli' 36 | abs_file_path = pathlib.Path(file_path).expanduser().resolve() 37 | filename = abs_file_path.as_posix() 38 | # Download orcli from GitHub 39 | # Download the file with streaming to handle large files. 40 | response = requests.get(url, stream=True) 41 | response.raise_for_status() # Raise error if the download failed. 42 | 43 | with open(tempfile_path, 'wb') as f: 44 | for chunk in response.iter_content(chunk_size=8192): 45 | if chunk: # Filter out keep-alive chunks. 46 | f.write(chunk) 47 | 48 | # Make the file executable 49 | subprocess.run(['chmod', '+x', tempfile_path], check=True) 50 | console.print(f'Moving orcli from {tempfile_path} to {filename}.') 51 | subprocess.run(['mv', tempfile_path, destination], check=True) 52 | console.print(f'orcli installed to {filename}.') 53 | 54 | except Exception as _: 55 | console.print(f'Error: {traceback.format_exc()}') 56 | raise typer.Exit(1) 57 | 58 | 59 | @app.command() 60 | def run( 61 | args: list[str] = typer.Argument(help='The arguments to pass to orcli.'), 62 | binary_path: str | None = typer.Option( 63 | None, help='The path to the orcli binary.', show_default=True 64 | ), 65 | ): 66 | """ 67 | Run orcli with the specified arguments. 68 | """ 69 | if binary_path is None: 70 | binary_path = shutil.which('orcli') 71 | if binary_path is None: 72 | typer.echo('orcli not found. 
Please install orcli first.') 73 | raise typer.Exit(1) 74 | 75 | command = [binary_path] + list(args) 76 | try: 77 | result = subprocess.run(command, check=True, capture_output=True, text=True) 78 | console.print(result.stdout) 79 | return result.stdout 80 | except subprocess.CalledProcessError as e: 81 | console.print(e.stderr) 82 | raise typer.Exit(e.returncode) from e 83 | 84 | 85 | def main(): 86 | app() 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /offsets_db_data/pipeline_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import io 3 | import tempfile 4 | import zipfile 5 | from collections.abc import Callable 6 | 7 | import fsspec 8 | import pandas as pd 9 | 10 | from offsets_db_data.data import catalog 11 | from offsets_db_data.registry import get_registry_from_project_id 12 | 13 | 14 | def validate_data( 15 | *, 16 | new_data: pd.DataFrame, 17 | as_of: datetime.datetime, 18 | data_type: str, 19 | quantity_column: str, 20 | aggregation_func, 21 | ) -> None: 22 | success = False 23 | for delta_days in [1, 2, 3, 4]: 24 | try: 25 | previous_date = (as_of - datetime.timedelta(days=delta_days)).strftime('%Y-%m-%d') 26 | print( 27 | f'Validating {data_type} for {as_of.strftime("%Y-%m-%d")} against {previous_date}' 28 | ) 29 | old_data = catalog[data_type](date=previous_date).read() 30 | 31 | new_quantity = aggregation_func(new_data[quantity_column]) 32 | old_quantity = aggregation_func(old_data[quantity_column]) 33 | 34 | print(f'New {data_type}: {new_data.shape} | New {quantity_column}: {new_quantity}') 35 | print(f'Old {data_type}: {old_data.shape} | Old {quantity_column}: {old_quantity}') 36 | 37 | if new_quantity < old_quantity * 0.99: 38 | raise ValueError( 39 | f'New {data_type}: {new_quantity} (from {as_of.strftime("%Y-%m-%d")}) are less than 99% of old {data_type}: {old_quantity} (from {previous_date})' 40 | ) 41 | else: 42 | print(f'New {data_type} are at least 99% of old {data_type}') 43 | success = True 44 | break 45 | except Exception as e: 46 | print(f'Validation failed for {delta_days} day(s) back: {e}') 47 | continue 48 | 49 | if not success: 50 | raise ValueError( 51 | 'Validation failed for either 1, 2, 3, or 4 days back. Please make sure the data is available for either 1, 2, 3 or 4 days back.' 52 | ) 53 | 54 | 55 | def validate_credits(*, new_credits: pd.DataFrame, as_of: datetime.datetime) -> None: 56 | validate_data( 57 | new_data=new_credits, 58 | as_of=as_of, 59 | data_type='credits', 60 | quantity_column='quantity', 61 | aggregation_func=sum, 62 | ) 63 | 64 | 65 | def validate_projects(*, new_projects: pd.DataFrame, as_of: datetime.datetime) -> None: 66 | validate_data( 67 | new_data=new_projects, 68 | as_of=as_of, 69 | data_type='projects', 70 | quantity_column='project_id', 71 | aggregation_func=pd.Series.nunique, 72 | ) 73 | 74 | 75 | def validate( 76 | *, new_credits: pd.DataFrame, new_projects: pd.DataFrame, as_of: datetime.datetime 77 | ) -> None: 78 | validate_credits(new_credits=new_credits, as_of=as_of) 79 | validate_projects(new_projects=new_projects, as_of=as_of) 80 | 81 | 82 | def summarize( 83 | *, 84 | credits: pd.DataFrame, 85 | projects: pd.DataFrame, 86 | registry_name: str | None = None, 87 | ) -> None: 88 | """ 89 | Summarizes the credits, projects, and project types data. 90 | 91 | Parameters 92 | ---------- 93 | credits : DataFrame 94 | The credits data. 
95 | projects : DataFrame 96 | The projects data. 97 | registry_name : str, optional 98 | Name of the specific registry to summarize. If None, summarizes across all registries. 99 | 100 | Returns 101 | ------- 102 | None 103 | """ 104 | # Create defensive copies to avoid modifying the original dataframes 105 | credits = credits if credits.empty else credits.copy() 106 | projects = projects if projects.empty else projects.copy() 107 | 108 | # Single registry mode 109 | if registry_name: 110 | if not projects.empty: 111 | print( 112 | f'\n\nRetired and Issued (in Millions) summary for {registry_name}:\n\n' 113 | f'{projects[["retired", "issued"]].sum() / 1_000_000}\n\n' 114 | f'{projects.project_id.nunique()} unique projects.\n\n' 115 | ) 116 | else: 117 | print(f'No projects found for {registry_name}...') 118 | 119 | if not credits.empty: 120 | print( 121 | f'\n\nCredits summary (in Millions) for {registry_name}:\n\n' 122 | f'{credits.groupby(["transaction_type"])[["quantity"]].sum() / 1_000_000}\n\n' 123 | f'{credits.shape[0]} total transactions.\n\n' 124 | ) 125 | else: 126 | print(f'No credits found for {registry_name}...') 127 | 128 | # Multi-registry mode 129 | else: 130 | if not projects.empty: 131 | print( 132 | f'Summary Statistics for projects (in Millions):\n' 133 | f'{projects.groupby(["registry", "is_compliance"])[["retired", "issued"]].sum() / 1_000_000}\n' 134 | ) 135 | else: 136 | print('No projects found') 137 | 138 | if not credits.empty: 139 | credits['registry'] = credits['project_id'].map(get_registry_from_project_id) 140 | 141 | print( 142 | f'Summary Statistics for credits (in Millions):\n' 143 | f'{credits.groupby(["registry", "transaction_type"])[["quantity"]].sum() / 1_000_000}\n' 144 | ) 145 | else: 146 | print('No credits found') 147 | 148 | 149 | def to_parquet( 150 | *, 151 | credits: pd.DataFrame, 152 | projects: pd.DataFrame, 153 | output_paths: dict, 154 | registry_name: str | None = None, 155 | ): 156 | """ 157 | Write the given DataFrames to Parquet files. 158 | 159 | Parameters 160 | ----------- 161 | credits : pd.DataFrame 162 | The DataFrame containing credits data. 163 | projects : pd.DataFrame 164 | The DataFrame containing projects data. 165 | output_paths : dict 166 | Dictionary containing output file paths. 167 | 168 | registry_name : str, optional 169 | The name of the registry for logging purposes. 170 | """ 171 | credits.to_parquet( 172 | output_paths['credits'], index=False, compression='gzip', engine='fastparquet' 173 | ) 174 | 175 | prefix = f'{registry_name} ' if registry_name else '' 176 | print(f'Wrote {prefix} credits to {output_paths["credits"]}...') 177 | 178 | projects.to_parquet( 179 | output_paths['projects'], index=False, compression='gzip', engine='fastparquet' 180 | ) 181 | print(f'Wrote {prefix} projects to {output_paths["projects"]}...') 182 | 183 | 184 | def _create_data_zip_buffer( 185 | *, 186 | credits: pd.DataFrame, 187 | projects: pd.DataFrame, 188 | format_type: str, 189 | terms_content: str, 190 | ) -> io.BytesIO: 191 | """ 192 | Create a zip buffer containing data files in the specified format with terms of access. 193 | 194 | Parameters 195 | ---------- 196 | credits : pd.DataFrame 197 | DataFrame containing credit data. 198 | projects : pd.DataFrame 199 | DataFrame containing project data. 200 | project_types : pd.DataFrame 201 | DataFrame containing project type data. 202 | format_type : str 203 | Format type, either 'csv' or 'parquet'. 204 | terms_content : str 205 | Content of the terms of access file. 
206 | 207 | Returns 208 | ------- 209 | io.BytesIO 210 | Buffer containing the zip file. 211 | """ 212 | zip_buffer = io.BytesIO() 213 | 214 | with zipfile.ZipFile(zip_buffer, 'a', zipfile.ZIP_DEFLATED, False) as zf: 215 | zf.writestr('TERMS_OF_DATA_ACCESS.txt', terms_content) 216 | 217 | if format_type == 'csv': 218 | with zf.open('credits.csv', 'w') as buffer: 219 | credits.to_csv(buffer, index=False) 220 | with zf.open('projects.csv', 'w') as buffer: 221 | projects.to_csv(buffer, index=False) 222 | 223 | elif format_type == 'parquet': 224 | # Write Parquet files to temporary files 225 | with tempfile.NamedTemporaryFile(suffix='.parquet') as temp_credits: 226 | credits.to_parquet(temp_credits.name, index=False, engine='fastparquet') 227 | temp_credits.seek(0) 228 | zf.writestr('credits.parquet', temp_credits.read()) 229 | 230 | with tempfile.NamedTemporaryFile(suffix='.parquet') as temp_projects: 231 | projects.to_parquet(temp_projects.name, index=False, engine='fastparquet') 232 | temp_projects.seek(0) 233 | zf.writestr('projects.parquet', temp_projects.read()) 234 | 235 | # Move to the beginning of the BytesIO buffer 236 | zip_buffer.seek(0) 237 | return zip_buffer 238 | 239 | 240 | def write_latest_production( 241 | *, 242 | credits: pd.DataFrame, 243 | projects: pd.DataFrame, 244 | bucket: str, 245 | terms_url: str = 's3://carbonplan-offsets-db/TERMS_OF_DATA_ACCESS.txt', 246 | ): 247 | """ 248 | Write the latest production data to S3 as zip archives containing CSV and Parquet files. 249 | 250 | Parameters 251 | ---------- 252 | credits : pd.DataFrame 253 | DataFrame containing credit data. 254 | projects : pd.DataFrame 255 | DataFrame containing project data. 256 | bucket : str 257 | S3 bucket path to write the data to. 258 | terms_url : str, optional 259 | URL of the terms of access file. 260 | """ 261 | paths = { 262 | 'csv': f'{bucket}/production/latest/offsets-db.csv.zip', 263 | 'parquet': f'{bucket}/production/latest/offsets-db.parquet.zip', 264 | } 265 | 266 | # Get terms content once 267 | fs = fsspec.filesystem('s3', anon=False) 268 | terms_content = fs.read_text(terms_url) 269 | 270 | for format_type, path in paths.items(): 271 | # Create zip buffer with data in the appropriate format 272 | zip_buffer = _create_data_zip_buffer( 273 | credits=credits, 274 | projects=projects, 275 | format_type=format_type, 276 | terms_content=terms_content, 277 | ) 278 | 279 | # Write buffer to S3 280 | with fsspec.open(path, 'wb') as f: 281 | f.write(zip_buffer.getvalue()) 282 | 283 | print(f'Wrote {format_type} to {path}...') 284 | zip_buffer.close() 285 | 286 | 287 | def transform_registry_data( 288 | *, 289 | process_credits_fn: Callable[[], pd.DataFrame], 290 | process_projects_fn: Callable[[pd.DataFrame], pd.DataFrame], 291 | output_paths: dict, 292 | registry_name: str | None = None, 293 | ): 294 | """ 295 | Transform registry data by processing credits and projects, then writing to parquet files. 
296 | 297 | Parameters 298 | ---------- 299 | process_credits_fn : callable 300 | Function that returns processed credits DataFrame 301 | process_projects_fn : callable 302 | Function that takes a credits DataFrame and returns processed projects DataFrame 303 | output_paths : dict 304 | Dictionary containing output file paths for 'credits' and 'projects' 305 | registry_name : str, optional 306 | Name of the registry for logging purposes 307 | """ 308 | # Process credits 309 | credits = process_credits_fn() 310 | if registry_name: 311 | print(f'credits for {registry_name}: {credits.head()}') 312 | else: 313 | print(f'processed credits: {credits.head()}') 314 | 315 | # Process projects 316 | projects = process_projects_fn(credits=credits) 317 | if registry_name: 318 | print(f'projects for {registry_name}: {projects.head()}') 319 | else: 320 | print(f'processed projects: {projects.head()}') 321 | 322 | # Summarize data 323 | summarize(credits=credits, projects=projects, registry_name=registry_name) 324 | 325 | # Write to parquet files 326 | to_parquet( 327 | credits=credits, 328 | projects=projects, 329 | output_paths=output_paths, 330 | registry_name=registry_name, 331 | ) 332 | 333 | return credits, projects 334 | -------------------------------------------------------------------------------- /offsets_db_data/projects.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import json 3 | 4 | import country_converter as coco 5 | import janitor # noqa: F401 6 | import numpy as np 7 | import pandas as pd 8 | import pandas_flavor as pf 9 | 10 | 11 | @pf.register_dataframe_method 12 | def harmonize_country_names(df: pd.DataFrame, *, country_column: str = 'country') -> pd.DataFrame: 13 | """ 14 | Harmonize country names in the DataFrame to standardized country names. 15 | 16 | Parameters 17 | ---------- 18 | df : pd.DataFrame 19 | Input DataFrame with country data. 20 | country_column : str, optional 21 | The name of the column containing country names to be harmonized (default is 'country'). 22 | 23 | Returns 24 | ------- 25 | pd.DataFrame 26 | DataFrame with harmonized country names in the specified column. 27 | """ 28 | 29 | print('Harmonizing country names...') 30 | cc = coco.CountryConverter() 31 | df[country_column] = cc.pandas_convert(df[country_column], to='name') 32 | print('Done converting country names...') 33 | return df 34 | 35 | 36 | @pf.register_dataframe_method 37 | def add_category(df: pd.DataFrame, *, type_category_mapping: dict) -> pd.DataFrame: 38 | """ 39 | Add a category to each record in the DataFrame based on its protocol. 40 | 41 | Parameters 42 | ---------- 43 | df : pd.DataFrame 44 | Input DataFrame containing protocol data. 45 | type_category_mapping : dict 46 | Dictionary mapping types to categories. 47 | 48 | Returns 49 | ------- 50 | pd.DataFrame 51 | DataFrame with a new 'category' column, derived from the protocol information. 
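    Examples
    --------
    An illustrative sketch with a toy mapping; the real mapping ships in
    ``configs/type-category-mapping.json``, and importing ``offsets_db_data.projects``
    registers the accessor:

    >>> import pandas as pd
    >>> from offsets_db_data import projects  # noqa: F401  (registers the accessor)
    >>> df = pd.DataFrame({'project_type': ['REDD+', 'Cookstoves']})
    >>> mapping = {'REDD+': {'category': 'forest'}, 'Cookstoves': {'category': 'household'}}
    >>> out = df.add_category(type_category_mapping=mapping)
    Adding category based on protocol...
    >>> out['category'].tolist()
    ['forest', 'household']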
52 |     """
53 | 
54 |     print('Adding category based on protocol...')
55 |     df['category'] = (
56 |         df['project_type']
57 |         .str.lower()
58 |         .map({key.lower(): value['category'] for key, value in type_category_mapping.items()})
59 |         .fillna('unknown')
60 |     )
61 |     return df
62 | 
63 | 
64 | @pf.register_dataframe_method
65 | def override_project_types(df: pd.DataFrame, *, override_data_path: str, source_str: str):
66 |     """
67 |     Override project types in the DataFrame with values from an external override file.
68 |     We treat the Berkeley data as the source of truth for most project types.
69 | 
70 |     Parameters
71 |     ----------
72 |     df : pd.DataFrame
73 |         Input DataFrame containing project data.
74 |     override_data_path : str
75 |         Path to the JSON file containing the override data.
76 |     source_str : str
77 |         Value to write to `project_type_source` when applying override values.
78 | 
79 |     Returns
80 |     -------
81 |     pd.DataFrame
82 |         DataFrame with the 'project_type' column overridden by all values in the override data.
83 |     """
84 | 
85 |     override_d = json.load(open(override_data_path))
86 |     df['project_type'] = df['project_id'].map(override_d).fillna(df['project_type'])
87 |     df.loc[df['project_id'].isin(list(override_d.keys())), 'project_type_source'] = source_str
88 | 
89 |     return df
90 | 
91 | 
92 | @pf.register_dataframe_method
93 | def infer_project_type(df: pd.DataFrame) -> pd.DataFrame:
94 |     """
95 |     Add project types to the DataFrame based on project characteristics.
96 | 
97 |     Parameters
98 |     ----------
99 |     df : pd.DataFrame
100 |         Input DataFrame containing project data.
101 | 
102 |     Returns
103 |     -------
104 |     pd.DataFrame
105 |         DataFrame with a new 'project_type' column, indicating the project's type. Defaults to 'unknown'.
106 |     """
107 |     df.loc[:, 'project_type'] = 'unknown'
108 |     df.loc[:, 'project_type_source'] = 'carbonplan'
109 |     df.loc[df.apply(lambda x: 'art-trees' in x['protocol'], axis=1), 'project_type'] = 'redd+'
110 | 
111 |     df.loc[df.apply(lambda x: 'acr-ifm-nonfed' in x['protocol'], axis=1), 'project_type'] = (
112 |         'improved forest management'
113 |     )
114 |     df.loc[df.apply(lambda x: 'acr-abandoned-wells' in x['protocol'], axis=1), 'project_type'] = (
115 |         'plugging oil & gas wells'
116 |     )
117 | 
118 |     df.loc[df.apply(lambda x: 'arb-mine-methane' in x['protocol'], axis=1), 'project_type'] = (
119 |         'mine methane capture'
120 |     )
121 | 
122 |     df.loc[df.apply(lambda x: 'vm0048' in x['protocol'], axis=1), 'project_type'] = 'redd+'
123 |     df.loc[df.apply(lambda x: 'vm0047' in x['protocol'], axis=1), 'project_type'] = (
124 |         'afforestation/reforestation'
125 |     )
126 |     df.loc[df.apply(lambda x: 'vm0045' in x['protocol'], axis=1), 'project_type'] = (
127 |         'improved forest management'
128 |     )
129 |     df.loc[df.apply(lambda x: 'vm0042' in x['protocol'], axis=1), 'project_type'] = 'agriculture'
130 |     df.loc[df.apply(lambda x: 'vm0007' in x['protocol'], axis=1), 'project_type'] = 'redd+'
131 | 
132 |     return df
133 | 
134 | 
135 | @pf.register_dataframe_method
136 | def map_project_type_to_display_name(
137 |     df: pd.DataFrame, *, type_category_mapping: dict
138 | ) -> pd.DataFrame:
139 |     """
140 |     Map project types in the DataFrame to display names based on a mapping dictionary.
141 | 
142 |     Parameters
143 |     ----------
144 |     df : pd.DataFrame
145 |         Input DataFrame containing project data.
146 |     type_category_mapping : dict
147 |         Dictionary mapping project type strings to display names.
148 | 
149 |     Returns
150 |     -------
151 |     pd.DataFrame
152 |         DataFrame with a new 'project_type' column, containing mapped display names.
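    Examples
    --------
    A minimal sketch with a toy mapping (assumes the 'project_type' values are already
    lowercase, as produced earlier in the pipeline):

    >>> import pandas as pd
    >>> from offsets_db_data import projects  # noqa: F401  (registers the accessor)
    >>> df = pd.DataFrame({'project_type': ['redd+', 'cookstoves']})
    >>> mapping = {'REDD+': {'project-type-display-name': 'REDD+'}}
    >>> out = df.map_project_type_to_display_name(type_category_mapping=mapping)
    Mapping project types to display names...
    >>> out['project_type'].tolist()
    ['REDD+', 'Unknown']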
153 | """ 154 | 155 | print('Mapping project types to display names...') 156 | df['project_type'] = ( 157 | df['project_type'] 158 | .map( 159 | { 160 | key.lower(): value['project-type-display-name'] 161 | for key, value in type_category_mapping.items() 162 | } 163 | ) 164 | .fillna('Unknown') 165 | ) 166 | return df 167 | 168 | 169 | @pf.register_dataframe_method 170 | def add_is_compliance_flag(df: pd.DataFrame) -> pd.DataFrame: 171 | """ 172 | Add a compliance flag to the DataFrame based on the protocol. 173 | 174 | Parameters 175 | ---------- 176 | df : pd.DataFrame 177 | Input DataFrame containing protocol data. 178 | 179 | Returns 180 | ------- 181 | pd.DataFrame 182 | DataFrame with a new 'is_compliance' column, indicating if the protocol starts with 'arb-'. 183 | """ 184 | 185 | print('Adding is_compliance flag...') 186 | df['is_compliance'] = df.apply( 187 | lambda row: np.any([protocol_str.startswith('arb-') for protocol_str in row['protocol']]), 188 | axis=1, 189 | ) 190 | return df 191 | 192 | 193 | @pf.register_dataframe_method 194 | def map_protocol( 195 | df: pd.DataFrame, 196 | *, 197 | inverted_protocol_mapping: dict, 198 | original_protocol_column: str = 'original_protocol', 199 | ) -> pd.DataFrame: 200 | """ 201 | Map protocols in the DataFrame to standardized names based on an inverted protocol mapping. 202 | 203 | Parameters 204 | ---------- 205 | df : pd.DataFrame 206 | Input DataFrame containing protocol data. 207 | inverted_protocol_mapping : dict 208 | Dictionary mapping protocol strings to standardized protocol names. 209 | original_protocol_column : str, optional 210 | Name of the column containing original protocol information (default is 'original_protocol'). 211 | 212 | Returns 213 | ------- 214 | pd.DataFrame 215 | DataFrame with a new 'protocol' column, containing mapped protocol names. 216 | """ 217 | 218 | print('Mapping protocol based on known string...') 219 | try: 220 | df['protocol'] = df[original_protocol_column].apply( 221 | lambda item: find_protocol( 222 | search_string=item, inverted_protocol_mapping=inverted_protocol_mapping 223 | ) 224 | ) 225 | except KeyError: 226 | # art-trees doesnt have protocol column 227 | df['protocol'] = [['unknown']] * len(df) # protocol column is nested list 228 | 229 | return df 230 | 231 | 232 | @pf.register_dataframe_method 233 | def harmonize_status_codes(df: pd.DataFrame, *, status_column: str = 'status') -> pd.DataFrame: 234 | """Harmonize project status codes across registries 235 | 236 | Excludes ACR, as it requires special treatment across two columns 237 | 238 | Parameters 239 | ---------- 240 | df : pd.DataFrame 241 | Input DataFrame with project status data. 242 | status_column : str, optional 243 | Name of the column containing status codes to harmonize (default is 'status'). 244 | 245 | Returns 246 | ------- 247 | pd.DataFrame 248 | DataFrame with harmonized project status codes. 
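    Examples
    --------
    A small sketch with made-up rows; unrecognized statuses fall back to 'unknown':

    >>> import pandas as pd
    >>> from offsets_db_data import projects  # noqa: F401  (registers the accessor)
    >>> df = pd.DataFrame({'status': ['Registered', 'Under validation', 'Some new status']})
    >>> out = df.harmonize_status_codes()
    Harmonizing status codes
    >>> out['status'].tolist()
    ['registered', 'listed', 'unknown']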
249 | """ 250 | print('Harmonizing status codes') 251 | with contextlib.suppress(KeyError): 252 | CAR_STATES = { 253 | 'Registered': 'registered', 254 | 'Completed': 'completed', 255 | 'Listed': 'listed', 256 | 'Transitioned': 'unknown', 257 | } 258 | 259 | VERRA_STATES = { 260 | 'Under validation': 'listed', 261 | 'Under development': 'listed', 262 | 'Registration requested': 'listed', 263 | 'Registration and verification approval requested': 'listed', 264 | 'Withdrawn': 'completed', 265 | 'On Hold': 'registered', 266 | 'Units Transferred from Approved GHG Program': 'unknown', 267 | 'Rejected by Administrator': 'completed', 268 | 'Crediting Period Renewal Requested': 'registered', 269 | 'Inactive': 'completed', 270 | 'Crediting Period Renewal and Verification Approval Requested': 'registered', 271 | } 272 | 273 | GS_STATES = { 274 | 'GOLD_STANDARD_CERTIFIED_PROJECT': 'registered', 275 | 'LISTED': 'listed', 276 | 'GOLD_STANDARD_CERTIFIED_DESIGN': 'registered', 277 | } 278 | 279 | state_dict = CAR_STATES | VERRA_STATES | GS_STATES 280 | df[status_column] = df[status_column].apply(lambda x: state_dict.get(x, 'unknown')) 281 | return df 282 | 283 | 284 | def find_protocol( 285 | *, search_string: str, inverted_protocol_mapping: dict[str, list[str]] 286 | ) -> list[str]: 287 | """Match known strings of project methodologies to internal topology 288 | 289 | Unmatched strings are passed through to the database, until such time that we update mapping data. 290 | """ 291 | if pd.isna(search_string): # handle nan case, which crops up in verra data right now 292 | return ['unknown'] 293 | if known_match := inverted_protocol_mapping.get(search_string.strip()): 294 | return known_match # inverted_mapping returns lst 295 | print(f"'{search_string}' is unmapped in full protocol mapping") 296 | return [search_string] 297 | 298 | 299 | def get_protocol_category(*, protocol_strs: list[str] | str, protocol_mapping: dict) -> list[str]: 300 | """ 301 | Get category based on protocol string 302 | 303 | Parameters 304 | ---------- 305 | protocol_strs : str or list 306 | single protocol string or list of protocol strings 307 | 308 | protocol_mapping: dict 309 | metadata about normalized protocol strings 310 | 311 | Returns 312 | ------- 313 | categories : list[str] 314 | list of category strings 315 | """ 316 | 317 | def _get_category(protocol_str, protocol_mapping): 318 | try: 319 | return protocol_mapping.get(protocol_str).get('category', 'unknown') 320 | except AttributeError: 321 | return 'unknown' 322 | 323 | if isinstance(protocol_strs, str): 324 | protocol_strs = [protocol_strs] 325 | categories = [_get_category(protocol_str, protocol_mapping) for protocol_str in protocol_strs] 326 | return list( 327 | set(categories) 328 | ) # if multiple protocols have same category, just return category once 329 | 330 | 331 | @pf.register_dataframe_method 332 | def add_first_issuance_and_retirement_dates( 333 | projects: pd.DataFrame, *, credits: pd.DataFrame 334 | ) -> pd.DataFrame: 335 | """ 336 | Add the first issuance date of carbon credits to each project in the projects DataFrame. 337 | 338 | Parameters 339 | ---------- 340 | credits : pd.DataFrame 341 | A pandas DataFrame containing credit issuance data with columns 'project_id', 'transaction_date', and 'transaction_type'. 342 | projects : pd.DataFrame 343 | A pandas DataFrame containing project data with a 'project_id' column. 
344 | 345 | Returns 346 | ------- 347 | projects : pd.DataFrame 348 | A pandas DataFrame which is the original projects DataFrame with two additional columns 'first_issuance_at' representing 349 | the first issuance date of each project and 'first_retirement_at' representing the first retirement date of each project. 350 | """ 351 | 352 | first_issuance = ( 353 | credits[credits['transaction_type'] == 'issuance'] 354 | .groupby('project_id')['transaction_date'] 355 | .min() 356 | .reset_index() 357 | ) 358 | first_retirement = ( 359 | credits[credits['transaction_type'].str.contains('retirement')] 360 | .groupby('project_id')['transaction_date'] 361 | .min() 362 | .reset_index() 363 | ) 364 | 365 | # Merge the projects DataFrame with the first issuance and retirement dates 366 | projects_with_dates = pd.merge(projects, first_issuance, on='project_id', how='left') 367 | projects_with_dates = pd.merge( 368 | projects_with_dates, first_retirement, on='project_id', how='left' 369 | ) 370 | 371 | # Rename the merged columns for clarity 372 | projects_with_dates = projects_with_dates.rename( 373 | columns={ 374 | 'transaction_date_x': 'first_issuance_at', 375 | 'transaction_date_y': 'first_retirement_at', 376 | } 377 | ) 378 | 379 | return projects_with_dates 380 | 381 | 382 | @pf.register_dataframe_method 383 | def add_retired_and_issued_totals(projects: pd.DataFrame, *, credits: pd.DataFrame) -> pd.DataFrame: 384 | """ 385 | Add total quantities of issued and retired credits to each project. 386 | 387 | Parameters 388 | ---------- 389 | projects : pd.DataFrame 390 | DataFrame containing project data. 391 | credits : pd.DataFrame 392 | DataFrame containing credit transaction data. 393 | 394 | Returns 395 | ------- 396 | pd.DataFrame 397 | DataFrame with two new columns: 'issued' and 'retired', representing the total quantities of issued and retired credits. 
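    Examples
    --------
    A toy sketch (note that both 'issuance' and 'retirement' transaction types must be
    present in the credits data for the pivot to expose both columns):

    >>> import pandas as pd
    >>> from offsets_db_data import projects  # noqa: F401  (registers the accessor)
    >>> creds = pd.DataFrame(
    ...     {
    ...         'project_id': ['VCS1', 'VCS1', 'VCS2'],
    ...         'transaction_type': ['issuance', 'retirement', 'issuance'],
    ...         'quantity': [100, 40, 10],
    ...     }
    ... )
    >>> projs = pd.DataFrame({'project_id': ['VCS1', 'VCS2']})
    >>> out = projs.add_retired_and_issued_totals(credits=creds)
    >>> [int(x) for x in out.loc[out['project_id'] == 'VCS2', ['issued', 'retired']].iloc[0]]
    [10, 0]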
398 | """ 399 | 400 | # Drop conflicting columns if they exist 401 | projects = projects.drop(columns=['issued', 'retired'], errors='ignore') 402 | 403 | # # filter out the projects that are not in the credits data 404 | # credits = credits[credits['project_id'].isin(projects['project_id'].unique())] 405 | # groupd and sum 406 | credit_totals = ( 407 | credits.groupby(['project_id', 'transaction_type'])['quantity'].sum().reset_index() 408 | ) 409 | # pivot the table 410 | credit_totals_pivot = credit_totals.pivot( 411 | index='project_id', columns='transaction_type', values='quantity' 412 | ).reset_index() 413 | 414 | # merge with projects 415 | projects_combined = pd.merge( 416 | projects, 417 | credit_totals_pivot[['project_id', 'issuance', 'retirement']], 418 | left_on='project_id', 419 | right_on='project_id', 420 | how='left', 421 | ) 422 | 423 | # rename columns for clarity 424 | projects_combined = projects_combined.rename( 425 | columns={'issuance': 'issued', 'retirement': 'retired'} 426 | ) 427 | 428 | # replace Nans with 0 if any 429 | projects_combined[['issued', 'retired']] = projects_combined[['issued', 'retired']].fillna(0) 430 | 431 | return projects_combined 432 | -------------------------------------------------------------------------------- /offsets_db_data/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/offsets_db_data/py.typed -------------------------------------------------------------------------------- /offsets_db_data/registry.py: -------------------------------------------------------------------------------- 1 | REGISTRY_ABBR_MAP = { 2 | 'vcs': 'verra', 3 | 'car': 'climate-action-reserve', 4 | 'acr': 'american-carbon-registry', 5 | 'art': 'art-trees', 6 | 'gld': 'gold-standard', 7 | } 8 | 9 | 10 | def get_registry_from_project_id(project_id: str) -> str: 11 | """ 12 | Retrieve the full registry name from a project ID using a predefined abbreviation mapping. 13 | 14 | Parameters 15 | ---------- 16 | project_id : str 17 | The project ID whose registry needs to be identified. 18 | 19 | Returns 20 | ------- 21 | str 22 | The full name of the registry corresponding to the abbreviation in the project ID. 23 | 24 | Notes 25 | ----- 26 | - The function expects the first three characters of the project ID to be the abbreviation of the registry. 27 | - It uses a predefined mapping (`REGISTRY_ABBR_MAP`) to convert the abbreviation to the full registry name. 28 | - The project ID is converted to lowercase to ensure case-insensitive matching. 29 | - The function raises a KeyError if the abbreviation is not found in `REGISTRY_ABBR_MAP`. 
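    Examples
    --------
    A couple of illustrative IDs (any string whose first three characters match a known
    abbreviation works):

    >>> from offsets_db_data.registry import get_registry_from_project_id
    >>> get_registry_from_project_id('VCS1234')
    'verra'
    >>> get_registry_from_project_id('gld4567')
    'gold-standard'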
30 | """ 31 | 32 | lowered_id = project_id.lower() 33 | return REGISTRY_ABBR_MAP[lowered_id[:3]] 34 | -------------------------------------------------------------------------------- /offsets_db_data/vcs.py: -------------------------------------------------------------------------------- 1 | import numpy as np # noqa: F401 2 | import pandas as pd 3 | import pandas_flavor as pf 4 | 5 | from offsets_db_data.common import ( 6 | BERKELEY_PROJECT_TYPE_UPATH, 7 | CREDIT_SCHEMA_UPATH, 8 | PROJECT_SCHEMA_UPATH, 9 | load_column_mapping, 10 | load_inverted_protocol_mapping, 11 | load_registry_project_column_mapping, 12 | load_type_category_mapping, 13 | ) 14 | from offsets_db_data.credits import * # noqa: F403 15 | from offsets_db_data.credits import harmonize_beneficiary_data 16 | from offsets_db_data.models import credit_without_id_schema, project_schema 17 | from offsets_db_data.projects import * # noqa: F403 18 | 19 | 20 | @pf.register_dataframe_method 21 | def generate_vcs_project_ids(df: pd.DataFrame, *, prefix: str) -> pd.DataFrame: 22 | """ 23 | Generate Verra project IDs by concatenating a specified prefix with the 'ID' column of the DataFrame. 24 | 25 | Parameters 26 | ---------- 27 | df : pd.DataFrame 28 | Input DataFrame containing Verra project data. 29 | prefix : str 30 | Prefix string to prepend to each project ID. 31 | 32 | Returns 33 | ------- 34 | pd.DataFrame 35 | DataFrame with a new 'project_id' column, containing the generated project IDs. 36 | """ 37 | 38 | df['project_id'] = prefix + df['ID'].astype(str) 39 | return df 40 | 41 | 42 | @pf.register_dataframe_method 43 | def determine_vcs_transaction_type(df: pd.DataFrame, *, date_column: str) -> pd.DataFrame: 44 | """ 45 | Determine the transaction type for Verra transactions based on a specified date column. 46 | Transactions with non-null date values are labeled as 'retirement', else as 'issuance'. 47 | 48 | Parameters 49 | ---------- 50 | df : pd.DataFrame 51 | Input DataFrame with transaction data. 52 | date_column : str 53 | Name of the column in the DataFrame used to determine the transaction type. 54 | 55 | Returns 56 | ------- 57 | pd.DataFrame 58 | DataFrame with a new 'transaction_type' column indicating the type of each transaction. 59 | """ 60 | 61 | # Verra doesn't have a transaction type column, and doesn't differentitate between retirements and cancelattions 62 | # So we'll use the date column to determine whether a transaction is a retirement or issuance and set the 63 | # transaction type accordingly 64 | df['transaction_type'] = df[date_column].apply( 65 | lambda x: 'retirement' if pd.notnull(x) else 'issuance' 66 | ) 67 | return df 68 | 69 | 70 | @pf.register_dataframe_method 71 | def set_vcs_transaction_dates( 72 | df: pd.DataFrame, *, date_column: str, fallback_column: str 73 | ) -> pd.DataFrame: 74 | """ 75 | Set the transaction dates in a DataFrame, using a primary date column and a fallback column. 76 | 77 | Parameters 78 | ---------- 79 | df : pd.DataFrame 80 | Input DataFrame with transaction data. 81 | date_column : str 82 | Primary column to use for transaction dates. 83 | fallback_column : str 84 | Column to use as fallback for transaction dates when primary column is null. 85 | 86 | Returns 87 | ------- 88 | pd.DataFrame 89 | DataFrame with a new 'transaction_date' column, containing the determined dates. 
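    Examples
    --------
    A toy sketch using the Verra column names this pipeline expects (dates shown as raw
    strings; conversion to datetime happens in a later step):

    >>> import pandas as pd
    >>> from offsets_db_data import vcs  # noqa: F401  (registers the accessor)
    >>> df = pd.DataFrame(
    ...     {
    ...         'Retirement/Cancellation Date': ['01/02/2021', None],
    ...         'Issuance Date': ['15/01/2020', '20/03/2020'],
    ...     }
    ... )
    >>> out = df.set_vcs_transaction_dates(
    ...     date_column='Retirement/Cancellation Date', fallback_column='Issuance Date'
    ... )
    >>> out['transaction_date'].tolist()
    ['01/02/2021', '20/03/2020']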
90 | """ 91 | 92 | df['transaction_date'] = df[date_column].where(df[date_column].notnull(), df[fallback_column]) 93 | return df 94 | 95 | 96 | @pf.register_dataframe_method 97 | def set_vcs_vintage_year(df: pd.DataFrame, *, date_column: str) -> pd.DataFrame: 98 | """ 99 | Set the vintage year for Verra transactions based on a date column formatted as '%d/%m/%Y'. 100 | 101 | Parameters 102 | ---------- 103 | df : pd.DataFrame 104 | Input DataFrame with transaction data. 105 | date_column : str 106 | Name of the column containing date information to extract the vintage year from. 107 | 108 | Returns 109 | ------- 110 | pd.DataFrame 111 | DataFrame with a new 'vintage' column, containing the vintage year of each transaction. 112 | """ 113 | 114 | try: 115 | df[date_column] = pd.to_datetime(df[date_column], format='%d/%m/%Y', utc=True) 116 | except ValueError: 117 | df[date_column] = pd.to_datetime(df[date_column], utc=True) 118 | df['vintage'] = df[date_column].dt.year 119 | return df 120 | 121 | 122 | @pf.register_dataframe_method 123 | def calculate_vcs_issuances(df: pd.DataFrame) -> pd.DataFrame: 124 | """Logic to calculate verra transactions from prepocessed transaction data 125 | 126 | Verra allows rolling/partial issuances. This requires inferring vintage issuance from `Total Vintage Quantity` 127 | 128 | Parameters 129 | ---------- 130 | df : pd.DataFrame 131 | Input DataFrame with preprocessed transaction data. 132 | 133 | Returns 134 | ------- 135 | pd.DataFrame 136 | DataFrame containing only issuance transactions with deduplicated and renamed columns. 137 | """ 138 | 139 | df_issuance = df.sort_values('transaction_date').drop_duplicates( 140 | ['vintage', 'project_id', 'Total Vintage Quantity'], keep='first' 141 | ) 142 | 143 | df_issuance = df_issuance.rename(columns={'Total Vintage Quantity': 'quantity'}) 144 | 145 | df_issuance['transaction_type'] = 'issuance' 146 | 147 | return df_issuance 148 | 149 | 150 | @pf.register_dataframe_method 151 | def calculate_vcs_retirements(df: pd.DataFrame) -> pd.DataFrame: 152 | """ 153 | Calculate retirements and cancellations for Verra transactions. The data does not allow 154 | distinguishing between retirements and cancellations. 155 | 156 | Parameters 157 | ---------- 158 | df : pd.DataFrame 159 | Input DataFrame with Verra transaction data. 160 | 161 | Returns 162 | ------- 163 | pd.DataFrame 164 | DataFrame containing only retirement transactions with renamed columns. 165 | """ 166 | 167 | retirements = df[df['transaction_type'] != 'issuance'] 168 | retirements = retirements.rename(columns={'Quantity Issued': 'quantity'}) 169 | return retirements 170 | 171 | 172 | @pf.register_dataframe_method 173 | def process_vcs_credits( 174 | df: pd.DataFrame, 175 | *, 176 | download_type: str = 'transactions', 177 | registry_name: str = 'verra', 178 | prefix: str = 'VCS', 179 | arb: pd.DataFrame | None = None, 180 | harmonize_beneficiary_info: bool = False, 181 | ) -> pd.DataFrame: 182 | """ 183 | Process Verra credits data, including generation of project IDs, determination of transaction types, 184 | setting transaction dates, and various data transformations and validations. 185 | 186 | Parameters 187 | ---------- 188 | df : pd.DataFrame 189 | Input DataFrame with raw credits data. 190 | download_type : str, optional 191 | Type of download operation performed (default is 'transactions'). 192 | registry_name : str, optional 193 | Name of the registry (default is 'verra'). 
194 | prefix : str, optional 195 | Prefix for generating project IDs (default is 'VCS'). 196 | arb : pd.DataFrame | None, optional 197 | DataFrame for additional data merging (default is None). 198 | 199 | Returns 200 | ------- 201 | pd.DataFrame 202 | Processed DataFrame with Verra credits data. 203 | """ 204 | 205 | df = df.copy() 206 | data = ( 207 | df.set_registry(registry_name=registry_name) 208 | .generate_vcs_project_ids(prefix=prefix) 209 | .determine_vcs_transaction_type(date_column='Retirement/Cancellation Date') 210 | .set_vcs_transaction_dates( 211 | date_column='Retirement/Cancellation Date', fallback_column='Issuance Date' 212 | ) 213 | .clean_and_convert_numeric_columns(columns=['Total Vintage Quantity', 'Quantity Issued']) 214 | .set_vcs_vintage_year(date_column='Vintage End') 215 | .convert_to_datetime(columns=['transaction_date'], dayfirst=True) 216 | ) 217 | 218 | issuances = data.calculate_vcs_issuances() 219 | retirements = data.calculate_vcs_retirements() 220 | 221 | column_mapping = load_column_mapping( 222 | registry_name=registry_name, download_type=download_type, mapping_path=CREDIT_SCHEMA_UPATH 223 | ) 224 | 225 | columns = {v: k for k, v in column_mapping.items()} 226 | 227 | merged_df = pd.concat([issuances, retirements]).reset_index(drop=True).rename(columns=columns) 228 | 229 | issuances = merged_df.aggregate_issuance_transactions() 230 | retirements = merged_df[merged_df['transaction_type'].str.contains('retirement')] 231 | data = ( 232 | pd.concat([issuances, retirements]) 233 | .reset_index(drop=True) 234 | .add_missing_columns(schema=credit_without_id_schema) 235 | .validate(schema=credit_without_id_schema) 236 | ) 237 | 238 | if arb is not None and not arb.empty: 239 | data = data.merge_with_arb(arb=arb) 240 | 241 | if harmonize_beneficiary_info: 242 | data = data.pipe( 243 | harmonize_beneficiary_data, registry_name=registry_name, download_type=download_type 244 | ) 245 | 246 | data = ( 247 | data.add_missing_columns(schema=credit_without_id_schema) 248 | .convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d') 249 | .validate(schema=credit_without_id_schema) 250 | ) 251 | 252 | return data 253 | 254 | 255 | @pf.register_dataframe_method 256 | def add_vcs_compliance_projects(df: pd.DataFrame) -> pd.DataFrame: 257 | """ 258 | Add details about two compliance projects to projects database. 259 | 260 | Parameters 261 | ---------- 262 | df : pd.DataFrame 263 | A pandas DataFrame containing project data with a 'project_id' column. 264 | 265 | Returns 266 | -------- 267 | df: pd.DataFrame 268 | A pandas DataFrame with two additional rows, describing two projects from the mostly unused Verra compliance 269 | registry portal. 
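    Examples
    --------
    A minimal sketch; the two compliance rows are always appended to whatever projects
    are passed in:

    >>> import pandas as pd
    >>> from offsets_db_data import vcs  # noqa: F401  (registers the accessor)
    >>> df = pd.DataFrame({'project_id': ['VCS1']})
    >>> df.add_vcs_compliance_projects()['project_id'].tolist()
    ['VCS1', 'VCSOPR2', 'VCSOPR10']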
270 | """ 271 | 272 | vcs_project_dicts = [ 273 | { 274 | 'project_id': 'VCSOPR2', 275 | 'name': 'Corinth Abandoned Mine Methane Recovery Project', 276 | 'protocol': ['arb-mine-methane'], 277 | 'category': 'ghg-management', 278 | 'project_type': 'mine methane capture', 279 | 'project_type_source': 'carbonplan', 280 | 'proponent': 'Keyrock Energy LLC', 281 | 'country': 'United States', 282 | 'status': 'registered', 283 | 'is_compliance': True, 284 | 'registry': 'verra', 285 | 'project_url': 'https://registry.verra.org/app/projectDetail/VCS/2265', 286 | }, 287 | { 288 | 'project_id': 'VCSOPR10', 289 | 'name': 'Blue Source-Alford Improved Forest Management Project', 290 | 'protocol': ['arb-forest'], 291 | 'category': 'forest', 292 | 'project_type': 'improved forest management', 293 | 'project_type_source': 'carbonplan', 294 | 'proponent': 'Ozark Regional Land Trust', 295 | 'country': 'United States', 296 | 'status': 'registered', 297 | 'is_compliance': True, 298 | 'registry': 'verra', 299 | 'project_url': 'https://registry.verra.org/app/projectDetail/VCS/2271', 300 | }, 301 | ] 302 | vcs_projects = pd.DataFrame(vcs_project_dicts) 303 | return pd.concat([df, vcs_projects], ignore_index=True) 304 | 305 | 306 | @pf.register_dataframe_method 307 | def add_vcs_project_url(df: pd.DataFrame) -> pd.DataFrame: 308 | """ 309 | Create a URL for each project based on its Verra project ID. 310 | 311 | Parameters 312 | ---------- 313 | df : pd.DataFrame 314 | Input DataFrame with Verra project data. 315 | 316 | Returns 317 | ------- 318 | pd.DataFrame 319 | DataFrame with a new 'project_url' column, containing the generated URLs for each project. 320 | """ 321 | 322 | df['project_url'] = ( 323 | 'https://registry.verra.org/app/projectDetail/VCS/' + df['project_id'].str[3:] 324 | ) 325 | return df 326 | 327 | 328 | @pf.register_dataframe_method 329 | def add_vcs_project_id(df: pd.DataFrame) -> pd.DataFrame: 330 | """ 331 | Add a prefix 'VCS' to each project ID in the DataFrame. 332 | 333 | Parameters 334 | ---------- 335 | df : pd.DataFrame 336 | Input DataFrame with Verra project data. 337 | 338 | Returns 339 | ------- 340 | pd.DataFrame 341 | DataFrame with updated 'project_id' column, containing the prefixed project IDs. 342 | """ 343 | 344 | df['project_id'] = df['project_id'].apply(lambda x: f'VCS{str(x)}') 345 | return df 346 | 347 | 348 | @pf.register_dataframe_method 349 | def process_vcs_projects( 350 | df: pd.DataFrame, 351 | *, 352 | credits: pd.DataFrame, 353 | registry_name: str = 'verra', 354 | download_type: str = 'projects', 355 | ) -> pd.DataFrame: 356 | """ 357 | Process Verra projects data, including renaming, adding, and validating columns, and merging with credits data. 358 | 359 | Parameters 360 | ---------- 361 | df : pd.DataFrame 362 | Input DataFrame with raw projects data. 363 | credits : pd.DataFrame 364 | DataFrame containing credits data for merging. 365 | registry_name : str, optional 366 | Name of the registry (default is 'verra'). 367 | download_type : str, optional 368 | Type of download operation performed (default is 'projects'). 369 | 370 | Returns 371 | ------- 372 | pd.DataFrame 373 | Processed DataFrame with harmonized and validated Verra projects data. 
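    Examples
    --------
    A high-level usage sketch. The file names below are hypothetical placeholders for
    locally downloaded Verra registry exports; they are not shipped with the package:

    >>> import pandas as pd
    >>> from offsets_db_data import vcs  # noqa: F401  (registers the accessors)
    >>> raw_credits = pd.read_csv('vcs-transactions.csv')  # doctest: +SKIP
    >>> raw_projects = pd.read_csv('vcs-projects.csv')  # doctest: +SKIP
    >>> credits = raw_credits.process_vcs_credits()  # doctest: +SKIP
    >>> projects = raw_projects.process_vcs_projects(credits=credits)  # doctest: +SKIP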
374 | """ 375 | 376 | df = df.copy() 377 | credits = credits.copy() 378 | registry_project_column_mapping = load_registry_project_column_mapping( 379 | registry_name=registry_name, file_path=PROJECT_SCHEMA_UPATH 380 | ) 381 | inverted_column_mapping = {value: key for key, value in registry_project_column_mapping.items()} 382 | type_category_mapping = load_type_category_mapping() 383 | inverted_protocol_mapping = load_inverted_protocol_mapping() 384 | 385 | data = ( 386 | df.rename(columns=inverted_column_mapping) 387 | .set_registry(registry_name=registry_name) 388 | .add_vcs_project_id() 389 | .add_vcs_project_url() 390 | .harmonize_country_names() 391 | .harmonize_status_codes() 392 | .map_protocol(inverted_protocol_mapping=inverted_protocol_mapping) 393 | .infer_project_type() 394 | .override_project_types( 395 | override_data_path=BERKELEY_PROJECT_TYPE_UPATH, source_str='berkeley' 396 | ) 397 | .add_category( 398 | type_category_mapping=type_category_mapping 399 | ) # must come after types; type -> category 400 | .add_is_compliance_flag() 401 | .add_vcs_compliance_projects() 402 | .map_project_type_to_display_name(type_category_mapping=type_category_mapping) 403 | .add_retired_and_issued_totals(credits=credits) 404 | .add_first_issuance_and_retirement_dates(credits=credits) 405 | .add_missing_columns(schema=project_schema) 406 | .convert_to_datetime(columns=['listed_at'], dayfirst=True) 407 | .validate(schema=project_schema) 408 | ) 409 | 410 | return data 411 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | requires = ["setuptools-scm[toml]>=6.2", "setuptools>=64", "wheel"] 4 | 5 | [project] 6 | authors = [{ name = "CarbonPlan", email = "tech@carbonplan.org" }] 7 | classifiers = [ 8 | "Development Status :: 4 - Beta", 9 | "Intended Audience :: Science/Research", 10 | "License :: OSI Approved :: MIT License", 11 | "Operating System :: OS Independent", 12 | "Programming Language :: Python :: 3", 13 | "Programming Language :: Python :: 3.10", 14 | "Programming Language :: Python :: 3.11", 15 | "Programming Language :: Python :: 3.12", 16 | "Programming Language :: Python :: 3.13", 17 | "Programming Language :: Python", 18 | "Topic :: Scientific/Engineering", 19 | ] 20 | description = "Monitoring the global carbon market" 21 | license = { text = "MIT" } 22 | name = "offsets-db-data" 23 | readme = "README.md" 24 | requires-python = ">=3.10" 25 | 26 | dynamic = ["dependencies", "version"] 27 | 28 | [project.scripts] 29 | offsets-db-data-orcli = "offsets_db_data.openrefine:main" 30 | 31 | [tool.setuptools.dynamic] 32 | 33 | dependencies = { file = ["requirements.txt"] } 34 | optional-dependencies = { dev = { file = [ 35 | "requirements-dev.txt", 36 | ] }, docs = { file = [ 37 | "requirements-docs.txt", 38 | ] } } 39 | 40 | [project.urls] 41 | "database web tool" = "https://carbonplan.org/research/offsets-db" 42 | "documentation" = "https://offsets-db-data.readthedocs.io/" 43 | "explainer" = "https://carbonplan.org/research/offsets-db-explainer" 44 | repository = "https://github.com/carbonplan/offsets-db-data" 45 | 46 | [tool.setuptools.packages.find] 47 | include = ["offsets_db_data*"] 48 | 49 | [tool.setuptools.package-data] 50 | offsets_db_data = ["*.yaml", "configs/*.json", "py.typed"] 51 | 52 | [tool.setuptools_scm] 53 | fallback_version = "999" 54 | local_scheme = "node-and-date" 55 | 
version_scheme = "post-release" 56 | write_to = "offsets_db_data/_version.py" 57 | write_to_template = '__version__ = "{version}"' 58 | 59 | [tool.coverage.run] 60 | branch = true 61 | omit = ["tests/*"] 62 | 63 | [tool.ruff] 64 | extend-include = ["*.ipynb"] 65 | line-length = 100 66 | target-version = "py310" 67 | 68 | builtins = ["ellipsis"] 69 | # Exclude a variety of commonly ignored directories. 70 | exclude = [ 71 | ".bzr", 72 | ".direnv", 73 | ".eggs", 74 | ".git", 75 | ".git-rewrite", 76 | ".hg", 77 | ".ipynb_checkpoints", 78 | ".mypy_cache", 79 | ".nox", 80 | ".pants.d", 81 | ".pyenv", 82 | ".pytest_cache", 83 | ".pytype", 84 | ".ruff_cache", 85 | ".svn", 86 | ".tox", 87 | ".venv", 88 | ".vscode", 89 | "__pypackages__", 90 | "_build", 91 | "buck-out", 92 | "build", 93 | "dist", 94 | "node_modules", 95 | "site-packages", 96 | "venv", 97 | ] 98 | [tool.ruff.lint] 99 | ignore = [ 100 | "E501", # Conflicts with ruff format 101 | "E721", # Comparing types instead of isinstance 102 | "E741", # Ambiguous variable names 103 | ] 104 | per-file-ignores = {} 105 | select = [ 106 | # Pyflakes 107 | "F", 108 | # Pycodestyle 109 | "E", 110 | "W", 111 | # isort 112 | "I", 113 | # Pyupgrade 114 | "UP", 115 | ] 116 | 117 | [tool.ruff.lint.mccabe] 118 | max-complexity = 18 119 | 120 | [tool.ruff.lint.isort] 121 | combine-as-imports = true 122 | known-first-party = ["offsets_db_data"] 123 | 124 | [tool.ruff.format] 125 | docstring-code-format = true 126 | quote-style = "single" 127 | 128 | [tool.ruff.lint.pydocstyle] 129 | convention = "numpy" 130 | 131 | [tool.ruff.lint.pyupgrade] 132 | # Preserve types, even if a file imports `from __future__ import annotations`. 133 | keep-runtime-typing = true 134 | 135 | [tool.pytest.ini_options] 136 | addopts = "-n auto --cov=./ --cov-report=xml --cov-report=term-missing --verbose" 137 | console_output_style = "count" 138 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: '3.12' 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | 22 | # Optionally build your docs in additional formats such as PDF and ePub 23 | # formats: 24 | # - pdf 25 | # - epub 26 | 27 | # Optional but recommended, declare the Python requirements required 28 | # to build your documentation 29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 30 | python: 31 | install: 32 | - method: pip 33 | path: . 
34 | - requirements: requirements-docs.txt 35 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==7.4.* 2 | pytest-cov==4.1.* 3 | pytest-mock==3.10.* 4 | pytest-xdist==3.3.* 5 | requests-mock==1.11.* 6 | hypothesis==6.111.* 7 | openpyxl 8 | -------------------------------------------------------------------------------- /requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx-book-theme>=1.1.2 2 | myst-nb 3 | sphinx 4 | sphinx-copybutton 5 | sphinx-design 6 | sphinxext-opengraph 7 | jupyterlab 8 | sphinx-togglebutton 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | country_converter==1.0.0 2 | dask==2025.2.0 3 | fastparquet==2024.11 4 | fsspec==2025.2.0 5 | intake-parquet>=0.3.0 6 | intake<2 7 | pandas==2.2.2 8 | pandera==0.23 9 | pydantic==2.10.* 10 | pyjanitor==0.23.* 11 | requests>=2.31.0 12 | s3fs==2025.2.0 13 | universal_pathlib>=0.1.3 14 | numpy>=2 15 | typer >=0.15.2 16 | -------------------------------------------------------------------------------- /scripts/check-beneficiary-coverage.py: -------------------------------------------------------------------------------- 1 | import fsspec 2 | import pandas as pd 3 | 4 | 5 | def main(): 6 | print('Checking beneficiary coverage against latest production release on S3') 7 | 8 | with fsspec.open( 9 | 'zip://credits.parquet::s3://carbonplan-offsets-db/production/latest/offsets-db.parquet.zip' 10 | ) as f: 11 | credits = pd.read_parquet(f) 12 | retirement_credits = credits[credits['transaction_type'] == 'retirement'] 13 | 14 | beneficiary_cols = [ 15 | 'retirement_beneficiary', 16 | 'retirement_account', 17 | 'retirement_note', 18 | 'retirement_reason', 19 | ] 20 | no_user_data = pd.isna(retirement_credits[beneficiary_cols]).sum(axis=1) == 4 21 | 22 | mapped_stats = ( 23 | retirement_credits[(~no_user_data)] 24 | .groupby(pd.isna(retirement_credits['retirement_beneficiary_harmonized'])) 25 | .quantity.sum() 26 | ) 27 | tot_mapped = mapped_stats.sum() 28 | frac_mapped = mapped_stats[False] / tot_mapped 29 | nlarge_unmapped = ( 30 | retirement_credits[ 31 | (~no_user_data) & pd.isna(retirement_credits['retirement_beneficiary_harmonized']) 32 | ].quantity 33 | > 50_000 34 | ).sum() 35 | 36 | print(f'A total of {mapped_stats[False] / 1_000_000:.2f} million credits have been mapped') 37 | print(f'which represents {frac_mapped * 100:.1f} percent of mappable credit') 38 | print(f'There are {nlarge_unmapped} mappable transactions that exceed 50,000 credits') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /scripts/extract-berkeley-project-types.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import pandas as pd 5 | 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser( 9 | description='Extract project types from latest version of berkeley carbon project data', 10 | ) 11 | parser.add_argument('filename', help='Input filename to process') 12 | 13 | args = parser.parse_args() 14 | 15 | # this is surprisingly slow? 
openpyxl is doing some _work_ 16 | project_data = pd.read_excel( 17 | args.filename, sheet_name='PROJECTS', skiprows=3, usecols=['Project ID', ' Type'] 18 | ) 19 | 20 | def _fix_gld_ids(s: str) -> str: 21 | if s.startswith('GS'): 22 | return f'GLD{s[2:]}' 23 | else: 24 | return s 25 | 26 | out_d = project_data.dropna().set_index('Project ID')[' Type'].to_dict() 27 | out_d = {_fix_gld_ids(k): v.lower() for k, v in out_d.items()} 28 | out_f = '/tmp/berkeley-project-types.json' 29 | with open(out_f, 'w') as f: 30 | print(f'Writing project types to {out_f}') 31 | json.dump(out_d, f, indent=1) 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | from offsets_db_data.apx import * # noqa: F403 5 | from offsets_db_data.arb import * # noqa: F403 6 | from offsets_db_data.gld import * # noqa: F403 7 | from offsets_db_data.models import credit_without_id_schema, project_schema 8 | from offsets_db_data.vcs import * # noqa: F403 9 | 10 | 11 | @pytest.fixture 12 | def date() -> str: 13 | return '2024-08-27' 14 | 15 | 16 | @pytest.fixture 17 | def bucket() -> str: 18 | return 's3://carbonplan-offsets-db/raw' 19 | 20 | 21 | @pytest.fixture 22 | def arb() -> pd.DataFrame: 23 | data = pd.read_excel( 24 | 's3://carbonplan-offsets-db/raw/2024-08-27/arb/nc-arboc_issuance.xlsx', sheet_name=3 25 | ) 26 | return data.process_arb() 27 | 28 | 29 | @pytest.mark.parametrize( 30 | 'harmonize_beneficiary_info', 31 | [True, False], 32 | ) 33 | def test_verra(date, bucket, arb, harmonize_beneficiary_info): 34 | prefix = 'VCS' 35 | projects = pd.read_csv(f'{bucket}/{date}/verra/projects.csv.gz') 36 | credits = pd.read_csv(f'{bucket}/{date}/verra/transactions.csv.gz') 37 | df_credits = credits.process_vcs_credits( 38 | arb=arb[arb.project_id.str.startswith(prefix)], 39 | harmonize_beneficiary_info=harmonize_beneficiary_info, 40 | ) 41 | assert set(df_credits.columns) == set(credit_without_id_schema.columns.keys()) 42 | df_projects = projects.process_vcs_projects(credits=df_credits) 43 | project_schema.validate(df_projects) 44 | credit_without_id_schema.validate(df_credits) 45 | 46 | assert df_projects['project_id'].str.startswith(prefix).all() 47 | assert df_credits['project_id'].str.startswith(prefix).all() 48 | 49 | 50 | @pytest.mark.parametrize( 51 | 'registry, download_types, prefix', 52 | [ 53 | ('art-trees', ['issuances', 'retirements', 'cancellations'], 'ART'), 54 | ('american-carbon-registry', ['issuances', 'retirements', 'cancellations'], 'ACR'), 55 | ('climate-action-reserve', ['issuances', 'retirements', 'cancellations'], 'CAR'), 56 | ], 57 | ) 58 | def test_apx(date, bucket, arb, registry, download_types, prefix): 59 | dfs = [] 60 | for key in download_types: 61 | credits = pd.read_csv(f'{bucket}/{date}/{registry}/{key}.csv.gz') 62 | p = credits.process_apx_credits( 63 | download_type=key, registry_name=registry, harmonize_beneficiary_info=True 64 | ) 65 | dfs.append(p) 66 | 67 | df_credits = pd.concat(dfs).merge_with_arb(arb=arb[arb.project_id.str.startswith(prefix)]) 68 | 
credit_without_id_schema.validate(df_credits) 69 | 70 | assert set(df_credits.columns) == set(credit_without_id_schema.columns.keys()) 71 | 72 | projects = pd.read_csv(f'{bucket}/{date}/{registry}/projects.csv.gz') 73 | df_projects = projects.process_apx_projects(credits=df_credits, registry_name=registry) 74 | project_schema.validate(df_projects) 75 | 76 | assert df_projects['project_id'].str.startswith(prefix).all() 77 | assert df_credits['project_id'].str.startswith(prefix).all() 78 | 79 | 80 | @pytest.mark.parametrize( 81 | 'harmonize_beneficiary_info', 82 | [True, False], 83 | ) 84 | def test_gld( 85 | date, 86 | bucket, 87 | harmonize_beneficiary_info, 88 | ): 89 | registry = 'gold-standard' 90 | download_types = ['issuances', 'retirements'] 91 | prefix = 'GLD' 92 | 93 | dfs = [] 94 | for key in download_types: 95 | credits = pd.read_csv(f'{bucket}/{date}/{registry}/{key}.csv.gz') 96 | p = credits.process_gld_credits( 97 | download_type=key, harmonize_beneficiary_info=harmonize_beneficiary_info 98 | ) 99 | dfs.append(p) 100 | 101 | df_credits = pd.concat(dfs) 102 | credit_without_id_schema.validate(df_credits) 103 | 104 | assert set(df_credits.columns) == set(credit_without_id_schema.columns.keys()) 105 | 106 | projects = pd.read_csv(f'{bucket}/{date}/{registry}/projects.csv.gz') 107 | df_projects = projects.process_gld_projects(credits=df_credits) 108 | project_schema.validate(df_projects) 109 | 110 | # check if all project_id use the same prefix 111 | assert df_projects['project_id'].str.startswith(prefix).all() 112 | assert df_credits['project_id'].str.startswith(prefix).all() 113 | 114 | 115 | @pytest.mark.parametrize( 116 | 'df_credits', 117 | [ 118 | pd.DataFrame().process_gld_credits( 119 | download_type='issuances', harmonize_beneficiary_info=True 120 | ), 121 | pd.concat( 122 | [ 123 | pd.read_csv( 124 | 's3://carbonplan-offsets-db/raw/2024-08-27/gold-standard/issuances.csv.gz' 125 | ).process_gld_credits(download_type='issuances', harmonize_beneficiary_info=True), 126 | pd.read_csv( 127 | 's3://carbonplan-offsets-db/raw/2024-08-27/gold-standard/retirements.csv.gz' 128 | ).process_gld_credits(download_type='retirements', harmonize_beneficiary_info=True), 129 | ] 130 | ), 131 | ], 132 | ) 133 | @pytest.mark.parametrize( 134 | 'projects', 135 | [ 136 | pd.DataFrame(), 137 | pd.read_csv('s3://carbonplan-offsets-db/raw/2024-08-27/gold-standard/projects.csv.gz'), 138 | ], 139 | ) 140 | def test_gld_empty(df_credits, projects): 141 | prefix = 'GLD' 142 | 143 | credit_without_id_schema.validate(df_credits) 144 | 145 | assert set(df_credits.columns) == set(credit_without_id_schema.columns.keys()) 146 | 147 | df_projects = projects.process_gld_projects(credits=df_credits) 148 | project_schema.validate(df_projects) 149 | 150 | # check if all project_id use the same prefix 151 | assert df_projects['project_id'].str.startswith(prefix).all() 152 | assert df_credits['project_id'].str.startswith(prefix).all() 153 | -------------------------------------------------------------------------------- /tests/test_pipeline_utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | import zipfile 3 | from datetime import datetime 4 | from unittest.mock import MagicMock, patch 5 | 6 | import pandas as pd 7 | import pytest 8 | 9 | from offsets_db_data.pipeline_utils import ( 10 | _create_data_zip_buffer, 11 | summarize, 12 | to_parquet, 13 | transform_registry_data, 14 | validate_data, 15 | write_latest_production, 16 | ) 17 | 18 | 19 | 
@pytest.fixture 20 | def sample_credits(): 21 | """Sample credits dataframe for testing.""" 22 | return pd.DataFrame( 23 | { 24 | 'project_id': ['VCS123', 'VCS124', 'ACR456', 'CAR789'], 25 | 'quantity': [100, 200, 150, 300], 26 | 'vintage': [2020, 2021, 2020, 2022], 27 | 'transaction_date': pd.to_datetime( 28 | ['2021-01-01', '2022-02-01', '2021-03-15', '2022-04-30'] 29 | ), 30 | 'transaction_type': ['issuance', 'retirement', 'issuance', 'retirement'], 31 | } 32 | ) 33 | 34 | 35 | @pytest.fixture 36 | def sample_projects(): 37 | """Sample projects dataframe for testing.""" 38 | return pd.DataFrame( 39 | { 40 | 'project_id': ['VCS123', 'VCS124', 'ACR456', 'CAR789'], 41 | 'name': ['Project A', 'Project B', 'Project C', 'Project D'], 42 | 'registry': ['verra', 'verra', 'american-carbon-registry', 'climate-action-reserve'], 43 | 'is_compliance': [False, True, False, True], 44 | 'retired': [50, 200, 75, 250], 45 | 'issued': [100, 200, 150, 300], 46 | 'type': ['forestry', 'renewable-energy', 'agriculture', 'forestry'], 47 | 'type_source': ['carbonplan', 'berkeley', 'carbonplan', 'carbonplan'], 48 | } 49 | ) 50 | 51 | 52 | @patch('offsets_db_data.pipeline_utils.catalog') 53 | def test_validate_data_success(mock_catalog, sample_credits): 54 | """Test validate_data when data passes validation criteria.""" 55 | # Mock old data with 90% of new data quantity 56 | mock_old_data = sample_credits.copy() 57 | mock_old_data['quantity'] = mock_old_data['quantity'] * 0.9 58 | 59 | mock_catalog.__getitem__.return_value = MagicMock() 60 | mock_catalog.__getitem__.return_value.read.return_value = mock_old_data 61 | 62 | # Should not raise exception 63 | validate_data( 64 | new_data=sample_credits, 65 | as_of=datetime(2023, 1, 1), 66 | data_type='credits', 67 | quantity_column='quantity', 68 | aggregation_func=sum, 69 | ) 70 | 71 | # Verify catalog was called properly 72 | mock_catalog.__getitem__.assert_called_with('credits') 73 | 74 | 75 | def test_summarize_single_registry(sample_credits, sample_projects, capsys): 76 | """Test summarize function with a single registry.""" 77 | registry_name = 'verra' 78 | 79 | # Filter data for verra registry 80 | verra_projects = sample_projects[sample_projects['registry'] == registry_name] 81 | verra_credits = sample_credits[sample_credits['project_id'].str.startswith('VCS')] 82 | 83 | summarize( 84 | credits=verra_credits, 85 | projects=verra_projects, 86 | registry_name=registry_name, 87 | ) 88 | 89 | captured = capsys.readouterr() 90 | 91 | assert f'Retired and Issued (in Millions) summary for {registry_name}' in captured.out 92 | assert f'Credits summary (in Millions) for {registry_name}' in captured.out 93 | 94 | 95 | def test_summarize_multi_registry(sample_credits, sample_projects, capsys): 96 | """Test summarize function across multiple registries.""" 97 | 98 | summarize( 99 | credits=sample_credits, 100 | projects=sample_projects, 101 | ) 102 | 103 | captured = capsys.readouterr() 104 | 105 | assert 'Summary Statistics for projects (in Millions)' in captured.out 106 | assert 'Summary Statistics for credits (in Millions)' in captured.out 107 | 108 | 109 | def test_create_data_zip_buffer_csv(sample_credits, sample_projects): 110 | """Test _create_data_zip_buffer with CSV format.""" 111 | 112 | buffer = _create_data_zip_buffer( 113 | credits=sample_credits, 114 | projects=sample_projects, 115 | format_type='csv', 116 | terms_content='Test terms content', 117 | ) 118 | 119 | # Test the buffer contains a valid ZIP 120 | with zipfile.ZipFile(buffer, 'r') as zip_ref: 121 
| filenames = zip_ref.namelist() 122 | 123 | # Check expected files exist 124 | assert 'TERMS_OF_DATA_ACCESS.txt' in filenames 125 | assert 'credits.csv' in filenames 126 | assert 'projects.csv' in filenames 127 | 128 | # Check terms content 129 | assert zip_ref.read('TERMS_OF_DATA_ACCESS.txt').decode('utf-8') == 'Test terms content' 130 | 131 | 132 | def test_create_data_zip_buffer_parquet(sample_credits, sample_projects): 133 | """Test _create_data_zip_buffer with Parquet format.""" 134 | buffer = _create_data_zip_buffer( 135 | credits=sample_credits, 136 | projects=sample_projects, 137 | format_type='parquet', 138 | terms_content='Test terms content', 139 | ) 140 | 141 | # Test the buffer contains a valid ZIP 142 | with zipfile.ZipFile(buffer, 'r') as zip_ref: 143 | filenames = zip_ref.namelist() 144 | 145 | assert 'TERMS_OF_DATA_ACCESS.txt' in filenames 146 | assert 'credits.parquet' in filenames 147 | assert 'projects.parquet' in filenames 148 | 149 | 150 | @patch('fsspec.filesystem') 151 | @patch('fsspec.open') 152 | @patch('offsets_db_data.pipeline_utils._create_data_zip_buffer') 153 | def test_write_latest_production( 154 | mock_create_buffer, 155 | mock_fsspec_open, 156 | mock_fsspec_fs, 157 | sample_credits, 158 | sample_projects, 159 | ): 160 | """Test write_latest_production function.""" 161 | # Setup mocks 162 | mock_fs = MagicMock() 163 | mock_fs.read_text.return_value = 'Test terms content' 164 | mock_fsspec_fs.return_value = mock_fs 165 | 166 | # Create a new buffer for each call 167 | mock_create_buffer.side_effect = [ 168 | io.BytesIO(b'test csv data'), 169 | io.BytesIO(b'test parquet data'), 170 | ] 171 | 172 | mock_file = MagicMock() 173 | mock_context = MagicMock() 174 | mock_context.__enter__.return_value = mock_file 175 | mock_fsspec_open.return_value = mock_context 176 | 177 | # Call function 178 | write_latest_production( 179 | credits=sample_credits, 180 | projects=sample_projects, 181 | bucket='s3://test-bucket', 182 | ) 183 | 184 | # Assert mocks called correctly 185 | assert mock_create_buffer.call_count == 2 # Called for CSV and Parquet 186 | mock_fsspec_fs.assert_called_once_with('s3', anon=False) 187 | assert mock_fsspec_open.call_count == 2 188 | 189 | # Verify write calls 190 | assert mock_file.write.call_count == 2 191 | 192 | 193 | @patch('offsets_db_data.pipeline_utils.to_parquet') 194 | @patch('offsets_db_data.pipeline_utils.summarize') 195 | def test_transform_registry_data(mock_summarize, mock_to_parquet, sample_credits, sample_projects): 196 | """Test transform_registry_data function.""" 197 | # Setup mock functions 198 | process_credits_fn = MagicMock(return_value=sample_credits) 199 | process_projects_fn = MagicMock(return_value=sample_projects) 200 | output_paths = {'credits': 'path/to/credits', 'projects': 'path/to/projects'} 201 | 202 | # Call function 203 | result_credits, result_projects = transform_registry_data( 204 | process_credits_fn=process_credits_fn, 205 | process_projects_fn=process_projects_fn, 206 | output_paths=output_paths, 207 | registry_name='test-registry', 208 | ) 209 | 210 | # Verify calls and returns 211 | process_credits_fn.assert_called_once() 212 | process_projects_fn.assert_called_once_with(credits=sample_credits) 213 | mock_summarize.assert_called_once_with( 214 | credits=sample_credits, projects=sample_projects, registry_name='test-registry' 215 | ) 216 | mock_to_parquet.assert_called_once() 217 | 218 | # Verify return values 219 | assert result_credits.equals(sample_credits) 220 | assert 
result_projects.equals(sample_projects) 221 | 222 | 223 | @patch('tempfile.NamedTemporaryFile') 224 | def test_to_parquet(mock_temp_file, sample_credits, sample_projects): 225 | """Test to_parquet function.""" 226 | # Setup mock 227 | mock_temp = MagicMock() 228 | mock_temp_file.return_value.__enter__.return_value = mock_temp 229 | 230 | # Setup output paths 231 | output_paths = { 232 | 'credits': 'path/to/credits', 233 | 'projects': 'path/to/projects', 234 | } 235 | 236 | # Patch pandas to_parquet to prevent actual file writing 237 | with patch.object(pd.DataFrame, 'to_parquet') as mock_to_parquet: 238 | to_parquet( 239 | credits=sample_credits, 240 | projects=sample_projects, 241 | output_paths=output_paths, 242 | registry_name='test-registry', 243 | ) 244 | 245 | # Assert to_parquet called for all three dataframes 246 | assert mock_to_parquet.call_count == 2 247 | -------------------------------------------------------------------------------- /tests/test_vcs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from offsets_db_data.vcs import ( 6 | add_vcs_compliance_projects, 7 | calculate_vcs_issuances, 8 | calculate_vcs_retirements, 9 | determine_vcs_transaction_type, 10 | generate_vcs_project_ids, 11 | process_vcs_credits, 12 | process_vcs_projects, 13 | set_vcs_transaction_dates, 14 | set_vcs_vintage_year, 15 | ) 16 | 17 | 18 | def vcs_projects() -> pd.DataFrame: 19 | df = pd.DataFrame( 20 | [ 21 | { 22 | 'ID': 75, 23 | 'Name': '5.4 MW Grouped Wind Power Project in Gujarat & Maharashtra (India) by Rohan Builders (India) Pvt Ltd.', 24 | 'Proponent': 'Rohan Builders (India)', 25 | 'Project Type': 'Energy industries (renewable/non-renewable sources)', 26 | 'AFOLU Activities': np.nan, 27 | 'Methodology': 'AMS-I.D.', 28 | 'Status': 'Registered', 29 | 'Country/Area': 'India', 30 | 'Estimated Annual Emission Reductions': '9,143', 31 | 'Region': 'Asia', 32 | 'Project Registration Date': '2009-06-15', 33 | 'Crediting Period Start Date': np.nan, 34 | 'Crediting Period End Date': np.nan, 35 | }, 36 | { 37 | 'ID': 2498, 38 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá', 39 | 'Proponent': 'Miller Forest Investment AG', 40 | 'Project Type': 'Agriculture Forestry and Other Land Use', 41 | 'AFOLU Activities': 'ARR', 42 | 'Methodology': 'AR-ACM0003', 43 | 'Status': 'Registered', 44 | 'Country/Area': 'Paraguay', 45 | 'Estimated Annual Emission Reductions': '204,819', 46 | 'Region': 'Latin America', 47 | 'Project Registration Date': '2022-01-14', 48 | 'Crediting Period Start Date': '2016-01-13', 49 | 'Crediting Period End Date': '2046-01-12', 50 | }, 51 | { 52 | 'ID': 101, 53 | 'Name': 'Bagasse based Co-generation Power Project at Khatauli', 54 | 'Proponent': 'Triveni Engineering and Industries Limited (TEIL)', 55 | 'Project Type': 'Energy industries (renewable/non-renewable sources)', 56 | 'AFOLU Activities': np.nan, 57 | 'Methodology': 'ACM0006', 58 | 'Status': 'Registered', 59 | 'Country/Area': 'India', 60 | 'Estimated Annual Emission Reductions': '86,808', 61 | 'Region': 'Asia', 62 | 'Project Registration Date': '2009-07-15', 63 | 'Crediting Period Start Date': np.nan, 64 | 'Crediting Period End Date': np.nan, 65 | }, 66 | { 67 | 'ID': 3408, 68 | 'Name': 'Mianning1 Water Management with Rice Cultivation', 69 | 'Proponent': 'Yunnan Ruihan Agricultural Technology Development Co., Ltd.', 70 | 'Project Type': 'Agriculture Forestry and Other Land Use', 71 | 'AFOLU 
Activities': 'ALM', 72 | 'Methodology': 'AMS-III.AU', 73 | 'Status': 'Under development', 74 | 'Country/Area': 'China', 75 | 'Estimated Annual Emission Reductions': '55,497', 76 | 'Region': 'Asia', 77 | 'Project Registration Date': np.nan, 78 | 'Crediting Period Start Date': '2018-04-06', 79 | 'Crediting Period End Date': '2025-04-05', 80 | }, 81 | { 82 | 'ID': 1223, 83 | 'Name': 'Yanhe, Dejiang, and Yinjiang Rural Methane Digesters Project in Guizhou Province, China', 84 | 'Proponent': 'Guizhou Black Carbon Energy Tech Prom & App Co. Lt', 85 | 'Project Type': 'Energy industries (renewable/non-renewable sources)', 86 | 'AFOLU Activities': np.nan, 87 | 'Methodology': 'AMS-I.C.; AMS-III.R.', 88 | 'Status': 'Under validation', 89 | 'Country/Area': 'China', 90 | 'Estimated Annual Emission Reductions': '53,247', 91 | 'Region': 'Asia', 92 | 'Project Registration Date': np.nan, 93 | 'Crediting Period Start Date': np.nan, 94 | 'Crediting Period End Date': np.nan, 95 | }, 96 | ] 97 | ) 98 | 99 | return df 100 | 101 | 102 | @pytest.fixture(name='vcs_projects') 103 | def fixture_vcs_projects() -> pd.DataFrame: 104 | return vcs_projects() 105 | 106 | 107 | def vcs_transactions() -> pd.DataFrame: 108 | df = pd.DataFrame( 109 | [ 110 | { 111 | 'Issuance Date': '08/03/2022', 112 | 'Sustainable Development Goals': np.nan, 113 | 'Vintage Start': '01/01/2020', 114 | 'Vintage End': '19/11/2020', 115 | 'ID': 2498, 116 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá', 117 | 'Country/Area': 'Paraguay', 118 | 'Project Type': 'Agriculture Forestry and Other Land Use', 119 | 'Methodology': 'AR-ACM0003', 120 | 'Total Vintage Quantity': '99,870', 121 | 'Quantity Issued': '84,773', 122 | 'Serial Number': '12629-421604735-421689507-VCS-VCU-576-VER-PY-14-2498-01012020-19112020-0', 123 | 'Additional Certifications': np.nan, 124 | 'Retirement/Cancellation Date': np.nan, 125 | 'Retirement Beneficiary': np.nan, 126 | 'Retirement Reason': np.nan, 127 | 'Retirement Details': np.nan, 128 | }, 129 | { 130 | 'Issuance Date': '29/11/2022', 131 | 'Sustainable Development Goals': np.nan, 132 | 'Vintage Start': '01/01/2017', 133 | 'Vintage End': '31/12/2017', 134 | 'ID': 2498, 135 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá', 136 | 'Country/Area': 'Paraguay', 137 | 'Project Type': 'Agriculture Forestry and Other Land Use', 138 | 'Methodology': 'AR-ACM0003', 139 | 'Total Vintage Quantity': '82,455', 140 | 'Quantity Issued': '5,000', 141 | 'Serial Number': '14121-556418249-556423248-VCS-VCU-576-VER-PY-14-2498-01012017-31122017-0', 142 | 'Additional Certifications': np.nan, 143 | 'Retirement/Cancellation Date': '26/12/2022', 144 | 'Retirement Beneficiary': 'DNV AS', 145 | 'Retirement Reason': 'Environmental Benefit', 146 | 'Retirement Details': 'VCUs 2022 for DNV', 147 | }, 148 | { 149 | 'Issuance Date': '24/06/2022', 150 | 'Sustainable Development Goals': np.nan, 151 | 'Vintage Start': '13/01/2016', 152 | 'Vintage End': '31/12/2016', 153 | 'ID': 2498, 154 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá', 155 | 'Country/Area': 'Paraguay', 156 | 'Project Type': 'Agriculture Forestry and Other Land Use', 157 | 'Methodology': 'AR-ACM0003', 158 | 'Total Vintage Quantity': '55,805', 159 | 'Quantity Issued': '1,788', 160 | 'Serial Number': '13378-495669005-495670792-VCS-VCU-576-VER-PY-14-2498-13012016-31122016-0', 161 | 'Additional Certifications': np.nan, 162 | 'Retirement/Cancellation Date': '11/09/2022', 163 | 'Retirement Beneficiary': np.nan, 164 | 'Retirement 
Reason': np.nan, 165 | 'Retirement Details': np.nan, 166 | }, 167 | { 168 | 'Issuance Date': '27/07/2022', 169 | 'Sustainable Development Goals': np.nan, 170 | 'Vintage Start': '01/01/2020', 171 | 'Vintage End': '19/11/2020', 172 | 'ID': 2498, 173 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá', 174 | 'Country/Area': 'Paraguay', 175 | 'Project Type': 'Agriculture Forestry and Other Land Use', 176 | 'Methodology': 'AR-ACM0003', 177 | 'Total Vintage Quantity': '99,870', 178 | 'Quantity Issued': '725', 179 | 'Serial Number': '13488-505972385-505973109-VCS-VCU-576-VER-PY-14-2498-01012020-19112020-0', 180 | 'Additional Certifications': np.nan, 181 | 'Retirement/Cancellation Date': '27/07/2022', 182 | 'Retirement Beneficiary': 'Jebsen & Jessen (GmbH & Co.) KG', 183 | 'Retirement Reason': 'Environmental Benefit', 184 | 'Retirement Details': 'Retired on behalf of Jebsen & Jessen 2022', 185 | }, 186 | { 187 | 'Issuance Date': '11/09/2009', 188 | 'Sustainable Development Goals': np.nan, 189 | 'Vintage Start': '01/04/2006', 190 | 'Vintage End': '18/03/2007', 191 | 'ID': 101, 192 | 'Name': 'Bagasse based Co-generation Power Project at Khatauli', 193 | 'Country/Area': 'India', 194 | 'Project Type': 'Energy industries (renewable/non-renewable sources)', 195 | 'Methodology': 'ACM0006', 196 | 'Total Vintage Quantity': '62,796', 197 | 'Quantity Issued': '25,433', 198 | 'Serial Number': '240-7863589-7889021-VCU-003-APX-IN-1-101-01042006-18032007-0', 199 | 'Additional Certifications': np.nan, 200 | 'Retirement/Cancellation Date': '17/06/2015', 201 | 'Retirement Beneficiary': np.nan, 202 | 'Retirement Reason': np.nan, 203 | 'Retirement Details': np.nan, 204 | }, 205 | { 206 | 'Issuance Date': '04/11/2022', 207 | 'Sustainable Development Goals': np.nan, 208 | 'Vintage Start': '01/01/2019', 209 | 'Vintage End': '31/12/2019', 210 | 'ID': 2498, 211 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá', 212 | 'Country/Area': 'Paraguay', 213 | 'Project Type': 'Agriculture Forestry and Other Land Use', 214 | 'Methodology': 'AR-ACM0003', 215 | 'Total Vintage Quantity': '99,871', 216 | 'Quantity Issued': '1,413', 217 | 'Serial Number': '13969-543072663-543074075-VCS-VCU-576-VER-PY-14-2498-01012019-31122019-0', 218 | 'Additional Certifications': np.nan, 219 | 'Retirement/Cancellation Date': '26/12/2022', 220 | 'Retirement Beneficiary': 'DNV AS', 221 | 'Retirement Reason': 'Environmental Benefit', 222 | 'Retirement Details': 'VCUs 2022 for DNV', 223 | }, 224 | { 225 | 'Issuance Date': '27/07/2022', 226 | 'Sustainable Development Goals': np.nan, 227 | 'Vintage Start': '01/01/2020', 228 | 'Vintage End': '19/11/2020', 229 | 'ID': 2498, 230 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá', 231 | 'Country/Area': 'Paraguay', 232 | 'Project Type': 'Agriculture Forestry and Other Land Use', 233 | 'Methodology': 'AR-ACM0003', 234 | 'Total Vintage Quantity': '99,870', 235 | 'Quantity Issued': '297', 236 | 'Serial Number': '13488-505982056-505982352-VCS-VCU-576-VER-PY-14-2498-01012020-19112020-0', 237 | 'Additional Certifications': np.nan, 238 | 'Retirement/Cancellation Date': '26/12/2022', 239 | 'Retirement Beneficiary': 'DNV AS', 240 | 'Retirement Reason': 'Environmental Benefit', 241 | 'Retirement Details': 'VCUs 2022 for DNV', 242 | }, 243 | { 244 | 'Issuance Date': '27/07/2022', 245 | 'Sustainable Development Goals': np.nan, 246 | 'Vintage Start': '01/01/2018', 247 | 'Vintage End': '31/12/2018', 248 | 'ID': 2498, 249 | 'Name': 'Afforestation of degraded grasslands 
in Caazapa and Guairá', 250 | 'Country/Area': 'Paraguay', 251 | 'Project Type': 'Agriculture Forestry and Other Land Use', 252 | 'Methodology': 'AR-ACM0003', 253 | 'Total Vintage Quantity': '97,077', 254 | 'Quantity Issued': '1,380', 255 | 'Serial Number': '13487-505962385-505963764-VCS-VCU-576-VER-PY-14-2498-01012018-31122018-0', 256 | 'Additional Certifications': np.nan, 257 | 'Retirement/Cancellation Date': '20/10/2022', 258 | 'Retirement Beneficiary': 'Implement Consulting Group', 259 | 'Retirement Reason': 'Environmental Benefit', 260 | 'Retirement Details': 'Retirement of 1380t in the name of Implement Consulting Group, for flights 2021', 261 | }, 262 | { 263 | 'Issuance Date': '27/07/2022', 264 | 'Sustainable Development Goals': np.nan, 265 | 'Vintage Start': '01/01/2020', 266 | 'Vintage End': '19/11/2020', 267 | 'ID': 2498, 268 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá', 269 | 'Country/Area': 'Paraguay', 270 | 'Project Type': 'Agriculture Forestry and Other Land Use', 271 | 'Methodology': 'AR-ACM0003', 272 | 'Total Vintage Quantity': '99,870', 273 | 'Quantity Issued': '8,946', 274 | 'Serial Number': '13488-505973110-505982055-VCS-VCU-576-VER-PY-14-2498-01012020-19112020-0', 275 | 'Additional Certifications': np.nan, 276 | 'Retirement/Cancellation Date': '01/12/2022', 277 | 'Retirement Beneficiary': np.nan, 278 | 'Retirement Reason': np.nan, 279 | 'Retirement Details': np.nan, 280 | }, 281 | { 282 | 'Issuance Date': '11/09/2009', 283 | 'Sustainable Development Goals': np.nan, 284 | 'Vintage Start': '01/04/2006', 285 | 'Vintage End': '18/03/2007', 286 | 'ID': 101, 287 | 'Name': 'Bagasse based Co-generation Power Project at Khatauli', 288 | 'Country/Area': 'India', 289 | 'Project Type': 'Energy industries (renewable/non-renewable sources)', 290 | 'Methodology': 'ACM0006', 291 | 'Total Vintage Quantity': '62,796', 292 | 'Quantity Issued': '1,466', 293 | 'Serial Number': '240-7889022-7890487-VCU-003-APX-IN-1-101-01042006-18032007-0', 294 | 'Additional Certifications': np.nan, 295 | 'Retirement/Cancellation Date': '18/06/2015', 296 | 'Retirement Beneficiary': np.nan, 297 | 'Retirement Reason': np.nan, 298 | 'Retirement Details': np.nan, 299 | }, 300 | ] 301 | ) 302 | return df 303 | 304 | 305 | @pytest.fixture(name='vcs_transactions') 306 | def fixture_vcs_transactions() -> pd.DataFrame: 307 | return vcs_transactions() 308 | 309 | 310 | def test_determine_vcs_transaction_type(vcs_transactions): 311 | df = determine_vcs_transaction_type( 312 | vcs_transactions, date_column='Retirement/Cancellation Date' 313 | ) 314 | 315 | # Check if the 'transaction_type' column is created 316 | assert 'transaction_type' in df.columns 317 | 318 | # Check that the function correctly assigns 'retirement/cancellation' or 'issuance' 319 | for i, row in df.iterrows(): 320 | if pd.notnull(row['Retirement/Cancellation Date']): 321 | assert row['transaction_type'] == 'retirement' 322 | else: 323 | assert row['transaction_type'] == 'issuance' 324 | 325 | 326 | def test_set_vcs_transaction_dates(vcs_transactions): 327 | df = set_vcs_transaction_dates( 328 | vcs_transactions, 329 | date_column='Retirement/Cancellation Date', 330 | fallback_column='Issuance Date', 331 | ) 332 | 333 | # Check if the 'transaction_date' column is created 334 | assert 'transaction_date' in df.columns 335 | 336 | # Create a series for expected transaction_date values 337 | expected_transaction_date = vcs_transactions['Retirement/Cancellation Date'].where( 338 | vcs_transactions['Retirement/Cancellation 
Date'].notnull(), 339 | vcs_transactions['Issuance Date'], 340 | ) 341 | 342 | expected_transaction_date.name = ( 343 | 'transaction_date' # Set the name of the Series to match the DataFrame column 344 | ) 345 | 346 | # Use assert_series_equal to compare the entire series 347 | pd.testing.assert_series_equal(df['transaction_date'], expected_transaction_date) 348 | 349 | 350 | def test_set_vcs_vintage_year(vcs_transactions): 351 | df = set_vcs_vintage_year(vcs_transactions, date_column='Issuance Date') 352 | 353 | # Check if the 'vintage' column is created 354 | assert 'vintage' in df.columns 355 | 356 | # Convert 'Issuance Date' in the original DataFrame to datetime for comparison 357 | expected_vintage = pd.to_datetime( 358 | vcs_transactions['Issuance Date'], dayfirst=True, utc=True 359 | ).dt.year 360 | expected_vintage.name = 'vintage' # Set the name of the Series to match the DataFrame column 361 | 362 | # Use assert_series_equal to compare the 'vintage' column with the expected result 363 | pd.testing.assert_series_equal(df['vintage'], expected_vintage) 364 | 365 | 366 | def test_calculate_vcs_issuances(vcs_transactions): 367 | # Process the vcs_transactions similar to process_vcs_credits 368 | processed_data = ( 369 | vcs_transactions.set_registry(registry_name='verra') 370 | .generate_vcs_project_ids(prefix='VCS') 371 | .determine_vcs_transaction_type(date_column='Retirement/Cancellation Date') 372 | .set_vcs_transaction_dates( 373 | date_column='Retirement/Cancellation Date', fallback_column='Issuance Date' 374 | ) 375 | .clean_and_convert_numeric_columns(columns=['Total Vintage Quantity', 'Quantity Issued']) 376 | .set_vcs_vintage_year(date_column='Vintage End') 377 | .convert_to_datetime(columns=['transaction_date'], dayfirst=True) 378 | ) 379 | 380 | # Apply calculate_vcs_issuances 381 | issuances = calculate_vcs_issuances(processed_data) 382 | 383 | # Assertions 384 | # Ensure duplicates are removed based on the specified columns 385 | assert issuances.duplicated(subset=['vintage', 'project_id', 'quantity']).sum() == 0 386 | 387 | # Ensure the 'quantity' column is correctly populated 388 | assert 'quantity' in issuances.columns 389 | 390 | # Ensure 'transaction_type' is set to 'issuance' 391 | assert all(issuances['transaction_type'] == 'issuance') 392 | 393 | 394 | def test_calculate_vcs_retirements(vcs_transactions): 395 | # Process the vcs_transactions similar to process_vcs_credits 396 | processed_data = ( 397 | vcs_transactions.set_registry(registry_name='verra') 398 | .generate_vcs_project_ids(prefix='VCS') 399 | .determine_vcs_transaction_type(date_column='Retirement/Cancellation Date') 400 | .set_vcs_transaction_dates( 401 | date_column='Retirement/Cancellation Date', fallback_column='Issuance Date' 402 | ) 403 | .clean_and_convert_numeric_columns(columns=['Total Vintage Quantity', 'Quantity Issued']) 404 | .set_vcs_vintage_year(date_column='Vintage End') 405 | .convert_to_datetime(columns=['transaction_date'], dayfirst=True) 406 | ) 407 | 408 | # Apply calculate_vcs_retirements 409 | retirements = calculate_vcs_retirements(processed_data) 410 | 411 | # Assertions 412 | # Check if 'retirement' and 'cancellation' types are present and 'issuance' types are filtered out 413 | assert all(retirements['transaction_type'].str.contains('retirement')) 414 | 415 | # Ensure the 'quantity' column is correctly renamed 416 | assert 'quantity' in retirements.columns 417 | assert 'Quantity Issued' not in retirements.columns 418 | 419 | 420 | def 
test_generate_vcs_project_ids(vcs_projects): 421 | df = vcs_projects 422 | df = generate_vcs_project_ids(df, prefix='VCS') 423 | assert df['project_id'].tolist() == [ 424 | 'VCS75', 425 | 'VCS2498', 426 | 'VCS101', 427 | 'VCS3408', 428 | 'VCS1223', 429 | ] 430 | 431 | 432 | def test_add_vcs_compliance_projects(vcs_projects): 433 | original_length = len(vcs_projects) 434 | df = add_vcs_compliance_projects(vcs_projects) 435 | 436 | # Check if two new rows are added 437 | assert len(df) == original_length + 2 438 | 439 | # Optionally, check for the presence of specific project details 440 | assert 'VCSOPR2' in df['project_id'].values 441 | assert 'VCSOPR10' in df['project_id'].values 442 | 443 | 444 | def test_process_vcs_projects(vcs_projects, vcs_transactions): 445 | vcs_credits = process_vcs_credits(vcs_transactions, harmonize_beneficiary_info=False) 446 | df = process_vcs_projects( 447 | vcs_projects, credits=vcs_credits, registry_name='verra', download_type='projects' 448 | ) 449 | 450 | assert 'listed_at' in df.columns 451 | # check project_url series 452 | assert df['project_url'].tolist() == [ 453 | 'https://registry.verra.org/app/projectDetail/VCS/75', 454 | 'https://registry.verra.org/app/projectDetail/VCS/2498', 455 | 'https://registry.verra.org/app/projectDetail/VCS/101', 456 | 'https://registry.verra.org/app/projectDetail/VCS/3408', 457 | 'https://registry.verra.org/app/projectDetail/VCS/1223', 458 | 'https://registry.verra.org/app/projectDetail/VCS/2265', # From add_vcs_compliance_projects 459 | 'https://registry.verra.org/app/projectDetail/VCS/2271', # From add_vcs_compliance_projects 460 | ] 461 | # check project_id series 462 | assert df['project_id'].tolist() == [ 463 | 'VCS75', 464 | 'VCS2498', 465 | 'VCS101', 466 | 'VCS3408', 467 | 'VCS1223', 468 | 'VCSOPR2', # From add_vcs_compliance_projects 469 | 'VCSOPR10', # From add_vcs_compliance_projects 470 | ] 471 | 472 | 473 | def test_process_vcs_projects_with_totals_and_dates(vcs_projects, vcs_transactions): 474 | # Process the vcs_transactions as per your existing pipeline 475 | # Assuming process_vcs_credits or similar functions are in place 476 | vcs_credits = process_vcs_credits(vcs_transactions, harmonize_beneficiary_info=False) 477 | 478 | # Process the vcs_projects 479 | processed_projects = process_vcs_projects( 480 | vcs_projects, credits=vcs_credits, registry_name='verra', download_type='projects' 481 | ) 482 | 483 | # Assertions for retired and issued totals, and first issuance/retirement dates 484 | # You need to know expected values for at least one project based on your test data 485 | project_id = 'VCS2498' 486 | 487 | # Extract the row for the specific project 488 | project_data = processed_projects[processed_projects['project_id'] == project_id] 489 | 490 | # Assert the total issued and retired quantities 491 | expected_total_issued = 435078 # Calculate this based on vcs_transactions fixture 492 | expected_total_retired = 19549 # Calculate this based on vcs_transactions fixture 493 | assert project_data['issued'].iloc[0] == expected_total_issued 494 | assert project_data['retired'].iloc[0] == expected_total_retired 495 | 496 | assert isinstance(project_data['first_issuance_at'].iloc[0], pd.Timestamp) 497 | assert isinstance(project_data['first_retirement_at'].iloc[0], pd.Timestamp) 498 | --------------------------------------------------------------------------------