├── .github
│   ├── dependabot.yaml
│   └── workflows
│       ├── CI.yaml
│       └── pypi.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.json
├── LICENSE
├── README.md
├── TERMS_OF_DATA_ACCESS
├── docs
│   ├── TERMS-OF-DATA-ACCESS.md
│   ├── _static
│   │   ├── monogram-dark-cropped.png
│   │   └── monogram-light-cropped.png
│   ├── api.md
│   ├── conf.py
│   ├── data-access.md
│   ├── data-processing.md
│   ├── glossary.md
│   ├── index.md
│   └── install-offsets-db-data.md
├── offsets_db_data
│   ├── __init__.py
│   ├── apx.py
│   ├── arb.py
│   ├── catalog.yaml
│   ├── common.py
│   ├── configs
│   │   ├── all-protocol-mapping.json
│   │   ├── beneficiary-mappings.json
│   │   ├── berkeley-project-types.json
│   │   ├── credits-raw-columns-mapping.json
│   │   ├── projects-raw-columns-mapping.json
│   │   └── type-category-mapping.json
│   ├── credits.py
│   ├── data.py
│   ├── gld.py
│   ├── models.py
│   ├── openrefine.py
│   ├── pipeline_utils.py
│   ├── projects.py
│   ├── py.typed
│   ├── registry.py
│   └── vcs.py
├── pyproject.toml
├── readthedocs.yml
├── requirements-dev.txt
├── requirements-docs.txt
├── requirements.txt
├── scripts
│   ├── check-beneficiary-coverage.py
│   └── extract-berkeley-project-types.py
└── tests
    ├── __init__.py
    ├── test_integration.py
    ├── test_pipeline_utils.py
    └── test_vcs.py
/.github/dependabot.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: 'github-actions'
4 |     directory: '/'
5 |     schedule:
6 |       interval: monthly
7 |
--------------------------------------------------------------------------------
/.github/workflows/CI.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on:
3 |   push:
4 |     branches:
5 |       - main
6 |   pull_request:
7 |     branches:
8 |       - main
9 |   workflow_dispatch:
10 |
11 |   schedule:
12 |     - cron: '0 5 * * *' # At 05:00
13 |
14 | concurrency:
15 |   group: ${{ github.workflow }}-${{ github.ref }}
16 |   cancel-in-progress: true
17 |
18 | permissions:
19 |   id-token: write # This is required for requesting the JWT
20 |   contents: read # This is required for actions/checkout
21 |
22 | env:
23 |   AWS_DEFAULT_REGION: us-west-2
24 |
25 | jobs:
26 |   test:
27 |     runs-on: ubuntu-latest
28 |     timeout-minutes: 120
29 |     defaults:
30 |       run:
31 |         shell: bash -l {0}
32 |     services:
33 |       openrefine:
34 |         image: abesesr/openrefine:3.8.7
35 |         ports:
36 |           - 3333:3333
37 |         options: --name openrefine
38 |     steps:
39 |       - uses: actions/checkout@v4
40 |       - name: configure aws credentials
41 |         uses: aws-actions/configure-aws-credentials@v4
42 |         with:
43 |           role-to-assume: arn:aws:iam::631969445205:role/github-action-role
44 |           role-session-name: offsets-db-etl-role-session
45 |           aws-region: ${{ env.AWS_DEFAULT_REGION }}
46 |       - uses: actions/setup-python@v5
47 |         with:
48 |           python-version: '3.10'
49 |           cache: 'pip'
50 |
51 |       - name: Install dependencies
52 |         run: |
53 |           python -m pip install --upgrade pip
54 |           python -m pip install -r requirements.txt
55 |           python -m pip install -r requirements-dev.txt
56 |
57 |       - name: Install package
58 |         run: |
59 |           python -m pip install .
60 |
61 |       - name: List packages
62 |         run: |
63 |           python -m pip list
64 |
65 |       - name: Install CLI
66 |         run: |
67 |           offsets-db-data-orcli install --destination /usr/local/bin
68 |           offsets-db-data-orcli --help
69 |
70 |       - name: Run tests
71 |         run: |
72 |           python -m pytest -s
73 |
--------------------------------------------------------------------------------
/.github/workflows/pypi.yaml:
--------------------------------------------------------------------------------
1 | name: Build distribution
2 | on:
3 |   release:
4 |     types:
5 |       - published
6 |   push:
7 |
8 | jobs:
9 |   build-artifacts:
10 |     runs-on: ubuntu-latest
11 |     if: github.repository == 'carbonplan/offsets-db-data'
12 |     steps:
13 |       - uses: actions/checkout@v4
14 |         with:
15 |           fetch-depth: 0
16 |       - uses: actions/setup-python@v5
17 |         name: Install Python
18 |         with:
19 |           python-version: '3.10'
20 |
21 |       - name: Install dependencies
22 |         run: |
23 |           python -m pip install --upgrade pip
24 |           python -m pip install setuptools setuptools-scm wheel twine check-manifest
25 |
26 |       - name: Build tarball and wheels
27 |         run: |
28 |           git clean -xdf
29 |           git restore -SW .
30 |           python -m build --sdist --wheel .
31 |
32 |       - name: Check built artifacts
33 |         run: |
34 |           python -m twine check dist/*
35 |           pwd
36 |           if [ -f dist/offsets-db-data-unknown.tar.gz ]; then
37 |             echo "❌ INVALID VERSION NUMBER"
38 |             exit 1
39 |           else
40 |             echo "✅ Looks good"
41 |           fi
42 |       - uses: actions/upload-artifact@v4
43 |         with:
44 |           name: releases
45 |           path: dist
46 |
47 |   test-built-dist:
48 |     needs: build-artifacts
49 |     runs-on: ubuntu-latest
50 |     steps:
51 |       - uses: actions/setup-python@v5
52 |         name: Install Python
53 |         with:
54 |           python-version: '3.10'
55 |       - uses: actions/download-artifact@v4
56 |         with:
57 |           name: releases
58 |           path: dist
59 |       - name: List contents of built dist
60 |         run: |
61 |           ls -ltrh
62 |           ls -ltrh dist
63 |
64 |       - name: Verify the built dist/wheel is valid
65 |         if: github.event_name == 'push'
66 |         run: |
67 |           python -m pip install --upgrade pip
68 |           python -m pip install dist/offsets_db_data*.whl
69 |           python -c "import offsets_db_data; print(offsets_db_data.__version__)"
70 |
71 |   upload-to-pypi:
72 |     needs: test-built-dist
73 |     if: github.event_name == 'release'
74 |     runs-on: ubuntu-latest
75 |     permissions:
76 |       id-token: write
77 |     steps:
78 |       - uses: actions/download-artifact@v4
79 |         with:
80 |           name: releases
81 |           path: dist
82 |       - name: Publish package to PyPI
83 |         uses: pypa/gh-action-pypi-publish@v1.12.4
84 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | offsets_db_data/_version.py
162 |
163 | .DS_Store
164 | .idea
165 | .jupyter_cache/
166 | jupyter_execute/
167 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | ci:
2 |   autoupdate_schedule: monthly
3 | repos:
4 |   - repo: https://github.com/pre-commit/pre-commit-hooks
5 |     rev: v5.0.0
6 |     hooks:
7 |       - id: trailing-whitespace
8 |       - id: end-of-file-fixer
9 |       - id: check-docstring-first
10 |       - id: check-json
11 |       - id: check-yaml
12 |       - id: double-quote-string-fixer
13 |       - id: debug-statements
14 |       - id: mixed-line-ending
15 |
16 |   - repo: https://github.com/astral-sh/ruff-pre-commit
17 |     rev: 'v0.11.8'
18 |     hooks:
19 |       - id: ruff
20 |         args: ['--fix']
21 |       - id: ruff-format
22 |
23 |   - repo: https://github.com/pre-commit/mirrors-prettier
24 |     rev: v4.0.0-alpha.8
25 |     hooks:
26 |       - id: prettier
27 |
--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/.prettierignore
--------------------------------------------------------------------------------
/.prettierrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "jsxSingleQuote": true,
3 | "printWidth": 80,
4 | "quoteProps": "as-needed",
5 | "semi": false,
6 | "singleQuote": true,
7 | "tabWidth": 2
8 | }
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 carbonplan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
16 | [](https://github.com/carbonplan/offsets-db-data/actions/workflows/CI.yaml)
17 | [](https://github.com/carbonplan/offsets-db-data/actions/workflows/pypi.yaml)
18 | [![PyPI][pypi-badge]][pypi-link]
19 | [![Documentation Status][rtd-badge]][rtd-link]
20 |
21 | # carbonplan / offsets-db-data
22 |
23 | Utilities for cleaning and processing data for the [OffsetsDB web tool](https://carbonplan.org/research/offsets-db/)
24 |
25 | ## installation
26 |
27 | To install the package, you can use pip:
28 |
29 | ```bash
30 | python -m pip install git+https://github.com/carbonplan/offsets-db-data.git
31 | ```
32 |
33 | You can also install the package locally by cloning the repository and running:
34 |
35 | ```bash
36 | git clone https://github.com/carbonplan/offsets-db-data.git
37 | cd offsets-db-data
38 | python -m pip install -e .
39 | ```
40 |
41 | To install the dependencies for development, you can use pip:
42 |
43 | ```bash
44 | python -m pip install -e ".[all]"
45 |
46 | # or
47 |
48 | python -m pip install -e ".[dev]"
49 |
50 | ```
51 |
52 | ## building the documentation
53 |
54 | To build the documentation locally, you can use [sphinx](https://www.sphinx-doc.org/en/master/). You can install the documentation dependencies by running:
55 |
56 | ```bash
57 | python -m pip install -e ".[docs]"
58 | ```
59 |
60 | Then, you can build the documentation by running:
61 |
62 | ```bash
63 | sphinx-build docs docs/_build
64 | ```
65 |
66 | You can view the documentation by opening `docs/_build/index.html` in your browser.
67 |
68 | ## license
69 |
70 | All the code in this repository is [MIT](https://choosealicense.com/licenses/mit/) licensed.
71 |
72 | > [!IMPORTANT]
73 | > Data associated with this repository are subject to additional [terms of data access](https://github.com/carbonplan/offsets-db-data/blob/main/TERMS_OF_DATA_ACCESS).
74 |
75 | ## about us
76 |
77 | CarbonPlan is a non-profit organization that uses data and science for climate action. We aim to improve the transparency and scientific integrity of carbon removal and climate solutions through open data and tools. Find out more at [carbonplan.org](https://carbonplan.org/) or get in touch by [opening an issue](https://github.com/carbonplan/offsets-db/issues/new) or [sending us an email](mailto:hello@carbonplan.org).
78 |
79 | [pypi-badge]: https://img.shields.io/pypi/v/offsets-db-data?logo=pypi
80 | [pypi-link]: https://pypi.org/project/offsets-db-data
81 | [rtd-badge]: https://readthedocs.org/projects/offsets-db-data/badge/?version=latest
82 | [rtd-link]: https://offsets-db-data.readthedocs.io/en/latest/?badge=latest
83 |
--------------------------------------------------------------------------------
/TERMS_OF_DATA_ACCESS:
--------------------------------------------------------------------------------
1 | # TERMS OF DATA ACCESS
2 |
3 | ## OffsetsDB
4 |
5 | OffsetsDB, created by CarbonPlan (https://carbonplan.org), is a
6 | regularly-updated snapshot of carbon offset projects, credit issuances, and
7 | credit retirements published by the following carbon offset registries:
8 |
9 | - American Carbon Registry (ACR)
10 | - ART TREES (ART)
11 | - Climate Action Reserve (CAR)
12 | - Gold Standard (GLD)
13 | - Verra (VCS)
14 |
15 | Carbon offset information has historically been scattered across multiple
16 | locations in formats that are not particularly useful to researchers. This
17 | database is meant to increase transparency, accountability, and reliability of
18 | the carbon offset market, and to provide researchers with a robust tool for
19 | visualizing, validating, and cross-checking offsets. We hope you find it useful!
20 |
21 | ## Our Terms of Use Apply To OffsetsDB
22 |
23 | By downloading, copying, or using this project, and/or any associated content
24 | or data, you agree to CarbonPlan’s Terms Of Use, which can be found here:
25 | [https://carbonplan.org/terms](https://carbonplan.org/terms). As further
26 | explained in the Terms of Use, CarbonPlan makes its projects — including
27 | OffsetsDB — available strictly on an “as-is” and “as-available” basis, without
28 | warranty of any kind, including without limitation the warranties of
29 | merchantability, fitness for a particular purpose, and noninfringement.
30 |
31 | ## Intellectual Property Rights
32 |
33 | Because OffsetsDB consists of purely factual information concerning carbon
34 | offsets that has been made publicly available by the above-referenced
35 | registries, CarbonPlan does not claim copyright in this data.
36 |
37 | However, please note that CarbonPlan does not make any representation as to
38 | whether any of the above-referenced registries may claim any rights in the data
39 | they have published. If you have any questions or concerns about this, please
40 | reach out to the registries directly.
41 |
--------------------------------------------------------------------------------
/docs/TERMS-OF-DATA-ACCESS.md:
--------------------------------------------------------------------------------
1 | # TERMS OF DATA ACCESS
2 |
3 | ## OffsetsDB
4 |
5 | OffsetsDB, created by CarbonPlan (https://carbonplan.org), is a regularly-updated snapshot of carbon offset projects, credit issuances, and credit retirements published by the following carbon offset registries:
6 |
7 | - American Carbon Registry (ACR)
8 | - ART TREES (ART)
9 | - Climate Action Reserve (CAR)
10 | - Gold Standard (GLD)
11 | - Verra (VCS)
12 |
13 | Carbon offset information has historically been scattered across multiple locations in formats that are not particularly useful to researchers. This database is meant to increase transparency, accountability, and reliability of the carbon offset market, and to provide researchers with a robust tool for visualizing, validating, and cross-checking offsets. We hope you find it useful!
14 |
15 | ## Our Terms of Use Apply To OffsetsDB
16 |
17 | By downloading, copying, or using this project, and/or any associated content or data, you agree to CarbonPlan’s Terms Of Use, which can be found here: [https://carbonplan.org/terms](https://carbonplan.org/terms). As further explained in the Terms of Use, CarbonPlan makes its projects — including OffsetsDB — available strictly on an “as-is” and “as-available” basis, without warranty of any kind, including without limitation the warranties of merchantability, fitness for a particular purpose, and noninfringement.
18 |
19 | ## Intellectual Property Rights
20 |
21 | Because OffsetsDB consists of purely factual information concerning carbon offsets that has been made publicly available by the above-referenced registries, CarbonPlan does not claim copyright in this data.
22 |
23 | However, please note that CarbonPlan does not make any representation as to whether any of the above-referenced registries may claim any rights in the data they have published. If you have any questions or concerns about this, please reach out to the registries directly.
24 |
--------------------------------------------------------------------------------
/docs/_static/monogram-dark-cropped.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/docs/_static/monogram-dark-cropped.png
--------------------------------------------------------------------------------
/docs/_static/monogram-light-cropped.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/docs/_static/monogram-light-cropped.png
--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
1 | # API Reference
2 |
3 | This page provides an autogenerated summary of offsets-db-data's API. For more details and examples, refer to the relevant chapters in the main part of the documentation.
4 |
5 | ## Registry Specific Functions
6 |
7 | The following functions are specific to a given registry and are grouped under each registry's module. We currently support the following registries:
8 |
9 | - [verra](https://registry.verra.org/)
10 | - [gold-standard](https://www.goldstandard.org)
11 | - APX registries
12 | - [art-trees](https://art.apx.com/)
13 | - [climate action reserve](https://thereserve2.apx.com)
14 | - [american carbon registry](https://acr2.apx.com/)
15 |
16 | ### Verra
17 |
18 | ```{eval-rst}
19 | .. automodule:: offsets_db_data.vcs
20 | :members:
21 | :undoc-members:
22 | :show-inheritance:
23 | ```
24 |
25 | ### Gold Standard
26 |
27 | ```{eval-rst}
28 | .. automodule:: offsets_db_data.gld
29 | :members:
30 | :undoc-members:
31 | :show-inheritance:
32 | ```
33 |
34 | ### APX Registries
35 |
36 | Functionality for APX registries is currently grouped under the `apx` module.
37 |
38 | ```{eval-rst}
39 | .. automodule:: offsets_db_data.apx
40 | :members:
41 | :undoc-members:
42 | :show-inheritance:
43 | ```
44 |
45 | ## ARB Data Functions
46 |
47 | The following functions are specific to the [ARB data](https://ww2.arb.ca.gov/our-work/programs/compliance-offset-program/arb-offset-credit-issuance).
48 |
49 | ```{eval-rst}
50 | .. automodule:: offsets_db_data.arb
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 | ```
55 |
56 | ## Common Functions
57 |
58 | The following functions are common to all registries.
59 |
60 | ```{eval-rst}
61 | .. automodule:: offsets_db_data.common
62 | :members:
63 | :undoc-members:
64 | :show-inheritance:
65 |
66 | .. automodule:: offsets_db_data.credits
67 | :members:
68 | :undoc-members:
69 | :show-inheritance:
70 |
71 | .. automodule:: offsets_db_data.projects
72 | :members:
73 | :undoc-members:
74 | :show-inheritance:
75 |
76 | .. automodule:: offsets_db_data.models
77 | :members:
78 | :undoc-members:
79 | :show-inheritance:
80 |
81 | .. automodule:: offsets_db_data.registry
82 | :members:
83 | :undoc-members:
84 | :show-inheritance:
85 |
86 | ```
87 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | # -- Project information -----------------------------------------------------
7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
8 |
9 |
10 | import datetime
11 | import sys
12 |
13 | import offsets_db_data
14 |
15 | # If extensions (or modules to document with autodoc) are in another directory,
16 | # add these directories to sys.path here. If the directory is relative to the
17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
18 | # sys.path.insert(0, os.path.abspath('.'))
19 | # sys.path.insert(os.path.abspath('..'))
20 |
21 | print('python exec:', sys.executable)
22 | print('sys.path:', sys.path)
23 |
24 |
25 | project = 'offsets-db-data'
26 | copyright = f'{datetime.datetime.now().date().year}, carbonplan'
27 | author = 'carbonplan'
28 | release = f'v{offsets_db_data.__version__}'
29 |
30 | # -- General configuration ---------------------------------------------------
31 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
32 |
33 | extensions = [
34 | 'myst_nb',
35 | # 'sphinxext.opengraph',
36 | 'sphinx_copybutton',
37 | 'sphinx_design',
38 | 'sphinx.ext.autodoc',
39 | 'sphinx.ext.viewcode',
40 | 'sphinx.ext.autosummary',
41 | 'sphinx.ext.doctest',
42 | 'sphinx.ext.intersphinx',
43 | 'sphinx.ext.extlinks',
44 | 'sphinx.ext.intersphinx',
45 | 'sphinx.ext.napoleon',
46 | 'sphinx_togglebutton',
47 | ]
48 |
49 | # MyST config
50 | myst_enable_extensions = ['amsmath', 'colon_fence', 'deflist', 'html_image']
51 | myst_url_schemes = ['http', 'https', 'mailto']
52 |
53 | # sphinx-copybutton configurations
54 | copybutton_prompt_text = r'>>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: '
55 | copybutton_prompt_is_regexp = True
56 |
57 | nb_execution_mode = 'auto'
58 | nb_execution_timeout = 600
59 | nb_execution_raise_on_error = True
60 | autosummary_generate = True
61 |
62 |
63 | templates_path = ['_templates']
64 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
65 | # Sphinx project configuration
66 | source_suffix = ['.rst', '.md']
67 |
68 |
69 | # -- Options for HTML output -------------------------------------------------
70 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
71 |
72 |
73 | html_theme = 'sphinx_book_theme'
74 |
75 |
76 | html_last_updated_fmt = '%b %d, %Y'
77 |
78 | html_title = 'offsets-db-data'
79 |
80 |
81 | html_theme_options = {
82 | 'repository_url': 'https://github.com/carbonplan/offsets-db-data',
83 | 'repository_branch': 'main',
84 | 'use_repository_button': True,
85 | 'path_to_docs': 'docs',
86 | 'use_edit_page_button': True,
87 | 'use_source_button': True,
88 | 'logo': {
89 | 'image_dark': 'monogram-light-cropped.png',
90 | 'image_light': 'monogram-dark-cropped.png',
91 | },
92 | }
93 | html_static_path = ['_static']
94 |
95 | intersphinx_mapping = {
96 | 'python': ('https://docs.python.org/3/', None),
97 | 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
98 | }
99 |
--------------------------------------------------------------------------------
/docs/data-access.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 |   text_representation:
4 |     format_name: myst
5 | kernelspec:
6 |   display_name: Python 3
7 |   name: python3
8 | ---
9 |
10 | # Access OffsetsDB Data
11 |
12 | OffsetsDB provides a detailed view of carbon offset credits and projects.
13 | You can access the data in various formats or directly through Python using our data package.
14 |
15 | ```{important}
16 | By downloading or accessing the OffsetsDB data archives, you agree to the [Terms of Data Access](TERMS-OF-DATA-ACCESS.md).
17 | ```
18 |
19 | ## CSV & Parquet Zipped Files
20 |
21 | Download the latest version of OffsetsDB in CSV:
22 |
23 | - [Download Credits & Projects](https://carbonplan-offsets-db.s3.us-west-2.amazonaws.com/production/latest/offsets-db.csv.zip)
24 |
25 | Download the latest version of OffsetsDB in [Parquet](https://parquet.apache.org/):
26 |
27 | - [Download Credits & Projects](https://carbonplan-offsets-db.s3.us-west-2.amazonaws.com/production/latest/offsets-db.parquet.zip)
28 |
29 | ## Citation
30 |
31 | Please cite OffsetsDB as:
32 |
33 | CarbonPlan (2024) “OffsetsDB” [https://carbonplan.org/research/offsets-db](https://carbonplan.org/research/offsets-db)
34 |
35 | ## Accessing The Full Data Archive Through Python
36 |
37 | For more dynamic and programmatic access to OffsetsDB, you can use our Python data package. This package allows you to load and interact with the data directly in your Python environment. With the data package, you can access the data in a variety of formats including CSV (for raw data) and Parquet (for processed data).
38 |
39 | ### Installation
40 |
41 | To get started, install the `offsets-db-data` package. Ensure you have Python installed on your system, and then run:
42 |
43 | ```bash
44 | python -m pip install offsets-db-data
45 | ```
46 |
47 | ### Using the Data Catalog
48 |
49 | Once installed, you can access the data through an Intake catalog. This catalog provides a high-level interface to the OffsetsDB datasets.
50 |
51 | #### Loading the Catalog
52 |
53 | ```{code-cell} ipython3
54 | import pandas as pd
55 | pd.options.display.max_columns = 5
56 | from offsets_db_data.data import catalog
57 |
58 | # Display the catalog
59 | print(catalog)
60 | ```
61 |
62 | #### Available Data
63 |
64 | The catalog includes different datasets, like credits and projects.
65 |
66 | #### Getting Descriptive Information About a Dataset
67 |
68 | You can get information about a dataset using the `describe()` method. For example, to get information about the 'credits' dataset:
69 |
70 | ```{code-cell} ipython3
71 | catalog['credits'].describe()
72 | ```
73 |
74 | #### Accessing Specific Datasets
75 |
76 | You can access individual datasets within the catalog. For example, to access the 'credits' dataset:
77 |
78 | ```{code-cell} ipython3
79 | # Access the 'credits' dataset
80 | credits = catalog['credits']
81 |
82 | # Read the data into a pandas DataFrame
83 | credits_df = credits.read()
84 | credits_df.head()
85 |
86 | ```
87 |
88 | Similarly, to access the 'projects' dataset:
89 |
90 | ```{code-cell} ipython3
91 | # Access the 'projects' dataset
92 | projects = catalog['projects']
93 |
94 | # Read the data into a pandas DataFrame
95 | projects_df = projects.read()
96 | projects_df.head()
97 | ```
98 |
99 | Calling `projects.read()` and `credits.read()` without specifying a date will return the data downloaded and processed on `2024-02-13`.
100 |
101 | To load data for a specific date, you can specify the date as a string in the format `YYYY-MM-DD`. For example:
102 |
103 | ```{code-cell} ipython3
104 | projects_df = catalog['projects'](date='2024-02-07').read()
105 | projects_df.head()
106 | ```
107 |
108 | ```{note}
109 | If you specify a date for which the data is not available, the package will raise a `PermissionError: Access Denied`.
110 | ```
111 |
--------------------------------------------------------------------------------
/docs/data-processing.md:
--------------------------------------------------------------------------------
1 | # Data Processing
2 |
3 | ## Order of Operations
4 |
5 | OffsetsDB follows a typical extract-transform-load (ETL) workflow.
6 | Extraction involves querying and downloading raw credit and project data hosted by offset registries.
7 | Transformation involves executing the functions contained within this repo, `offsets-db-data`.
8 | Load involves uploading the resulting data to S3 and the Postgres-backend that powers the OffsetsDB database tool.
9 |
10 | ## Downloading Raw Data
11 |
12 | We download a fresh copy of project and transaction data on a daily basis.
13 | While downloading, we make no changes to the raw data provided by the registries.
14 | We've fully automated downloading of registry data, with the exception of Gold Standard.
15 | Gold Standard's [terms and conditions](https://www.goldstandard.org/articles/terms-and-conditions) require that downloads occur through the interfaces provided by the Gold Standard site.
16 | Those interfaces, as provided, do not allow automated downloads.
17 |
18 | We have no plans to release the code that directly interacts with registries.
19 | We decided to keep this part of OffsetsDB private in an effort to limit download requests to the registries.
20 | Other, technical aspects of OffsetsDB, like the database and API that power the [database tool](https://carbonplan.org/research/offsets-db) are similarly closed.
21 | We made this decision to ensure that the OffsetsDB database tool remains performant.
22 | Critically, the production database represents an identical clone of the data generated by the code contained within `offsets-db-data`.
23 | No additional processing or inferences should occur outside the context of this repository.
24 |
25 | ## Transforming Raw Data
26 |
27 | Nearly the entirety of the code contained within `offsets-db-data` involves registry-specific logic for transforming raw registry data into a common, shared schema.
28 | The logic for transforming the data of each registry is contained within a single file, with the filename denoting which registry the transformations apply to.
29 | For example, the logic involved in transforming Verra data are contained within {py:obj}`offsets_db_data.vcs`.
30 |
31 | Each registry-specific file contains at least two functions: `process_{registry_abbreviation}_credits` and `process_{registry_abbreviation}_projects`.
32 | Those functions, in turn, call a series of additional transformation functions that produce the normalized project and credit data which combine to form OffsetsDB.
33 | These transformation functions tend to be quite small and operate on one or two properties of the raw data.
34 | To continue with the Verra example, `vcs.py` contains functions with names like {py:obj}`offsets_db_data.vcs.set_vcs_vintage_year` and {py:obj}`offsets_db_data.vcs.add_vcs_project_id`.
35 | These functions contain the registry-specific logic needed to map Verra's raw data to the common data schema of OffsetsDB.
36 |
37 | ### An Example
38 |
39 | In practice, replicating the behavior of OffsetsDB should be simple.
40 | Here's an example of using `offsets_db_data` to transform the raw transaction data from Verra into a normalized, analysis-ready file:
41 |
42 | ```python
43 | import pandas as pd
44 | pd.options.display.max_columns = 5
45 | from offsets_db_data import vcs
46 |
47 | archive_fname = {{ path to local copy of Verra transaction data }}
48 | raw_credits = pd.read_csv(archive_fname)
49 | processed_credits = vcs.process_vcs_credits(raw_credits)
50 | ```
51 |
52 | ```{note}
53 | Running the above example requires first downloading [a copy of Verra's transaction data](https://registry.verra.org/app/search/VCS) and changing the above code to reference the location of that data on your local machine.
54 | ```
55 |
56 | Invoking single transformation functions, like {py:obj}`offsets_db_data.vcs.set_vcs_vintage_year`, is even more straightforward.
57 | Let's say you want to understand more about how OffsetsDB assigns Verra credits a vintage year.
58 | You can explore the behavior of this single transformation function by calling:
59 |
60 | ```python
61 | raw_credits.set_vcs_vintage_year(date_column='Vintage End').head()
62 | ```
63 |
64 | It's worth noting that we've wrapped all transformation functions using the [`pandas_flavor.register_dataframe_method`](https://github.com/pyjanitor-devs/pandas_flavor) decorator.
65 | That means that after importing a registry module from `offsets_db_data`, the transformation functions of that module are directly callable by any Pandas dataframe.
66 |
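To illustrate the mechanism with a toy function (not part of `offsets_db_data`), registering a function via `pandas_flavor` makes it callable as a DataFrame method:

```python
import pandas as pd
import pandas_flavor as pf


@pf.register_dataframe_method
def add_one(df: pd.DataFrame, column: str) -> pd.DataFrame:
    # Once registered, this function is callable as df.add_one(column=...)
    return df.assign(**{column: df[column] + 1})


pd.DataFrame({'x': [1, 2]}).add_one(column='x')
```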
67 | ## Initial Column Mapping
68 |
69 | The initial and perhaps most mundane transformation of OffsetsDB involves mapping properties in the raw data to a common schema.
70 | This step requires constructing a map from the names of properties as they appear in the raw data to the corresponding property in OffsetsDB.
71 | For example, the Climate Action Reserve data refers to the property, `project_id`, as `Project ID`.
72 | The ART registry, however, refers to the same property as `Program ID`.
73 |
74 | These column mapping files are stored in [`offsets_db_data/configs`](https://github.com/carbonplan/offsets-db-data/tree/main/offsets_db_data/configs).
75 | There is a separate mapping file for `projects` data and `credits` data.
76 | Some properties either aren't included in the raw data or inferring their value requires special processing.
77 | In these cases, a `null` value is recorded in the column mapping files and the property is populated using registry-specific logic.
78 |
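As a simplified sketch (the dictionary and column names below are illustrative, not the actual contents of the config files), applying one of these mappings amounts to renaming raw registry columns to their OffsetsDB names:

```python
import pandas as pd

# Illustrative excerpt of a raw-column-to-OffsetsDB mapping; the real files in
# offsets_db_data/configs/ also record null for properties that need
# registry-specific logic.
column_mapping = {'Project ID': 'project_id', 'Project Name': 'name'}

raw_projects = pd.DataFrame({'Project ID': ['CAR1234'], 'Project Name': ['Example project']})
projects = raw_projects.rename(columns=column_mapping)
```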
79 | ## Protocol Mapping
80 |
81 | Offset projects are developed by following a set of rules, known as a protocol.
82 | These rules specify things like when measurements must be made and what paperwork must be submitted in order for a project to receive credits.
83 | Unfortunately, there is no standardized way of referring to the exact protocol (or protocol version) used to develop an offset project.
84 | Even within the domain of a single registry, references to the exact protocol used to develop a project are often inconsistent.
85 | Take for example the Clean Development Mechanism protocol AMS-III.D., "Methane recovery in animal manure management systems".
86 | Across all five registries included in OffsetsDB, we identified twenty-two unique ways of referring to this one protocol.
87 |
88 | OffsetsDB addresses this problem by manually assigning every known protocol string to a common schema.
89 | Continuing with the AMS-III.D. example, we map all twenty-two "known strings" that describe the same protocol to a single, unified reference, `ams-iii-d`.
90 | We repeat this manual unification of dissimilar strings for all protocols across all registries.
91 | The results of the mapping are contained within [`offsets-db-data/configs/all-protocol-mapping.json`](https://github.com/carbonplan/offsets-db-data/blob/main/offsets_db_data/configs/all-protocol-mapping.json).
92 |
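Conceptually, the mapping acts as a lookup from every known protocol string to a unified slug. The excerpt below is illustrative only; see the config file for the actual contents:

```python
# Illustrative only: multiple raw strings describing the same protocol all
# map to the unified reference 'ams-iii-d'.
protocol_mapping = {
    'AMS-III.D.': 'ams-iii-d',
    'AMS-III.D. ver. 19.0': 'ams-iii-d',
    'Methane recovery in animal manure management systems': 'ams-iii-d',
}

unified = protocol_mapping.get('AMS-III.D. ver. 19.0')  # 'ams-iii-d'
```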
93 | ## Project Type & Categorization
94 |
95 | In addition to unifying protocol mapping, we also assign two levels of classification to projects: `category` and `type`.
96 | Categories represent broad classes of offset approaches, while types provide more specific information about the mitigation strategy.
97 |
98 | ### Category Assignment
99 |
100 | Projects are assigned to one of the following broad categories:
101 |
102 | - agriculture: offsets derived from changes in the management of agricultural systems, including livestock
103 | - forest: offsets derived from the management of forests
104 | - ghg-management: offsets derived from the destruction or elimination (e.g., substitution) of greenhouse gases
105 | - renewable-energy: offsets derived from expanding renewable energy capacity
106 | - energy-efficiency: offsets derived from decreasing the amount of energy required to complete a task
107 | - fuel-switching: offsets derived from generating energy using a fuel source that produces fewer greenhouse gasses
108 | - carbon-capture: offsets derived from technologies that capture and store carbon
109 | - land-use: offsets derived from land management changes outside of forests
110 | - biochar: offsets derived from biochar production and application
111 |
112 | Category assignment is primarily determined by project type through the mapping defined in [`offsets-db-data/configs/type-category-mapping.json`](https://github.com/carbonplan/offsets-db-data/blob/main/offsets_db_data/configs/type-category-mapping.json).
113 | This mapping connects specific project types (like "improved forest management" or "cookstoves") to their appropriate category.
114 |
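A minimal sketch of how such a lookup can be applied (the dictionary below is an illustrative excerpt, not the full mapping):

```python
import pandas as pd

# Illustrative type-to-category lookup, simplified from type-category-mapping.json.
type_to_category = {
    'improved forest management': 'forest',
    'afforestation/reforestation': 'forest',
}

projects = pd.DataFrame({'project_type': ['improved forest management', 'afforestation/reforestation']})
projects['category'] = projects['project_type'].map(type_to_category)
```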
115 | ### Project Type Assignment
116 |
117 | Project types represent more specific offset approaches.
118 | For example, within the category of "forest", projects might be classified as "improved forest management", "afforestation/reforestation", or "avoided forest conversion".
119 |
120 | Project types are determined through a multi-step process:
121 |
122 | 1. First, we attempt to infer the project type from protocol information (via {py:obj}`offsets_db_data.projects.infer_project_type`).
123 | 2. We apply manual overrides from curated data sources (via {py:obj}`offsets_db_data.projects.override_project_types`).
124 | Currently, the [Berkeley Carbon Trading Project](https://gspp.berkeley.edu/research-and-impact/centers/cepp/projects/berkeley-carbon-trading-project) data in [`offsets-db-data/configs/berkeley-project-types.json`](https://github.com/carbonplan/offsets-db-data/blob/main/offsets_db_data/configs/berkeley-project-types.json) serves as the authoritative source for project types.
125 | The project types from the Berkeley Carbon Trading Project's Voluntary Registry Offsets Database are licensed under a [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.
126 |
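A minimal sketch of that precedence, using hypothetical variable names (the actual logic lives in {py:obj}`offsets_db_data.projects`):

```python
import pandas as pd

# Hypothetical inputs: protocol-inferred types plus curated Berkeley overrides
# keyed by project id.
projects = pd.DataFrame({'project_id': ['VCS1', 'VCS2'], 'project_type': ['cookstoves', None]})
berkeley_types = pd.Series({'VCS2': 'improved forest management'})

# Prefer the curated override where one exists; otherwise keep the inferred type.
override = projects['project_id'].map(berkeley_types)
projects['project_type'] = override.combine_first(projects['project_type'])
```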
127 | ## Retirement User Harmonization
128 |
129 | Carbon offset credits are often retired on behalf of a specific entity or organization.
130 | However, the names of these retirement users are recorded inconsistently across registry data, making it difficult to analyze retirement patterns.
131 | The following section describes our approach for identifying and harmonizing information about the end-users ("retirement users") of specific offset credits.
132 |
133 | ### Harmonization Process
134 |
135 | The harmonization process attempts to identify specific "retirement users" from publicly disclosed retirement beneficiary information.
136 | We try to standardize retirement user information across registries using the following steps:
137 |
138 | 1. **Data merging**: we combine information from four sources into a single _temporary_ field:
139 |
140 |    - `retirement_beneficiary`: note specifically designating the entity claiming a credit’s environmental benefits
141 |    - `retirement_account`: name on account from which credits were retired
142 |    - `retirement_note`: short-form text accompanying credit retirement
143 |    - `retirement_reason`: short form note specifying why credits were retired (e.g., compliance purposes). Sometimes similar to a retirement note
144 |
145 |    We refer to these fields as "retirement beneficiary data."
146 |    Any one of these fields might contain information useful for relating a transaction to a retirement user.
147 |
148 | 2. **Standardization via OpenRefine**: we process this merged information through [OpenRefine](https://openrefine.org/) using a detailed set of transformation rules defined in [`offsets-db-data/configs/beneficiary-mappings.json`](https://github.com/carbonplan/offsets-db-data/blob/main/offsets_db_data/configs/beneficiary-mappings.json). This includes:
149 |    - text transformations that standardize common company names and entities
150 |    - pattern matching to identify the same entities despite different formatting
151 |
152 | Only confident matches are included in the harmonized beneficiary field, `retirement_beneficiary_harmonized`.
153 |
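A minimal sketch of the merging step, assuming the four columns are present on a credits DataFrame (the temporary column name and separator are illustrative):

```python
beneficiary_columns = [
    'retirement_beneficiary',
    'retirement_account',
    'retirement_note',
    'retirement_reason',
]

# Combine the available retirement beneficiary data into a single temporary
# field that is later passed to OpenRefine for standardization.
credits_df['merged_beneficiary_info'] = (
    credits_df[beneficiary_columns].fillna('').agg(' | '.join, axis=1)
)
```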
154 | The retirement user harmonization process runs daily, along with the rest of OffsetsDB.
155 | However, the underlying standardization rules (implemented via OpenRefine) are only irregularly updated.
156 | This means there might be new retirement data that _could_ be mapped to a known entity but, because that mapping has not previously been described, the relationship is not reflected in OffsetsDB.
157 | To account for this, all searches via the database tool return matches across _all available_ retirement beneficiary fields: `retirement_beneficiary`, `retirement_account`, `retirement_note`, `retirement_reason`, _and_ `retirement_beneficiary_harmonized`.
158 | Thus, searching for known retirement users, like `Delta`, will return all records that contain the substring `delta` anywhere within their retirement beneficiary data.
159 | Users should carefully examine these unmapped transactions to determine whether or not they are relevant to their specific search.
160 |
161 | ### Implementation Details
162 |
163 | Retirement user harmonization is implemented in the function {py:obj}`offsets_db_data.credits.harmonize_beneficiary_data`.
164 | This function runs a temporary OpenRefine project using the `offsets-db-data-orcli` command-line tool (a wrapper around [`orcli`](https://github.com/opencultureconsulting/orcli), a command-line interface for OpenRefine) to apply the transformations defined in our mapping file.
165 | The result is a new column, `retirement_beneficiary_harmonized`, that contains the standardized user names.
166 |
167 | ### Examples of Standardization
168 |
169 | Our harmonization process unifies many common variations:
170 |
171 | - "Delta Air Lines", "Delta Airlines" → "Delta Airlines"
172 | - "Terpel", "Organizacion Terpel", "Terpel S.A." → "Terpel"
173 | - "Retired on behalf of Sydney Opera House" → "Sydney Opera House"
174 |
175 | ### Why This Matters
176 |
177 | Without harmonization, the same entity might appear under multiple names, making it difficult to accurately analyze which entities are retiring the most credits.
178 | This harmonization allows for more accurate aggregation of retirement data by user.
179 |
180 | ```{note}
181 | The harmonization process can be toggled on or off via the `harmonize_beneficiary_info` parameter of the `process_{registry_abbreviation}_credits` functions.
182 | ```
183 |
184 | ## Registry Specific Transformations
185 |
186 | Some transformations involved in producing OffsetsDB require special knowledge or assumptions about the underlying data.
187 | This section highlights special cases.
188 |
189 | ```{note}
190 | For additional context, consult specific function docstrings in the [API reference](api.md) or [reach out on GitHub](https://github.com/carbonplan/offsets-db-data/issues) if something doesn't make sense.
191 | ```
192 |
193 | ### American Carbon Registry
194 |
195 | Project status: When processing ACR projects, we combine two status properties present in the raw data: `Compliance Program Status (ARB or Ecology)` and `Voluntary Status`.
196 | For compliance projects, we report compliance program status.
197 | For voluntary projects, we report voluntary status.
198 |
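A minimal sketch of that combination, assuming a boolean flag already distinguishes compliance from voluntary projects (the flag name is hypothetical; the status column names follow the raw ACR data described above):

```python
import numpy as np

# Report compliance program status for compliance projects and voluntary
# status for everything else.
acr_projects['status'] = np.where(
    acr_projects['is_compliance'],
    acr_projects['Compliance Program Status (ARB or Ecology)'],
    acr_projects['Voluntary Status'],
)
```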
199 | ### Gold Standard
200 |
201 | #### Planned Emission Reductions
202 |
203 | Some Gold Standard protocols allow for the issuance of "[planned emission reductions](https://goldstandardhelp.freshdesk.com/support/solutions/articles/44001989672-what-is-a-planned-emission-reduction-per-)" (PERs).
204 | These credits represent anticipated climate benefits that are expected to occur in the future.
205 | PERs are issued and can be traded, but cannot be retired.
206 | OffsetsDB includes all issued PERs when reporting vintage and project level credit totals.
207 |
208 | ### Verra
209 |
210 | #### Issuance
211 |
212 | Verra allows "rolling" credit issuance. This allows projects to complete the paperwork and verification processes for credit issuance, but delay the actual issuance event.
213 | This results in ambiguities around the precise timing of credit issuance events, as credits that are eligible to be issued but have not yet been issued, are not publicly reported in the Verra crediting data.
214 | We handle this ambiguity by assuming that the first crediting event, be it an issuance, retirement, or cancellation, on a per-project, per-vintage basis results in issuance of 100 percent of credits eligible to be issued for that project-vintage.
215 |
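A minimal sketch of that assumption, using hypothetical column names: the earliest crediting event observed for each project-vintage pair is treated as the moment the full eligible quantity was issued.

```python
# For each project and vintage, find the date of the first crediting event
# (issuance, retirement, or cancellation) in the raw Verra data.
first_event_date = raw_credits.groupby(['project_id', 'vintage'])['transaction_date'].transform('min')
```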
216 | #### Retirements vs. Cancellations
217 |
218 | Verra's publicly available data does not distinguish between retirement events and cancellation events.
219 | We report all Verra retirements and cancellations as `retirement`.
220 | We originally contemplated tagging every Verra retirement or cancellation as `retirement/cancellation`.
221 | This made our processed Verra data slightly incompatible with data from other registries.
222 | Simple queries, like "give me all the retirements", suddenly required writing code that looks like this:
223 |
224 | ```python
225 | credits[(credits['transaction_type'] == 'retirement') | (credits['transaction_type'] == 'retirement/cancellation')]
226 | ```
227 |
228 | ```{warning}
229 | Because we know the majority of Verra `retirement/cancellation` events are in fact `retirement`, we opted for this more ergonomic representation of the data.
230 | Any analysis involving Verra retirement data should clearly specify that Verra's raw data does not currently distinguish between retirement and cancellation events.
231 | ```
232 |
233 | Vintage Date: Verra allows for the simultaneous issuance of multiple vintages.
234 | We assign all credits from these multi-vintage issuances to the earliest reported vintage year.
235 |
236 | ### California Compliance Projects
237 |
238 | We treat the California Air Resources Board's [issuance table](https://ww2.arb.ca.gov/resources/documents/arb-offset-credit-issuance-table) as the source of truth for all credits issued and retired by any project developed under an ARB-approved protocol.
239 | When a project appears in the issuance table, we drop all crediting data reported from the project's host registry and _only_ report the issuance and retirement values contained within the ARB issuance table.
240 | This methodological decision introduces a small error when it comes to "Early Action" projects.
241 | These projects were developed during the very first phase of California's offsets program, which had slightly different rules.
242 | After the early action phase, some projects transitioned into the full compliance program, while others did not.
243 | Fully accounting for how these early projects retired credits, as reported by both CARB's issuance table and the registries, likely requires more careful treatment.
244 |
245 | Retirement Dates: Offsets retired for compliance with California's cap-and-trade program occur on fixed dates that correspond with the program's reporting deadlines.
246 | These deadlines come in two forms: partial (annual) and full (triennial) compliance events.
247 | For simplicity, the current version of OffsetsDB uses the date of the full (triennial) compliance event as the retirement date for all compliance offsets.
248 | This means some retirement dates go unrecorded.
249 | Specifically, the retirement dates of compliance credits retired for _voluntary_ purposes (i.e., not to satisfy requirements under AB32) and of credits retired in linked markets (e.g., Quebec) are unknown and reported as `NaT`.
250 |
251 | ## Other Normalizations
252 |
253 | ### Country
254 |
255 | We use the Python package [country_converter](https://github.com/IndEcol/country_converter) to harmonize country names.
256 |
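For example, converting a few raw country labels to standardized short names might look like this (the output shown in the comment is indicative):

```python
import country_converter as coco

coco.convert(names=['USA', 'Viet Nam', 'Republic of Korea'], to='name_short')
# e.g. ['United States', 'Vietnam', 'South Korea']
```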
257 | ### Project Status
258 |
259 | OffsetsDB unifies and simplifies project status information reported by the registries.
260 | OffsetsDB uses the following status codes: `listed`, `registered`, and `completed`.
261 | Listed typically refers to the stage during which a project has been formally announced and is undergoing development, but has yet to receive credits.
262 | Registered refers to projects that have received credits and are eligible to receive additional credits in the future.
263 | Completed means a project previously received credits and is not currently able to receive additional credits in the future.
264 | Many registries have far more detailed project status information, often reflecting the specific stages of the registry's validation and verification process.
265 | Future work might focus on normalizing these additional project states across the registries.
266 |
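A minimal sketch of this simplification, using an illustrative (not exhaustive) mapping from registry-specific statuses to the three OffsetsDB codes:

```python
import pandas as pd

# Illustrative status mapping; the registry-specific logic is more detailed.
status_mapping = {
    'Under development': 'listed',
    'Registered': 'registered',
    'Completed': 'completed',
}

projects = pd.DataFrame({'raw_status': ['Registered', 'Under development']})
projects['status'] = projects['raw_status'].map(status_mapping)
```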
267 | ## Schema
268 |
269 | ### Projects
270 |
271 | Project data conform to the following schema:
272 |
273 | ```json
274 | {
275 | "properties": {
276 | "category": {
277 | "anyOf": [
278 | {
279 | "type": "string"
280 | },
281 | {
282 | "type": "null"
283 | }
284 | ],
285 | "description": "Category of the project",
286 | "title": "Category"
287 | },
288 | "country": {
289 | "anyOf": [
290 | {
291 | "type": "string"
292 | },
293 | {
294 | "type": "null"
295 | }
296 | ],
297 | "title": "Country"
298 | },
299 | "first_issuance_at": {
300 | "anyOf": [
301 | {
302 | "format": "date",
303 | "type": "string"
304 | },
305 | {
306 | "type": "null"
307 | }
308 | ],
309 | "description": "Date of first issuance of credits",
310 | "title": "First Issuance At"
311 | },
312 | "first_retirement_at": {
313 | "anyOf": [
314 | {
315 | "format": "date",
316 | "type": "string"
317 | },
318 | {
319 | "type": "null"
320 | }
321 | ],
322 | "description": "Date of first retirement of credits",
323 | "title": "First Retirement At"
324 | },
325 | "is_compliance": {
326 | "anyOf": [
327 | {
328 | "type": "boolean"
329 | },
330 | {
331 | "type": "null"
332 | }
333 | ],
334 | "description": "Whether project is compliance project",
335 | "title": "Is Compliance"
336 | },
337 | "issued": {
338 | "anyOf": [
339 | {
340 | "type": "integer"
341 | },
342 | {
343 | "type": "null"
344 | }
345 | ],
346 | "default": 0,
347 | "description": "Total of issued credits",
348 | "title": "Issued"
349 | },
350 | "listed_at": {
351 | "anyOf": [
352 | {
353 | "format": "date",
354 | "type": "string"
355 | },
356 | {
357 | "type": "null"
358 | }
359 | ],
360 | "description": "Date project was listed",
361 | "title": "Listed At"
362 | },
363 | "name": {
364 | "anyOf": [
365 | {
366 | "type": "string"
367 | },
368 | {
369 | "type": "null"
370 | }
371 | ],
372 | "description": "Name of the project",
373 | "title": "Name"
374 | },
375 | "project_id": {
376 | "description": "Project id used by registry system",
377 | "title": "Project Id",
378 | "type": "string"
379 | },
380 | "project_url": {
381 | "anyOf": [
382 | {
383 | "type": "string"
384 | },
385 | {
386 | "type": "null"
387 | }
388 | ],
389 | "description": "URL to project details",
390 | "title": "Project Url"
391 | },
392 | "proponent": {
393 | "anyOf": [
394 | {
395 | "type": "string"
396 | },
397 | {
398 | "type": "null"
399 | }
400 | ],
401 | "title": "Proponent"
402 | },
403 | "protocol": {
404 | "anyOf": [
405 | {
406 | "items": {
407 | "type": "string"
408 | },
409 | "type": "array"
410 | },
411 | {
412 | "type": "null"
413 | }
414 | ],
415 | "default": null,
416 | "description": "List of protocols",
417 | "title": "Protocol"
418 | },
419 | "registry": {
420 | "description": "Name of the registry",
421 | "title": "Registry",
422 | "type": "string"
423 | },
424 | "retired": {
425 | "anyOf": [
426 | {
427 | "type": "integer"
428 | },
429 | {
430 | "type": "null"
431 | }
432 | ],
433 | "default": 0,
434 | "description": "Total of retired credits",
435 | "title": "Retired"
436 | },
437 | "status": {
438 | "anyOf": [
439 | {
440 | "type": "string"
441 | },
442 | {
443 | "type": "null"
444 | }
445 | ],
446 | "title": "Status"
447 | },
448 | "project_type": {
449 | "anyOf": [
450 | {
451 | "type": "string"
452 | },
453 | {
454 | "type": "null"
455 | }
456 | ],
457 | "description": "Type of project",
458 | "title": "Project Type"
459 | },
460 | "project_type_source": {
461 | "anyOf": [
462 | {
463 | "type": "string"
464 | },
465 | {
466 | "type": "null"
467 | }
468 | ],
469 | "description": "Source of project type information",
470 | "title": "Project Type Source"
471 | }
472 | },
473 | "required": [
474 | "project_id",
475 | "name",
476 | "registry",
477 | "proponent",
478 | "category",
479 | "status",
480 | "country",
481 | "listed_at",
482 | "is_compliance",
483 | "first_issuance_at",
484 | "first_retirement_at",
485 | "project_url",
486 | "project_type",
487 | "project_type_source"
488 | ],
489 | "title": "Project",
490 | "type": "object"
491 | }
492 | ```
493 |
494 | The majority of project attributes are directly taken from the project data downloaded from each registry.
495 | Table 1 provides the mapping from the raw column names found in downloaded registry data to the OffsetsDB project schema.
496 |
497 | | | **verra** | **climate-action-reserve** | **american-carbon-registry** | **gold-standard** | **art-trees** |
498 | | ---------------- | ------------------------------ | ------------------------------ | -------------------------------------- | ------------------------------ | ------------------------------ |
499 | | **project_id** | ID | Project ID | Project ID | id | Program ID |
500 | | **name** | Name | Project Name | Project Name | name | Program Name |
501 | | **protocol** | Methodology | Project Type | Project Methodology/Protocol | methodology | \- |
502 | | **category** | inferred from protocol or type | inferred from protocol or type | inferred from protocol | inferred from protocol or type | inferred from protocol or type |
503 | | **project_type** | manually assigned | manually assigned | manually assigned | manually assigned | manually assigned |
504 | | **proponent** | Proponent | Project Owner | Project Developer | project_developer | Sovereign Program Developer |
505 | | **country** | Country/Area | Project Site Country | Project Site Country | country | Program Country |
506 | | **status** | Status | Status | Derived: voluntary + compliance status | status | Status |
507 | | **listed_at** | Project Listed Date | \- | \- | \- | \- |
508 |
509 | ### Credits
510 |
511 | Credit data conform to the following schema:
512 |
513 | ```json
514 | {
515 | "properties": {
516 | "id": {
517 | "default": null,
518 | "title": "Id",
519 | "type": "integer"
520 | },
521 | "project_id": {
522 | "anyOf": [
523 | {
524 | "type": "string"
525 | },
526 | {
527 | "type": "null"
528 | }
529 | ],
530 | "description": "Project id used by registry system",
531 | "title": "Project Id"
532 | },
533 | "quantity": {
534 |       "description": "Tons of carbon dioxide equivalent (each ton is one carbon credit)",
535 | "title": "Quantity",
536 | "type": "integer"
537 | },
538 | "retirement_account": {
539 | "anyOf": [
540 | {
541 | "type": "string"
542 | },
543 | {
544 | "type": "null"
545 | }
546 | ],
547 | "description": "Name on account from which credits were retired",
548 | "title": "Retirement Account"
549 | },
550 | "retirement_beneficiary": {
551 | "anyOf": [
552 | {
553 | "type": "string"
554 | },
555 | {
556 | "type": "null"
557 | }
558 | ],
559 | "description": "Note specifically designating the entity claiming a credit’s environmental benefits",
560 | "title": "Retirement Beneficiary"
561 | },
562 | "retirement_beneficiary_harmonized": {
563 | "anyOf": [
564 | {
565 | "type": "string"
566 | },
567 | {
568 | "type": "null"
569 | }
570 | ],
571 | "description": "Harmonized beneficiary of credits",
572 | "title": "Retirement Beneficiary Harmonized"
573 | },
574 | "retirement_note": {
575 | "anyOf": [
576 | {
577 | "type": "string"
578 | },
579 | {
580 | "type": "null"
581 | }
582 | ],
583 | "description": "Short-form text accompanying credit retirement",
584 | "title": "Retirement Note"
585 | },
586 | "retirement_reason": {
587 | "anyOf": [
588 | {
589 | "type": "string"
590 | },
591 | {
592 | "type": "null"
593 | }
594 | ],
595 | "description": "Short form note specifying why credits were retired (e.g., compliance purposes). Sometimes similar to a retirement note. ",
596 | "title": "Retirement Reason"
597 | },
598 | "transaction_date": {
599 | "anyOf": [
600 | {
601 | "format": "date",
602 | "type": "string"
603 | },
604 | {
605 | "type": "null"
606 | }
607 | ],
608 | "description": "Date of transaction",
609 | "title": "Transaction Date"
610 | },
611 | "transaction_type": {
612 | "anyOf": [
613 | {
614 | "type": "string"
615 | },
616 | {
617 | "type": "null"
618 | }
619 | ],
620 | "description": "Type of transaction",
621 | "title": "Transaction Type"
622 | },
623 | "vintage": {
624 | "anyOf": [
625 | {
626 | "type": "integer"
627 | },
628 | {
629 | "type": "null"
630 | }
631 | ],
632 | "description": "Year when carbon avoidance/removal occurred",
633 | "title": "Vintage"
634 | }
635 | },
636 | "required": [
637 | "quantity",
638 | "vintage",
639 | "transaction_date",
640 | "transaction_type",
641 | "retirement_account",
642 | "retirement_beneficiary",
643 | "retirement_reason",
644 | "retirement_note",
645 | "retirement_beneficiary_harmonized",
646 | "project_id"
647 | ],
648 | "title": "Credit",
649 | "type": "object"
650 | }
651 | ```
652 |
653 | The majority of credit attributes are taken directly from the credit data downloaded from each registry.
654 | The raw attribute names of crediting data tend to vary depending on the transaction type.
655 | For example, ART TREES refers to retirement dates as `Status Effective`, while it reports issuances as `Date Approved`.
656 | Rather than produce a table of each of these mappings here, please refer to [credits-raw-columns-mapping.json](https://github.com/carbonplan/offsets-db-data/blob/main/offsets_db_data/configs/credits-raw-columns-mapping.json).
657 |
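For a programmatic view of these mappings, the bundled config can be loaded with the `load_column_mapping` helper from `offsets_db_data.common`. A minimal sketch (the values shown are illustrative):

```python
from offsets_db_data.common import CREDIT_SCHEMA_UPATH, load_column_mapping

# raw column names used in ART TREES retirement downloads, keyed by OffsetsDB field
mapping = load_column_mapping(
    registry_name='art-trees',
    download_type='retirements',
    mapping_path=CREDIT_SCHEMA_UPATH,
)
print(mapping['transaction_date'])  # e.g. 'Status Effective'
print(mapping['quantity'])  # e.g. 'Quantity of Credits'
```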
--------------------------------------------------------------------------------
/docs/glossary.md:
--------------------------------------------------------------------------------
1 | # Glossary
2 |
3 | ## Terms that apply to offset projects
4 |
5 | - **Protocol**:
6 | The rules used to quantify the number of offset credits awarded to a project.
7 | Protocols outline requirements (e.g., deadlines, verification) and disclosures that projects must satisfy to receive credits.
8 | Protocols are enforced by registries.
9 |
10 | - **Registry**:
11 | An organization (often a non-profit) responsible for creating the rules that govern offset projects and the generation of offset credits.
12 | Registries maintain public records of all credits that have been issued, which ensures that credits are not used more than once ("double counting").
13 | These public records serve as the raw data behind OffsetsDB.
14 |
15 | ## Terms that apply to offset credits
16 |
17 | - **Carbon Credit / Offset**:
18 | A tradable certificate representing the climate benefit of removing or avoiding the emission of greenhouse gases.
19 | Individual credits are typically denominated in terms of one tonne of carbon dioxide equivalent (tCO₂e).
20 |   The terms "carbon credit," "carbon offset," and "offset credit" are often used interchangeably.
21 |
22 | - **Credit Cancellation**:
23 | Invalidation of an offset credit, often for administrative purposes.
24 | For example, if an error in paperwork is determined to have resulted in the issuance of 1,000 extra credits, a registry would use cancellation to correct the error.
25 |
26 | - **Credit Issuance**:
27 | The creation of new offset credits, which can subsequently be sold, traded, and used to make environmental claims.
28 | Credits are issued by registries to projects.
29 | Issuance occurs only after a project has satisfied all the rules laid out in the specific offset protocol used by the project.
30 |
31 | - **Credit Retirement**:
32 | The use of a carbon offset to make an offsetting claim.
33 | Retirement occurs when the current owner of an offset credit requests that a registry formally retire the credit.
34 | Retiring a credit means no other entity can take credit for the environmental benefit embodied by the retired credit.
35 |
36 | ## More resources
37 |
38 | Carbon Brief has a [nice glossary](https://interactive.carbonbrief.org/carbon-offsets-2023/glossary.html) of carbon market terminology.
39 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | ```{rubric} Offsets-DB Data
4 |
5 | ```
6 |
7 | Welcome to OffsetsDB! This documentation provides an overview of offsets-db-data, a Python package with utilities for cleaning and processing data for the [OffsetsDB web tool](https://carbonplan.org/research/offsets-db/).
8 |
9 | ```{button-ref} install-offsets-db-data
10 | :ref-type: doc
11 | :color: primary
12 | :class: sd-rounded-pill
13 |
14 | Get Started
15 | ```
16 |
20 |
21 | ---
22 |
23 | ## Get in touch
24 |
25 | - If you encounter any errors or problems with **offsets-db-data**, please open an issue at the GitHub [main repository](https://github.com/carbonplan/offsets-db-data/issues).
26 | - If you have a question like “How do I find x?”, ask on [GitHub discussions](https://github.com/carbonplan/offsets-db-data/discussions). Please include a self-contained reproducible example if possible.
27 |
28 | ---
29 |
30 | ```{toctree}
31 | ---
32 | maxdepth: 1
33 | caption: How to guides and examples
34 | hidden:
35 | ---
36 | install-offsets-db-data.md
37 | data-access.md
38 | ```
39 |
40 | ```{toctree}
41 | ---
42 | maxdepth: 2
43 | caption: Reference
44 | hidden:
45 | ---
46 |
47 | data-processing.md
48 | api.md
49 | glossary.md
50 | TERMS-OF-DATA-ACCESS.md
51 | ```
52 |
53 | ```{toctree}
54 | ---
55 | maxdepth: 2
56 | caption: Project links
57 | hidden:
58 | ---
59 |
60 |
61 | GitHub Repo
62 | GitHub discussions
63 | Database Web Tool
64 | Methods
65 | Explainer
66 |
67 | ```
68 |
--------------------------------------------------------------------------------
/docs/install-offsets-db-data.md:
--------------------------------------------------------------------------------
1 | # Install offsets-db-data
2 |
3 | The offsets-db-data Python package can be installed in two ways:
4 |
5 | ```{eval-rst}
6 |
7 | .. tab-set::
8 |
9 | .. tab-item:: pip
10 |
11 | Using the `pip `__ package manager:
12 |
13 | .. code:: bash
14 |
15 | $ python -m pip install offsets-db-data
16 |
17 |
18 | .. tab-item:: Development version
19 |
20 | To install a development version from source:
21 |
22 | .. code:: bash
23 |
24 | $ git clone https://github.com/carbonplan/offsets-db-data
25 | $ cd offsets-db-data
26 | $ python -m pip install -e .
27 | ```
28 |
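To confirm the installation worked, import the package and print its version (the exact version string will differ on your machine):

```python
import offsets_db_data

print(offsets_db_data.__version__)
```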
--------------------------------------------------------------------------------
/offsets_db_data/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from ._version import __version__
3 |
--------------------------------------------------------------------------------
/offsets_db_data/apx.py:
--------------------------------------------------------------------------------
1 | import numpy as np # noqa: F401
2 | import pandas as pd
3 | import pandas_flavor as pf
4 |
5 | from offsets_db_data.common import (
6 | BERKELEY_PROJECT_TYPE_UPATH,
7 | CREDIT_SCHEMA_UPATH,
8 | PROJECT_SCHEMA_UPATH,
9 | load_column_mapping,
10 | load_inverted_protocol_mapping,
11 | load_registry_project_column_mapping,
12 | load_type_category_mapping,
13 | )
14 | from offsets_db_data.credits import * # noqa: F403
15 | from offsets_db_data.credits import harmonize_beneficiary_data
16 | from offsets_db_data.models import credit_without_id_schema, project_schema
17 | from offsets_db_data.projects import * # noqa: F403
18 |
19 |
20 | @pf.register_dataframe_method
21 | def determine_transaction_type(df: pd.DataFrame, *, download_type: str) -> pd.DataFrame:
22 | """
23 | Assign a transaction type to each record in the DataFrame based on the download type.
24 |
25 | Parameters
26 | ----------
27 | df : pd.DataFrame
28 | Input DataFrame containing transaction data.
29 | download_type : str
30 | Type of transaction ('issuances', 'retirements', 'cancellations') to determine the transaction type.
31 |
32 | Returns
33 | -------
34 | pd.DataFrame
35 | DataFrame with a new 'transaction_type' column, containing assigned transaction types based on download_type.
36 | """
37 |
38 | transaction_type_mapping = {
39 | 'issuances': 'issuance',
40 | 'retirements': 'retirement',
41 | 'cancellations': 'cancellation',
42 | }
43 | df['transaction_type'] = transaction_type_mapping[download_type]
44 | return df
45 |
46 |
47 | @pf.register_dataframe_method
48 | def process_apx_credits(
49 | df: pd.DataFrame,
50 | *,
51 | download_type: str,
52 | registry_name: str,
53 | arb: pd.DataFrame | None = None,
54 | harmonize_beneficiary_info: bool = False,
55 | ) -> pd.DataFrame:
56 | """
57 | Process APX credits data by setting registry, determining transaction types, renaming columns,
58 | converting date columns, aggregating issuances (if applicable), and validating the schema.
59 |
60 | Parameters
61 | ----------
62 | df : pd.DataFrame
63 | Input DataFrame with raw APX credits data.
64 | download_type : str
65 | Type of download ('issuances', 'retirements', etc.).
66 | registry_name : str
67 | Name of the registry for setting and mapping columns.
68 | arb : pd.DataFrame | None, optional
69 | Additional DataFrame for data merging (default is None).
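    harmonize_beneficiary_info : bool, optional
        Whether to harmonize retirement beneficiary information via OpenRefine (default is False).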
70 |
71 | Returns
72 | -------
73 | pd.DataFrame
74 | Processed DataFrame with APX credits data.
75 | """
76 |
77 | df = df.copy()
78 |
79 | column_mapping = load_column_mapping(
80 | registry_name=registry_name, download_type=download_type, mapping_path=CREDIT_SCHEMA_UPATH
81 | )
82 |
83 | columns = {v: k for k, v in column_mapping.items()}
84 |
85 | data = (
86 | df.set_registry(registry_name=registry_name)
87 | .determine_transaction_type(download_type=download_type)
88 | .rename(columns=columns)
89 | )
90 |
91 |     # split the date and time, keeping only the date; this helps with inconsistencies in the date format
92 | data['transaction_date'] = data['transaction_date'].str.split().str[0]
93 |
94 | data = data.convert_to_datetime(columns=['transaction_date'])
95 |
96 | if download_type == 'issuances':
97 | data = data.aggregate_issuance_transactions()
98 |
99 | data = data.add_missing_columns(schema=credit_without_id_schema).validate(
100 | schema=credit_without_id_schema
101 | )
102 | if arb is not None and not arb.empty:
103 | data = data.merge_with_arb(arb=arb)
104 |
105 | if harmonize_beneficiary_info:
106 | data = data.pipe(
107 | harmonize_beneficiary_data, registry_name=registry_name, download_type=download_type
108 | )
109 |
110 | data = (
111 | data.add_missing_columns(schema=credit_without_id_schema)
112 | .convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d')
113 | .validate(schema=credit_without_id_schema)
114 | )
115 | return data
116 |
117 |
118 | def harmonize_acr_status(row: pd.Series) -> str:
119 | """Derive single project status for CAR and ACR projects
120 |
121 | Raw CAR and ACR data has two status columns -- one for compliance status, one for voluntary.
122 | Handle and harmonize.
123 |
124 | Parameters
125 | ----------
126 | row : pd.Series
127 | A row from a pandas DataFrame
128 |
129 | Returns
130 | -------
131 | value : str
132 | The status of the project
133 | """
134 | if row['Compliance Program Status (ARB or Ecology)'] == 'Not ARB or Ecology Eligible':
135 | return row['Voluntary Status'].lower()
136 | ACR_COMPLIANCE_STATE_MAP = {
137 | 'Listed - Active ARB Project': 'active',
138 | 'ARB Completed': 'completed',
139 | 'ARB Inactive': 'completed',
140 | 'Listed - Proposed Project': 'listed',
141 | 'Listed - Active Registry Project': 'listed',
142 | 'ARB Terminated': 'completed',
143 | 'Submitted': 'listed',
144 | 'Transferred ARB or Ecology Project': 'active',
145 | 'Listed – Active ARB Project': 'active',
146 | }
147 |
148 | return ACR_COMPLIANCE_STATE_MAP.get(
149 | row['Compliance Program Status (ARB or Ecology)'], 'unknown'
150 | )
151 |
152 |
153 | @pf.register_dataframe_method
154 | def add_project_url(df: pd.DataFrame, *, registry_name: str) -> pd.DataFrame:
155 | """
156 | Add a project URL to each record in the DataFrame based on the registry name and project ID.
157 |
158 | Parameters
159 | ----------
160 | df : pd.DataFrame
161 | Input DataFrame containing project data.
162 | registry_name : str
163 | Name of the registry ('american-carbon-registry', 'climate-action-reserve', 'art-trees').
164 |
165 | Returns
166 | -------
167 | pd.DataFrame
168 | DataFrame with a new 'project_url' column, containing URLs for each project.
169 | """
170 |
171 | if registry_name == 'american-carbon-registry':
172 | base = 'https://acr2.apx.com/mymodule/reg/prjView.asp?id1='
173 | elif registry_name == 'climate-action-reserve':
174 | base = 'https://thereserve2.apx.com/mymodule/reg/prjView.asp?id1='
175 | elif registry_name == 'art-trees':
176 | base = 'https://art.apx.com/mymodule/reg/prjView.asp?id1='
177 |
178 | else:
179 | raise ValueError(f'Unknown registry name: {registry_name}')
180 |
181 | df['project_url'] = base + df['project_id'].str[3:]
182 | return df
183 |
184 |
185 | @pf.register_dataframe_method
186 | def process_apx_projects(
187 | df: pd.DataFrame, *, credits: pd.DataFrame, registry_name: str
188 | ) -> pd.DataFrame:
189 | """
190 | Process APX projects data, including renaming, adding, and validating columns, harmonizing statuses,
191 | and merging with credits data.
192 |
193 | Parameters
194 | ----------
195 | df : pd.DataFrame
196 | Input DataFrame with raw projects data.
197 | credits : pd.DataFrame
198 | DataFrame containing credits data for merging.
199 | registry_name : str
200 | Name of the registry for specific processing steps.
201 |
202 | Returns
203 | -------
204 | pd.DataFrame
205 | Processed DataFrame with harmonized and validated APX projects data.
206 | """
207 |
208 | df = df.copy()
209 | credits = credits.copy()
210 | registry_project_column_mapping = load_registry_project_column_mapping(
211 | registry_name=registry_name, file_path=PROJECT_SCHEMA_UPATH
212 | )
213 | inverted_column_mapping = {value: key for key, value in registry_project_column_mapping.items()}
214 | inverted_protocol_mapping = load_inverted_protocol_mapping()
215 | type_category_mapping = load_type_category_mapping()
216 | data = df.rename(columns=inverted_column_mapping)
217 | if registry_name == 'art-trees':
218 | data['protocol'] = [['art-trees']] * len(data)
219 |
220 | else:
221 | data = data.map_protocol(inverted_protocol_mapping=inverted_protocol_mapping)
222 |
223 | if registry_name == 'american-carbon-registry':
224 | data['status'] = data.apply(harmonize_acr_status, axis=1)
225 | else:
226 | data = data.harmonize_status_codes()
227 |
228 | data = (
229 | data.set_registry(registry_name=registry_name)
230 | .add_project_url(registry_name=registry_name)
231 | .harmonize_country_names()
232 | .infer_project_type()
233 | .override_project_types(
234 | override_data_path=BERKELEY_PROJECT_TYPE_UPATH, source_str='berkeley'
235 | )
236 | .add_category(
237 | type_category_mapping=type_category_mapping
238 | ) # must come after types; type -> category
239 | .map_project_type_to_display_name(type_category_mapping=type_category_mapping)
240 | .add_is_compliance_flag()
241 | .add_retired_and_issued_totals(credits=credits)
242 | .add_first_issuance_and_retirement_dates(credits=credits)
243 | .add_missing_columns(schema=project_schema)
244 | .convert_to_datetime(columns=['listed_at'])
245 | .validate(schema=project_schema)
246 | )
247 | return data
248 |
--------------------------------------------------------------------------------
/offsets_db_data/arb.py:
--------------------------------------------------------------------------------
1 | import janitor # noqa: F401
2 | import numpy as np
3 | import pandas as pd
4 | import pandas_flavor as pf
5 |
6 | from offsets_db_data.common import convert_to_datetime # noqa: F401
7 | from offsets_db_data.models import credit_without_id_schema
8 |
9 |
10 | def _get_registry(item):
11 | registry_map = {
12 | 'CAR': 'climate-action-reserve',
13 | 'ACR': 'american-carbon-registry',
14 | 'VCS': 'verra',
15 | 'ART': 'art-trees',
16 | }
17 | prefix = item[:3]
18 | return registry_map.get(prefix)
19 |
20 |
21 | @pf.register_dataframe_method
22 | def process_arb(df: pd.DataFrame) -> pd.DataFrame:
23 | """
24 | Process ARB (Air Resources Board) data by renaming columns, handling nulls, interpolating vintages,
25 | and transforming the data structure for transactions.
26 |
27 | Parameters
28 | ----------
29 | df : pd.DataFrame
30 | Input DataFrame containing raw ARB data.
31 |
32 | Returns
33 | -------
34 | data : pd.DataFrame
35 | Processed DataFrame with ARB data. Columns include 'opr_id', 'vintage', 'issued_at' (interpolated),
36 | various credit transaction types, and quantities. The DataFrame is also validated against
37 | a predefined schema for credit data.
38 |
39 | Notes
40 | -----
41 | - The function renames columns for readability and standardization.
42 | - It interpolates missing vintage values and handles NaNs in 'issuance' column.
43 | - Retirement transactions are derived based on compliance period dates.
44 | - The DataFrame is melted to restructure credit data.
45 | - Zero retirement events are dropped as they are considered artifacts.
46 | - A prefix is added to 'project_id' to indicate the source.
47 | - The 'registry' column is derived based on the project_id prefix.
48 | - The 'vintage' column is converted to integer type.
49 | - Finally, the data is converted to datetime where necessary and validated against a predefined schema.
50 | """
51 |
52 | df = df.copy()
53 |
54 | rename_d = {
55 | 'OPR Project ID': 'opr_id',
56 | 'ARB Offset Credits Issued': 'issuance',
57 | 'Project Type': 'project_type',
58 | 'Issuance Date': 'issued_at',
59 | 'Vintage': 'vintage',
60 | 'Retired Voluntarily': 'vcm_retirement',
61 | 'Retired 1st Compliance Period (CA)': 'first_compliance_ca',
62 | 'Retired 2nd Compliance Period (CA)': 'second_compliance_ca',
63 | 'Retired 3rd Compliance Period (CA)': 'third_compliance_ca',
64 | 'Retired 4th Compliance Period (CA)': 'fourth_compliance_ca',
65 | 'Retired for Compliance in Quebec': 'qc_compliance',
66 | }
67 |
68 | df = df.rename(columns=rename_d)
69 | df['vintage'] = df[
70 | 'vintage'
71 | ].interpolate() # data is ordered; fills na vintage for zero issuance reporting periods
72 |
73 | df['project_type'] = df['project_type'].str.lower()
74 |
75 |     # there can be multiple issuances in a single RP -- grab the issuance ID so we can aggregate later
76 |
77 | df = df.replace('reforest defer', np.nan)
78 | df.loc[pd.isna(df['issuance']), 'issuance'] = 0
79 |
80 | print(f'Loaded {len(df)} rows from ARB issuance table')
81 | df = df[rename_d.values()]
82 |
83 | compliance_period_dates = {
84 | 'vcm_retirement': np.datetime64('NaT'),
85 | 'qc_compliance': np.datetime64('NaT'),
86 | 'first_compliance_ca': np.datetime64('2016-03-21'),
87 | 'second_compliance_ca': np.datetime64('2018-11-01'),
88 | 'third_compliance_ca': np.datetime64('2021-11-01'),
89 | 'fourth_compliance_ca': np.datetime64('2022-11-01'),
90 | }
91 | # rename columns to what we want `transaction_type` to be in the end. then call melt
92 | # which casts to (opr_id, vintage, issued_at, transaction_type, quantity)
93 | credit_cols = [
94 | 'issuance',
95 | 'vcm_retirement',
96 | 'first_compliance_ca',
97 | 'second_compliance_ca',
98 | 'third_compliance_ca',
99 | 'fourth_compliance_ca',
100 | 'qc_compliance',
101 | ]
102 | melted = df.melt(
103 | id_vars=['opr_id', 'vintage', 'issued_at'],
104 | value_vars=credit_cols,
105 | var_name='transaction_type',
106 | value_name='quantity',
107 | )
108 | melted.loc[melted['transaction_type'].isin(compliance_period_dates.keys()), 'issued_at'] = (
109 | melted['transaction_type'].map(compliance_period_dates)
110 | )
111 | melted = melted.rename(columns={'issued_at': 'transaction_date'}).to_datetime(
112 | 'transaction_date', format='mixed', utc=True
113 | )
114 | melted['transaction_type'] = melted.transaction_type.apply(
115 | lambda x: 'retirement' if x in compliance_period_dates else x
116 | )
117 |
118 | # handle missing in retirement cols (i.e. ACR570 2022)
119 | melted.loc[pd.isna(melted['quantity']), 'quantity'] = 0
120 |
121 |     # drop all the zero retirement events, as they're artifacts of processing steps
122 | data = melted[
123 | ~((melted['transaction_type'] == 'retirement') & (melted['quantity'] == 0))
124 | ].copy()
125 | # add a prefix to the project_id to indicate the source
126 | data['project_id'] = data.opr_id.apply(
127 | lambda item: item
128 | if isinstance(item, str)
129 | and (item.startswith('CAR') or item.startswith('ACR') or item.startswith('VCS'))
130 | else f'VCS{item}'
131 | )
132 | data['registry'] = data.project_id.apply(_get_registry)
133 | data['vintage'] = data['vintage'].astype(int)
134 |
135 | data = (
136 | data.add_missing_columns(schema=credit_without_id_schema)
137 | .convert_to_datetime(columns=['transaction_date'])
138 | .validate(schema=credit_without_id_schema)
139 | )
140 |
141 | return data
142 |
--------------------------------------------------------------------------------
/offsets_db_data/catalog.yaml:
--------------------------------------------------------------------------------
1 | metadata:
2 | description: https://carbonplan.org/research/offsets-db-explainer
3 | TERMS_OF_DATA_ACCESS: |
4 | # OffsetsDB
5 |
6 | OffsetsDB, created by CarbonPlan (https://carbonplan.org) is a regularly-updated snapshot of carbon offset projects, credit issuances, and credit retirements published by the following carbon offset registries:
7 |
8 | American Carbon Registry (ACR)
9 | ART TREES (ART)
10 | Climate Action Reserve (CAR)
11 | Gold Standard (GLD)
12 | Verra (VCS)
13 |
14 | Carbon offset information has historically been scattered across multiple locations in formats that are not particularly useful to researchers. This database is meant to increase transparency, accountability, and reliability of the carbon offset market, and to provide researchers with a robust tool for visualizing, validating, and cross-checking offsets. We hope you find it useful!
15 |
16 | ## Our Terms of Use Apply To OffsetsDB
17 |
18 | By downloading, copying, or using this project, and/or any associated content or data, you agree to CarbonPlan’s Terms Of Use, which can be found here: https://carbonplan.org/terms. As further explained in the Terms of Use, CarbonPlan makes its projects — including OffsetsDB — available strictly on an “as-is” and “as-available” basis, without warranty of any kind, including without limitation the warranties of merchantability, fitness for a particular purpose, and noninfringement.
19 |
20 | ## Intellectual Property Rights
21 |
22 | Because OffsetsDB consists of purely factual information concerning carbon offsets that has been made publicly available by the above-referenced registries, CarbonPlan does not claim copyright in this data.
23 |
24 | However, please note that CarbonPlan does not make any representation as to whether any of the above-referenced registries may claim any rights in the data they have published. If you have any questions or concerns about this, please reach out to the registries directly.
25 |
26 | version: 1.0.0
27 | sources:
28 | credits:
29 | description: OffsetsDB processed and transformed data
30 | driver: parquet
31 | parameters:
32 | date:
33 | description: date of the data to load
34 | type: str
35 | default: '2024-02-13'
36 | args:
37 | urlpath: 's3://carbonplan-offsets-db/final/{{ date }}/credits-augmented.parquet'
38 | storage_options: { 'anon': True }
39 | engine: 'fastparquet'
40 |
41 | projects:
42 | description: OffsetsDB processed and transformed data
43 | driver: parquet
44 | parameters:
45 | date:
46 | description: date of the data to load
47 | type: str
48 | default: '2024-02-13'
49 | args:
50 | urlpath: 's3://carbonplan-offsets-db/final/{{ date }}/projects-augmented.parquet'
51 | storage_options: { 'anon': True }
52 | engine: 'fastparquet'
53 |
54 | raw_projects:
55 | description: Raw projects data downloaded from the registries on a daily basis
56 | driver: csv
57 | parameters:
58 | registry:
59 | description: registry name
60 | type: str
61 | default: verra
62 | allowed:
63 | - verra
64 | - art-trees
65 | - gold-standard
66 | - american-carbon-registry
67 | - climate-action-reserve
68 |
69 | date:
70 | description: date of the data to load
71 | type: str
72 | default: '2024-02-13'
73 | args:
74 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/{{ registry }}/projects.csv.gz'
75 | storage_options: { 'anon': True }
76 |
77 | raw_verra_transactions:
78 | description: Raw Verra transactions data downloaded from the registries on a daily basis
79 | driver: csv
80 | parameters:
81 | date:
82 | description: date of the data to load
83 | type: str
84 | default: '2024-02-13'
85 | args:
86 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/verra/transactions.csv.gz'
87 | storage_options: { 'anon': True }
88 |
89 | raw_gold_standard_transactions:
90 | description: Raw Gold Standard transactions data downloaded from the registries on a daily basis
91 | driver: csv
92 | parameters:
93 | date:
94 | description: date of the data to load
95 | type: str
96 | default: '2024-02-13'
97 | transaction_type:
98 | description: transaction type
99 | type: str
100 | default: 'issuances'
101 | allowed:
102 | - issuances
103 | - retirements
104 | - cancellations
105 | args:
106 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/gold-standard/{{ transaction_type }}.csv.gz'
107 | storage_options: { 'anon': True }
108 |
109 | raw_art_trees_transactions:
110 | description: Raw Art Trees transactions data downloaded from the registries on a daily basis
111 | driver: csv
112 | parameters:
113 | date:
114 | description: date of the data to load
115 | type: str
116 | default: '2024-02-13'
117 | transaction_type:
118 | description: transaction type
119 | type: str
120 | default: 'issuances'
121 | allowed:
122 | - issuances
123 | - retirements
124 | - cancellations
125 | args:
126 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/art-trees/{{ transaction_type }}.csv.gz'
127 | storage_options: { 'anon': True }
128 |
129 | raw_american_carbon_registry_transactions:
130 | description: Raw American Carbon Registry transactions data downloaded from the registries on a daily basis
131 | driver: csv
132 | parameters:
133 | date:
134 | description: date of the data to load
135 | type: str
136 | default: '2024-02-13'
137 | transaction_type:
138 | description: transaction type
139 | type: str
140 | default: 'issuances'
141 | allowed:
142 | - issuances
143 | - retirements
144 | - cancellations
145 | args:
146 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/american-carbon-registry/{{ transaction_type }}.csv.gz'
147 | storage_options: { 'anon': True }
148 |
149 | raw_climate_action_reserve_transactions:
150 | description: Raw Climate Action Reserve transactions data downloaded from the registries on a daily basis
151 | driver: csv
152 | parameters:
153 | date:
154 | description: date of the data to load
155 | type: str
156 | default: '2024-02-13'
157 | transaction_type:
158 | description: transaction type
159 | type: str
160 | default: 'issuances'
161 | allowed:
162 | - issuances
163 | - retirements
164 | - cancellations
165 | args:
166 | urlpath: 's3://carbonplan-offsets-db/raw/{{ date }}/climate-action-reserve/{{ transaction_type }}.csv.gz'
167 | storage_options: { 'anon': True }
168 |
--------------------------------------------------------------------------------
/offsets_db_data/common.py:
--------------------------------------------------------------------------------
1 | import json
2 | import typing
3 | from collections import defaultdict
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import pandas_flavor as pf
8 | import pandera as pa
9 | import upath
10 |
11 | CREDIT_SCHEMA_UPATH = (
12 | upath.UPath(__file__).parents[0] / 'configs' / 'credits-raw-columns-mapping.json'
13 | )
14 | PROTOCOL_MAPPING_UPATH = upath.UPath(__file__).parents[0] / 'configs' / 'all-protocol-mapping.json'
15 | PROJECT_SCHEMA_UPATH = (
16 | upath.UPath(__file__).parents[0] / 'configs' / 'projects-raw-columns-mapping.json'
17 | )
18 | TYPE_CATEGORY_MAPPING_UPATH = (
19 | upath.UPath(__file__).parents[0] / 'configs' / 'type-category-mapping.json'
20 | )
21 |
22 | BERKELEY_PROJECT_TYPE_UPATH = (
23 | upath.UPath(__file__).parents[0] / 'configs' / 'berkeley-project-types.json'
24 | )
25 |
26 |
27 | def load_registry_project_column_mapping(
28 | *, registry_name: str, file_path: upath.UPath = PROJECT_SCHEMA_UPATH
29 | ) -> dict:
30 | with open(file_path) as file:
31 | data = json.load(file)
32 |
33 | mapping: dict = {}
34 | for key1, value_dict in data.items():
35 | for key2, value in value_dict.items():
36 | if key2 not in mapping:
37 | mapping[key2] = {}
38 | if value:
39 | mapping[key2][key1] = value
40 | return mapping[registry_name]
41 |
42 |
43 | def load_protocol_mapping(path: upath.UPath = PROTOCOL_MAPPING_UPATH) -> dict:
44 | return json.loads(path.read_text())
45 |
46 |
47 | def load_inverted_protocol_mapping() -> dict:
48 | protocol_mapping = load_protocol_mapping()
49 | store = defaultdict(list)
50 | for protocol_str, metadata in protocol_mapping.items():
51 | for known_string in metadata.get('known-strings', []):
52 | store[known_string].append(protocol_str)
53 |
54 | return store
55 |
56 |
57 | def load_column_mapping(*, registry_name: str, download_type: str, mapping_path: str) -> dict:
58 | with open(mapping_path) as f:
59 | registry_credit_column_mapping = json.load(f)
60 | return registry_credit_column_mapping[registry_name][download_type]
61 |
62 |
63 | def load_type_category_mapping(path: upath.UPath = TYPE_CATEGORY_MAPPING_UPATH) -> dict:
64 | return json.loads(path.read_text())
65 |
66 |
67 | @pf.register_dataframe_method
68 | def set_registry(df: pd.DataFrame, registry_name: str) -> pd.DataFrame:
69 | """
70 | Set the registry name for each record in the DataFrame.
71 |
72 | Parameters
73 | ----------
74 | df : pd.DataFrame
75 | Input DataFrame.
76 | registry_name : str
77 | Name of the registry to set.
78 |
79 | Returns
80 | -------
81 | pd.DataFrame
82 | DataFrame with a new 'registry' column set to the specified registry name."""
83 |
84 | df['registry'] = registry_name
85 | return df
86 |
87 |
88 | @pf.register_dataframe_method
89 | def convert_to_datetime(
90 | df: pd.DataFrame, *, columns: list, utc: bool = True, **kwargs: typing.Any
91 | ) -> pd.DataFrame:
92 | """
93 | Convert specified columns in the DataFrame to datetime format.
94 |
95 | Parameters
96 | ----------
97 | df : pd.DataFrame
98 | Input DataFrame.
99 | columns : list
100 | List of column names to convert to datetime.
101 | utc : bool, optional
102 | Whether to convert to UTC (default is True).
103 | **kwargs : typing.Any
104 | Additional keyword arguments passed to pd.to_datetime.
105 |
106 | Returns
107 | -------
108 | pd.DataFrame
109 | DataFrame with specified columns converted to datetime format.
110 | """
111 |
112 | for column in columns:
113 | if column not in df.columns:
114 | raise KeyError(f"The column '{column}' is missing.")
115 | try:
116 | df[column] = pd.to_datetime(df[column], utc=utc, **kwargs).dt.normalize()
117 | except ValueError:
118 | df[column] = pd.to_datetime(df[column], utc=utc).dt.normalize()
119 | return df
120 |
121 |
122 | @pf.register_dataframe_method
123 | def add_missing_columns(df: pd.DataFrame, *, schema: pa.DataFrameSchema) -> pd.DataFrame:
124 | """
125 |     Add any missing columns to the DataFrame and initialize them with dtype-appropriate default values.
126 |
127 | Parameters
128 | ----------
129 | df : pd.DataFrame
130 | Input DataFrame.
131 | schema : pa.DataFrameSchema
132 | Pandera schema to validate against.
133 |
134 |
135 | Returns
136 | -------
137 | pd.DataFrame
138 |         DataFrame with all specified columns, adding missing ones initialized to dtype-appropriate default values.
139 | """
140 |
141 | default_values = {
142 | np.dtype('int64'): 0,
143 | np.dtype('int32'): 0,
144 | np.dtype('float64'): 0.0,
145 | np.dtype('float32'): 0.0,
146 | np.dtype('O'): None,
147 |         np.dtype('<M8[ns]'): pd.NaT,
148 |     }
149 | 
150 |     # add any columns required by the schema that are missing from the DataFrame,
151 |     # filling each one with a default value appropriate to its dtype
152 |     for column, column_schema in schema.columns.items():
153 |         if column not in df.columns:
154 |             dtype = column_schema.dtype.type
155 |             default = default_values.get(dtype, None)
156 |             df[column] = default
157 | 
158 |     return df
159 | 
160 | 
161 | @pf.register_dataframe_method
162 | def validate(df: pd.DataFrame, *, schema: pa.DataFrameSchema) -> pd.DataFrame:
163 | """
164 | Validate the DataFrame against a given Pandera schema.
165 |
166 | Parameters
167 | ----------
168 | df : pd.DataFrame
169 | Input DataFrame.
170 | schema : pa.DataFrameSchema
171 | Pandera schema to validate against.
172 |
173 | Returns
174 | -------
175 | pd.DataFrame
176 | DataFrame with columns sorted according to the schema and validated against it.
177 | """
178 |
179 | results = schema.validate(df)
180 | keys = sorted(list(schema.columns.keys()))
181 | results = results[keys]
182 |
183 | return results
184 |
185 |
186 | @pf.register_dataframe_method
187 | def clean_and_convert_numeric_columns(df: pd.DataFrame, *, columns: list[str]) -> pd.DataFrame:
188 | """
189 | Clean and convert specified columns to numeric format in the DataFrame.
190 |
191 | Parameters
192 | ----------
193 | df : pd.DataFrame
194 | Input DataFrame.
195 | columns : list[str]
196 | List of column names to clean and convert to numeric format.
197 |
198 | Returns
199 | -------
200 | pd.DataFrame
201 | DataFrame with specified columns cleaned (removing commas) and converted to numeric format.
202 | """
203 |
204 | for column in columns:
205 | df[column] = df[column].str.replace(',', '', regex=True)
206 | df[column] = pd.to_numeric(df[column], errors='coerce')
207 | return df
208 |
--------------------------------------------------------------------------------
/offsets_db_data/configs/credits-raw-columns-mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "american-carbon-registry": {
3 | "cancellations": {
4 | "project_id": "Project ID",
5 | "quantity": "Quantity of Credits",
6 | "retirement_account": null,
7 | "retirement_beneficiary": null,
8 | "retirement_note": null,
9 | "retirement_reason": null,
10 | "transaction_date": "Status Effective (GMT)",
11 | "vintage": "Vintage"
12 | },
13 | "issuances": {
14 | "project_id": "Project ID",
15 | "quantity": "Total Credits Issued",
16 | "retirement_account": null,
17 | "retirement_beneficiary": null,
18 | "retirement_note": null,
19 | "retirement_reason": null,
20 | "transaction_date": "Date Issued (GMT)",
21 | "vintage": "Vintage"
22 | },
23 | "retirements": {
24 | "project_id": "Project ID",
25 | "quantity": "Quantity of Credits",
26 | "retirement_account": "Account Holder",
27 | "retirement_beneficiary": null,
28 | "retirement_note": "Purpose of Retirement",
29 | "retirement_reason": "Retirement Reason",
30 | "transaction_date": "Status Effective (GMT)",
31 | "vintage": "Vintage"
32 | }
33 | },
34 | "art-trees": {
35 | "cancellations": {
36 | "project_id": "Program ID",
37 | "quantity": "Quantity of Credits",
38 | "retirement_account": null,
39 | "retirement_beneficiary": null,
40 | "retirement_note": null,
41 | "retirement_reason": null,
42 | "transaction_date": "Status Effective",
43 | "vintage": "Vintage"
44 | },
45 | "issuances": {
46 | "project_id": "Program ID",
47 | "quantity": "Credits Verified",
48 | "retirement_account": null,
49 | "retirement_beneficiary": null,
50 | "retirement_note": null,
51 | "retirement_reason": null,
52 | "transaction_date": "Date Approved",
53 | "vintage": "Vintage"
54 | },
55 | "retirements": {
56 | "project_id": "Program ID",
57 | "quantity": "Quantity of Credits",
58 | "retirement_account": "Account Holder",
59 | "retirement_beneficiary": null,
60 | "retirement_note": "Retirement Reason Details",
61 | "retirement_reason": "Retirement Reason",
62 | "transaction_date": "Status Effective",
63 | "vintage": "Vintage"
64 | }
65 | },
66 | "climate-action-reserve": {
67 | "cancellations": {
68 | "project_id": "Project ID",
69 | "quantity": "Quantity of Offset Credits",
70 | "retirement_account": null,
71 | "retirement_beneficiary": null,
72 | "retirement_note": null,
73 | "retirement_reason": null,
74 | "transaction_date": "Status Effective",
75 | "vintage": "Vintage"
76 | },
77 | "issuances": {
78 | "project_id": "Project ID",
79 | "quantity": "Total Offset Credits Issued",
80 | "retirement_account": null,
81 | "retirement_beneficiary": null,
82 | "retirement_note": null,
83 | "retirement_reason": null,
84 | "transaction_date": "Date Issued",
85 | "vintage": "Vintage"
86 | },
87 | "retirements": {
88 | "project_id": "Project ID",
89 | "quantity": "Quantity of Offset Credits",
90 | "retirement_account": "Account Holder",
91 | "retirement_beneficiary": null,
92 | "retirement_note": "Retirement Reason Details",
93 | "retirement_reason": "Retirement Reason",
94 | "transaction_date": "Status Effective",
95 | "vintage": "Vintage"
96 | }
97 | },
98 | "gold-standard": {
99 | "issuances": {
100 | "project_id": "GSID",
101 | "quantity": "Quantity",
102 | "retirement_account": null,
103 | "retirement_beneficiary": null,
104 | "retirement_note": null,
105 | "retirement_reason": null,
106 | "transaction_date": "Issuance Date",
107 | "vintage": "Vintage"
108 | },
109 | "retirements": {
110 | "project_id": "GSID",
111 | "quantity": "Quantity",
112 | "retirement_account": null,
113 | "retirement_beneficiary": "Using Entity",
114 | "retirement_note": "Note",
115 | "retirement_reason": null,
116 | "transaction_date": "Retirement Date",
117 | "vintage": "Vintage"
118 | }
119 | },
120 | "verra": {
121 | "transactions": {
122 | "project_id": null,
123 | "quantity": null,
124 | "retirement_account": null,
125 | "retirement_beneficiary": "Retirement Beneficiary",
126 | "retirement_note": "Retirement Details",
127 | "retirement_reason": "Retirement Reason",
128 | "transaction_date": null,
129 | "vintage": null
130 | }
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/offsets_db_data/configs/projects-raw-columns-mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "country": {
3 | "american-carbon-registry": "Project Site Country",
4 | "art-trees": "Program Country",
5 | "climate-action-reserve": "Project Site Country",
6 | "gold-standard": "Country",
7 | "verra": "Country/Area"
8 | },
9 | "listed_at": {
10 | "american-carbon-registry": null,
11 | "art-trees": null,
12 | "climate-action-reserve": "Project Listed Date",
13 | "gold-standard": null,
14 | "verra": null
15 | },
16 | "name": {
17 | "american-carbon-registry": "Project Name",
18 | "art-trees": "Program Name",
19 | "climate-action-reserve": "Project Name",
20 | "gold-standard": "Project Name",
21 | "verra": "Name"
22 | },
23 | "original_protocol": {
24 | "american-carbon-registry": "Project Methodology/Protocol",
25 | "art-trees": null,
26 | "climate-action-reserve": "Project Type",
27 | "gold-standard": "Methodology",
28 | "verra": "Methodology"
29 | },
30 | "project_id": {
31 | "american-carbon-registry": "Project ID",
32 | "art-trees": "Program ID",
33 | "climate-action-reserve": "Project ID",
34 | "gold-standard": "GSID",
35 | "verra": "ID"
36 | },
37 | "proponent": {
38 | "american-carbon-registry": null,
39 | "art-trees": "Sovereign Program Developer",
40 | "climate-action-reserve": "Project Owner",
41 | "gold-standard": "Project Developer Name",
42 | "verra": "Proponent"
43 | },
44 | "status": {
45 | "american-carbon-registry": null,
46 | "art-trees": "Status",
47 | "climate-action-reserve": "Status",
48 | "gold-standard": "Status",
49 | "verra": "Status"
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/offsets_db_data/configs/type-category-mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "advanced refrigerants": {
3 | "category": "ghg-management",
4 | "project-type-display-name": "Advanced Refrigerants"
5 | },
6 | "afforestation/reforestation": {
7 | "category": "forest",
8 | "project-type-display-name": "Afforestation + Reforestation"
9 | },
10 | "aluminum smelters emission reductions": {
11 | "category": "fuel-switching",
12 | "project-type-display-name": "Aluminum Smelter"
13 | },
14 | "avoided forest conversion": {
15 | "category": "forest",
16 | "project-type-display-name": "Avoided Forest Conversion"
17 | },
18 | "avoided grassland conversion": {
19 | "category": "land-use",
20 | "project-type-display-name": "Avoided Grassland Conversion"
21 | },
22 | "bicycles": {
23 | "category": "fuel-switching",
24 | "project-type-display-name": "Bicycle"
25 | },
26 | "biochar": {
27 | "category": "biochar",
28 | "project-type-display-name": "Biochar"
29 | },
30 | "biodigesters": {
31 | "category": "ghg-management",
32 | "project-type-display-name": "Methane Biodigester"
33 | },
34 | "biomass": {
35 | "category": "fuel-switching",
36 | "project-type-display-name": "Biomass"
37 | },
38 | "brick manufacturing emission reductions": {
39 | "category": "energy-efficiency",
40 | "project-type-display-name": "Brick Manufacturing"
41 | },
42 | "bundled compost production and soil application": {
43 | "category": "ghg-management",
44 | "project-type-display-name": "Compost"
45 | },
46 | "bundled energy efficiency": {
47 | "category": "energy-efficiency",
48 | "project-type-display-name": "Energy Efficiency"
49 | },
50 | "carbon capture & enhanced oil recovery": {
51 | "category": "ghg-management",
52 | "project-type-display-name": "Enhanced Oil Recovery"
53 | },
54 | "carbon capture in concrete": {
55 | "category": "carbon-capture",
56 | "project-type-display-name": "Concrete CCS"
57 | },
58 | "carbon capture in plastic": {
59 | "category": "carbon-capture",
60 | "project-type-display-name": "Plastic CCS"
61 | },
62 | "carbon-absorbing concrete": {
63 | "category": "carbon-capture",
64 | "project-type-display-name": "Concrete CCS"
65 | },
66 | "clean water": {
67 | "category": "energy-efficiency",
68 | "project-type-display-name": "Clean Water"
69 | },
70 | "community boreholes": {
71 | "category": "energy-efficiency",
72 | "project-type-display-name": "Borehole"
73 | },
74 | "composting": {
75 | "category": "ghg-management",
76 | "project-type-display-name": "Compost"
77 | },
78 | "cookstoves": {
79 | "category": "fuel-switching",
80 | "project-type-display-name": "Cookstove"
81 | },
82 | "electric vehicles & charging": {
83 | "category": "fuel-switching",
84 | "project-type-display-name": "Electric Vehicle"
85 | },
86 | "energy efficiency": {
87 | "category": "energy-efficiency",
88 | "project-type-display-name": "Energy Efficiency"
89 | },
90 | "feed additives": {
91 | "category": "ghg-management",
92 | "project-type-display-name": "Feed Additive"
93 | },
94 | "fleet efficiency": {
95 | "category": "energy-efficiency",
96 | "project-type-display-name": "Fleet Efficiency"
97 | },
98 | "fuel switching": {
99 | "category": "fuel-switching",
100 | "project-type-display-name": "Fuel Switching"
101 | },
102 | "fuel transport": {
103 | "category": "fuel-switching",
104 | "project-type-display-name": "Fuel Transport"
105 | },
106 | "geothermal": {
107 | "category": "renewable-energy",
108 | "project-type-display-name": "Geothermal"
109 | },
110 | "grid expansion & mini-grids": {
111 | "category": "fuel-switching",
112 | "project-type-display-name": "Grid Improvements"
113 | },
114 | "hfc refrigerant reclamation": {
115 | "category": "ghg-management",
116 | "project-type-display-name": "HFC Reclamation"
117 | },
118 | "hfc replacement in foam production": {
119 | "category": "ghg-management",
120 | "project-type-display-name": "HFC Replacement"
121 | },
122 | "hfc23 destruction": {
123 | "category": "ghg-management",
124 | "project-type-display-name": "HFC Destruction"
125 | },
126 | "hydropower": {
127 | "category": "renewable-energy",
128 | "project-type-display-name": "Hydropower"
129 | },
130 | "improved forest management": {
131 | "category": "forest",
132 | "project-type-display-name": "Improved Forest Management"
133 | },
134 | "improved irrigation management": {
135 | "category": "agriculture",
136 | "project-type-display-name": "Improved Irrigation Management"
137 | },
138 | "landfill methane": {
139 | "category": "ghg-management",
140 | "project-type-display-name": "Landfill"
141 | },
142 | "leak detection & repair in gas systems": {
143 | "category": "ghg-management",
144 | "project-type-display-name": "Gas Leak Repair"
145 | },
146 | "lighting": {
147 | "category": "energy-efficiency",
148 | "project-type-display-name": "Lighting"
149 | },
150 | "lower carbon cement & concrete": {
151 | "category": "energy-efficiency",
152 | "project-type-display-name": "Low Carbon Concrete"
153 | },
154 | "manure methane digester": {
155 | "category": "ghg-management",
156 | "project-type-display-name": "Manure Biodigester"
157 | },
158 | "mass transit": {
159 | "category": "energy-efficiency",
160 | "project-type-display-name": "Mass Transit"
161 | },
162 | "methane recovery in wastewater": {
163 | "category": "ghg-management",
164 | "project-type-display-name": "Wastewater Methane"
165 | },
166 | "mine methane capture": {
167 | "category": "ghg-management",
168 | "project-type-display-name": "Mine Methane"
169 | },
170 | "n2o destruction in adipic acid production": {
171 | "category": "ghg-management",
172 | "project-type-display-name": "N\\u2082O Destruction (Adipic Acid)"
173 | },
174 | "n2o destruction in nitric acid production": {
175 | "category": "ghg-management",
176 | "project-type-display-name": "N\\u2082O Destruction (Nitric Acid)"
177 | },
178 | "natural gas electricity generation": {
179 | "category": "fuel-switching",
180 | "project-type-display-name": "Natural Gas"
181 | },
182 | "nitrogen management": {
183 | "category": "agriculture",
184 | "project-type-display-name": "Nitrogen Management"
185 | },
186 | "oil recycling": {
187 | "category": "energy-efficiency",
188 | "project-type-display-name": "Oil Recycling"
189 | },
190 | "ozone depleting substances recovery & destruction": {
191 | "category": "ghg-management",
192 | "project-type-display-name": "Ozone Depleting Substances"
193 | },
194 | "plugging oil & gas wells": {
195 | "category": "ghg-management",
196 | "project-type-display-name": "Oil + Gas Well"
197 | },
198 | "pneumatic retrofit": {
199 | "category": "ghg-management",
200 | "project-type-display-name": "Pneumatic Retrofit"
201 | },
202 | "propylene oxide production": {
203 | "category": "ghg-management",
204 | "project-type-display-name": "Propylene Oxide"
205 | },
206 | "re bundled": {
207 | "category": "renewable-energy",
208 | "project-type-display-name": "Renewable Energy"
209 | },
210 | "redd+": {
211 | "category": "forest",
212 | "project-type-display-name": "REDD+"
213 | },
214 | "refrigerant leak detection": {
215 | "category": "ghg-management",
216 | "project-type-display-name": "Refrigerant Leak"
217 | },
218 | "rice emission reductions": {
219 | "category": "agriculture",
220 | "project-type-display-name": "Rice Emission"
221 | },
222 | "road construction emission reductions": {
223 | "category": "energy-efficiency",
224 | "project-type-display-name": "Road Construction"
225 | },
226 | "sf6 replacement": {
227 | "category": "ghg-management",
228 | "project-type-display-name": "SF\\u2086 Replacement"
229 | },
230 | "shipping": {
231 | "category": "energy-efficiency",
232 | "project-type-display-name": "Shipping"
233 | },
234 | "solar - centralized": {
235 | "category": "renewable-energy",
236 | "project-type-display-name": "Centralized Solar"
237 | },
238 | "solar - distributed": {
239 | "category": "renewable-energy",
240 | "project-type-display-name": "Distributed Solar"
241 | },
242 | "solar lighting": {
243 | "category": "renewable-energy",
244 | "project-type-display-name": "Lighting"
245 | },
246 | "solar water heaters": {
247 | "category": "renewable-energy",
248 | "project-type-display-name": "Solar Water Heater"
249 | },
250 | "solid waste separation": {
251 | "category": "ghg-management",
252 | "project-type-display-name": "Solid Waste Separation"
253 | },
254 | "sustainable agriculture": {
255 | "category": "agriculture",
256 | "project-type-display-name": "Sustainable Agriculture"
257 | },
258 | "sustainable grassland management": {
259 | "category": "land-use",
260 | "project-type-display-name": "Grassland Management"
261 | },
262 | "truck stop electrification": {
263 | "category": "energy-efficiency",
264 | "project-type-display-name": "Truck Stop"
265 | },
266 | "university campus emission reductions": {
267 | "category": "energy-efficiency",
268 | "project-type-display-name": "University"
269 | },
270 | "waste diversion": {
271 | "category": "ghg-management",
272 | "project-type-display-name": "Waste Diversion"
273 | },
274 | "waste gas recovery": {
275 | "category": "ghg-management",
276 | "project-type-display-name": "Waste Gas Recovery"
277 | },
278 | "waste heat recovery": {
279 | "category": "energy-efficiency",
280 | "project-type-display-name": "Waste Heat Recovery"
281 | },
282 | "waste incineration": {
283 | "category": "fuel-switching",
284 | "project-type-display-name": "Waste Incineration"
285 | },
286 | "waste recycling": {
287 | "category": "energy-efficiency",
288 | "project-type-display-name": "Recycling"
289 | },
290 | "waste reduction": {
291 | "category": "ghg-management",
292 | "project-type-display-name": "Waste Reduction"
293 | },
294 | "weatherization": {
295 | "category": "energy-efficiency",
296 | "project-type-display-name": "Weatherization"
297 | },
298 | "wetland restoration": {
299 | "category": "land-use",
300 | "project-type-display-name": "Wetland"
301 | },
302 | "wind": {
303 | "category": "renewable-energy",
304 | "project-type-display-name": "Wind"
305 | }
306 | }
307 |
--------------------------------------------------------------------------------
/offsets_db_data/credits.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import pathlib
3 | import subprocess
4 | import tempfile
5 | import uuid
6 |
7 | import janitor # noqa: F401
8 | import numpy as np
9 | import pandas as pd
10 | import pandas_flavor as pf
11 | import upath
12 |
13 | BENEFICIARY_MAPPING_UPATH = (
14 | upath.UPath(__file__).parents[0] / 'configs' / 'beneficiary-mappings.json'
15 | )
16 |
17 |
18 | @pf.register_dataframe_method
19 | def aggregate_issuance_transactions(df: pd.DataFrame) -> pd.DataFrame:
20 | """
21 | Aggregate issuance transactions by summing the quantity for each combination of project ID, transaction date, and vintage.
22 |
23 | Parameters
24 | ----------
25 | df : pd.DataFrame
26 | Input DataFrame containing issuance transaction data.
27 |
28 | Returns
29 | -------
30 | pd.DataFrame
31 | DataFrame with aggregated issuance transactions, filtered to include only those with a positive quantity.
32 | """
33 |
34 | # Check if 'transaction_type' exists in DataFrame columns
35 | if 'transaction_type' not in df.columns:
36 | raise KeyError("The column 'transaction_type' is missing.")
37 |
38 | # Initialize df_issuance_agg to an empty DataFrame
39 | df_issuance_agg = pd.DataFrame()
40 | df_issuance = df[df['transaction_type'] == 'issuance']
41 |
42 | if not df_issuance.empty:
43 | df_issuance_agg = (
44 | df_issuance.groupby(['project_id', 'transaction_date', 'vintage'])
45 | .agg(
46 | {
47 | 'quantity': 'sum',
48 | 'registry': 'first',
49 | 'transaction_type': 'first',
50 | }
51 | )
52 | .reset_index()
53 | )
54 | df_issuance_agg = df_issuance_agg[df_issuance_agg['quantity'] > 0]
55 | return df_issuance_agg
56 |
57 |
58 | @pf.register_dataframe_method
59 | def filter_and_merge_transactions(
60 | df: pd.DataFrame, arb_data: pd.DataFrame, project_id_column: str = 'project_id'
61 | ) -> pd.DataFrame:
62 | """
63 | Filter transactions based on project ID intersection with ARB data and merge the filtered transactions.
64 |
65 | Parameters
66 | ----------
67 | df : pd.DataFrame
68 | Input DataFrame with transaction data.
69 | arb_data : pd.DataFrame
70 | DataFrame containing ARB issuance data.
71 | project_id_column : str, optional
72 | The name of the column containing project IDs (default is 'project_id').
73 |
74 | Returns
75 | -------
76 | pd.DataFrame
77 | DataFrame with transactions from the input DataFrame, excluding those present in ARB data, merged with relevant ARB transactions.
78 | """
79 |
80 | if intersection_values := list(
81 | set(df[project_id_column]).intersection(set(arb_data[project_id_column]))
82 | ):
83 | df = df[~df[project_id_column].isin(intersection_values)]
84 | df = pd.concat(
85 | [df, arb_data[arb_data[project_id_column].isin(intersection_values)]], ignore_index=True
86 | )
87 | return df
88 |
89 |
90 | @pf.register_dataframe_method
91 | def handle_non_issuance_transactions(df: pd.DataFrame) -> pd.DataFrame:
92 | """
93 | Filter the DataFrame to include only non-issuance transactions.
94 |
95 | Parameters
96 | ----------
97 | df : pd.DataFrame
98 | Input DataFrame containing transaction data.
99 |
100 | Returns
101 | -------
102 | pd.DataFrame
103 | DataFrame containing only transactions where 'transaction_type' is not 'issuance'.
104 | """
105 |
106 | df_non_issuance = df[df['transaction_type'] != 'issuance']
107 | return df_non_issuance
108 |
109 |
110 | @pf.register_dataframe_method
111 | def merge_with_arb(credits: pd.DataFrame, *, arb: pd.DataFrame) -> pd.DataFrame:
112 | """
113 |     ARB issuance table contains the authoritative version of all credit transactions for ARB projects.
114 |     This function drops all registry crediting data and, instead, patches in data from the ARB issuance table.
115 |
116 | Parameters
117 | ----------
118 | credits: pd.DataFrame
119 | Pandas dataframe containing registry credit data
120 | arb: pd.DataFrame
121 | Pandas dataframe containing ARB issuance data
122 |
123 | Returns
124 | -------
125 | pd.DataFrame
126 | Pandas dataframe containing merged credit and ARB data
127 | """
128 | df = credits
129 | project_id_column = 'project_id'
130 | if intersection_values := list(
131 | set(df[project_id_column]).intersection(set(arb[project_id_column]))
132 | ):
133 | df = df[~df[project_id_column].isin(intersection_values)]
134 |
135 | df = pd.concat([df, arb], ignore_index=True)
136 | return df
137 |
138 |
139 | def harmonize_beneficiary_data(
140 | credits: pd.DataFrame, registry_name: str, download_type: str
141 | ) -> pd.DataFrame:
142 | """
143 | Harmonize the beneficiary information via OpenRefine.
144 |
145 | Parameters
146 | ----------
147 | credits : pd.DataFrame
148 | Input DataFrame containing credit data.
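    registry_name : str
        Name of the registry the credits were downloaded from.
    download_type : str
        Type of download being processed (e.g. 'issuances', 'retirements').

    Returns
    -------
    pd.DataFrame
        DataFrame with a 'retirement_beneficiary_harmonized' column added.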
149 | """
150 |
151 | tempdir = tempfile.gettempdir()
152 | temp_path = pathlib.Path(tempdir) / f'{registry_name}-{download_type}-credits.csv'
153 |
154 | if len(credits) == 0:
155 | print(
156 | f'Empty dataframe with shape={credits.shape} - columns:{credits.columns.tolist()}. No credits to harmonize'
157 | )
158 | data = credits.copy()
159 | data['retirement_beneficiary_harmonized'] = pd.Series(dtype='str')
160 | return data
161 | credits.to_csv(temp_path, index=False)
162 |
163 | project_name = f'{registry_name}-{download_type}-beneficiary-harmonization-{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}-{uuid.uuid4()}'
164 | output_path = pathlib.Path(tempdir) / f'{project_name}.csv'
165 |
166 | try:
167 | return _extract_harmonized_beneficiary_data_via_openrefine(
168 | temp_path, project_name, str(BENEFICIARY_MAPPING_UPATH), str(output_path)
169 | )
170 |
171 | except subprocess.CalledProcessError as e:
172 | raise ValueError(
173 |             f'Command failed with return code: {e.returncode}\nOutput: {e.output}\nError output: {e.stderr}'
174 | ) from e
175 |
176 |
177 | def _extract_harmonized_beneficiary_data_via_openrefine(
178 | temp_path, project_name, beneficiary_mapping_path, output_path
179 | ):
180 | result = subprocess.run(
181 | [
182 | 'offsets-db-data-orcli',
183 | 'run',
184 | '--',
185 | 'import',
186 | 'csv',
187 | str(temp_path),
188 | '--projectName',
189 | f'{project_name}',
190 | ],
191 | capture_output=True,
192 | text=True,
193 | check=True,
194 | )
195 |
196 | result = subprocess.run(
197 | ['offsets-db-data-orcli', 'run', '--', 'info', project_name],
198 | capture_output=True,
199 | text=True,
200 | check=True,
201 | )
202 |
203 | result = subprocess.run(
204 | [
205 | 'offsets-db-data-orcli',
206 | 'run',
207 | '--',
208 | 'transform',
209 | project_name,
210 | beneficiary_mapping_path,
211 | ],
212 | capture_output=True,
213 | text=True,
214 | check=True,
215 | )
216 |
217 | result = subprocess.run(
218 | [
219 | 'offsets-db-data-orcli',
220 | 'run',
221 | '--',
222 | 'export',
223 | 'csv',
224 | project_name,
225 | '--output',
226 | output_path,
227 | ],
228 | capture_output=True,
229 | text=True,
230 | check=True,
231 | )
232 |
233 | result = subprocess.run(
234 | ['offsets-db-data-orcli', 'run', '--', 'delete', project_name],
235 | capture_output=True,
236 | text=True,
237 | check=True,
238 | )
239 |
240 | print(result.stdout)
241 |
242 | data = pd.read_csv(output_path)
243 | data['merged_beneficiary'] = data['merged_beneficiary'].fillna('').astype(str)
244 | data['retirement_beneficiary_harmonized'] = np.where(
245 | data['merged_beneficiary'].notnull() & (~data['merged_beneficiary'].str.contains(';%')),
246 | data['merged_beneficiary'],
247 | np.nan,
248 | )
249 | return data
250 |
--------------------------------------------------------------------------------
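A minimal usage sketch for the `merge_with_arb` accessor defined above, using made-up project IDs and quantities; importing the module registers the pandas_flavor methods on DataFrames:

import pandas as pd

import offsets_db_data.credits  # noqa: F401 -- registers the accessors defined above

# Toy frames: registry-reported credits plus the ARB issuance table. 'CAR123' appears in
# both, so its registry rows are dropped and the ARB table is appended in their place.
registry_credits = pd.DataFrame(
    {
        'project_id': ['CAR123', 'VCS456'],
        'transaction_type': ['issuance', 'issuance'],
        'quantity': [100, 200],
    }
)
arb = pd.DataFrame(
    {'project_id': ['CAR123'], 'transaction_type': ['issuance'], 'quantity': [150]}
)

merged = registry_credits.merge_with_arb(arb=arb)
print(merged)  # VCS456 from the registry, CAR123 from ARB
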
/offsets_db_data/data.py:
--------------------------------------------------------------------------------
1 | import intake
2 | import pkg_resources
3 |
4 | catalog_file = pkg_resources.resource_filename('offsets_db_data', 'catalog.yaml')
5 | catalog = intake.open_catalog(catalog_file)
6 |
--------------------------------------------------------------------------------
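A quick sketch of reading the processed tables through the intake catalog defined above; 'credits' and 'projects' are the catalog entries used elsewhere in this package, and the date is a placeholder for an available snapshot:

from offsets_db_data.data import catalog

# Each entry is parameterized by snapshot date (placeholder shown here).
credits = catalog['credits'](date='2024-01-01').read()
projects = catalog['projects'](date='2024-01-01').read()
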
/offsets_db_data/gld.py:
--------------------------------------------------------------------------------
1 | import numpy as np # noqa: F401, I001
2 | import pandas as pd
3 | import pandas_flavor as pf
4 |
5 | from offsets_db_data.common import (
6 | BERKELEY_PROJECT_TYPE_UPATH,
7 | CREDIT_SCHEMA_UPATH,
8 | PROJECT_SCHEMA_UPATH,
9 | load_column_mapping,
10 | load_inverted_protocol_mapping,
11 | load_registry_project_column_mapping,
12 | load_type_category_mapping,
13 | )
14 | from offsets_db_data.credits import aggregate_issuance_transactions # noqa: F401
15 | from offsets_db_data.credits import filter_and_merge_transactions # noqa: F401
16 | from offsets_db_data.credits import merge_with_arb # noqa: F401
17 | from offsets_db_data.credits import harmonize_beneficiary_data
18 | from offsets_db_data.models import credit_without_id_schema, project_schema
19 | from offsets_db_data.projects import add_category # noqa: F401
20 | from offsets_db_data.projects import add_first_issuance_and_retirement_dates # noqa: F401
21 | from offsets_db_data.projects import add_is_compliance_flag # noqa: F401
22 | from offsets_db_data.projects import add_retired_and_issued_totals # noqa: F401
23 | from offsets_db_data.projects import harmonize_country_names # noqa: F401
24 | from offsets_db_data.projects import harmonize_status_codes # noqa: F401
25 | from offsets_db_data.projects import map_protocol # noqa: F401
26 |
27 |
28 | @pf.register_dataframe_method
29 | def determine_gld_transaction_type(df: pd.DataFrame, *, download_type: str) -> pd.DataFrame:
30 | """
31 | Assign a transaction type to each record in the DataFrame based on the download type for Gold Standard transactions.
32 |
33 | Parameters
34 | ----------
35 | df : pd.DataFrame
36 | Input DataFrame containing transaction data.
37 | download_type : str
38 | Type of transaction ('issuances', 'retirements') to determine the transaction type.
39 |
40 | Returns
41 | -------
42 | pd.DataFrame
43 | DataFrame with a new 'transaction_type' column, containing assigned transaction types based on download_type.
44 | """
45 |
46 | transaction_type_mapping = {'issuances': 'issuance', 'retirements': 'retirement'}
47 | df['transaction_type'] = transaction_type_mapping[download_type]
48 | return df
49 |
50 |
51 | @pf.register_dataframe_method
52 | def add_gld_project_id(df: pd.DataFrame, *, prefix: str) -> pd.DataFrame:
53 | """
54 | Add Gold Standard project IDs to the DataFrame
55 |
56 | Parameters
57 | ----------
58 | df : pd.DataFrame
59 | Input DataFrame containing credits data.
60 | prefix : str
61 | Prefix string to prepend to each project ID.
62 |
63 | Returns
64 | -------
65 | pd.DataFrame
66 | DataFrame with a new 'project_id' column, containing the generated project IDs.
67 | """
68 |
69 | df['project_id'] = prefix + df['project_id'].astype(str)
70 | return df
71 |
72 |
73 | @pf.register_dataframe_method
74 | def process_gld_credits(
75 | df: pd.DataFrame,
76 | *,
77 | download_type: str,
78 | registry_name: str = 'gold-standard',
79 | prefix: str = 'GLD',
80 | arb: pd.DataFrame | None = None,
81 | harmonize_beneficiary_info: bool = False,
82 | ) -> pd.DataFrame:
83 | """
84 | Process Gold Standard credits data by renaming columns, setting registry, determining transaction types,
85 | adding project IDs, converting date columns, aggregating issuances (if applicable), and validating the schema.
86 |
87 | Parameters
88 | ----------
89 | df : pd.DataFrame
90 | Input DataFrame with raw Gold Standard credits data.
91 | download_type : str
92 | Type of download ('issuances' or 'retirements').
93 | registry_name : str, optional
94 | Name of the registry for setting and mapping columns (default is 'gold-standard').
95 | prefix : str, optional
96 | Prefix for generating project IDs (default is 'GLD').
97 | arb : pd.DataFrame | None, optional
98 | Additional DataFrame for data merging (default is None).
99 |
100 | Returns
101 | -------
102 | pd.DataFrame
103 | Processed DataFrame with Gold Standard credits data.
104 | """
105 |
106 | column_mapping = load_column_mapping(
107 | registry_name=registry_name, download_type=download_type, mapping_path=CREDIT_SCHEMA_UPATH
108 | )
109 |
110 | columns = {v: k for k, v in column_mapping.items()}
111 |
112 | df = df.copy()
113 |
114 | if not df.empty:
115 | data = (
116 | df.rename(columns=columns)
117 | .set_registry(registry_name=registry_name)
118 | .determine_gld_transaction_type(download_type=download_type)
119 | .add_gld_project_id(prefix=prefix)
120 | )
121 |         # split on T and discard the time component for consistency
122 | data['transaction_date'] = data['transaction_date'].str.split('T').str[0]
123 | data = data.convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d')
124 |
125 | if download_type == 'issuances':
126 | data = data.aggregate_issuance_transactions()
127 |
128 | data = data.add_missing_columns(schema=credit_without_id_schema).validate(
129 | schema=credit_without_id_schema
130 | )
131 |
132 | if arb is not None and not arb.empty:
133 | data = data.merge_with_arb(arb=arb)
134 |
135 | else:
136 | data = (
137 | pd.DataFrame(columns=credit_without_id_schema.columns.keys())
138 | .add_missing_columns(schema=credit_without_id_schema)
139 | .convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d')
140 | .add_missing_columns(schema=credit_without_id_schema)
141 | .validate(schema=credit_without_id_schema)
142 | )
143 |
144 | if harmonize_beneficiary_info:
145 | data = data.pipe(
146 | harmonize_beneficiary_data, registry_name=registry_name, download_type=download_type
147 | )
148 |
149 | data = (
150 | data.add_missing_columns(schema=credit_without_id_schema)
151 | .convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d')
152 | .validate(schema=credit_without_id_schema)
153 | )
154 |
155 | return data
156 |
157 |
158 | @pf.register_dataframe_method
159 | def add_gld_project_url(df: pd.DataFrame) -> pd.DataFrame:
160 |     """Add URLs for Gold Standard projects.
161 |
162 |     GS project IDs differ from the IDs used in Gold Standard registry URLs, so the URL links to a search query instead.
163 |
164 | Parameters
165 | ----------
166 | df : pd.DataFrame
167 | Input DataFrame containing Gold Standard project data.
168 |
169 | Returns
170 | -------
171 | pd.DataFrame
172 | DataFrame with a new 'project_url' column, containing URLs for each project.
173 | """
174 | df['project_url'] = 'https://registry.goldstandard.org/projects?q=gs' + df['project_id'].apply(
175 | str
176 | )
177 | return df
178 |
179 |
180 | @pf.register_dataframe_method
181 | def process_gld_projects(
182 | df: pd.DataFrame,
183 | *,
184 | credits: pd.DataFrame,
185 | registry_name: str = 'gold-standard',
186 | prefix: str = 'GLD',
187 | ) -> pd.DataFrame:
188 | """
189 | Process Gold Standard projects data, including renaming, adding, and validating columns, harmonizing statuses,
190 | and merging with credits data.
191 |
192 | Parameters
193 | ----------
194 | df : pd.DataFrame
195 | Input DataFrame with raw Gold Standard projects data.
196 | credits : pd.DataFrame
197 | DataFrame containing credits data for merging.
198 | registry_name : str, optional
199 | Name of the registry for specific processing steps (default is 'gold-standard').
200 | prefix : str, optional
201 | Prefix for generating project IDs (default is 'GLD').
202 |
203 | Returns
204 | -------
205 | pd.DataFrame
206 | Processed DataFrame with harmonized and validated Gold Standard projects data.
207 | """
208 |
209 | registry_project_column_mapping = load_registry_project_column_mapping(
210 | registry_name=registry_name, file_path=PROJECT_SCHEMA_UPATH
211 | )
212 | inverted_column_mapping = {value: key for key, value in registry_project_column_mapping.items()}
213 | type_category_mapping = load_type_category_mapping()
214 | inverted_protocol_mapping = load_inverted_protocol_mapping()
215 |
216 | df = df.copy()
217 | credits = credits.copy()
218 |
219 | if not df.empty and not credits.empty:
220 | data = (
221 | df.rename(columns=inverted_column_mapping)
222 | .set_registry(registry_name=registry_name)
223 | .add_gld_project_url()
224 | .add_gld_project_id(prefix=prefix)
225 | .harmonize_country_names()
226 | .harmonize_status_codes()
227 | .map_protocol(inverted_protocol_mapping=inverted_protocol_mapping)
228 | .infer_project_type()
229 | .override_project_types(
230 | override_data_path=BERKELEY_PROJECT_TYPE_UPATH, source_str='berkeley'
231 | )
232 | .add_category(
233 | type_category_mapping=type_category_mapping
234 | ) # must come after types; type -> category
235 | .map_project_type_to_display_name(type_category_mapping=type_category_mapping)
236 | .add_is_compliance_flag()
237 | .add_retired_and_issued_totals(credits=credits)
238 | .add_first_issuance_and_retirement_dates(credits=credits)
239 | .add_missing_columns(schema=project_schema)
240 | .convert_to_datetime(columns=['listed_at', 'first_issuance_at', 'first_retirement_at'])
241 | .validate(schema=project_schema)
242 | )
243 | return data
244 |
245 | elif not df.empty and credits.empty:
246 | data = (
247 | df.rename(columns=inverted_column_mapping)
248 | .set_registry(registry_name=registry_name)
249 | .add_gld_project_url()
250 | .add_gld_project_id(prefix=prefix)
251 | .harmonize_country_names()
252 | .harmonize_status_codes()
253 | .map_protocol(inverted_protocol_mapping=inverted_protocol_mapping)
254 | .infer_project_type()
255 | .override_project_types(
256 | override_data_path=BERKELEY_PROJECT_TYPE_UPATH, source_str='berkeley'
257 | )
258 | .add_category(
259 | type_category_mapping=type_category_mapping
260 | ) # must come after types; type -> category
261 | .map_project_type_to_display_name(type_category_mapping=type_category_mapping)
262 | .add_is_compliance_flag()
263 | .add_missing_columns(schema=project_schema)
264 | .convert_to_datetime(columns=['listed_at', 'first_issuance_at', 'first_retirement_at'])
265 | .validate(schema=project_schema)
266 | )
267 | return data
268 | elif df.empty:
269 | data = (
270 | pd.DataFrame(columns=project_schema.columns.keys())
271 | .add_missing_columns(schema=project_schema)
272 | .convert_to_datetime(columns=['listed_at', 'first_issuance_at', 'first_retirement_at'])
273 | )
274 |
275 | data['is_compliance'] = data['is_compliance'].astype(bool)
276 | data = data.validate(schema=project_schema)
277 | return data
278 |
--------------------------------------------------------------------------------
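A hedged end-to-end sketch for the Gold Standard pipeline above; the CSV paths are hypothetical stand-ins for files downloaded from the registry, whose columns must match the raw-column mappings loaded via load_column_mapping and load_registry_project_column_mapping:

import pandas as pd

import offsets_db_data.gld  # noqa: F401 -- registers the Gold Standard accessors

# Hypothetical local copies of raw registry downloads.
raw_credits = pd.read_csv('gold-standard-issuances.csv')
raw_projects = pd.read_csv('gold-standard-projects.csv')

credits = raw_credits.process_gld_credits(download_type='issuances')
projects = raw_projects.process_gld_projects(credits=credits)
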
/offsets_db_data/models.py:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 | import janitor # noqa: F401
4 | import pandas as pd
5 | import pandera as pa
6 |
7 | RegistryType = typing.Literal[
8 | 'verra',
9 | 'gold-standard',
10 | 'art-trees',
11 | 'american-carbon-registry',
12 | 'climate-action-reserve',
13 | 'none',
14 | ]
15 |
16 |
17 | project_schema = pa.DataFrameSchema(
18 | {
19 | 'protocol': pa.Column(pa.Object, nullable=True), # Array of strings
20 | 'category': pa.Column(pa.String, nullable=True),
21 | 'project_type': pa.Column(pa.String, nullable=False),
22 | 'project_type_source': pa.Column(pa.String, nullable=False),
23 | 'retired': pa.Column(
24 | pa.Int, pa.Check.greater_than_or_equal_to(0), nullable=True, coerce=True
25 | ),
26 | 'issued': pa.Column(
27 | pa.Int, pa.Check.greater_than_or_equal_to(0), nullable=True, coerce=True
28 | ),
29 | 'project_id': pa.Column(pa.String, nullable=False),
30 | 'name': pa.Column(pa.String, nullable=True),
31 | 'registry': pa.Column(pa.String, nullable=False),
32 | 'proponent': pa.Column(pa.String, nullable=True),
33 | 'status': pa.Column(pa.String, nullable=True),
34 | 'country': pa.Column(pa.String, nullable=True),
35 | 'listed_at': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True),
36 | 'first_issuance_at': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True),
37 | 'first_retirement_at': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True),
38 | 'is_compliance': pa.Column(pa.Bool, nullable=True),
39 | 'project_url': pa.Column(pa.String, nullable=True),
40 | }
41 | )
42 |
43 |
44 | credit_without_id_schema = pa.DataFrameSchema(
45 | {
46 | 'quantity': pa.Column(
47 | pa.Int, pa.Check.greater_than_or_equal_to(0), nullable=True, coerce=True
48 | ),
49 | 'project_id': pa.Column(pa.String, nullable=False),
50 | 'vintage': pa.Column(pa.Int, nullable=True, coerce=True),
51 | 'transaction_date': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True),
52 | 'transaction_type': pa.Column(pa.String, nullable=True),
53 | 'retirement_account': pa.Column(pa.String, nullable=True),
54 | 'retirement_reason': pa.Column(pa.String, nullable=True),
55 | 'retirement_note': pa.Column(pa.String, nullable=True),
56 | 'retirement_beneficiary': pa.Column(pa.String, nullable=True),
57 | 'retirement_beneficiary_harmonized': pa.Column(pa.String, nullable=True),
58 | }
59 | )
60 |
61 | credit_schema = credit_without_id_schema.add_columns({'id': pa.Column(pa.Int, nullable=False)})
62 |
63 |
64 | clip_schema = pa.DataFrameSchema(
65 | {
66 | 'id': pa.Column(pa.Int, nullable=False),
67 | 'date': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True),
68 | 'title': pa.Column(pa.String, nullable=True),
69 | 'url': pa.Column(pa.String, nullable=True),
70 | 'source': pa.Column(pa.String, nullable=True),
71 | 'tags': pa.Column(pa.Object, nullable=True),
72 | 'notes': pa.Column(pa.String, nullable=True),
73 | 'is_waybacked': pa.Column(pa.Bool, nullable=True),
74 | 'type': pa.Column(pa.String, nullable=True),
75 | }
76 | )
77 |
--------------------------------------------------------------------------------
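A minimal sketch of validating a hand-built credits frame against `credit_without_id_schema`; the single row is made up, but it shows the columns and dtypes the schema expects:

import pandas as pd

from offsets_db_data.models import credit_without_id_schema

example = pd.DataFrame(
    {
        'quantity': [1_000],
        'project_id': ['VCS123'],
        'vintage': [2020],
        'transaction_date': [pd.Timestamp('2021-06-01', tz='UTC')],
        'transaction_type': ['issuance'],
        'retirement_account': [None],
        'retirement_reason': [None],
        'retirement_note': [None],
        'retirement_beneficiary': [None],
        'retirement_beneficiary_harmonized': [None],
    }
)

validated = credit_without_id_schema.validate(example)
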
/offsets_db_data/openrefine.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import shutil
3 | import subprocess
4 | import tempfile
5 | import traceback
6 |
7 | import requests
8 | import rich.console
9 | import typer
10 |
11 | app = typer.Typer(help='offsets-db-data-orcli')
12 | console = rich.console.Console()
13 |
14 |
15 | @app.command()
16 | def install(
17 | url: str = typer.Option(
18 | 'https://github.com/opencultureconsulting/orcli/raw/main/orcli',
19 | help='The URL to download orcli from.',
20 | show_default=True,
21 | ),
22 | destination: str = typer.Option(
23 | './',
24 | help='The destination path to move the downloaded file to.',
25 | show_default=True,
26 | ),
27 | ):
28 | """
29 | Install orcli from GitHub.
30 | """
31 |
32 | try:
33 | tempfile_path = (pathlib.Path(tempfile.gettempdir()) / 'orcli').as_posix()
34 |
35 | file_path = f'{destination}/orcli' if destination else 'orcli'
36 | abs_file_path = pathlib.Path(file_path).expanduser().resolve()
37 | filename = abs_file_path.as_posix()
38 | # Download orcli from GitHub
39 | # Download the file with streaming to handle large files.
40 | response = requests.get(url, stream=True)
41 | response.raise_for_status() # Raise error if the download failed.
42 |
43 | with open(tempfile_path, 'wb') as f:
44 | for chunk in response.iter_content(chunk_size=8192):
45 | if chunk: # Filter out keep-alive chunks.
46 | f.write(chunk)
47 |
48 | # Make the file executable
49 | subprocess.run(['chmod', '+x', tempfile_path], check=True)
50 | console.print(f'Moving orcli from {tempfile_path} to {filename}.')
51 | subprocess.run(['mv', tempfile_path, destination], check=True)
52 | console.print(f'orcli installed to {filename}.')
53 |
54 | except Exception as _:
55 | console.print(f'Error: {traceback.format_exc()}')
56 | raise typer.Exit(1)
57 |
58 |
59 | @app.command()
60 | def run(
61 | args: list[str] = typer.Argument(help='The arguments to pass to orcli.'),
62 | binary_path: str | None = typer.Option(
63 | None, help='The path to the orcli binary.', show_default=True
64 | ),
65 | ):
66 | """
67 | Run orcli with the specified arguments.
68 | """
69 | if binary_path is None:
70 | binary_path = shutil.which('orcli')
71 | if binary_path is None:
72 | typer.echo('orcli not found. Please install orcli first.')
73 | raise typer.Exit(1)
74 |
75 | command = [binary_path] + list(args)
76 | try:
77 | result = subprocess.run(command, check=True, capture_output=True, text=True)
78 | console.print(result.stdout)
79 | return result.stdout
80 | except subprocess.CalledProcessError as e:
81 | console.print(e.stderr)
82 | raise typer.Exit(e.returncode) from e
83 |
84 |
85 | def main():
86 | app()
87 |
88 |
89 | if __name__ == '__main__':
90 | main()
91 |
--------------------------------------------------------------------------------
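A small sketch of driving OpenRefine through the wrapper CLI above, mirroring the subprocess calls in credits.py; it assumes orcli has already been installed via the `install` command, an OpenRefine server is reachable, and the project name and file paths are placeholders:

import subprocess

# Import a CSV into a new OpenRefine project, then export it back out.
subprocess.run(
    ['offsets-db-data-orcli', 'run', '--', 'import', 'csv', 'credits.csv', '--projectName', 'demo-project'],
    capture_output=True, text=True, check=True,
)
subprocess.run(
    ['offsets-db-data-orcli', 'run', '--', 'export', 'csv', 'demo-project', '--output', 'harmonized.csv'],
    capture_output=True, text=True, check=True,
)
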
/offsets_db_data/pipeline_utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import io
3 | import tempfile
4 | import zipfile
5 | from collections.abc import Callable
6 |
7 | import fsspec
8 | import pandas as pd
9 |
10 | from offsets_db_data.data import catalog
11 | from offsets_db_data.registry import get_registry_from_project_id
12 |
13 |
14 | def validate_data(
15 | *,
16 | new_data: pd.DataFrame,
17 | as_of: datetime.datetime,
18 | data_type: str,
19 | quantity_column: str,
20 | aggregation_func,
21 | ) -> None:
22 | success = False
23 | for delta_days in [1, 2, 3, 4]:
24 | try:
25 | previous_date = (as_of - datetime.timedelta(days=delta_days)).strftime('%Y-%m-%d')
26 | print(
27 | f'Validating {data_type} for {as_of.strftime("%Y-%m-%d")} against {previous_date}'
28 | )
29 | old_data = catalog[data_type](date=previous_date).read()
30 |
31 | new_quantity = aggregation_func(new_data[quantity_column])
32 | old_quantity = aggregation_func(old_data[quantity_column])
33 |
34 | print(f'New {data_type}: {new_data.shape} | New {quantity_column}: {new_quantity}')
35 | print(f'Old {data_type}: {old_data.shape} | Old {quantity_column}: {old_quantity}')
36 |
37 | if new_quantity < old_quantity * 0.99:
38 | raise ValueError(
39 | f'New {data_type}: {new_quantity} (from {as_of.strftime("%Y-%m-%d")}) are less than 99% of old {data_type}: {old_quantity} (from {previous_date})'
40 | )
41 | else:
42 | print(f'New {data_type} are at least 99% of old {data_type}')
43 | success = True
44 | break
45 | except Exception as e:
46 | print(f'Validation failed for {delta_days} day(s) back: {e}')
47 | continue
48 |
49 | if not success:
50 | raise ValueError(
51 |             'Validation failed for 1, 2, 3, and 4 days back. Please make sure the data is available for at least one of those days.'
52 | )
53 |
54 |
55 | def validate_credits(*, new_credits: pd.DataFrame, as_of: datetime.datetime) -> None:
56 | validate_data(
57 | new_data=new_credits,
58 | as_of=as_of,
59 | data_type='credits',
60 | quantity_column='quantity',
61 | aggregation_func=sum,
62 | )
63 |
64 |
65 | def validate_projects(*, new_projects: pd.DataFrame, as_of: datetime.datetime) -> None:
66 | validate_data(
67 | new_data=new_projects,
68 | as_of=as_of,
69 | data_type='projects',
70 | quantity_column='project_id',
71 | aggregation_func=pd.Series.nunique,
72 | )
73 |
74 |
75 | def validate(
76 | *, new_credits: pd.DataFrame, new_projects: pd.DataFrame, as_of: datetime.datetime
77 | ) -> None:
78 | validate_credits(new_credits=new_credits, as_of=as_of)
79 | validate_projects(new_projects=new_projects, as_of=as_of)
80 |
81 |
82 | def summarize(
83 | *,
84 | credits: pd.DataFrame,
85 | projects: pd.DataFrame,
86 | registry_name: str | None = None,
87 | ) -> None:
88 | """
89 | Summarizes the credits, projects, and project types data.
90 |
91 | Parameters
92 | ----------
93 | credits : DataFrame
94 | The credits data.
95 | projects : DataFrame
96 | The projects data.
97 | registry_name : str, optional
98 | Name of the specific registry to summarize. If None, summarizes across all registries.
99 |
100 | Returns
101 | -------
102 | None
103 | """
104 | # Create defensive copies to avoid modifying the original dataframes
105 | credits = credits if credits.empty else credits.copy()
106 | projects = projects if projects.empty else projects.copy()
107 |
108 | # Single registry mode
109 | if registry_name:
110 | if not projects.empty:
111 | print(
112 | f'\n\nRetired and Issued (in Millions) summary for {registry_name}:\n\n'
113 | f'{projects[["retired", "issued"]].sum() / 1_000_000}\n\n'
114 | f'{projects.project_id.nunique()} unique projects.\n\n'
115 | )
116 | else:
117 | print(f'No projects found for {registry_name}...')
118 |
119 | if not credits.empty:
120 | print(
121 | f'\n\nCredits summary (in Millions) for {registry_name}:\n\n'
122 | f'{credits.groupby(["transaction_type"])[["quantity"]].sum() / 1_000_000}\n\n'
123 | f'{credits.shape[0]} total transactions.\n\n'
124 | )
125 | else:
126 | print(f'No credits found for {registry_name}...')
127 |
128 | # Multi-registry mode
129 | else:
130 | if not projects.empty:
131 | print(
132 | f'Summary Statistics for projects (in Millions):\n'
133 | f'{projects.groupby(["registry", "is_compliance"])[["retired", "issued"]].sum() / 1_000_000}\n'
134 | )
135 | else:
136 | print('No projects found')
137 |
138 | if not credits.empty:
139 | credits['registry'] = credits['project_id'].map(get_registry_from_project_id)
140 |
141 | print(
142 | f'Summary Statistics for credits (in Millions):\n'
143 | f'{credits.groupby(["registry", "transaction_type"])[["quantity"]].sum() / 1_000_000}\n'
144 | )
145 | else:
146 | print('No credits found')
147 |
148 |
149 | def to_parquet(
150 | *,
151 | credits: pd.DataFrame,
152 | projects: pd.DataFrame,
153 | output_paths: dict,
154 | registry_name: str | None = None,
155 | ):
156 | """
157 | Write the given DataFrames to Parquet files.
158 |
159 | Parameters
160 | -----------
161 | credits : pd.DataFrame
162 | The DataFrame containing credits data.
163 | projects : pd.DataFrame
164 | The DataFrame containing projects data.
165 | output_paths : dict
166 | Dictionary containing output file paths.
167 |
168 | registry_name : str, optional
169 | The name of the registry for logging purposes.
170 | """
171 | credits.to_parquet(
172 | output_paths['credits'], index=False, compression='gzip', engine='fastparquet'
173 | )
174 |
175 | prefix = f'{registry_name} ' if registry_name else ''
176 |     print(f'Wrote {prefix}credits to {output_paths["credits"]}...')
177 |
178 | projects.to_parquet(
179 | output_paths['projects'], index=False, compression='gzip', engine='fastparquet'
180 | )
181 |     print(f'Wrote {prefix}projects to {output_paths["projects"]}...')
182 |
183 |
184 | def _create_data_zip_buffer(
185 | *,
186 | credits: pd.DataFrame,
187 | projects: pd.DataFrame,
188 | format_type: str,
189 | terms_content: str,
190 | ) -> io.BytesIO:
191 | """
192 | Create a zip buffer containing data files in the specified format with terms of access.
193 |
194 | Parameters
195 | ----------
196 |     credits : pd.DataFrame
197 |         DataFrame containing credit data.
198 |     projects : pd.DataFrame
199 |         DataFrame containing project data.
200 |     format_type : str
201 |         Format type, either 'csv' or 'parquet'; determines whether the archive holds
202 |         CSV or Parquet files.
203 |     terms_content : str
204 |         Content of the terms of access file, written into the archive as
205 |         TERMS_OF_DATA_ACCESS.txt.
206 |
207 | Returns
208 | -------
209 | io.BytesIO
210 | Buffer containing the zip file.
211 | """
212 | zip_buffer = io.BytesIO()
213 |
214 | with zipfile.ZipFile(zip_buffer, 'a', zipfile.ZIP_DEFLATED, False) as zf:
215 | zf.writestr('TERMS_OF_DATA_ACCESS.txt', terms_content)
216 |
217 | if format_type == 'csv':
218 | with zf.open('credits.csv', 'w') as buffer:
219 | credits.to_csv(buffer, index=False)
220 | with zf.open('projects.csv', 'w') as buffer:
221 | projects.to_csv(buffer, index=False)
222 |
223 | elif format_type == 'parquet':
224 | # Write Parquet files to temporary files
225 | with tempfile.NamedTemporaryFile(suffix='.parquet') as temp_credits:
226 | credits.to_parquet(temp_credits.name, index=False, engine='fastparquet')
227 | temp_credits.seek(0)
228 | zf.writestr('credits.parquet', temp_credits.read())
229 |
230 | with tempfile.NamedTemporaryFile(suffix='.parquet') as temp_projects:
231 | projects.to_parquet(temp_projects.name, index=False, engine='fastparquet')
232 | temp_projects.seek(0)
233 | zf.writestr('projects.parquet', temp_projects.read())
234 |
235 | # Move to the beginning of the BytesIO buffer
236 | zip_buffer.seek(0)
237 | return zip_buffer
238 |
239 |
240 | def write_latest_production(
241 | *,
242 | credits: pd.DataFrame,
243 | projects: pd.DataFrame,
244 | bucket: str,
245 | terms_url: str = 's3://carbonplan-offsets-db/TERMS_OF_DATA_ACCESS.txt',
246 | ):
247 | """
248 | Write the latest production data to S3 as zip archives containing CSV and Parquet files.
249 |
250 | Parameters
251 | ----------
252 | credits : pd.DataFrame
253 | DataFrame containing credit data.
254 | projects : pd.DataFrame
255 | DataFrame containing project data.
256 | bucket : str
257 | S3 bucket path to write the data to.
258 | terms_url : str, optional
259 | URL of the terms of access file.
260 | """
261 | paths = {
262 | 'csv': f'{bucket}/production/latest/offsets-db.csv.zip',
263 | 'parquet': f'{bucket}/production/latest/offsets-db.parquet.zip',
264 | }
265 |
266 | # Get terms content once
267 | fs = fsspec.filesystem('s3', anon=False)
268 | terms_content = fs.read_text(terms_url)
269 |
270 | for format_type, path in paths.items():
271 | # Create zip buffer with data in the appropriate format
272 | zip_buffer = _create_data_zip_buffer(
273 | credits=credits,
274 | projects=projects,
275 | format_type=format_type,
276 | terms_content=terms_content,
277 | )
278 |
279 | # Write buffer to S3
280 | with fsspec.open(path, 'wb') as f:
281 | f.write(zip_buffer.getvalue())
282 |
283 | print(f'Wrote {format_type} to {path}...')
284 | zip_buffer.close()
285 |
286 |
287 | def transform_registry_data(
288 | *,
289 | process_credits_fn: Callable[[], pd.DataFrame],
290 | process_projects_fn: Callable[[pd.DataFrame], pd.DataFrame],
291 | output_paths: dict,
292 | registry_name: str | None = None,
293 | ):
294 | """
295 | Transform registry data by processing credits and projects, then writing to parquet files.
296 |
297 | Parameters
298 | ----------
299 | process_credits_fn : callable
300 | Function that returns processed credits DataFrame
301 | process_projects_fn : callable
302 | Function that takes a credits DataFrame and returns processed projects DataFrame
303 | output_paths : dict
304 | Dictionary containing output file paths for 'credits' and 'projects'
305 | registry_name : str, optional
306 | Name of the registry for logging purposes
307 | """
308 | # Process credits
309 | credits = process_credits_fn()
310 | if registry_name:
311 | print(f'credits for {registry_name}: {credits.head()}')
312 | else:
313 | print(f'processed credits: {credits.head()}')
314 |
315 | # Process projects
316 | projects = process_projects_fn(credits=credits)
317 | if registry_name:
318 | print(f'projects for {registry_name}: {projects.head()}')
319 | else:
320 | print(f'processed projects: {projects.head()}')
321 |
322 | # Summarize data
323 | summarize(credits=credits, projects=projects, registry_name=registry_name)
324 |
325 | # Write to parquet files
326 | to_parquet(
327 | credits=credits,
328 | projects=projects,
329 | output_paths=output_paths,
330 | registry_name=registry_name,
331 | )
332 |
333 | return credits, projects
334 |
--------------------------------------------------------------------------------
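A small, self-contained sketch of `summarize` from above, using tiny made-up frames just to show the expected columns; in the real pipeline these frames come out of the registry-specific process_*_credits / process_*_projects functions and are wired together by `transform_registry_data`:

import pandas as pd

from offsets_db_data.pipeline_utils import summarize

projects = pd.DataFrame(
    {'project_id': ['GLD1', 'GLD2'], 'retired': [1_000, 0], 'issued': [5_000, 2_000]}
)
credits = pd.DataFrame(
    {
        'project_id': ['GLD1', 'GLD1', 'GLD2'],
        'transaction_type': ['issuance', 'retirement', 'issuance'],
        'quantity': [5_000, 1_000, 2_000],
    }
)

summarize(credits=credits, projects=projects, registry_name='gold-standard')
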
/offsets_db_data/projects.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | import json
3 |
4 | import country_converter as coco
5 | import janitor # noqa: F401
6 | import numpy as np
7 | import pandas as pd
8 | import pandas_flavor as pf
9 |
10 |
11 | @pf.register_dataframe_method
12 | def harmonize_country_names(df: pd.DataFrame, *, country_column: str = 'country') -> pd.DataFrame:
13 | """
14 | Harmonize country names in the DataFrame to standardized country names.
15 |
16 | Parameters
17 | ----------
18 | df : pd.DataFrame
19 | Input DataFrame with country data.
20 | country_column : str, optional
21 | The name of the column containing country names to be harmonized (default is 'country').
22 |
23 | Returns
24 | -------
25 | pd.DataFrame
26 | DataFrame with harmonized country names in the specified column.
27 | """
28 |
29 | print('Harmonizing country names...')
30 | cc = coco.CountryConverter()
31 | df[country_column] = cc.pandas_convert(df[country_column], to='name')
32 | print('Done converting country names...')
33 | return df
34 |
35 |
36 | @pf.register_dataframe_method
37 | def add_category(df: pd.DataFrame, *, type_category_mapping: dict) -> pd.DataFrame:
38 | """
39 | Add a category to each record in the DataFrame based on its protocol.
40 |
41 | Parameters
42 | ----------
43 | df : pd.DataFrame
44 | Input DataFrame containing protocol data.
45 | type_category_mapping : dict
46 | Dictionary mapping types to categories.
47 |
48 | Returns
49 | -------
50 | pd.DataFrame
51 | DataFrame with a new 'category' column, derived from the protocol information.
52 | """
53 |
54 | print('Adding category based on protocol...')
55 | df['category'] = (
56 | df['project_type']
57 | .str.lower()
58 | .map({key.lower(): value['category'] for key, value in type_category_mapping.items()})
59 | .fillna('unknown')
60 | )
61 | return df
62 |
63 |
64 | @pf.register_dataframe_method
65 | def override_project_types(df: pd.DataFrame, *, override_data_path: str, source_str: str):
66 | """
67 |     Override project types in the DataFrame with values from an external override file.
68 |     We treat Berkeley data as the source of truth for most project types.
69 |
70 | Parameters
71 | ----------
72 | df : pd.DataFrame
73 | Input DataFrame containing project data.
74 |     override_data_path : str
75 |         Path to the JSON file containing the override data
76 |     source_str : str
77 |         Value to write to `project_type_source` when applying override values
78 |
79 | Returns
80 | -------
81 | pd.DataFrame
82 | DataFrame with a 'project_type' column overridden by all values in override_data.
83 | """
84 |
85 | override_d = json.load(open(override_data_path))
86 | df['project_type'] = df['project_id'].map(override_d).fillna(df['project_type'])
87 | df.loc[df['project_id'].isin(list(override_d.keys())), 'project_type_source'] = source_str
88 |
89 | return df
90 |
91 |
92 | @pf.register_dataframe_method
93 | def infer_project_type(df: pd.DataFrame) -> pd.DataFrame:
94 | """
95 | Add project types to the DataFrame based on project characteristics
96 |
97 | Parameters
98 | ----------
99 | df : pd.DataFrame
100 | Input DataFrame containing project data.
101 |
102 | Returns
103 | -------
104 | pd.DataFrame
105 |         DataFrame with new 'project_type' and 'project_type_source' columns; 'project_type' defaults to 'unknown'.
106 | """
107 | df.loc[:, 'project_type'] = 'unknown'
108 | df.loc[:, 'project_type_source'] = 'carbonplan'
109 | df.loc[df.apply(lambda x: 'art-trees' in x['protocol'], axis=1), 'project_type'] = 'redd+'
110 |
111 | df.loc[df.apply(lambda x: 'acr-ifm-nonfed' in x['protocol'], axis=1), 'project_type'] = (
112 | 'improved forest management'
113 | )
114 | df.loc[df.apply(lambda x: 'acr-abandoned-wells' in x['protocol'], axis=1), 'project_type'] = (
115 | 'plugging oil & gas wells'
116 | )
117 |
118 | df.loc[df.apply(lambda x: 'arb-mine-methane' in x['protocol'], axis=1), 'project_type'] = (
119 | 'mine methane capture'
120 | )
121 |
122 | df.loc[df.apply(lambda x: 'vm0048' in x['protocol'], axis=1), 'project_type'] = 'redd+'
123 | df.loc[df.apply(lambda x: 'vm0047' in x['protocol'], axis=1), 'project_type'] = (
124 | 'afforestation/reforestation'
125 | )
126 | df.loc[df.apply(lambda x: 'vm0045' in x['protocol'], axis=1), 'project_type'] = (
127 | 'improved forest management'
128 | )
129 | df.loc[df.apply(lambda x: 'vm0042' in x['protocol'], axis=1), 'project_type'] = 'agriculture'
130 | df.loc[df.apply(lambda x: 'vm0007' in x['protocol'], axis=1), 'project_type'] = 'redd+'
131 |
132 | return df
133 |
134 |
135 | @pf.register_dataframe_method
136 | def map_project_type_to_display_name(
137 | df: pd.DataFrame, *, type_category_mapping: dict
138 | ) -> pd.DataFrame:
139 | """
140 | Map project types in the DataFrame to display names based on a mapping dictionary.
141 |
142 | Parameters
143 | ----------
144 | df : pd.DataFrame
145 | Input DataFrame containing project data.
146 | type_category_mapping : dict
147 | Dictionary mapping project type strings to display names.
148 |
149 | Returns
150 | -------
151 | pd.DataFrame
152 | DataFrame with a new 'project_type' column, containing mapped display names.
153 | """
154 |
155 | print('Mapping project types to display names...')
156 | df['project_type'] = (
157 | df['project_type']
158 | .map(
159 | {
160 | key.lower(): value['project-type-display-name']
161 | for key, value in type_category_mapping.items()
162 | }
163 | )
164 | .fillna('Unknown')
165 | )
166 | return df
167 |
168 |
169 | @pf.register_dataframe_method
170 | def add_is_compliance_flag(df: pd.DataFrame) -> pd.DataFrame:
171 | """
172 | Add a compliance flag to the DataFrame based on the protocol.
173 |
174 | Parameters
175 | ----------
176 | df : pd.DataFrame
177 | Input DataFrame containing protocol data.
178 |
179 | Returns
180 | -------
181 | pd.DataFrame
182 | DataFrame with a new 'is_compliance' column, indicating if the protocol starts with 'arb-'.
183 | """
184 |
185 | print('Adding is_compliance flag...')
186 | df['is_compliance'] = df.apply(
187 | lambda row: np.any([protocol_str.startswith('arb-') for protocol_str in row['protocol']]),
188 | axis=1,
189 | )
190 | return df
191 |
192 |
193 | @pf.register_dataframe_method
194 | def map_protocol(
195 | df: pd.DataFrame,
196 | *,
197 | inverted_protocol_mapping: dict,
198 | original_protocol_column: str = 'original_protocol',
199 | ) -> pd.DataFrame:
200 | """
201 | Map protocols in the DataFrame to standardized names based on an inverted protocol mapping.
202 |
203 | Parameters
204 | ----------
205 | df : pd.DataFrame
206 | Input DataFrame containing protocol data.
207 | inverted_protocol_mapping : dict
208 | Dictionary mapping protocol strings to standardized protocol names.
209 | original_protocol_column : str, optional
210 | Name of the column containing original protocol information (default is 'original_protocol').
211 |
212 | Returns
213 | -------
214 | pd.DataFrame
215 | DataFrame with a new 'protocol' column, containing mapped protocol names.
216 | """
217 |
218 | print('Mapping protocol based on known string...')
219 | try:
220 | df['protocol'] = df[original_protocol_column].apply(
221 | lambda item: find_protocol(
222 | search_string=item, inverted_protocol_mapping=inverted_protocol_mapping
223 | )
224 | )
225 | except KeyError:
226 |         # art-trees doesn't have a protocol column
227 | df['protocol'] = [['unknown']] * len(df) # protocol column is nested list
228 |
229 | return df
230 |
231 |
232 | @pf.register_dataframe_method
233 | def harmonize_status_codes(df: pd.DataFrame, *, status_column: str = 'status') -> pd.DataFrame:
234 | """Harmonize project status codes across registries
235 |
236 | Excludes ACR, as it requires special treatment across two columns
237 |
238 | Parameters
239 | ----------
240 | df : pd.DataFrame
241 | Input DataFrame with project status data.
242 | status_column : str, optional
243 | Name of the column containing status codes to harmonize (default is 'status').
244 |
245 | Returns
246 | -------
247 | pd.DataFrame
248 | DataFrame with harmonized project status codes.
249 | """
250 | print('Harmonizing status codes')
251 | with contextlib.suppress(KeyError):
252 | CAR_STATES = {
253 | 'Registered': 'registered',
254 | 'Completed': 'completed',
255 | 'Listed': 'listed',
256 | 'Transitioned': 'unknown',
257 | }
258 |
259 | VERRA_STATES = {
260 | 'Under validation': 'listed',
261 | 'Under development': 'listed',
262 | 'Registration requested': 'listed',
263 | 'Registration and verification approval requested': 'listed',
264 | 'Withdrawn': 'completed',
265 | 'On Hold': 'registered',
266 | 'Units Transferred from Approved GHG Program': 'unknown',
267 | 'Rejected by Administrator': 'completed',
268 | 'Crediting Period Renewal Requested': 'registered',
269 | 'Inactive': 'completed',
270 | 'Crediting Period Renewal and Verification Approval Requested': 'registered',
271 | }
272 |
273 | GS_STATES = {
274 | 'GOLD_STANDARD_CERTIFIED_PROJECT': 'registered',
275 | 'LISTED': 'listed',
276 | 'GOLD_STANDARD_CERTIFIED_DESIGN': 'registered',
277 | }
278 |
279 | state_dict = CAR_STATES | VERRA_STATES | GS_STATES
280 | df[status_column] = df[status_column].apply(lambda x: state_dict.get(x, 'unknown'))
281 | return df
282 |
283 |
284 | def find_protocol(
285 | *, search_string: str, inverted_protocol_mapping: dict[str, list[str]]
286 | ) -> list[str]:
287 | """Match known strings of project methodologies to internal topology
288 |
289 | Unmatched strings are passed through to the database, until such time that we update mapping data.
290 | """
291 | if pd.isna(search_string): # handle nan case, which crops up in verra data right now
292 | return ['unknown']
293 | if known_match := inverted_protocol_mapping.get(search_string.strip()):
294 | return known_match # inverted_mapping returns lst
295 | print(f"'{search_string}' is unmapped in full protocol mapping")
296 | return [search_string]
297 |
298 |
299 | def get_protocol_category(*, protocol_strs: list[str] | str, protocol_mapping: dict) -> list[str]:
300 | """
301 | Get category based on protocol string
302 |
303 | Parameters
304 | ----------
305 | protocol_strs : str or list
306 | single protocol string or list of protocol strings
307 |
308 | protocol_mapping: dict
309 | metadata about normalized protocol strings
310 |
311 | Returns
312 | -------
313 | categories : list[str]
314 | list of category strings
315 | """
316 |
317 | def _get_category(protocol_str, protocol_mapping):
318 | try:
319 | return protocol_mapping.get(protocol_str).get('category', 'unknown')
320 | except AttributeError:
321 | return 'unknown'
322 |
323 | if isinstance(protocol_strs, str):
324 | protocol_strs = [protocol_strs]
325 | categories = [_get_category(protocol_str, protocol_mapping) for protocol_str in protocol_strs]
326 | return list(
327 | set(categories)
328 | ) # if multiple protocols have same category, just return category once
329 |
330 |
331 | @pf.register_dataframe_method
332 | def add_first_issuance_and_retirement_dates(
333 | projects: pd.DataFrame, *, credits: pd.DataFrame
334 | ) -> pd.DataFrame:
335 | """
336 | Add the first issuance date of carbon credits to each project in the projects DataFrame.
337 |
338 | Parameters
339 | ----------
340 |     projects : pd.DataFrame
341 |         A pandas DataFrame containing project data with a 'project_id' column.
342 |     credits : pd.DataFrame
343 |         A pandas DataFrame containing credit issuance data with columns 'project_id', 'transaction_date', and 'transaction_type'.
344 |
345 | Returns
346 | -------
347 | projects : pd.DataFrame
348 | A pandas DataFrame which is the original projects DataFrame with two additional columns 'first_issuance_at' representing
349 | the first issuance date of each project and 'first_retirement_at' representing the first retirement date of each project.
350 | """
351 |
352 | first_issuance = (
353 | credits[credits['transaction_type'] == 'issuance']
354 | .groupby('project_id')['transaction_date']
355 | .min()
356 | .reset_index()
357 | )
358 | first_retirement = (
359 | credits[credits['transaction_type'].str.contains('retirement')]
360 | .groupby('project_id')['transaction_date']
361 | .min()
362 | .reset_index()
363 | )
364 |
365 | # Merge the projects DataFrame with the first issuance and retirement dates
366 | projects_with_dates = pd.merge(projects, first_issuance, on='project_id', how='left')
367 | projects_with_dates = pd.merge(
368 | projects_with_dates, first_retirement, on='project_id', how='left'
369 | )
370 |
371 | # Rename the merged columns for clarity
372 | projects_with_dates = projects_with_dates.rename(
373 | columns={
374 | 'transaction_date_x': 'first_issuance_at',
375 | 'transaction_date_y': 'first_retirement_at',
376 | }
377 | )
378 |
379 | return projects_with_dates
380 |
381 |
382 | @pf.register_dataframe_method
383 | def add_retired_and_issued_totals(projects: pd.DataFrame, *, credits: pd.DataFrame) -> pd.DataFrame:
384 | """
385 | Add total quantities of issued and retired credits to each project.
386 |
387 | Parameters
388 | ----------
389 | projects : pd.DataFrame
390 | DataFrame containing project data.
391 | credits : pd.DataFrame
392 | DataFrame containing credit transaction data.
393 |
394 | Returns
395 | -------
396 | pd.DataFrame
397 | DataFrame with two new columns: 'issued' and 'retired', representing the total quantities of issued and retired credits.
398 | """
399 |
400 | # Drop conflicting columns if they exist
401 | projects = projects.drop(columns=['issued', 'retired'], errors='ignore')
402 |
403 | # # filter out the projects that are not in the credits data
404 | # credits = credits[credits['project_id'].isin(projects['project_id'].unique())]
405 |     # group by project and transaction type, then sum quantities
406 | credit_totals = (
407 | credits.groupby(['project_id', 'transaction_type'])['quantity'].sum().reset_index()
408 | )
409 | # pivot the table
410 | credit_totals_pivot = credit_totals.pivot(
411 | index='project_id', columns='transaction_type', values='quantity'
412 | ).reset_index()
413 |
414 | # merge with projects
415 | projects_combined = pd.merge(
416 | projects,
417 | credit_totals_pivot[['project_id', 'issuance', 'retirement']],
418 | left_on='project_id',
419 | right_on='project_id',
420 | how='left',
421 | )
422 |
423 | # rename columns for clarity
424 | projects_combined = projects_combined.rename(
425 | columns={'issuance': 'issued', 'retirement': 'retired'}
426 | )
427 |
428 | # replace Nans with 0 if any
429 | projects_combined[['issued', 'retired']] = projects_combined[['issued', 'retired']].fillna(0)
430 |
431 | return projects_combined
432 |
--------------------------------------------------------------------------------
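A short sketch of the protocol helpers above with a hypothetical mapping fragment; the real mappings are loaded via load_inverted_protocol_mapping in offsets_db_data.common:

from offsets_db_data.projects import find_protocol, get_protocol_category

# Hypothetical mapping fragments for illustration only.
inverted_protocol_mapping = {'VM0007 REDD+ Methodology Framework': ['vm0007']}
protocol_mapping = {'vm0007': {'category': 'forest'}}

protocols = find_protocol(
    search_string='VM0007 REDD+ Methodology Framework',
    inverted_protocol_mapping=inverted_protocol_mapping,
)  # -> ['vm0007']
categories = get_protocol_category(protocol_strs=protocols, protocol_mapping=protocol_mapping)
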
/offsets_db_data/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/offsets_db_data/py.typed
--------------------------------------------------------------------------------
/offsets_db_data/registry.py:
--------------------------------------------------------------------------------
1 | REGISTRY_ABBR_MAP = {
2 | 'vcs': 'verra',
3 | 'car': 'climate-action-reserve',
4 | 'acr': 'american-carbon-registry',
5 | 'art': 'art-trees',
6 | 'gld': 'gold-standard',
7 | }
8 |
9 |
10 | def get_registry_from_project_id(project_id: str) -> str:
11 | """
12 | Retrieve the full registry name from a project ID using a predefined abbreviation mapping.
13 |
14 | Parameters
15 | ----------
16 | project_id : str
17 | The project ID whose registry needs to be identified.
18 |
19 | Returns
20 | -------
21 | str
22 | The full name of the registry corresponding to the abbreviation in the project ID.
23 |
24 | Notes
25 | -----
26 | - The function expects the first three characters of the project ID to be the abbreviation of the registry.
27 | - It uses a predefined mapping (`REGISTRY_ABBR_MAP`) to convert the abbreviation to the full registry name.
28 | - The project ID is converted to lowercase to ensure case-insensitive matching.
29 | - The function raises a KeyError if the abbreviation is not found in `REGISTRY_ABBR_MAP`.
30 | """
31 |
32 | lowered_id = project_id.lower()
33 | return REGISTRY_ABBR_MAP[lowered_id[:3]]
34 |
--------------------------------------------------------------------------------
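For example, the registry helper above maps the three-letter prefix of a project ID to the full registry name:

from offsets_db_data.registry import get_registry_from_project_id

print(get_registry_from_project_id('VCS1234'))  # -> 'verra'
print(get_registry_from_project_id('GLD500'))   # -> 'gold-standard'
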
/offsets_db_data/vcs.py:
--------------------------------------------------------------------------------
1 | import numpy as np # noqa: F401
2 | import pandas as pd
3 | import pandas_flavor as pf
4 |
5 | from offsets_db_data.common import (
6 | BERKELEY_PROJECT_TYPE_UPATH,
7 | CREDIT_SCHEMA_UPATH,
8 | PROJECT_SCHEMA_UPATH,
9 | load_column_mapping,
10 | load_inverted_protocol_mapping,
11 | load_registry_project_column_mapping,
12 | load_type_category_mapping,
13 | )
14 | from offsets_db_data.credits import * # noqa: F403
15 | from offsets_db_data.credits import harmonize_beneficiary_data
16 | from offsets_db_data.models import credit_without_id_schema, project_schema
17 | from offsets_db_data.projects import * # noqa: F403
18 |
19 |
20 | @pf.register_dataframe_method
21 | def generate_vcs_project_ids(df: pd.DataFrame, *, prefix: str) -> pd.DataFrame:
22 | """
23 | Generate Verra project IDs by concatenating a specified prefix with the 'ID' column of the DataFrame.
24 |
25 | Parameters
26 | ----------
27 | df : pd.DataFrame
28 | Input DataFrame containing Verra project data.
29 | prefix : str
30 | Prefix string to prepend to each project ID.
31 |
32 | Returns
33 | -------
34 | pd.DataFrame
35 | DataFrame with a new 'project_id' column, containing the generated project IDs.
36 | """
37 |
38 | df['project_id'] = prefix + df['ID'].astype(str)
39 | return df
40 |
41 |
42 | @pf.register_dataframe_method
43 | def determine_vcs_transaction_type(df: pd.DataFrame, *, date_column: str) -> pd.DataFrame:
44 | """
45 | Determine the transaction type for Verra transactions based on a specified date column.
46 | Transactions with non-null date values are labeled as 'retirement', else as 'issuance'.
47 |
48 | Parameters
49 | ----------
50 | df : pd.DataFrame
51 | Input DataFrame with transaction data.
52 | date_column : str
53 | Name of the column in the DataFrame used to determine the transaction type.
54 |
55 | Returns
56 | -------
57 | pd.DataFrame
58 | DataFrame with a new 'transaction_type' column indicating the type of each transaction.
59 | """
60 |
61 |     # Verra doesn't have a transaction type column, and doesn't differentiate between retirements and cancellations
62 | # So we'll use the date column to determine whether a transaction is a retirement or issuance and set the
63 | # transaction type accordingly
64 | df['transaction_type'] = df[date_column].apply(
65 | lambda x: 'retirement' if pd.notnull(x) else 'issuance'
66 | )
67 | return df
68 |
69 |
70 | @pf.register_dataframe_method
71 | def set_vcs_transaction_dates(
72 | df: pd.DataFrame, *, date_column: str, fallback_column: str
73 | ) -> pd.DataFrame:
74 | """
75 | Set the transaction dates in a DataFrame, using a primary date column and a fallback column.
76 |
77 | Parameters
78 | ----------
79 | df : pd.DataFrame
80 | Input DataFrame with transaction data.
81 | date_column : str
82 | Primary column to use for transaction dates.
83 | fallback_column : str
84 | Column to use as fallback for transaction dates when primary column is null.
85 |
86 | Returns
87 | -------
88 | pd.DataFrame
89 | DataFrame with a new 'transaction_date' column, containing the determined dates.
90 | """
91 |
92 | df['transaction_date'] = df[date_column].where(df[date_column].notnull(), df[fallback_column])
93 | return df
94 |
95 |
96 | @pf.register_dataframe_method
97 | def set_vcs_vintage_year(df: pd.DataFrame, *, date_column: str) -> pd.DataFrame:
98 | """
99 | Set the vintage year for Verra transactions based on a date column formatted as '%d/%m/%Y'.
100 |
101 | Parameters
102 | ----------
103 | df : pd.DataFrame
104 | Input DataFrame with transaction data.
105 | date_column : str
106 | Name of the column containing date information to extract the vintage year from.
107 |
108 | Returns
109 | -------
110 | pd.DataFrame
111 | DataFrame with a new 'vintage' column, containing the vintage year of each transaction.
112 | """
113 |
114 | try:
115 | df[date_column] = pd.to_datetime(df[date_column], format='%d/%m/%Y', utc=True)
116 | except ValueError:
117 | df[date_column] = pd.to_datetime(df[date_column], utc=True)
118 | df['vintage'] = df[date_column].dt.year
119 | return df
120 |
121 |
122 | @pf.register_dataframe_method
123 | def calculate_vcs_issuances(df: pd.DataFrame) -> pd.DataFrame:
124 |     """Logic to calculate Verra issuance transactions from preprocessed transaction data
125 |
126 | Verra allows rolling/partial issuances. This requires inferring vintage issuance from `Total Vintage Quantity`
127 |
128 | Parameters
129 | ----------
130 | df : pd.DataFrame
131 | Input DataFrame with preprocessed transaction data.
132 |
133 | Returns
134 | -------
135 | pd.DataFrame
136 | DataFrame containing only issuance transactions with deduplicated and renamed columns.
137 | """
138 |
139 | df_issuance = df.sort_values('transaction_date').drop_duplicates(
140 | ['vintage', 'project_id', 'Total Vintage Quantity'], keep='first'
141 | )
142 |
143 | df_issuance = df_issuance.rename(columns={'Total Vintage Quantity': 'quantity'})
144 |
145 | df_issuance['transaction_type'] = 'issuance'
146 |
147 | return df_issuance
148 |
149 |
150 | @pf.register_dataframe_method
151 | def calculate_vcs_retirements(df: pd.DataFrame) -> pd.DataFrame:
152 | """
153 | Calculate retirements and cancellations for Verra transactions. The data does not allow
154 | distinguishing between retirements and cancellations.
155 |
156 | Parameters
157 | ----------
158 | df : pd.DataFrame
159 | Input DataFrame with Verra transaction data.
160 |
161 | Returns
162 | -------
163 | pd.DataFrame
164 | DataFrame containing only retirement transactions with renamed columns.
165 | """
166 |
167 | retirements = df[df['transaction_type'] != 'issuance']
168 | retirements = retirements.rename(columns={'Quantity Issued': 'quantity'})
169 | return retirements
170 |
171 |
172 | @pf.register_dataframe_method
173 | def process_vcs_credits(
174 | df: pd.DataFrame,
175 | *,
176 | download_type: str = 'transactions',
177 | registry_name: str = 'verra',
178 | prefix: str = 'VCS',
179 | arb: pd.DataFrame | None = None,
180 | harmonize_beneficiary_info: bool = False,
181 | ) -> pd.DataFrame:
182 | """
183 | Process Verra credits data, including generation of project IDs, determination of transaction types,
184 | setting transaction dates, and various data transformations and validations.
185 |
186 | Parameters
187 | ----------
188 | df : pd.DataFrame
189 | Input DataFrame with raw credits data.
190 | download_type : str, optional
191 | Type of download operation performed (default is 'transactions').
192 | registry_name : str, optional
193 | Name of the registry (default is 'verra').
194 | prefix : str, optional
195 | Prefix for generating project IDs (default is 'VCS').
196 | arb : pd.DataFrame | None, optional
197 | DataFrame for additional data merging (default is None).
198 |
199 | Returns
200 | -------
201 | pd.DataFrame
202 | Processed DataFrame with Verra credits data.
203 | """
204 |
205 | df = df.copy()
206 | data = (
207 | df.set_registry(registry_name=registry_name)
208 | .generate_vcs_project_ids(prefix=prefix)
209 | .determine_vcs_transaction_type(date_column='Retirement/Cancellation Date')
210 | .set_vcs_transaction_dates(
211 | date_column='Retirement/Cancellation Date', fallback_column='Issuance Date'
212 | )
213 | .clean_and_convert_numeric_columns(columns=['Total Vintage Quantity', 'Quantity Issued'])
214 | .set_vcs_vintage_year(date_column='Vintage End')
215 | .convert_to_datetime(columns=['transaction_date'], dayfirst=True)
216 | )
217 |
218 | issuances = data.calculate_vcs_issuances()
219 | retirements = data.calculate_vcs_retirements()
220 |
221 | column_mapping = load_column_mapping(
222 | registry_name=registry_name, download_type=download_type, mapping_path=CREDIT_SCHEMA_UPATH
223 | )
224 |
225 | columns = {v: k for k, v in column_mapping.items()}
226 |
227 | merged_df = pd.concat([issuances, retirements]).reset_index(drop=True).rename(columns=columns)
228 |
229 | issuances = merged_df.aggregate_issuance_transactions()
230 | retirements = merged_df[merged_df['transaction_type'].str.contains('retirement')]
231 | data = (
232 | pd.concat([issuances, retirements])
233 | .reset_index(drop=True)
234 | .add_missing_columns(schema=credit_without_id_schema)
235 | .validate(schema=credit_without_id_schema)
236 | )
237 |
238 | if arb is not None and not arb.empty:
239 | data = data.merge_with_arb(arb=arb)
240 |
241 | if harmonize_beneficiary_info:
242 | data = data.pipe(
243 | harmonize_beneficiary_data, registry_name=registry_name, download_type=download_type
244 | )
245 |
246 | data = (
247 | data.add_missing_columns(schema=credit_without_id_schema)
248 | .convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d')
249 | .validate(schema=credit_without_id_schema)
250 | )
251 |
252 | return data
253 |
254 |
255 | @pf.register_dataframe_method
256 | def add_vcs_compliance_projects(df: pd.DataFrame) -> pd.DataFrame:
257 | """
258 | Add details about two compliance projects to projects database.
259 |
260 | Parameters
261 | ----------
262 | df : pd.DataFrame
263 | A pandas DataFrame containing project data with a 'project_id' column.
264 |
265 | Returns
266 | --------
267 | df: pd.DataFrame
268 | A pandas DataFrame with two additional rows, describing two projects from the mostly unused Verra compliance
269 | registry portal.
270 | """
271 |
272 | vcs_project_dicts = [
273 | {
274 | 'project_id': 'VCSOPR2',
275 | 'name': 'Corinth Abandoned Mine Methane Recovery Project',
276 | 'protocol': ['arb-mine-methane'],
277 | 'category': 'ghg-management',
278 | 'project_type': 'mine methane capture',
279 | 'project_type_source': 'carbonplan',
280 | 'proponent': 'Keyrock Energy LLC',
281 | 'country': 'United States',
282 | 'status': 'registered',
283 | 'is_compliance': True,
284 | 'registry': 'verra',
285 | 'project_url': 'https://registry.verra.org/app/projectDetail/VCS/2265',
286 | },
287 | {
288 | 'project_id': 'VCSOPR10',
289 | 'name': 'Blue Source-Alford Improved Forest Management Project',
290 | 'protocol': ['arb-forest'],
291 | 'category': 'forest',
292 | 'project_type': 'improved forest management',
293 | 'project_type_source': 'carbonplan',
294 | 'proponent': 'Ozark Regional Land Trust',
295 | 'country': 'United States',
296 | 'status': 'registered',
297 | 'is_compliance': True,
298 | 'registry': 'verra',
299 | 'project_url': 'https://registry.verra.org/app/projectDetail/VCS/2271',
300 | },
301 | ]
302 | vcs_projects = pd.DataFrame(vcs_project_dicts)
303 | return pd.concat([df, vcs_projects], ignore_index=True)
304 |
305 |
306 | @pf.register_dataframe_method
307 | def add_vcs_project_url(df: pd.DataFrame) -> pd.DataFrame:
308 | """
309 | Create a URL for each project based on its Verra project ID.
310 |
311 | Parameters
312 | ----------
313 | df : pd.DataFrame
314 | Input DataFrame with Verra project data.
315 |
316 | Returns
317 | -------
318 | pd.DataFrame
319 | DataFrame with a new 'project_url' column, containing the generated URLs for each project.
320 | """
321 |
322 | df['project_url'] = (
323 | 'https://registry.verra.org/app/projectDetail/VCS/' + df['project_id'].str[3:]
324 | )
325 | return df
326 |
327 |
328 | @pf.register_dataframe_method
329 | def add_vcs_project_id(df: pd.DataFrame) -> pd.DataFrame:
330 | """
331 | Add a prefix 'VCS' to each project ID in the DataFrame.
332 |
333 | Parameters
334 | ----------
335 | df : pd.DataFrame
336 | Input DataFrame with Verra project data.
337 |
338 | Returns
339 | -------
340 | pd.DataFrame
341 | DataFrame with updated 'project_id' column, containing the prefixed project IDs.
342 | """
343 |
344 | df['project_id'] = df['project_id'].apply(lambda x: f'VCS{x}')
345 | return df
346 |
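# Illustrative sketch (not executed): chaining the two helpers above on raw Verra IDs.
#
#   raw = pd.DataFrame({'project_id': [75, 2498]})
#   out = raw.add_vcs_project_id().add_vcs_project_url()
#   # out['project_id']  -> ['VCS75', 'VCS2498']
#   # out['project_url'] -> ['https://registry.verra.org/app/projectDetail/VCS/75', ...]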
347 |
348 | @pf.register_dataframe_method
349 | def process_vcs_projects(
350 | df: pd.DataFrame,
351 | *,
352 | credits: pd.DataFrame,
353 | registry_name: str = 'verra',
354 | download_type: str = 'projects',
355 | ) -> pd.DataFrame:
356 | """
357 | Process Verra projects data, including renaming, adding, and validating columns, and merging with credits data.
358 |
359 | Parameters
360 | ----------
361 | df : pd.DataFrame
362 | Input DataFrame with raw projects data.
363 | credits : pd.DataFrame
364 | DataFrame containing credits data for merging.
365 | registry_name : str, optional
366 | Name of the registry (default is 'verra').
367 | download_type : str, optional
368 | Type of download operation performed (default is 'projects').
369 |
370 | Returns
371 | -------
372 | pd.DataFrame
373 | Processed DataFrame with harmonized and validated Verra projects data.
374 | """
375 |
376 | df = df.copy()
377 | credits = credits.copy()
378 | registry_project_column_mapping = load_registry_project_column_mapping(
379 | registry_name=registry_name, file_path=PROJECT_SCHEMA_UPATH
380 | )
381 | inverted_column_mapping = {value: key for key, value in registry_project_column_mapping.items()}
382 | type_category_mapping = load_type_category_mapping()
383 | inverted_protocol_mapping = load_inverted_protocol_mapping()
384 |
385 | data = (
386 | df.rename(columns=inverted_column_mapping)
387 | .set_registry(registry_name=registry_name)
388 | .add_vcs_project_id()
389 | .add_vcs_project_url()
390 | .harmonize_country_names()
391 | .harmonize_status_codes()
392 | .map_protocol(inverted_protocol_mapping=inverted_protocol_mapping)
393 | .infer_project_type()
394 | .override_project_types(
395 | override_data_path=BERKELEY_PROJECT_TYPE_UPATH, source_str='berkeley'
396 | )
397 | .add_category(
398 | type_category_mapping=type_category_mapping
399 | ) # must come after types; type -> category
400 | .add_is_compliance_flag()
401 | .add_vcs_compliance_projects()
402 | .map_project_type_to_display_name(type_category_mapping=type_category_mapping)
403 | .add_retired_and_issued_totals(credits=credits)
404 | .add_first_issuance_and_retirement_dates(credits=credits)
405 | .add_missing_columns(schema=project_schema)
406 | .convert_to_datetime(columns=['listed_at'], dayfirst=True)
407 | .validate(schema=project_schema)
408 | )
409 |
410 | return data
411 |
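# Illustrative sketch (not executed): producing the harmonized projects table from raw registry
# exports. Paths and <date> are placeholders; tests/test_integration.py shows a working invocation.
#
#   transactions = pd.read_csv('s3://carbonplan-offsets-db/raw/<date>/verra/transactions.csv.gz')
#   projects_raw = pd.read_csv('s3://carbonplan-offsets-db/raw/<date>/verra/projects.csv.gz')
#   credits = transactions.process_vcs_credits(harmonize_beneficiary_info=False)
#   projects = projects_raw.process_vcs_projects(credits=credits)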
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | build-backend = "setuptools.build_meta"
3 | requires = ["setuptools-scm[toml]>=6.2", "setuptools>=64", "wheel"]
4 |
5 | [project]
6 | authors = [{ name = "CarbonPlan", email = "tech@carbonplan.org" }]
7 | classifiers = [
8 | "Development Status :: 4 - Beta",
9 | "Intended Audience :: Science/Research",
10 | "License :: OSI Approved :: MIT License",
11 | "Operating System :: OS Independent",
12 | "Programming Language :: Python :: 3",
13 | "Programming Language :: Python :: 3.10",
14 | "Programming Language :: Python :: 3.11",
15 | "Programming Language :: Python :: 3.12",
16 | "Programming Language :: Python :: 3.13",
17 | "Programming Language :: Python",
18 | "Topic :: Scientific/Engineering",
19 | ]
20 | description = "Monitoring the global carbon market"
21 | license = { text = "MIT" }
22 | name = "offsets-db-data"
23 | readme = "README.md"
24 | requires-python = ">=3.10"
25 |
26 | dynamic = ["dependencies", "version"]
27 |
28 | [project.scripts]
29 | offsets-db-data-orcli = "offsets_db_data.openrefine:main"
30 |
31 | [tool.setuptools.dynamic]
32 |
33 | dependencies = { file = ["requirements.txt"] }
34 | optional-dependencies = { dev = { file = [
35 | "requirements-dev.txt",
36 | ] }, docs = { file = [
37 | "requirements-docs.txt",
38 | ] } }
39 |
40 | [project.urls]
41 | "database web tool" = "https://carbonplan.org/research/offsets-db"
42 | "documentation" = "https://offsets-db-data.readthedocs.io/"
43 | "explainer" = "https://carbonplan.org/research/offsets-db-explainer"
44 | repository = "https://github.com/carbonplan/offsets-db-data"
45 |
46 | [tool.setuptools.packages.find]
47 | include = ["offsets_db_data*"]
48 |
49 | [tool.setuptools.package-data]
50 | offsets_db_data = ["*.yaml", "configs/*.json", "py.typed"]
51 |
52 | [tool.setuptools_scm]
53 | fallback_version = "999"
54 | local_scheme = "node-and-date"
55 | version_scheme = "post-release"
56 | write_to = "offsets_db_data/_version.py"
57 | write_to_template = '__version__ = "{version}"'
58 |
59 | [tool.coverage.run]
60 | branch = true
61 | omit = ["tests/*"]
62 |
63 | [tool.ruff]
64 | extend-include = ["*.ipynb"]
65 | line-length = 100
66 | target-version = "py310"
67 |
68 | builtins = ["ellipsis"]
69 | # Exclude a variety of commonly ignored directories.
70 | exclude = [
71 | ".bzr",
72 | ".direnv",
73 | ".eggs",
74 | ".git",
75 | ".git-rewrite",
76 | ".hg",
77 | ".ipynb_checkpoints",
78 | ".mypy_cache",
79 | ".nox",
80 | ".pants.d",
81 | ".pyenv",
82 | ".pytest_cache",
83 | ".pytype",
84 | ".ruff_cache",
85 | ".svn",
86 | ".tox",
87 | ".venv",
88 | ".vscode",
89 | "__pypackages__",
90 | "_build",
91 | "buck-out",
92 | "build",
93 | "dist",
94 | "node_modules",
95 | "site-packages",
96 | "venv",
97 | ]
98 | [tool.ruff.lint]
99 | ignore = [
100 | "E501", # Conflicts with ruff format
101 | "E721", # Comparing types instead of isinstance
102 | "E741", # Ambiguous variable names
103 | ]
104 | per-file-ignores = {}
105 | select = [
106 | # Pyflakes
107 | "F",
108 | # Pycodestyle
109 | "E",
110 | "W",
111 | # isort
112 | "I",
113 | # Pyupgrade
114 | "UP",
115 | ]
116 |
117 | [tool.ruff.lint.mccabe]
118 | max-complexity = 18
119 |
120 | [tool.ruff.lint.isort]
121 | combine-as-imports = true
122 | known-first-party = ["offsets_db_data"]
123 |
124 | [tool.ruff.format]
125 | docstring-code-format = true
126 | quote-style = "single"
127 |
128 | [tool.ruff.lint.pydocstyle]
129 | convention = "numpy"
130 |
131 | [tool.ruff.lint.pyupgrade]
132 | # Preserve types, even if a file imports `from __future__ import annotations`.
133 | keep-runtime-typing = true
134 |
135 | [tool.pytest.ini_options]
136 | addopts = "-n auto --cov=./ --cov-report=xml --cov-report=term-missing --verbose"
137 | console_output_style = "count"
138 |
--------------------------------------------------------------------------------
/readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the OS, Python version and other tools you might need
9 | build:
10 | os: ubuntu-22.04
11 | tools:
12 | python: '3.12'
13 | # You can also specify other tool versions:
14 | # nodejs: "19"
15 | # rust: "1.64"
16 | # golang: "1.19"
17 |
18 | # Build documentation in the "docs/" directory with Sphinx
19 | sphinx:
20 | configuration: docs/conf.py
21 |
22 | # Optionally build your docs in additional formats such as PDF and ePub
23 | # formats:
24 | # - pdf
25 | # - epub
26 |
27 | # Optional but recommended, declare the Python requirements required
28 | # to build your documentation
29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
30 | python:
31 | install:
32 | - method: pip
33 | path: .
34 | - requirements: requirements-docs.txt
35 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest==7.4.*
2 | pytest-cov==4.1.*
3 | pytest-mock==3.10.*
4 | pytest-xdist==3.3.*
5 | requests-mock==1.11.*
6 | hypothesis==6.111.*
7 | openpyxl
8 |
--------------------------------------------------------------------------------
/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | sphinx-book-theme>=1.1.2
2 | myst-nb
3 | sphinx
4 | sphinx-copybutton
5 | sphinx-design
6 | sphinxext-opengraph
7 | jupyterlab
8 | sphinx-togglebutton
9 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | country_converter==1.0.0
2 | dask==2025.2.0
3 | fastparquet==2024.11
4 | fsspec==2025.2.0
5 | intake-parquet>=0.3.0
6 | intake<2
7 | pandas==2.2.2
8 | pandera==0.23
9 | pydantic==2.10.*
10 | pyjanitor==0.23.*
11 | requests>=2.31.0
12 | s3fs==2025.2.0
13 | universal_pathlib>=0.1.3
14 | numpy>=2
15 | typer>=0.15.2
16 |
--------------------------------------------------------------------------------
/scripts/check-beneficiary-coverage.py:
--------------------------------------------------------------------------------
1 | import fsspec
2 | import pandas as pd
3 |
4 |
5 | def main():
6 | print('Checking beneficiary coverage against latest production release on S3')
7 |
8 | with fsspec.open(
9 | 'zip://credits.parquet::s3://carbonplan-offsets-db/production/latest/offsets-db.parquet.zip'
10 | ) as f:
11 | credits = pd.read_parquet(f)
12 | retirement_credits = credits[credits['transaction_type'] == 'retirement']
13 |
14 | beneficiary_cols = [
15 | 'retirement_beneficiary',
16 | 'retirement_account',
17 | 'retirement_note',
18 | 'retirement_reason',
19 | ]
20 | no_user_data = pd.isna(retirement_credits[beneficiary_cols]).sum(axis=1) == 4
21 |
22 | mapped_stats = (
23 | retirement_credits[(~no_user_data)]
24 | .groupby(pd.isna(retirement_credits['retirement_beneficiary_harmonized']))
25 | .quantity.sum()
26 | )
27 | tot_mapped = mapped_stats.sum()
28 | frac_mapped = mapped_stats[False] / tot_mapped
29 | nlarge_unmapped = (
30 | retirement_credits[
31 | (~no_user_data) & pd.isna(retirement_credits['retirement_beneficiary_harmonized'])
32 | ].quantity
33 | > 50_000
34 | ).sum()
35 |
36 | print(f'A total of {mapped_stats[False] / 1_000_000:.2f} million credits have been mapped')
37 | print(f'which represents {frac_mapped * 100:.1f} percent of mappable credit')
38 | print(f'There are {nlarge_unmapped} mappable transactions that exceed 50,000 credits')
39 |
40 |
41 | if __name__ == '__main__':
42 | main()
43 |
--------------------------------------------------------------------------------
/scripts/extract-berkeley-project-types.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 |
4 | import pandas as pd
5 |
6 |
7 | def main():
8 | parser = argparse.ArgumentParser(
9 | description='Extract project types from latest version of berkeley carbon project data',
10 | )
11 | parser.add_argument('filename', help='Input filename to process')
12 |
13 | args = parser.parse_args()
14 |
15 | # this is surprisingly slow? openpyxl is doing some _work_
16 | project_data = pd.read_excel(
17 | args.filename, sheet_name='PROJECTS', skiprows=3, usecols=['Project ID', ' Type']
18 | )
19 |
20 | def _fix_gld_ids(s: str) -> str:
21 | if s.startswith('GS'):
22 | return f'GLD{s[2:]}'
23 | else:
24 | return s
25 |
26 | out_d = project_data.dropna().set_index('Project ID')[' Type'].to_dict()
27 | out_d = {_fix_gld_ids(k): v.lower() for k, v in out_d.items()}
28 | out_f = '/tmp/berkeley-project-types.json'
29 | with open(out_f, 'w') as f:
30 | print(f'Writing project types to {out_f}')
31 | json.dump(out_d, f, indent=1)
32 |
33 |
34 | if __name__ == '__main__':
35 | main()
36 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carbonplan/offsets-db-data/5a62ee9fd0369b9a164b914d7b0ac979f4ee21e8/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_integration.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 | from offsets_db_data.apx import * # noqa: F403
5 | from offsets_db_data.arb import * # noqa: F403
6 | from offsets_db_data.gld import * # noqa: F403
7 | from offsets_db_data.models import credit_without_id_schema, project_schema
8 | from offsets_db_data.vcs import * # noqa: F403
9 |
10 |
11 | @pytest.fixture
12 | def date() -> str:
13 | return '2024-08-27'
14 |
15 |
16 | @pytest.fixture
17 | def bucket() -> str:
18 | return 's3://carbonplan-offsets-db/raw'
19 |
20 |
21 | @pytest.fixture
22 | def arb() -> pd.DataFrame:
23 | data = pd.read_excel(
24 | 's3://carbonplan-offsets-db/raw/2024-08-27/arb/nc-arboc_issuance.xlsx', sheet_name=3
25 | )
26 | return data.process_arb()
27 |
28 |
29 | @pytest.mark.parametrize(
30 | 'harmonize_beneficiary_info',
31 | [True, False],
32 | )
33 | def test_verra(date, bucket, arb, harmonize_beneficiary_info):
34 | prefix = 'VCS'
35 | projects = pd.read_csv(f'{bucket}/{date}/verra/projects.csv.gz')
36 | credits = pd.read_csv(f'{bucket}/{date}/verra/transactions.csv.gz')
37 | df_credits = credits.process_vcs_credits(
38 | arb=arb[arb.project_id.str.startswith(prefix)],
39 | harmonize_beneficiary_info=harmonize_beneficiary_info,
40 | )
41 | assert set(df_credits.columns) == set(credit_without_id_schema.columns.keys())
42 | df_projects = projects.process_vcs_projects(credits=df_credits)
43 | project_schema.validate(df_projects)
44 | credit_without_id_schema.validate(df_credits)
45 |
46 | assert df_projects['project_id'].str.startswith(prefix).all()
47 | assert df_credits['project_id'].str.startswith(prefix).all()
48 |
49 |
50 | @pytest.mark.parametrize(
51 | 'registry, download_types, prefix',
52 | [
53 | ('art-trees', ['issuances', 'retirements', 'cancellations'], 'ART'),
54 | ('american-carbon-registry', ['issuances', 'retirements', 'cancellations'], 'ACR'),
55 | ('climate-action-reserve', ['issuances', 'retirements', 'cancellations'], 'CAR'),
56 | ],
57 | )
58 | def test_apx(date, bucket, arb, registry, download_types, prefix):
59 | dfs = []
60 | for key in download_types:
61 | credits = pd.read_csv(f'{bucket}/{date}/{registry}/{key}.csv.gz')
62 | p = credits.process_apx_credits(
63 | download_type=key, registry_name=registry, harmonize_beneficiary_info=True
64 | )
65 | dfs.append(p)
66 |
67 | df_credits = pd.concat(dfs).merge_with_arb(arb=arb[arb.project_id.str.startswith(prefix)])
68 | credit_without_id_schema.validate(df_credits)
69 |
70 | assert set(df_credits.columns) == set(credit_without_id_schema.columns.keys())
71 |
72 | projects = pd.read_csv(f'{bucket}/{date}/{registry}/projects.csv.gz')
73 | df_projects = projects.process_apx_projects(credits=df_credits, registry_name=registry)
74 | project_schema.validate(df_projects)
75 |
76 | assert df_projects['project_id'].str.startswith(prefix).all()
77 | assert df_credits['project_id'].str.startswith(prefix).all()
78 |
79 |
80 | @pytest.mark.parametrize(
81 | 'harmonize_beneficiary_info',
82 | [True, False],
83 | )
84 | def test_gld(
85 | date,
86 | bucket,
87 | harmonize_beneficiary_info,
88 | ):
89 | registry = 'gold-standard'
90 | download_types = ['issuances', 'retirements']
91 | prefix = 'GLD'
92 |
93 | dfs = []
94 | for key in download_types:
95 | credits = pd.read_csv(f'{bucket}/{date}/{registry}/{key}.csv.gz')
96 | p = credits.process_gld_credits(
97 | download_type=key, harmonize_beneficiary_info=harmonize_beneficiary_info
98 | )
99 | dfs.append(p)
100 |
101 | df_credits = pd.concat(dfs)
102 | credit_without_id_schema.validate(df_credits)
103 |
104 | assert set(df_credits.columns) == set(credit_without_id_schema.columns.keys())
105 |
106 | projects = pd.read_csv(f'{bucket}/{date}/{registry}/projects.csv.gz')
107 | df_projects = projects.process_gld_projects(credits=df_credits)
108 | project_schema.validate(df_projects)
109 |
110 | # check that all project_id values use the same prefix
111 | assert df_projects['project_id'].str.startswith(prefix).all()
112 | assert df_credits['project_id'].str.startswith(prefix).all()
113 |
114 |
115 | @pytest.mark.parametrize(
116 | 'df_credits',
117 | [
118 | pd.DataFrame().process_gld_credits(
119 | download_type='issuances', harmonize_beneficiary_info=True
120 | ),
121 | pd.concat(
122 | [
123 | pd.read_csv(
124 | 's3://carbonplan-offsets-db/raw/2024-08-27/gold-standard/issuances.csv.gz'
125 | ).process_gld_credits(download_type='issuances', harmonize_beneficiary_info=True),
126 | pd.read_csv(
127 | 's3://carbonplan-offsets-db/raw/2024-08-27/gold-standard/retirements.csv.gz'
128 | ).process_gld_credits(download_type='retirements', harmonize_beneficiary_info=True),
129 | ]
130 | ),
131 | ],
132 | )
133 | @pytest.mark.parametrize(
134 | 'projects',
135 | [
136 | pd.DataFrame(),
137 | pd.read_csv('s3://carbonplan-offsets-db/raw/2024-08-27/gold-standard/projects.csv.gz'),
138 | ],
139 | )
140 | def test_gld_empty(df_credits, projects):
141 | prefix = 'GLD'
142 |
143 | credit_without_id_schema.validate(df_credits)
144 |
145 | assert set(df_credits.columns) == set(credit_without_id_schema.columns.keys())
146 |
147 | df_projects = projects.process_gld_projects(credits=df_credits)
148 | project_schema.validate(df_projects)
149 |
150 | # check that all project_id values use the same prefix
151 | assert df_projects['project_id'].str.startswith(prefix).all()
152 | assert df_credits['project_id'].str.startswith(prefix).all()
153 |
--------------------------------------------------------------------------------
/tests/test_pipeline_utils.py:
--------------------------------------------------------------------------------
1 | import io
2 | import zipfile
3 | from datetime import datetime
4 | from unittest.mock import MagicMock, patch
5 |
6 | import pandas as pd
7 | import pytest
8 |
9 | from offsets_db_data.pipeline_utils import (
10 | _create_data_zip_buffer,
11 | summarize,
12 | to_parquet,
13 | transform_registry_data,
14 | validate_data,
15 | write_latest_production,
16 | )
17 |
18 |
19 | @pytest.fixture
20 | def sample_credits():
21 | """Sample credits dataframe for testing."""
22 | return pd.DataFrame(
23 | {
24 | 'project_id': ['VCS123', 'VCS124', 'ACR456', 'CAR789'],
25 | 'quantity': [100, 200, 150, 300],
26 | 'vintage': [2020, 2021, 2020, 2022],
27 | 'transaction_date': pd.to_datetime(
28 | ['2021-01-01', '2022-02-01', '2021-03-15', '2022-04-30']
29 | ),
30 | 'transaction_type': ['issuance', 'retirement', 'issuance', 'retirement'],
31 | }
32 | )
33 |
34 |
35 | @pytest.fixture
36 | def sample_projects():
37 | """Sample projects dataframe for testing."""
38 | return pd.DataFrame(
39 | {
40 | 'project_id': ['VCS123', 'VCS124', 'ACR456', 'CAR789'],
41 | 'name': ['Project A', 'Project B', 'Project C', 'Project D'],
42 | 'registry': ['verra', 'verra', 'american-carbon-registry', 'climate-action-reserve'],
43 | 'is_compliance': [False, True, False, True],
44 | 'retired': [50, 200, 75, 250],
45 | 'issued': [100, 200, 150, 300],
46 | 'type': ['forestry', 'renewable-energy', 'agriculture', 'forestry'],
47 | 'type_source': ['carbonplan', 'berkeley', 'carbonplan', 'carbonplan'],
48 | }
49 | )
50 |
51 |
52 | @patch('offsets_db_data.pipeline_utils.catalog')
53 | def test_validate_data_success(mock_catalog, sample_credits):
54 | """Test validate_data when data passes validation criteria."""
55 | # Mock old data with 90% of new data quantity
56 | mock_old_data = sample_credits.copy()
57 | mock_old_data['quantity'] = mock_old_data['quantity'] * 0.9
58 |
59 | mock_catalog.__getitem__.return_value = MagicMock()
60 | mock_catalog.__getitem__.return_value.read.return_value = mock_old_data
61 |
62 | # Should not raise exception
63 | validate_data(
64 | new_data=sample_credits,
65 | as_of=datetime(2023, 1, 1),
66 | data_type='credits',
67 | quantity_column='quantity',
68 | aggregation_func=sum,
69 | )
70 |
71 | # Verify catalog was called properly
72 | mock_catalog.__getitem__.assert_called_with('credits')
73 |
74 |
75 | def test_summarize_single_registry(sample_credits, sample_projects, capsys):
76 | """Test summarize function with a single registry."""
77 | registry_name = 'verra'
78 |
79 | # Filter data for verra registry
80 | verra_projects = sample_projects[sample_projects['registry'] == registry_name]
81 | verra_credits = sample_credits[sample_credits['project_id'].str.startswith('VCS')]
82 |
83 | summarize(
84 | credits=verra_credits,
85 | projects=verra_projects,
86 | registry_name=registry_name,
87 | )
88 |
89 | captured = capsys.readouterr()
90 |
91 | assert f'Retired and Issued (in Millions) summary for {registry_name}' in captured.out
92 | assert f'Credits summary (in Millions) for {registry_name}' in captured.out
93 |
94 |
95 | def test_summarize_multi_registry(sample_credits, sample_projects, capsys):
96 | """Test summarize function across multiple registries."""
97 |
98 | summarize(
99 | credits=sample_credits,
100 | projects=sample_projects,
101 | )
102 |
103 | captured = capsys.readouterr()
104 |
105 | assert 'Summary Statistics for projects (in Millions)' in captured.out
106 | assert 'Summary Statistics for credits (in Millions)' in captured.out
107 |
108 |
109 | def test_create_data_zip_buffer_csv(sample_credits, sample_projects):
110 | """Test _create_data_zip_buffer with CSV format."""
111 |
112 | buffer = _create_data_zip_buffer(
113 | credits=sample_credits,
114 | projects=sample_projects,
115 | format_type='csv',
116 | terms_content='Test terms content',
117 | )
118 |
119 | # Test the buffer contains a valid ZIP
120 | with zipfile.ZipFile(buffer, 'r') as zip_ref:
121 | filenames = zip_ref.namelist()
122 |
123 | # Check expected files exist
124 | assert 'TERMS_OF_DATA_ACCESS.txt' in filenames
125 | assert 'credits.csv' in filenames
126 | assert 'projects.csv' in filenames
127 |
128 | # Check terms content
129 | assert zip_ref.read('TERMS_OF_DATA_ACCESS.txt').decode('utf-8') == 'Test terms content'
130 |
131 |
132 | def test_create_data_zip_buffer_parquet(sample_credits, sample_projects):
133 | """Test _create_data_zip_buffer with Parquet format."""
134 | buffer = _create_data_zip_buffer(
135 | credits=sample_credits,
136 | projects=sample_projects,
137 | format_type='parquet',
138 | terms_content='Test terms content',
139 | )
140 |
141 | # Test the buffer contains a valid ZIP
142 | with zipfile.ZipFile(buffer, 'r') as zip_ref:
143 | filenames = zip_ref.namelist()
144 |
145 | assert 'TERMS_OF_DATA_ACCESS.txt' in filenames
146 | assert 'credits.parquet' in filenames
147 | assert 'projects.parquet' in filenames
148 |
149 |
150 | @patch('fsspec.filesystem')
151 | @patch('fsspec.open')
152 | @patch('offsets_db_data.pipeline_utils._create_data_zip_buffer')
153 | def test_write_latest_production(
154 | mock_create_buffer,
155 | mock_fsspec_open,
156 | mock_fsspec_fs,
157 | sample_credits,
158 | sample_projects,
159 | ):
160 | """Test write_latest_production function."""
161 | # Setup mocks
162 | mock_fs = MagicMock()
163 | mock_fs.read_text.return_value = 'Test terms content'
164 | mock_fsspec_fs.return_value = mock_fs
165 |
166 | # Create a new buffer for each call
167 | mock_create_buffer.side_effect = [
168 | io.BytesIO(b'test csv data'),
169 | io.BytesIO(b'test parquet data'),
170 | ]
171 |
172 | mock_file = MagicMock()
173 | mock_context = MagicMock()
174 | mock_context.__enter__.return_value = mock_file
175 | mock_fsspec_open.return_value = mock_context
176 |
177 | # Call function
178 | write_latest_production(
179 | credits=sample_credits,
180 | projects=sample_projects,
181 | bucket='s3://test-bucket',
182 | )
183 |
184 | # Assert mocks called correctly
185 | assert mock_create_buffer.call_count == 2 # Called for CSV and Parquet
186 | mock_fsspec_fs.assert_called_once_with('s3', anon=False)
187 | assert mock_fsspec_open.call_count == 2
188 |
189 | # Verify write calls
190 | assert mock_file.write.call_count == 2
191 |
192 |
193 | @patch('offsets_db_data.pipeline_utils.to_parquet')
194 | @patch('offsets_db_data.pipeline_utils.summarize')
195 | def test_transform_registry_data(mock_summarize, mock_to_parquet, sample_credits, sample_projects):
196 | """Test transform_registry_data function."""
197 | # Setup mock functions
198 | process_credits_fn = MagicMock(return_value=sample_credits)
199 | process_projects_fn = MagicMock(return_value=sample_projects)
200 | output_paths = {'credits': 'path/to/credits', 'projects': 'path/to/projects'}
201 |
202 | # Call function
203 | result_credits, result_projects = transform_registry_data(
204 | process_credits_fn=process_credits_fn,
205 | process_projects_fn=process_projects_fn,
206 | output_paths=output_paths,
207 | registry_name='test-registry',
208 | )
209 |
210 | # Verify calls and returns
211 | process_credits_fn.assert_called_once()
212 | process_projects_fn.assert_called_once_with(credits=sample_credits)
213 | mock_summarize.assert_called_once_with(
214 | credits=sample_credits, projects=sample_projects, registry_name='test-registry'
215 | )
216 | mock_to_parquet.assert_called_once()
217 |
218 | # Verify return values
219 | assert result_credits.equals(sample_credits)
220 | assert result_projects.equals(sample_projects)
221 |
222 |
223 | @patch('tempfile.NamedTemporaryFile')
224 | def test_to_parquet(mock_temp_file, sample_credits, sample_projects):
225 | """Test to_parquet function."""
226 | # Setup mock
227 | mock_temp = MagicMock()
228 | mock_temp_file.return_value.__enter__.return_value = mock_temp
229 |
230 | # Setup output paths
231 | output_paths = {
232 | 'credits': 'path/to/credits',
233 | 'projects': 'path/to/projects',
234 | }
235 |
236 | # Patch pandas to_parquet to prevent actual file writing
237 | with patch.object(pd.DataFrame, 'to_parquet') as mock_to_parquet:
238 | to_parquet(
239 | credits=sample_credits,
240 | projects=sample_projects,
241 | output_paths=output_paths,
242 | registry_name='test-registry',
243 | )
244 |
245 | # Assert to_parquet called for both dataframes
246 | assert mock_to_parquet.call_count == 2
247 |
--------------------------------------------------------------------------------
/tests/test_vcs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 |
5 | from offsets_db_data.vcs import (
6 | add_vcs_compliance_projects,
7 | calculate_vcs_issuances,
8 | calculate_vcs_retirements,
9 | determine_vcs_transaction_type,
10 | generate_vcs_project_ids,
11 | process_vcs_credits,
12 | process_vcs_projects,
13 | set_vcs_transaction_dates,
14 | set_vcs_vintage_year,
15 | )
16 |
17 |
18 | def vcs_projects() -> pd.DataFrame:
19 | df = pd.DataFrame(
20 | [
21 | {
22 | 'ID': 75,
23 | 'Name': '5.4 MW Grouped Wind Power Project in Gujarat & Maharashtra (India) by Rohan Builders (India) Pvt Ltd.',
24 | 'Proponent': 'Rohan Builders (India)',
25 | 'Project Type': 'Energy industries (renewable/non-renewable sources)',
26 | 'AFOLU Activities': np.nan,
27 | 'Methodology': 'AMS-I.D.',
28 | 'Status': 'Registered',
29 | 'Country/Area': 'India',
30 | 'Estimated Annual Emission Reductions': '9,143',
31 | 'Region': 'Asia',
32 | 'Project Registration Date': '2009-06-15',
33 | 'Crediting Period Start Date': np.nan,
34 | 'Crediting Period End Date': np.nan,
35 | },
36 | {
37 | 'ID': 2498,
38 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá',
39 | 'Proponent': 'Miller Forest Investment AG',
40 | 'Project Type': 'Agriculture Forestry and Other Land Use',
41 | 'AFOLU Activities': 'ARR',
42 | 'Methodology': 'AR-ACM0003',
43 | 'Status': 'Registered',
44 | 'Country/Area': 'Paraguay',
45 | 'Estimated Annual Emission Reductions': '204,819',
46 | 'Region': 'Latin America',
47 | 'Project Registration Date': '2022-01-14',
48 | 'Crediting Period Start Date': '2016-01-13',
49 | 'Crediting Period End Date': '2046-01-12',
50 | },
51 | {
52 | 'ID': 101,
53 | 'Name': 'Bagasse based Co-generation Power Project at Khatauli',
54 | 'Proponent': 'Triveni Engineering and Industries Limited (TEIL)',
55 | 'Project Type': 'Energy industries (renewable/non-renewable sources)',
56 | 'AFOLU Activities': np.nan,
57 | 'Methodology': 'ACM0006',
58 | 'Status': 'Registered',
59 | 'Country/Area': 'India',
60 | 'Estimated Annual Emission Reductions': '86,808',
61 | 'Region': 'Asia',
62 | 'Project Registration Date': '2009-07-15',
63 | 'Crediting Period Start Date': np.nan,
64 | 'Crediting Period End Date': np.nan,
65 | },
66 | {
67 | 'ID': 3408,
68 | 'Name': 'Mianning1 Water Management with Rice Cultivation',
69 | 'Proponent': 'Yunnan Ruihan Agricultural Technology Development Co., Ltd.',
70 | 'Project Type': 'Agriculture Forestry and Other Land Use',
71 | 'AFOLU Activities': 'ALM',
72 | 'Methodology': 'AMS-III.AU',
73 | 'Status': 'Under development',
74 | 'Country/Area': 'China',
75 | 'Estimated Annual Emission Reductions': '55,497',
76 | 'Region': 'Asia',
77 | 'Project Registration Date': np.nan,
78 | 'Crediting Period Start Date': '2018-04-06',
79 | 'Crediting Period End Date': '2025-04-05',
80 | },
81 | {
82 | 'ID': 1223,
83 | 'Name': 'Yanhe, Dejiang, and Yinjiang Rural Methane Digesters Project in Guizhou Province, China',
84 | 'Proponent': 'Guizhou Black Carbon Energy Tech Prom & App Co. Lt',
85 | 'Project Type': 'Energy industries (renewable/non-renewable sources)',
86 | 'AFOLU Activities': np.nan,
87 | 'Methodology': 'AMS-I.C.; AMS-III.R.',
88 | 'Status': 'Under validation',
89 | 'Country/Area': 'China',
90 | 'Estimated Annual Emission Reductions': '53,247',
91 | 'Region': 'Asia',
92 | 'Project Registration Date': np.nan,
93 | 'Crediting Period Start Date': np.nan,
94 | 'Crediting Period End Date': np.nan,
95 | },
96 | ]
97 | )
98 |
99 | return df
100 |
101 |
102 | @pytest.fixture(name='vcs_projects')
103 | def fixture_vcs_projects() -> pd.DataFrame:
104 | return vcs_projects()
105 |
106 |
107 | def vcs_transactions() -> pd.DataFrame:
108 | df = pd.DataFrame(
109 | [
110 | {
111 | 'Issuance Date': '08/03/2022',
112 | 'Sustainable Development Goals': np.nan,
113 | 'Vintage Start': '01/01/2020',
114 | 'Vintage End': '19/11/2020',
115 | 'ID': 2498,
116 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá',
117 | 'Country/Area': 'Paraguay',
118 | 'Project Type': 'Agriculture Forestry and Other Land Use',
119 | 'Methodology': 'AR-ACM0003',
120 | 'Total Vintage Quantity': '99,870',
121 | 'Quantity Issued': '84,773',
122 | 'Serial Number': '12629-421604735-421689507-VCS-VCU-576-VER-PY-14-2498-01012020-19112020-0',
123 | 'Additional Certifications': np.nan,
124 | 'Retirement/Cancellation Date': np.nan,
125 | 'Retirement Beneficiary': np.nan,
126 | 'Retirement Reason': np.nan,
127 | 'Retirement Details': np.nan,
128 | },
129 | {
130 | 'Issuance Date': '29/11/2022',
131 | 'Sustainable Development Goals': np.nan,
132 | 'Vintage Start': '01/01/2017',
133 | 'Vintage End': '31/12/2017',
134 | 'ID': 2498,
135 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá',
136 | 'Country/Area': 'Paraguay',
137 | 'Project Type': 'Agriculture Forestry and Other Land Use',
138 | 'Methodology': 'AR-ACM0003',
139 | 'Total Vintage Quantity': '82,455',
140 | 'Quantity Issued': '5,000',
141 | 'Serial Number': '14121-556418249-556423248-VCS-VCU-576-VER-PY-14-2498-01012017-31122017-0',
142 | 'Additional Certifications': np.nan,
143 | 'Retirement/Cancellation Date': '26/12/2022',
144 | 'Retirement Beneficiary': 'DNV AS',
145 | 'Retirement Reason': 'Environmental Benefit',
146 | 'Retirement Details': 'VCUs 2022 for DNV',
147 | },
148 | {
149 | 'Issuance Date': '24/06/2022',
150 | 'Sustainable Development Goals': np.nan,
151 | 'Vintage Start': '13/01/2016',
152 | 'Vintage End': '31/12/2016',
153 | 'ID': 2498,
154 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá',
155 | 'Country/Area': 'Paraguay',
156 | 'Project Type': 'Agriculture Forestry and Other Land Use',
157 | 'Methodology': 'AR-ACM0003',
158 | 'Total Vintage Quantity': '55,805',
159 | 'Quantity Issued': '1,788',
160 | 'Serial Number': '13378-495669005-495670792-VCS-VCU-576-VER-PY-14-2498-13012016-31122016-0',
161 | 'Additional Certifications': np.nan,
162 | 'Retirement/Cancellation Date': '11/09/2022',
163 | 'Retirement Beneficiary': np.nan,
164 | 'Retirement Reason': np.nan,
165 | 'Retirement Details': np.nan,
166 | },
167 | {
168 | 'Issuance Date': '27/07/2022',
169 | 'Sustainable Development Goals': np.nan,
170 | 'Vintage Start': '01/01/2020',
171 | 'Vintage End': '19/11/2020',
172 | 'ID': 2498,
173 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá',
174 | 'Country/Area': 'Paraguay',
175 | 'Project Type': 'Agriculture Forestry and Other Land Use',
176 | 'Methodology': 'AR-ACM0003',
177 | 'Total Vintage Quantity': '99,870',
178 | 'Quantity Issued': '725',
179 | 'Serial Number': '13488-505972385-505973109-VCS-VCU-576-VER-PY-14-2498-01012020-19112020-0',
180 | 'Additional Certifications': np.nan,
181 | 'Retirement/Cancellation Date': '27/07/2022',
182 | 'Retirement Beneficiary': 'Jebsen & Jessen (GmbH & Co.) KG',
183 | 'Retirement Reason': 'Environmental Benefit',
184 | 'Retirement Details': 'Retired on behalf of Jebsen & Jessen 2022',
185 | },
186 | {
187 | 'Issuance Date': '11/09/2009',
188 | 'Sustainable Development Goals': np.nan,
189 | 'Vintage Start': '01/04/2006',
190 | 'Vintage End': '18/03/2007',
191 | 'ID': 101,
192 | 'Name': 'Bagasse based Co-generation Power Project at Khatauli',
193 | 'Country/Area': 'India',
194 | 'Project Type': 'Energy industries (renewable/non-renewable sources)',
195 | 'Methodology': 'ACM0006',
196 | 'Total Vintage Quantity': '62,796',
197 | 'Quantity Issued': '25,433',
198 | 'Serial Number': '240-7863589-7889021-VCU-003-APX-IN-1-101-01042006-18032007-0',
199 | 'Additional Certifications': np.nan,
200 | 'Retirement/Cancellation Date': '17/06/2015',
201 | 'Retirement Beneficiary': np.nan,
202 | 'Retirement Reason': np.nan,
203 | 'Retirement Details': np.nan,
204 | },
205 | {
206 | 'Issuance Date': '04/11/2022',
207 | 'Sustainable Development Goals': np.nan,
208 | 'Vintage Start': '01/01/2019',
209 | 'Vintage End': '31/12/2019',
210 | 'ID': 2498,
211 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá',
212 | 'Country/Area': 'Paraguay',
213 | 'Project Type': 'Agriculture Forestry and Other Land Use',
214 | 'Methodology': 'AR-ACM0003',
215 | 'Total Vintage Quantity': '99,871',
216 | 'Quantity Issued': '1,413',
217 | 'Serial Number': '13969-543072663-543074075-VCS-VCU-576-VER-PY-14-2498-01012019-31122019-0',
218 | 'Additional Certifications': np.nan,
219 | 'Retirement/Cancellation Date': '26/12/2022',
220 | 'Retirement Beneficiary': 'DNV AS',
221 | 'Retirement Reason': 'Environmental Benefit',
222 | 'Retirement Details': 'VCUs 2022 for DNV',
223 | },
224 | {
225 | 'Issuance Date': '27/07/2022',
226 | 'Sustainable Development Goals': np.nan,
227 | 'Vintage Start': '01/01/2020',
228 | 'Vintage End': '19/11/2020',
229 | 'ID': 2498,
230 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá',
231 | 'Country/Area': 'Paraguay',
232 | 'Project Type': 'Agriculture Forestry and Other Land Use',
233 | 'Methodology': 'AR-ACM0003',
234 | 'Total Vintage Quantity': '99,870',
235 | 'Quantity Issued': '297',
236 | 'Serial Number': '13488-505982056-505982352-VCS-VCU-576-VER-PY-14-2498-01012020-19112020-0',
237 | 'Additional Certifications': np.nan,
238 | 'Retirement/Cancellation Date': '26/12/2022',
239 | 'Retirement Beneficiary': 'DNV AS',
240 | 'Retirement Reason': 'Environmental Benefit',
241 | 'Retirement Details': 'VCUs 2022 for DNV',
242 | },
243 | {
244 | 'Issuance Date': '27/07/2022',
245 | 'Sustainable Development Goals': np.nan,
246 | 'Vintage Start': '01/01/2018',
247 | 'Vintage End': '31/12/2018',
248 | 'ID': 2498,
249 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá',
250 | 'Country/Area': 'Paraguay',
251 | 'Project Type': 'Agriculture Forestry and Other Land Use',
252 | 'Methodology': 'AR-ACM0003',
253 | 'Total Vintage Quantity': '97,077',
254 | 'Quantity Issued': '1,380',
255 | 'Serial Number': '13487-505962385-505963764-VCS-VCU-576-VER-PY-14-2498-01012018-31122018-0',
256 | 'Additional Certifications': np.nan,
257 | 'Retirement/Cancellation Date': '20/10/2022',
258 | 'Retirement Beneficiary': 'Implement Consulting Group',
259 | 'Retirement Reason': 'Environmental Benefit',
260 | 'Retirement Details': 'Retirement of 1380t in the name of Implement Consulting Group, for flights 2021',
261 | },
262 | {
263 | 'Issuance Date': '27/07/2022',
264 | 'Sustainable Development Goals': np.nan,
265 | 'Vintage Start': '01/01/2020',
266 | 'Vintage End': '19/11/2020',
267 | 'ID': 2498,
268 | 'Name': 'Afforestation of degraded grasslands in Caazapa and Guairá',
269 | 'Country/Area': 'Paraguay',
270 | 'Project Type': 'Agriculture Forestry and Other Land Use',
271 | 'Methodology': 'AR-ACM0003',
272 | 'Total Vintage Quantity': '99,870',
273 | 'Quantity Issued': '8,946',
274 | 'Serial Number': '13488-505973110-505982055-VCS-VCU-576-VER-PY-14-2498-01012020-19112020-0',
275 | 'Additional Certifications': np.nan,
276 | 'Retirement/Cancellation Date': '01/12/2022',
277 | 'Retirement Beneficiary': np.nan,
278 | 'Retirement Reason': np.nan,
279 | 'Retirement Details': np.nan,
280 | },
281 | {
282 | 'Issuance Date': '11/09/2009',
283 | 'Sustainable Development Goals': np.nan,
284 | 'Vintage Start': '01/04/2006',
285 | 'Vintage End': '18/03/2007',
286 | 'ID': 101,
287 | 'Name': 'Bagasse based Co-generation Power Project at Khatauli',
288 | 'Country/Area': 'India',
289 | 'Project Type': 'Energy industries (renewable/non-renewable sources)',
290 | 'Methodology': 'ACM0006',
291 | 'Total Vintage Quantity': '62,796',
292 | 'Quantity Issued': '1,466',
293 | 'Serial Number': '240-7889022-7890487-VCU-003-APX-IN-1-101-01042006-18032007-0',
294 | 'Additional Certifications': np.nan,
295 | 'Retirement/Cancellation Date': '18/06/2015',
296 | 'Retirement Beneficiary': np.nan,
297 | 'Retirement Reason': np.nan,
298 | 'Retirement Details': np.nan,
299 | },
300 | ]
301 | )
302 | return df
303 |
304 |
305 | @pytest.fixture(name='vcs_transactions')
306 | def fixture_vcs_transactions() -> pd.DataFrame:
307 | return vcs_transactions()
308 |
309 |
310 | def test_determine_vcs_transaction_type(vcs_transactions):
311 | df = determine_vcs_transaction_type(
312 | vcs_transactions, date_column='Retirement/Cancellation Date'
313 | )
314 |
315 | # Check if the 'transaction_type' column is created
316 | assert 'transaction_type' in df.columns
317 |
318 | # Check that the function correctly assigns 'retirement' or 'issuance'
319 | for i, row in df.iterrows():
320 | if pd.notnull(row['Retirement/Cancellation Date']):
321 | assert row['transaction_type'] == 'retirement'
322 | else:
323 | assert row['transaction_type'] == 'issuance'
324 |
325 |
326 | def test_set_vcs_transaction_dates(vcs_transactions):
327 | df = set_vcs_transaction_dates(
328 | vcs_transactions,
329 | date_column='Retirement/Cancellation Date',
330 | fallback_column='Issuance Date',
331 | )
332 |
333 | # Check if the 'transaction_date' column is created
334 | assert 'transaction_date' in df.columns
335 |
336 | # Create a series for expected transaction_date values
337 | expected_transaction_date = vcs_transactions['Retirement/Cancellation Date'].where(
338 | vcs_transactions['Retirement/Cancellation Date'].notnull(),
339 | vcs_transactions['Issuance Date'],
340 | )
341 |
342 | expected_transaction_date.name = (
343 | 'transaction_date' # Set the name of the Series to match the DataFrame column
344 | )
345 |
346 | # Use assert_series_equal to compare the entire series
347 | pd.testing.assert_series_equal(df['transaction_date'], expected_transaction_date)
348 |
349 |
350 | def test_set_vcs_vintage_year(vcs_transactions):
351 | df = set_vcs_vintage_year(vcs_transactions, date_column='Issuance Date')
352 |
353 | # Check if the 'vintage' column is created
354 | assert 'vintage' in df.columns
355 |
356 | # Convert 'Issuance Date' in the original DataFrame to datetime for comparison
357 | expected_vintage = pd.to_datetime(
358 | vcs_transactions['Issuance Date'], dayfirst=True, utc=True
359 | ).dt.year
360 | expected_vintage.name = 'vintage' # Set the name of the Series to match the DataFrame column
361 |
362 | # Use assert_series_equal to compare the 'vintage' column with the expected result
363 | pd.testing.assert_series_equal(df['vintage'], expected_vintage)
364 |
365 |
366 | def test_calculate_vcs_issuances(vcs_transactions):
367 | # Process the vcs_transactions similar to process_vcs_credits
368 | processed_data = (
369 | vcs_transactions.set_registry(registry_name='verra')
370 | .generate_vcs_project_ids(prefix='VCS')
371 | .determine_vcs_transaction_type(date_column='Retirement/Cancellation Date')
372 | .set_vcs_transaction_dates(
373 | date_column='Retirement/Cancellation Date', fallback_column='Issuance Date'
374 | )
375 | .clean_and_convert_numeric_columns(columns=['Total Vintage Quantity', 'Quantity Issued'])
376 | .set_vcs_vintage_year(date_column='Vintage End')
377 | .convert_to_datetime(columns=['transaction_date'], dayfirst=True)
378 | )
379 |
380 | # Apply calculate_vcs_issuances
381 | issuances = calculate_vcs_issuances(processed_data)
382 |
383 | # Assertions
384 | # Ensure duplicates are removed based on the specified columns
385 | assert issuances.duplicated(subset=['vintage', 'project_id', 'quantity']).sum() == 0
386 |
387 | # Ensure the 'quantity' column is correctly populated
388 | assert 'quantity' in issuances.columns
389 |
390 | # Ensure 'transaction_type' is set to 'issuance'
391 | assert all(issuances['transaction_type'] == 'issuance')
392 |
393 |
394 | def test_calculate_vcs_retirements(vcs_transactions):
395 | # Process the vcs_transactions similar to process_vcs_credits
396 | processed_data = (
397 | vcs_transactions.set_registry(registry_name='verra')
398 | .generate_vcs_project_ids(prefix='VCS')
399 | .determine_vcs_transaction_type(date_column='Retirement/Cancellation Date')
400 | .set_vcs_transaction_dates(
401 | date_column='Retirement/Cancellation Date', fallback_column='Issuance Date'
402 | )
403 | .clean_and_convert_numeric_columns(columns=['Total Vintage Quantity', 'Quantity Issued'])
404 | .set_vcs_vintage_year(date_column='Vintage End')
405 | .convert_to_datetime(columns=['transaction_date'], dayfirst=True)
406 | )
407 |
408 | # Apply calculate_vcs_retirements
409 | retirements = calculate_vcs_retirements(processed_data)
410 |
411 | # Assertions
412 | # Check if 'retirement' and 'cancellation' types are present and 'issuance' types are filtered out
413 | assert all(retirements['transaction_type'].str.contains('retirement'))
414 |
415 | # Ensure the 'quantity' column is correctly renamed
416 | assert 'quantity' in retirements.columns
417 | assert 'Quantity Issued' not in retirements.columns
418 |
419 |
420 | def test_generate_vcs_project_ids(vcs_projects):
421 | df = vcs_projects
422 | df = generate_vcs_project_ids(df, prefix='VCS')
423 | assert df['project_id'].tolist() == [
424 | 'VCS75',
425 | 'VCS2498',
426 | 'VCS101',
427 | 'VCS3408',
428 | 'VCS1223',
429 | ]
430 |
431 |
432 | def test_add_vcs_compliance_projects(vcs_projects):
433 | original_length = len(vcs_projects)
434 | df = add_vcs_compliance_projects(vcs_projects)
435 |
436 | # Check if two new rows are added
437 | assert len(df) == original_length + 2
438 |
439 | # Optionally, check for the presence of specific project details
440 | assert 'VCSOPR2' in df['project_id'].values
441 | assert 'VCSOPR10' in df['project_id'].values
442 |
443 |
444 | def test_process_vcs_projects(vcs_projects, vcs_transactions):
445 | vcs_credits = process_vcs_credits(vcs_transactions, harmonize_beneficiary_info=False)
446 | df = process_vcs_projects(
447 | vcs_projects, credits=vcs_credits, registry_name='verra', download_type='projects'
448 | )
449 |
450 | assert 'listed_at' in df.columns
451 | # check project_url series
452 | assert df['project_url'].tolist() == [
453 | 'https://registry.verra.org/app/projectDetail/VCS/75',
454 | 'https://registry.verra.org/app/projectDetail/VCS/2498',
455 | 'https://registry.verra.org/app/projectDetail/VCS/101',
456 | 'https://registry.verra.org/app/projectDetail/VCS/3408',
457 | 'https://registry.verra.org/app/projectDetail/VCS/1223',
458 | 'https://registry.verra.org/app/projectDetail/VCS/2265', # From add_vcs_compliance_projects
459 | 'https://registry.verra.org/app/projectDetail/VCS/2271', # From add_vcs_compliance_projects
460 | ]
461 | # check project_id series
462 | assert df['project_id'].tolist() == [
463 | 'VCS75',
464 | 'VCS2498',
465 | 'VCS101',
466 | 'VCS3408',
467 | 'VCS1223',
468 | 'VCSOPR2', # From add_vcs_compliance_projects
469 | 'VCSOPR10', # From add_vcs_compliance_projects
470 | ]
471 |
472 |
473 | def test_process_vcs_projects_with_totals_and_dates(vcs_projects, vcs_transactions):
474 | # Process the vcs_transactions through the credits pipeline
475 | # so that per-project totals and dates can be derived below
476 | vcs_credits = process_vcs_credits(vcs_transactions, harmonize_beneficiary_info=False)
477 |
478 | # Process the vcs_projects
479 | processed_projects = process_vcs_projects(
480 | vcs_projects, credits=vcs_credits, registry_name='verra', download_type='projects'
481 | )
482 |
483 | # Assertions for retired and issued totals, and first issuance/retirement dates
484 | # Expected values below are derived from the vcs_transactions fixture for project VCS2498
485 | project_id = 'VCS2498'
486 |
487 | # Extract the row for the specific project
488 | project_data = processed_projects[processed_projects['project_id'] == project_id]
489 |
490 | # Assert the total issued and retired quantities
491 | expected_total_issued = 435078  # 99,870 + 82,455 + 55,805 + 99,871 + 97,077 from the vcs_transactions fixture
492 | expected_total_retired = 19549  # 5,000 + 1,788 + 725 + 1,413 + 297 + 1,380 + 8,946 from the vcs_transactions fixture
493 | assert project_data['issued'].iloc[0] == expected_total_issued
494 | assert project_data['retired'].iloc[0] == expected_total_retired
495 |
496 | assert isinstance(project_data['first_issuance_at'].iloc[0], pd.Timestamp)
497 | assert isinstance(project_data['first_retirement_at'].iloc[0], pd.Timestamp)
498 |
--------------------------------------------------------------------------------