├── tests ├── __init__.py ├── ml │ ├── __init__.py │ ├── test_utils.py │ ├── test_object_detection.py │ └── test_image_classification.py └── unit │ ├── __init__.py │ ├── test_types.py │ ├── test_api_config.py │ ├── utils │ ├── test_text.py │ └── test_utils.py │ ├── test_ocr.py │ ├── test_barcode.py │ ├── test_taxonomy.py │ ├── test_redis.py │ └── test_api.py ├── LICENSE ├── openfoodfacts ├── py.typed ├── __init__.py ├── ml │ ├── utils.py │ ├── triton.py │ ├── image_classification.py │ └── object_detection.py ├── ingredients.py ├── barcode.py ├── utils │ ├── text.py │ └── __init__.py ├── dataset.py ├── images.py ├── redis.py └── taxonomy.py ├── .release-please-manifest.json ├── MANIFEST.in ├── .coveragerc ├── release-please-config.json ├── .flake8 ├── .editorconfig ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── CODEOWNERS ├── workflows │ ├── auto-assign-pr.yml │ ├── reuse.yaml │ ├── semantic-pr.yml │ ├── publish-conda.yml │ ├── merge-conflict-autolabel.yml │ ├── pypi.yml │ ├── release-please.yml │ ├── label.yml │ ├── generate-docs.yml │ ├── ci.yml │ ├── codeql-analysis.yml │ └── github-projects.yml ├── dependabot.yml └── labeler.yml ├── .pre-commit-config.yaml ├── REUSE.toml ├── conda └── meta.yaml ├── mkdocs.yml ├── REUSE.md ├── .gitignore ├── LICENSES ├── MIT.txt └── CC0-1.0.txt ├── pyproject.toml ├── docs ├── handle_taxonomies.md └── usage.md └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LICENSES/MIT.txt -------------------------------------------------------------------------------- /openfoodfacts/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.release-please-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | ".": "3.3.0" 3 | } -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | global-include *.typed 3 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = ./openfoodfacts 3 | omit = ./venv/*,*tests* 4 | 5 | [report] 6 | omit = ./venv/*,*tests*,*mi 7 | -------------------------------------------------------------------------------- /release-please-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "packages": { 3 | ".": { 4 | "release-type": "python" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E501, W503 3 | max-line-length = 88 4 | exclude = 
.git,__pycache__,build,dist,*_pb2.py,.venv 5 | per-file-ignores = 6 | robotoff/cli/main.py:B008 7 | max-doc-length = 79 -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = space 6 | indent_size = 4 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | - Brief note about the issue 4 | 5 | ## Solution 6 | 7 | - Mention how your solution resolves the issue 8 | 9 | ## Related issue(s) 10 | 11 | - Fixes #[ISSUE NUMBER] -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2020 Free Software Foundation Europe e.V. 2 | # SPDX-License-Identifier: CC0-1.0 3 | repos: 4 | - repo: https://github.com/fsfe/reuse-tool 5 | rev: v5.0.2 6 | hooks: 7 | - id: reuse-lint-file 8 | -------------------------------------------------------------------------------- /REUSE.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | # Catch-all entry; overridden by specific claims in individual files 4 | [[annotations]] 5 | path = ["**"] 6 | SPDX-FileCopyrightText = "Copyright (c) 2016-2025 Open Food Facts <contact@openfoodfacts.org>" 7 | SPDX-License-Identifier = "MIT" 8 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, they will be 3 | # requested for review when someone opens a pull request. 4 | # For more on how to customize the CODEOWNERS file - https://help.github.com/en/articles/about-code-owners 5 | 6 | * @openfoodfacts/openfoodfacts-python 7 | -------------------------------------------------------------------------------- /.github/workflows/auto-assign-pr.yml: -------------------------------------------------------------------------------- 1 | # .github/workflows/auto-author-assign.yml 2 | name: 'Auto Author Assign' 3 | 4 | on: 5 | pull_request_target: 6 | types: [opened, reopened] 7 | 8 | permissions: 9 | pull-requests: write 10 | 11 | jobs: 12 | assign-author: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: toshimaru/auto-author-assign@v2.1.1 16 | -------------------------------------------------------------------------------- /.github/workflows/reuse.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2020 Free Software Foundation Europe e.V. 
2 | # SPDX-License-Identifier: CC0-1.0 3 | name: REUSE Compliance Check 4 | 5 | on: [push, pull_request] 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v5 12 | - name: REUSE Compliance Check 13 | uses: fsfe/reuse-action@v5 14 | -------------------------------------------------------------------------------- /.github/workflows/semantic-pr.yml: -------------------------------------------------------------------------------- 1 | name: "Semantic PRs" 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - opened 7 | - edited 8 | - synchronize 9 | 10 | jobs: 11 | main: 12 | name: Validate PR title 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: amannn/action-semantic-pull-request@v6 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set data = load_setup_py_data() %} 2 | 3 | package: 4 | name: openfoodfacts 5 | version: {{ data['version'] }} 6 | 7 | source: 8 | path: .. 9 | 10 | build: 11 | number: 0 12 | script: python -m pip install . 13 | 14 | requirements: 15 | host: 16 | - pip 17 | - python 18 | run: 19 | - python 20 | - requests >=2.20.0 21 | 22 | test: 23 | imports: 24 | - openfoodfacts 25 | 26 | about: 27 | home: {{ data['url'] }} 28 | license: {{ data['license'] }} 29 | summary: {{ data['description'] }} 30 | -------------------------------------------------------------------------------- /.github/workflows/publish-conda.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Conda 2 | 3 | on: 4 | # Triggers the workflow on a new release 5 | release: 6 | types: [created] 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout repo 13 | uses: actions/checkout@v5 14 | - name: publish-to-conda 15 | uses: MichaelsJP/conda-package-publish-action@v1.1.0 16 | with: 17 | subDir: 'conda' 18 | AnacondaToken: ${{ secrets.ANACONDA_TOKEN }} 19 | platforms: 'all' 20 | override: true -------------------------------------------------------------------------------- /.github/workflows/merge-conflict-autolabel.yml: -------------------------------------------------------------------------------- 1 | name: '💥 Auto-Label Merge Conflicts on PRs' 2 | on: 3 | push: 4 | branches: 5 | - develop 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | triage: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: mschilde/auto-label-merge-conflicts@5981f8933e92b78098af86b9e33fe0871cc7a3be # v2.0 (2020-01-27) 16 | with: 17 | CONFLICT_LABEL_NAME: "💥 Merge Conflicts" 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | MAX_RETRIES: 5 20 | WAIT_MS: 5000 21 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish PyPI package 2 | on: 3 | push: 4 | tags: 5 | - v*.*.* 6 | 7 | jobs: 8 | push_to_pypi: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out the repo 12 | uses: actions/checkout@v5 13 | - name: Set up python 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: 3.9 17 | - uses: abatilo/actions-poetry@v4.0.0 18 | with: 19 | poetry-version: 2.1.3 20 | - name: Run poetry build 21 | run: poetry build 22 | - name: Run poetry publish 23 | run: 
POETRY_PYPI_TOKEN_PYPI=${{ secrets.PYPI_TOKEN }} poetry publish 24 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: "pip" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | 9 | commit-message: 10 | prefix: "chore" 11 | include: "scope" 12 | 13 | open-pull-requests-limit: 1 14 | 15 | groups: 16 | all-dependencies: 17 | patterns: 18 | - "*" 19 | 20 | - package-ecosystem: "github-actions" 21 | directory: "/" 22 | schedule: 23 | interval: "monthly" 24 | 25 | commit-message: 26 | prefix: "chore" 27 | include: "scope" 28 | 29 | open-pull-requests-limit: 1 30 | 31 | groups: 32 | all-actions: 33 | patterns: 34 | - "*" 35 | -------------------------------------------------------------------------------- /openfoodfacts/__init__.py: -------------------------------------------------------------------------------- 1 | from openfoodfacts.barcode import normalize_barcode 2 | 3 | from .api import API 4 | from .dataset import ProductDataset, get_dataset 5 | from .ocr import OCRResult 6 | from .types import ( 7 | APIConfig, 8 | APIVersion, 9 | Country, 10 | DatasetType, 11 | Environment, 12 | Facet, 13 | Flavor, 14 | Lang, 15 | ) 16 | 17 | __all__ = [ 18 | "API", 19 | "APIConfig", 20 | "APIVersion", 21 | "Country", 22 | "DatasetType", 23 | "Facet", 24 | "Flavor", 25 | "Environment", 26 | "Lang", 27 | "OCRResult", 28 | "ProductDataset", 29 | "get_dataset", 30 | "normalize_barcode", 31 | ] 32 | 33 | __version__ = "3.3.0" 34 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - develop 5 | name: release-please 6 | jobs: 7 | release-please: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: GoogleCloudPlatform/release-please-action@v4.3 11 | with: 12 | # We can't use GITHUB_TOKEN here because GitHub Actions cannot trigger other workflows with it 13 | # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#using-the-github_token-in-a-workflow 14 | # So this is a personal access token 15 | token: ${{ secrets.RELEASE_PLEASE_TOKEN }} 16 | release-type: python 17 | package-name: openfoodfacts 18 | -------------------------------------------------------------------------------- /.github/workflows/label.yml: -------------------------------------------------------------------------------- 1 | # This workflow will triage pull requests and apply a label based on the 2 | # paths that are modified in the pull request. 3 | # 4 | # To use this workflow, you will need to set up a .github/labeler.yml 5 | # file with configuration. 
For more information, see: 6 | # https://github.com/actions/labeler 7 | 8 | name: Labeler 9 | on: 10 | - pull_request_target 11 | 12 | jobs: 13 | label: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | contents: read 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/labeler@v5.0.0 22 | if: github.event.pull_request.head.repo.full_name == github.repository 23 | with: 24 | repo-token: "${{ secrets.GITHUB_TOKEN }}" 25 | -------------------------------------------------------------------------------- /tests/unit/test_types.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from openfoodfacts.types import Flavor 4 | 5 | 6 | def test_from_product_type_food(): 7 | assert Flavor.from_product_type("food") == Flavor.off 8 | 9 | 10 | def test_from_product_type_beauty(): 11 | assert Flavor.from_product_type("beauty") == Flavor.obf 12 | 13 | 14 | def test_from_product_type_petfood(): 15 | assert Flavor.from_product_type("petfood") == Flavor.opff 16 | 17 | 18 | def test_from_product_type_product(): 19 | assert Flavor.from_product_type("product") == Flavor.opf 20 | 21 | 22 | def test_from_product_type_invalid(): 23 | with pytest.raises( 24 | ValueError, match="no Flavor matched with product_type 'invalid'" 25 | ): 26 | Flavor.from_product_type("invalid") 27 | -------------------------------------------------------------------------------- /tests/unit/test_api_config.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pydantic_core 4 | 5 | import openfoodfacts 6 | 7 | 8 | class TestAPIConfig(unittest.TestCase): 9 | def test_valid_user_agent(self): 10 | config = openfoodfacts.APIConfig(user_agent="Valid User Agent") 11 | assert config.user_agent == "Valid User Agent" 12 | 13 | def test_invalid_user_agent_type(self): 14 | with self.assertRaises(pydantic_core.ValidationError) as ctx: 15 | openfoodfacts.APIConfig(user_agent=None) 16 | self.assertIn("valid string", str(ctx.exception)) 17 | 18 | def test_blank_user_agent(self): 19 | with self.assertRaises(pydantic_core.ValidationError) as ctx: 20 | openfoodfacts.APIConfig(user_agent="") 21 | self.assertIn("cannot be empty", str(ctx.exception)) 22 | -------------------------------------------------------------------------------- /.github/workflows/generate-docs.yml: -------------------------------------------------------------------------------- 1 | name: Generate Automatic Documentation 2 | 3 | on: 4 | # Triggers the workflow on push 5 | push: 6 | branches: 7 | - actions-dev 8 | - develop 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v5 15 | with: 16 | fetch-depth: 0 17 | - uses: actions/setup-python@v5 18 | with: 19 | python-version: 3.8 20 | - name: Copy README.md to docs 21 | run: cp ./README.md ./docs/index.md 22 | - name: Install dependencies 23 | run: pip install --upgrade pip && pip install mkdocs mkdocs-gen-files mkdocs-material 24 | - run: git config user.name 'github-actions[bot]' && git config user.email 'github-actions[bot]@users.noreply.github.com' 25 | - name: Publish docs 26 | run: mkdocs gh-deploy -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | docs_dir: docs 2 | site_name: openfoodfacts-python 3 | site_url: https://openfoodfacts.github.io/openfoodfacts-python 4 | edit_uri: edit/develop/docs/ 5 | site_dir: gh_pages 6 | 7 | 
nav: 8 | - Home: 'index.md' 9 | - Usage: 'usage.md' 10 | - "Handle taxonomies": 'handle_taxonomies.md' 11 | 12 | theme: 13 | name: material 14 | features: 15 | - content.action.edit 16 | logo: https://static.openfoodfacts.org/images/logos/off-logo-horizontal-light.svg 17 | favicon: https://static.openfoodfacts.org/images/logos/off-logo-favicon-light.png 18 | palette: 19 | # Palette toggle for light mode 20 | - scheme: default 21 | toggle: 22 | icon: material/brightness-7 23 | name: Switch to dark mode 24 | # Palette toggle for dark mode 25 | - scheme: slate 26 | toggle: 27 | icon: material/brightness-4 28 | name: Switch to light mode 29 | -------------------------------------------------------------------------------- /REUSE.md: -------------------------------------------------------------------------------- 1 | ## Applications using this Python SDK 2 | 3 | ### Official applications 4 | 5 | - Robotoff: https://github.com/openfoodfacts/robotoff uses this to build the ML system of Open Food Facts. 6 | - Open Prices: https://github.com/openfoodfacts/open-prices uses this to handle many operations related to products 7 | 8 | ### Targets 9 | 10 | - Folksonomy Engine: 11 | - Nutri-Patrol: 12 | - Taxonomy Editor: 13 | - Facets Knowledge Panels: 14 | 15 | ### Third party applications 16 | 17 | Feel free to [open a PR to add your application to this list](https://github.com/openfoodfacts/openfoodfacts-python/edit/develop/REUSE.md). 18 | Please get in touch at reuse@openfoodfacts.org. 19 | We are very interested in learning what the Open Food Facts data is used for. It is not mandatory, but we would very much appreciate it if you told us about your re-uses (https://forms.gle/hwaeqBfs8ywwhbTg8) so that we can share them with the Open Food Facts community. You can also fill in the same form for a chance to have your app featured: https://forms.gle/hwaeqBfs8ywwhbTg8 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | .venv 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | # Ipython Notebook 63 | .ipynb_checkpoints 64 | 65 | # OS related files 66 | .DS_Store -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | github_actions: 2 | - changed-files: 3 | - any-glob-to-any-file: '.github/**/*' 4 | 5 | REDIS: 6 | - changed-files: 7 | - any-glob-to-any-file: 'tests/test_redis.py' 8 | - any-glob-to-any-file: 'openfoodfacts/redis.py' 9 | 10 | images: 11 | - changed-files: 12 | - any-glob-to-any-file: 'openfoodfacts/images.py' 13 | - any-glob-to-any-file: 'tests/test_images.py' 14 | 15 | OCR: 16 | - changed-files: 17 | - any-glob-to-any-file: 'tests/test_ocr.py' 18 | 19 | tests: 20 | - changed-files: 21 | - any-glob-to-any-file: 'tests/test_utils.py' 22 | - any-glob-to-any-file: 'tests/test_api.py' 23 | - any-glob-to-any-file: 'tests/test_ocr.py' 24 | - any-glob-to-any-file: 'tests/test_redis.py' 25 | - any-glob-to-any-file: 'tests/test_images.py' 26 | - any-glob-to-any-file: 'tests/test_api_config.py' 27 | 28 | utils: 29 | - changed-files: 30 | - any-glob-to-any-file: 'openfoodfacts/utils.py' 31 | - any-glob-to-any-file: 'tests/test_utils.py' 32 | 33 | dependencies: 34 | - changed-files: 35 | - any-glob-to-any-file: 'poetry.lock' 36 | 37 | documentation: 38 | - changed-files: 39 | - any-glob-to-any-file: 'handle_taxonomies.md' 40 | - any-glob-to-any-file: 'usage.md' 41 | -------------------------------------------------------------------------------- /LICENSES/MIT.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2016 OpenFoodFacts, Inc. http://openfoodfacts.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /openfoodfacts/ml/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | 5 | def convert_image_to_array(image: Image.Image) -> np.ndarray: 6 | """Convert a PIL Image into a numpy array. 7 | 8 | The image is converted to RGB if needed before generating the array. 9 | 10 | :param image: the input image. 11 | :return: the generated numpy array of shape (height, width, 3) 12 | """ 13 | if image.mode != "RGB": 14 | image = image.convert("RGB") 15 | 16 | (im_width, im_height) = image.size 17 | 18 | return np.array(image.getdata(), dtype=np.uint8).reshape((im_height, im_width, 3)) 19 | 20 | 21 | def resize_image(image: Image.Image, max_size: tuple[int, int]) -> Image.Image: 22 | """Resize an image to fit within the specified dimensions. 23 | 24 | :param image: the input image 25 | :param max_size: the maximum width and height as a tuple 26 | :return: the resized image, or the original image if it fits within the 27 | specified dimensions 28 | """ 29 | width, height = image.size 30 | max_width, max_height = max_size 31 | 32 | if width > max_width or height > max_height: 33 | new_image = image.copy() 34 | new_image.thumbnail((max_width, max_height)) 35 | return new_image 36 | 37 | return image 38 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Python SDK quality checks and unit tests 2 | 3 | on: 4 | push: 5 | paths: 6 | - "openfoodfacts/**" 7 | - "pyproject.toml" 8 | - "poetry.lock" 9 | - "tests/**" 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.9] 17 | steps: 18 | - uses: actions/checkout@v5 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install Poetry 23 | uses: snok/install-poetry@v1.4 24 | with: 25 | virtualenvs-create: true 26 | virtualenvs-in-project: true 27 | - name: Load cached venv 28 | id: cached-poetry-dependencies 29 | uses: actions/cache@v4 30 | with: 31 | path: .venv 32 | key: venv-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} 33 | - name: Install dependencies 34 | run: poetry install --with=dev --all-extras 35 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 36 | 37 | - name: Launch quality checks 38 | run: | 39 | poetry run flake8 . 40 | poetry run black --check . 41 | poetry run mypy . 42 | poetry run isort --check . 43 | - name: Test with pytest 44 | run: | 45 | poetry run pytest tests 46 | -------------------------------------------------------------------------------- /openfoodfacts/ingredients.py: -------------------------------------------------------------------------------- 1 | from openfoodfacts.taxonomy import Taxonomy 2 | from openfoodfacts.types import JSONType 3 | 4 | 5 | def add_ingredient_in_taxonomy_field( 6 | parsed_ingredients: list[JSONType], ingredient_taxonomy: Taxonomy 7 | ) -> tuple[int, int]: 8 | """Add the `in_taxonomy` field to each ingredient in `parsed_ingredients`. 9 | 10 | This function is called recursively to add the `in_taxonomy` field to each 11 | sub-ingredient. It returns the total number of ingredients and the number 12 | of known ingredients (including sub-ingredients). 
13 | 14 | :param parsed_ingredients: a list of parsed ingredients, in Product Opener 15 | format 16 | :param ingredient_taxonomy: the ingredient taxonomy 17 | :return: a (total_ingredients_n, known_ingredients_n) tuple 18 | """ 19 | ingredients_n = 0 20 | known_ingredients_n = 0 21 | for ingredient_data in parsed_ingredients: 22 | ingredient_id = ingredient_data["id"] 23 | in_taxonomy = ingredient_id in ingredient_taxonomy 24 | ingredient_data["in_taxonomy"] = in_taxonomy 25 | known_ingredients_n += int(in_taxonomy) 26 | ingredients_n += 1 27 | 28 | if "ingredients" in ingredient_data: 29 | ( 30 | sub_ingredients_n, 31 | known_sub_ingredients_n, 32 | ) = add_ingredient_in_taxonomy_field( 33 | ingredient_data["ingredients"], ingredient_taxonomy 34 | ) 35 | ingredients_n += sub_ingredients_n 36 | known_ingredients_n += known_sub_ingredients_n 37 | 38 | return ingredients_n, known_ingredients_n 39 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "Code scanning - action" 2 | 3 | on: 4 | pull_request: 5 | schedule: 6 | - cron: '0 9 * * 1' 7 | 8 | jobs: 9 | CodeQL-Build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v5 16 | with: 17 | # We must fetch at least the immediate parents so that if this is 18 | # a pull request then we can checkout the head. 19 | fetch-depth: 2 20 | 21 | # If this run was triggered by a pull request event, then checkout 22 | # the head of the pull request instead of the merge commit. 23 | - run: git checkout HEAD^2 24 | if: ${{ github.event_name == 'pull_request' }} 25 | 26 | # Initializes the CodeQL tools for scanning. 27 | - name: Initialize CodeQL 28 | uses: github/codeql-action/init@v3 29 | # Override language selection by uncommenting this and choosing your languages 30 | # with: 31 | # languages: go, javascript, csharp, python, cpp, java 32 | 33 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 34 | # If this step fails, then you should remove it and run the build manually (see below) 35 | - name: Autobuild 36 | uses: github/codeql-action/autobuild@v3 37 | 38 | # ℹ️ Command-line programs to run using the OS shell. 
39 | # 📚 https://git.io/JvXDl 40 | 41 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 42 | # and modify them (or add more) to build your code if your project 43 | # uses a compiled language 44 | 45 | #- run: | 46 | # make bootstrap 47 | # make release 48 | 49 | - name: Perform CodeQL Analysis 50 | uses: github/codeql-action/analyze@v3 51 | -------------------------------------------------------------------------------- /tests/unit/utils/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from openfoodfacts.utils.text import get_tag, replace_lang_prefix 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "value,output", 8 | [ 9 | ("Reflets de France", "reflets-de-france"), 10 | ("écrasé", "ecrase"), 11 | ("œufs de plein air", "oeufs-de-plein-air"), 12 | ("dr.oetker", "dr-oetker"), 13 | ("mat & lou", "mat-lou"), 14 | ("monop'daily", "monop-daily"), 15 | ("épi d'or", "epi-d-or"), 16 | ("Health Star Rating 0.5", "health-star-rating-0-5"), 17 | ("C'est qui le Patron ?!", "c-est-qui-le-patron"), 18 | ("fr: Gésiers", "fr:gesiers"), 19 | ("ar: تفاح", "ar:تفاح"), 20 | ("تفاح", "تفاح"), 21 | ], 22 | ) 23 | def test_get_tag(value: str, output: str): 24 | assert get_tag(value) == output 25 | 26 | 27 | @pytest.mark.parametrize( 28 | "tag,new_lang_prefix,output", 29 | [ 30 | ("fr:gesiers", "en", "en:gesiers"), 31 | ("fr:gesiers", "fr", "fr:gesiers"), 32 | ("fr:gesiers", "ar", "ar:gesiers"), 33 | ("en:apple", "fr", "fr:apple"), 34 | ("xx:sashimi", "it", "it:sashimi"), 35 | ("xx:sashimi", "xx", "xx:sashimi"), 36 | ], 37 | ) 38 | def test_replace_lang_prefix(tag, new_lang_prefix, output): 39 | assert replace_lang_prefix(tag, new_lang_prefix) == output 40 | 41 | 42 | def test_replace_lang_prefix_invalid_new_lang_prefix(): 43 | with pytest.raises(ValueError, match="new_lang_prefix 'a' must be a 2-letter code"): 44 | replace_lang_prefix("en:apples", "a") 45 | 46 | 47 | def test_replace_lang_prefix_invalid_tag(): 48 | with pytest.raises( 49 | ValueError, match="tag 'e:apples' has an invalid language prefix" 50 | ): 51 | replace_lang_prefix("e:apples", "fr") 52 | -------------------------------------------------------------------------------- /openfoodfacts/barcode.py: -------------------------------------------------------------------------------- 1 | def normalize_barcode(barcode: str) -> str: 2 | """Normalize the barcode. 3 | 4 | First, we remove leading zeros, then we pad the barcode with zeros to 5 | reach 8 digits. 6 | 7 | If the barcode is longer than 8 digits, we pad it to 13 digits. 8 | 9 | :param barcode: the barcode to normalize 10 | :return: the normalized barcode 11 | """ 12 | barcode = barcode.lstrip("0").zfill(8) 13 | 14 | if len(barcode) > 8: 15 | barcode = barcode.zfill(13) 16 | 17 | return barcode 18 | 19 | 20 | def has_valid_check_digit(gtin: str) -> bool: 21 | """Check if the GTIN has a valid check-digit. 22 | 23 | The full GTIN (with the check-digit) is passed as an argument. 24 | The function returns True if the check-digit is valid, False otherwise. 25 | """ 26 | if len(gtin) < 2: 27 | raise ValueError(f"invalid gtin: '{gtin}'") 28 | return calculate_check_digit(gtin) == gtin[-1] 29 | 30 | 31 | def calculate_check_digit(gtin: str) -> str: 32 | """Compute the check-digit of a GTIN. 33 | 34 | The full GTIN (including its current check-digit, which is ignored) is 35 | passed as an argument. The computed check-digit is returned as a string. 
36 | 37 | The check-digit is computed from the preceding digits by multiplying the 38 | sum of every 2nd digit *from right to left* by 3, adding that to the sum 39 | of all the other digits (1st, 3rd, etc.), taking the result modulo 10 40 | (the remainder after dividing by 10), and subtracting *that* result 41 | *from* 10 (a result of 10 maps to a check-digit of 0). 42 | """ 43 | # Remove the last digit (checksum) 44 | gtin = gtin[:-1] 45 | # Reverse the digits 46 | digits = tuple(d for d in reversed(gtin)) 47 | return str( 48 | 10 49 | - ( # From 10 we subtract 50 | ( 51 | ( 52 | sum(int(d) for d in digits[::2]) * 3 53 | ) # The sum of every 2nd digit, multiplied by 3 54 | + ( 55 | sum(int(d) for d in digits[1::2]) 56 | ) # The sum of every 2nd digit, offset by 1 57 | ) 58 | % 10 # Modulo 10 (the remainder after dividing by 10) 59 | ) 60 | )[-1] 61 | -------------------------------------------------------------------------------- /tests/unit/test_ocr.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import pytest 4 | 5 | from openfoodfacts.ocr import OCRResult 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "ocr_url, bounding_box, expected_text", 10 | [ 11 | ( 12 | # It corresponds to this OCR crop: 13 | # https://robotoff.openfoodfacts.org/api/v1/images/crop?image_url=https://images.openfoodfacts.org/images/products/089/000/000/1202/1.jpg&y_min=0.08416666666666667&x_min=0.30077691453940064&y_max=0.09583333333333334&x_max=0.37735849056603776 14 | "https://raw.githubusercontent.com/openfoodfacts/test-data/main/openfoodfacts-python/tests/unit/0890000001202_1.json", 15 | [101, 271, 115, 340], 16 | "Materne", 17 | ), 18 | ( 19 | # same, but the bounding box is distinct from the logo area 20 | "https://raw.githubusercontent.com/openfoodfacts/test-data/main/openfoodfacts-python/tests/unit/0890000001202_1.json", 21 | [120, 271, 134, 340], 22 | None, 23 | ), 24 | ( 25 | # [0.2808293402194977,0.37121888995170593,0.35544055700302124,0.49409016966819763] 26 | # /540/091/030/1160/1.jpg 27 | "https://raw.githubusercontent.com/openfoodfacts/test-data/main/openfoodfacts-python/tests/unit/5400910301160_1.json", 28 | [337, 327, 427, 436], 29 | "NUTRIDIA", 30 | ), 31 | ], 32 | ) 33 | def test_get_words_in_area( 34 | ocr_url: str, bounding_box: Tuple[int, int, int, int], expected_text: Optional[str] 35 | ): 36 | ocr_result = OCRResult.from_url(ocr_url) 37 | assert ocr_result is not None 38 | words = ocr_result.get_words_in_area(bounding_box) 39 | 40 | if expected_text is None: 41 | assert words == [] 42 | else: 43 | assert words is not None 44 | assert len(words) == 1 45 | assert words[0].text.strip() == expected_text 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "openfoodfacts" 3 | version = "3.3.0" 4 | authors = [ 5 | {name = "The Open Food Facts team", email = "contact@openfoodfacts.org"} 6 | ] 7 | description = "Official Python SDK of Open Food Facts" 8 | readme = "README.md" 9 | license = "MIT" 10 | requires-python = ">=3.10" 11 | dependencies = [ 12 | "requests>=2.20.0", 13 | "pydantic>=2.0.0,<3.0.0", 14 | "tqdm>=4.0.0,<5.0.0", 15 | ] 16 | dynamic = ["classifiers"] 
17 | 18 | [tool.poetry] 19 | include = [ 20 | {path = "tests", format = "sdist"}, 21 | ] 22 | classifiers = [ 23 | "Development Status :: 5 - Production/Stable", 24 | "Intended Audience :: Developers", 25 | "Natural Language :: English", 26 | "Programming Language :: Python :: Implementation :: CPython", 27 | "Programming Language :: Python :: Implementation :: PyPy", 28 | ] 29 | 30 | [project.urls] 31 | repository = "https://github.com/openfoodfacts/openfoodfacts-python" 32 | 33 | [tool.mypy] 34 | ignore_missing_imports = true 35 | 36 | [tool.isort] # From https://black.readthedocs.io/en/stable/compatible_configs.html#isort 37 | multi_line_output = 3 38 | include_trailing_comma = true 39 | force_grid_wrap = 0 40 | use_parentheses = true 41 | ensure_newline_before_comments = true 42 | line_length = 88 43 | 44 | [tool.poetry.dependencies] 45 | python = ">=3.10,<4.0" 46 | redis = { version = "~6.4.0", optional = true, extras = ["hiredis"] } 47 | Pillow = { version = ">=9.3,<12", optional = true } 48 | tritonclient = {extras = ["grpc"], version = ">2.0.0,<3.0.0", optional = true} 49 | opencv-python-headless = {version = ">4.0.0,<5.0.0", optional = true} 50 | 51 | [tool.poetry.group.dev.dependencies] 52 | requests-mock = "1.12.1" 53 | flake8 = "7.3.0" 54 | black = "25.1.0" 55 | mypy = "1.17.1" 56 | isort = "6.0.1" 57 | coverage = {version = "7.10.4", extras = ["toml"]} 58 | pytest = "8.4.1" 59 | types-requests = "2.32.4.20250809" 60 | types-tqdm = "4.67.0.20250809" 61 | types-redis = "^4.6.0.20240425" 62 | 63 | [project.optional-dependencies] 64 | redis = ["redis"] 65 | pillow = ["Pillow"] 66 | ml = ["tritonclient[grpc]", "opencv-python-headless", "Pillow", "albumentations>=2.0.0"] 67 | 68 | [build-system] 69 | requires = ["poetry-core"] 70 | build-backend = "poetry.core.masonry.api" 71 | -------------------------------------------------------------------------------- /tests/unit/test_barcode.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from openfoodfacts.barcode import ( 4 | calculate_check_digit, 5 | has_valid_check_digit, 6 | normalize_barcode, 7 | ) 8 | 9 | 10 | def test_normalize_barcode_remove_leading_zeros(): 11 | assert normalize_barcode("00012345") == "00012345" 12 | assert normalize_barcode("00000001") == "00000001" 13 | 14 | 15 | def test_normalize_barcode_pad_to_8_digits(): 16 | assert normalize_barcode("123") == "00000123" 17 | assert normalize_barcode("1") == "00000001" 18 | 19 | 20 | def test_normalize_barcode_pad_to_13_digits(): 21 | assert normalize_barcode("123456789") == "0000123456789" 22 | assert normalize_barcode("123456789012") == "0123456789012" 23 | 24 | 25 | def test_normalize_barcode_no_change_needed(): 26 | assert normalize_barcode("12345678") == "12345678" 27 | assert normalize_barcode("1234567890123") == "1234567890123" 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "gtin,expected", 32 | [ 33 | ("3017620422003", "3"), 34 | ("8901234567890", "0"), 35 | ("101011", "1"), 36 | ("000101011", "1"), 37 | ("0000000101011", "1"), 38 | ("5678989012342", "2"), 39 | ("829573994253", "3"), 40 | ("59366631014", "4"), 41 | ("150599289765", "5"), 42 | ("9012345678906", "6"), 43 | ("360131017", "7"), 44 | ("1234567890128", "8"), 45 | ("10061282", "2"), 46 | ], 47 | ) 48 | def test_calculate_check_digit(gtin, expected): 49 | assert calculate_check_digit(gtin) == expected 50 | 51 | 52 | @pytest.mark.parametrize( 53 | "gtin,expected", 54 | [ 55 | ("3017620422003", True), 56 | ("0204341706595", True), 57 | 
("5707196311419", True), 58 | ("5701018060158", True), 59 | ("5016451522591", True), 60 | ("5741000224168", True), 61 | ("5741000224168", True), 62 | ("0256844308646", True), 63 | ("0083012245843", True), 64 | ("5741000224161", False), 65 | # EAN8 66 | ("10061282", True), 67 | ("10061283", False), 68 | ("0000010061282", True), 69 | ("29428984", True), 70 | ], 71 | ) 72 | def test_has_valid_check_digit(gtin, expected): 73 | assert has_valid_check_digit(gtin) is expected 74 | -------------------------------------------------------------------------------- /tests/ml/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | from openfoodfacts.ml.utils import convert_image_to_array, resize_image 5 | 6 | 7 | class TestConvertImageToArray: 8 | def test_rgb(self): 9 | # Create a simple RGB image 10 | image = Image.new("RGB", (10, 10), color="red") 11 | array = convert_image_to_array(image) 12 | 13 | assert array.shape == (10, 10, 3) 14 | assert array.dtype == np.uint8 15 | assert (array == [255, 0, 0]).all() 16 | 17 | def test_non_rgb(self): 18 | # Create a simple grayscale image 19 | image = Image.new("L", (10, 10), color=128) 20 | array = convert_image_to_array(image) 21 | 22 | assert array.shape == (10, 10, 3) 23 | assert array.dtype == np.uint8 24 | assert (array == [128, 128, 128]).all() 25 | 26 | def test_size(self): 27 | # Create a simple RGB image with different size 28 | image = Image.new("RGB", (20, 15), color="blue") 29 | array = convert_image_to_array(image) 30 | 31 | assert array.shape == (15, 20, 3) 32 | assert array.dtype == np.uint8 33 | assert (array == [0, 0, 255]).all() 34 | 35 | 36 | class TestResizeImage: 37 | def test_resize_smaller_image(self): 38 | # Create a simple RGB image smaller than max_size 39 | image = Image.new("RGB", (10, 10), color="red") 40 | max_size = (20, 20) 41 | resized_image = resize_image(image, max_size) 42 | 43 | assert resized_image.size == (10, 10) 44 | 45 | def test_resize_larger_image(self): 46 | # Create a simple RGB image larger than max_size 47 | image = Image.new("RGB", (30, 30), color="blue") 48 | max_size = (20, 20) 49 | resized_image = resize_image(image, max_size) 50 | 51 | assert resized_image.size == (20, 20) 52 | 53 | def test_resize_wider_image(self): 54 | # Create a simple RGB image wider than max_size 55 | image = Image.new("RGB", (40, 20), color="green") 56 | max_size = (20, 20) 57 | resized_image = resize_image(image, max_size) 58 | 59 | assert resized_image.size == (20, 10) 60 | 61 | def test_resize_taller_image(self): 62 | # Create a simple RGB image taller than max_size 63 | image = Image.new("RGB", (20, 40), color="yellow") 64 | max_size = (20, 20) 65 | resized_image = resize_image(image, max_size) 66 | 67 | assert resized_image.size == (10, 20) 68 | -------------------------------------------------------------------------------- /openfoodfacts/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .fold_to_ascii import fold, fold_without_insertion_deletion 4 | 5 | 6 | def strip_accents(s: str, keep_length: bool = False) -> str: 7 | """Strip accents and normalize string. 8 | 9 | :param s: the string to normalize 10 | :param keep_length: if True, no character is replaced without a 11 | subtitution of length 1: the length of the string is therefore kept 12 | unchanged. Default to False. 
13 | :return: the normalized string 14 | """ 15 | if keep_length: 16 | return fold_without_insertion_deletion(s) 17 | else: 18 | return fold(s) 19 | 20 | 21 | CONSECUTIVE_HYPHEN_REGEX = re.compile(r"-{2,}") 22 | 23 | 24 | def strip_consecutive_hyphens(text: str) -> str: 25 | """Convert a sequence of 2+ hyphens into a single hyphen.""" 26 | return CONSECUTIVE_HYPHEN_REGEX.sub("-", text) 27 | 28 | 29 | TAG_MAP_TABLE = { 30 | ord("œ"): "oe", 31 | ord(" "): "-", 32 | ord("'"): "-", 33 | ord("`"): "-", 34 | ord('"'): "-", 35 | ord("."): "-", 36 | ord("!"): "-", 37 | ord("?"): "-", 38 | ord("["): "-", 39 | ord("]"): "-", 40 | ord("("): "-", 41 | ord(")"): "-", 42 | ord("{"): "-", 43 | ord("}"): "-", 44 | ord("#"): "-", 45 | ord("$"): "-", 46 | ord("%"): "-", 47 | ord("&"): "-", 48 | ord("\\"): "-", 49 | ord("*"): "-", 50 | ord("+"): "-", 51 | ord(","): "-", 52 | ord("/"): "-", 53 | ord(";"): "-", 54 | ord("<"): "-", 55 | ord(">"): "-", 56 | ord("="): "-", 57 | ord("@"): "-", 58 | ord("^"): "-", 59 | ord("_"): "-", 60 | ord("|"): "-", 61 | ord("~"): "-", 62 | } 63 | 64 | 65 | def get_tag(text: str) -> str: 66 | """Return a tag from a text. 67 | 68 | In Open Food Facts, tags are obtained from free text by performing the 69 | following: 70 | - lowercasing 71 | - accent removal 72 | - replacement of punctuation by either a hyphen ("-") or nothing, depending 73 | on the punctuation 74 | 75 | The input text can contain a language prefix, which is kept in the output 76 | if present. The language prefix is a 2-letter code followed by a colon 77 | (e.g. "fr:"). 78 | 79 | This function is not strictly on par with the Product Opener implementation, 80 | but it should be good enough for most cases. 81 | """ 82 | text = text.lower() 83 | lang_prefix = None 84 | if len(text) >= 3 and text[2] == ":": 85 | lang_prefix = text[:2] 86 | text = text[3:] 87 | text = strip_accents(text, keep_length=True) 88 | text = text.translate(TAG_MAP_TABLE).strip("-") 89 | text = strip_consecutive_hyphens(text) 90 | if lang_prefix: 91 | text = f"{lang_prefix}:{text}" 92 | return text 93 | 94 | 95 | def replace_lang_prefix(tag: str, new_lang_prefix: str) -> str: 96 | """Replace the language prefix of a tag with a new one.""" 97 | 98 | if len(new_lang_prefix) != 2: 99 | raise ValueError( 100 | f"new_lang_prefix '{new_lang_prefix}' must be a 2-letter code." 101 | ) 102 | 103 | if len(tag) < 3 or tag[2] != ":": 104 | raise ValueError(f"tag '{tag}' has an invalid language prefix") 105 | 106 | return f"{new_lang_prefix}:{tag[3:]}" 107 | -------------------------------------------------------------------------------- /openfoodfacts/ml/triton.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import struct 3 | 4 | import grpc 5 | import numpy as np 6 | from tritonclient.grpc import service_pb2, service_pb2_grpc 7 | from tritonclient.grpc.service_pb2_grpc import GRPCInferenceServiceStub 8 | 9 | 10 | @functools.cache 11 | def get_triton_inference_stub(triton_uri: str) -> GRPCInferenceServiceStub: 12 | """Return a gRPC stub for Triton Inference Server. 13 | 14 | :param triton_uri: URI of the Triton Inference Server 15 | :return: gRPC stub for Triton Inference Server 16 | """ 17 | channel = grpc.insecure_channel(triton_uri) 18 | return service_pb2_grpc.GRPCInferenceServiceStub(channel) 19 | 20 | 21 | def deserialize_byte_tensor(data: bytes) -> list[str]: 22 | """Deserialize a byte tensor into a list of strings. 23 | 24 | This is used to deserialize string array outputs from Triton models. 
25 | """ 26 | offset = 0 27 | # 4 bytes are used to encode string length 28 | int_byte_len = 4 29 | array = [] 30 | while len(data) >= offset + int_byte_len: 31 | str_length = struct.unpack(" ObjectDetector: 18 | # Create an instance of ObjectDetector for testing 19 | label_names = ["label1", "label2"] 20 | return ObjectDetector( 21 | model_name="test_model", label_names=label_names, image_size=640 22 | ) 23 | 24 | 25 | class ResponseOutputs: 26 | def __init__(self, name): 27 | self.name = name 28 | 29 | 30 | class TestObjectDetector: 31 | def test_preprocess(self, sample_image, object_detector: ObjectDetector): 32 | image_array = object_detector.preprocess(sample_image) 33 | 34 | # Check the shape of the output image array 35 | assert image_array.shape == (1, 3, 640, 640) 36 | 37 | def test_postprocess(self, object_detector: ObjectDetector): 38 | # Mock response object 39 | response = MagicMock() 40 | response.outputs = [ResponseOutputs("output0")] 41 | response.raw_output_contents = [ 42 | np.random.rand(1, len(object_detector.label_names) + 4, 10) 43 | .astype(np.float32) 44 | .tobytes() 45 | ] 46 | 47 | threshold = 0.5 48 | result = object_detector.postprocess( 49 | response, threshold, original_shape=(200, 100) 50 | ) 51 | 52 | # Check the type of the result 53 | assert isinstance(result, ObjectDetectionRawResult) 54 | 55 | # Check the number of detections 56 | assert result.num_detections == 10 57 | 58 | # Check the shape of detection boxes 59 | assert result.detection_boxes.shape == (len(result.detection_scores), 4) 60 | 61 | # Check the length of detection classes and scores 62 | assert len(result.detection_classes) == len(result.detection_scores) 63 | 64 | def test_detect_from_image(self, sample_image, object_detector: ObjectDetector): 65 | # Mock the Triton inference stub and response 66 | grpc_stub = MagicMock() 67 | grpc_stub.ModelInfer.return_value = MagicMock() 68 | get_triton_inference_stub = MagicMock(return_value=grpc_stub) 69 | 70 | # Mock the preprocess and postprocess methods 71 | object_detector.preprocess = MagicMock(return_value=np.zeros((1, 3, 640, 640))) # type: ignore 72 | object_detector.postprocess = MagicMock( # type: ignore 73 | return_value=ObjectDetectionRawResult( 74 | num_detections=1, 75 | detection_boxes=np.zeros((1, 4)), 76 | detection_scores=np.array([0.9]), 77 | detection_classes=np.array([1]), 78 | label_names=object_detector.label_names, 79 | ) 80 | ) 81 | with patch( 82 | "openfoodfacts.ml.object_detection.get_triton_inference_stub", 83 | get_triton_inference_stub, 84 | ): 85 | # Run the detect_from_image method 86 | result = object_detector.detect_from_image( 87 | sample_image, "fake_triton_uri", threshold=0.5 88 | ) 89 | 90 | # Check that preprocess was called 91 | object_detector.preprocess.assert_called_once() 92 | assert object_detector.preprocess.call_args.kwargs == { 93 | "image_array": sample_image 94 | } 95 | 96 | # Check that get_triton_inference_stub was called 97 | get_triton_inference_stub.assert_called_once_with("fake_triton_uri") 98 | 99 | # Check that ModelInfer was called 100 | grpc_stub.ModelInfer.assert_called_once() 101 | 102 | # Check that postprocess was called 103 | object_detector.postprocess.assert_called_once() 104 | 105 | # Check the type of the result 106 | assert isinstance(result, ObjectDetectionRawResult) 107 | 108 | # Check the number of detections 109 | assert result.num_detections == 1 110 | -------------------------------------------------------------------------------- /.github/workflows/github-projects.yml: 
-------------------------------------------------------------------------------- 1 | name: Add issues to the relevant GitHub Projects project 2 | 3 | on: 4 | issues: 5 | types: 6 | - opened 7 | - labeled 8 | - edited 9 | jobs: 10 | add-to-project: 11 | name: Add issues to the relevant GitHub Projects project 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/add-to-project@main 15 | with: 16 | project-url: https://github.com/orgs/openfoodfacts/projects/11 17 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 18 | labeled: 🎨 Mockups available, 🎨 Mockup required 19 | label-operator: OR 20 | - uses: actions/add-to-project@main 21 | with: 22 | project-url: https://github.com/orgs/openfoodfacts/projects/4 # Add issue to the packaging project 23 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 24 | labeled: packaging input 25 | label-operator: OR 26 | - uses: actions/add-to-project@main 27 | with: 28 | project-url: https://github.com/orgs/openfoodfacts/projects/35 # Add issue to the a11y project 29 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 30 | labeled: accessibility 31 | label-operator: OR 32 | - uses: actions/add-to-project@main 33 | with: 34 | project-url: https://github.com/orgs/openfoodfacts/projects/132 # Add issue to the Top upvoted issues board 35 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 36 | labeled: ⭐ top issue, 👍 Top 10 Issue! 37 | label-operator: OR 38 | - uses: actions/add-to-project@main 39 | with: 40 | project-url: https://github.com/orgs/openfoodfacts/projects/57 # Add issue to the Most impactful issues board 41 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 42 | labeled: 🎯 P0, 🎯 P1 43 | label-operator: OR 44 | - uses: actions/add-to-project@main 45 | with: 46 | project-url: https://github.com/orgs/openfoodfacts/projects/43 # Add issue to the open products facts project 47 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 48 | labeled: 📸 Open Products Facts 49 | label-operator: OR 50 | - uses: actions/add-to-project@main 51 | with: 52 | project-url: https://github.com/orgs/openfoodfacts/projects/37 # Add issue to the open beauty facts project 53 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 54 | labeled: 🧴 Open Beauty Facts 55 | label-operator: OR 56 | - uses: actions/add-to-project@main 57 | with: 58 | project-url: https://github.com/orgs/openfoodfacts/projects/4 # Add issue to the packaging project 59 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 60 | labeled: 📦 Packaging 61 | label-operator: OR 62 | - uses: actions/add-to-project@main 63 | with: 64 | project-url: https://github.com/orgs/openfoodfacts/projects/25 # Add issue to the documentation project 65 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 66 | labeled: 📚 Documentation 67 | label-operator: OR 68 | - uses: actions/add-to-project@main 69 | with: 70 | project-url: https://github.com/orgs/openfoodfacts/projects/5 # Add issue to the folksonomy project 71 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 72 | labeled: 🏷️ Folksonomy Project 73 | label-operator: OR 74 | - uses: actions/add-to-project@main 75 | with: 76 | project-url: https://github.com/orgs/openfoodfacts/projects/44 # Add issue to the data quality project 77 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 78 | labeled: 🧽 Data quality 79 | label-operator: OR 80 | - uses: actions/add-to-project@main 81 | with: 82 | project-url: https://github.com/orgs/openfoodfacts/projects/82 # Add issue to the search project 83 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 84 | labeled: 🔎 Search 85 | label-operator: OR 86 | - 
uses: actions/add-to-project@main 87 | with: 88 | project-url: https://github.com/orgs/openfoodfacts/projects/41 # Add issue to the producer platform project 89 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 90 | labeled: 🏭 Producers Platform 91 | label-operator: OR 92 | - uses: actions/add-to-project@main 93 | with: 94 | project-url: https://github.com/orgs/openfoodfacts/projects/92 # Add issue to the Nutri-Score project 95 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 96 | labeled: 🚦 Nutri-Score 97 | label-operator: OR 98 | -------------------------------------------------------------------------------- /docs/handle_taxonomies.md: -------------------------------------------------------------------------------- 1 | # Handle taxonomies 2 | 3 | The Python SDK provides an easy way to access and handle the taxonomies available on Open Food Facts. 4 | 5 | Taxonomies are at the heart of Open Food Facts. They are used to structure knowledge about ingredients, categories, labels, additives, countries, brands, etc. 6 | 7 | To have a better understanding of how taxonomies work, you can read the [wiki page about taxonomies](https://wiki.openfoodfacts.org/Global_taxonomies). 8 | 9 | ## Usage 10 | 11 | ### Get information about a taxonomy item 12 | 13 | First, instantiate a Taxonomy object: 14 | 15 | ```python 16 | from openfoodfacts.taxonomy import get_taxonomy 17 | 18 | # Use the singular form of the taxonomy name 19 | taxonomy = get_taxonomy("category") 20 | print(taxonomy) 21 | # <Taxonomy> 22 | ``` 23 | 24 | The taxonomy object provides a way to access the taxonomy data. For example, if you want to get the node `en:biscuits`: 25 | 26 | ```python 27 | node = taxonomy["en:biscuits"] 28 | print(node) 29 | # <TaxonomyNode en:biscuits> 30 | ``` 31 | 32 | If the node does not exist, `None` is returned. 33 | 34 | You can get the translation in a specific language: 35 | 36 | ```python 37 | print(node.get_localized_name("it")) 38 | # Biscotti 39 | ``` 40 | 41 | Each node has one or more parents, stored in the `parents` field: 42 | 43 | ```python 44 | print(node.parents) 45 | # [<TaxonomyNode en:biscuits-and-cakes>] 46 | ``` 47 | 48 | Likewise, children can be accessed using the `children` field. 49 | 50 | 51 | To get the full parent hierarchy (that includes all parents found recursively), use the `get_parents_hierarchy` method: 52 | 53 | ```python 54 | print(node.get_parents_hierarchy()) 55 | # [<TaxonomyNode en:biscuits-and-cakes>, <TaxonomyNode en:sweet-snacks>, <TaxonomyNode en:snacks>] 56 | ``` 57 | 58 | Besides the main translation that can be accessed using `get_localized_name`, each node may have synonyms. This information can be easily accessed as well: 59 | 60 | ```python 61 | # synonyms is a dict mapping language codes to a list of 62 | # synonyms in that language. The key is missing if there are 63 | # no synonyms. 64 | print(node.synonyms["es"]) 65 | # ["Galletas", "galleta"] 66 | ``` 67 | 68 | Taxonomy node properties are stored in the `properties` field: 69 | 70 | ```python 71 | print(node.properties) 72 | # { 73 | # "wikipedia": {"en": "https://en.wikipedia.org/wiki/Biscuit"}, 74 | # "carbon_footprint_fr_foodges_ingredient": {"fr": "Biscuit au beurre"}, 75 | # "agribalyse_proxy_food_code": {"en": "24000"}, 76 | # "ciqual_proxy_food_name": { 77 | # "en": "Biscuit -cookie-", 78 | # "fr": "Biscuit sec, sans précision", 79 | # }, 80 | # "wikidata": {"en": "Q13270"}, 81 | # "ciqual_proxy_food_code": {"en": "24000"}, 82 | #} 83 | ``` 84 | 85 | ### The Taxonomy object 86 | 87 | The `Taxonomy` object is a dictionary-like object that maps node IDs to `TaxonomyNode` objects. 
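Membership tests and indexing therefore work as on a plain `dict`; here is a small sketch relying only on the behaviour described above (indexing a missing ID returns `None` instead of raising `KeyError`):

```python
# Dictionary-style access on the taxonomy
assert "en:biscuits" in taxonomy
node = taxonomy["en:biscuits"]

# Unknown IDs yield None rather than a KeyError
assert taxonomy["en:not-a-real-category"] is None
```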
88 | 89 | It also provides a way to iterate over all nodes: 90 | 91 | ```python 92 | for node in taxonomy.iter_nodes(): 93 | print(node) 94 | # <TaxonomyNode ...> 95 | # <TaxonomyNode ...> 96 | # <TaxonomyNode ...> 97 | # <TaxonomyNode ...> 98 | # <TaxonomyNode ...> 99 | # <TaxonomyNode ...> 100 | # ... 101 | ``` 102 | 103 | #### Find leaf nodes in the taxonomy 104 | 105 | One very common use case is to find the leaf nodes among a list of nodes, i.e. the nodes that have no children. 106 | For example, in Open Food Facts, the `categories_tags` field contains the categories submitted by the user and all their parents. If you're only interested in the most precise categories, you need to filter out the categories that have children: 107 | 108 | ```python 109 | # Let's say you have a product that has the following categories: 110 | categories_tags = ["en:plant-based-foods-and-beverages","en:plant-based-foods","en:breakfasts","en:cereals-and-potatoes","en:fruits-and-vegetables-based-foods","en:cereals-and-their-products","en:fruits-based-foods","en:breakfast-cereals","en:mueslis","en:cereals-with-nuts","en:crunchy-cereal-clusters","en:cereal-clusters-with-nuts"] 111 | 112 | # Convert the IDs to TaxonomyNode objects: 113 | categories_nodes = [taxonomy[tag] for tag in categories_tags if tag in taxonomy] 114 | 115 | # Let's find the leaf nodes using the find_deepest_nodes method: 116 | leaf_nodes = taxonomy.find_deepest_nodes(categories_nodes) 117 | print(leaf_nodes) 118 | # [<TaxonomyNode en:fruits-based-foods>, <TaxonomyNode en:mueslis>, <TaxonomyNode en:cereal-clusters-with-nuts>] 119 | ``` 120 | 121 | As you can see, the parent categories were removed, and only the leaf nodes remain. -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Usage Guide 2 | 3 | This guide provides information on the methods available within the Open Food Facts Python SDK. 4 | 5 | ## API 6 | 7 | The SDK can be used to access the Open Food Facts API. 8 | 9 | First, instantiate an API object: 10 | 11 | ```python 12 | from openfoodfacts import API, APIVersion, Country, Environment, Flavor 13 | 14 | api = API( 15 | user_agent="<application name>", 16 | username=None, 17 | password=None, 18 | country=Country.world, 19 | flavor=Flavor.off, 20 | version=APIVersion.v2, 21 | environment=Environment.org, 22 | ) 23 | ``` 24 | 25 | All parameters except `user_agent` are optional. Here is a description of the parameters you can tweak: 26 | 27 | - `username` and `password` are used to provide authentication (required for write requests) 28 | - `country` is used to specify the country, which is used by the API to return products specific to the country or to infer which language to use by default. `world` (all products) is the default value 29 | - `flavor`: the Open*Facts project you want to interact with: `off` (Open Food Facts, default), `obf` (Open Beauty Facts), `opff` (Open Pet Food Facts), `opf` (Open Products Facts) 30 | - `version`: API version (v2 is the default) 31 | - `environment`: either `org` for production environment (openfoodfacts.org) or `net` for staging (openfoodfacts.net) 32 | 33 | ### Get information about a product 34 | 35 | ```python 36 | code = "3017620422003" 37 | api.product.get(code) 38 | ``` 39 | 40 | ### Perform text search 41 | 42 | ```python 43 | results = api.product.text_search("pizza") 44 | ``` 45 | 46 | ### Create a new product or update an existing one 47 | 48 | ```python 49 | results = api.product.update(body) 50 | ``` 51 | 52 | where `body` is the update body: a dictionary that must contain 53 | the key "code" identifying the product that we 54 | want to update. 
55 | `body = {'code': '3850334341389', 'product_name': 'Mlinci'}`
56 |
57 | ### Perform ingredient analysis
58 |
59 | You can perform the ingredient analysis of a text in a given language using the API. Please note that ingredient analysis is costly, so prefer using the staging (`.net`) server for this operation.
60 |
61 | ```python
62 | from openfoodfacts import API, APIVersion, Environment
63 |
64 | api = API(user_agent="MyAwesomeApp/1.0",
65 |           version=APIVersion.v3,
66 |           environment=Environment.net)
67 |
68 | results = api.product.parse_ingredients("water, sugar, salt", lang="en")
69 |
70 | print(results)
71 |
72 | # [{'ciqual_food_code': '18066',
73 | #   'ecobalyse_code': 'tap-water',
74 | #   'id': 'en:water',
75 | #   'is_in_taxonomy': 1,
76 | #   'percent_estimate': 66.6666666666667,
77 | #   'percent_max': 100,
78 | #   'percent_min': 33.3333333333333,
79 | #   'text': 'water',
80 | #   'vegan': 'yes',
81 | #   'vegetarian': 'yes'},
82 | #  {'ciqual_proxy_food_code': '31016',
83 | #   'ecobalyse_code': 'sugar',
84 | #   'id': 'en:sugar',
85 | #   'is_in_taxonomy': 1,
86 | #   'percent_estimate': 16.6666666666667,
87 | #   'percent_max': 50,
88 | #   'percent_min': 0,
89 | #   'text': 'sugar',
90 | #   'vegan': 'yes',
91 | #   'vegetarian': 'yes'},
92 | #  {'ciqual_food_code': '11058',
93 | #   'id': 'en:salt',
94 | #   'is_in_taxonomy': 1,
95 | #   'percent_estimate': 16.6666666666667,
96 | #   'percent_max': 33.3333333333333,
97 | #   'percent_min': 0,
98 | #   'text': 'salt',
99 | #   'vegan': 'yes',
100 | #   'vegetarian': 'yes'}]
101 | ```
102 |
103 | ## Using the dataset
104 |
105 | If you're planning to perform data analysis on Open Food Facts, the easiest way is to download and use the Open Food Facts dataset dump. Fortunately, this can be done really easily using the SDK:
106 |
107 | ```python
108 | from openfoodfacts import ProductDataset
109 |
110 | dataset = ProductDataset(dataset_type="csv")
111 |
112 | for product in dataset:
113 |     print(product["product_name"])
114 | ```
115 |
116 | With `dataset = ProductDataset(dataset_type="csv")`, we automatically download (and cache) the food dataset. We can then iterate over it to get information about products.
117 |
118 | Two dataset types are available: `csv` and `jsonl`. The `jsonl` dataset contains all the Open Food Facts database information but takes much more storage (>5 GB), while the `csv` dataset is much lighter (~800 MB) but only contains the most important fields. The `jsonl` dataset type is used by default.
119 |
120 | You can also use `ProductDataset` to fetch the dataset of the other Open*Facts projects by selecting a different flavor:
121 |
122 | ```python
123 | from openfoodfacts import Flavor, ProductDataset
124 |
125 | dataset = ProductDataset(flavor=Flavor.obf, dataset_type="csv")
126 |
127 | for product in dataset:
128 |     print(product["product_name"])
129 | ```
130 |
131 | ## Taxonomies
132 |
133 | For a deep dive on how to handle taxonomies, check out the [dedicated page](./handle_taxonomies.md).
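As a quick taste, here is a minimal sketch combining the calls described there (taxonomy loading and translation lookup):

```python
from openfoodfacts.taxonomy import get_taxonomy

# Use the singular form of the taxonomy name
taxonomy = get_taxonomy("category")

# Look up a node and print its Italian translation
node = taxonomy["en:biscuits"]
print(node.get_localized_name("it"))
# Biscotti
```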
134 | -------------------------------------------------------------------------------- /tests/unit/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | from pathlib import Path 3 | from unittest.mock import patch 4 | 5 | import pytest 6 | import requests 7 | from PIL import Image 8 | 9 | from openfoodfacts.utils import ( 10 | AssetLoadingException, 11 | get_image_from_url, 12 | should_download_file, 13 | ) 14 | 15 | 16 | def test_get_image_from_url(requests_mock): 17 | # Test case 1: Valid image URL 18 | image_url = "https://example.com/image.jpg" 19 | 20 | f = io.BytesIO() 21 | # Create a white image file 22 | Image.new("RGB", (100, 100), "white").save(f, format="JPEG") 23 | f.seek(0) 24 | image_data = f.read() 25 | requests_mock.get(image_url, content=image_data) 26 | image = get_image_from_url(image_url) 27 | assert isinstance(image, Image.Image) 28 | 29 | struct = get_image_from_url(image_url, return_struct=True) 30 | assert struct.url == image_url 31 | assert struct.response is not None and struct.response.status_code == 200 32 | assert struct.image == image 33 | 34 | # Test case 2: Invalid image URL 35 | invalid_image_url = "https://example.com/invalid_image.jpg" 36 | requests_mock.get(invalid_image_url, content=b"invalid-image") 37 | with pytest.raises(AssetLoadingException): 38 | get_image_from_url(invalid_image_url) 39 | 40 | # Same with error_raise=False 41 | assert get_image_from_url(invalid_image_url, error_raise=False) is None 42 | 43 | # Same thing with struct 44 | struct = get_image_from_url( 45 | invalid_image_url, return_struct=True, error_raise=False 46 | ) 47 | assert struct.url == invalid_image_url 48 | assert struct.response is not None and struct.response.status_code == 200 49 | assert struct.image is None 50 | assert struct.error == "Cannot identify image https://example.com/invalid_image.jpg" 51 | 52 | # Test case 3: Image URL with connection error 53 | connection_error_url = "https://example.com/connection_error.jpg" 54 | requests_mock.get(connection_error_url, exc=requests.exceptions.ConnectionError) 55 | with pytest.raises(AssetLoadingException): 56 | get_image_from_url(connection_error_url) 57 | 58 | # Same but with error_raise=False 59 | assert get_image_from_url(connection_error_url, error_raise=False) is None 60 | 61 | # Same but with return_struct=True 62 | struct = get_image_from_url( 63 | connection_error_url, return_struct=True, error_raise=False 64 | ) 65 | assert struct.url == connection_error_url 66 | assert struct.response is None 67 | assert struct.image is None 68 | assert struct.error == "Cannot download https://example.com/connection_error.jpg" 69 | 70 | # Test case 4: Image URL with HTTP error 71 | http_error_url = "https://example.com/http_error.jpg" 72 | requests_mock.get(http_error_url, status_code=404) 73 | with pytest.raises(AssetLoadingException): 74 | get_image_from_url(http_error_url) 75 | 76 | 77 | def test_should_download_file(): 78 | url = "https://example.com/file" 79 | filepath = Path("/path/to/file") 80 | 81 | # Test case 1: File does not exist 82 | with patch.object(Path, "is_file", return_value=False): 83 | assert ( 84 | should_download_file( 85 | url, filepath, force_download=False, download_newer=False 86 | ) 87 | is True 88 | ) 89 | 90 | # Test case 2: Force download 91 | with patch.object(Path, "is_file", return_value=True): 92 | assert ( 93 | should_download_file( 94 | url, filepath, force_download=True, download_newer=False 95 | ) 96 | is True 97 | ) 98 | 99 
| # Test case 3: Download newer with same ETag 100 | with ( 101 | patch.object(Path, "is_file", return_value=True), 102 | patch("openfoodfacts.utils.get_file_etag", return_value="etag123"), 103 | patch("openfoodfacts.utils.fetch_etag", return_value="etag123"), 104 | ): 105 | assert ( 106 | should_download_file( 107 | url, filepath, force_download=False, download_newer=True 108 | ) 109 | is False 110 | ) 111 | 112 | # Test case 4: Download newer with different ETag 113 | with ( 114 | patch.object(Path, "is_file", return_value=True), 115 | patch("openfoodfacts.utils.get_file_etag", return_value="etag123"), 116 | patch("openfoodfacts.utils.fetch_etag", return_value="etag456"), 117 | ): 118 | assert ( 119 | should_download_file( 120 | url, filepath, force_download=False, download_newer=True 121 | ) 122 | is True 123 | ) 124 | 125 | # Test case 5: No force download and no download newer, the file 126 | # exists so we don't download it again 127 | with patch.object(Path, "is_file", return_value=True): 128 | assert ( 129 | should_download_file( 130 | url, filepath, force_download=False, download_newer=False 131 | ) 132 | is False 133 | ) 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open Food Facts Python SDK 2 | 3 |
4 |
5 |
6 |
7 | ## Status
8 |
9 | [![Project Status](https://opensource.box.com/badges/active.svg)](https://opensource.box.com/badges)
10 | [![Build Status](https://travis-ci.org/openfoodfacts/openfoodfacts-python.svg?branch=master)](https://travis-ci.org/openfoodfacts/openfoodfacts-python)
11 | [![codecov](https://codecov.io/gh/openfoodfacts/openfoodfacts-python/branch/master/graph/badge.svg)](https://codecov.io/gh/openfoodfacts/openfoodfacts-python)
12 | [![Latest Version](https://img.shields.io/pypi/v/openfoodfacts.svg)](https://pypi.org/project/openfoodfacts)
13 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/openfoodfacts/openfoodfacts-python/blob/master/LICENSE)
14 |
15 | ## Description
16 |
17 | This is the official Python SDK for the [Open Food Facts](https://world.openfoodfacts.org/) project.
18 | It provides a simple interface to the [Open Food Facts API](https://openfoodfacts.github.io/openfoodfacts-server/api/) and allows you to:
19 |
20 | - Get information about a product
21 | - Perform text search
22 | - Create a new product or update an existing one
23 |
24 | It also provides some helper functions to make it easier to work with Open Food Facts data and APIs, such as:
25 |
26 | - getting the translation of a taxonomized field in a given language
27 | - downloading and iterating over the Open Food Facts data dump
28 | - handling OCR results of Open Food Facts images generated by Google Cloud Vision
29 |
30 | Please note that this SDK is still in beta and the API is subject to change. Make sure to pin the version in your requirements file.
31 |
32 | ## Third party applications
33 | If you use or plan to use this SDK, make sure to read the [REUSE](https://github.com/openfoodfacts/openfoodfacts-python/blob/develop/REUSE.md) page and ensure you comply with the ODbL license, in addition to the license of this package (MIT). Make sure you at least fill the form, and feel free to open a PR to add your application to this list :-)
34 |
35 |
36 | ## Installation
37 |
38 | The easiest way to install the SDK is through pip:
39 |
40 |     pip install openfoodfacts
41 |
42 | or manually from source:
43 |
44 |     git clone https://github.com/openfoodfacts/openfoodfacts-python
45 |     cd openfoodfacts-python
46 |     pip install . # Note the “.” at the end!
47 |
48 | ## Examples
49 |
50 | All the examples below assume that you have imported the SDK and instantiated the API object:
51 |
52 | ```python
53 | import openfoodfacts
54 |
55 | # User-Agent is mandatory
56 | api = openfoodfacts.API(user_agent="MyAwesomeApp/1.0")
57 | ```
58 |
59 | *Get information about a product*
60 |
61 | ```python
62 | code = "3017620422003"
63 | api.product.get(code, fields=["code", "product_name"])
64 | # {'code': '3017620422003', 'product_name': 'Nutella'}
65 | ```
66 |
67 | *Perform text search*
68 |
69 | ```python
70 | api.product.text_search("mineral water")
71 | # {"count": 3006628, "page": 1, "page_count": 20, "page_size": 20, "products": [{...}], "skip": 0}
72 | ```
73 |
74 | *Create a new product or update an existing one*
75 |
76 | ```python
77 | results = api.product.update({
78 |     "code": CODE,
79 |     "product_name_en": "blueberry jam",
80 |     "ingredients_text_en": "blueberries, sugar, pectin, citric acid"
81 | })
82 | ```
83 |
84 | where `CODE` is the product barcode. The rest of the body is a dictionary of fields to create or update.
85 |
86 | To see all possible capabilities, check out the [usage guide](https://openfoodfacts.github.io/openfoodfacts-python/usage/).
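One more example: iterating over the whole data dump with the `ProductDataset` helper mentioned in the description. A minimal sketch (the printed fields are illustrative; see the usage guide for all options):

```python
from openfoodfacts import ProductDataset

# Downloads (and caches) the JSONL data dump on first use
dataset = ProductDataset(dataset_type="jsonl")

for product in dataset:
    # Each product is a plain dict
    print(product.get("code"), product.get("product_name"))
```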
87 |
88 |
89 |
90 | ## Contributing
91 |
92 | Any help is welcome, as long as you don't break the continuous integration.
93 | Fork the repository and open a Pull Request directly on the "develop" branch.
94 | A maintainer will review and integrate your changes.
95 |
96 | Maintainers:
97 |
98 | - [Anubhav Bhargava](https://github.com/Anubhav-Bhargava)
99 | - [Frank Rousseau](https://github.com/frankrousseau)
100 | - [Pierre Slamich](https://github.com/teolemon)
101 | - [Raphaël](https://github.com/raphael0202)
102 |
103 | Contributors:
104 |
105 | - Agamit Sudo
106 | - [Daniel Stolpe](https://github.com/numberpi)
107 | - [Enioluwa Segun](https://github.com/enioluwas)
108 | - [Nicolas Leger](https://github.com/nicolasleger)
109 | - [Pablo Hinojosa](https://github.com/Pablohn26)
110 | - [Andrea Stagi](https://github.com/astagi)
111 | - [Benoît Prieur](https://github.com/benprieur)
112 | - [Aadarsh A](https://github.com/aadarsh-ram)
113 |
114 | ## Copyright and License
115 |
116 | Copyright 2016-2024 Open Food Facts
117 |
118 | The Open Food Facts Python SDK is licensed under the [MIT License](https://github.com/openfoodfacts/openfoodfacts-python/blob/develop/LICENSE).
119 |
120 | Other files that are not part of the SDK itself may be under a different license.
121 | The project complies with the [REUSE 3.3 specification](https://reuse.software/spec-3.3/),
122 | so any such files should be marked accordingly.
123 |
--------------------------------------------------------------------------------
/openfoodfacts/dataset.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from pathlib import Path
3 | from typing import Optional
4 |
5 | from .types import DatasetType, Environment, Flavor
6 | from .utils import (
7 |     URLBuilder,
8 |     download_file,
9 |     get_logger,
10 |     get_open_fn,
11 |     jsonl_iter,
12 |     should_download_file,
13 | )
14 |
15 | logger = get_logger(__name__)
16 |
17 | # Increase the CSV field size limit to accommodate large fields.
18 | # sys.maxsize would overflow on Windows, so use the max 32-bit integer instead.
19 | csv.field_size_limit(pow(2, 31) - 1)
20 |
21 |
22 | DEFAULT_CACHE_DIR = Path("~/.cache/openfoodfacts/datasets").expanduser()
23 | DATASET_FILE_NAMES = {
24 |     Flavor.off: {
25 |         DatasetType.jsonl: "openfoodfacts-products.jsonl.gz",
26 |         DatasetType.csv: "en.openfoodfacts.org.products.csv.gz",
27 |     },
28 |     Flavor.obf: {
29 |         DatasetType.jsonl: "openbeautyfacts-products.jsonl.gz",
30 |         DatasetType.csv: "en.openbeautyfacts.org.products.csv",
31 |     },
32 |     Flavor.opff: {
33 |         DatasetType.jsonl: "openpetfoodfacts-products.jsonl.gz",
34 |         DatasetType.csv: "en.openpetfoodfacts.org.products.csv",
35 |     },
36 |     Flavor.opf: {
37 |         DatasetType.jsonl: "openproductsfacts-products.jsonl.gz",
38 |         DatasetType.csv: "en.openproductsfacts.org.products.csv",
39 |     },
40 | }
41 |
42 |
43 | def get_dataset(
44 |     flavor: Flavor = Flavor.off,
45 |     dataset_type: DatasetType = DatasetType.jsonl,
46 |     force_download: bool = False,
47 |     download_newer: bool = False,
48 |     cache_dir: Optional[Path] = None,
49 |     obsolete: bool = False,
50 | ) -> Path:
51 |     """Download (and cache) the Open Food Facts dataset.
52 |
53 |     The dataset is downloaded the first time and subsequently cached in
54 |     `~/.cache/openfoodfacts/datasets`.
55 |
56 |     :param flavor: The data source, defaults to Flavor.off
57 |     :param dataset_type: The returned format, defaults to DatasetType.jsonl
58 |     :param force_download: if True, (re)download the dataset even if it was
59 |         cached, defaults to False
60 |     :param download_newer: if True, download the dataset if a more recent
61 |         version compared to the cached version is available (based on the
62 |         file ETag). This parameter is ignored if force_download is True;
63 |         defaults to False.
64 |     :param cache_dir: the cache directory to use, defaults to
65 |         ~/.cache/openfoodfacts/datasets
66 |     :param obsolete: if True, download the obsolete dataset, defaults to False
67 |     :return: the path of the dataset
68 |     """
69 |     cache_dir = DEFAULT_CACHE_DIR if cache_dir is None else cache_dir
70 |     file_name = DATASET_FILE_NAMES[flavor][dataset_type]
71 |     if obsolete:
72 |         file_name = file_name.replace(".jsonl.gz", "_obsolete.jsonl.gz")
73 |     dataset_path = cache_dir / file_name
74 |     url = f"{URLBuilder.static(flavor, Environment.org)}/data/{file_name}"
75 |     cache_dir.mkdir(parents=True, exist_ok=True)
76 |
77 |     if not should_download_file(url, dataset_path, force_download, download_newer):
78 |         return dataset_path
79 |
80 |     logger.info("Downloading dataset, saving it in %s", dataset_path)
81 |     download_file(url, dataset_path)
82 |     return dataset_path
83 |
84 |
85 | class ProductDataset:
86 |     def __init__(
87 |         self,
88 |         flavor: Flavor = Flavor.off,
89 |         dataset_type: DatasetType = DatasetType.jsonl,
90 |         dataset_path: Optional[Path] = None,
91 |         obsolete: bool = False,
92 |         **kwargs,
93 |     ):
94 |         """A product dataset.
95 |
96 |         This class is used to iterate over the Open Food Facts dataset and
97 |         to retrieve the information about products as dicts.
98 |
99 |         If dataset_path is None (default), the dataset is downloaded and
100 |         cached in `~/.cache/openfoodfacts/datasets`.
101 |
102 |         Otherwise, the dataset is loaded from the provided path.
103 |
104 |         :param flavor: the dataset flavor to use (off, obf, opff or opf),
105 |             defaults to Flavor.off. This parameter is ignored if dataset_path
106 |             is provided.
107 |         :param dataset_type: the dataset type to use (csv or jsonl), defaults
108 |             to DatasetType.jsonl. This parameter is ignored if dataset_path is
109 |             provided.
110 |         :param dataset_path: the path of the dataset, defaults to None.
111 |         :param obsolete: if True, download the obsolete dataset, defaults to
112 |             False.
113 |         :param kwargs: additional arguments passed to `get_dataset` when
114 |             downloading the dataset
115 |         """
116 |         self.dataset_type = dataset_type
117 |
118 |         if dataset_path is not None:
119 |             self.dataset_path = dataset_path
120 |
121 |             # We infer the dataset type from the file extension
122 |             full_suffix = "".join(dataset_path.suffixes)
123 |             if full_suffix in (".jsonl.gz", ".jsonl"):
124 |                 self.dataset_type = DatasetType.jsonl
125 |             elif full_suffix in (".csv.gz", ".csv"):
126 |                 self.dataset_type = DatasetType.csv
127 |             else:
128 |                 raise ValueError(f"Unknown dataset type: {full_suffix}")
129 |         else:
130 |             self.dataset_path = get_dataset(
131 |                 flavor, dataset_type, obsolete=obsolete, **kwargs
132 |             )
133 |
134 |     def __iter__(self):
135 |         if self.dataset_type is DatasetType.jsonl:
136 |             return jsonl_iter(self.dataset_path)
137 |         else:
138 |             return self._csv_iterator()
139 |
140 |     def _csv_iterator(self):
141 |         open_fn = get_open_fn(self.dataset_path)
142 |         with open_fn(self.dataset_path, "rt", newline="") as csvfile:
143 |             reader = csv.DictReader(csvfile, delimiter="\t")
144 |             for row in reader:
145 |                 yield dict(row)
146 |
147 |     def count(self) -> int:
148 |         """Return the number of products in the dataset."""
149 |         count = 0
150 |         for _ in self:
151 |             count += 1
152 |         return count
153 |
--------------------------------------------------------------------------------
/tests/ml/test_image_classification.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock, patch
2 |
3 | import numpy as np
4 | import pytest
5 | from PIL import Image
6 |
7 | from openfoodfacts.ml.image_classification import ImageClassifier, classify_transforms
8 |
9 | class TestClassifyTransforms:
10 |     def test_rgb_image(self):
11 |         img = Image.new("RGB", (300, 300), color="red")
12 |         transformed_img = classify_transforms(img)
13 |         assert transformed_img.shape == (3, 224, 224)
14 |         assert transformed_img.dtype == np.float32
15 |
16 |     def test_non_rgb_image(self):
17 |         img = Image.new("L", (300, 300), color="red")
18 |         transformed_img = classify_transforms(img)
19 |         assert transformed_img.shape == (3, 224, 224)
20 |         assert transformed_img.dtype == np.float32
21 |
22 |     def test_custom_size(self):
23 |         img = Image.new("RGB", (300, 300), color="red")
24 |         transformed_img = classify_transforms(img, size=128)
25 |         assert transformed_img.shape == (3, 128, 128)
26 |         assert transformed_img.dtype == np.float32
27 |
28 |     def test_custom_mean_std(self):
29 |         img = Image.new("RGB", (300, 300), color="red")
30 |         mean = (0.5, 0.5, 0.5)
31 |         std = (0.5, 0.5, 0.5)
32 |         transformed_img = classify_transforms(img, mean=mean, std=std)
33 |         assert transformed_img.shape == (3, 224, 224)
34 |         assert transformed_img.dtype == np.float32
35 |
36 |     def test_custom_interpolation(self):
37 |         img = Image.new("RGB", (300, 300), color="red")
38 |         transformed_img = classify_transforms(
39 |             img, interpolation=Image.Resampling.NEAREST
40 |         )
41 |         assert transformed_img.shape == (3, 224, 224)
42 |         assert transformed_img.dtype == np.float32
43 |
44 |     def test_custom_crop_fraction(self):
45 |         img = Image.new("RGB", (300, 300), color="red")
46 |         transformed_img = classify_transforms(img, crop_fraction=0.8)
47 |         assert transformed_img.shape == (3, 224, 224)
48 |         assert transformed_img.dtype == np.float32
49 |
50 |
51 | class ResponseOutputs:
52 |     def __init__(self, name):
53 |         self.name = name
54 |
55 |
56 | class TestImageClassifier:
57 |     def test_preprocess_rgb_image(self):
58 |         img = Image.new("RGB", (300, 300), color="red")
59 |         classifier = ImageClassifier(
60 |             model_name="test_model", label_names=["label1", "label2"]
61 |         )
62 |         preprocessed_img = classifier.preprocess(img)
63 |         assert preprocessed_img.shape == (1, 3, 224, 224)
64 |         assert preprocessed_img.dtype == np.float32
65 |
66 |     def test_postprocess_single_output(self):
67 |         classifier = ImageClassifier(
68 |             model_name="test_model", label_names=["label1", "label2"]
69 |         )
70 |         response = MagicMock()
71 |         response.outputs = [ResponseOutputs(name="output0")]
72 |         response.raw_output_contents = [
73 |             np.array([0.8, 0.2], dtype=np.float32).tobytes()
74 |         ]
75 |
76 |         result = classifier.postprocess(response)
77 |         assert len(result) == 2
78 |         assert result[0][0] == "label1"
79 |         assert np.isclose(result[0][1], 0.8)
80 |         assert result[1][0] == "label2"
81 |         assert np.isclose(result[1][1], 0.2)
82 |
83 |     def test_postprocess_multiple_outputs(self):
84 |         classifier = ImageClassifier(
85 |             model_name="test_model", label_names=["label1", "label2"]
86 |         )
87 |         response = MagicMock()
88 |         response.outputs = [
89 |             ResponseOutputs(name="output0"),
90 |             ResponseOutputs(name="output1"),
91 |         ]
92 |         response.raw_output_contents = [
93 |             np.array([0.8, 0.2], dtype=np.float32).tobytes()
94 |         ]
95 |
96 |         # The call must raise: postprocess only accepts a single output
97 |         with pytest.raises(Exception, match="expected 1 output, got 2"):
98 |             classifier.postprocess(response)
99 |
100 |     def test_postprocess_multiple_raw_output_contents(self):
101 |         classifier = ImageClassifier(
102 |             model_name="test_model", label_names=["label1", "label2"]
103 |         )
104 |         response = MagicMock()
105 |         response.outputs = [ResponseOutputs(name="output0")]
106 |         response.raw_output_contents = [
107 |             np.array([0.8, 0.2], dtype=np.float32).tobytes(),
108 |             np.array([0.1, 0.9], dtype=np.float32).tobytes(),
109 |         ]
110 |
111 |         # The call must raise: postprocess only accepts a single raw output
112 |         with pytest.raises(Exception, match="expected 1 raw output content, got 2"):
113 |             classifier.postprocess(response)
114 |
115 |     def test_predict(self):
116 |         img = Image.new("RGB", (300, 300), color="red")
117 |         classifier = ImageClassifier(
118 |             model_name="test_model", label_names=["label1", "label2"]
119 |         )
120 |         triton_uri = "fake_triton_uri"
121 |
122 |         # Mock the preprocess method
123 |         classifier.preprocess = MagicMock(
124 |             return_value=np.random.rand(1, 3, 224, 224).astype(np.float32)
125 |         )
126 |
127 |         # Mock the Triton inference stub and response
128 |         grpc_stub = MagicMock()
129 |         response = MagicMock()
130 |         response.outputs = [ResponseOutputs(name="output0")]
131 |         response.raw_output_contents = [
132 |             np.array([0.8, 0.2], dtype=np.float32).tobytes()
133 |         ]
134 |         grpc_stub.ModelInfer = MagicMock(return_value=response)
135 |
136 |         # Mock the get_triton_inference_stub function
137 |         get_triton_inference_stub = MagicMock(return_value=grpc_stub)
138 |
139 |         with patch(
140 |             "openfoodfacts.ml.image_classification.get_triton_inference_stub",
141 |             get_triton_inference_stub,
142 |         ):
143 |             result = classifier.predict(img, triton_uri)
144 |
145 |         assert len(result) == 2
146 |         assert result[0][0] == "label1"
147 |         assert np.isclose(result[0][1], 0.8)
148 |         assert result[1][0] == "label2"
149 |         assert np.isclose(result[1][1], 0.2)
150 |
151 |         classifier.preprocess.assert_called_once_with(img)
152 |         grpc_stub.ModelInfer.assert_called_once()
153 |         get_triton_inference_stub.assert_called_once_with(triton_uri)
--------------------------------------------------------------------------------
/openfoodfacts/ml/image_classification.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import math
3 | import time
4 | import typing
5 | from typing import Optional
6 |
7 | import numpy as np
8 | from PIL import Image, ImageOps
9 | from tritonclient.grpc import service_pb2
10 |
11 | from openfoodfacts.ml.triton import (
12 |     add_triton_infer_input_tensor,
13 |     get_triton_inference_stub,
14 | )
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | def classify_transforms(
20 |     img: Image.Image,
21 |     size: int = 224,
22 |     mean: tuple[float, float, float] = (0.0, 0.0, 0.0),
23 |     std: tuple[float, float, float] = (1.0, 1.0, 1.0),
24 |     interpolation: Image.Resampling = Image.Resampling.BILINEAR,
25 |     crop_fraction: float = 1.0,
26 | ) -> np.ndarray:
27 |     """
28 |     Applies a series of image transformations including resizing, center
29 |     cropping, normalization, and conversion to a NumPy array.
30 |
31 |     The transformation steps are based on the ones used in the Ultralytics library:
32 |     https://github.com/ultralytics/ultralytics/blob/main/ultralytics/data/augment.py#L2319
33 |
34 |     :param img: Input Pillow image.
35 |     :param size: The target size for the transformed image (shortest edge).
36 |     :param mean: Mean values for each RGB channel used in normalization.
37 |     :param std: Standard deviation values for each RGB channel used in
38 |         normalization.
39 |     :param interpolation: Interpolation method from PIL (
40 |         Image.Resampling.NEAREST, Image.Resampling.BILINEAR,
41 |         Image.Resampling.BICUBIC).
42 |     :param crop_fraction: Fraction of the image to be cropped.
43 |     :return: The transformed image as a NumPy array.
44 |     """
45 |     if img.mode != "RGB":
46 |         img = img.convert("RGB")
47 |
48 |     # Rotate the image based on the EXIF orientation if needed
49 |     img = typing.cast(Image.Image, ImageOps.exif_transpose(img))
50 |
51 |     # Step 1: Resize while preserving the aspect ratio
52 |     width, height = img.size
53 |
54 |     # Calculate scale size while preserving aspect ratio
55 |     scale_size = math.floor(size / crop_fraction)
56 |
57 |     aspect_ratio = width / height
58 |     if width < height:
59 |         new_width = scale_size
60 |         new_height = int(new_width / aspect_ratio)
61 |     else:
62 |         new_height = scale_size
63 |         new_width = int(new_height * aspect_ratio)
64 |
65 |     img = img.resize((new_width, new_height), interpolation)
66 |
67 |     # Step 2: Center crop
68 |     left = (new_width - size) // 2
69 |     top = (new_height - size) // 2
70 |     right = left + size
71 |     bottom = top + size
72 |     img = img.crop((left, top, right, bottom))
73 |
74 |     # Step 3: Convert the image to a NumPy array and scale pixel values to
75 |     # [0, 1]
76 |     img_array = np.array(img).astype(np.float32) / 255.0
77 |
78 |     # Step 4: Normalize the image
79 |     mean_np = np.array(mean, dtype=np.float32).reshape(1, 1, 3)
80 |     std_np = np.array(std, dtype=np.float32).reshape(1, 1, 3)
81 |     img_array = (img_array - mean_np) / std_np
82 |
83 |     # Step 5: Change the order of dimensions from (H, W, C) to (C, H, W)
84 |     img_array = np.transpose(img_array, (2, 0, 1))
85 |     return img_array
86 |
87 |
88 | class ImageClassifier:
89 |     def __init__(self, model_name: str, label_names: list[str], image_size: int = 224):
90 |         """An image classifier based on Yolo models.
91 |
92 |         We support models trained with Yolov8, v9, v10 and v11.
93 |
94 |         :param model_name: the name of the model, as registered in Triton
95 |         :param label_names: the list of label names
96 |         :param image_size: the size of the input image for the model
97 |         """
98 |         self.model_name: str = model_name
99 |         self.label_names = label_names
100 |         self.image_size = image_size
101 |
102 |     def predict(
103 |         self,
104 |         image: Image.Image,
105 |         triton_uri: str,
106 |         model_version: Optional[str] = None,
107 |     ) -> list[tuple[str, float]]:
108 |         """Run an image classification model on an image.
109 |
110 |         The model is expected to have been trained with the Ultralytics
111 |         library (Yolov8).
112 |
113 |         :param image: the input Pillow image
114 |         :param triton_uri: URI of the Triton Inference Server
115 |         :param model_version: the version of the model to use, defaults to None
116 |         :return: the prediction results as a list of tuples (label, confidence)
117 |         """
118 |         image_array = self.preprocess(image)
119 |
120 |         grpc_stub = get_triton_inference_stub(triton_uri)
121 |         request = service_pb2.ModelInferRequest()
122 |         request.model_name = self.model_name
123 |         if model_version:
124 |             request.model_version = model_version
125 |         add_triton_infer_input_tensor(
126 |             request, name="images", data=image_array, datatype="FP32"
127 |         )
128 |         start_time = time.monotonic()
129 |         response = grpc_stub.ModelInfer(request)
130 |         latency = time.monotonic() - start_time
131 |         logger.debug("Inference time for %s: %s", self.model_name, latency)
132 |
133 |         start_time = time.monotonic()
134 |         result = self.postprocess(response)
135 |         latency = time.monotonic() - start_time
136 |         logger.debug("Post-processing time for %s: %s", self.model_name, latency)
137 |         return result
138 |
139 |     def preprocess(self, image: Image.Image) -> np.ndarray:
140 |         """Preprocess an image for image classification.
141 |
142 |         :param image: the input Pillow image
143 |         :return: the preprocessed image as a NumPy array
144 |         """
145 |         image_array = classify_transforms(image, size=self.image_size)
146 |         return np.expand_dims(image_array, axis=0)
147 |
148 |     def postprocess(
149 |         self, response: service_pb2.ModelInferResponse
150 |     ) -> list[tuple[str, float]]:
151 |         """Postprocess the inference result.
152 |
153 |         :param response: the inference response
154 |         """
155 |         if len(response.outputs) != 1:
156 |             raise Exception(f"expected 1 output, got {len(response.outputs)}")
157 |
158 |         if len(response.raw_output_contents) != 1:
159 |             raise Exception(
160 |                 f"expected 1 raw output content, got {len(response.raw_output_contents)}"
161 |             )
162 |
163 |         output_index = {output.name: i for i, output in enumerate(response.outputs)}
164 |         output = np.frombuffer(
165 |             response.raw_output_contents[output_index["output0"]],
166 |             dtype=np.float32,
167 |         ).reshape((1, len(self.label_names)))[0]
168 |
169 |         score_indices = np.argsort(-output)
170 |         return [(self.label_names[i], float(output[i])) for i in score_indices]
171 |
--------------------------------------------------------------------------------
/LICENSES/CC0-1.0.txt:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 |     CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 |     LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 |     ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 |     INFORMATION ON AN "AS-IS" BASIS.
CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /tests/unit/test_taxonomy.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | from openfoodfacts.taxonomy import ( 6 | Taxonomy, 7 | TaxonomyNode, 8 | create_brand_taxonomy_mapping, 9 | create_taxonomy_mapping, 10 | get_taxonomy, 11 | map_to_canonical_id, 12 | ) 13 | 14 | label_taxonomy = get_taxonomy("label") 15 | category_taxonomy = get_taxonomy("category") 16 | 17 | 18 | def test_map_to_canonical_id(): 19 | taxonomy_mapping = { 20 | "en:apple": "en:apples", 21 | "en:apples": "en:apples", 22 | "fr:pomme": "en:apples", 23 | "fr:noix-d-isere": "en:nuts-from-isere", 24 | "xx:provence-alpes-cote-d-azur": "en:provence-alpes-cote-d-azur", 25 | "xx:sashimi": "xx:sashimi", 26 | } 27 | values = [ 28 | "en: Apple", 29 | "en: apples", 30 | "fr: Pomme", 31 | "fr: Bananes d'Isère", 32 | "fr: Noix d'Isère", 33 | "fr: Provence-Alpes-Côte d'Azur", 34 | "pt: Provence-Alpes-Côte d'Azur", 35 | "it: sashimi", 36 | ] 37 | expected = { 38 | "en: Apple": "en:apples", 39 | "en: apples": "en:apples", 40 | "fr: Pomme": "en:apples", 41 | "fr: Bananes d'Isère": "fr:bananes-d-isere", 42 | "fr: Noix d'Isère": "en:nuts-from-isere", 43 | "fr: Provence-Alpes-Côte d'Azur": "en:provence-alpes-cote-d-azur", 44 | "pt: Provence-Alpes-Côte d'Azur": "en:provence-alpes-cote-d-azur", 45 | "it: sashimi": "xx:sashimi", 46 | } 47 | assert map_to_canonical_id(taxonomy_mapping, values) == expected 48 | 49 | 50 | def test_map_to_canonical_id_invalid_value(): 51 | taxonomy_mapping = { 52 | "en:apple": "en:apples", 53 | "en:apples": "en:apples", 54 | "fr:pomme": "en:apples", 55 | "fr:noix-d-isere": "en:nuts-from-isere", 56 | } 57 | values = ["en: Apple", "apple"] 58 | 59 | with pytest.raises( 60 | ValueError, 61 | match=re.escape( 62 | "Invalid value: 'apple', expected value to be in 'lang:tag' format" 63 | ), 64 | ): 65 | map_to_canonical_id(taxonomy_mapping, values) 66 | 67 | 68 | class TestCreateTaxonomyMapping: 69 | def test_basic(self): 70 | taxonomy = Taxonomy() 71 | node1 = TaxonomyNode( 72 | identifier="en:apples", 73 | names={"en": "Apple", "fr": "Pomme"}, 74 | synonyms={"en": ["Apples"], "fr": ["Pommes"]}, 75 | ) 76 | node2 = TaxonomyNode( 77 | identifier="en:nuts-from-isere", 78 | names={"fr": "Noix d'Isère"}, 79 | synonyms={"fr": ["Noix d'Isère"]}, 80 | ) 81 | node3 = TaxonomyNode( 82 | identifier="xx:sashimi", 83 | names={"xx": "Sashimi"}, 84 | synonyms={"xx": ["Sashimi"]}, 85 | ) 86 | taxonomy.add(node1.id, node1) 87 | taxonomy.add(node2.id, node2) 88 | taxonomy.add(node3.id, node3) 89 | 90 | expected_mapping = { 91 | "en:apple": "en:apples", 92 | "fr:pomme": "en:apples", 93 | "en:apples": "en:apples", 94 | "fr:pommes": "en:apples", 95 | "fr:noix-d-isere": "en:nuts-from-isere", 96 | "xx:sashimi": "xx:sashimi", 97 | } 98 | 99 | assert create_taxonomy_mapping(taxonomy) == expected_mapping 100 | 101 | def 
test_empty(self):
102 |         taxonomy = Taxonomy()
103 |         expected_mapping = {}
104 |         assert create_taxonomy_mapping(taxonomy) == expected_mapping
105 |
106 |     def test_no_synonyms(self):
107 |         taxonomy = Taxonomy()
108 |         node = TaxonomyNode(
109 |             identifier="en:bananas",
110 |             names={"en": "Banana", "fr": "Banane"},
111 |             synonyms={},
112 |         )
113 |         taxonomy.add(node.id, node)
114 |
115 |         expected_mapping = {
116 |             "en:banana": "en:bananas",
117 |             "fr:banane": "en:bananas",
118 |         }
119 |
120 |         assert create_taxonomy_mapping(taxonomy) == expected_mapping
121 |
122 |     def test_multiple_languages_with_different_synonyms(self):
123 |         taxonomy = Taxonomy()
124 |         node = TaxonomyNode(
125 |             identifier="en:grapes",
126 |             names={"en": "Grape", "fr": "Raisin", "es": "Uva"},
127 |             synonyms={
128 |                 "en": ["Grapes"],
129 |                 "fr": ["Raisins", "Raisins d'automne"],
130 |                 "es": ["Uvas"],
131 |             },
132 |         )
133 |         taxonomy.add(node.id, node)
134 |
135 |         expected_mapping = {
136 |             "en:grape": "en:grapes",
137 |             "fr:raisin": "en:grapes",
138 |             "fr:raisins-d-automne": "en:grapes",
139 |             "es:uva": "en:grapes",
140 |             "en:grapes": "en:grapes",
141 |             "fr:raisins": "en:grapes",
142 |             "es:uvas": "en:grapes",
143 |         }
144 |
145 |         assert create_taxonomy_mapping(taxonomy) == expected_mapping
146 |
147 |     def test_create_brand_taxonomy_mapping(self):
148 |         taxonomy = Taxonomy.from_dict(
149 |             {
150 |                 "en:5th-season": {"name": {"en": "5th Season"}},
151 |                 "en:arev": {"name": {"en": "Arèv"}},
152 |                 "en:arrighi": {"name": {"en": "Arrighi"}},
153 |                 "en:voiles-au-vent": {"name": {"en": "Voiles au Vent"}},
154 |                 "xx:turini": {"name": {"xx": "Turini"}},
155 |                 "fr:auchan": {"name": {"xx": "Auchan"}},
156 |                 "fr:mamouth": {"name": {"fr": "Mamouth"}},
157 |                 "fr:carefour": {"name": {}},
158 |             }
159 |         )
160 |         assert create_brand_taxonomy_mapping(taxonomy) == {
161 |             "5th-season": "5th Season",
162 |             "arev": "Arèv",
163 |             "arrighi": "Arrighi",
164 |             "voiles-au-vent": "Voiles au Vent",
165 |             "turini": "Turini",
166 |             "auchan": "Auchan",
167 |             "mamouth": "Mamouth",
168 |             "carefour": "carefour",
169 |         }
170 |
171 |
172 | class TestTaxonomy:
173 |     @pytest.mark.parametrize(
174 |         "taxonomy,item,candidates,output",
175 |         [
176 |             (label_taxonomy, "en:organic", {"en:fr-bio-01"}, True),
177 |             (label_taxonomy, "en:fr-bio-01", {"en:organic"}, False),
178 |             (label_taxonomy, "en:fr-bio-01", [], False),
179 |             (label_taxonomy, "en:organic", {"en:gluten-free"}, False),
180 |             (
181 |                 label_taxonomy,
182 |                 "en:organic",
183 |                 {"en:gluten-free", "en:no-additives", "en:vegan"},
184 |                 False,
185 |             ),
186 |             (
187 |                 label_taxonomy,
188 |                 "en:organic",
189 |                 {"en:gluten-free", "en:no-additives", "en:fr-bio-16"},
190 |                 True,
191 |             ),
192 |         ],
193 |     )
194 |     def test_is_parent_of_any(
195 |         self, taxonomy: Taxonomy, item: str, candidates: list, output: bool
196 |     ):
197 |         assert taxonomy.is_parent_of_any(item, candidates) is output
198 |
199 |     def test_is_parent_of_any_unknown_item(self):
200 |         with pytest.raises(ValueError):
201 |             label_taxonomy.is_parent_of_any("unknown-id", set())
202 |
203 |     @pytest.mark.parametrize(
204 |         "taxonomy,item,output",
205 |         [
206 |             (category_taxonomy, "en:plant-based-foods-and-beverages", set()),
207 |             (
208 |                 category_taxonomy,
209 |                 "en:plant-based-foods",
210 |                 {"en:plant-based-foods-and-beverages"},
211 |             ),
212 |             (
213 |                 category_taxonomy,
214 |                 "en:brown-rices",
215 |                 {
216 |                     "en:rices",
217 |                     "en:cereal-grains",
218 |                     "en:cereals-and-their-products",
219 |                     "en:cereals-and-potatoes",
220 |                     "en:plant-based-foods",
221 |
"en:plant-based-foods-and-beverages", 222 | "en:seeds", 223 | }, 224 | ), 225 | ], 226 | ) 227 | def test_get_parents_hierarchy( 228 | self, taxonomy: Taxonomy, item: str, output: set[str] 229 | ): 230 | node = taxonomy[item] 231 | parents = node.get_parents_hierarchy() 232 | assert set((x.id for x in parents)) == output 233 | 234 | @pytest.mark.parametrize( 235 | "taxonomy,items,output", 236 | [ 237 | (category_taxonomy, [], []), 238 | (category_taxonomy, ["en:brown-rices"], ["en:brown-rices"]), 239 | (category_taxonomy, ["en:brown-rices", "en:rices"], ["en:brown-rices"]), 240 | ( 241 | category_taxonomy, 242 | ["en:brown-rices", "en:rices", "en:cereal-grains"], 243 | ["en:brown-rices"], 244 | ), 245 | ( 246 | category_taxonomy, 247 | ["en:brown-rices", "en:teas", "en:cereal-grains"], 248 | ["en:brown-rices", "en:teas"], 249 | ), 250 | ], 251 | ) 252 | def test_find_deepest_nodes( 253 | self, taxonomy: Taxonomy, items: list[str], output: list[str] 254 | ): 255 | item_nodes = [taxonomy[item] for item in items] 256 | output_nodes = [taxonomy[o] for o in output] 257 | assert taxonomy.find_deepest_nodes(item_nodes) == output_nodes 258 | -------------------------------------------------------------------------------- /tests/unit/test_redis.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | from typing import Optional, cast 4 | 5 | import pytest 6 | from redis import Redis 7 | 8 | from openfoodfacts.redis import ( 9 | ProductUpdateEvent, 10 | get_new_updates_multistream, 11 | get_processed_since, 12 | ) 13 | 14 | 15 | class TestRedisUpdate: 16 | @pytest.mark.parametrize( 17 | "diffs, expected", 18 | [ 19 | ({"uploaded_images": {"add": ["2"]}}, True), 20 | ({"fields": {"change": {"product_type": "food"}}}, False), 21 | (None, False), 22 | ], 23 | ) 24 | def test_is_image_upload(self, diffs, expected): 25 | update = ProductUpdateEvent( 26 | id="1629878400000-0", 27 | stream="product_updates", 28 | timestamp=1629878400000, 29 | code="1", 30 | flavor="off", 31 | user_id="user1", 32 | action="updated", 33 | comment="comment", 34 | product_type="food", 35 | diffs=json.dumps(diffs) if diffs is not None else None, 36 | ) 37 | assert update.is_image_upload() is expected 38 | 39 | @pytest.mark.parametrize( 40 | "diffs, expected", 41 | [ 42 | ({"fields": {"change": ["product_type"]}}, True), 43 | ({"fields": {"change": ["countries", "product_type"]}}, True), 44 | ({"fields": {"change": ["countries"]}}, False), 45 | (None, False), 46 | ], 47 | ) 48 | def test_is_product_type_change(self, diffs, expected): 49 | update = ProductUpdateEvent( 50 | id="1629878400000-0", 51 | stream="product_updates", 52 | timestamp=1629878400000, 53 | code="1", 54 | flavor="off", 55 | user_id="user1", 56 | action="updated", 57 | comment="comment", 58 | product_type="food", 59 | diffs=json.dumps(diffs) if diffs is not None else None, 60 | ) 61 | assert update.is_product_type_change() is expected 62 | 63 | @pytest.mark.parametrize( 64 | "diffs, field_name, expected", 65 | [ 66 | ( 67 | {"fields": {"change": ["product_name", "quantity"]}}, 68 | "product_name", 69 | True, 70 | ), 71 | ({"fields": {"change": ["product_name", "quantity"]}}, "countries", False), 72 | ({"uploaded_images": {"add": ["4"]}}, "countries", False), 73 | ({}, "countries", False), 74 | (None, "product_name", False), 75 | ], 76 | ) 77 | def test_is_field_updated(self, diffs, field_name, expected): 78 | update = ProductUpdateEvent( 79 | id="1629878400000-0", 80 | stream="product_updates", 
81 | timestamp=1629878400000, 82 | code="1", 83 | flavor="off", 84 | user_id="user1", 85 | action="updated", 86 | comment="comment", 87 | product_type="food", 88 | diffs=json.dumps(diffs) if diffs is not None else None, 89 | ) 90 | assert update.is_field_updated(field_name) is expected 91 | 92 | @pytest.mark.parametrize( 93 | "diffs, field_name, expected", 94 | [ 95 | ( 96 | {"fields": {"add": ["product_name", "quantity"]}}, 97 | "product_name", 98 | True, 99 | ), 100 | ({"fields": {"add": ["product_name"]}}, "countries", False), 101 | ({"uploaded_images": {"add": ["4"]}}, "countries", False), 102 | ({}, "countries", False), 103 | (None, "product_name", False), 104 | ], 105 | ) 106 | def test_is_field_added(self, diffs, field_name, expected): 107 | update = ProductUpdateEvent( 108 | id="1629878400000-0", 109 | stream="product_updates", 110 | timestamp=1629878400000, 111 | code="1", 112 | flavor="off", 113 | user_id="user1", 114 | action="updated", 115 | comment="comment", 116 | product_type="food", 117 | diffs=json.dumps(diffs) if diffs is not None else None, 118 | ) 119 | assert update.is_field_added(field_name) is expected 120 | 121 | @pytest.mark.parametrize( 122 | "diffs, field_name, expected", 123 | [ 124 | ( 125 | {"fields": {"change": ["product_name", "quantity"]}}, 126 | "product_name", 127 | True, 128 | ), 129 | ({"fields": {"change": ["product_name", "quantity"]}}, "countries", False), 130 | ( 131 | {"fields": {"add": ["product_name", "quantity"]}}, 132 | "product_name", 133 | True, 134 | ), 135 | ({"fields": {"add": ["product_name"]}}, "countries", False), 136 | ({"uploaded_images": {"add": ["4"]}}, "countries", False), 137 | ({}, "countries", False), 138 | (None, "product_name", False), 139 | ], 140 | ) 141 | def test_is_field_added_or_updated(self, diffs, field_name, expected): 142 | update = ProductUpdateEvent( 143 | id="1629878400000-0", 144 | stream="product_updates", 145 | timestamp=1629878400000, 146 | code="1", 147 | flavor="off", 148 | user_id="user1", 149 | action="updated", 150 | comment="comment", 151 | product_type="food", 152 | diffs=json.dumps(diffs) if diffs is not None else None, 153 | ) 154 | assert update.is_field_added_or_updated(field_name) is expected 155 | 156 | @pytest.mark.parametrize( 157 | "diffs, expected", 158 | [ 159 | ( 160 | { 161 | "selected_images": {"delete": ["front_de"]}, 162 | "uploaded_images": {"delete": ["1"]}, 163 | }, 164 | True, 165 | ), 166 | ({"fields": {"add": ["product_name"]}}, False), 167 | ({"uploaded_images": {"add": ["4"]}}, False), 168 | ({}, False), 169 | (None, False), 170 | ], 171 | ) 172 | def test_is_image_deletion(self, diffs, expected): 173 | update = ProductUpdateEvent( 174 | id="1629878400000-0", 175 | stream="product_updates", 176 | timestamp=1629878400000, 177 | code="1", 178 | flavor="off", 179 | user_id="user1", 180 | action="updated", 181 | comment="comment", 182 | product_type="food", 183 | diffs=json.dumps(diffs), 184 | ) 185 | assert update.is_image_deletion() is expected 186 | 187 | 188 | class RedisXrangeClient: 189 | def __init__(self, xrange_return_values: list): 190 | self.xrange_return_values = xrange_return_values 191 | self.call_count = 0 192 | 193 | def xrange( 194 | self, name: str, min: str = "-", max: str = "+", count: Optional[int] = None 195 | ): 196 | assert name in ("product_updates", "ocr_ready") 197 | assert max == "+" 198 | assert count == 100 199 | if self.call_count >= len(self.xrange_return_values): 200 | return [] 201 | self.call_count += 1 202 | return 
self.xrange_return_values[self.call_count - 1] 203 | 204 | 205 | def test_get_processed_since(): 206 | stream_name = "product_updates" 207 | base_values = { 208 | "flavor": "off", 209 | "user_id": "user1", 210 | "action": "updated", 211 | "comment": "comment", 212 | "product_type": "food", 213 | } 214 | return_values = [ 215 | [ 216 | ("1629878400000-0", {"code": "2", **base_values}), 217 | ("1629878400001-0", {"code": "3", **base_values}), 218 | ] 219 | ] 220 | redis_client = cast(Redis, RedisXrangeClient(return_values)) 221 | # Wed Aug 25 08:00:00 2021 UTC 222 | start_timestamp_ms = 1629878400000 # Example start timestamp 223 | # Call the function and iterate over the results 224 | results = list( 225 | get_processed_since( 226 | redis_client, 227 | min_id=start_timestamp_ms, 228 | ) 229 | ) 230 | 231 | # Assertions 232 | assert len(results) == 2 233 | assert results[0] == ProductUpdateEvent( 234 | id="1629878400000-0", 235 | stream=stream_name, 236 | timestamp=1629878400000, 237 | code="2", 238 | **base_values, 239 | ) 240 | assert results[1] == ProductUpdateEvent( 241 | id="1629878400001-0", 242 | stream=stream_name, 243 | timestamp=1629878400001, 244 | code="3", 245 | **base_values, 246 | ) 247 | 248 | 249 | class RedisXreadClient: 250 | def __init__(self, xread_return_values: list): 251 | self.xread_return_values = xread_return_values 252 | self.call_count = 0 253 | 254 | def xread(self, streams: dict, block: int, count: Optional[int] = None): 255 | assert set(streams.keys()) == {"product_updates", "ocr_ready"} 256 | assert block == 0 257 | assert count == 100 258 | if self.call_count >= len(self.xread_return_values): 259 | raise ValueError("No more values") 260 | self.call_count += 1 261 | return self.xread_return_values[self.call_count - 1] 262 | 263 | 264 | def test_get_new_updates_multistream(): 265 | product_updates_stream_name = "product_updates" 266 | ocr_ready_stream_name = "ocr_ready" 267 | base_values_product_updates = { 268 | "flavor": "off", 269 | "user_id": "user1", 270 | "action": "updated", 271 | "comment": "comment", 272 | "product_type": "beauty", 273 | } 274 | ocr_ready_event = { 275 | "product_type": "beauty", 276 | "code": "3215495849204", 277 | "image_id": "2", 278 | "json_url": "https://images.openfoodfacts.org/images/products/321/549/584/9204/2.json", 279 | } 280 | return_values = [ 281 | [ 282 | ( 283 | product_updates_stream_name, 284 | [("1629878400000-0", {"code": "4", **base_values_product_updates})], 285 | ), 286 | ], 287 | [ 288 | ( 289 | ocr_ready_stream_name, 290 | [("1629878400001-0", ocr_ready_event)], 291 | ), 292 | ], 293 | [ 294 | ( 295 | product_updates_stream_name, 296 | [("1629878400002-0", {"code": "1", **base_values_product_updates})], 297 | ) 298 | ], 299 | [ 300 | ( 301 | product_updates_stream_name, 302 | [("1629878400003-0", {"code": "2", **base_values_product_updates})], 303 | ) 304 | ], 305 | [ 306 | ( 307 | product_updates_stream_name, 308 | [("1629878400004-0", {"code": "3", **base_values_product_updates})], 309 | ) 310 | ], 311 | ] 312 | redis_client = cast(Redis, RedisXreadClient(return_values)) 313 | 314 | # Call the function and iterate over the results 315 | updates_iter = get_new_updates_multistream(redis_client) 316 | 317 | product_update_result = next(updates_iter) 318 | assert product_update_result == ProductUpdateEvent( 319 | id="1629878400000-0", 320 | stream=product_updates_stream_name, 321 | timestamp=1629878400000, 322 | code="4", 323 | **base_values_product_updates, 324 | ) 325 | 326 | ocr_ready_result = 
next(updates_iter)
327 |     assert ocr_ready_result.id == "1629878400001-0"
328 |     assert ocr_ready_result.stream == ocr_ready_stream_name
329 |     assert ocr_ready_result.timestamp == datetime.datetime.fromtimestamp(
330 |         1629878400.001, tz=datetime.timezone.utc
331 |     )
332 |     assert ocr_ready_result.code == ocr_ready_event["code"]
333 |     assert ocr_ready_result.product_type == ocr_ready_event["product_type"]
334 |     assert ocr_ready_result.image_id == ocr_ready_event["image_id"]
335 |     assert ocr_ready_result.json_url == ocr_ready_event["json_url"]
336 |
--------------------------------------------------------------------------------
/openfoodfacts/images.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from typing import List, Optional, Tuple, Union
4 | from urllib.parse import urlparse
5 |
6 | import requests
7 |
8 | from openfoodfacts.types import Environment, Flavor, JSONType
9 | from openfoodfacts.utils import ImageDownloadItem, URLBuilder, get_image_from_url
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | # Base URL of the public Open Food Facts S3 bucket
15 | AWS_S3_BASE_URL = "https://openfoodfacts-images.s3.eu-west-3.amazonaws.com/data"
16 |
17 |
18 | _pillow_available = True
19 | try:
20 |     from PIL import Image
21 | except ImportError:
22 |     _pillow_available = False
23 |
24 |
25 | def split_barcode(barcode: str) -> List[str]:
26 |     """Split a barcode in the same way as Product Opener does to generate a
27 |     product image folder.
28 |
29 |     :param barcode: The barcode of the product. For the pro platform only,
30 |         it must be prefixed with the org ID using the format
31 |         `{ORG_ID}/{BARCODE}`
32 |     :raises ValueError: if `barcode` is invalid
33 |     :return: a list containing the split barcode parts
34 |     """
35 |     org_id = None
36 |     if "/" in barcode:
37 |         # For the pro platform, `barcode` is expected to be in the format
38 |         # `{ORG_ID}/{BARCODE}` (ex: `org-lea-nature/3307130803004`)
39 |         org_id, barcode = barcode.split("/", maxsplit=1)
40 |
41 |     if not barcode.isdigit():
42 |         raise ValueError(f"unknown barcode format: {barcode}")
43 |
44 |     # Strip leading zeros, then pad the barcode with zeros up to 13 digits
45 |     barcode = barcode.lstrip("0").zfill(13)
46 |     # Split the first 9 digits of the barcode into 3 groups of 3 digits to
47 |     # get the first 3 folder names and use the rest of the barcode as the
48 |     # last folder name
49 |     splits = [barcode[0:3], barcode[3:6], barcode[6:9], barcode[9:]]
50 |
51 |     if org_id is not None:
52 |         # For the pro platform only, images and OCRs belonging to an org
53 |         # are stored in a folder named after the org for all its products, ex:
54 |         # https://images.pro.openfoodfacts.org/images/products/org-lea-nature/330/713/080/3004/1.jpg
55 |         splits.insert(0, org_id)
56 |
57 |     return splits
58 |
59 |
60 | def _generate_file_path(code: str, image_id: str, suffix: str):
61 |     barcode_parts = split_barcode(code)
62 |     return f"/{'/'.join(barcode_parts)}/{image_id}{suffix}"
63 |
64 |
65 | def generate_image_path(code: str, image_id: str) -> str:
66 |     """Generate an image path.
67 |
68 |     It's used to generate a unique identifier of an image for a product (and
69 |     to generate a URL to fetch this image from the server).
70 |
71 |     :param code: the product barcode
72 |     :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...)
73 |     :return: the full image path
74 |     """
75 |     return _generate_file_path(code, image_id, ".jpg")
76 |
77 |
78 | def generate_json_ocr_path(code: str, image_id: str) -> str:
79 |     """Generate a JSON OCR path.
80 |
81 |     It's used to generate a unique identifier of an OCR result for a product
82 |     (and to generate a URL to fetch this OCR JSON from the server).
83 |
84 |     :param code: the product barcode
85 |     :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...)
86 |     :return: the full OCR JSON path
87 |     """
88 |     return _generate_file_path(code, image_id, ".json")
89 |
90 |
91 | def generate_json_ocr_url(
92 |     code: str,
93 |     image_id: str,
94 |     flavor: Flavor = Flavor.off,
95 |     environment: Environment = Environment.org,
96 | ) -> str:
97 |     """Generate the OCR JSON URL for a specific product and
98 |     image ID.
99 |
100 |     :param code: the product barcode
101 |     :param image_id: the image ID (ex: `1`, `2`,...)
102 |     :param flavor: the project to use, defaults to Flavor.off
103 |     :param environment: the environment (prod/staging), defaults to
104 |         Environment.org
105 |     :return: the generated JSON URL
106 |     """
107 |     return URLBuilder.image_url(
108 |         flavor, environment, generate_json_ocr_path(code, image_id)
109 |     )
110 |
111 |
112 | def generate_image_url(
113 |     code: str,
114 |     image_id: str,
115 |     flavor: Flavor = Flavor.off,
116 |     environment: Environment = Environment.org,
117 | ) -> str:
118 |     """Generate the image URL for a specific product and
119 |     image ID.
120 |
121 |     :param code: the product barcode
122 |     :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...)
123 |     :param flavor: the project to use, defaults to Flavor.off
124 |     :param environment: the environment (prod/staging), defaults to
125 |         Environment.org
126 |     :return: the generated image URL
127 |     """
128 |     return URLBuilder.image_url(
129 |         flavor, environment, generate_image_path(code, image_id)
130 |     )
131 |
132 |
133 | def extract_barcode_from_url(url: str) -> Optional[str]:
134 |     """Extract a product barcode from an image/OCR URL.
135 |
136 |     :param url: the URL
137 |     :return: the extracted barcode, or None if no barcode was found
138 |     """
139 |     url_path = urlparse(url).path
140 |     return extract_barcode_from_path(url_path)
141 |
142 |
143 | def extract_barcode_from_path(path: str) -> Optional[str]:
144 |     """Extract a product barcode from an image/OCR path.
145 |
146 |     The barcode is normalized using the following rules:
147 |
148 |     - all leading zeros are stripped
149 |     - if the barcode is less than 8 digits, it is left-padded with zeros up to
150 |       8 digits
151 |     - if the barcode is more than 8 digits but less than 13 digits, it is
152 |       left-padded with zeros up to 13 digits
153 |     - if the barcode has 13 digits or more, it's returned as is
154 |     """
155 |     barcode = ""
156 |
157 |     for parent in Path(path).parents:
158 |         if parent.name.isdigit():
159 |             barcode = parent.name + barcode
160 |         else:
161 |             break
162 |
163 |     # Strip leading zeros
164 |     barcode = barcode.lstrip("0")
165 |
166 |     if not barcode:
167 |         return None
168 |
169 |     if len(barcode) <= 8:
170 |         barcode = barcode.zfill(8)
171 |         return barcode
172 |
173 |     barcode = barcode.zfill(13)
174 |     return barcode
175 |
176 |
177 | def extract_source_from_url(url: str) -> str:
178 |     """Extract source image from an image or OCR URL.
179 |
180 |     The source image is a unique identifier of the image or OCR,
181 |     and is the full path of the image or OCR file on the server
182 |     (ex: `/008/009/637/2472/1.jpg`).
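    For example:

        >>> extract_source_from_url(
        ...     "https://images.openfoodfacts.org/images/products/008/009/637/2472/1.json"
        ... )
        '/008/009/637/2472/1.jpg'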
183 |
184 |     :param url: the URL
185 |     :return: the source image
186 |     """
187 |     url_path = urlparse(url).path
188 |
189 |     if url_path.startswith("/images/products"):
190 |         url_path = url_path[len("/images/products") :]
191 |
192 |     if url_path.endswith(".json"):
193 |         url_path = str(Path(url_path).with_suffix(".jpg"))
194 |
195 |     # normalize windows path to unix path
196 |     return url_path.replace("\\", "/")
197 |
198 |
199 | def download_image(
200 |     image: Union[str, Tuple[str, str]],
201 |     use_cache: bool = True,
202 |     error_raise: bool = True,
203 |     session: Optional[requests.Session] = None,
204 |     return_struct: bool = False,
205 | ) -> Union[None, "Image.Image", ImageDownloadItem]:
206 |     """Download an Open Food Facts image.
207 |
208 |     :param image: the image URL or a tuple containing the barcode and the
209 |         image ID
210 |     :param use_cache: whether to use the S3 dataset cache, defaults to True
211 |     :param error_raise: whether to raise an error if the download fails,
212 |         defaults to True
213 |     :param session: the requests session to use, defaults to None
214 |     :param return_struct: if True, return an `ImageDownloadItem` object
215 |         containing the image, image bytes and the response object.
216 |     :return: the downloaded image, or an `ImageDownloadItem` object if
217 |         `return_struct` is True.
218 |
219 |     >>> download_image("https://images.openfoodfacts.org/images/products/324/227/210/2359/4.jpg") # noqa
220 |
221 |
222 |     >>> download_image(("3242272102359", "4"))
223 |
224 |     """
225 |     if not _pillow_available:
226 |         raise ImportError("Pillow is required to use this function")
227 |
228 |     if isinstance(image, str):
229 |         if use_cache:
230 |             image_path = extract_source_from_url(image)
231 |             image_url = f"{AWS_S3_BASE_URL}{image_path}"
232 |
233 |             if requests.head(image_url).status_code != 200:
234 |                 logger.debug(f"Image not found in cache: {image_url}")
235 |                 image_url = image
236 |         else:
237 |             image_url = image
238 |
239 |     if isinstance(image, tuple):
240 |         if use_cache:
241 |             image_path = generate_image_path(*image)
242 |             image_url = f"{AWS_S3_BASE_URL}{image_path}"
243 |
244 |             if requests.head(image_url).status_code != 200:
245 |                 logger.debug(f"Image not found in cache: {image_url}")
246 |                 image_url = generate_image_url(*image)
247 |         else:
248 |             image_url = generate_image_url(*image)
249 |
250 |     logger.debug(f"Downloading image from {image_url}")
251 |     return get_image_from_url(
252 |         image_url,
253 |         error_raise=error_raise,
254 |         session=session,
255 |         return_struct=return_struct,
256 |     )
257 |
258 |
259 | def convert_to_legacy_schema(images: JSONType) -> JSONType:
260 |     """Convert the images dictionary to the legacy schema.
261 |
262 |     We've improved the schema of the `images` field, but the new
263 |     schema is not compatible with the legacy schema. This function
264 |     converts the new schema to the legacy schema.
265 |
266 |     It can be used while migrating the existing Python codebase to the
267 |     new schema.
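    A minimal conversion sketch (the uploader name is illustrative; the two
    schemas are described below):

        >>> convert_to_legacy_schema({
        ...     "uploaded": {"1": {"uploaded_t": 1620000000, "uploader": "alice",
        ...                        "sizes": {"full": {"h": 400, "w": 248, "url": "..."}}}},
        ... })
        {'1': {'sizes': {'full': {'h': 400, 'w': 248}}, 'uploaded_t': 1620000000, 'uploader': 'alice'}}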
268 | 269 | The new `images` schema is the following: 270 | 271 | - the `images` field contains the uploaded images under the `uploaded` 272 | key and the selected images under the `selected` key 273 | - `uploaded` contains the images that are uploaded, and maps the 274 | image ID to the detail about the image: 275 | - `uploaded_t`: the upload timestamp 276 | - `uploader`: the username of the uploader 277 | - `sizes`: dictionary mapping image size (`100`, `200`, `400`, `full`) 278 | to the information about each resized image: 279 | - `h`: the height of the image 280 | - `w`: the width of the image 281 | - `url`: the URL of the image 282 | - `selected` contains the images that are selected, and maps the 283 | image key (`nutrition`, `ingredients`, `packaging`, or `front`) to 284 | a dictionary mapping the language to the selected image details. 285 | The selected image details are the following fields: 286 | - `imgid`: the image ID 287 | - `rev`: the revision ID 288 | - `sizes`: dictionary mapping image size (`100`, `200`, `400`, `full`) 289 | to the information about each resized image: 290 | - `h`: the height of the image 291 | - `w`: the width of the image 292 | - `url`: the URL of the image 293 | - `generation`: information about how to generate the selected image 294 | from the uploaded image: 295 | - `geometry` 296 | - `x1`, `y1`, `x2`, `y2`: the coordinates of the crop 297 | - `angle`: the rotation angle of the selected image 298 | - `coordinates_image_size`: 400 or "full", indicates if the 299 | geometry coordinates are relative to the full image, or to a 300 | resized version (max width and max height=400) 301 | - `normalize`: indicates if colors should be normalized 302 | - `white_magic`: indicates if the background is white and should 303 | be removed (e.g. photo on a white sheet of paper) 304 | 305 | See https://github.com/openfoodfacts/openfoodfacts-server/pull/11818 306 | for more details. 307 | """ 308 | 309 | if not is_new_image_schema(images): 310 | return images 311 | 312 | images_with_legacy_schema = {} 313 | 314 | for image_id, image_data in images.get("uploaded", {}).items(): 315 | images_with_legacy_schema[image_id] = { 316 | "sizes": { 317 | # remove URL field 318 | size: {k: v for k, v in image_size_data.items() if k != "url"} 319 | for size, image_size_data in image_data["sizes"].items() 320 | }, 321 | "uploaded_t": image_data["uploaded_t"], 322 | "uploader": image_data["uploader"], 323 | } 324 | 325 | for selected_key, image_by_lang in images.get("selected", {}).items(): 326 | for lang, image_data in image_by_lang.items(): 327 | new_image_data = { 328 | "imgid": image_data["imgid"], 329 | "rev": image_data["rev"], 330 | "sizes": { 331 | # remove URL field 332 | size: {k: v for k, v in image_size_data.items() if k != "url"} 333 | for size, image_size_data in image_data["sizes"].items() 334 | }, 335 | **(image_data.get("generation", {})), 336 | } 337 | images_with_legacy_schema[f"{selected_key}_{lang}"] = new_image_data 338 | 339 | return images_with_legacy_schema 340 | 341 | 342 | def is_new_image_schema(images_data: JSONType) -> bool: 343 | """Return True if the `images` dictionary follows the new Product Opener 344 | images schema. 345 | 346 | See https://github.com/openfoodfacts/openfoodfacts-server/pull/11818 for 347 | more information about this new schema. 
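    For example:

        >>> is_new_image_schema({"uploaded": {}, "selected": {}})
        True
        >>> is_new_image_schema({"1": {"sizes": {}}})
        False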
348 |     """
349 |     if not images_data:
350 |         return False
351 |
352 |     return "selected" in images_data or "uploaded" in images_data
353 |
--------------------------------------------------------------------------------
/openfoodfacts/redis.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | from typing import Any, Iterator, cast
4 |
5 | from pydantic import BaseModel, Json
6 | from redis import Redis
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | def get_redis_client(**kwargs) -> Redis:
12 |     return Redis(
13 |         decode_responses=True,
14 |         **kwargs,
15 |     )
16 |
17 |
18 | class ProductUpdateEvent(BaseModel):
19 |     """A class representing a product update from a Redis Stream."""
20 |
21 |     # The Redis ID of the event
22 |     id: str
23 |     # The name of the Redis stream where the update was published
24 |     # This will always be "product_updates"
25 |     stream: str
26 |     # The timestamp of the event
27 |     timestamp: datetime.datetime
28 |     # The code of the product
29 |     code: str
30 |     # The flavor of the product (off, obf, opff, off_pro)
31 |     flavor: str
32 |     # The user ID of the user who performed the action
33 |     user_id: str
34 |     # The action performed by the user (either updated or deleted)
35 |     action: str
36 |     # A comment provided by the user
37 |     comment: str
38 |     # The type of the product (food, product, petfood, beauty)
39 |     product_type: str
40 |     # A JSON object representing the differences between the old and new
41 |     # product data
42 |     diffs: Json[Any] | None = None
43 |
44 |     def is_image_upload(self) -> bool:
45 |         """Returns True if the update is an image upload."""
46 |         return bool(
47 |             self.diffs is not None
48 |             and "uploaded_images" in self.diffs
49 |             and "add" in self.diffs["uploaded_images"]
50 |         )
51 |
52 |     def is_product_type_change(self) -> bool:
53 |         """Returns True if the update contains a product type change (example:
54 |         switch from `food` to `beauty`)."""
55 |         return bool(
56 |             self.diffs is not None
57 |             and "fields" in self.diffs
58 |             and "change" in self.diffs["fields"]
59 |             and "product_type" in self.diffs["fields"]["change"]
60 |         )
61 |
62 |     def is_field_updated(self, field_name: str) -> bool:
63 |         """Returns True if the update contains a change in the specified
64 |         field."""
65 |         return (
66 |             self.diffs is not None
67 |             and "fields" in self.diffs
68 |             and "change" in self.diffs["fields"]
69 |             and field_name in self.diffs["fields"]["change"]
70 |         )
71 |
72 |     def is_field_added(self, field_name: str) -> bool:
73 |         """Returns True if the update contains an addition of the specified
74 |         field."""
75 |         return (
76 |             self.diffs is not None
77 |             and "fields" in self.diffs
78 |             and "add" in self.diffs["fields"]
79 |             and field_name in self.diffs["fields"]["add"]
80 |         )
81 |
82 |     def is_field_added_or_updated(self, field_name: str) -> bool:
83 |         """Returns True if the specified field was added or updated in this
84 |         update."""
85 |         return self.is_field_updated(field_name) or self.is_field_added(field_name)
86 |
87 |     def is_image_deletion(self) -> bool:
88 |         """Returns True if the event is an image deletion."""
89 |         return (
90 |             self.diffs is not None
91 |             and "uploaded_images" in self.diffs
92 |             and "delete" in self.diffs["uploaded_images"]
93 |         )
94 |
95 |
96 | class OCRReadyEvent(BaseModel):
97 |     """A class representing an OCR ready event from a Redis Stream.
98 |
99 |     This event is published when the OCR processing (done by Google Cloud
100 |     Vision) of an image is complete.
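    An event can be built from the raw stream payload as follows (values
    taken from the test suite; see the field list below):

        >>> OCRReadyEvent(  # doctest: +SKIP
        ...     id="1629878400001-0",
        ...     stream="ocr_ready",
        ...     timestamp=1629878400,
        ...     code="3215495849204",
        ...     product_type="beauty",
        ...     image_id="2",
        ...     json_url="https://images.openfoodfacts.org/images/products/321/549/584/9204/2.json",
        ... )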
101 |
102 |     The OCR result (JSON file) is available at the URL provided in the
103 |     `json_url` field.
104 |     """
105 |
106 |     # The Redis ID of the event
107 |     id: str
108 |     # The name of the Redis stream where the event was published
109 |     # This will always be "ocr_ready"
110 |     stream: str
111 |     # The timestamp of the event
112 |     timestamp: datetime.datetime
113 |     # The code of the product
114 |     code: str
115 |     # The type of the product (food, product, petfood, beauty)
116 |     product_type: str
117 |     # The ID of the image (ex: "1")
118 |     image_id: str
119 |     # The URL of the OCR result (JSON file)
120 |     json_url: str
121 |
122 |
123 | def get_processed_since(
124 |     redis_client: Redis,
125 |     min_id: str | datetime.datetime,
126 |     product_updates_stream_name: str = "product_updates",
127 |     ocr_ready_stream_name: str = "ocr_ready",
128 |     batch_size: int = 100,
129 | ) -> Iterator[ProductUpdateEvent | OCRReadyEvent]:
130 |     """Fetches all events (product update or OCR ready events) that have been
131 |     published since the given timestamp.
132 |
133 |     :param redis_client: the Redis client
134 |     :param min_id: the minimum ID to start from, or a datetime object
135 |     :param product_updates_stream_name: the name of the Redis stream for
136 |         product updates, defaults to "product_updates"
137 |     :param ocr_ready_stream_name: the name of the Redis stream for OCR ready
138 |         events, defaults to "ocr_ready"
139 |     :param batch_size: the size of the batch to fetch, defaults to 100
140 |     :yield: a ProductUpdateEvent or OCRReadyEvent instance for each update
141 |     """
142 |     if isinstance(min_id, datetime.datetime):
143 |         min_id = f"{int(min_id.timestamp() * 1000)}-0"
144 |
145 |     for stream_name in (
146 |         product_updates_stream_name,
147 |         ocr_ready_stream_name,
148 |     ):
149 |         stream_min_id = min_id  # each stream starts from the caller's min_id
150 |         while True:
151 |             logger.debug(
152 |                 "Fetching batch from Redis, stream %s, min_id %s, count %d",
153 |                 stream_name,
154 |                 stream_min_id,
155 |                 batch_size,
156 |             )
157 |             batch = redis_client.xrange(stream_name, min=stream_min_id, count=batch_size)
158 |             if not batch:
159 |                 break  # we reached the end of the stream
160 |
161 |             batch = cast(list[tuple[str, dict]], batch)
162 |             # Move the cursor past the last ID of the batch ("(" = exclusive)
163 |             stream_min_id = f"({batch[-1][0]}"
164 |             for timestamp_id, item in batch:
165 |                 # Get the timestamp from the ID
166 |                 timestamp = int(timestamp_id.split("-")[0])
167 |
168 |                 if stream_name == ocr_ready_stream_name:
169 |                     yield OCRReadyEvent(
170 |                         id=timestamp_id,
171 |                         timestamp=timestamp,  # type: ignore
172 |                         stream=stream_name,
173 |                         code=item["code"],
174 |                         product_type=item["product_type"],
175 |                         image_id=item["image_id"],
176 |                         json_url=item["json_url"],
177 |                     )
178 |                 else:
179 |                     yield ProductUpdateEvent(
180 |                         id=timestamp_id,
181 |                         timestamp=timestamp,  # type: ignore
182 |                         stream=stream_name,
183 |                         code=item["code"],
184 |                         flavor=item["flavor"],
185 |                         user_id=item["user_id"],
186 |                         action=item["action"],
187 |                         comment=item["comment"],
188 |                         product_type=item["product_type"],
189 |                         diffs=item.get("diffs"),
190 |                     )
191 |
192 |
193 | def get_new_updates_multistream(
194 |     redis_client: Redis,
195 |     product_updates_stream_name: str = "product_updates",
196 |     ocr_ready_stream_name: str = "ocr_ready",
197 |     min_id: str | datetime.datetime | None = "$",
198 |     batch_size: int = 100,
199 | ) -> Iterator[ProductUpdateEvent | OCRReadyEvent]:
200 |     """Reads new updates from Redis Stream, starting from the moment this
201 |     function is called.
202 |
203 |     The function will block until new updates are available.
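    Minimal consumer sketch (host and port are illustrative assumptions):

        >>> client = get_redis_client(host="localhost", port=6379)  # doctest: +SKIP
        >>> for event in get_new_updates_multistream(client):  # doctest: +SKIP
        ...     print(type(event).__name__, event.id)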
204 | 205 | :param redis_client: the Redis client. 206 | :param product_updates_stream_name: the name of the Redis stream for 207 | product updates, defaults to "product_updates". 208 | :param ocr_ready_stream_name: the name of the Redis stream for OCR ready 209 | events, defaults to "ocr_ready". 210 | :param min_id: the minimum ID to start from, defaults to "$". 211 | :param batch_size: the size of the batch to fetch, defaults to 100. 212 | :yield: a ProductUpdateEvent or OCRReadyEvent instance for each update. 213 | """ 214 | if min_id is None: 215 | min_id = "$" 216 | elif isinstance(min_id, datetime.datetime): 217 | min_id = f"{int(min_id.timestamp() * 1000)}-0" 218 | 219 | stream_names = [product_updates_stream_name, ocr_ready_stream_name] 220 | # We start from the last ID 221 | min_ids: dict[bytes | str | memoryview, int | bytes | str | memoryview] = { 222 | stream_name: min_id for stream_name in stream_names 223 | } 224 | while True: 225 | logger.debug( 226 | "Listening to new updates from streams %s (ID: %s)", stream_names, min_ids 227 | ) 228 | # We use block=0 to wait indefinitely for new updates 229 | response = redis_client.xread(streams=min_ids, block=0, count=batch_size) 230 | response = cast(list[tuple[str, list[tuple[str, dict]]]], response) 231 | # The response is a list of tuples (stream_name, batch) 232 | 233 | for stream_name, batch in response: 234 | # We update the min_id to the last ID of the batch 235 | new_min_id = batch[-1][0] 236 | min_ids[stream_name] = new_min_id 237 | for timestamp_id, item in batch: 238 | # Get the timestamp from the ID 239 | timestamp = int(timestamp_id.split("-")[0]) 240 | 241 | if stream_name == ocr_ready_stream_name: 242 | yield OCRReadyEvent( 243 | id=timestamp_id, 244 | stream=stream_name, 245 | timestamp=timestamp, # type: ignore 246 | code=item["code"], 247 | product_type=item["product_type"], 248 | image_id=item["image_id"], 249 | json_url=item["json_url"], 250 | ) 251 | else: 252 | yield ProductUpdateEvent( 253 | id=timestamp_id, 254 | stream=stream_name, 255 | timestamp=timestamp, # type: ignore 256 | code=item["code"], 257 | flavor=item["flavor"], 258 | user_id=item["user_id"], 259 | action=item["action"], 260 | comment=item["comment"], 261 | product_type=item["product_type"], 262 | diffs=item.get("diffs"), 263 | ) 264 | 265 | 266 | class UpdateListener: 267 | """A class representing a daemon that listens to events from a Redis 268 | stream and processes them. 269 | 270 | The class is meant to be subclassed to implement the processing logic. 271 | Subclasses can implement the `process_redis_update` and 272 | `process_ocr_ready` methods. 273 | """ 274 | 275 | def __init__( 276 | self, 277 | redis_client: Redis, 278 | redis_latest_id_key: str, 279 | product_updates_stream_name: str = "product_updates", 280 | ocr_ready_stream_name: str = "ocr_ready", 281 | ): 282 | self.redis_client = redis_client 283 | self.product_updates_stream_name = product_updates_stream_name 284 | self.ocr_ready_stream_name = ocr_ready_stream_name 285 | self.redis_latest_id_key = redis_latest_id_key 286 | 287 | def run(self): 288 | """Run the update import daemon. 289 | 290 | This daemon listens to the Redis stream containing information about 291 | product updates or OCR ready events, and processes them as they 292 | arrive. 
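        A minimal subclass sketch (the Redis key name is an illustrative
        assumption):

            class PrintListener(UpdateListener):
                def process_redis_update(self, event: ProductUpdateEvent):
                    print("product update:", event.code)

                def process_ocr_ready(self, event: OCRReadyEvent):
                    print("OCR ready:", event.json_url)

            PrintListener(get_redis_client(), "myapp:latest_processed_id").run()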
293 |         """
294 |         logger.info("Starting update listener daemon")
295 |
296 |         logger.info("Redis client: %s", self.redis_client)
297 |         logger.info("Pinging client...")
298 |         self.redis_client.ping()
299 |         logger.info("Connection successful")
300 |
301 |         latest_id = self.redis_client.get(self.redis_latest_id_key)
302 |
303 |         if latest_id:
304 |             logger.info(
305 |                 "Latest ID processed: %s (datetime: %s)",
306 |                 latest_id,
307 |                 datetime.datetime.fromtimestamp(int(latest_id.split("-")[0]) / 1000),
308 |             )
309 |         else:
310 |             logger.info("No latest ID found")
311 |
312 |         for event in get_new_updates_multistream(
313 |             self.redis_client,
314 |             min_id=latest_id,
315 |         ):
316 |             try:
317 |                 if isinstance(event, OCRReadyEvent):
318 |                     self.process_ocr_ready(event)
319 |                 else:
320 |                     self.process_redis_update(event)
321 |             except Exception as e:
322 |                 logger.exception(e)
323 |             self.redis_client.set(self.redis_latest_id_key, event.id)
324 |
325 |     def process_updates_since(
326 |         self, since: datetime.datetime, to: datetime.datetime | None = None
327 |     ):
328 |         """Process all the updates since the given timestamp.
329 |
330 |         :param since: the timestamp to start from
331 |         :param to: the timestamp to stop at, defaults to None
332 |             (in which case all updates published after `since` are
333 |             processed)
334 |         """
335 |         logger.info("Redis client: %s", self.redis_client)
336 |         logger.info("Pinging client...")
337 |         self.redis_client.ping()
338 |
339 |         processed = 0
340 |         for event in get_processed_since(
341 |             self.redis_client,
342 |             min_id=since,
343 |         ):
344 |             if to is not None and event.timestamp > to:
345 |                 break
346 |             if isinstance(event, OCRReadyEvent):
347 |                 self.process_ocr_ready(event)
348 |             else:
349 |                 self.process_redis_update(event)
350 |
351 |             processed += 1
352 |
353 |         logger.info("Processed %d events", processed)
354 |
355 |     def process_redis_update(self, event: ProductUpdateEvent):
356 |         pass
357 |
358 |     def process_ocr_ready(self, event: OCRReadyEvent):
359 |         pass
--------------------------------------------------------------------------------
/openfoodfacts/utils/__init__.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import gzip
3 | import json
4 | import logging
5 | import random
6 | import shutil
7 | import string
8 | import time
9 | from io import BytesIO
10 | from pathlib import Path
11 | from time import perf_counter
12 | from typing import Callable, Dict, Iterable, List, Optional, Union
13 |
14 | import requests
15 | import tqdm
16 |
17 | from ..types import COUNTRY_CODE_TO_NAME, Country, Environment, Flavor
18 |
19 | _orjson_available = True
20 | try:
21 |     import orjson
22 | except ImportError:
23 |     _orjson_available = False
24 |
25 | _pillow_available = True
26 | try:
27 |     import PIL
28 |     from PIL import Image
29 | except ImportError:
30 |     _pillow_available = False
31 |
32 | http_session = requests.Session()
33 | http_session.headers.update({"User-Agent": "openfoodfacts-python"})
34 |
35 |
36 | def configure_root_logger(
37 |     logger: logging.Logger,
38 |     level: int = logging.INFO,
39 |     formatter_string: Optional[str] = None,
40 | ):
41 |     logger.setLevel(level)
42 |     handler = logging.StreamHandler()
43 |
44 |     if formatter_string is None:
45 |         formatter_string = "%(asctime)s :: %(levelname)s :: %(message)s"
46 |
47 |     formatter = logging.Formatter(formatter_string)
48 |     handler.setFormatter(formatter)
49 |     handler.setLevel(level)
50 |     logger.addHandler(handler)
51 |     return logger
52 |
53 |
54 | def get_logger(name=None, level: int = logging.INFO) -> logging.Logger:
55 |     logger = logging.getLogger(name)
56 |     logger.setLevel(level)
57 |
58 |     if name is None:
59 |         configure_root_logger(logger, level)
60 |
61 |     return logger
62 |
63 |
64 | logger = get_logger(__name__)
65 |
66 |
67 | class URLBuilder:
68 |     """URLBuilder generates URLs for Product Opener/Robotoff.
69 |
70 |     Example usage: URLBuilder.robotoff(Environment.org) returns the Robotoff URL.
71 |     """
72 |
73 |     @staticmethod
74 |     def _get_url(
75 |         base_domain: str,
76 |         prefix: Optional[str] = "world",
77 |         tld: str = "org",
78 |         scheme: Optional[str] = None,
79 |     ):
80 |         data = {
81 |             "domain": f"{base_domain}.{tld}",
82 |             "scheme": "https",
83 |         }
84 |         if prefix:
85 |             data["prefix"] = prefix
86 |         if scheme:
87 |             data["scheme"] = scheme
88 |
89 |         if "prefix" in data:
90 |             return "%(scheme)s://%(prefix)s.%(domain)s" % data
91 |
92 |         return "%(scheme)s://%(domain)s" % data
93 |
94 |     @staticmethod
95 |     def world(flavor: Flavor, environment: Environment):
96 |         return URLBuilder._get_url(
97 |             prefix="world", tld=environment.value, base_domain=flavor.get_base_domain()
98 |         )
99 |
100 |     @staticmethod
101 |     def robotoff(environment: Environment) -> str:
102 |         return URLBuilder._get_url(
103 |             prefix="robotoff",
104 |             tld=environment.value,
105 |             base_domain=Flavor.off.get_base_domain(),
106 |         )
107 |
108 |     @staticmethod
109 |     def static(flavor: Flavor, environment: Environment) -> str:
110 |         return URLBuilder._get_url(
111 |             prefix="static", tld=environment.value, base_domain=flavor.get_base_domain()
112 |         )
113 |
114 |     @staticmethod
115 |     def image_url(flavor: Flavor, environment: Environment, image_path: str) -> str:
116 |         prefix = URLBuilder._get_url(
117 |             prefix="images", tld=environment.value, base_domain=flavor.get_base_domain()
118 |         )
119 |         return prefix + f"/images/products{image_path}"
120 |
121 |     @staticmethod
122 |     def country(flavor: Flavor, environment: Environment, country_code: str) -> str:
123 |         return URLBuilder._get_url(
124 |             prefix=country_code,
125 |             tld=environment.value,
126 |             base_domain=flavor.get_base_domain(),
127 |         )
128 |
129 |
130 | def jsonl_iter(jsonl_path: Union[str, Path]) -> Iterable[Dict]:
131 |     """Iterate over elements of a JSONL file.
132 |
133 |     :param jsonl_path: the path of the JSONL file. Both plain (.jsonl) and
134 |         gzipped (jsonl.gz) files are supported.
135 |     :yield: dict contained in the JSONL file
136 |     """
137 |     open_fn = get_open_fn(jsonl_path)
138 |
139 |     with open_fn(str(jsonl_path), "rt", encoding="utf-8") as f:
140 |         yield from jsonl_iter_fp(f)
141 |
142 |
143 | def get_open_fn(filepath: Union[str, Path]) -> Callable:
144 |     filepath = str(filepath)
145 |     if filepath.endswith(".gz"):
146 |         return gzip.open
147 |     else:
148 |         return open
149 |
150 |
151 | def jsonl_iter_fp(fp) -> Iterable[Dict]:
152 |     for line in fp:
153 |         line = line.strip("\n")
154 |         if line:
155 |             if _orjson_available:
156 |                 yield orjson.loads(line)
157 |             else:
158 |                 yield json.loads(line)
159 |
160 |
161 | def load_json(filepath: Union[str, Path]) -> Union[Dict, List]:
162 |     """Load a JSON file; gzipped JSON files are also supported.
163 |
164 |     :param filepath: the path of the file
165 |     """
166 |     open_fn = get_open_fn(filepath)
167 |     with open_fn(filepath, "rb") as f:
168 |         if _orjson_available:
169 |             return orjson.loads(f.read())
170 |         else:
171 |             return json.loads(f.read().decode("utf-8"))
172 |
173 |
174 | def _sanitize_file_path(file_path: Path, suffix: str = "") -> Path:
175 |     """An internal function to normalize cached filenames.
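    For example (POSIX path repr):

        >>> _sanitize_file_path(Path("/tmp/products.jsonl.gz"), ".json")
        PosixPath('/tmp/products_jsonl_gz.json')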
176 |
177 |     :param file_path: the cached file path
178 |     :param suffix: an optional filename suffix to add
179 |     :return: a sanitized filepath
180 |     """
181 |     return file_path.with_name(file_path.name.replace(".", "_") + suffix)
182 |
183 |
184 | def download_file(url: str, output_path: Path):
185 |     """Download a dataset file and store it in `output_path`.
186 |
187 |     The file metadata (`etag`, `url`, `created_at`) is stored in a JSON
188 |     file whose name is derived from `output_path`.
189 |     :param url: the file URL
190 |     :param output_path: the file output path
191 |     """
192 |     r = http_session.get(url, stream=True)
193 |     etag = r.headers.get("ETag", "").strip("'\"")
194 |
195 |     # add a random string to the output path to avoid concurrent writes
196 |     suffix = "".join(random.choices(string.ascii_letters, k=8))
197 |     tmp_output_path = output_path.with_name(output_path.name + f"-{suffix}.part")
198 |     with (
199 |         tmp_output_path.open("wb") as f,
200 |         tqdm.tqdm(
201 |             unit="B",
202 |             unit_scale=True,
203 |             unit_divisor=1024,
204 |             miniters=1,
205 |             desc=str(output_path),
206 |             total=int(r.headers.get("content-length", 0)),
207 |         ) as pbar,
208 |     ):
209 |         for chunk in r.iter_content(chunk_size=4096):
210 |             f.write(chunk)
211 |             pbar.update(len(chunk))
212 |
213 |     shutil.move(tmp_output_path, output_path)
214 |
215 |     _sanitize_file_path(output_path, ".json").write_text(
216 |         json.dumps(
217 |             {
218 |                 "etag": etag,
219 |                 "created_at": int(time.time()),
220 |                 "url": url,
221 |             }
222 |         )
223 |     )
224 |
225 |
226 | def get_file_etag(dataset_path: Path) -> Optional[str]:
227 |     """Return a dataset Etag.
228 |
229 |     :param dataset_path: the path of the dataset
230 |     :return: the file Etag
231 |     """
232 |     metadata_path = _sanitize_file_path(dataset_path, ".json")
233 |
234 |     if metadata_path.is_file():
235 |         return json.loads(metadata_path.read_text())["etag"]
236 |
237 |     return None
238 |
239 |
240 | def fetch_etag(url: str) -> str:
241 |     """Get the Etag of a remote file.
242 |
243 |     :param url: the file URL
244 |     :return: the Etag
245 |     """
246 |     r = http_session.head(url)
247 |     return r.headers.get("ETag", "").strip("'\"")
248 |
249 |
250 | def should_download_file(
251 |     url: str, filepath: Path, force_download: bool, download_newer: bool
252 | ) -> bool:
253 |     """Return True if the file located at `url` should be downloaded again.
254 |
255 |     :param url: the file URL
256 |     :param filepath: the file cached location
257 |     :param force_download: if True, (re)download the file even if it was
258 |         cached, defaults to False
259 |     :param download_newer: if True, download the dataset if a more recent
260 |         version compared to the cached version is available (based on file
261 |         Etag). This parameter is ignored if force_download is True, defaults
262 |         to False.
263 |     :return: True if the file should be downloaded again, False otherwise
264 |     """
265 |     if filepath.is_file():
266 |         if force_download:
267 |             # Always download the file if force_download is True
268 |             return True
269 |
270 |         if download_newer:
271 |             # Check if the file is up to date
272 |             cached_etag = get_file_etag(filepath)
273 |             current_etag = fetch_etag(url)
274 |             return cached_etag != current_etag
275 |         else:
276 |             # The file exists, no need to download it again
277 |             return False
278 |
279 |     return True
280 |
281 |
282 | def get_country_name(country: Country) -> str:
283 |     """Return the canonical country name (ex: `en:portugal`) for a `Country`."""
284 |     return COUNTRY_CODE_TO_NAME[country]
285 |
286 |
287 | class AssetLoadingException(Exception):
288 |     """Exception raised by `get_asset_from_url` when an asset cannot be fetched
289 |     from URL or if loading failed.
290 |     """
291 |
292 |     pass
293 |
294 |
295 | @dataclasses.dataclass
296 | class AssetDownloadItem:
297 |     """The result of an asset download operation.
298 |
299 |     :param url: the URL of the asset
300 |     :param response: the requests response object (or None)
301 |     :param error: the error message if an error occurred (or None)
302 |     """
303 |
304 |     url: str
305 |     response: Optional[requests.Response] = None
306 |     error: Optional[str] = None
307 |
308 |
309 | @dataclasses.dataclass
310 | class ImageDownloadItem(AssetDownloadItem):
311 |     """The result of an image download operation.
312 |
313 |     :param image: the loaded PIL image, or None if an error occurred
314 |     :param image_bytes: the image bytes, or None if an error occurred
315 |     """
316 |
317 |     image: Optional["Image.Image"] = None
318 |     image_bytes: Optional[bytes] = None
319 |
320 |
321 | def get_asset_from_url(
322 |     asset_url: str,
323 |     error_raise: bool = True,
324 |     session: Optional[requests.Session] = None,
325 |     auth: Optional[tuple[str, str]] = None,
326 | ) -> AssetDownloadItem:
327 |     try:
328 |         if session:
329 |             r = session.get(asset_url, auth=auth)
330 |         else:
331 |             r = requests.get(asset_url, auth=auth)
332 |     except (
333 |         requests.exceptions.ConnectionError,
334 |         requests.exceptions.SSLError,
335 |         requests.exceptions.Timeout,
336 |     ) as e:
337 |         error_message = "Cannot download %s"
338 |         if error_raise:
339 |             raise AssetLoadingException(error_message % asset_url) from e
340 |         logger.info(error_message, asset_url, exc_info=e)
341 |         return AssetDownloadItem(asset_url, error=error_message % asset_url)
342 |
343 |     if not r.ok:
344 |         error_message = "Cannot download %s: HTTP %s"
345 |         error_args = (asset_url, r.status_code)
346 |         if error_raise:
347 |             raise AssetLoadingException(error_message % error_args)
348 |         logger.log(
349 |             logging.INFO if r.status_code < 500 else logging.WARNING,
350 |             error_message,
351 |             *error_args,
352 |         )
353 |         return AssetDownloadItem(
354 |             asset_url, response=r, error=error_message % error_args
355 |         )
356 |
357 |     return AssetDownloadItem(asset_url, response=r)
358 |
359 |
360 | def get_image_from_url(
361 |     image_url: str,
362 |     error_raise: bool = True,
363 |     session: Optional[requests.Session] = None,
364 |     return_struct: bool = False,
365 | ) -> Union[ImageDownloadItem, "Image.Image", None]:
366 |     """Fetch an image from `image_url` and load it.
367 |
368 |     :param image_url: URL of the image to load.
369 |     :param error_raise: if True, raise an `AssetLoadingException` if an
370 |         error occurs, defaults to True. If False, None is returned if an
371 |         error occurs.
372 |     :param session: requests Session to use, by default no session is used.
373 |     :param return_struct: if True, return an `ImageDownloadItem` object
374 |         containing the image, image bytes and the response object.
375 |     :return: the loaded image, or None if an error occurred and `error_raise`
376 |         is False. If `return_struct` is True, return an `ImageDownloadItem`
377 |         object.
378 |     """
379 |     if not _pillow_available:
380 |         raise ImportError("Pillow is required to load images")
381 |
382 |     asset_item = get_asset_from_url(image_url, error_raise, session)
383 |     response = asset_item.response
384 |     if response is None or asset_item.error:
385 |         if return_struct:
386 |             return ImageDownloadItem(
387 |                 url=image_url, response=response, error=asset_item.error
388 |             )
389 |         else:
390 |             return None
391 |
392 |     content_bytes = response.content
393 |     try:
394 |         image = Image.open(BytesIO(content_bytes))
395 |         if return_struct:
396 |             return ImageDownloadItem(
397 |                 url=image_url,
398 |                 response=response,
399 |                 image=image,
400 |                 image_bytes=content_bytes,
401 |             )
402 |         return image
403 |     except PIL.UnidentifiedImageError:
404 |         error_message = f"Cannot identify image {image_url}"
405 |         if error_raise:
406 |             raise AssetLoadingException(error_message)
407 |         logger.info(error_message)
408 |     except PIL.Image.DecompressionBombError:
409 |         error_message = f"Decompression bomb error for image {image_url}"
410 |         if error_raise:
411 |             raise AssetLoadingException(error_message)
412 |         logger.info(error_message)
413 |
414 |     if return_struct:
415 |         return ImageDownloadItem(url=image_url, response=response, error=error_message)
416 |
417 |     return None
418 |
419 |
420 | class PerfTimer:
421 |     """A simple performance timer context manager."""
422 |
423 |     def __init__(
424 |         self, metric_name: Optional[str] = None, metric_dict: Optional[Dict] = None
425 |     ):
426 |         self.metric_name = metric_name
427 |         self.metric_dict = metric_dict
428 |
429 |     def __enter__(self):
430 |         self.start = perf_counter()
431 |         return self
432 |
433 |     def __exit__(self, type, value, traceback):
434 |         self.elapsed = perf_counter() - self.start
435 |         if self.metric_name and self.metric_dict is not None:
436 |             self.metric_dict[self.metric_name] = self.elapsed
437 |
--------------------------------------------------------------------------------
/openfoodfacts/ml/object_detection.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import typing
4 |
5 | import albumentations as A
6 | import cv2
7 | import numpy as np
8 | from cv2 import dnn
9 | from tritonclient.grpc import service_pb2
10 |
11 | from openfoodfacts.types import JSONType
12 | from openfoodfacts.utils import PerfTimer
13 |
14 | from .triton import add_triton_infer_input_tensor, get_triton_inference_stub
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | def object_detection_transform(
20 |     image_size: int,
21 |     fill: int = 114,
22 |     pad_position: str = "center",
23 |     normalize_mean: tuple[float, float, float] = (0.0, 0.0, 0.0),
24 |     normalize_std: tuple[float, float, float] = (1.0, 1.0, 1.0),
25 | ) -> A.Compose:
26 |     """Return the Albumentations transform pipeline for object detection.
27 |
28 |     It resizes the image to fit within a square of size (image_size,
29 |     image_size), preserving the aspect ratio, then pads the image to make it
30 |     square, and finally normalizes the image.
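    A quick shape check (sketch):

        >>> import numpy as np
        >>> transform = object_detection_transform(image_size=640)
        >>> transform(image=np.zeros((480, 320, 3), dtype=np.uint8))["image"].shape
        (640, 640, 3)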
31 | 32 | With the default settings, this pipeline matches the preprocessing used by 33 | Ultralytics YOLO models. 34 | 35 | Args: 36 | image_size (int): The target size for the longest side of the image. 37 | fill (int): The pixel value to use for padding. Default is 114. 38 | pad_position (str): The position to place the original image when 39 | padding. Default is "center". 40 | normalize_mean (tuple): The mean values for normalization. Default is 41 | (0.0, 0.0, 0.0). 42 | normalize_std (tuple): The std values for normalization. Default is 43 | (1.0, 1.0, 1.0). 44 | """ 45 | return A.Compose( 46 | [ 47 | A.LongestMaxSize(max_size=image_size, interpolation=cv2.INTER_LINEAR), 48 | A.PadIfNeeded( 49 | min_height=image_size, 50 | min_width=image_size, 51 | position=pad_position, 52 | fill=fill, 53 | ), 54 | A.Normalize(mean=normalize_mean, std=normalize_std, p=1.0), 55 | ], 56 | ) 57 | 58 | 59 | def reverse_bbox_transform( 60 | augmented_bbox: list, original_shape: tuple, image_size: int 61 | ) -> list: 62 | """ 63 | Reverses the Albumentations pipeline to find original bbox coordinates. 64 | 65 | Args: 66 | augmented_bbox (list): [y_min, x_min, y_max, x_max] from the 67 | augmented (image_size x image_size) image. 68 | original_shape (tuple): (height, width) of the *original* image. 69 | image_size (int): The target size used in the pipeline. 70 | 71 | Returns: 72 | list: [y_min, x_min, y_max, x_max] in relative coordinates. 73 | """ 74 | 75 | original_h, original_w = original_shape 76 | 77 | # --- 1. Re-calculate the forward transform parameters --- 78 | 79 | # From A.LongestMaxSize 80 | scale = image_size / max(original_h, original_w) 81 | 82 | # The dimensions of the image *after* scaling but *before* padding 83 | scaled_h = int(original_h * scale) 84 | scaled_w = int(original_w * scale) 85 | 86 | # From A.PadIfNeeded (position="center") 87 | # This is the amount of padding added to each side 88 | pad_top = (image_size - scaled_h) // 2 89 | pad_left = (image_size - scaled_w) // 2 90 | 91 | # --- 2. Apply the inverse transformation --- 92 | aug_y_min, aug_x_min, aug_y_max, aug_x_max = augmented_bbox 93 | 94 | # coord_orig = (coord_aug - padding) / scale 95 | orig_y_min = (aug_y_min - pad_top) / scale 96 | orig_x_min = (aug_x_min - pad_left) / scale 97 | orig_y_max = (aug_y_max - pad_top) / scale 98 | orig_x_max = (aug_x_max - pad_left) / scale 99 | 100 | return [ 101 | orig_y_min / original_h, 102 | orig_x_min / original_w, 103 | orig_y_max / original_h, 104 | orig_x_max / original_w, 105 | ] 106 | 107 | 108 | @dataclasses.dataclass 109 | class ObjectDetectionRawResult: 110 | """The raw result of an object detection model. 111 | 112 | Attributes: 113 | num_detections (int): The number of detections. 114 | detection_boxes (np.ndarray): The bounding boxes of the detections, in 115 | relative coordinates (between 0 and 1), with the format 116 | (y_min, x_min, y_max, x_max). 117 | detection_scores (np.ndarray): The scores of the detections. 118 | detection_classes (np.ndarray): The class indices of the detections. 119 | label_names (list[str]): The list of label names. 120 | metrics (dict[str, float]): The performance metrics of the detection. 121 | Each key is the name of the metric (a step in the inference 122 | process), and the value is the time taken in seconds. 
123 |             The following metrics are provided:
124 |             - preprocess_time: time taken to preprocess the image
125 |             - grpc_request_build_time: time taken to build the gRPC request
126 |             - triton_inference_time: time taken for Triton inference
127 |             - postprocess_time: time taken to postprocess the results
128 |             - postprocess_nms_time: time taken for Non-Maximum Suppression
129 |               (included in postprocess_time)
130 |     """
131 |
132 |     num_detections: int
133 |     detection_boxes: np.ndarray
134 |     detection_scores: np.ndarray
135 |     detection_classes: np.ndarray
136 |     label_names: list[str]
137 |     metrics: dict[str, float] = dataclasses.field(default_factory=dict)
138 |
139 |     def to_list(self) -> list[JSONType]:
140 |         """Convert the detection results to a JSON serializable format."""
141 |         results = []
142 |         for bounding_box, score, label in zip(
143 |             self.detection_boxes, self.detection_scores, self.detection_classes
144 |         ):
145 |             label_int = int(label)
146 |             label_str = self.label_names[label_int]
147 |             if label_str is not None:
148 |                 result = {
149 |                     "bounding_box": tuple(bounding_box.tolist()),  # type: ignore
150 |                     "score": float(score),
151 |                     "label": label_str,
152 |                 }
153 |                 results.append(result)
154 |         return results
155 |
156 |
157 | class ObjectDetector:
158 |     def __init__(self, model_name: str, label_names: list[str], image_size: int = 640):
159 |         """An object detector based on YOLO models.
160 |
161 |         We support models trained with YOLOv8, v9, v10, v11 and v12 from
162 |         Ultralytics.
163 |
164 |         :param model_name: the name of the model, as registered in Triton
165 |         :param label_names: the list of label names
166 |         :param image_size: the size of the input image for the model
167 |         """
168 |         self.model_name: str = model_name
169 |         self.label_names = label_names
170 |         self.image_size = image_size
171 |
172 |     def detect_from_image(
173 |         self,
174 |         image: np.ndarray,
175 |         triton_uri: str,
176 |         threshold: float = 0.5,
177 |         nms_threshold: float | None = None,
178 |         nms_eta: float | None = None,
179 |         model_version: str | None = None,
180 |     ) -> ObjectDetectionRawResult:
181 |         """Run an object detection model on an image.
182 |
183 |         The model must have been trained with the Ultralytics library.
184 |
185 |         :param image: the input numpy image
186 |         :param triton_uri: URI of the Triton Inference Server; this
187 |             parameter is required and has no default value.
188 |         :param threshold: the minimum score for a detection to be considered,
189 |             defaults to 0.5.
190 |         :param nms_threshold: the NMS (Non Maximum Suppression) threshold to
191 |             use, defaults to None (0.7 will be used).
192 |         :param nms_eta: the NMS eta parameter to use, defaults to None (1.0
193 |             will be used).
194 |         :param model_version: the version of the model to use, defaults to
195 |             None (latest).
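        Example (sketch; the model name, label list and Triton URI are
        assumptions):

            >>> detector = ObjectDetector(  # doctest: +SKIP
            ...     model_name="nutrition_table", label_names=["nutrition-table"]
            ... )
            >>> result = detector.detect_from_image(  # doctest: +SKIP
            ...     image, triton_uri="localhost:8001"
            ... )
            >>> result.to_list()  # doctest: +SKIP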
196 | :return: the detection result 197 | """ 198 | metrics: dict[str, float] = {} 199 | 200 | with PerfTimer("preprocess_time", metrics): 201 | image_array = self.preprocess(image_array=image) 202 | 203 | with PerfTimer("grpc_request_build_time", metrics): 204 | request = service_pb2.ModelInferRequest() 205 | request.model_name = self.model_name 206 | if model_version: 207 | request.model_version = model_version 208 | add_triton_infer_input_tensor( 209 | request, name="images", data=image_array, datatype="FP32" 210 | ) 211 | 212 | with PerfTimer("triton_inference_time", metrics): 213 | grpc_stub = get_triton_inference_stub(triton_uri) 214 | response = grpc_stub.ModelInfer(request) 215 | 216 | with PerfTimer("postprocess_time", metrics): 217 | original_shape = typing.cast(tuple[int, int], image.shape[:2]) 218 | response = self.postprocess( 219 | response, 220 | threshold=threshold, 221 | original_shape=original_shape, 222 | nms_threshold=nms_threshold, 223 | nms_eta=nms_eta, 224 | ) 225 | 226 | metrics.update(response.metrics) 227 | metrics["total_inference_time"] = ( 228 | metrics["preprocess_time"] 229 | + metrics["grpc_request_build_time"] 230 | + metrics["triton_inference_time"] 231 | + metrics["postprocess_time"] 232 | ) 233 | response.metrics = metrics 234 | return response 235 | 236 | def preprocess(self, image_array: np.ndarray) -> np.ndarray: 237 | # Apply the transform to the image 238 | image_array = object_detection_transform(image_size=self.image_size)( 239 | image=image_array 240 | )["image"] 241 | image_array = np.transpose(image_array, (2, 0, 1))[np.newaxis, :] # HWC to CHW 242 | return image_array 243 | 244 | def postprocess( 245 | self, 246 | response, 247 | threshold: float, 248 | original_shape: tuple[int, int], 249 | nms_threshold: float | None = None, 250 | nms_eta: float | None = None, 251 | ) -> ObjectDetectionRawResult: 252 | """Postprocess the output of the object detection model. 253 | 254 | :param response: the Triton Inference response 255 | :param threshold: the minimum score for a detection to be considered 256 | :param original_shape: the original shape of the image (height, width) 257 | :param nms_threshold: the NMS (Non Maximum Suppression) threshold to 258 | use, defaults to None (0.7 will be used). 259 | :param nms_eta: the NMS eta parameter to use, defaults to None (1.0 260 | will be used). 
261 |         :return: the detection result
262 |         """
263 |         if len(response.outputs) != 1:
264 |             raise ValueError(f"expected 1 output, got {len(response.outputs)}")
265 |
266 |         if len(response.raw_output_contents) != 1:
267 |             raise ValueError(
268 |                 f"expected 1 raw output content, got {len(response.raw_output_contents)}"
269 |             )
270 |
271 |         if nms_threshold is None:
272 |             nms_threshold = 0.7
273 |         if nms_eta is None:
274 |             nms_eta = 1.0
275 |
276 |         output_index = {output.name: i for i, output in enumerate(response.outputs)}
277 |         output = np.frombuffer(
278 |             response.raw_output_contents[output_index["output0"]],
279 |             dtype=np.float32,
280 |         ).reshape((1, len(self.label_names) + 4, -1))[0]
281 |
282 |         # output is of shape (num_classes + 4, num_candidates)
283 |         rows = output.shape[1]
284 |         raw_detection_classes = np.zeros(rows, dtype=int)
285 |         raw_detection_scores = np.zeros(rows, dtype=np.float32)
286 |         raw_detection_boxes = np.zeros((rows, 4), dtype=np.float32)
287 |
288 |         for i in range(rows):
289 |             classes_scores = output[4:, i]
290 |             max_cls_idx = np.argmax(classes_scores)
291 |             max_score = classes_scores[max_cls_idx]
292 |             if max_score < threshold:
293 |                 continue
294 |             raw_detection_classes[i] = max_cls_idx
295 |             raw_detection_scores[i] = max_score
296 |
297 |             # The bounding box is in the format (x, y, width, height) in
298 |             # relative coordinates
299 |             # x and y are the coordinates of the center of the bounding box
300 |             bbox_width = output[2, i]
301 |             bbox_height = output[3, i]
302 |             x_min = output[0, i] - 0.5 * bbox_width
303 |             y_min = output[1, i] - 0.5 * bbox_height
304 |             x_max = x_min + bbox_width
305 |             y_max = y_min + bbox_height
306 |
307 |             # We save the bounding box in the format
308 |             # (y_min, x_min, y_max, x_max) in relative coordinates
309 |             # Scale the bounding boxes back to the original image size
310 |
311 |             reversed_bboxes = reverse_bbox_transform(
312 |                 augmented_bbox=[y_min, x_min, y_max, x_max],
313 |                 original_shape=original_shape,
314 |                 image_size=self.image_size,
315 |             )
316 |             raw_detection_boxes[i, 0] = max(0.0, min(1.0, reversed_bboxes[0]))
317 |             raw_detection_boxes[i, 1] = max(0.0, min(1.0, reversed_bboxes[1]))
318 |             raw_detection_boxes[i, 2] = max(0.0, min(1.0, reversed_bboxes[2]))
319 |             raw_detection_boxes[i, 3] = max(0.0, min(1.0, reversed_bboxes[3]))
320 |
321 |         metrics: dict[str, float] = {}
322 |         with PerfTimer("postprocess_nms_time", metrics):
323 |             # Perform NMS (Non Maximum Suppression)
324 |             detection_box_indices = dnn.NMSBoxes(
325 |                 raw_detection_boxes,  # type: ignore
326 |                 raw_detection_scores,  # type: ignore
327 |                 score_threshold=threshold,
328 |                 # the following values are copied from Ultralytics settings
329 |                 nms_threshold=nms_threshold,
330 |                 eta=nms_eta,
331 |             )
332 |         detection_classes = np.zeros(len(detection_box_indices), dtype=int)
333 |         detection_scores = np.zeros(len(detection_box_indices), dtype=np.float32)
334 |         detection_boxes = np.zeros((len(detection_box_indices), 4), dtype=np.float32)
335 |
336 |         for i, idx in enumerate(detection_box_indices):
337 |             detection_classes[i] = raw_detection_classes[idx]
338 |             detection_scores[i] = raw_detection_scores[idx]
339 |             detection_boxes[i] = raw_detection_boxes[idx]
340 |
341 |         result = ObjectDetectionRawResult(
342 |             num_detections=len(detection_box_indices),  # detections kept after NMS
343 |             detection_classes=detection_classes,
344 |             detection_boxes=detection_boxes,
345 |             detection_scores=detection_scores,
346 |             label_names=self.label_names,
347 |             metrics=metrics,
348 |         )
349 |         return result
350 |
-------------------------------------------------------------------------------- /tests/unit/test_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | import pytest 5 | import requests_mock 6 | 7 | import openfoodfacts 8 | 9 | TEST_USER_AGENT = "test_off_python" 10 | 11 | 12 | class TestProducts: 13 | def test_get_product(self): 14 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 15 | code = "1223435" 16 | response_data = { 17 | "product": {"code": "1223435"}, 18 | "status": 1, 19 | "status_verbose": "product found", 20 | } 21 | with requests_mock.mock() as mock: 22 | mock.get( 23 | f"https://world.openfoodfacts.org/api/v2/product/{code}", 24 | text=json.dumps(response_data), 25 | ) 26 | res = api.product.get(code) 27 | assert res == response_data["product"] 28 | 29 | def test_get_product_missing(self): 30 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 31 | code = "1223435" 32 | response_data = { 33 | "status": 0, 34 | "status_verbose": "product not found", 35 | } 36 | with requests_mock.mock() as mock: 37 | mock.get( 38 | f"https://world.openfoodfacts.org/api/v2/product/{code}", 39 | text=json.dumps(response_data), 40 | status_code=404, 41 | ) 42 | res = api.product.get(code) 43 | assert res is None 44 | 45 | def test_get_product_with_fields(self): 46 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 47 | code = "1223435" 48 | response_data = { 49 | "product": {"code": "1223435"}, 50 | "status": 1, 51 | "status_verbose": "product found", 52 | } 53 | with requests_mock.mock() as mock: 54 | mock.get( 55 | f"https://world.openfoodfacts.org/api/v2/product/{code}", 56 | text=json.dumps(response_data), 57 | ) 58 | res = api.product.get(code, fields=["code"]) 59 | assert res == response_data["product"] 60 | assert mock.last_request.qs["fields"] == ["code"] 61 | 62 | def test_get_product_invalid_code(self): 63 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 64 | code = "84800002930392025252502520502" 65 | response_data = { 66 | "status": 0, 67 | "status_verbose": "no code or invalid code", 68 | } 69 | with requests_mock.mock() as mock: 70 | mock.get( 71 | f"https://world.openfoodfacts.org/api/v2/product/{code}", 72 | text=json.dumps(response_data), 73 | status_code=200, 74 | ) 75 | res = api.product.get(code) 76 | assert res is None 77 | 78 | with pytest.raises( 79 | ValueError, 80 | match="invalid barcode: 84800002930392025252502520502", 81 | ): 82 | api.product.get(code, raise_if_invalid=True) 83 | 84 | def test_text_search(self): 85 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 86 | with requests_mock.mock() as mock: 87 | response_data = {"products": ["kinder bueno"], "count": 1} 88 | mock.get( 89 | "https://world.openfoodfacts.org/cgi/search.pl?" 90 | + "search_terms=kinder+bueno&json=1&page=" 91 | + "1&page_size=20", 92 | text=json.dumps(response_data), 93 | ) 94 | res = api.product.text_search("kinder bueno") 95 | assert res["products"] == ["kinder bueno"] 96 | response_data = {"products": ["banania", "banania big"], "count": 2} 97 | mock.get( 98 | "https://world.openfoodfacts.org/cgi/search.pl?" 
99 | + "search_terms=banania&json=1&page=" 100 | + "2&page_size=10&sort_by=unique_scans", 101 | text=json.dumps(response_data), 102 | ) 103 | res = api.product.text_search( 104 | "banania", page=2, page_size=10, sort_by="unique_scans" 105 | ) 106 | assert res["products"] == ["banania", "banania big"] 107 | 108 | def test_parse_ingredients(self): 109 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 110 | ingredients_data = [ 111 | { 112 | "ciqual_food_code": "18066", 113 | "ecobalyse_code": "tap-water", 114 | "id": "en:water", 115 | "is_in_taxonomy": 1, 116 | "percent_estimate": 75, 117 | "percent_max": 100, 118 | "percent_min": 50, 119 | "text": "eau", 120 | "vegan": "yes", 121 | "vegetarian": "yes", 122 | }, 123 | { 124 | "ciqual_proxy_food_code": "31016", 125 | "ecobalyse_code": "sugar", 126 | "id": "en:sugar", 127 | "is_in_taxonomy": 1, 128 | "percent_estimate": 25, 129 | "percent_max": 50, 130 | "percent_min": 0, 131 | "text": "sucre", 132 | "vegan": "yes", 133 | "vegetarian": "yes", 134 | }, 135 | ] 136 | with requests_mock.mock() as mock: 137 | response_data = { 138 | "product": {"ingredients": ingredients_data}, 139 | "status": "success", 140 | } 141 | mock.patch( 142 | "https://world.openfoodfacts.org/api/v3/product/test", 143 | text=json.dumps(response_data), 144 | ) 145 | res = api.product.parse_ingredients("eau, sucre", lang="fr") 146 | assert res == ingredients_data 147 | 148 | def test_parse_ingredients_fail(self): 149 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 150 | with requests_mock.mock() as mock: 151 | response_data = { 152 | "status": "fail", 153 | } 154 | mock.patch( 155 | "https://world.openfoodfacts.org/api/v3/product/test", 156 | text=json.dumps(response_data), 157 | ) 158 | 159 | with pytest.raises( 160 | RuntimeError, 161 | match="Unable to parse ingredients: {'status': 'fail'}", 162 | ): 163 | api.product.parse_ingredients("eau, sucre", lang="fr") 164 | 165 | def test_parse_ingredients_fail_non_HTTP_200(self): 166 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 167 | with requests_mock.mock() as mock: 168 | mock.patch( 169 | "https://world.openfoodfacts.org/api/v3/product/test", 170 | status_code=400, 171 | text='{"error": "Bad Request"}', 172 | ) 173 | 174 | with pytest.raises( 175 | RuntimeError, 176 | match=re.escape( 177 | 'Unable to parse ingredients (non-200 status code): 400, {"error": "Bad Request"}' 178 | ), 179 | ): 180 | api.product.parse_ingredients("eau, sucre", lang="fr") 181 | 182 | def test_upload_image_success(self): 183 | api = openfoodfacts.API( 184 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 185 | ) 186 | code = "1223435" 187 | response_data = { 188 | "code": "1223435", 189 | "errors": [], 190 | "product": { 191 | "images": { 192 | "uploaded": { 193 | "1": { 194 | "imgid": 1, 195 | "sizes": { 196 | "100": {"h": 100, "w": 62}, 197 | "400": {"h": 400, "w": 248}, 198 | "full": {"h": 400, "w": 248}, 199 | }, 200 | "uploaded_t": 1758793764, 201 | "uploader": "test", 202 | } 203 | } 204 | } 205 | }, 206 | "result": { 207 | "id": "image_uploaded", 208 | "lc_name": "Image uploaded", 209 | "name": "Image uploaded", 210 | }, 211 | "status": "success", 212 | "warnings": [], 213 | } 214 | with requests_mock.mock() as mock: 215 | mock.post( 216 | f"https://world.openfoodfacts.org/api/v3/product/{code}/images", 217 | text=json.dumps(response_data), 218 | status_code=200, 219 | ) 220 | res = api.product.upload_image(code, image_data_base64="dGVzdA==") 221 | assert 
res.status_code == 200 222 | assert mock.last_request.json() == { 223 | "image_data_base64": "dGVzdA==", 224 | "user_id": "test", 225 | "password": "test", 226 | } 227 | 228 | def test_upload_image_with_selected(self): 229 | api = openfoodfacts.API( 230 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 231 | ) 232 | code = "1223435" 233 | response_data = { 234 | "code": "1223435", 235 | "errors": [], 236 | "product": { 237 | "images": { 238 | "selected": { 239 | "front": { 240 | "en": { 241 | "generation": {}, 242 | "imgid": 1, 243 | "rev": 2, 244 | "sizes": { 245 | "100": {"h": 100, "w": 62}, 246 | "200": {"h": 200, "w": 124}, 247 | "400": {"h": 400, "w": 248}, 248 | "full": {"h": 400, "w": 248}, 249 | }, 250 | } 251 | } 252 | }, 253 | "uploaded": { 254 | "1": { 255 | "imgid": 1, 256 | "sizes": { 257 | "100": {"h": 100, "w": 62}, 258 | "400": {"h": 400, "w": 248}, 259 | "full": {"h": 400, "w": 248}, 260 | }, 261 | "uploaded_t": 1758793852, 262 | "uploader": "test", 263 | } 264 | }, 265 | } 266 | }, 267 | "result": { 268 | "id": "image_uploaded", 269 | "lc_name": "Image uploaded", 270 | "name": "Image uploaded", 271 | }, 272 | "status": "success", 273 | "warnings": [], 274 | } 275 | with requests_mock.mock() as mock: 276 | mock.post( 277 | f"https://world.openfoodfacts.org/api/v3/product/{code}/images", 278 | text=json.dumps(response_data), 279 | status_code=200, 280 | ) 281 | res = api.product.upload_image( 282 | code, image_data_base64="dGVzdA==", selected={"front": {"en": {}}} 283 | ) 284 | assert res.status_code == 200 285 | assert mock.last_request.json() == { 286 | "image_data_base64": "dGVzdA==", 287 | "user_id": "test", 288 | "password": "test", 289 | "selected": {"front": {"en": {}}}, 290 | } 291 | 292 | def test_upload_image_no_auth(self): 293 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 294 | code = "1223435" 295 | with pytest.raises( 296 | ValueError, 297 | match="a password or a session cookie is required to upload an image", 298 | ): 299 | api.product.upload_image(code, image_data_base64="dGVzdA==") 300 | 301 | def test_upload_image_invalid_code(self): 302 | api = openfoodfacts.API( 303 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 304 | ) 305 | code = "invalidcode" 306 | with pytest.raises( 307 | ValueError, 308 | match="code must be a numeric string", 309 | ): 310 | api.product.upload_image(code, image_data_base64="dGVzdA==") 311 | 312 | def test_upload_image_no_data(self): 313 | api = openfoodfacts.API( 314 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 315 | ) 316 | code = "1223435" 317 | with pytest.raises( 318 | ValueError, 319 | match="one of image_path or image_data_base64 must be provided", 320 | ): 321 | api.product.upload_image(code) 322 | 323 | def test_upload_image_both_data(self): 324 | api = openfoodfacts.API( 325 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 326 | ) 327 | code = "1223435" 328 | with pytest.raises( 329 | ValueError, 330 | match="only one of image_path or image_data_base64 must be provided", 331 | ): 332 | api.product.upload_image( 333 | code, image_path="path/to/image.jpg", image_data_base64="dGVzdA==" 334 | ) 335 | 336 | def test_upload_image_invalid_selected(self): 337 | api = openfoodfacts.API( 338 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 339 | ) 340 | code = "1223435" 341 | with pytest.raises( 342 | ValueError, 343 | match=re.escape( 344 | "invalid image field name 
in selected: wrong (must be one of front, ingredients, nutrition, packaging)" 345 | ), 346 | ): 347 | api.product.upload_image( 348 | code, image_data_base64="dGVzdA==", selected={"wrong": {}} 349 | ) 350 | 351 | def test_upload_image_with_path(self, tmp_path): 352 | api = openfoodfacts.API( 353 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 354 | ) 355 | code = "1223435" 356 | response_data = { 357 | "code": "1223435", 358 | "errors": [], 359 | "product": { 360 | "images": { 361 | "uploaded": { 362 | "1": { 363 | "imgid": 1, 364 | "sizes": { 365 | "100": {"h": 100, "w": 62}, 366 | "400": {"h": 400, "w": 248}, 367 | "full": {"h": 400, "w": 248}, 368 | }, 369 | "uploaded_t": 1758793764, 370 | "uploader": "test", 371 | } 372 | } 373 | } 374 | }, 375 | "result": { 376 | "id": "image_uploaded", 377 | "lc_name": "Image uploaded", 378 | "name": "Image uploaded", 379 | }, 380 | "status": "success", 381 | "warnings": [], 382 | } 383 | image_path = tmp_path / "test_image.jpg" 384 | image_path.write_bytes(b"test") 385 | with requests_mock.mock() as mock: 386 | mock.post( 387 | f"https://world.openfoodfacts.org/api/v3/product/{code}/images", 388 | text=json.dumps(response_data), 389 | status_code=200, 390 | ) 391 | res = api.product.upload_image(code, image_path=image_path) 392 | assert res.status_code == 200 393 | assert mock.last_request.json() == { 394 | "image_data_base64": "dGVzdA==", 395 | "user_id": "test", 396 | "password": "test", 397 | } 398 | -------------------------------------------------------------------------------- /openfoodfacts/taxonomy.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Dict, Iterable, List, Optional, Set, Union 3 | 4 | import requests 5 | 6 | from openfoodfacts.utils.text import get_tag, replace_lang_prefix 7 | 8 | from .types import Environment, Flavor, JSONType, TaxonomyType 9 | from .utils import ( 10 | URLBuilder, 11 | download_file, 12 | get_logger, 13 | http_session, 14 | load_json, 15 | should_download_file, 16 | ) 17 | 18 | logger = get_logger(__name__) 19 | 20 | 21 | DEFAULT_CACHE_DIR = Path("~/.cache/openfoodfacts/taxonomy").expanduser() 22 | 23 | 24 | # Only available for Open Food Facts for now (not other flavors) 25 | TAXONOMY_URLS = { 26 | TaxonomyType.category: URLBuilder.static(Flavor.off, Environment.org) 27 | + "/data/taxonomies/categories.full.json", 28 | TaxonomyType.ingredient: URLBuilder.static(Flavor.off, Environment.org) 29 | + "/data/taxonomies/ingredients.full.json", 30 | TaxonomyType.label: URLBuilder.static(Flavor.off, Environment.org) 31 | + "/data/taxonomies/labels.full.json", 32 | TaxonomyType.brand: URLBuilder.static(Flavor.off, Environment.org) 33 | + "/data/taxonomies/brands.full.json", 34 | TaxonomyType.packaging_shape: URLBuilder.static(Flavor.off, Environment.org) 35 | + "/data/taxonomies/packaging_shapes.full.json", 36 | TaxonomyType.packaging_material: URLBuilder.static(Flavor.off, Environment.org) 37 | + "/data/taxonomies/packaging_materials.full.json", 38 | TaxonomyType.packaging_recycling: URLBuilder.static(Flavor.off, Environment.org) 39 | + "/data/taxonomies/packaging_recycling.full.json", 40 | TaxonomyType.country: URLBuilder.static(Flavor.off, Environment.org) 41 | + "/data/taxonomies/countries.full.json", 42 | TaxonomyType.store: URLBuilder.static(Flavor.off, Environment.org) 43 | + "/data/taxonomies/stores.full.json", 44 | TaxonomyType.nova_group: URLBuilder.static(Flavor.off, Environment.org) 45 
| + "/data/taxonomies/nova_groups.full.json", 46 | TaxonomyType.additive: URLBuilder.static(Flavor.off, Environment.org) 47 | + "/data/taxonomies/additives.full.json", 48 | TaxonomyType.vitamin: URLBuilder.static(Flavor.off, Environment.org) 49 | + "/data/taxonomies/vitamins.full.json", 50 | TaxonomyType.mineral: URLBuilder.static(Flavor.off, Environment.org) 51 | + "/data/taxonomies/minerals.full.json", 52 | TaxonomyType.amino_acid: URLBuilder.static(Flavor.off, Environment.org) 53 | + "/data/taxonomies/amino_acids.full.json", 54 | TaxonomyType.nucleotide: URLBuilder.static(Flavor.off, Environment.org) 55 | + "/data/taxonomies/nucleotides.full.json", 56 | TaxonomyType.allergen: URLBuilder.static(Flavor.off, Environment.org) 57 | + "/data/taxonomies/allergens.full.json", 58 | TaxonomyType.state: URLBuilder.static(Flavor.off, Environment.org) 59 | + "/data/taxonomies/states.full.json", 60 | TaxonomyType.data_quality: URLBuilder.static(Flavor.off, Environment.org) 61 | + "/data/taxonomies/data_quality.full.json", 62 | TaxonomyType.origin: URLBuilder.static(Flavor.off, Environment.org) 63 | + "/data/taxonomies/origins.full.json", 64 | TaxonomyType.language: URLBuilder.static(Flavor.off, Environment.org) 65 | + "/data/taxonomies/languages.full.json", 66 | TaxonomyType.other_nutritional_substance: URLBuilder.static( 67 | Flavor.off, Environment.org 68 | ) 69 | + "/data/taxonomies/other_nutritional_substances.full.json", 70 | } 71 | 72 | 73 | class TaxonomyNode: 74 | """A taxonomy element. 75 | 76 | Each node has 0+ parents and 0+ children. Each node has the following 77 | attributes: 78 | 79 | - `id`: the node identifier, it starts with a language prefix (ex: `en:`) 80 | - `names`: a dict mapping language 2-letter code to the node name for this 81 | language 82 | - `parents`: the list of the node parents 83 | - `children`: the list of the node children 84 | - `properties`: additional properties of the node (taxonomy-dependent) 85 | - `synonyms`: a dict mapping language 2-letter code to a list of synonyms 86 | for this language 87 | """ 88 | 89 | __slots__ = ("id", "names", "parents", "children", "synonyms", "properties") 90 | 91 | def __init__( 92 | self, 93 | identifier: str, 94 | names: Dict[str, str], 95 | synonyms: Optional[Dict[str, List[str]]], 96 | properties: Optional[Dict[str, Any]] = None, 97 | ): 98 | self.id: str = identifier 99 | self.names: Dict[str, str] = names 100 | self.parents: List["TaxonomyNode"] = [] 101 | self.children: List["TaxonomyNode"] = [] 102 | self.properties = properties or {} 103 | 104 | if synonyms: 105 | self.synonyms = synonyms 106 | else: 107 | self.synonyms = {} 108 | 109 | def is_child_of(self, item: "TaxonomyNode") -> bool: 110 | """Return True if `item` is a child of `self` in the taxonomy.""" 111 | if not self.parents: 112 | return False 113 | 114 | if item in self.parents: 115 | return True 116 | 117 | for parent in self.parents: 118 | is_parent = parent.is_child_of(item) 119 | 120 | if is_parent: 121 | return True 122 | 123 | return False 124 | 125 | def is_parent_of(self, candidate: "TaxonomyNode") -> bool: 126 | """Return True if `self` is parent of `candidate`, False otherwise. 127 | 128 | :param candidate: a TaxonomyNode of the same Taxonomy 129 | """ 130 | return candidate.is_child_of(self) 131 | 132 | def is_parent_of_any(self, candidates: Iterable["TaxonomyNode"]) -> bool: 133 | """Return True if `self` is a parent of any of `candidates`, False 134 | otherwise. 
135 | 136 | :param candidates: an iterable of TaxonomyNodes of the same Taxonomy 137 | """ 138 | for candidate in candidates: 139 | if candidate.is_child_of(self): 140 | return True 141 | 142 | return False 143 | 144 | def get_parents_hierarchy(self) -> List["TaxonomyNode"]: 145 | """Return the list of all parent nodes (direct and indirect).""" 146 | all_parents = [] 147 | seen: Set[str] = set() 148 | 149 | if not self.parents: 150 | return [] 151 | 152 | for self_parent in self.parents: 153 | if self_parent.id not in seen: 154 | all_parents.append(self_parent) 155 | seen.add(self_parent.id) 156 | 157 | for parent_parent in self_parent.get_parents_hierarchy(): 158 | if parent_parent.id not in seen: 159 | all_parents.append(parent_parent) 160 | seen.add(parent_parent.id) 161 | 162 | return all_parents 163 | 164 | def get_localized_name(self, lang: str) -> str: 165 | """Return the localized name of the node. 166 | 167 | We first check if there is an entry in `names` under the provided 168 | `lang`. Otherwise, we check the existence of an international name 169 | (`xx`). We eventually return the node ID if none of the previous 170 | checks were successful. 171 | 172 | :param lang: the language code 173 | """ 174 | if lang in self.names: 175 | return self.names[lang] 176 | 177 | if "xx" in self.names: 178 | # Return international name if it exists 179 | return self.names["xx"] 180 | 181 | return self.id 182 | 183 | def get_synonyms(self, lang: str) -> List[str]: 184 | return self.synonyms.get(lang, []) 185 | 186 | def add_parents(self, parents: Iterable["TaxonomyNode"]): 187 | for parent in parents: 188 | if parent not in self.parents: 189 | self.parents.append(parent) 190 | parent.children.append(self) 191 | 192 | def to_dict(self) -> JSONType: 193 | return {"name": self.names, "parents": [p.id for p in self.parents]} 194 | 195 | def __repr__(self): 196 | return "<TaxonomyNode %s>" % self.id 197 | 198 | 199 | class Taxonomy: 200 | """A class representing a taxonomy. 201 | 202 | For more information about taxonomies, see 203 | https://wiki.openfoodfacts.org/Global_taxonomies. 204 | 205 | A Taxonomy instance has only a single `nodes` attribute, which maps the 206 | node identifier to a `TaxonomyNode`. 207 | """ 208 | 209 | def __init__(self) -> None: 210 | self.nodes: Dict[str, TaxonomyNode] = {} 211 | 212 | def add(self, key: str, node: TaxonomyNode) -> None: 213 | """Add a node to the taxonomy under the id `key`. 214 | 215 | :param key: The node id 216 | :param node: the TaxonomyNode 217 | """ 218 | self.nodes[key] = node 219 | 220 | def __contains__(self, item: str): 221 | """Return True if `item` (a taxonomy id) is in the taxonomy, False 222 | otherwise.""" 223 | return item in self.nodes 224 | 225 | def __getitem__(self, item: str): 226 | return self.nodes.get(item) 227 | 228 | def __len__(self) -> int: 229 | """Return the number of items in the taxonomy.""" 230 | return len(self.nodes) 231 | 232 | def iter_nodes(self) -> Iterable[TaxonomyNode]: 233 | """Iterate over the nodes of the taxonomy.""" 234 | return iter(self.nodes.values()) 235 | 236 | def keys(self) -> Iterable[str]: 237 | """Return all node IDs from the taxonomy.""" 238 | return self.nodes.keys() 239 | 240 | def find_deepest_nodes(self, nodes: List[TaxonomyNode]) -> List[TaxonomyNode]: 241 | """Given a list of nodes, return the list of nodes where all the 242 | parents within the list have been removed.
243 | 244 | For example, for a taxonomy 'fish' -> 'salmon' -> 'smoked-salmon': 245 | 246 | ['fish', 'salmon'] -> ['salmon'] 247 | ['fish', 'smoked-salmon'] -> ['smoked-salmon'] 248 | """ 249 | excluded: Set[str] = set() 250 | 251 | for node in nodes: 252 | for second_node in ( 253 | n for n in nodes if n.id not in excluded and n.id != node.id 254 | ): 255 | if node.is_child_of(second_node): 256 | excluded.add(second_node.id) 257 | 258 | return [node for node in nodes if node.id not in excluded] 259 | 260 | def is_parent_of_any( 261 | self, item: str, candidates: Iterable[str], raises: bool = True 262 | ) -> bool: 263 | """Return True if `item` is a parent of any candidate, False otherwise. 264 | 265 | If the item is not in the taxonomy and raises is False, return False. 266 | 267 | :param item: The item to compare 268 | :param candidates: A list of candidates 269 | :param raises: if True, raises a ValueError if item is not in the 270 | taxonomy, defaults to True. 271 | """ 272 | node: Optional[TaxonomyNode] = self[item] 273 | 274 | if node is None: 275 | if raises: 276 | raise ValueError("unknown id in taxonomy: %s" % item) 277 | else: 278 | return False 279 | 280 | to_check_nodes: Set[TaxonomyNode] = set() 281 | 282 | for candidate in candidates: 283 | candidate_node = self[candidate] 284 | 285 | if candidate_node is not None: 286 | to_check_nodes.add(candidate_node) 287 | 288 | return node.is_parent_of_any(to_check_nodes) 289 | 290 | def get_localized_name(self, key: str, lang: str) -> str: 291 | """Return the name of a taxonomy element in a given language. 292 | 293 | If `key` is not in the taxonomy or if no name is available for the 294 | requested language, return `key`. 295 | 296 | :param key: the taxonomy element id 297 | :param lang: the 2-letter language code 298 | :return: the localized name 299 | """ 300 | if key not in self.nodes: 301 | return key 302 | 303 | return self.nodes[key].get_localized_name(lang) 304 | 305 | def to_dict(self) -> JSONType: 306 | """Generate a dict from the Taxonomy.""" 307 | export = {} 308 | 309 | for key, node in self.nodes.items(): 310 | export[key] = node.to_dict() 311 | 312 | return export 313 | 314 | @classmethod 315 | def from_dict(cls, data: JSONType) -> "Taxonomy": 316 | """Create a Taxonomy from `data`. 317 | 318 | :param data: the taxonomy as a dict 319 | :return: a Taxonomy 320 | """ 321 | taxonomy = Taxonomy() 322 | 323 | for key, key_data in data.items(): 324 | if key not in taxonomy: 325 | node = TaxonomyNode( 326 | identifier=key, 327 | names=key_data.get("name", {}), 328 | synonyms=key_data.get("synonyms", None), 329 | properties={ 330 | k: v 331 | for k, v in key_data.items() 332 | if k not in {"parents", "name", "synonyms", "children"} 333 | }, 334 | ) 335 | taxonomy.add(key, node) 336 | 337 | for key, key_data in data.items(): 338 | node = taxonomy[key] 339 | parents = [taxonomy[ref] for ref in key_data.get("parents", [])] 340 | node.add_parents(parents) 341 | 342 | return taxonomy 343 | 344 | @classmethod 345 | def from_path(cls, file_path: Union[str, Path]) -> "Taxonomy": 346 | """Create a Taxonomy from a JSON file. 347 | 348 | :param file_path: a JSON file, gzipped (.json.gz) files are supported 349 | :return: a Taxonomy 350 | """ 351 | return cls.from_dict(load_json(file_path)) # type: ignore 352 | 353 | @classmethod 354 | def from_url( 355 | cls, url: str, session: Optional[requests.Session] = None, timeout: int = 120 356 | ) -> "Taxonomy": 357 | """Create a Taxonomy from a taxonomy file hosted at `url`.
358 | 359 | :param url: the URL of the taxonomy 360 | :param session: the requests session, use a default session if None 361 | :param timeout: the request timeout, defaults to 120 362 | :return: a Taxonomy 363 | """ 364 | session = http_session if session is None else session 365 | r = session.get(url, timeout=timeout) 366 | data = r.json() 367 | return cls.from_dict(data) 368 | 369 | @classmethod 370 | def from_type(cls, taxonomy_type: TaxonomyType) -> "Taxonomy": 371 | """Create a Taxonomy from the taxonomy file hosted online for the 372 | given taxonomy type. 373 | 374 | :param taxonomy_type: the taxonomy type 375 | :return: a Taxonomy 376 | """ 377 | url = TAXONOMY_URLS[TaxonomyType[taxonomy_type]] 378 | return cls.from_url(url) 379 | 380 | 381 | def get_taxonomy( 382 | taxonomy_type: Union[TaxonomyType, str], 383 | force_download: bool = False, 384 | download_newer: bool = False, 385 | cache_dir: Optional[Path] = None, 386 | ) -> Taxonomy: 387 | """Return the taxonomy of the provided type. 388 | 389 | The taxonomy file is downloaded and cached locally. 390 | 391 | :param taxonomy_type: the requested taxonomy type 392 | :param force_download: if True, (re)download the taxonomy even if it was 393 | cached, defaults to False 394 | :param download_newer: if True, download the taxonomy if a more recent 395 | version compared to the cached version is available (based on the file 396 | ETag). This parameter is ignored if force_download is True, defaults 397 | to False. 398 | :param cache_dir: the cache directory to use, defaults to 399 | ~/.cache/openfoodfacts/taxonomy 400 | :return: a Taxonomy 401 | """ 402 | taxonomy_type = TaxonomyType[taxonomy_type] 403 | filename = f"{taxonomy_type.name}.json" 404 | 405 | cache_dir = DEFAULT_CACHE_DIR if cache_dir is None else cache_dir 406 | taxonomy_path = cache_dir / filename 407 | url = TAXONOMY_URLS[taxonomy_type] 408 | 409 | if not should_download_file(url, taxonomy_path, force_download, download_newer): 410 | return Taxonomy.from_path(taxonomy_path) 411 | 412 | cache_dir.mkdir(parents=True, exist_ok=True) 413 | logger.info("Downloading taxonomy, saving it in %s", taxonomy_path) 414 | download_file(url, taxonomy_path) 415 | return Taxonomy.from_path(taxonomy_path) 416 | 417 | 418 | def create_taxonomy_mapping(taxonomy: Taxonomy) -> Dict[str, str]: 419 | """From a taxonomy, create a mapping of tags to taxonomy node ids. 420 | 421 | The mapping is created by iterating over the nodes of the taxonomy and 422 | creating a tag from the name and synonyms of each node. 423 | 424 | The taxonomy mapping has the following format: 425 | { 426 | "fr:noix": "en:nuts", 427 | "en:nuts": "en:nuts", 428 | ...
429 | } 430 | 431 | :param taxonomy: the taxonomy to use 432 | :return: a dict mapping tags (with language prefix) to taxonomy node ids 433 | """ 434 | mapping = {} 435 | for node in taxonomy.iter_nodes(): 436 | for lang, name in node.names.items(): 437 | tag = get_tag(name) 438 | tag_id = f"{lang}:{tag}".lower() 439 | mapping[tag_id] = node.id 440 | 441 | for lang, synonyms in node.synonyms.items(): 442 | for synonym in synonyms: 443 | tag = get_tag(synonym) 444 | tag_id = f"{lang}:{tag}".lower() 445 | mapping[tag_id] = node.id 446 | return mapping 447 | 448 | 449 | def is_prefixed_value(value: str) -> bool: 450 | """Return True if the given value has a language prefix (en:, fr:,...), 451 | False otherwise.""" 452 | return len(value) > 3 and value[2] == ":" 453 | 454 | 455 | def create_brand_taxonomy_mapping(taxonomy: Taxonomy) -> Dict[str, str]: 456 | """From a brand taxonomy, create a mapping of tags to taxonomy brand names. 457 | 458 | The mapping generated is different than the mapping generated by the 459 | `create_taxonomy_mapping` function, as it maps an unprefixed value 460 | (ex: `nestle`) to a brand name, with capitalization and accents 461 | (ex: `Nestlé`). 462 | 463 | The taxonomy mapping has the following format: 464 | { 465 | "alva": "Alva", 466 | "benecop": "Bénécop", 467 | ... 468 | } 469 | 470 | :param taxonomy: the taxonomy to use (brand taxonomy) 471 | :return: a dict mapping tags (*without* language prefix) to brand values 472 | (capitalized) 473 | """ 474 | mapping = {} 475 | for node in taxonomy.iter_nodes(): 476 | unprefixed_key = node.id 477 | if is_prefixed_value(node.id): 478 | prefix = node.id[:2] 479 | unprefixed_key = node.id[3:] 480 | mapping[unprefixed_key] = node.names.get( 481 | "xx", node.names.get("en", node.names.get(prefix, unprefixed_key)) 482 | ) 483 | return mapping 484 | 485 | 486 | def map_to_canonical_id( 487 | taxonomy_mapping: Dict[str, str], values: List[str] 488 | ) -> Dict[str, str]: 489 | """Map a list of values to their canonical taxonomy id. 490 | 491 | Each value should be a tag in the form `lang:tag`. If a value is not found 492 | in the taxonomy mapping, it is returned as is, in its tag form. 493 | 494 | :param taxonomy_mapping: a mapping of tags to taxonomy node ids, generated 495 | by `create_taxonomy_mapping` 496 | :param values: a list of string values 497 | :return: a dict mapping values to their canonical taxonomy id 498 | """ 499 | for value in values: 500 | if len(value) < 3 or value[2] != ":": 501 | raise ValueError( 502 | f"Invalid value: '{value}', expected value to be in 'lang:tag' format" 503 | ) 504 | 505 | output = {} 506 | for value in values: 507 | tag = get_tag(value) 508 | output[value] = ( 509 | # Look for a direct match first 510 | taxonomy_mapping.get(tag) 511 | # Then look for a match with the xx prefix (language-independent 512 | # entry) 513 | or taxonomy_mapping.get(replace_lang_prefix(tag, "xx")) 514 | # If no match is found, return the original taggified value 515 | or tag 516 | ) 517 | 518 | return output 519 | --------------------------------------------------------------------------------
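Usage sketch for the classes above (illustrative only, not part of the library). The three-node hierarchy is invented data; it exercises the TaxonomyNode/Taxonomy traversal API defined in openfoodfacts/taxonomy.py.

from openfoodfacts.taxonomy import Taxonomy

# Invented hierarchy: en:fish <- en:salmon <- en:smoked-salmon
data = {
    "en:fish": {"name": {"en": "Fish", "fr": "Poisson"}},
    "en:salmon": {"name": {"en": "Salmon"}, "parents": ["en:fish"]},
    "en:smoked-salmon": {"name": {"en": "Smoked salmon"}, "parents": ["en:salmon"]},
}
taxonomy = Taxonomy.from_dict(data)

fish = taxonomy["en:fish"]
smoked = taxonomy["en:smoked-salmon"]
assert smoked.is_child_of(fish)  # transitive, through en:salmon
assert fish.is_parent_of(smoked)
# direct parent first, then indirect ones
assert [n.id for n in smoked.get_parents_hierarchy()] == ["en:salmon", "en:fish"]
assert taxonomy.get_localized_name("en:fish", "fr") == "Poisson"
assert taxonomy.get_localized_name("en:salmon", "fr") == "en:salmon"  # no fr/xx name -> id
assert taxonomy.find_deepest_nodes([fish, smoked]) == [smoked]  # en:fish is a parent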
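A second sketch, for get_taxonomy and its local cache. This assumes network access on the first call; the cached file lives under the module's DEFAULT_CACHE_DIR (~/.cache/openfoodfacts/taxonomy) unless cache_dir is passed.

from openfoodfacts.taxonomy import get_taxonomy

# First call downloads categories.full.json and writes it to the cache
# directory; later calls read the cached copy unless force_download is set
# or download_newer detects a newer remote version via the file's ETag.
categories = get_taxonomy("category")  # a str or a TaxonomyType both work
print(len(categories))  # number of nodes in the taxonomy

# Force a re-download, e.g. after a taxonomy update upstream:
categories = get_taxonomy("category", force_download=True)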
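A sketch of the tag-canonicalization helpers. The "fr:noix" -> "en:nuts" pair comes from the docstring above; the actual output depends on the live taxonomy content, and "en:some-unknown-tag" is an invented value showing the fallback behavior.

from openfoodfacts.taxonomy import (
    create_taxonomy_mapping,
    get_taxonomy,
    map_to_canonical_id,
)

ingredient_taxonomy = get_taxonomy("ingredient")
mapping = create_taxonomy_mapping(ingredient_taxonomy)
# Input values must be "lang:tag" strings; anything else raises ValueError.
canonical = map_to_canonical_id(mapping, ["fr:noix", "en:some-unknown-tag"])
# Expected shape (exact ids depend on the taxonomy):
# {"fr:noix": "en:nuts", "en:some-unknown-tag": "en:some-unknown-tag"}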
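Finally, a sketch of the brand mapping. The "nestle" -> "Nestlé" pair is the docstring's own example; real output depends on the current brand taxonomy.

from openfoodfacts.taxonomy import create_brand_taxonomy_mapping, get_taxonomy

brand_mapping = create_brand_taxonomy_mapping(get_taxonomy("brand"))
# Keys are unprefixed tags; values keep capitalization and accents.
print(brand_mapping.get("nestle"))  # e.g. "Nestlé"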