├── tests ├── __init__.py ├── ml │ ├── __init__.py │ ├── test_utils.py │ ├── test_object_detection.py │ └── test_image_classification.py └── unit │ ├── __init__.py │ ├── test_types.py │ ├── test_api_config.py │ ├── utils │ ├── test_text.py │ └── test_utils.py │ ├── test_ocr.py │ ├── test_barcode.py │ ├── test_taxonomy.py │ ├── test_redis.py │ └── test_api.py ├── LICENSE ├── openfoodfacts ├── py.typed ├── __init__.py ├── ml │ ├── utils.py │ ├── triton.py │ ├── image_classification.py │ └── object_detection.py ├── ingredients.py ├── barcode.py ├── utils │ ├── text.py │ └── __init__.py ├── dataset.py ├── images.py ├── redis.py └── taxonomy.py ├── .release-please-manifest.json ├── MANIFEST.in ├── .coveragerc ├── release-please-config.json ├── .flake8 ├── .editorconfig ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── CODEOWNERS ├── workflows │ ├── auto-assign-pr.yml │ ├── reuse.yaml │ ├── semantic-pr.yml │ ├── publish-conda.yml │ ├── merge-conflict-autolabel.yml │ ├── pypi.yml │ ├── release-please.yml │ ├── label.yml │ ├── generate-docs.yml │ ├── ci.yml │ ├── codeql-analysis.yml │ └── github-projects.yml ├── dependabot.yml └── labeler.yml ├── .pre-commit-config.yaml ├── REUSE.toml ├── conda └── meta.yaml ├── mkdocs.yml ├── REUSE.md ├── .gitignore ├── LICENSES ├── MIT.txt └── CC0-1.0.txt ├── pyproject.toml ├── docs ├── handle_taxonomies.md └── usage.md └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LICENSES/MIT.txt -------------------------------------------------------------------------------- /openfoodfacts/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.release-please-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | ".": "3.3.0" 3 | } -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | global-include *.typed 3 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = ./openfoodfacts 3 | omit = ./venv/*,*tests* 4 | 5 | [report] 6 | omit = ./venv/*,*tests*,*mi 7 | -------------------------------------------------------------------------------- /release-please-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "packages": { 3 | ".": { 4 | "release-type": "python" 5 | } 6 | } 7 | } -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E501, W503 3 | max-line-length = 88 4 | exclude = 
.git,__pycache__,build,dist,*_pb2.py,.venv 5 | per-file-ignores = 6 | robotoff/cli/main.py:B008 7 | max-doc-length = 79 -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = space 6 | indent_size = 4 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | - Brief note about the issue 4 | 5 | ## Solution 6 | 7 | - Mention how your solution resolves the issue 8 | 9 | ## Related issue(s) 10 | 11 | - Fixes #[ISSUE NUMBER] -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2020 Free Software Foundation Europe e.V. 2 | # SPDX-License-Identifier: CC0-1.0 3 | repos: 4 | - repo: https://github.com/fsfe/reuse-tool 5 | rev: v5.0.2 6 | hooks: 7 | - id: reuse-lint-file 8 | -------------------------------------------------------------------------------- /REUSE.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | # Catch-all entry; overridden by specific claims in individual files 4 | [[annotations]] 5 | path = ["**"] 6 | SPDX-FileCopyrightText = "Copyright (c) 2016-2025 Open Food Facts <contact@openfoodfacts.org>" 7 | SPDX-License-Identifier = "MIT" 8 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, they will be 3 | # requested for review when someone opens a pull request. 4 | # For more on how to customize the CODEOWNERS file - https://help.github.com/en/articles/about-code-owners 5 | 6 | * @openfoodfacts/openfoodfacts-python 7 | -------------------------------------------------------------------------------- /.github/workflows/auto-assign-pr.yml: -------------------------------------------------------------------------------- 1 | # .github/workflows/auto-author-assign.yml 2 | name: 'Auto Author Assign' 3 | 4 | on: 5 | pull_request_target: 6 | types: [opened, reopened] 7 | 8 | permissions: 9 | pull-requests: write 10 | 11 | jobs: 12 | assign-author: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: toshimaru/auto-author-assign@v2.1.1 16 | -------------------------------------------------------------------------------- /.github/workflows/reuse.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2020 Free Software Foundation Europe e.V. 
2 | # SPDX-License-Identifier: CC0-1.0 3 | name: REUSE Compliance Check 4 | 5 | on: [push, pull_request] 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v5 12 | - name: REUSE Compliance Check 13 | uses: fsfe/reuse-action@v5 14 | -------------------------------------------------------------------------------- /.github/workflows/semantic-pr.yml: -------------------------------------------------------------------------------- 1 | name: "Semantic PRs" 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - opened 7 | - edited 8 | - synchronize 9 | 10 | jobs: 11 | main: 12 | name: Validate PR title 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: amannn/action-semantic-pull-request@v6 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /conda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set data = load_setup_py_data() %} 2 | 3 | package: 4 | name: openfoodfacts 5 | version: {{ data['version'] }} 6 | 7 | source: 8 | path: .. 9 | 10 | build: 11 | number: 0 12 | script: python -m pip install . 13 | 14 | requirements: 15 | host: 16 | - pip 17 | - python 18 | run: 19 | - python 20 | - requests >=2.20.0 21 | 22 | test: 23 | imports: 24 | - openfoodfacts 25 | 26 | about: 27 | home: {{ data['url'] }} 28 | license: {{ data['license'] }} 29 | summary: {{ data['description'] }} 30 | -------------------------------------------------------------------------------- /.github/workflows/publish-conda.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Conda 2 | 3 | on: 4 | # Triggers the workflow on a new release 5 | release: 6 | types: [created] 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout repo 13 | uses: actions/checkout@v5 14 | - name: publish-to-conda 15 | uses: MichaelsJP/conda-package-publish-action@v1.1.0 16 | with: 17 | subDir: 'conda' 18 | AnacondaToken: ${{ secrets.ANACONDA_TOKEN }} 19 | platforms: 'all' 20 | override: true -------------------------------------------------------------------------------- /.github/workflows/merge-conflict-autolabel.yml: -------------------------------------------------------------------------------- 1 | name: '💥 Auto-Label Merge Conflicts on PRs' 2 | on: 3 | push: 4 | branches: 5 | - develop 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | triage: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: mschilde/auto-label-merge-conflicts@5981f8933e92b78098af86b9e33fe0871cc7a3be # v2.0 (2020-01-27) 16 | with: 17 | CONFLICT_LABEL_NAME: "💥 Merge Conflicts" 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | MAX_RETRIES: 5 20 | WAIT_MS: 5000 21 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish PyPI package 2 | on: 3 | push: 4 | tags: 5 | - v*.*.* 6 | 7 | jobs: 8 | push_to_pypi: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Check out the repo 12 | uses: actions/checkout@v5 13 | - name: Set up python 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: 3.9 17 | - uses: abatilo/actions-poetry@v4.0.0 18 | with: 19 | poetry-version: 2.1.3 20 | - name: Run poetry build 21 | run: poetry build 22 | - name: Run poetry publish 23 | run: 
POETRY_PYPI_TOKEN_PYPI=${{ secrets.PYPI_TOKEN }} poetry publish 24 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | updates: 4 | - package-ecosystem: "pip" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | 9 | commit-message: 10 | prefix: "chore" 11 | include: "scope" 12 | 13 | open-pull-requests-limit: 1 14 | 15 | groups: 16 | all-dependencies: 17 | patterns: 18 | - "*" 19 | 20 | - package-ecosystem: "github-actions" 21 | directory: "/" 22 | schedule: 23 | interval: "monthly" 24 | 25 | commit-message: 26 | prefix: "chore" 27 | include: "scope" 28 | 29 | open-pull-requests-limit: 1 30 | 31 | groups: 32 | all-actions: 33 | patterns: 34 | - "*" 35 | -------------------------------------------------------------------------------- /openfoodfacts/__init__.py: -------------------------------------------------------------------------------- 1 | from openfoodfacts.barcode import normalize_barcode 2 | 3 | from .api import API 4 | from .dataset import ProductDataset, get_dataset 5 | from .ocr import OCRResult 6 | from .types import ( 7 | APIConfig, 8 | APIVersion, 9 | Country, 10 | DatasetType, 11 | Environment, 12 | Facet, 13 | Flavor, 14 | Lang, 15 | ) 16 | 17 | __all__ = [ 18 | "API", 19 | "APIConfig", 20 | "APIVersion", 21 | "Country", 22 | "DatasetType", 23 | "Facet", 24 | "Flavor", 25 | "Environment", 26 | "Lang", 27 | "OCRResult", 28 | "ProductDataset", 29 | "get_dataset", 30 | "normalize_barcode", 31 | ] 32 | 33 | __version__ = "3.3.0" 34 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - develop 5 | name: release-please 6 | jobs: 7 | release-please: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: GoogleCloudPlatform/release-please-action@v4.3 11 | with: 12 | # We can't use GITHUB_TOKEN here because GitHub Actions cannot trigger other workflows with it 13 | # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#using-the-github_token-in-a-workflow 14 | # So this is a personal access token 15 | token: ${{ secrets.RELEASE_PLEASE_TOKEN }} 16 | release-type: python 17 | package-name: openfoodfacts 18 | -------------------------------------------------------------------------------- /.github/workflows/label.yml: -------------------------------------------------------------------------------- 1 | # This workflow will triage pull requests and apply a label based on the 2 | # paths that are modified in the pull request. 3 | # 4 | # To use this workflow, you will need to set up a .github/labeler.yml 5 | # file with configuration. 
For more information, see: 6 | # https://github.com/actions/labeler 7 | 8 | name: Labeler 9 | on: 10 | - pull_request_target 11 | 12 | jobs: 13 | label: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | contents: read 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/labeler@v5.0.0 22 | if: github.event.pull_request.head.repo.full_name == github.repository 23 | with: 24 | repo-token: "${{ secrets.GITHUB_TOKEN }}" 25 | -------------------------------------------------------------------------------- /tests/unit/test_types.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from openfoodfacts.types import Flavor 4 | 5 | 6 | def test_from_product_type_food(): 7 | assert Flavor.from_product_type("food") == Flavor.off 8 | 9 | 10 | def test_from_product_type_beauty(): 11 | assert Flavor.from_product_type("beauty") == Flavor.obf 12 | 13 | 14 | def test_from_product_type_petfood(): 15 | assert Flavor.from_product_type("petfood") == Flavor.opff 16 | 17 | 18 | def test_from_product_type_product(): 19 | assert Flavor.from_product_type("product") == Flavor.opf 20 | 21 | 22 | def test_from_product_type_invalid(): 23 | with pytest.raises( 24 | ValueError, match="no Flavor matched with product_type 'invalid'" 25 | ): 26 | Flavor.from_product_type("invalid") 27 | -------------------------------------------------------------------------------- /tests/unit/test_api_config.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pydantic_core 4 | 5 | import openfoodfacts 6 | 7 | 8 | class TestAPIConfig(unittest.TestCase): 9 | def test_valid_user_agent(self): 10 | config = openfoodfacts.APIConfig(user_agent="Valid User Agent") 11 | assert config.user_agent == "Valid User Agent" 12 | 13 | def test_invalid_user_agent_type(self): 14 | with self.assertRaises(pydantic_core.ValidationError) as ctx: 15 | openfoodfacts.APIConfig(user_agent=None) 16 | self.assertIn("valid string", str(ctx.exception)) 17 | 18 | def test_blank_user_agent(self): 19 | with self.assertRaises(pydantic_core.ValidationError) as ctx: 20 | openfoodfacts.APIConfig(user_agent="") 21 | self.assertIn("cannot be empty", str(ctx.exception)) 22 | -------------------------------------------------------------------------------- /.github/workflows/generate-docs.yml: -------------------------------------------------------------------------------- 1 | name: Generate Automatic Documentation 2 | 3 | on: 4 | # Triggers the workflow on push 5 | push: 6 | branches: 7 | - actions-dev 8 | - develop 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v5 15 | with: 16 | fetch-depth: 0 17 | - uses: actions/setup-python@v5 18 | with: 19 | python-version: 3.8 20 | - name: Copy README.md to docs 21 | run: cp ./README.md ./docs/index.md 22 | - name: Install dependencies 23 | run: pip install --upgrade pip && pip install mkdocs mkdocs-gen-files mkdocs-material 24 | - run: git config user.name 'github-actions[bot]' && git config user.email 'github-actions[bot]@users.noreply.github.com' 25 | - name: Publish docs 26 | run: mkdocs gh-deploy -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | docs_dir: docs 2 | site_name: openfoodfacts-python 3 | site_url: https://openfoodfacts.github.io/openfoodfacts-python 4 | edit_uri: edit/develop/docs/ 5 | site_dir: gh_pages 6 | 7 | 
nav: 8 | - Home: 'index.md' 9 | - Usage: 'usage.md' 10 | - "Handle taxonomies": 'handle_taxonomies.md' 11 | 12 | theme: 13 | name: material 14 | features: 15 | - content.action.edit 16 | logo: https://static.openfoodfacts.org/images/logos/off-logo-horizontal-light.svg 17 | favicon: https://static.openfoodfacts.org/images/logos/off-logo-favicon-light.png 18 | palette: 19 | # Palette toggle for light mode 20 | - scheme: default 21 | toggle: 22 | icon: material/brightness-7 23 | name: Switch to dark mode 24 | # Palette toggle for dark mode 25 | - scheme: slate 26 | toggle: 27 | icon: material/brightness-4 28 | name: Switch to light mode 29 | -------------------------------------------------------------------------------- /REUSE.md: -------------------------------------------------------------------------------- 1 | ## Applications using this Python SDK 2 | 3 | ### Official applications 4 | 5 | - Robotoff: https://github.com/openfoodfacts/robotoff uses this to build the ML system of Open Food Facts. 6 | - Open Prices: https://github.com/openfoodfacts/open-prices uses this to handle many operations related to products 7 | 8 | ### Targets 9 | 10 | - Folksonomy Engine: 11 | - Nutri-Patrol: 12 | - Taxonomy Editor: 13 | - Facets Knowledge Panels: 14 | 15 | ### Third party applications 16 | 17 | Feel free to [open a PR to add your application to this list](https://github.com/openfoodfacts/openfoodfacts-python/edit/develop/REUSE.md). 18 | Please get in touch at reuse@openfoodfacts.org. 19 | We are very interested in learning what the Open Food Facts data is used for. It is not mandatory, but we would very much appreciate it if you told us about your re-uses (https://forms.gle/hwaeqBfs8ywwhbTg8) so that we can share them with the Open Food Facts community. You can also fill in the same form for a chance to have your app featured: https://forms.gle/hwaeqBfs8ywwhbTg8 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | .venv 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | # Ipython Notebook 63 | .ipynb_checkpoints 64 | 65 | # OS related files 66 | .DS_Store -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | github_actions: 2 | - changed-files: 3 | - any-glob-to-any-file: '.github/**/*' 4 | 5 | REDIS: 6 | - changed-files: 7 | - any-glob-to-any-file: 'tests/test_redis.py' 8 | - any-glob-to-any-file: 'openfoodfacts/redis.py' 9 | 10 | images: 11 | - changed-files: 12 | - any-glob-to-any-file: 'openfoodfacts/images.py' 13 | - any-glob-to-any-file: 'tests/test_images.py' 14 | 15 | OCR: 16 | - changed-files: 17 | - any-glob-to-any-file: 'tests/test_ocr.py' 18 | 19 | tests: 20 | - changed-files: 21 | - any-glob-to-any-file: 'tests/test_utils.py' 22 | - any-glob-to-any-file: 'tests/test_api.py' 23 | - any-glob-to-any-file: 'tests/test_ocr.py' 24 | - any-glob-to-any-file: 'tests/test_redis.py' 25 | - any-glob-to-any-file: 'tests/test_images.py' 26 | - any-glob-to-any-file: 'tests/test_api_config.py' 27 | 28 | utils: 29 | - changed-files: 30 | - any-glob-to-any-file: 'openfoodfacts/utils.py' 31 | - any-glob-to-any-file: 'tests/test_utils.py' 32 | 33 | dependencies: 34 | - changed-files: 35 | - any-glob-to-any-file: 'poetry.lock' 36 | 37 | documentation: 38 | - changed-files: 39 | - any-glob-to-any-file: 'handle_taxonomies.md' 40 | - any-glob-to-any-file: 'usage.md' 41 | -------------------------------------------------------------------------------- /LICENSES/MIT.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2016 OpenFoodFacts, Inc. http://openfoodfacts.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /openfoodfacts/ml/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | 5 | def convert_image_to_array(image: Image.Image) -> np.ndarray: 6 | """Convert a PIL Image into a numpy array. 7 | 8 | The image is converted to RGB if needed before generating the array. 9 | 10 | :param image: the input image. 11 | :return: the generated numpy array of shape (height, width, 3) 12 | """ 13 | if image.mode != "RGB": 14 | image = image.convert("RGB") 15 | 16 | (im_width, im_height) = image.size 17 | 18 | return np.array(image.getdata(), dtype=np.uint8).reshape((im_height, im_width, 3)) 19 | 20 | 21 | def resize_image(image: Image.Image, max_size: tuple[int, int]) -> Image.Image: 22 | """Resize an image to fit within the specified dimensions. 23 | 24 | :param image: the input image 25 | :param max_size: the maximum width and height as a tuple 26 | :return: the resized image, or the original image if it fits within the 27 | specified dimensions 28 | """ 29 | width, height = image.size 30 | max_width, max_height = max_size 31 | 32 | if width > max_width or height > max_height: 33 | new_image = image.copy() 34 | new_image.thumbnail((max_width, max_height)) 35 | return new_image 36 | 37 | return image 38 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Python SDK quality checks and unit tests 2 | 3 | on: 4 | push: 5 | paths: 6 | - "openfoodfacts/**" 7 | - "pyproject.toml" 8 | - "poetry.lock" 9 | - "tests/**" 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.9] 17 | steps: 18 | - uses: actions/checkout@v5 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install Poetry 23 | uses: snok/install-poetry@v1.4 24 | with: 25 | virtualenvs-create: true 26 | virtualenvs-in-project: true 27 | - name: Load cached venv 28 | id: cached-poetry-dependencies 29 | uses: actions/cache@v4 30 | with: 31 | path: .venv 32 | key: venv-${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} 33 | - name: Install dependencies 34 | run: poetry install --with=dev --all-extras 35 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 36 | 37 | - name: Launch quality checks 38 | run: | 39 | poetry run flake8 . 40 | poetry run black --check . 41 | poetry run mypy . 42 | poetry run isort --check . 43 | - name: Test with pytest 44 | run: | 45 | poetry run pytest tests 46 | -------------------------------------------------------------------------------- /openfoodfacts/ingredients.py: -------------------------------------------------------------------------------- 1 | from openfoodfacts.taxonomy import Taxonomy 2 | from openfoodfacts.types import JSONType 3 | 4 | 5 | def add_ingredient_in_taxonomy_field( 6 | parsed_ingredients: list[JSONType], ingredient_taxonomy: Taxonomy 7 | ) -> tuple[int, int]: 8 | """Add the `in_taxonomy` field to each ingredient in `parsed_ingredients`. 9 | 10 | This function is called recursively to add the `in_taxonomy` field to each 11 | sub-ingredient. It returns the total number of ingredients and the number 12 | of known ingredients (including sub-ingredients). 
13 | 14 | :param parsed_ingredients: a list of parsed ingredients, in Product Opener 15 | format 16 | :param ingredient_taxonomy: the ingredient taxonomy 17 | :return: a (total_ingredients_n, known_ingredients_n) tuple 18 | """ 19 | ingredients_n = 0 20 | known_ingredients_n = 0 21 | for ingredient_data in parsed_ingredients: 22 | ingredient_id = ingredient_data["id"] 23 | in_taxonomy = ingredient_id in ingredient_taxonomy 24 | ingredient_data["in_taxonomy"] = in_taxonomy 25 | known_ingredients_n += int(in_taxonomy) 26 | ingredients_n += 1 27 | 28 | if "ingredients" in ingredient_data: 29 | ( 30 | sub_ingredients_n, 31 | known_sub_ingredients_n, 32 | ) = add_ingredient_in_taxonomy_field( 33 | ingredient_data["ingredients"], ingredient_taxonomy 34 | ) 35 | ingredients_n += sub_ingredients_n 36 | known_ingredients_n += known_sub_ingredients_n 37 | 38 | return ingredients_n, known_ingredients_n 39 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "Code scanning - action" 2 | 3 | on: 4 | pull_request: 5 | schedule: 6 | - cron: '0 9 * * 1' 7 | 8 | jobs: 9 | CodeQL-Build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v5 16 | with: 17 | # We must fetch at least the immediate parents so that if this is 18 | # a pull request then we can checkout the head. 19 | fetch-depth: 2 20 | 21 | # If this run was triggered by a pull request event, then checkout 22 | # the head of the pull request instead of the merge commit. 23 | - run: git checkout HEAD^2 24 | if: ${{ github.event_name == 'pull_request' }} 25 | 26 | # Initializes the CodeQL tools for scanning. 27 | - name: Initialize CodeQL 28 | uses: github/codeql-action/init@v3 29 | # Override language selection by uncommenting this and choosing your languages 30 | # with: 31 | # languages: go, javascript, csharp, python, cpp, java 32 | 33 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 34 | # If this step fails, then you should remove it and run the build manually (see below) 35 | - name: Autobuild 36 | uses: github/codeql-action/autobuild@v3 37 | 38 | # ℹ️ Command-line programs to run using the OS shell. 
39 | # 📚 https://git.io/JvXDl 40 | 41 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 42 | # and modify them (or add more) to build your code if your project 43 | # uses a compiled language 44 | 45 | #- run: | 46 | # make bootstrap 47 | # make release 48 | 49 | - name: Perform CodeQL Analysis 50 | uses: github/codeql-action/analyze@v3 51 | -------------------------------------------------------------------------------- /tests/unit/utils/test_text.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from openfoodfacts.utils.text import get_tag, replace_lang_prefix 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "value,output", 8 | [ 9 | ("Reflets de France", "reflets-de-france"), 10 | ("écrasé", "ecrase"), 11 | ("œufs de plein air", "oeufs-de-plein-air"), 12 | ("dr.oetker", "dr-oetker"), 13 | ("mat & lou", "mat-lou"), 14 | ("monop'daily", "monop-daily"), 15 | ("épi d'or", "epi-d-or"), 16 | ("Health Star Rating 0.5", "health-star-rating-0-5"), 17 | ("C'est qui le Patron ?!", "c-est-qui-le-patron"), 18 | ("fr: Gésiers", "fr:gesiers"), 19 | ("ar: تفاح", "ar:تفاح"), 20 | ("تفاح", "تفاح"), 21 | ], 22 | ) 23 | def test_get_tag(value: str, output: str): 24 | assert get_tag(value) == output 25 | 26 | 27 | @pytest.mark.parametrize( 28 | "tag,new_lang_prefix,output", 29 | [ 30 | ("fr:gesiers", "en", "en:gesiers"), 31 | ("fr:gesiers", "fr", "fr:gesiers"), 32 | ("fr:gesiers", "ar", "ar:gesiers"), 33 | ("en:apple", "fr", "fr:apple"), 34 | ("xx:sashimi", "it", "it:sashimi"), 35 | ("xx:sashimi", "xx", "xx:sashimi"), 36 | ], 37 | ) 38 | def test_replace_lang_prefix(tag, new_lang_prefix, output): 39 | assert replace_lang_prefix(tag, new_lang_prefix) == output 40 | 41 | 42 | def test_replace_lang_prefix_invalid_new_lang_prefix(): 43 | with pytest.raises(ValueError, match="new_lang_prefix 'a' must be a 2-letter code"): 44 | replace_lang_prefix("en:apples", "a") 45 | 46 | 47 | def test_replace_lang_prefix_invalid_tag(): 48 | with pytest.raises( 49 | ValueError, match="tag 'e:apples' has an invalid language prefix" 50 | ): 51 | replace_lang_prefix("e:apples", "fr") 52 | -------------------------------------------------------------------------------- /openfoodfacts/barcode.py: -------------------------------------------------------------------------------- 1 | def normalize_barcode(barcode: str) -> str: 2 | """Normalize the barcode. 3 | 4 | First, we remove leading zeros, then we pad the barcode with zeros to 5 | reach 8 digits. 6 | 7 | If the barcode is longer than 8 digits, we pad it to 13 digits. 8 | 9 | :param barcode: the barcode to normalize 10 | :return: the normalized barcode 11 | """ 12 | barcode = barcode.lstrip("0").zfill(8) 13 | 14 | if len(barcode) > 8: 15 | barcode = barcode.zfill(13) 16 | 17 | return barcode 18 | 19 | 20 | def has_valid_check_digit(gtin: str) -> bool: 21 | """Check if the GTIN has a valid check-digit. 22 | 23 | The full GTIN (with the check-digit) is passed as an argument. 24 | The function returns True if the check-digit is valid, False otherwise. 25 | """ 26 | if len(gtin) < 2: 27 | raise ValueError(f"invalid gtin: '{gtin}'") 28 | return calculate_check_digit(gtin) == gtin[-1] 29 | 30 | 31 | def calculate_check_digit(gtin: str) -> str: 32 | """Compute the check-digit of a GTIN. 33 | 34 | The full GTIN (including its current check-digit, which is ignored) is 35 | passed as an argument. The computed check-digit is returned as a string. 
36 | 37 | The check-digit is computed from the preceding digits by multiplying the 38 | sum of every 2nd digit *from right to left* by 3, adding that to the sum 39 | of all the other digits (1st, 3rd, etc.), taking the result modulo 10 40 | (the remainder after dividing by 10), and subtracting *that* result 41 | *from* 10 (a result of 10 maps to a check-digit of 0). 42 | """ 43 | # Remove the last digit (checksum) 44 | gtin = gtin[:-1] 45 | # Reverse the digits 46 | digits = tuple(d for d in reversed(gtin)) 47 | return str( 48 | 10 49 | - ( # From 10 we subtract 50 | ( 51 | ( 52 | sum(int(d) for d in digits[::2]) * 3 53 | ) # The sum of every 2nd digit, multiplied by 3 54 | + ( 55 | sum(int(d) for d in digits[1::2]) 56 | ) # The sum of every 2nd digit, offset by 1 57 | ) 58 | % 10 # Modulo 10 (the remainder after dividing by 10) 59 | ) 60 | )[-1] 61 | -------------------------------------------------------------------------------- /tests/unit/test_ocr.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import pytest 4 | 5 | from openfoodfacts.ocr import OCRResult 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "ocr_url, bounding_box, expected_text", 10 | [ 11 | ( 12 | # It corresponds to this OCR crop: 13 | # https://robotoff.openfoodfacts.org/api/v1/images/crop?image_url=https://images.openfoodfacts.org/images/products/089/000/000/1202/1.jpg&y_min=0.08416666666666667&x_min=0.30077691453940064&y_max=0.09583333333333334&x_max=0.37735849056603776 14 | "https://raw.githubusercontent.com/openfoodfacts/test-data/main/openfoodfacts-python/tests/unit/0890000001202_1.json", 15 | [101, 271, 115, 340], 16 | "Materne", 17 | ), 18 | ( 19 | # same, but the bounding box is distinct from the logo area 20 | "https://raw.githubusercontent.com/openfoodfacts/test-data/main/openfoodfacts-python/tests/unit/0890000001202_1.json", 21 | [120, 271, 134, 340], 22 | None, 23 | ), 24 | ( 25 | # [0.2808293402194977,0.37121888995170593,0.35544055700302124,0.49409016966819763] 26 | # /540/091/030/1160/1.jpg 27 | "https://raw.githubusercontent.com/openfoodfacts/test-data/main/openfoodfacts-python/tests/unit/5400910301160_1.json", 28 | [337, 327, 427, 436], 29 | "NUTRIDIA", 30 | ), 31 | ], 32 | ) 33 | def test_get_words_in_area( 34 | ocr_url: str, bounding_box: Tuple[int, int, int, int], expected_text: Optional[str] 35 | ): 36 | ocr_result = OCRResult.from_url(ocr_url) 37 | assert ocr_result is not None 38 | words = ocr_result.get_words_in_area(bounding_box) 39 | 40 | if expected_text is None: 41 | assert words == [] 42 | else: 43 | assert words is not None 44 | assert len(words) == 1 45 | assert words[0].text.strip() == expected_text 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "openfoodfacts" 3 | version = "3.3.0" 4 | authors = [ 5 | {name = "The Open Food Facts team", email = "contact@openfoodfacts.org"} 6 | ] 7 | description = "Official Python SDK of Open Food Facts" 8 | readme = "README.md" 9 | license = "MIT" 10 | requires-python = ">=3.10" 11 | dependencies = [ 12 | "requests>=2.20.0", 13 | "pydantic>=2.0.0,<3.0.0", 14 | "tqdm>=4.0.0,<5.0.0", 15 | ] 16 | dynamic = ["classifiers"] 
17 | 18 | [tool.poetry] 19 | include = [ 20 | {path = "tests", format = "sdist"}, 21 | ] 22 | classifiers = [ 23 | "Development Status :: 5 - Production/Stable", 24 | "Intended Audience :: Developers", 25 | "Natural Language :: English", 26 | "Programming Language :: Python :: Implementation :: CPython", 27 | "Programming Language :: Python :: Implementation :: PyPy", 28 | ] 29 | 30 | [project.urls] 31 | repository = "https://github.com/openfoodfacts/openfoodfacts-python" 32 | 33 | [tool.mypy] 34 | ignore_missing_imports = true 35 | 36 | [tool.isort] # From https://black.readthedocs.io/en/stable/compatible_configs.html#isort 37 | multi_line_output = 3 38 | include_trailing_comma = true 39 | force_grid_wrap = 0 40 | use_parentheses = true 41 | ensure_newline_before_comments = true 42 | line_length = 88 43 | 44 | [tool.poetry.dependencies] 45 | python = ">=3.10,<4.0" 46 | redis = { version = "~6.4.0", optional = true, extras = ["hiredis"] } 47 | Pillow = { version = ">=9.3,<12", optional = true } 48 | tritonclient = {extras = ["grpc"], version = ">2.0.0,<3.0.0", optional = true} 49 | opencv-python-headless = {version = ">4.0.0,<5.0.0", optional = true} 50 | 51 | [tool.poetry.group.dev.dependencies] 52 | requests-mock = "1.12.1" 53 | flake8 = "7.3.0" 54 | black = "25.1.0" 55 | mypy = "1.17.1" 56 | isort = "6.0.1" 57 | coverage = {version = "7.10.4", extras = ["toml"]} 58 | pytest = "8.4.1" 59 | types-requests = "2.32.4.20250809" 60 | types-tqdm = "4.67.0.20250809" 61 | types-redis = "^4.6.0.20240425" 62 | 63 | [project.optional-dependencies] 64 | redis = ["redis"] 65 | pillow = ["Pillow"] 66 | ml = ["tritonclient[grpc]", "opencv-python-headless", "Pillow", "albumentations>=2.0.0"] 67 | 68 | [build-system] 69 | requires = ["poetry-core"] 70 | build-backend = "poetry.core.masonry.api" 71 | -------------------------------------------------------------------------------- /tests/unit/test_barcode.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from openfoodfacts.barcode import ( 4 | calculate_check_digit, 5 | has_valid_check_digit, 6 | normalize_barcode, 7 | ) 8 | 9 | 10 | def test_normalize_barcode_remove_leading_zeros(): 11 | assert normalize_barcode("00012345") == "00012345" 12 | assert normalize_barcode("00000001") == "00000001" 13 | 14 | 15 | def test_normalize_barcode_pad_to_8_digits(): 16 | assert normalize_barcode("123") == "00000123" 17 | assert normalize_barcode("1") == "00000001" 18 | 19 | 20 | def test_normalize_barcode_pad_to_13_digits(): 21 | assert normalize_barcode("123456789") == "0000123456789" 22 | assert normalize_barcode("123456789012") == "0123456789012" 23 | 24 | 25 | def test_normalize_barcode_no_change_needed(): 26 | assert normalize_barcode("12345678") == "12345678" 27 | assert normalize_barcode("1234567890123") == "1234567890123" 28 | 29 | 30 | @pytest.mark.parametrize( 31 | "gtin,expected", 32 | [ 33 | ("3017620422003", "3"), 34 | ("8901234567890", "0"), 35 | ("101011", "1"), 36 | ("000101011", "1"), 37 | ("0000000101011", "1"), 38 | ("5678989012342", "2"), 39 | ("829573994253", "3"), 40 | ("59366631014", "4"), 41 | ("150599289765", "5"), 42 | ("9012345678906", "6"), 43 | ("360131017", "7"), 44 | ("1234567890128", "8"), 45 | ("10061282", "2"), 46 | ], 47 | ) 48 | def test_calculate_check_digit(gtin, expected): 49 | assert calculate_check_digit(gtin) == expected 50 | 51 | 52 | @pytest.mark.parametrize( 53 | "gtin,expected", 54 | [ 55 | ("3017620422003", True), 56 | ("0204341706595", True), 57 | 
("5707196311419", True), 58 | ("5701018060158", True), 59 | ("5016451522591", True), 60 | ("5741000224168", True), 61 | ("5741000224168", True), 62 | ("0256844308646", True), 63 | ("0083012245843", True), 64 | ("5741000224161", False), 65 | # EAN8 66 | ("10061282", True), 67 | ("10061283", False), 68 | ("0000010061282", True), 69 | ("29428984", True), 70 | ], 71 | ) 72 | def test_has_valid_check_digit(gtin, expected): 73 | assert has_valid_check_digit(gtin) is expected 74 | -------------------------------------------------------------------------------- /tests/ml/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | from openfoodfacts.ml.utils import convert_image_to_array, resize_image 5 | 6 | 7 | class TestConvertImageToArray: 8 | def test_rgb(self): 9 | # Create a simple RGB image 10 | image = Image.new("RGB", (10, 10), color="red") 11 | array = convert_image_to_array(image) 12 | 13 | assert array.shape == (10, 10, 3) 14 | assert array.dtype == np.uint8 15 | assert (array == [255, 0, 0]).all() 16 | 17 | def test_non_rgb(self): 18 | # Create a simple grayscale image 19 | image = Image.new("L", (10, 10), color=128) 20 | array = convert_image_to_array(image) 21 | 22 | assert array.shape == (10, 10, 3) 23 | assert array.dtype == np.uint8 24 | assert (array == [128, 128, 128]).all() 25 | 26 | def test_size(self): 27 | # Create a simple RGB image with different size 28 | image = Image.new("RGB", (20, 15), color="blue") 29 | array = convert_image_to_array(image) 30 | 31 | assert array.shape == (15, 20, 3) 32 | assert array.dtype == np.uint8 33 | assert (array == [0, 0, 255]).all() 34 | 35 | 36 | class TestResizeImage: 37 | def test_resize_smaller_image(self): 38 | # Create a simple RGB image smaller than max_size 39 | image = Image.new("RGB", (10, 10), color="red") 40 | max_size = (20, 20) 41 | resized_image = resize_image(image, max_size) 42 | 43 | assert resized_image.size == (10, 10) 44 | 45 | def test_resize_larger_image(self): 46 | # Create a simple RGB image larger than max_size 47 | image = Image.new("RGB", (30, 30), color="blue") 48 | max_size = (20, 20) 49 | resized_image = resize_image(image, max_size) 50 | 51 | assert resized_image.size == (20, 20) 52 | 53 | def test_resize_wider_image(self): 54 | # Create a simple RGB image wider than max_size 55 | image = Image.new("RGB", (40, 20), color="green") 56 | max_size = (20, 20) 57 | resized_image = resize_image(image, max_size) 58 | 59 | assert resized_image.size == (20, 10) 60 | 61 | def test_resize_taller_image(self): 62 | # Create a simple RGB image taller than max_size 63 | image = Image.new("RGB", (20, 40), color="yellow") 64 | max_size = (20, 20) 65 | resized_image = resize_image(image, max_size) 66 | 67 | assert resized_image.size == (10, 20) 68 | -------------------------------------------------------------------------------- /openfoodfacts/utils/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .fold_to_ascii import fold, fold_without_insertion_deletion 4 | 5 | 6 | def strip_accents(s: str, keep_length: bool = False) -> str: 7 | """Strip accents and normalize string. 8 | 9 | :param s: the string to normalize 10 | :param keep_length: if True, no character is replaced without a 11 | subtitution of length 1: the length of the string is therefore kept 12 | unchanged. Default to False. 
13 | :return: the normalized string 14 | """ 15 | if keep_length: 16 | return fold_without_insertion_deletion(s) 17 | else: 18 | return fold(s) 19 | 20 | 21 | CONSECUTIVE_HYPHEN_REGEX = re.compile(r"-{2,}") 22 | 23 | 24 | def strip_consecutive_hyphens(text: str) -> str: 25 | """Convert a sequence of 2+ hyphens into a single hyphen.""" 26 | return CONSECUTIVE_HYPHEN_REGEX.sub("-", text) 27 | 28 | 29 | TAG_MAP_TABLE = { 30 | ord("œ"): "oe", 31 | ord(" "): "-", 32 | ord("'"): "-", 33 | ord("`"): "-", 34 | ord('"'): "-", 35 | ord("."): "-", 36 | ord("!"): "-", 37 | ord("?"): "-", 38 | ord("["): "-", 39 | ord("]"): "-", 40 | ord("("): "-", 41 | ord(")"): "-", 42 | ord("{"): "-", 43 | ord("}"): "-", 44 | ord("#"): "-", 45 | ord("$"): "-", 46 | ord("%"): "-", 47 | ord("&"): "-", 48 | ord("\\"): "-", 49 | ord("*"): "-", 50 | ord("+"): "-", 51 | ord(","): "-", 52 | ord("/"): "-", 53 | ord(";"): "-", 54 | ord("<"): "-", 55 | ord(">"): "-", 56 | ord("="): "-", 57 | ord("@"): "-", 58 | ord("^"): "-", 59 | ord("_"): "-", 60 | ord("|"): "-", 61 | ord("~"): "-", 62 | } 63 | 64 | 65 | def get_tag(text: str) -> str: 66 | """Return a tag from a text. 67 | 68 | In Open Food Facts, tags are obtained from free text by performing the 69 | following: 70 | - lowercasing 71 | - accent removal 72 | - replacement of punctuation by either a hyphen ("-") or nothing, depending 73 | on the punctuation 74 | 75 | The input text can contain a language prefix, which is kept in the output 76 | if present. The language prefix is a 2-letter code followed by a colon 77 | (e.g. "fr:"). 78 | 79 | This function is not strictly on par with the Product Opener implementation, 80 | but it should be good enough for most cases. 81 | """ 82 | text = text.lower() 83 | lang_prefix = None 84 | if len(text) >= 3 and text[2] == ":": 85 | lang_prefix = text[:2] 86 | text = text[3:] 87 | text = strip_accents(text, keep_length=True) 88 | text = text.translate(TAG_MAP_TABLE).strip("-") 89 | text = strip_consecutive_hyphens(text) 90 | if lang_prefix: 91 | text = f"{lang_prefix}:{text}" 92 | return text 93 | 94 | 95 | def replace_lang_prefix(tag: str, new_lang_prefix: str) -> str: 96 | """Replace the language prefix of a tag with a new one.""" 97 | 98 | if len(new_lang_prefix) != 2: 99 | raise ValueError( 100 | f"new_lang_prefix '{new_lang_prefix}' must be a 2-letter code." 101 | ) 102 | 103 | if len(tag) < 3 or tag[2] != ":": 104 | raise ValueError(f"tag '{tag}' has an invalid language prefix") 105 | 106 | return f"{new_lang_prefix}:{tag[3:]}" 107 | -------------------------------------------------------------------------------- /openfoodfacts/ml/triton.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import struct 3 | 4 | import grpc 5 | import numpy as np 6 | from tritonclient.grpc import service_pb2, service_pb2_grpc 7 | from tritonclient.grpc.service_pb2_grpc import GRPCInferenceServiceStub 8 | 9 | 10 | @functools.cache 11 | def get_triton_inference_stub(triton_uri: str) -> GRPCInferenceServiceStub: 12 | """Return a gRPC stub for Triton Inference Server. 13 | 14 | :param triton_uri: URI of the Triton Inference Server 15 | :return: gRPC stub for Triton Inference Server 16 | """ 17 | channel = grpc.insecure_channel(triton_uri) 18 | return service_pb2_grpc.GRPCInferenceServiceStub(channel) 19 | 20 | 21 | def deserialize_byte_tensor(data: bytes) -> list[str]: 22 | """Deserialize a byte tensor into a list of strings. 23 | 24 | This is used to deserialize string array outputs from Triton models. 
25 | """ 26 | offset = 0 27 | # 4 bytes are used to encode string length 28 | int_byte_len = 4 29 | array = [] 30 | while len(data) >= offset + int_byte_len: 31 | str_length = struct.unpack(" ObjectDetector: 18 | # Create an instance of ObjectDetector for testing 19 | label_names = ["label1", "label2"] 20 | return ObjectDetector( 21 | model_name="test_model", label_names=label_names, image_size=640 22 | ) 23 | 24 | 25 | class ResponseOutputs: 26 | def __init__(self, name): 27 | self.name = name 28 | 29 | 30 | class TestObjectDetector: 31 | def test_preprocess(self, sample_image, object_detector: ObjectDetector): 32 | image_array = object_detector.preprocess(sample_image) 33 | 34 | # Check the shape of the output image array 35 | assert image_array.shape == (1, 3, 640, 640) 36 | 37 | def test_postprocess(self, object_detector: ObjectDetector): 38 | # Mock response object 39 | response = MagicMock() 40 | response.outputs = [ResponseOutputs("output0")] 41 | response.raw_output_contents = [ 42 | np.random.rand(1, len(object_detector.label_names) + 4, 10) 43 | .astype(np.float32) 44 | .tobytes() 45 | ] 46 | 47 | threshold = 0.5 48 | result = object_detector.postprocess( 49 | response, threshold, original_shape=(200, 100) 50 | ) 51 | 52 | # Check the type of the result 53 | assert isinstance(result, ObjectDetectionRawResult) 54 | 55 | # Check the number of detections 56 | assert result.num_detections == 10 57 | 58 | # Check the shape of detection boxes 59 | assert result.detection_boxes.shape == (len(result.detection_scores), 4) 60 | 61 | # Check the length of detection classes and scores 62 | assert len(result.detection_classes) == len(result.detection_scores) 63 | 64 | def test_detect_from_image(self, sample_image, object_detector: ObjectDetector): 65 | # Mock the Triton inference stub and response 66 | grpc_stub = MagicMock() 67 | grpc_stub.ModelInfer.return_value = MagicMock() 68 | get_triton_inference_stub = MagicMock(return_value=grpc_stub) 69 | 70 | # Mock the preprocess and postprocess methods 71 | object_detector.preprocess = MagicMock(return_value=np.zeros((1, 3, 640, 640))) # type: ignore 72 | object_detector.postprocess = MagicMock( # type: ignore 73 | return_value=ObjectDetectionRawResult( 74 | num_detections=1, 75 | detection_boxes=np.zeros((1, 4)), 76 | detection_scores=np.array([0.9]), 77 | detection_classes=np.array([1]), 78 | label_names=object_detector.label_names, 79 | ) 80 | ) 81 | with patch( 82 | "openfoodfacts.ml.object_detection.get_triton_inference_stub", 83 | get_triton_inference_stub, 84 | ): 85 | # Run the detect_from_image method 86 | result = object_detector.detect_from_image( 87 | sample_image, "fake_triton_uri", threshold=0.5 88 | ) 89 | 90 | # Check that preprocess was called 91 | object_detector.preprocess.assert_called_once() 92 | assert object_detector.preprocess.call_args.kwargs == { 93 | "image_array": sample_image 94 | } 95 | 96 | # Check that get_triton_inference_stub was called 97 | get_triton_inference_stub.assert_called_once_with("fake_triton_uri") 98 | 99 | # Check that ModelInfer was called 100 | grpc_stub.ModelInfer.assert_called_once() 101 | 102 | # Check that postprocess was called 103 | object_detector.postprocess.assert_called_once() 104 | 105 | # Check the type of the result 106 | assert isinstance(result, ObjectDetectionRawResult) 107 | 108 | # Check the number of detections 109 | assert result.num_detections == 1 110 | -------------------------------------------------------------------------------- /.github/workflows/github-projects.yml: 
-------------------------------------------------------------------------------- 1 | name: Add issues to the relevant GitHub Projects project 2 | 3 | on: 4 | issues: 5 | types: 6 | - opened 7 | - labeled 8 | - edited 9 | jobs: 10 | add-to-project: 11 | name: Add issues to the relevant GitHub Projects project 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/add-to-project@main 15 | with: 16 | project-url: https://github.com/orgs/openfoodfacts/projects/11 17 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 18 | labeled: 🎨 Mockups available, 🎨 Mockup required 19 | label-operator: OR 20 | - uses: actions/add-to-project@main 21 | with: 22 | project-url: https://github.com/orgs/openfoodfacts/projects/4 # Add issue to the packaging project 23 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 24 | labeled: packaging input 25 | label-operator: OR 26 | - uses: actions/add-to-project@main 27 | with: 28 | project-url: https://github.com/orgs/openfoodfacts/projects/35 # Add issue to the a11y project 29 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 30 | labeled: accessibility 31 | label-operator: OR 32 | - uses: actions/add-to-project@main 33 | with: 34 | project-url: https://github.com/orgs/openfoodfacts/projects/132 # Add issue to the Top upvoted issues board 35 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 36 | labeled: ⭐ top issue, 👍 Top 10 Issue! 37 | label-operator: OR 38 | - uses: actions/add-to-project@main 39 | with: 40 | project-url: https://github.com/orgs/openfoodfacts/projects/57 # Add issue to the Most impactful issues board 41 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 42 | labeled: 🎯 P0, 🎯 P1 43 | label-operator: OR 44 | - uses: actions/add-to-project@main 45 | with: 46 | project-url: https://github.com/orgs/openfoodfacts/projects/43 # Add issue to the open products facts project 47 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 48 | labeled: 📸 Open Products Facts 49 | label-operator: OR 50 | - uses: actions/add-to-project@main 51 | with: 52 | project-url: https://github.com/orgs/openfoodfacts/projects/37 # Add issue to the open beauty facts project 53 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 54 | labeled: 🧴 Open Beauty Facts 55 | label-operator: OR 56 | - uses: actions/add-to-project@main 57 | with: 58 | project-url: https://github.com/orgs/openfoodfacts/projects/4 # Add issue to the packaging project 59 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 60 | labeled: 📦 Packaging 61 | label-operator: OR 62 | - uses: actions/add-to-project@main 63 | with: 64 | project-url: https://github.com/orgs/openfoodfacts/projects/25 # Add issue to the documentation project 65 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 66 | labeled: 📚 Documentation 67 | label-operator: OR 68 | - uses: actions/add-to-project@main 69 | with: 70 | project-url: https://github.com/orgs/openfoodfacts/projects/5 # Add issue to the folksonomy project 71 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 72 | labeled: 🏷️ Folksonomy Project 73 | label-operator: OR 74 | - uses: actions/add-to-project@main 75 | with: 76 | project-url: https://github.com/orgs/openfoodfacts/projects/44 # Add issue to the data quality project 77 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 78 | labeled: 🧽 Data quality 79 | label-operator: OR 80 | - uses: actions/add-to-project@main 81 | with: 82 | project-url: https://github.com/orgs/openfoodfacts/projects/82 # Add issue to the search project 83 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 84 | labeled: 🔎 Search 85 | label-operator: OR 86 | - 
uses: actions/add-to-project@main 87 | with: 88 | project-url: https://github.com/orgs/openfoodfacts/projects/41 # Add issue to the producer platform project 89 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 90 | labeled: 🏭 Producers Platform 91 | label-operator: OR 92 | - uses: actions/add-to-project@main 93 | with: 94 | project-url: https://github.com/orgs/openfoodfacts/projects/92 # Add issue to the Nutri-Score project 95 | github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} 96 | labeled: 🚦 Nutri-Score 97 | label-operator: OR 98 | -------------------------------------------------------------------------------- /docs/handle_taxonomies.md: -------------------------------------------------------------------------------- 1 | # Handle taxonomies 2 | 3 | The Python SDK provides an easy way to access and handle the taxonomies available on Open Food Facts. 4 | 5 | Taxonomies are at the heart of Open Food Facts. They are used to structure knowledge about ingredients, categories, labels, additives, countries, brands, etc. 6 | 7 | To have a better understanding of how taxonomies work, you can read the [wiki page about taxonomies](https://wiki.openfoodfacts.org/Global_taxonomies). 8 | 9 | ## Usage 10 | 11 | ### Get information about a taxonomy item 12 | 13 | First, instantiate a Taxonomy object: 14 | 15 | ```python 16 | from openfoodfacts.taxonomy import get_taxonomy 17 | 18 | # Use the singular form of the taxonomy name 19 | taxonomy = get_taxonomy("category") 20 | print(taxonomy) 21 | # <Taxonomy> 22 | ``` 23 | 24 | The taxonomy object provides a way to access the taxonomy data. For example, if you want to get the node `en:biscuits`: 25 | 26 | ```python 27 | node = taxonomy["en:biscuits"] 28 | print(node) 29 | # <TaxonomyNode en:biscuits> 30 | ``` 31 | 32 | If the node does not exist, `None` is returned. 33 | 34 | You can get the translation in a specific language: 35 | 36 | ```python 37 | print(node.get_localized_name("it")) 38 | # Biscotti 39 | ``` 40 | 41 | Each node has one or more parents, stored in the `parents` field: 42 | 43 | ```python 44 | print(node.parents) 45 | # [<TaxonomyNode en:biscuits-and-cakes>] 46 | ``` 47 | 48 | Likewise, children can be accessed using the `children` field. 49 | 50 | 51 | To get the full parent hierarchy (that includes all parents found recursively), use the `get_parents_hierarchy` method: 52 | 53 | ```python 54 | print(node.get_parents_hierarchy()) 55 | # [<TaxonomyNode en:biscuits-and-cakes>, <TaxonomyNode en:sweet-snacks>, <TaxonomyNode en:snacks>] 56 | ``` 57 | 58 | Besides the main translation that can be accessed using `get_localized_name`, each node may have synonyms. This information can be easily accessed as well: 59 | 60 | ```python 61 | # synonyms is a dict mapping language codes to a list of 62 | # synonyms in that language. The key is missing if there are 63 | # no synonyms. 64 | print(node.synonyms["es"]) 65 | # ["Galletas", "galleta"] 66 | ``` 67 | 68 | Taxonomy node properties are stored in the `properties` field: 69 | 70 | ```python 71 | print(node.properties) 72 | # { 73 | # "wikipedia": {"en": "https://en.wikipedia.org/wiki/Biscuit"}, 74 | # "carbon_footprint_fr_foodges_ingredient": {"fr": "Biscuit au beurre"}, 75 | # "agribalyse_proxy_food_code": {"en": "24000"}, 76 | # "ciqual_proxy_food_name": { 77 | # "en": "Biscuit -cookie-", 78 | # "fr": "Biscuit sec, sans précision", 79 | # }, 80 | # "wikidata": {"en": "Q13270"}, 81 | # "ciqual_proxy_food_code": {"en": "24000"}, 82 | #} 83 | ``` 84 | 85 | ### The Taxonomy object 86 | 87 | The `Taxonomy` object is a dictionary-like object that maps node IDs to `TaxonomyNode` objects. 
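Membership tests and indexing therefore work as on a plain `dict`; here is a small sketch relying only on the behaviour described above (indexing a missing ID returns `None` instead of raising `KeyError`):

```python
# Dictionary-style access on the taxonomy
assert "en:biscuits" in taxonomy
node = taxonomy["en:biscuits"]

# Unknown IDs yield None rather than a KeyError
assert taxonomy["en:not-a-real-category"] is None
```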
88 | 89 | It also provides a way to iterate over all nodes: 90 | 91 | ```python 92 | for node in taxonomy.iter_nodes(): 93 | print(node) 94 | # <TaxonomyNode ...> 95 | # <TaxonomyNode ...> 96 | # <TaxonomyNode ...> 97 | # <TaxonomyNode ...> 98 | # <TaxonomyNode ...> 99 | # <TaxonomyNode ...> 100 | # ... 101 | ``` 102 | 103 | #### Find leaf nodes in the taxonomy 104 | 105 | One very common use case is to find the leaf nodes among a list of nodes, i.e. the nodes that have no children. 106 | For example, in Open Food Facts, the `categories_tags` field contains the categories submitted by the user and all their parents. If you're only interested in the most precise categories, you need to filter out the categories that have children: 107 | 108 | ```python 109 | # Let's say you have a product that has the following categories: 110 | categories_tags = ["en:plant-based-foods-and-beverages","en:plant-based-foods","en:breakfasts","en:cereals-and-potatoes","en:fruits-and-vegetables-based-foods","en:cereals-and-their-products","en:fruits-based-foods","en:breakfast-cereals","en:mueslis","en:cereals-with-nuts","en:crunchy-cereal-clusters","en:cereal-clusters-with-nuts"] 111 | 112 | # Convert the IDs to TaxonomyNode objects: 113 | categories_nodes = [taxonomy[tag] for tag in categories_tags if tag in taxonomy] 114 | 115 | # Let's find the leaf nodes using the find_deepest_nodes method: 116 | leaf_nodes = taxonomy.find_deepest_nodes(categories_nodes) 117 | print(leaf_nodes) 118 | # [<TaxonomyNode en:fruits-based-foods>, <TaxonomyNode en:mueslis>, <TaxonomyNode en:cereal-clusters-with-nuts>] 119 | ``` 120 | 121 | As you can see, the parent categories were removed, and only the leaf nodes remain. -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Usage Guide 2 | 3 | This guide provides information on the methods available within the Open Food Facts Python SDK. 4 | 5 | ## API 6 | 7 | The SDK can be used to access the Open Food Facts API. 8 | 9 | First, instantiate an API object: 10 | 11 | ```python 12 | from openfoodfacts import API, APIVersion, Country, Environment, Flavor 13 | 14 | api = API( 15 | user_agent="<application name>", 16 | username=None, 17 | password=None, 18 | country=Country.world, 19 | flavor=Flavor.off, 20 | version=APIVersion.v2, 21 | environment=Environment.org, 22 | ) 23 | ``` 24 | 25 | All parameters except `user_agent` are optional. Here is a description of the parameters you can tweak: 26 | 27 | - `username` and `password` are used to provide authentication (required for write requests) 28 | - `country` is used to specify the country, which is used by the API to return products specific to the country or to infer which language to use by default. `world` (all products) is the default value 29 | - `flavor`: the Open*Facts project you want to interact with: `off` (Open Food Facts, default), `obf` (Open Beauty Facts), `opff` (Open Pet Food Facts), `opf` (Open Products Facts) 30 | - `version`: API version (v2 is the default) 31 | - `environment`: either `org` for production environment (openfoodfacts.org) or `net` for staging (openfoodfacts.net) 32 | 33 | ### Get information about a product 34 | 35 | ```python 36 | code = "3017620422003" 37 | api.product.get(code) 38 | ``` 39 | 40 | ### Perform text search 41 | 42 | ```python 43 | results = api.product.text_search("pizza") 44 | ``` 45 | 46 | ### Create a new product or update an existing one 47 | 48 | ```python 49 | results = api.product.update(body) 50 | ``` 51 | 52 | where `body` is the update body: a dictionary that must contain 53 | the key "code" identifying the product that we 54 | want to update. 
55 | `body = {'code': '3850334341389', 'product_name': 'Mlinci'}`
56 |
57 | ### Perform ingredient analysis
58 |
59 | You can perform the ingredient analysis of a text in a given language using the API. Please note that ingredient analysis is costly, so prefer using the staging (`.net`) server for this operation.
60 |
61 | ```python
62 | from openfoodfacts import API, APIVersion, Environment
63 |
64 | api = API(user_agent="MyAwesomeApp/1.0",
65 |           version=APIVersion.v3,
66 |           environment=Environment.net)
67 |
68 | results = api.product.parse_ingredients("water, sugar, salt", lang="en")
69 |
70 | print(results)
71 |
72 | # [{'ciqual_food_code': '18066',
73 | #   'ecobalyse_code': 'tap-water',
74 | #   'id': 'en:water',
75 | #   'is_in_taxonomy': 1,
76 | #   'percent_estimate': 66.6666666666667,
77 | #   'percent_max': 100,
78 | #   'percent_min': 33.3333333333333,
79 | #   'text': 'water',
80 | #   'vegan': 'yes',
81 | #   'vegetarian': 'yes'},
82 | #  {'ciqual_proxy_food_code': '31016',
83 | #   'ecobalyse_code': 'sugar',
84 | #   'id': 'en:sugar',
85 | #   'is_in_taxonomy': 1,
86 | #   'percent_estimate': 16.6666666666667,
87 | #   'percent_max': 50,
88 | #   'percent_min': 0,
89 | #   'text': 'sugar',
90 | #   'vegan': 'yes',
91 | #   'vegetarian': 'yes'},
92 | #  {'ciqual_food_code': '11058',
93 | #   'id': 'en:salt',
94 | #   'is_in_taxonomy': 1,
95 | #   'percent_estimate': 16.6666666666667,
96 | #   'percent_max': 33.3333333333333,
97 | #   'percent_min': 0,
98 | #   'text': 'salt',
99 | #   'vegan': 'yes',
100 | #   'vegetarian': 'yes'}]
101 | ```
102 |
103 | ## Using the dataset
104 |
105 | If you're planning to perform data analysis on Open Food Facts, the easiest way is to download and use the Open Food Facts dataset dump. Fortunately, this can be done really easily using the SDK:
106 |
107 | ```python
108 | from openfoodfacts import ProductDataset
109 |
110 | dataset = ProductDataset(dataset_type="csv")
111 |
112 | for product in dataset:
113 |     print(product["product_name"])
114 | ```
115 |
116 | With `dataset = ProductDataset(dataset_type="csv")`, we automatically download (and cache) the food dataset. We can then iterate over it to get information about products.
117 |
118 | Two dataset types are available: `csv` and `jsonl`. The `jsonl` dataset contains all the Open Food Facts database information but takes much more storage (>5 GB), while the `csv` dataset is much lighter (~800 MB) but only contains the most important fields. The `jsonl` dataset type is used by default.
119 |
120 | You can also use `ProductDataset` to fetch the dataset of the other Open*Facts projects by selecting a different flavor:
121 |
122 | ```python
123 | from openfoodfacts import Flavor, ProductDataset
124 |
125 | dataset = ProductDataset(flavor=Flavor.obf, dataset_type="csv")
126 |
127 | for product in dataset:
128 |     print(product["product_name"])
129 | ```
130 |
131 | ## Taxonomies
132 |
133 | For a deep dive on how to handle taxonomies, check out the [dedicated page](./handle_taxonomies.md).
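As a quick taste, here is a minimal sketch combining the calls described there (taxonomy loading and translation lookup):

```python
from openfoodfacts.taxonomy import get_taxonomy

# Use the singular form of the taxonomy name
taxonomy = get_taxonomy("category")

# Look up a node and print its Italian translation
node = taxonomy["en:biscuits"]
print(node.get_localized_name("it"))
# Biscotti
```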
134 | -------------------------------------------------------------------------------- /tests/unit/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | from pathlib import Path 3 | from unittest.mock import patch 4 | 5 | import pytest 6 | import requests 7 | from PIL import Image 8 | 9 | from openfoodfacts.utils import ( 10 | AssetLoadingException, 11 | get_image_from_url, 12 | should_download_file, 13 | ) 14 | 15 | 16 | def test_get_image_from_url(requests_mock): 17 | # Test case 1: Valid image URL 18 | image_url = "https://example.com/image.jpg" 19 | 20 | f = io.BytesIO() 21 | # Create a white image file 22 | Image.new("RGB", (100, 100), "white").save(f, format="JPEG") 23 | f.seek(0) 24 | image_data = f.read() 25 | requests_mock.get(image_url, content=image_data) 26 | image = get_image_from_url(image_url) 27 | assert isinstance(image, Image.Image) 28 | 29 | struct = get_image_from_url(image_url, return_struct=True) 30 | assert struct.url == image_url 31 | assert struct.response is not None and struct.response.status_code == 200 32 | assert struct.image == image 33 | 34 | # Test case 2: Invalid image URL 35 | invalid_image_url = "https://example.com/invalid_image.jpg" 36 | requests_mock.get(invalid_image_url, content=b"invalid-image") 37 | with pytest.raises(AssetLoadingException): 38 | get_image_from_url(invalid_image_url) 39 | 40 | # Same with error_raise=False 41 | assert get_image_from_url(invalid_image_url, error_raise=False) is None 42 | 43 | # Same thing with struct 44 | struct = get_image_from_url( 45 | invalid_image_url, return_struct=True, error_raise=False 46 | ) 47 | assert struct.url == invalid_image_url 48 | assert struct.response is not None and struct.response.status_code == 200 49 | assert struct.image is None 50 | assert struct.error == "Cannot identify image https://example.com/invalid_image.jpg" 51 | 52 | # Test case 3: Image URL with connection error 53 | connection_error_url = "https://example.com/connection_error.jpg" 54 | requests_mock.get(connection_error_url, exc=requests.exceptions.ConnectionError) 55 | with pytest.raises(AssetLoadingException): 56 | get_image_from_url(connection_error_url) 57 | 58 | # Same but with error_raise=False 59 | assert get_image_from_url(connection_error_url, error_raise=False) is None 60 | 61 | # Same but with return_struct=True 62 | struct = get_image_from_url( 63 | connection_error_url, return_struct=True, error_raise=False 64 | ) 65 | assert struct.url == connection_error_url 66 | assert struct.response is None 67 | assert struct.image is None 68 | assert struct.error == "Cannot download https://example.com/connection_error.jpg" 69 | 70 | # Test case 4: Image URL with HTTP error 71 | http_error_url = "https://example.com/http_error.jpg" 72 | requests_mock.get(http_error_url, status_code=404) 73 | with pytest.raises(AssetLoadingException): 74 | get_image_from_url(http_error_url) 75 | 76 | 77 | def test_should_download_file(): 78 | url = "https://example.com/file" 79 | filepath = Path("/path/to/file") 80 | 81 | # Test case 1: File does not exist 82 | with patch.object(Path, "is_file", return_value=False): 83 | assert ( 84 | should_download_file( 85 | url, filepath, force_download=False, download_newer=False 86 | ) 87 | is True 88 | ) 89 | 90 | # Test case 2: Force download 91 | with patch.object(Path, "is_file", return_value=True): 92 | assert ( 93 | should_download_file( 94 | url, filepath, force_download=True, download_newer=False 95 | ) 96 | is True 97 | ) 98 | 99 
| # Test case 3: Download newer with same ETag 100 | with ( 101 | patch.object(Path, "is_file", return_value=True), 102 | patch("openfoodfacts.utils.get_file_etag", return_value="etag123"), 103 | patch("openfoodfacts.utils.fetch_etag", return_value="etag123"), 104 | ): 105 | assert ( 106 | should_download_file( 107 | url, filepath, force_download=False, download_newer=True 108 | ) 109 | is False 110 | ) 111 | 112 | # Test case 4: Download newer with different ETag 113 | with ( 114 | patch.object(Path, "is_file", return_value=True), 115 | patch("openfoodfacts.utils.get_file_etag", return_value="etag123"), 116 | patch("openfoodfacts.utils.fetch_etag", return_value="etag456"), 117 | ): 118 | assert ( 119 | should_download_file( 120 | url, filepath, force_download=False, download_newer=True 121 | ) 122 | is True 123 | ) 124 | 125 | # Test case 5: No force download and no download newer, the file 126 | # exists so we don't download it again 127 | with patch.object(Path, "is_file", return_value=True): 128 | assert ( 129 | should_download_file( 130 | url, filepath, force_download=False, download_newer=False 131 | ) 132 | is False 133 | ) 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open Food Facts Python SDK 2 | 3 |
4 |
5 |
6 |
7 | ## Status
8 |
9 | [![Project Status](https://opensource.box.com/badges/active.svg)](https://opensource.box.com/badges)
10 | [![Build Status](https://travis-ci.org/openfoodfacts/openfoodfacts-python.svg?branch=master)](https://travis-ci.org/openfoodfacts/openfoodfacts-python)
11 | [![codecov](https://codecov.io/gh/openfoodfacts/openfoodfacts-python/branch/master/graph/badge.svg)](https://codecov.io/gh/openfoodfacts/openfoodfacts-python)
12 | [![Latest Version](https://img.shields.io/pypi/v/openfoodfacts.svg)](https://pypi.org/project/openfoodfacts)
13 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/openfoodfacts/openfoodfacts-python/blob/master/LICENSE)
14 |
15 | ## Description
16 |
17 | This is the official Python SDK for the [Open Food Facts](https://world.openfoodfacts.org/) project.
18 | It provides a simple interface to the [Open Food Facts API](https://openfoodfacts.github.io/openfoodfacts-server/api/) and allows you to:
19 |
20 | - Get information about a product
21 | - Perform text search
22 | - Create a new product or update an existing one
23 |
24 | It also provides some helper functions to make it easier to work with Open Food Facts data and APIs, such as:
25 |
26 | - getting the translation of a taxonomized field in a given language
27 | - downloading and iterating over the Open Food Facts data dump
28 | - handling OCR results of Open Food Facts images generated by Google Cloud Vision
29 |
30 | Please note that this SDK is still in beta and the API is subject to change. Make sure to pin the version in your requirements file.
31 |
32 | ## Third party applications
33 | If you use or plan to use this SDK, make sure to read the [REUSE](https://github.com/openfoodfacts/openfoodfacts-python/blob/develop/REUSE.md) page and ensure you comply with the ODbL license, in addition to the license of this package (MIT). Make sure you at least fill the form, and feel free to open a PR to add your application to this list :-)
34 |
35 |
36 | ## Installation
37 |
38 | The easiest way to install the SDK is through pip:
39 |
40 |     pip install openfoodfacts
41 |
42 | or manually from source:
43 |
44 |     git clone https://github.com/openfoodfacts/openfoodfacts-python
45 |     cd openfoodfacts-python
46 |     pip install . # Note the “.” at the end!
47 |
48 | ## Examples
49 |
50 | All the examples below assume that you have imported the SDK and instantiated the API object:
51 |
52 | ```python
53 | import openfoodfacts
54 |
55 | # User-Agent is mandatory
56 | api = openfoodfacts.API(user_agent="MyAwesomeApp/1.0")
57 | ```
58 |
59 | *Get information about a product*
60 |
61 | ```python
62 | code = "3017620422003"
63 | api.product.get(code, fields=["code", "product_name"])
64 | # {'code': '3017620422003', 'product_name': 'Nutella'}
65 | ```
66 |
67 | *Perform text search*
68 |
69 | ```python
70 | api.product.text_search("mineral water")
71 | # {"count": 3006628, "page": 1, "page_count": 20, "page_size": 20, "products": [{...}], "skip": 0}
72 | ```
73 |
74 | *Create a new product or update an existing one*
75 |
76 | ```python
77 | results = api.product.update({
78 |     "code": CODE,
79 |     "product_name_en": "blueberry jam",
80 |     "ingredients_text_en": "blueberries, sugar, pectin, citric acid"
81 | })
82 | ```
83 |
84 | where `CODE` is the product barcode. The rest of the body is a dictionary of fields to create or update.
85 |
86 | To see all possible capabilities, check out the [usage guide](https://openfoodfacts.github.io/openfoodfacts-python/usage/).
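One more example: iterating over the whole data dump with the `ProductDataset` helper mentioned in the description. A minimal sketch (the printed fields are illustrative; see the usage guide for all options):

```python
from openfoodfacts import ProductDataset

# Downloads (and caches) the JSONL data dump on first use
dataset = ProductDataset(dataset_type="jsonl")

for product in dataset:
    # Each product is a plain dict
    print(product.get("code"), product.get("product_name"))
```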
87 |
88 |
89 |
90 | ## Contributing
91 |
92 | Any help is welcome, as long as you don't break the continuous integration.
93 | Fork the repository and open a Pull Request directly on the "develop" branch.
94 | A maintainer will review and integrate your changes.
95 |
96 | Maintainers:
97 |
98 | - [Anubhav Bhargava](https://github.com/Anubhav-Bhargava)
99 | - [Frank Rousseau](https://github.com/frankrousseau)
100 | - [Pierre Slamich](https://github.com/teolemon)
101 | - [Raphaël](https://github.com/raphael0202)
102 |
103 | Contributors:
104 |
105 | - Agamit Sudo
106 | - [Daniel Stolpe](https://github.com/numberpi)
107 | - [Enioluwa Segun](https://github.com/enioluwas)
108 | - [Nicolas Leger](https://github.com/nicolasleger)
109 | - [Pablo Hinojosa](https://github.com/Pablohn26)
110 | - [Andrea Stagi](https://github.com/astagi)
111 | - [Benoît Prieur](https://github.com/benprieur)
112 | - [Aadarsh A](https://github.com/aadarsh-ram)
113 |
114 | ## Copyright and License
115 |
116 | Copyright 2016-2024 Open Food Facts
117 |
118 | The Open Food Facts Python SDK is licensed under the [MIT License](https://github.com/openfoodfacts/openfoodfacts-python/blob/develop/LICENSE).
119 |
120 | Other files that are not part of the SDK itself may be under a different license.
121 | The project complies with the [REUSE 3.3 specification](https://reuse.software/spec-3.3/),
122 | so any such files should be marked accordingly.
123 |
--------------------------------------------------------------------------------
/openfoodfacts/dataset.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from pathlib import Path
3 | from typing import Optional
4 |
5 | from .types import DatasetType, Environment, Flavor
6 | from .utils import (
7 |     URLBuilder,
8 |     download_file,
9 |     get_logger,
10 |     get_open_fn,
11 |     jsonl_iter,
12 |     should_download_file,
13 | )
14 |
15 | logger = get_logger(__name__)
16 |
17 | # Increase the CSV field size limit to accommodate large fields.
18 | # sys.maxsize would overflow on Windows, so use the max 32-bit integer instead.
19 | csv.field_size_limit(pow(2, 31) - 1)
20 |
21 |
22 | DEFAULT_CACHE_DIR = Path("~/.cache/openfoodfacts/datasets").expanduser()
23 | DATASET_FILE_NAMES = {
24 |     Flavor.off: {
25 |         DatasetType.jsonl: "openfoodfacts-products.jsonl.gz",
26 |         DatasetType.csv: "en.openfoodfacts.org.products.csv.gz",
27 |     },
28 |     Flavor.obf: {
29 |         DatasetType.jsonl: "openbeautyfacts-products.jsonl.gz",
30 |         DatasetType.csv: "en.openbeautyfacts.org.products.csv",
31 |     },
32 |     Flavor.opff: {
33 |         DatasetType.jsonl: "openpetfoodfacts-products.jsonl.gz",
34 |         DatasetType.csv: "en.openpetfoodfacts.org.products.csv",
35 |     },
36 |     Flavor.opf: {
37 |         DatasetType.jsonl: "openproductsfacts-products.jsonl.gz",
38 |         DatasetType.csv: "en.openproductsfacts.org.products.csv",
39 |     },
40 | }
41 |
42 |
43 | def get_dataset(
44 |     flavor: Flavor = Flavor.off,
45 |     dataset_type: DatasetType = DatasetType.jsonl,
46 |     force_download: bool = False,
47 |     download_newer: bool = False,
48 |     cache_dir: Optional[Path] = None,
49 |     obsolete: bool = False,
50 | ) -> Path:
51 |     """Download (and cache) the Open Food Facts dataset.
52 |
53 |     The dataset is downloaded the first time and subsequently cached in
54 |     `~/.cache/openfoodfacts/datasets`.
55 |
56 |     :param flavor: The data source, defaults to Flavor.off
57 |     :param dataset_type: The returned format, defaults to DatasetType.jsonl
58 |     :param force_download: if True, (re)download the dataset even if it was
59 |         cached, defaults to False
60 |     :param download_newer: if True, download the dataset if a more recent
61 |         version compared to the cached version is available (based on the
62 |         file ETag). This parameter is ignored if force_download is True;
63 |         defaults to False.
64 |     :param cache_dir: the cache directory to use, defaults to
65 |         ~/.cache/openfoodfacts/datasets
66 |     :param obsolete: if True, download the obsolete dataset, defaults to False
67 |     :return: the path of the dataset
68 |     """
69 |     cache_dir = DEFAULT_CACHE_DIR if cache_dir is None else cache_dir
70 |     file_name = DATASET_FILE_NAMES[flavor][dataset_type]
71 |     if obsolete:
72 |         file_name = file_name.replace(".jsonl.gz", "_obsolete.jsonl.gz")
73 |     dataset_path = cache_dir / file_name
74 |     url = f"{URLBuilder.static(flavor, Environment.org)}/data/{file_name}"
75 |     cache_dir.mkdir(parents=True, exist_ok=True)
76 |
77 |     if not should_download_file(url, dataset_path, force_download, download_newer):
78 |         return dataset_path
79 |
80 |     logger.info("Downloading dataset, saving it in %s", dataset_path)
81 |     download_file(url, dataset_path)
82 |     return dataset_path
83 |
84 |
85 | class ProductDataset:
86 |     def __init__(
87 |         self,
88 |         flavor: Flavor = Flavor.off,
89 |         dataset_type: DatasetType = DatasetType.jsonl,
90 |         dataset_path: Optional[Path] = None,
91 |         obsolete: bool = False,
92 |         **kwargs,
93 |     ):
94 |         """A product dataset.
95 |
96 |         This class is used to iterate over the Open Food Facts dataset and
97 |         to retrieve the information about products as dicts.
98 |
99 |         If dataset_path is None (default), the dataset is downloaded and
100 |         cached in `~/.cache/openfoodfacts/datasets`.
101 |
102 |         Otherwise, the dataset is loaded from the provided path.
103 |
104 |         :param flavor: the dataset flavor to use (off, obf, opff or opf),
105 |             defaults to Flavor.off. This parameter is ignored if dataset_path
106 |             is provided.
107 |         :param dataset_type: the dataset type to use (csv or jsonl), defaults
108 |             to DatasetType.jsonl. This parameter is ignored if dataset_path is
109 |             provided.
110 |         :param dataset_path: the path of the dataset, defaults to None.
111 |         :param obsolete: if True, download the obsolete dataset, defaults to
112 |             False.
113 |         :param kwargs: additional arguments passed to `get_dataset` when
114 |             downloading the dataset
115 |         """
116 |         self.dataset_type = dataset_type
117 |
118 |         if dataset_path is not None:
119 |             self.dataset_path = dataset_path
120 |
121 |             # We infer the dataset type from the file extension
122 |             full_suffix = "".join(dataset_path.suffixes)
123 |             if full_suffix in (".jsonl.gz", ".jsonl"):
124 |                 self.dataset_type = DatasetType.jsonl
125 |             elif full_suffix in (".csv.gz", ".csv"):
126 |                 self.dataset_type = DatasetType.csv
127 |             else:
128 |                 raise ValueError(f"Unknown dataset type: {full_suffix}")
129 |         else:
130 |             self.dataset_path = get_dataset(
131 |                 flavor, dataset_type, obsolete=obsolete, **kwargs
132 |             )
133 |
134 |     def __iter__(self):
135 |         if self.dataset_type is DatasetType.jsonl:
136 |             return jsonl_iter(self.dataset_path)
137 |         else:
138 |             return self._csv_iterator()
139 |
140 |     def _csv_iterator(self):
141 |         open_fn = get_open_fn(self.dataset_path)
142 |         with open_fn(self.dataset_path, "rt", newline="") as csvfile:
143 |             reader = csv.DictReader(csvfile, delimiter="\t")
144 |             for row in reader:
145 |                 yield dict(row)
146 |
147 |     def count(self) -> int:
148 |         """Return the number of products in the dataset."""
149 |         count = 0
150 |         for _ in self:
151 |             count += 1
152 |         return count
153 |
--------------------------------------------------------------------------------
/tests/ml/test_image_classification.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock, patch
2 |
3 | import numpy as np
4 | import pytest
5 | from PIL import Image
6 |
7 | from openfoodfacts.ml.image_classification import ImageClassifier, classify_transforms
8 |
9 | class TestClassifyTransforms:
10 |     def test_rgb_image(self):
11 |         img = Image.new("RGB", (300, 300), color="red")
12 |         transformed_img = classify_transforms(img)
13 |         assert transformed_img.shape == (3, 224, 224)
14 |         assert transformed_img.dtype == np.float32
15 |
16 |     def test_non_rgb_image(self):
17 |         img = Image.new("L", (300, 300), color="red")
18 |         transformed_img = classify_transforms(img)
19 |         assert transformed_img.shape == (3, 224, 224)
20 |         assert transformed_img.dtype == np.float32
21 |
22 |     def test_custom_size(self):
23 |         img = Image.new("RGB", (300, 300), color="red")
24 |         transformed_img = classify_transforms(img, size=128)
25 |         assert transformed_img.shape == (3, 128, 128)
26 |         assert transformed_img.dtype == np.float32
27 |
28 |     def test_custom_mean_std(self):
29 |         img = Image.new("RGB", (300, 300), color="red")
30 |         mean = (0.5, 0.5, 0.5)
31 |         std = (0.5, 0.5, 0.5)
32 |         transformed_img = classify_transforms(img, mean=mean, std=std)
33 |         assert transformed_img.shape == (3, 224, 224)
34 |         assert transformed_img.dtype == np.float32
35 |
36 |     def test_custom_interpolation(self):
37 |         img = Image.new("RGB", (300, 300), color="red")
38 |         transformed_img = classify_transforms(
39 |             img, interpolation=Image.Resampling.NEAREST
40 |         )
41 |         assert transformed_img.shape == (3, 224, 224)
42 |         assert transformed_img.dtype == np.float32
43 |
44 |     def test_custom_crop_fraction(self):
45 |         img = Image.new("RGB", (300, 300), color="red")
46 |         transformed_img = classify_transforms(img, crop_fraction=0.8)
47 |         assert transformed_img.shape == (3, 224, 224)
48 |         assert transformed_img.dtype == np.float32
49 |
50 |
51 | class ResponseOutputs:
52 |     def __init__(self, name):
53 |         self.name = name
54 |
55 |
56 | class TestImageClassifier:
57 |     def test_preprocess_rgb_image(self):
58 |         img = Image.new("RGB", (300, 300), color="red")
59 |         classifier = ImageClassifier(
60 |             model_name="test_model", label_names=["label1", "label2"]
61 |         )
62 |         preprocessed_img = classifier.preprocess(img)
63 |         assert preprocessed_img.shape == (1, 3, 224, 224)
64 |         assert preprocessed_img.dtype == np.float32
65 |
66 |     def test_postprocess_single_output(self):
67 |         classifier = ImageClassifier(
68 |             model_name="test_model", label_names=["label1", "label2"]
69 |         )
70 |         response = MagicMock()
71 |         response.outputs = [ResponseOutputs(name="output0")]
72 |         response.raw_output_contents = [
73 |             np.array([0.8, 0.2], dtype=np.float32).tobytes()
74 |         ]
75 |
76 |         result = classifier.postprocess(response)
77 |         assert len(result) == 2
78 |         assert result[0][0] == "label1"
79 |         assert np.isclose(result[0][1], 0.8)
80 |         assert result[1][0] == "label2"
81 |         assert np.isclose(result[1][1], 0.2)
82 |
83 |     def test_postprocess_multiple_outputs(self):
84 |         classifier = ImageClassifier(
85 |             model_name="test_model", label_names=["label1", "label2"]
86 |         )
87 |         response = MagicMock()
88 |         response.outputs = [
89 |             ResponseOutputs(name="output0"),
90 |             ResponseOutputs(name="output1"),
91 |         ]
92 |         response.raw_output_contents = [
93 |             np.array([0.8, 0.2], dtype=np.float32).tobytes()
94 |         ]
95 |
96 |         # The call must raise: postprocess only accepts a single output
97 |         with pytest.raises(Exception, match="expected 1 output, got 2"):
98 |             classifier.postprocess(response)
99 |
100 |     def test_postprocess_multiple_raw_output_contents(self):
101 |         classifier = ImageClassifier(
102 |             model_name="test_model", label_names=["label1", "label2"]
103 |         )
104 |         response = MagicMock()
105 |         response.outputs = [ResponseOutputs(name="output0")]
106 |         response.raw_output_contents = [
107 |             np.array([0.8, 0.2], dtype=np.float32).tobytes(),
108 |             np.array([0.1, 0.9], dtype=np.float32).tobytes(),
109 |         ]
110 |
111 |         # The call must raise: postprocess only accepts a single raw output
112 |         with pytest.raises(Exception, match="expected 1 raw output content, got 2"):
113 |             classifier.postprocess(response)
114 |
115 |     def test_predict(self):
116 |         img = Image.new("RGB", (300, 300), color="red")
117 |         classifier = ImageClassifier(
118 |             model_name="test_model", label_names=["label1", "label2"]
119 |         )
120 |         triton_uri = "fake_triton_uri"
121 |
122 |         # Mock the preprocess method
123 |         classifier.preprocess = MagicMock(
124 |             return_value=np.random.rand(1, 3, 224, 224).astype(np.float32)
125 |         )
126 |
127 |         # Mock the Triton inference stub and response
128 |         grpc_stub = MagicMock()
129 |         response = MagicMock()
130 |         response.outputs = [ResponseOutputs(name="output0")]
131 |         response.raw_output_contents = [
132 |             np.array([0.8, 0.2], dtype=np.float32).tobytes()
133 |         ]
134 |         grpc_stub.ModelInfer = MagicMock(return_value=response)
135 |
136 |         # Mock the get_triton_inference_stub function
137 |         get_triton_inference_stub = MagicMock(return_value=grpc_stub)
138 |
139 |         with patch(
140 |             "openfoodfacts.ml.image_classification.get_triton_inference_stub",
141 |             get_triton_inference_stub,
142 |         ):
143 |             result = classifier.predict(img, triton_uri)
144 |
145 |         assert len(result) == 2
146 |         assert result[0][0] == "label1"
147 |         assert np.isclose(result[0][1], 0.8)
148 |         assert result[1][0] == "label2"
149 |         assert np.isclose(result[1][1], 0.2)
150 |
151 |         classifier.preprocess.assert_called_once_with(img)
152 |         grpc_stub.ModelInfer.assert_called_once()
153 |         get_triton_inference_stub.assert_called_once_with(triton_uri)
--------------------------------------------------------------------------------
/openfoodfacts/ml/image_classification.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import math
3 | import time
4 | import typing
5 | from typing import Optional
6 |
7 | import numpy as np
8 | from PIL import Image, ImageOps
9 | from tritonclient.grpc import service_pb2
10 |
11 | from openfoodfacts.ml.triton import (
12 |     add_triton_infer_input_tensor,
13 |     get_triton_inference_stub,
14 | )
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | def classify_transforms(
20 |     img: Image.Image,
21 |     size: int = 224,
22 |     mean: tuple[float, float, float] = (0.0, 0.0, 0.0),
23 |     std: tuple[float, float, float] = (1.0, 1.0, 1.0),
24 |     interpolation: Image.Resampling = Image.Resampling.BILINEAR,
25 |     crop_fraction: float = 1.0,
26 | ) -> np.ndarray:
27 |     """
28 |     Applies a series of image transformations including resizing, center
29 |     cropping, normalization, and conversion to a NumPy array.
30 |
31 |     The transformation steps are based on the ones used in the Ultralytics library:
32 |     https://github.com/ultralytics/ultralytics/blob/main/ultralytics/data/augment.py#L2319
33 |
34 |     :param img: Input Pillow image.
35 |     :param size: The target size for the transformed image (shortest edge).
36 |     :param mean: Mean values for each RGB channel used in normalization.
37 |     :param std: Standard deviation values for each RGB channel used in
38 |         normalization.
39 |     :param interpolation: Interpolation method from PIL (
40 |         Image.Resampling.NEAREST, Image.Resampling.BILINEAR,
41 |         Image.Resampling.BICUBIC).
42 |     :param crop_fraction: Fraction of the image to be cropped.
43 |     :return: The transformed image as a NumPy array.
44 |     """
45 |     if img.mode != "RGB":
46 |         img = img.convert("RGB")
47 |
48 |     # Rotate the image based on the EXIF orientation if needed
49 |     img = typing.cast(Image.Image, ImageOps.exif_transpose(img))
50 |
51 |     # Step 1: Resize while preserving the aspect ratio
52 |     width, height = img.size
53 |
54 |     # Calculate scale size while preserving aspect ratio
55 |     scale_size = math.floor(size / crop_fraction)
56 |
57 |     aspect_ratio = width / height
58 |     if width < height:
59 |         new_width = scale_size
60 |         new_height = int(new_width / aspect_ratio)
61 |     else:
62 |         new_height = scale_size
63 |         new_width = int(new_height * aspect_ratio)
64 |
65 |     img = img.resize((new_width, new_height), interpolation)
66 |
67 |     # Step 2: Center crop
68 |     left = (new_width - size) // 2
69 |     top = (new_height - size) // 2
70 |     right = left + size
71 |     bottom = top + size
72 |     img = img.crop((left, top, right, bottom))
73 |
74 |     # Step 3: Convert the image to a NumPy array and scale pixel values to
75 |     # [0, 1]
76 |     img_array = np.array(img).astype(np.float32) / 255.0
77 |
78 |     # Step 4: Normalize the image
79 |     mean_np = np.array(mean, dtype=np.float32).reshape(1, 1, 3)
80 |     std_np = np.array(std, dtype=np.float32).reshape(1, 1, 3)
81 |     img_array = (img_array - mean_np) / std_np
82 |
83 |     # Step 5: Change the order of dimensions from (H, W, C) to (C, H, W)
84 |     img_array = np.transpose(img_array, (2, 0, 1))
85 |     return img_array
86 |
87 |
88 | class ImageClassifier:
89 |     def __init__(self, model_name: str, label_names: list[str], image_size: int = 224):
90 |         """An image classifier based on Yolo models.
91 |
92 |         We support models trained with Yolov8, v9, v10 and v11.
93 |
94 |         :param model_name: the name of the model, as registered in Triton
95 |         :param label_names: the list of label names
96 |         :param image_size: the size of the input image for the model
97 |         """
98 |         self.model_name: str = model_name
99 |         self.label_names = label_names
100 |         self.image_size = image_size
101 |
102 |     def predict(
103 |         self,
104 |         image: Image.Image,
105 |         triton_uri: str,
106 |         model_version: Optional[str] = None,
107 |     ) -> list[tuple[str, float]]:
108 |         """Run an image classification model on an image.
109 |
110 |         The model is expected to have been trained with the Ultralytics
111 |         library (Yolov8).
112 |
113 |         :param image: the input Pillow image
114 |         :param triton_uri: URI of the Triton Inference Server
115 |         :param model_version: the version of the model to use, defaults to None
116 |         :return: the prediction results as a list of tuples (label, confidence)
117 |         """
118 |         image_array = self.preprocess(image)
119 |
120 |         grpc_stub = get_triton_inference_stub(triton_uri)
121 |         request = service_pb2.ModelInferRequest()
122 |         request.model_name = self.model_name
123 |         if model_version:
124 |             request.model_version = model_version
125 |         add_triton_infer_input_tensor(
126 |             request, name="images", data=image_array, datatype="FP32"
127 |         )
128 |         start_time = time.monotonic()
129 |         response = grpc_stub.ModelInfer(request)
130 |         latency = time.monotonic() - start_time
131 |         logger.debug("Inference time for %s: %s", self.model_name, latency)
132 |
133 |         start_time = time.monotonic()
134 |         result = self.postprocess(response)
135 |         latency = time.monotonic() - start_time
136 |         logger.debug("Post-processing time for %s: %s", self.model_name, latency)
137 |         return result
138 |
139 |     def preprocess(self, image: Image.Image) -> np.ndarray:
140 |         """Preprocess an image for image classification.
141 |
142 |         :param image: the input Pillow image
143 |         :return: the preprocessed image as a NumPy array
144 |         """
145 |         image_array = classify_transforms(image, size=self.image_size)
146 |         return np.expand_dims(image_array, axis=0)
147 |
148 |     def postprocess(
149 |         self, response: service_pb2.ModelInferResponse
150 |     ) -> list[tuple[str, float]]:
151 |         """Postprocess the inference result.
152 |
153 |         :param response: the inference response
154 |         """
155 |         if len(response.outputs) != 1:
156 |             raise Exception(f"expected 1 output, got {len(response.outputs)}")
157 |
158 |         if len(response.raw_output_contents) != 1:
159 |             raise Exception(
160 |                 f"expected 1 raw output content, got {len(response.raw_output_contents)}"
161 |             )
162 |
163 |         output_index = {output.name: i for i, output in enumerate(response.outputs)}
164 |         output = np.frombuffer(
165 |             response.raw_output_contents[output_index["output0"]],
166 |             dtype=np.float32,
167 |         ).reshape((1, len(self.label_names)))[0]
168 |
169 |         score_indices = np.argsort(-output)
170 |         return [(self.label_names[i], float(output[i])) for i in score_indices]
171 |
--------------------------------------------------------------------------------
/LICENSES/CC0-1.0.txt:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 |     CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 |     LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 |     ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 |     INFORMATION ON AN "AS-IS" BASIS.
CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /tests/unit/test_taxonomy.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | from openfoodfacts.taxonomy import ( 6 | Taxonomy, 7 | TaxonomyNode, 8 | create_brand_taxonomy_mapping, 9 | create_taxonomy_mapping, 10 | get_taxonomy, 11 | map_to_canonical_id, 12 | ) 13 | 14 | label_taxonomy = get_taxonomy("label") 15 | category_taxonomy = get_taxonomy("category") 16 | 17 | 18 | def test_map_to_canonical_id(): 19 | taxonomy_mapping = { 20 | "en:apple": "en:apples", 21 | "en:apples": "en:apples", 22 | "fr:pomme": "en:apples", 23 | "fr:noix-d-isere": "en:nuts-from-isere", 24 | "xx:provence-alpes-cote-d-azur": "en:provence-alpes-cote-d-azur", 25 | "xx:sashimi": "xx:sashimi", 26 | } 27 | values = [ 28 | "en: Apple", 29 | "en: apples", 30 | "fr: Pomme", 31 | "fr: Bananes d'Isère", 32 | "fr: Noix d'Isère", 33 | "fr: Provence-Alpes-Côte d'Azur", 34 | "pt: Provence-Alpes-Côte d'Azur", 35 | "it: sashimi", 36 | ] 37 | expected = { 38 | "en: Apple": "en:apples", 39 | "en: apples": "en:apples", 40 | "fr: Pomme": "en:apples", 41 | "fr: Bananes d'Isère": "fr:bananes-d-isere", 42 | "fr: Noix d'Isère": "en:nuts-from-isere", 43 | "fr: Provence-Alpes-Côte d'Azur": "en:provence-alpes-cote-d-azur", 44 | "pt: Provence-Alpes-Côte d'Azur": "en:provence-alpes-cote-d-azur", 45 | "it: sashimi": "xx:sashimi", 46 | } 47 | assert map_to_canonical_id(taxonomy_mapping, values) == expected 48 | 49 | 50 | def test_map_to_canonical_id_invalid_value(): 51 | taxonomy_mapping = { 52 | "en:apple": "en:apples", 53 | "en:apples": "en:apples", 54 | "fr:pomme": "en:apples", 55 | "fr:noix-d-isere": "en:nuts-from-isere", 56 | } 57 | values = ["en: Apple", "apple"] 58 | 59 | with pytest.raises( 60 | ValueError, 61 | match=re.escape( 62 | "Invalid value: 'apple', expected value to be in 'lang:tag' format" 63 | ), 64 | ): 65 | map_to_canonical_id(taxonomy_mapping, values) 66 | 67 | 68 | class TestCreateTaxonomyMapping: 69 | def test_basic(self): 70 | taxonomy = Taxonomy() 71 | node1 = TaxonomyNode( 72 | identifier="en:apples", 73 | names={"en": "Apple", "fr": "Pomme"}, 74 | synonyms={"en": ["Apples"], "fr": ["Pommes"]}, 75 | ) 76 | node2 = TaxonomyNode( 77 | identifier="en:nuts-from-isere", 78 | names={"fr": "Noix d'Isère"}, 79 | synonyms={"fr": ["Noix d'Isère"]}, 80 | ) 81 | node3 = TaxonomyNode( 82 | identifier="xx:sashimi", 83 | names={"xx": "Sashimi"}, 84 | synonyms={"xx": ["Sashimi"]}, 85 | ) 86 | taxonomy.add(node1.id, node1) 87 | taxonomy.add(node2.id, node2) 88 | taxonomy.add(node3.id, node3) 89 | 90 | expected_mapping = { 91 | "en:apple": "en:apples", 92 | "fr:pomme": "en:apples", 93 | "en:apples": "en:apples", 94 | "fr:pommes": "en:apples", 95 | "fr:noix-d-isere": "en:nuts-from-isere", 96 | "xx:sashimi": "xx:sashimi", 97 | } 98 | 99 | assert create_taxonomy_mapping(taxonomy) == expected_mapping 100 | 101 | def 
test_empty(self):
102 |         taxonomy = Taxonomy()
103 |         expected_mapping = {}
104 |         assert create_taxonomy_mapping(taxonomy) == expected_mapping
105 |
106 |     def test_no_synonyms(self):
107 |         taxonomy = Taxonomy()
108 |         node = TaxonomyNode(
109 |             identifier="en:bananas",
110 |             names={"en": "Banana", "fr": "Banane"},
111 |             synonyms={},
112 |         )
113 |         taxonomy.add(node.id, node)
114 |
115 |         expected_mapping = {
116 |             "en:banana": "en:bananas",
117 |             "fr:banane": "en:bananas",
118 |         }
119 |
120 |         assert create_taxonomy_mapping(taxonomy) == expected_mapping
121 |
122 |     def test_multiple_languages_with_different_synonyms(self):
123 |         taxonomy = Taxonomy()
124 |         node = TaxonomyNode(
125 |             identifier="en:grapes",
126 |             names={"en": "Grape", "fr": "Raisin", "es": "Uva"},
127 |             synonyms={
128 |                 "en": ["Grapes"],
129 |                 "fr": ["Raisins", "Raisins d'automne"],
130 |                 "es": ["Uvas"],
131 |             },
132 |         )
133 |         taxonomy.add(node.id, node)
134 |
135 |         expected_mapping = {
136 |             "en:grape": "en:grapes",
137 |             "fr:raisin": "en:grapes",
138 |             "fr:raisins-d-automne": "en:grapes",
139 |             "es:uva": "en:grapes",
140 |             "en:grapes": "en:grapes",
141 |             "fr:raisins": "en:grapes",
142 |             "es:uvas": "en:grapes",
143 |         }
144 |
145 |         assert create_taxonomy_mapping(taxonomy) == expected_mapping
146 |
147 |     def test_create_brand_taxonomy_mapping(self):
148 |         taxonomy = Taxonomy.from_dict(
149 |             {
150 |                 "en:5th-season": {"name": {"en": "5th Season"}},
151 |                 "en:arev": {"name": {"en": "Arèv"}},
152 |                 "en:arrighi": {"name": {"en": "Arrighi"}},
153 |                 "en:voiles-au-vent": {"name": {"en": "Voiles au Vent"}},
154 |                 "xx:turini": {"name": {"xx": "Turini"}},
155 |                 "fr:auchan": {"name": {"xx": "Auchan"}},
156 |                 "fr:mamouth": {"name": {"fr": "Mamouth"}},
157 |                 "fr:carefour": {"name": {}},
158 |             }
159 |         )
160 |         assert create_brand_taxonomy_mapping(taxonomy) == {
161 |             "5th-season": "5th Season",
162 |             "arev": "Arèv",
163 |             "arrighi": "Arrighi",
164 |             "voiles-au-vent": "Voiles au Vent",
165 |             "turini": "Turini",
166 |             "auchan": "Auchan",
167 |             "mamouth": "Mamouth",
168 |             "carefour": "carefour",
169 |         }
170 |
171 |
172 | class TestTaxonomy:
173 |     @pytest.mark.parametrize(
174 |         "taxonomy,item,candidates,output",
175 |         [
176 |             (label_taxonomy, "en:organic", {"en:fr-bio-01"}, True),
177 |             (label_taxonomy, "en:fr-bio-01", {"en:organic"}, False),
178 |             (label_taxonomy, "en:fr-bio-01", [], False),
179 |             (label_taxonomy, "en:organic", {"en:gluten-free"}, False),
180 |             (
181 |                 label_taxonomy,
182 |                 "en:organic",
183 |                 {"en:gluten-free", "en:no-additives", "en:vegan"},
184 |                 False,
185 |             ),
186 |             (
187 |                 label_taxonomy,
188 |                 "en:organic",
189 |                 {"en:gluten-free", "en:no-additives", "en:fr-bio-16"},
190 |                 True,
191 |             ),
192 |         ],
193 |     )
194 |     def test_is_parent_of_any(
195 |         self, taxonomy: Taxonomy, item: str, candidates: list, output: bool
196 |     ):
197 |         assert taxonomy.is_parent_of_any(item, candidates) is output
198 |
199 |     def test_is_parent_of_any_unknown_item(self):
200 |         with pytest.raises(ValueError):
201 |             label_taxonomy.is_parent_of_any("unknown-id", set())
202 |
203 |     @pytest.mark.parametrize(
204 |         "taxonomy,item,output",
205 |         [
206 |             (category_taxonomy, "en:plant-based-foods-and-beverages", set()),
207 |             (
208 |                 category_taxonomy,
209 |                 "en:plant-based-foods",
210 |                 {"en:plant-based-foods-and-beverages"},
211 |             ),
212 |             (
213 |                 category_taxonomy,
214 |                 "en:brown-rices",
215 |                 {
216 |                     "en:rices",
217 |                     "en:cereal-grains",
218 |                     "en:cereals-and-their-products",
219 |                     "en:cereals-and-potatoes",
220 |                     "en:plant-based-foods",
221 |
"en:plant-based-foods-and-beverages", 222 | "en:seeds", 223 | }, 224 | ), 225 | ], 226 | ) 227 | def test_get_parents_hierarchy( 228 | self, taxonomy: Taxonomy, item: str, output: set[str] 229 | ): 230 | node = taxonomy[item] 231 | parents = node.get_parents_hierarchy() 232 | assert set((x.id for x in parents)) == output 233 | 234 | @pytest.mark.parametrize( 235 | "taxonomy,items,output", 236 | [ 237 | (category_taxonomy, [], []), 238 | (category_taxonomy, ["en:brown-rices"], ["en:brown-rices"]), 239 | (category_taxonomy, ["en:brown-rices", "en:rices"], ["en:brown-rices"]), 240 | ( 241 | category_taxonomy, 242 | ["en:brown-rices", "en:rices", "en:cereal-grains"], 243 | ["en:brown-rices"], 244 | ), 245 | ( 246 | category_taxonomy, 247 | ["en:brown-rices", "en:teas", "en:cereal-grains"], 248 | ["en:brown-rices", "en:teas"], 249 | ), 250 | ], 251 | ) 252 | def test_find_deepest_nodes( 253 | self, taxonomy: Taxonomy, items: list[str], output: list[str] 254 | ): 255 | item_nodes = [taxonomy[item] for item in items] 256 | output_nodes = [taxonomy[o] for o in output] 257 | assert taxonomy.find_deepest_nodes(item_nodes) == output_nodes 258 | -------------------------------------------------------------------------------- /tests/unit/test_redis.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | from typing import Optional, cast 4 | 5 | import pytest 6 | from redis import Redis 7 | 8 | from openfoodfacts.redis import ( 9 | ProductUpdateEvent, 10 | get_new_updates_multistream, 11 | get_processed_since, 12 | ) 13 | 14 | 15 | class TestRedisUpdate: 16 | @pytest.mark.parametrize( 17 | "diffs, expected", 18 | [ 19 | ({"uploaded_images": {"add": ["2"]}}, True), 20 | ({"fields": {"change": {"product_type": "food"}}}, False), 21 | (None, False), 22 | ], 23 | ) 24 | def test_is_image_upload(self, diffs, expected): 25 | update = ProductUpdateEvent( 26 | id="1629878400000-0", 27 | stream="product_updates", 28 | timestamp=1629878400000, 29 | code="1", 30 | flavor="off", 31 | user_id="user1", 32 | action="updated", 33 | comment="comment", 34 | product_type="food", 35 | diffs=json.dumps(diffs) if diffs is not None else None, 36 | ) 37 | assert update.is_image_upload() is expected 38 | 39 | @pytest.mark.parametrize( 40 | "diffs, expected", 41 | [ 42 | ({"fields": {"change": ["product_type"]}}, True), 43 | ({"fields": {"change": ["countries", "product_type"]}}, True), 44 | ({"fields": {"change": ["countries"]}}, False), 45 | (None, False), 46 | ], 47 | ) 48 | def test_is_product_type_change(self, diffs, expected): 49 | update = ProductUpdateEvent( 50 | id="1629878400000-0", 51 | stream="product_updates", 52 | timestamp=1629878400000, 53 | code="1", 54 | flavor="off", 55 | user_id="user1", 56 | action="updated", 57 | comment="comment", 58 | product_type="food", 59 | diffs=json.dumps(diffs) if diffs is not None else None, 60 | ) 61 | assert update.is_product_type_change() is expected 62 | 63 | @pytest.mark.parametrize( 64 | "diffs, field_name, expected", 65 | [ 66 | ( 67 | {"fields": {"change": ["product_name", "quantity"]}}, 68 | "product_name", 69 | True, 70 | ), 71 | ({"fields": {"change": ["product_name", "quantity"]}}, "countries", False), 72 | ({"uploaded_images": {"add": ["4"]}}, "countries", False), 73 | ({}, "countries", False), 74 | (None, "product_name", False), 75 | ], 76 | ) 77 | def test_is_field_updated(self, diffs, field_name, expected): 78 | update = ProductUpdateEvent( 79 | id="1629878400000-0", 80 | stream="product_updates", 
81 | timestamp=1629878400000, 82 | code="1", 83 | flavor="off", 84 | user_id="user1", 85 | action="updated", 86 | comment="comment", 87 | product_type="food", 88 | diffs=json.dumps(diffs) if diffs is not None else None, 89 | ) 90 | assert update.is_field_updated(field_name) is expected 91 | 92 | @pytest.mark.parametrize( 93 | "diffs, field_name, expected", 94 | [ 95 | ( 96 | {"fields": {"add": ["product_name", "quantity"]}}, 97 | "product_name", 98 | True, 99 | ), 100 | ({"fields": {"add": ["product_name"]}}, "countries", False), 101 | ({"uploaded_images": {"add": ["4"]}}, "countries", False), 102 | ({}, "countries", False), 103 | (None, "product_name", False), 104 | ], 105 | ) 106 | def test_is_field_added(self, diffs, field_name, expected): 107 | update = ProductUpdateEvent( 108 | id="1629878400000-0", 109 | stream="product_updates", 110 | timestamp=1629878400000, 111 | code="1", 112 | flavor="off", 113 | user_id="user1", 114 | action="updated", 115 | comment="comment", 116 | product_type="food", 117 | diffs=json.dumps(diffs) if diffs is not None else None, 118 | ) 119 | assert update.is_field_added(field_name) is expected 120 | 121 | @pytest.mark.parametrize( 122 | "diffs, field_name, expected", 123 | [ 124 | ( 125 | {"fields": {"change": ["product_name", "quantity"]}}, 126 | "product_name", 127 | True, 128 | ), 129 | ({"fields": {"change": ["product_name", "quantity"]}}, "countries", False), 130 | ( 131 | {"fields": {"add": ["product_name", "quantity"]}}, 132 | "product_name", 133 | True, 134 | ), 135 | ({"fields": {"add": ["product_name"]}}, "countries", False), 136 | ({"uploaded_images": {"add": ["4"]}}, "countries", False), 137 | ({}, "countries", False), 138 | (None, "product_name", False), 139 | ], 140 | ) 141 | def test_is_field_added_or_updated(self, diffs, field_name, expected): 142 | update = ProductUpdateEvent( 143 | id="1629878400000-0", 144 | stream="product_updates", 145 | timestamp=1629878400000, 146 | code="1", 147 | flavor="off", 148 | user_id="user1", 149 | action="updated", 150 | comment="comment", 151 | product_type="food", 152 | diffs=json.dumps(diffs) if diffs is not None else None, 153 | ) 154 | assert update.is_field_added_or_updated(field_name) is expected 155 | 156 | @pytest.mark.parametrize( 157 | "diffs, expected", 158 | [ 159 | ( 160 | { 161 | "selected_images": {"delete": ["front_de"]}, 162 | "uploaded_images": {"delete": ["1"]}, 163 | }, 164 | True, 165 | ), 166 | ({"fields": {"add": ["product_name"]}}, False), 167 | ({"uploaded_images": {"add": ["4"]}}, False), 168 | ({}, False), 169 | (None, False), 170 | ], 171 | ) 172 | def test_is_image_deletion(self, diffs, expected): 173 | update = ProductUpdateEvent( 174 | id="1629878400000-0", 175 | stream="product_updates", 176 | timestamp=1629878400000, 177 | code="1", 178 | flavor="off", 179 | user_id="user1", 180 | action="updated", 181 | comment="comment", 182 | product_type="food", 183 | diffs=json.dumps(diffs), 184 | ) 185 | assert update.is_image_deletion() is expected 186 | 187 | 188 | class RedisXrangeClient: 189 | def __init__(self, xrange_return_values: list): 190 | self.xrange_return_values = xrange_return_values 191 | self.call_count = 0 192 | 193 | def xrange( 194 | self, name: str, min: str = "-", max: str = "+", count: Optional[int] = None 195 | ): 196 | assert name in ("product_updates", "ocr_ready") 197 | assert max == "+" 198 | assert count == 100 199 | if self.call_count >= len(self.xrange_return_values): 200 | return [] 201 | self.call_count += 1 202 | return 
self.xrange_return_values[self.call_count - 1] 203 | 204 | 205 | def test_get_processed_since(): 206 | stream_name = "product_updates" 207 | base_values = { 208 | "flavor": "off", 209 | "user_id": "user1", 210 | "action": "updated", 211 | "comment": "comment", 212 | "product_type": "food", 213 | } 214 | return_values = [ 215 | [ 216 | ("1629878400000-0", {"code": "2", **base_values}), 217 | ("1629878400001-0", {"code": "3", **base_values}), 218 | ] 219 | ] 220 | redis_client = cast(Redis, RedisXrangeClient(return_values)) 221 | # Wed Aug 25 08:00:00 2021 UTC 222 | start_timestamp_ms = 1629878400000 # Example start timestamp 223 | # Call the function and iterate over the results 224 | results = list( 225 | get_processed_since( 226 | redis_client, 227 | min_id=start_timestamp_ms, 228 | ) 229 | ) 230 | 231 | # Assertions 232 | assert len(results) == 2 233 | assert results[0] == ProductUpdateEvent( 234 | id="1629878400000-0", 235 | stream=stream_name, 236 | timestamp=1629878400000, 237 | code="2", 238 | **base_values, 239 | ) 240 | assert results[1] == ProductUpdateEvent( 241 | id="1629878400001-0", 242 | stream=stream_name, 243 | timestamp=1629878400001, 244 | code="3", 245 | **base_values, 246 | ) 247 | 248 | 249 | class RedisXreadClient: 250 | def __init__(self, xread_return_values: list): 251 | self.xread_return_values = xread_return_values 252 | self.call_count = 0 253 | 254 | def xread(self, streams: dict, block: int, count: Optional[int] = None): 255 | assert set(streams.keys()) == {"product_updates", "ocr_ready"} 256 | assert block == 0 257 | assert count == 100 258 | if self.call_count >= len(self.xread_return_values): 259 | raise ValueError("No more values") 260 | self.call_count += 1 261 | return self.xread_return_values[self.call_count - 1] 262 | 263 | 264 | def test_get_new_updates_multistream(): 265 | product_updates_stream_name = "product_updates" 266 | ocr_ready_stream_name = "ocr_ready" 267 | base_values_product_updates = { 268 | "flavor": "off", 269 | "user_id": "user1", 270 | "action": "updated", 271 | "comment": "comment", 272 | "product_type": "beauty", 273 | } 274 | ocr_ready_event = { 275 | "product_type": "beauty", 276 | "code": "3215495849204", 277 | "image_id": "2", 278 | "json_url": "https://images.openfoodfacts.org/images/products/321/549/584/9204/2.json", 279 | } 280 | return_values = [ 281 | [ 282 | ( 283 | product_updates_stream_name, 284 | [("1629878400000-0", {"code": "4", **base_values_product_updates})], 285 | ), 286 | ], 287 | [ 288 | ( 289 | ocr_ready_stream_name, 290 | [("1629878400001-0", ocr_ready_event)], 291 | ), 292 | ], 293 | [ 294 | ( 295 | product_updates_stream_name, 296 | [("1629878400002-0", {"code": "1", **base_values_product_updates})], 297 | ) 298 | ], 299 | [ 300 | ( 301 | product_updates_stream_name, 302 | [("1629878400003-0", {"code": "2", **base_values_product_updates})], 303 | ) 304 | ], 305 | [ 306 | ( 307 | product_updates_stream_name, 308 | [("1629878400004-0", {"code": "3", **base_values_product_updates})], 309 | ) 310 | ], 311 | ] 312 | redis_client = cast(Redis, RedisXreadClient(return_values)) 313 | 314 | # Call the function and iterate over the results 315 | updates_iter = get_new_updates_multistream(redis_client) 316 | 317 | product_update_result = next(updates_iter) 318 | assert product_update_result == ProductUpdateEvent( 319 | id="1629878400000-0", 320 | stream=product_updates_stream_name, 321 | timestamp=1629878400000, 322 | code="4", 323 | **base_values_product_updates, 324 | ) 325 | 326 | ocr_ready_result = 
next(updates_iter)
327 |     assert ocr_ready_result.id == "1629878400001-0"
328 |     assert ocr_ready_result.stream == ocr_ready_stream_name
329 |     assert ocr_ready_result.timestamp == datetime.datetime.fromtimestamp(
330 |         1629878400.001, tz=datetime.timezone.utc
331 |     )
332 |     assert ocr_ready_result.code == ocr_ready_event["code"]
333 |     assert ocr_ready_result.product_type == ocr_ready_event["product_type"]
334 |     assert ocr_ready_result.image_id == ocr_ready_event["image_id"]
335 |     assert ocr_ready_result.json_url == ocr_ready_event["json_url"]
336 |
--------------------------------------------------------------------------------
/openfoodfacts/images.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from typing import List, Optional, Tuple, Union
4 | from urllib.parse import urlparse
5 |
6 | import requests
7 |
8 | from openfoodfacts.types import Environment, Flavor, JSONType
9 | from openfoodfacts.utils import ImageDownloadItem, URLBuilder, get_image_from_url
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | # Base URL of the public Open Food Facts S3 bucket
15 | AWS_S3_BASE_URL = "https://openfoodfacts-images.s3.eu-west-3.amazonaws.com/data"
16 |
17 |
18 | _pillow_available = True
19 | try:
20 |     from PIL import Image
21 | except ImportError:
22 |     _pillow_available = False
23 |
24 |
25 | def split_barcode(barcode: str) -> List[str]:
26 |     """Split a barcode in the same way as Product Opener does to generate a
27 |     product image folder.
28 |
29 |     :param barcode: The barcode of the product. For the pro platform only,
30 |         it must be prefixed with the org ID using the format
31 |         `{ORG_ID}/{BARCODE}`
32 |     :raises ValueError: if `barcode` is invalid
33 |     :return: a list containing the split barcode parts
34 |     """
35 |     org_id = None
36 |     if "/" in barcode:
37 |         # For the pro platform, `barcode` is expected to be in the format
38 |         # `{ORG_ID}/{BARCODE}` (ex: `org-lea-nature/3307130803004`)
39 |         org_id, barcode = barcode.split("/", maxsplit=1)
40 |
41 |     if not barcode.isdigit():
42 |         raise ValueError(f"unknown barcode format: {barcode}")
43 |
44 |     # Strip leading zeros, then pad the barcode with zeros up to 13 digits
45 |     barcode = barcode.lstrip("0").zfill(13)
46 |     # Split the first 9 digits of the barcode into 3 groups of 3 digits to
47 |     # get the first 3 folder names and use the rest of the barcode as the
48 |     # last folder name
49 |     splits = [barcode[0:3], barcode[3:6], barcode[6:9], barcode[9:]]
50 |
51 |     if org_id is not None:
52 |         # For the pro platform only, images and OCRs belonging to an org
53 |         # are stored in a folder named after the org for all its products, ex:
54 |         # https://images.pro.openfoodfacts.org/images/products/org-lea-nature/330/713/080/3004/1.jpg
55 |         splits.insert(0, org_id)
56 |
57 |     return splits
58 |
59 |
60 | def _generate_file_path(code: str, image_id: str, suffix: str):
61 |     barcode_parts = split_barcode(code)
62 |     return f"/{'/'.join(barcode_parts)}/{image_id}{suffix}"
63 |
64 |
65 | def generate_image_path(code: str, image_id: str) -> str:
66 |     """Generate an image path.
67 |
68 |     It's used to generate a unique identifier of an image for a product (and
69 |     to generate a URL to fetch this image from the server).
70 |
71 |     :param code: the product barcode
72 |     :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...)
73 |     :return: the full image path
74 |     """
75 |     return _generate_file_path(code, image_id, ".jpg")
76 |
77 |
78 | def generate_json_ocr_path(code: str, image_id: str) -> str:
79 |     """Generate a JSON OCR path.
80 |
81 |     It's used to generate a unique identifier of an OCR result for a product
82 |     (and to generate a URL to fetch this OCR JSON from the server).
83 |
84 |     :param code: the product barcode
85 |     :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...)
86 |     :return: the full OCR JSON path
87 |     """
88 |     return _generate_file_path(code, image_id, ".json")
89 |
90 |
91 | def generate_json_ocr_url(
92 |     code: str,
93 |     image_id: str,
94 |     flavor: Flavor = Flavor.off,
95 |     environment: Environment = Environment.org,
96 | ) -> str:
97 |     """Generate the OCR JSON URL for a specific product and
98 |     image ID.
99 |
100 |     :param code: the product barcode
101 |     :param image_id: the image ID (ex: `1`, `2`,...)
102 |     :param flavor: the project to use, defaults to Flavor.off
103 |     :param environment: the environment (prod/staging), defaults to
104 |         Environment.org
105 |     :return: the generated JSON URL
106 |     """
107 |     return URLBuilder.image_url(
108 |         flavor, environment, generate_json_ocr_path(code, image_id)
109 |     )
110 |
111 |
112 | def generate_image_url(
113 |     code: str,
114 |     image_id: str,
115 |     flavor: Flavor = Flavor.off,
116 |     environment: Environment = Environment.org,
117 | ) -> str:
118 |     """Generate the image URL for a specific product and
119 |     image ID.
120 |
121 |     :param code: the product barcode
122 |     :param image_id: the image ID (ex: `1`, `ingredients_fr.full`,...)
123 |     :param flavor: the project to use, defaults to Flavor.off
124 |     :param environment: the environment (prod/staging), defaults to
125 |         Environment.org
126 |     :return: the generated image URL
127 |     """
128 |     return URLBuilder.image_url(
129 |         flavor, environment, generate_image_path(code, image_id)
130 |     )
131 |
132 |
133 | def extract_barcode_from_url(url: str) -> Optional[str]:
134 |     """Extract a product barcode from an image/OCR URL.
135 |
136 |     :param url: the URL
137 |     :return: the extracted barcode, or None if no barcode was found
138 |     """
139 |     url_path = urlparse(url).path
140 |     return extract_barcode_from_path(url_path)
141 |
142 |
143 | def extract_barcode_from_path(path: str) -> Optional[str]:
144 |     """Extract a product barcode from an image/OCR path.
145 |
146 |     The barcode is normalized using the following rules:
147 |
148 |     - all leading zeros are stripped
149 |     - if the barcode is less than 8 digits, it is left-padded with zeros up to
150 |       8 digits
151 |     - if the barcode is more than 8 digits but less than 13 digits, it is
152 |       left-padded with zeros up to 13 digits
153 |     - if the barcode has 13 digits or more, it's returned as is
154 |     """
155 |     barcode = ""
156 |
157 |     for parent in Path(path).parents:
158 |         if parent.name.isdigit():
159 |             barcode = parent.name + barcode
160 |         else:
161 |             break
162 |
163 |     # Strip leading zeros
164 |     barcode = barcode.lstrip("0")
165 |
166 |     if not barcode:
167 |         return None
168 |
169 |     if len(barcode) <= 8:
170 |         barcode = barcode.zfill(8)
171 |         return barcode
172 |
173 |     barcode = barcode.zfill(13)
174 |     return barcode
175 |
176 |
177 | def extract_source_from_url(url: str) -> str:
178 |     """Extract source image from an image or OCR URL.
179 |
180 |     The source image is a unique identifier of the image or OCR,
181 |     and is the full path of the image or OCR file on the server
182 |     (ex: `/008/009/637/2472/1.jpg`).
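    For example:

        >>> extract_source_from_url(
        ...     "https://images.openfoodfacts.org/images/products/008/009/637/2472/1.json"
        ... )
        '/008/009/637/2472/1.jpg'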
183 |
184 |     :param url: the URL
185 |     :return: the source image
186 |     """
187 |     url_path = urlparse(url).path
188 |
189 |     if url_path.startswith("/images/products"):
190 |         url_path = url_path[len("/images/products") :]
191 |
192 |     if url_path.endswith(".json"):
193 |         url_path = str(Path(url_path).with_suffix(".jpg"))
194 |
195 |     # normalize windows path to unix path
196 |     return url_path.replace("\\", "/")
197 |
198 |
199 | def download_image(
200 |     image: Union[str, Tuple[str, str]],
201 |     use_cache: bool = True,
202 |     error_raise: bool = True,
203 |     session: Optional[requests.Session] = None,
204 |     return_struct: bool = False,
205 | ) -> Union[None, "Image.Image", ImageDownloadItem]:
206 |     """Download an Open Food Facts image.
207 |
208 |     :param image: the image URL or a tuple containing the barcode and the
209 |         image ID
210 |     :param use_cache: whether to use the S3 dataset cache, defaults to True
211 |     :param error_raise: whether to raise an error if the download fails,
212 |         defaults to True
213 |     :param session: the requests session to use, defaults to None
214 |     :param return_struct: if True, return an `ImageDownloadItem` object
215 |         containing the image, image bytes and the response object.
216 |     :return: the downloaded image, or an `ImageDownloadItem` object if
217 |         `return_struct` is True.
218 |
219 |     >>> download_image("https://images.openfoodfacts.org/images/products/324/227/210/2359/4.jpg") # noqa
220 |
221 |
222 |     >>> download_image(("3242272102359", "4"))
223 |
224 |     """
225 |     if not _pillow_available:
226 |         raise ImportError("Pillow is required to use this function")
227 |
228 |     if isinstance(image, str):
229 |         if use_cache:
230 |             image_path = extract_source_from_url(image)
231 |             image_url = f"{AWS_S3_BASE_URL}{image_path}"
232 |
233 |             if requests.head(image_url).status_code != 200:
234 |                 logger.debug(f"Image not found in cache: {image_url}")
235 |                 image_url = image
236 |         else:
237 |             image_url = image
238 |
239 |     if isinstance(image, tuple):
240 |         if use_cache:
241 |             image_path = generate_image_path(*image)
242 |             image_url = f"{AWS_S3_BASE_URL}{image_path}"
243 |
244 |             if requests.head(image_url).status_code != 200:
245 |                 logger.debug(f"Image not found in cache: {image_url}")
246 |                 image_url = generate_image_url(*image)
247 |         else:
248 |             image_url = generate_image_url(*image)
249 |
250 |     logger.debug(f"Downloading image from {image_url}")
251 |     return get_image_from_url(
252 |         image_url,
253 |         error_raise=error_raise,
254 |         session=session,
255 |         return_struct=return_struct,
256 |     )
257 |
258 |
259 | def convert_to_legacy_schema(images: JSONType) -> JSONType:
260 |     """Convert the images dictionary to the legacy schema.
261 |
262 |     We've improved the schema of the `images` field, but the new
263 |     schema is not compatible with the legacy schema. This function
264 |     converts the new schema to the legacy schema.
265 |
266 |     It can be used while migrating the existing Python codebase to the
267 |     new schema.
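    A minimal conversion sketch (the uploader name is illustrative; the two
    schemas are described below):

        >>> convert_to_legacy_schema({
        ...     "uploaded": {"1": {"uploaded_t": 1620000000, "uploader": "alice",
        ...                        "sizes": {"full": {"h": 400, "w": 248, "url": "..."}}}},
        ... })
        {'1': {'sizes': {'full': {'h': 400, 'w': 248}}, 'uploaded_t': 1620000000, 'uploader': 'alice'}}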
268 | 269 | The new `images` schema is the following: 270 | 271 | - the `images` field contains the uploaded images under the `uploaded` 272 | key and the selected images under the `selected` key 273 | - `uploaded` contains the images that are uploaded, and maps the 274 | image ID to the detail about the image: 275 | - `uploaded_t`: the upload timestamp 276 | - `uploader`: the username of the uploader 277 | - `sizes`: dictionary mapping image size (`100`, `200`, `400`, `full`) 278 | to the information about each resized image: 279 | - `h`: the height of the image 280 | - `w`: the width of the image 281 | - `url`: the URL of the image 282 | - `selected` contains the images that are selected, and maps the 283 | image key (`nutrition`, `ingredients`, `packaging`, or `front`) to 284 | a dictionary mapping the language to the selected image details. 285 | The selected image details are the following fields: 286 | - `imgid`: the image ID 287 | - `rev`: the revision ID 288 | - `sizes`: dictionary mapping image size (`100`, `200`, `400`, `full`) 289 | to the information about each resized image: 290 | - `h`: the height of the image 291 | - `w`: the width of the image 292 | - `url`: the URL of the image 293 | - `generation`: information about how to generate the selected image 294 | from the uploaded image: 295 | - `geometry` 296 | - `x1`, `y1`, `x2`, `y2`: the coordinates of the crop 297 | - `angle`: the rotation angle of the selected image 298 | - `coordinates_image_size`: 400 or "full", indicates if the 299 | geometry coordinates are relative to the full image, or to a 300 | resized version (max width and max height=400) 301 | - `normalize`: indicates if colors should be normalized 302 | - `white_magic`: indicates if the background is white and should 303 | be removed (e.g. photo on a white sheet of paper) 304 | 305 | See https://github.com/openfoodfacts/openfoodfacts-server/pull/11818 306 | for more details. 307 | """ 308 | 309 | if not is_new_image_schema(images): 310 | return images 311 | 312 | images_with_legacy_schema = {} 313 | 314 | for image_id, image_data in images.get("uploaded", {}).items(): 315 | images_with_legacy_schema[image_id] = { 316 | "sizes": { 317 | # remove URL field 318 | size: {k: v for k, v in image_size_data.items() if k != "url"} 319 | for size, image_size_data in image_data["sizes"].items() 320 | }, 321 | "uploaded_t": image_data["uploaded_t"], 322 | "uploader": image_data["uploader"], 323 | } 324 | 325 | for selected_key, image_by_lang in images.get("selected", {}).items(): 326 | for lang, image_data in image_by_lang.items(): 327 | new_image_data = { 328 | "imgid": image_data["imgid"], 329 | "rev": image_data["rev"], 330 | "sizes": { 331 | # remove URL field 332 | size: {k: v for k, v in image_size_data.items() if k != "url"} 333 | for size, image_size_data in image_data["sizes"].items() 334 | }, 335 | **(image_data.get("generation", {})), 336 | } 337 | images_with_legacy_schema[f"{selected_key}_{lang}"] = new_image_data 338 | 339 | return images_with_legacy_schema 340 | 341 | 342 | def is_new_image_schema(images_data: JSONType) -> bool: 343 | """Return True if the `images` dictionary follows the new Product Opener 344 | images schema. 345 | 346 | See https://github.com/openfoodfacts/openfoodfacts-server/pull/11818 for 347 | more information about this new schema. 
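    For example:

        >>> is_new_image_schema({"uploaded": {}, "selected": {}})
        True
        >>> is_new_image_schema({"1": {"sizes": {}}})
        False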
348 |     """
349 |     if not images_data:
350 |         return False
351 |
352 |     return "selected" in images_data or "uploaded" in images_data
353 |
--------------------------------------------------------------------------------
/openfoodfacts/redis.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | from typing import Any, Iterator, cast
4 |
5 | from pydantic import BaseModel, Json
6 | from redis import Redis
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
11 | def get_redis_client(**kwargs) -> Redis:
12 |     return Redis(
13 |         decode_responses=True,
14 |         **kwargs,
15 |     )
16 |
17 |
18 | class ProductUpdateEvent(BaseModel):
19 |     """A class representing a product update from a Redis Stream."""
20 |
21 |     # The Redis ID of the event
22 |     id: str
23 |     # The name of the Redis stream where the update was published
24 |     # This will always be "product_updates"
25 |     stream: str
26 |     # The timestamp of the event
27 |     timestamp: datetime.datetime
28 |     # The code of the product
29 |     code: str
30 |     # The flavor of the product (off, obf, opff, off_pro)
31 |     flavor: str
32 |     # The user ID of the user who performed the action
33 |     user_id: str
34 |     # The action performed by the user (either updated or deleted)
35 |     action: str
36 |     # A comment provided by the user
37 |     comment: str
38 |     # The type of the product (food, product, petfood, beauty)
39 |     product_type: str
40 |     # A JSON object representing the differences between the old and new
41 |     # product data
42 |     diffs: Json[Any] | None = None
43 |
44 |     def is_image_upload(self) -> bool:
45 |         """Returns True if the update is an image upload."""
46 |         return bool(
47 |             self.diffs is not None
48 |             and "uploaded_images" in self.diffs
49 |             and "add" in self.diffs["uploaded_images"]
50 |         )
51 |
52 |     def is_product_type_change(self) -> bool:
53 |         """Returns True if the update contains a product type change (example:
54 |         switch from `food` to `beauty`)."""
55 |         return bool(
56 |             self.diffs is not None
57 |             and "fields" in self.diffs
58 |             and "change" in self.diffs["fields"]
59 |             and "product_type" in self.diffs["fields"]["change"]
60 |         )
61 |
62 |     def is_field_updated(self, field_name: str) -> bool:
63 |         """Returns True if the update contains a change in the specified
64 |         field."""
65 |         return (
66 |             self.diffs is not None
67 |             and "fields" in self.diffs
68 |             and "change" in self.diffs["fields"]
69 |             and field_name in self.diffs["fields"]["change"]
70 |         )
71 |
72 |     def is_field_added(self, field_name: str) -> bool:
73 |         """Returns True if the update contains an addition of the specified
74 |         field."""
75 |         return (
76 |             self.diffs is not None
77 |             and "fields" in self.diffs
78 |             and "add" in self.diffs["fields"]
79 |             and field_name in self.diffs["fields"]["add"]
80 |         )
81 |
82 |     def is_field_added_or_updated(self, field_name: str) -> bool:
83 |         """Returns True if the specified field was added or updated in this
84 |         update."""
85 |         return self.is_field_updated(field_name) or self.is_field_added(field_name)
86 |
87 |     def is_image_deletion(self) -> bool:
88 |         """Returns True if the event is an image deletion."""
89 |         return (
90 |             self.diffs is not None
91 |             and "uploaded_images" in self.diffs
92 |             and "delete" in self.diffs["uploaded_images"]
93 |         )
94 |
95 |
96 | class OCRReadyEvent(BaseModel):
97 |     """A class representing an OCR ready event from a Redis Stream.
98 |
99 |     This event is published when the OCR processing (done by Google Cloud
100 |     Vision) of an image is complete.
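    An event can be built from the raw stream payload as follows (values
    taken from the test suite; see the field list below):

        >>> OCRReadyEvent(  # doctest: +SKIP
        ...     id="1629878400001-0",
        ...     stream="ocr_ready",
        ...     timestamp=1629878400,
        ...     code="3215495849204",
        ...     product_type="beauty",
        ...     image_id="2",
        ...     json_url="https://images.openfoodfacts.org/images/products/321/549/584/9204/2.json",
        ... )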
101 |
102 |     The OCR result (JSON file) is available at the URL provided in the
103 |     `json_url` field.
104 |     """
105 |
106 |     # The Redis ID of the event
107 |     id: str
108 |     # The name of the Redis stream where the event was published
109 |     # This will always be "ocr_ready"
110 |     stream: str
111 |     # The timestamp of the event
112 |     timestamp: datetime.datetime
113 |     # The code of the product
114 |     code: str
115 |     # The type of the product (food, product, petfood, beauty)
116 |     product_type: str
117 |     # The ID of the image (ex: "1")
118 |     image_id: str
119 |     # The URL of the OCR result (JSON file)
120 |     json_url: str
121 |
122 |
123 | def get_processed_since(
124 |     redis_client: Redis,
125 |     min_id: str | datetime.datetime,
126 |     product_updates_stream_name: str = "product_updates",
127 |     ocr_ready_stream_name: str = "ocr_ready",
128 |     batch_size: int = 100,
129 | ) -> Iterator[ProductUpdateEvent | OCRReadyEvent]:
130 |     """Fetches all events (product update or OCR ready events) that have been
131 |     published since the given timestamp.
132 |
133 |     :param redis_client: the Redis client
134 |     :param min_id: the minimum ID to start from, or a datetime object
135 |     :param product_updates_stream_name: the name of the Redis stream for
136 |         product updates, defaults to "product_updates"
137 |     :param ocr_ready_stream_name: the name of the Redis stream for OCR ready
138 |         events, defaults to "ocr_ready"
139 |     :param batch_size: the size of the batch to fetch, defaults to 100
140 |     :yield: a ProductUpdateEvent or OCRReadyEvent instance for each update
141 |     """
142 |     if isinstance(min_id, datetime.datetime):
143 |         min_id = f"{int(min_id.timestamp() * 1000)}-0"
144 |
145 |     for stream_name in (
146 |         product_updates_stream_name,
147 |         ocr_ready_stream_name,
148 |     ):
149 |         stream_min_id = min_id  # each stream starts from the caller's min_id
150 |         while True:
151 |             logger.debug(
152 |                 "Fetching batch from Redis, stream %s, min_id %s, count %d",
153 |                 stream_name,
154 |                 stream_min_id,
155 |                 batch_size,
156 |             )
157 |             batch = redis_client.xrange(stream_name, min=stream_min_id, count=batch_size)
158 |             if not batch:
159 |                 break  # we reached the end of the stream
160 |
161 |             batch = cast(list[tuple[str, dict]], batch)
162 |             # Move the cursor past the last ID of the batch ("(" = exclusive)
163 |             stream_min_id = f"({batch[-1][0]}"
164 |             for timestamp_id, item in batch:
165 |                 # Get the timestamp from the ID
166 |                 timestamp = int(timestamp_id.split("-")[0])
167 |
168 |                 if stream_name == ocr_ready_stream_name:
169 |                     yield OCRReadyEvent(
170 |                         id=timestamp_id,
171 |                         timestamp=timestamp,  # type: ignore
172 |                         stream=stream_name,
173 |                         code=item["code"],
174 |                         product_type=item["product_type"],
175 |                         image_id=item["image_id"],
176 |                         json_url=item["json_url"],
177 |                     )
178 |                 else:
179 |                     yield ProductUpdateEvent(
180 |                         id=timestamp_id,
181 |                         timestamp=timestamp,  # type: ignore
182 |                         stream=stream_name,
183 |                         code=item["code"],
184 |                         flavor=item["flavor"],
185 |                         user_id=item["user_id"],
186 |                         action=item["action"],
187 |                         comment=item["comment"],
188 |                         product_type=item["product_type"],
189 |                         diffs=item.get("diffs"),
190 |                     )
191 |
192 |
193 | def get_new_updates_multistream(
194 |     redis_client: Redis,
195 |     product_updates_stream_name: str = "product_updates",
196 |     ocr_ready_stream_name: str = "ocr_ready",
197 |     min_id: str | datetime.datetime | None = "$",
198 |     batch_size: int = 100,
199 | ) -> Iterator[ProductUpdateEvent | OCRReadyEvent]:
200 |     """Reads new updates from Redis Stream, starting from the moment this
201 |     function is called.
202 |
203 |     The function will block until new updates are available.
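    Minimal consumer sketch (host and port are illustrative assumptions):

        >>> client = get_redis_client(host="localhost", port=6379)  # doctest: +SKIP
        >>> for event in get_new_updates_multistream(client):  # doctest: +SKIP
        ...     print(type(event).__name__, event.id)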
204 | 205 | :param redis_client: the Redis client. 206 | :param product_updates_stream_name: the name of the Redis stream for 207 | product updates, defaults to "product_updates". 208 | :param ocr_ready_stream_name: the name of the Redis stream for OCR ready 209 | events, defaults to "ocr_ready". 210 | :param min_id: the minimum ID to start from, defaults to "$". 211 | :param batch_size: the size of the batch to fetch, defaults to 100. 212 | :yield: a ProductUpdateEvent or OCRReadyEvent instance for each update. 213 | """ 214 | if min_id is None: 215 | min_id = "$" 216 | elif isinstance(min_id, datetime.datetime): 217 | min_id = f"{int(min_id.timestamp() * 1000)}-0" 218 | 219 | stream_names = [product_updates_stream_name, ocr_ready_stream_name] 220 | # We start from the last ID 221 | min_ids: dict[bytes | str | memoryview, int | bytes | str | memoryview] = { 222 | stream_name: min_id for stream_name in stream_names 223 | } 224 | while True: 225 | logger.debug( 226 | "Listening to new updates from streams %s (ID: %s)", stream_names, min_ids 227 | ) 228 | # We use block=0 to wait indefinitely for new updates 229 | response = redis_client.xread(streams=min_ids, block=0, count=batch_size) 230 | response = cast(list[tuple[str, list[tuple[str, dict]]]], response) 231 | # The response is a list of tuples (stream_name, batch) 232 | 233 | for stream_name, batch in response: 234 | # We update the min_id to the last ID of the batch 235 | new_min_id = batch[-1][0] 236 | min_ids[stream_name] = new_min_id 237 | for timestamp_id, item in batch: 238 | # Get the timestamp from the ID 239 | timestamp = int(timestamp_id.split("-")[0]) 240 | 241 | if stream_name == ocr_ready_stream_name: 242 | yield OCRReadyEvent( 243 | id=timestamp_id, 244 | stream=stream_name, 245 | timestamp=timestamp, # type: ignore 246 | code=item["code"], 247 | product_type=item["product_type"], 248 | image_id=item["image_id"], 249 | json_url=item["json_url"], 250 | ) 251 | else: 252 | yield ProductUpdateEvent( 253 | id=timestamp_id, 254 | stream=stream_name, 255 | timestamp=timestamp, # type: ignore 256 | code=item["code"], 257 | flavor=item["flavor"], 258 | user_id=item["user_id"], 259 | action=item["action"], 260 | comment=item["comment"], 261 | product_type=item["product_type"], 262 | diffs=item.get("diffs"), 263 | ) 264 | 265 | 266 | class UpdateListener: 267 | """A class representing a daemon that listens to events from a Redis 268 | stream and processes them. 269 | 270 | The class is meant to be subclassed to implement the processing logic. 271 | Subclasses can implement the `process_redis_update` and 272 | `process_ocr_ready` methods. 273 | """ 274 | 275 | def __init__( 276 | self, 277 | redis_client: Redis, 278 | redis_latest_id_key: str, 279 | product_updates_stream_name: str = "product_updates", 280 | ocr_ready_stream_name: str = "ocr_ready", 281 | ): 282 | self.redis_client = redis_client 283 | self.product_updates_stream_name = product_updates_stream_name 284 | self.ocr_ready_stream_name = ocr_ready_stream_name 285 | self.redis_latest_id_key = redis_latest_id_key 286 | 287 | def run(self): 288 | """Run the update import daemon. 289 | 290 | This daemon listens to the Redis stream containing information about 291 | product updates or OCR ready events, and processes them as they 292 | arrive. 
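        A minimal subclass sketch (the Redis key name is an illustrative
        assumption):

            class PrintListener(UpdateListener):
                def process_redis_update(self, event: ProductUpdateEvent):
                    print("product update:", event.code)

                def process_ocr_ready(self, event: OCRReadyEvent):
                    print("OCR ready:", event.json_url)

            PrintListener(get_redis_client(), "myapp:latest_processed_id").run()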
293 |         """
294 |         logger.info("Starting update listener daemon")
295 |
296 |         logger.info("Redis client: %s", self.redis_client)
297 |         logger.info("Pinging client...")
298 |         self.redis_client.ping()
299 |         logger.info("Connection successful")
300 |
301 |         latest_id = self.redis_client.get(self.redis_latest_id_key)
302 |
303 |         if latest_id:
304 |             logger.info(
305 |                 "Latest ID processed: %s (datetime: %s)",
306 |                 latest_id,
307 |                 datetime.datetime.fromtimestamp(int(latest_id.split("-")[0]) / 1000),
308 |             )
309 |         else:
310 |             logger.info("No latest ID found")
311 |
312 |         for event in get_new_updates_multistream(
313 |             self.redis_client,
314 |             min_id=latest_id,
315 |         ):
316 |             try:
317 |                 if isinstance(event, OCRReadyEvent):
318 |                     self.process_ocr_ready(event)
319 |                 else:
320 |                     self.process_redis_update(event)
321 |             except Exception as e:
322 |                 logger.exception(e)
323 |             self.redis_client.set(self.redis_latest_id_key, event.id)
324 |
325 |     def process_updates_since(
326 |         self, since: datetime.datetime, to: datetime.datetime | None = None
327 |     ):
328 |         """Process all the updates since the given timestamp.
329 |
330 |         :param since: the timestamp to start from
331 |         :param to: the timestamp to stop at, defaults to None
332 |             (in which case all updates published after `since` are
333 |             processed)
334 |         """
335 |         logger.info("Redis client: %s", self.redis_client)
336 |         logger.info("Pinging client...")
337 |         self.redis_client.ping()
338 |
339 |         processed = 0
340 |         for event in get_processed_since(
341 |             self.redis_client,
342 |             min_id=since,
343 |         ):
344 |             if to is not None and event.timestamp > to:
345 |                 break
346 |             if isinstance(event, OCRReadyEvent):
347 |                 self.process_ocr_ready(event)
348 |             else:
349 |                 self.process_redis_update(event)
350 |
351 |             processed += 1
352 |
353 |         logger.info("Processed %d events", processed)
354 |
355 |     def process_redis_update(self, event: ProductUpdateEvent):
356 |         pass
357 |
358 |     def process_ocr_ready(self, event: OCRReadyEvent):
359 |         pass
--------------------------------------------------------------------------------
/openfoodfacts/utils/__init__.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import gzip
3 | import json
4 | import logging
5 | import random
6 | import shutil
7 | import string
8 | import time
9 | from io import BytesIO
10 | from pathlib import Path
11 | from time import perf_counter
12 | from typing import Callable, Dict, Iterable, List, Optional, Union
13 |
14 | import requests
15 | import tqdm
16 |
17 | from ..types import COUNTRY_CODE_TO_NAME, Country, Environment, Flavor
18 |
19 | _orjson_available = True
20 | try:
21 |     import orjson
22 | except ImportError:
23 |     _orjson_available = False
24 |
25 | _pillow_available = True
26 | try:
27 |     import PIL
28 |     from PIL import Image
29 | except ImportError:
30 |     _pillow_available = False
31 |
32 | http_session = requests.Session()
33 | http_session.headers.update({"User-Agent": "openfoodfacts-python"})
34 |
35 |
36 | def configure_root_logger(
37 |     logger: logging.Logger,
38 |     level: int = logging.INFO,
39 |     formatter_string: Optional[str] = None,
40 | ):
41 |     logger.setLevel(level)
42 |     handler = logging.StreamHandler()
43 |
44 |     if formatter_string is None:
45 |         formatter_string = "%(asctime)s :: %(levelname)s :: %(message)s"
46 |
47 |     formatter = logging.Formatter(formatter_string)
48 |     handler.setFormatter(formatter)
49 |     handler.setLevel(level)
50 |     logger.addHandler(handler)
51 |     return logger
52 |
53 |
54 | def get_logger(name=None, level: int = logging.INFO) -> logging.Logger:
55 |     logger = logging.getLogger(name)
56 |     logger.setLevel(level)
57 |
58 |     if name is None:
59 |         configure_root_logger(logger, level)
60 |
61 |     return logger
62 |
63 |
64 | logger = get_logger(__name__)
65 |
66 |
67 | class URLBuilder:
68 |     """URLBuilder generates URLs for Product Opener/Robotoff.
69 |
70 |     Example usage: URLBuilder.robotoff(Environment.org) returns the Robotoff URL.
71 |     """
72 |
73 |     @staticmethod
74 |     def _get_url(
75 |         base_domain: str,
76 |         prefix: Optional[str] = "world",
77 |         tld: str = "org",
78 |         scheme: Optional[str] = None,
79 |     ):
80 |         data = {
81 |             "domain": f"{base_domain}.{tld}",
82 |             "scheme": "https",
83 |         }
84 |         if prefix:
85 |             data["prefix"] = prefix
86 |         if scheme:
87 |             data["scheme"] = scheme
88 |
89 |         if "prefix" in data:
90 |             return "%(scheme)s://%(prefix)s.%(domain)s" % data
91 |
92 |         return "%(scheme)s://%(domain)s" % data
93 |
94 |     @staticmethod
95 |     def world(flavor: Flavor, environment: Environment):
96 |         return URLBuilder._get_url(
97 |             prefix="world", tld=environment.value, base_domain=flavor.get_base_domain()
98 |         )
99 |
100 |     @staticmethod
101 |     def robotoff(environment: Environment) -> str:
102 |         return URLBuilder._get_url(
103 |             prefix="robotoff",
104 |             tld=environment.value,
105 |             base_domain=Flavor.off.get_base_domain(),
106 |         )
107 |
108 |     @staticmethod
109 |     def static(flavor: Flavor, environment: Environment) -> str:
110 |         return URLBuilder._get_url(
111 |             prefix="static", tld=environment.value, base_domain=flavor.get_base_domain()
112 |         )
113 |
114 |     @staticmethod
115 |     def image_url(flavor: Flavor, environment: Environment, image_path: str) -> str:
116 |         prefix = URLBuilder._get_url(
117 |             prefix="images", tld=environment.value, base_domain=flavor.get_base_domain()
118 |         )
119 |         return prefix + f"/images/products{image_path}"
120 |
121 |     @staticmethod
122 |     def country(flavor: Flavor, environment: Environment, country_code: str) -> str:
123 |         return URLBuilder._get_url(
124 |             prefix=country_code,
125 |             tld=environment.value,
126 |             base_domain=flavor.get_base_domain(),
127 |         )
128 |
129 |
130 | def jsonl_iter(jsonl_path: Union[str, Path]) -> Iterable[Dict]:
131 |     """Iterate over elements of a JSONL file.
132 |
133 |     :param jsonl_path: the path of the JSONL file. Both plain (.jsonl) and
134 |         gzipped (jsonl.gz) files are supported.
135 |     :yield: dict contained in the JSONL file
136 |     """
137 |     open_fn = get_open_fn(jsonl_path)
138 |
139 |     with open_fn(str(jsonl_path), "rt", encoding="utf-8") as f:
140 |         yield from jsonl_iter_fp(f)
141 |
142 |
143 | def get_open_fn(filepath: Union[str, Path]) -> Callable:
144 |     filepath = str(filepath)
145 |     if filepath.endswith(".gz"):
146 |         return gzip.open
147 |     else:
148 |         return open
149 |
150 |
151 | def jsonl_iter_fp(fp) -> Iterable[Dict]:
152 |     for line in fp:
153 |         line = line.strip("\n")
154 |         if line:
155 |             if _orjson_available:
156 |                 yield orjson.loads(line)
157 |             else:
158 |                 yield json.loads(line)
159 |
160 |
161 | def load_json(filepath: Union[str, Path]) -> Union[Dict, List]:
162 |     """Load a JSON file; gzipped JSON files are also supported.
163 |
164 |     :param filepath: the path of the file
165 |     """
166 |     open_fn = get_open_fn(filepath)
167 |     with open_fn(filepath, "rb") as f:
168 |         if _orjson_available:
169 |             return orjson.loads(f.read())
170 |         else:
171 |             return json.loads(f.read().decode("utf-8"))
172 |
173 |
174 | def _sanitize_file_path(file_path: Path, suffix: str = "") -> Path:
175 |     """An internal function to normalize cached filenames.
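    For example (POSIX path repr):

        >>> _sanitize_file_path(Path("/tmp/products.jsonl.gz"), ".json")
        PosixPath('/tmp/products_jsonl_gz.json')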
176 |
177 |     :param file_path: the cached file path
178 |     :param suffix: an optional filename suffix to add
179 |     :return: a sanitized filepath
180 |     """
181 |     return file_path.with_name(file_path.name.replace(".", "_") + suffix)
182 |
183 |
184 | def download_file(url: str, output_path: Path):
185 |     """Download a dataset file and store it in `output_path`.
186 |
187 |     The file metadata (`etag`, `url`, `created_at`) is stored in a JSON
188 |     file whose name is derived from `output_path`.
189 |     :param url: the file URL
190 |     :param output_path: the file output path
191 |     """
192 |     r = http_session.get(url, stream=True)
193 |     etag = r.headers.get("ETag", "").strip("'\"")
194 |
195 |     # add a random string to the output path to avoid concurrent writes
196 |     suffix = "".join(random.choices(string.ascii_letters, k=8))
197 |     tmp_output_path = output_path.with_name(output_path.name + f"-{suffix}.part")
198 |     with (
199 |         tmp_output_path.open("wb") as f,
200 |         tqdm.tqdm(
201 |             unit="B",
202 |             unit_scale=True,
203 |             unit_divisor=1024,
204 |             miniters=1,
205 |             desc=str(output_path),
206 |             total=int(r.headers.get("content-length", 0)),
207 |         ) as pbar,
208 |     ):
209 |         for chunk in r.iter_content(chunk_size=4096):
210 |             f.write(chunk)
211 |             pbar.update(len(chunk))
212 |
213 |     shutil.move(tmp_output_path, output_path)
214 |
215 |     _sanitize_file_path(output_path, ".json").write_text(
216 |         json.dumps(
217 |             {
218 |                 "etag": etag,
219 |                 "created_at": int(time.time()),
220 |                 "url": url,
221 |             }
222 |         )
223 |     )
224 |
225 |
226 | def get_file_etag(dataset_path: Path) -> Optional[str]:
227 |     """Return a dataset Etag.
228 |
229 |     :param dataset_path: the path of the dataset
230 |     :return: the file Etag
231 |     """
232 |     metadata_path = _sanitize_file_path(dataset_path, ".json")
233 |
234 |     if metadata_path.is_file():
235 |         return json.loads(metadata_path.read_text())["etag"]
236 |
237 |     return None
238 |
239 |
240 | def fetch_etag(url: str) -> str:
241 |     """Get the Etag of a remote file.
242 |
243 |     :param url: the file URL
244 |     :return: the Etag
245 |     """
246 |     r = http_session.head(url)
247 |     return r.headers.get("ETag", "").strip("'\"")
248 |
249 |
250 | def should_download_file(
251 |     url: str, filepath: Path, force_download: bool, download_newer: bool
252 | ) -> bool:
253 |     """Return True if the file located at `url` should be downloaded again.
254 |
255 |     :param url: the file URL
256 |     :param filepath: the file cached location
257 |     :param force_download: if True, (re)download the file even if it was
258 |         cached, defaults to False
259 |     :param download_newer: if True, download the dataset if a more recent
260 |         version compared to the cached version is available (based on file
261 |         Etag). This parameter is ignored if force_download is True, defaults
262 |         to False.
263 |     :return: True if the file should be downloaded again, False otherwise
264 |     """
265 |     if filepath.is_file():
266 |         if force_download:
267 |             # Always download the file if force_download is True
268 |             return True
269 |
270 |         if download_newer:
271 |             # Check if the file is up to date
272 |             cached_etag = get_file_etag(filepath)
273 |             current_etag = fetch_etag(url)
274 |             return cached_etag != current_etag
275 |         else:
276 |             # The file exists, no need to download it again
277 |             return False
278 |
279 |     return True
280 |
281 |
282 | def get_country_name(country: Country) -> str:
283 |     """Return the canonical country name (ex: `en:portugal`) for a `Country`."""
284 |     return COUNTRY_CODE_TO_NAME[country]
285 |
286 |
287 | class AssetLoadingException(Exception):
288 |     """Exception raised by `get_asset_from_url` when an asset cannot be fetched
289 |     from URL or if loading failed.
290 |     """
291 |
292 |     pass
293 |
294 |
295 | @dataclasses.dataclass
296 | class AssetDownloadItem:
297 |     """The result of an asset download operation.
298 |
299 |     :param url: the URL of the asset
300 |     :param response: the requests response object (or None)
301 |     :param error: the error message if an error occurred (or None)
302 |     """
303 |
304 |     url: str
305 |     response: Optional[requests.Response] = None
306 |     error: Optional[str] = None
307 |
308 |
309 | @dataclasses.dataclass
310 | class ImageDownloadItem(AssetDownloadItem):
311 |     """The result of an image download operation.
312 |
313 |     :param image: the loaded PIL image, or None if an error occurred
314 |     :param image_bytes: the image bytes, or None if an error occurred
315 |     """
316 |
317 |     image: Optional["Image.Image"] = None
318 |     image_bytes: Optional[bytes] = None
319 |
320 |
321 | def get_asset_from_url(
322 |     asset_url: str,
323 |     error_raise: bool = True,
324 |     session: Optional[requests.Session] = None,
325 |     auth: Optional[tuple[str, str]] = None,
326 | ) -> AssetDownloadItem:
327 |     try:
328 |         if session:
329 |             r = session.get(asset_url, auth=auth)
330 |         else:
331 |             r = requests.get(asset_url, auth=auth)
332 |     except (
333 |         requests.exceptions.ConnectionError,
334 |         requests.exceptions.SSLError,
335 |         requests.exceptions.Timeout,
336 |     ) as e:
337 |         error_message = "Cannot download %s"
338 |         if error_raise:
339 |             raise AssetLoadingException(error_message % asset_url) from e
340 |         logger.info(error_message, asset_url, exc_info=e)
341 |         return AssetDownloadItem(asset_url, error=error_message % asset_url)
342 |
343 |     if not r.ok:
344 |         error_message = "Cannot download %s: HTTP %s"
345 |         error_args = (asset_url, r.status_code)
346 |         if error_raise:
347 |             raise AssetLoadingException(error_message % error_args)
348 |         logger.log(
349 |             logging.INFO if r.status_code < 500 else logging.WARNING,
350 |             error_message,
351 |             *error_args,
352 |         )
353 |         return AssetDownloadItem(
354 |             asset_url, response=r, error=error_message % error_args
355 |         )
356 |
357 |     return AssetDownloadItem(asset_url, response=r)
358 |
359 |
360 | def get_image_from_url(
361 |     image_url: str,
362 |     error_raise: bool = True,
363 |     session: Optional[requests.Session] = None,
364 |     return_struct: bool = False,
365 | ) -> Union[ImageDownloadItem, "Image.Image", None]:
366 |     """Fetch an image from `image_url` and load it.
367 |
368 |     :param image_url: URL of the image to load.
369 |     :param error_raise: if True, raise an `AssetLoadingException` if an
370 |         error occurs, defaults to True. If False, None is returned if an
371 |         error occurs.
372 |     :param session: requests Session to use, by default no session is used.
373 |     :param return_struct: if True, return an `ImageDownloadItem` object
374 |         containing the image, image bytes and the response object.
375 |     :return: the loaded image, or None if an error occurred and `error_raise`
376 |         is False. If `return_struct` is True, return an `ImageDownloadItem`
377 |         object.
378 |     """
379 |     if not _pillow_available:
380 |         raise ImportError("Pillow is required to load images")
381 |
382 |     asset_item = get_asset_from_url(image_url, error_raise, session)
383 |     response = asset_item.response
384 |     if response is None or asset_item.error:
385 |         if return_struct:
386 |             return ImageDownloadItem(
387 |                 url=image_url, response=response, error=asset_item.error
388 |             )
389 |         else:
390 |             return None
391 |
392 |     content_bytes = response.content
393 |     try:
394 |         image = Image.open(BytesIO(content_bytes))
395 |         if return_struct:
396 |             return ImageDownloadItem(
397 |                 url=image_url,
398 |                 response=response,
399 |                 image=image,
400 |                 image_bytes=content_bytes,
401 |             )
402 |         return image
403 |     except PIL.UnidentifiedImageError:
404 |         error_message = f"Cannot identify image {image_url}"
405 |         if error_raise:
406 |             raise AssetLoadingException(error_message)
407 |         logger.info(error_message)
408 |     except PIL.Image.DecompressionBombError:
409 |         error_message = f"Decompression bomb error for image {image_url}"
410 |         if error_raise:
411 |             raise AssetLoadingException(error_message)
412 |         logger.info(error_message)
413 |
414 |     if return_struct:
415 |         return ImageDownloadItem(url=image_url, response=response, error=error_message)
416 |
417 |     return None
418 |
419 |
420 | class PerfTimer:
421 |     """A simple performance timer context manager."""
422 |
423 |     def __init__(
424 |         self, metric_name: Optional[str] = None, metric_dict: Optional[Dict] = None
425 |     ):
426 |         self.metric_name = metric_name
427 |         self.metric_dict = metric_dict
428 |
429 |     def __enter__(self):
430 |         self.start = perf_counter()
431 |         return self
432 |
433 |     def __exit__(self, type, value, traceback):
434 |         self.elapsed = perf_counter() - self.start
435 |         if self.metric_name and self.metric_dict is not None:
436 |             self.metric_dict[self.metric_name] = self.elapsed
437 |
--------------------------------------------------------------------------------
/openfoodfacts/ml/object_detection.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import typing
4 |
5 | import albumentations as A
6 | import cv2
7 | import numpy as np
8 | from cv2 import dnn
9 | from tritonclient.grpc import service_pb2
10 |
11 | from openfoodfacts.types import JSONType
12 | from openfoodfacts.utils import PerfTimer
13 |
14 | from .triton import add_triton_infer_input_tensor, get_triton_inference_stub
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | def object_detection_transform(
20 |     image_size: int,
21 |     fill: int = 114,
22 |     pad_position: str = "center",
23 |     normalize_mean: tuple[float, float, float] = (0.0, 0.0, 0.0),
24 |     normalize_std: tuple[float, float, float] = (1.0, 1.0, 1.0),
25 | ) -> A.Compose:
26 |     """Return the Albumentations transform pipeline for object detection.
27 |
28 |     It resizes the image to fit within a square of size (image_size,
29 |     image_size), preserving the aspect ratio, then pads the image to make it
30 |     square, and finally normalizes the image.
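    A quick shape check (sketch):

        >>> import numpy as np
        >>> transform = object_detection_transform(image_size=640)
        >>> transform(image=np.zeros((480, 320, 3), dtype=np.uint8))["image"].shape
        (640, 640, 3)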
31 | 32 | With the default settings, this pipeline matches the preprocessing used by 33 | Ultralytics YOLO models. 34 | 35 | Args: 36 | image_size (int): The target size for the longest side of the image. 37 | fill (int): The pixel value to use for padding. Default is 114. 38 | pad_position (str): The position to place the original image when 39 | padding. Default is "center". 40 | normalize_mean (tuple): The mean values for normalization. Default is 41 | (0.0, 0.0, 0.0). 42 | normalize_std (tuple): The std values for normalization. Default is 43 | (1.0, 1.0, 1.0). 44 | """ 45 | return A.Compose( 46 | [ 47 | A.LongestMaxSize(max_size=image_size, interpolation=cv2.INTER_LINEAR), 48 | A.PadIfNeeded( 49 | min_height=image_size, 50 | min_width=image_size, 51 | position=pad_position, 52 | fill=fill, 53 | ), 54 | A.Normalize(mean=normalize_mean, std=normalize_std, p=1.0), 55 | ], 56 | ) 57 | 58 | 59 | def reverse_bbox_transform( 60 | augmented_bbox: list, original_shape: tuple, image_size: int 61 | ) -> list: 62 | """ 63 | Reverses the Albumentations pipeline to find original bbox coordinates. 64 | 65 | Args: 66 | augmented_bbox (list): [y_min, x_min, y_max, x_max] from the 67 | augmented (image_size x image_size) image. 68 | original_shape (tuple): (height, width) of the *original* image. 69 | image_size (int): The target size used in the pipeline. 70 | 71 | Returns: 72 | list: [y_min, x_min, y_max, x_max] in relative coordinates. 73 | """ 74 | 75 | original_h, original_w = original_shape 76 | 77 | # --- 1. Re-calculate the forward transform parameters --- 78 | 79 | # From A.LongestMaxSize 80 | scale = image_size / max(original_h, original_w) 81 | 82 | # The dimensions of the image *after* scaling but *before* padding 83 | scaled_h = int(original_h * scale) 84 | scaled_w = int(original_w * scale) 85 | 86 | # From A.PadIfNeeded (position="center") 87 | # This is the amount of padding added to each side 88 | pad_top = (image_size - scaled_h) // 2 89 | pad_left = (image_size - scaled_w) // 2 90 | 91 | # --- 2. Apply the inverse transformation --- 92 | aug_y_min, aug_x_min, aug_y_max, aug_x_max = augmented_bbox 93 | 94 | # coord_orig = (coord_aug - padding) / scale 95 | orig_y_min = (aug_y_min - pad_top) / scale 96 | orig_x_min = (aug_x_min - pad_left) / scale 97 | orig_y_max = (aug_y_max - pad_top) / scale 98 | orig_x_max = (aug_x_max - pad_left) / scale 99 | 100 | return [ 101 | orig_y_min / original_h, 102 | orig_x_min / original_w, 103 | orig_y_max / original_h, 104 | orig_x_max / original_w, 105 | ] 106 | 107 | 108 | @dataclasses.dataclass 109 | class ObjectDetectionRawResult: 110 | """The raw result of an object detection model. 111 | 112 | Attributes: 113 | num_detections (int): The number of detections. 114 | detection_boxes (np.ndarray): The bounding boxes of the detections, in 115 | relative coordinates (between 0 and 1), with the format 116 | (y_min, x_min, y_max, x_max). 117 | detection_scores (np.ndarray): The scores of the detections. 118 | detection_classes (np.ndarray): The class indices of the detections. 119 | label_names (list[str]): The list of label names. 120 | metrics (dict[str, float]): The performance metrics of the detection. 121 | Each key is the name of the metric (a step in the inference 122 | process), and the value is the time taken in seconds. 
123 |             The following metrics are provided:
124 |             - preprocess_time: time taken to preprocess the image
125 |             - grpc_request_build_time: time taken to build the gRPC request
126 |             - triton_inference_time: time taken for Triton inference
127 |             - postprocess_time: time taken to postprocess the results
128 |             - postprocess_nms_time: time taken for Non-Maximum Suppression
129 |               (included in postprocess_time)
130 |     """
131 |
132 |     num_detections: int
133 |     detection_boxes: np.ndarray
134 |     detection_scores: np.ndarray
135 |     detection_classes: np.ndarray
136 |     label_names: list[str]
137 |     metrics: dict[str, float] = dataclasses.field(default_factory=dict)
138 |
139 |     def to_list(self) -> list[JSONType]:
140 |         """Convert the detection results to a JSON serializable format."""
141 |         results = []
142 |         for bounding_box, score, label in zip(
143 |             self.detection_boxes, self.detection_scores, self.detection_classes
144 |         ):
145 |             label_int = int(label)
146 |             label_str = self.label_names[label_int]
147 |             if label_str is not None:
148 |                 result = {
149 |                     "bounding_box": tuple(bounding_box.tolist()),  # type: ignore
150 |                     "score": float(score),
151 |                     "label": label_str,
152 |                 }
153 |                 results.append(result)
154 |         return results
155 |
156 |
157 | class ObjectDetector:
158 |     def __init__(self, model_name: str, label_names: list[str], image_size: int = 640):
159 |         """An object detector based on YOLO models.
160 |
161 |         We support models trained with YOLOv8, v9, v10, v11 and v12 from
162 |         Ultralytics.
163 |
164 |         :param model_name: the name of the model, as registered in Triton
165 |         :param label_names: the list of label names
166 |         :param image_size: the size of the input image for the model
167 |         """
168 |         self.model_name: str = model_name
169 |         self.label_names = label_names
170 |         self.image_size = image_size
171 |
172 |     def detect_from_image(
173 |         self,
174 |         image: np.ndarray,
175 |         triton_uri: str,
176 |         threshold: float = 0.5,
177 |         nms_threshold: float | None = None,
178 |         nms_eta: float | None = None,
179 |         model_version: str | None = None,
180 |     ) -> ObjectDetectionRawResult:
181 |         """Run an object detection model on an image.
182 |
183 |         The model must have been trained with the Ultralytics library.
184 |
185 |         :param image: the input numpy image
186 |         :param triton_uri: URI of the Triton Inference Server; this
187 |             parameter is required and has no default value.
188 |         :param threshold: the minimum score for a detection to be considered,
189 |             defaults to 0.5.
190 |         :param nms_threshold: the NMS (Non Maximum Suppression) threshold to
191 |             use, defaults to None (0.7 will be used).
192 |         :param nms_eta: the NMS eta parameter to use, defaults to None (1.0
193 |             will be used).
194 |         :param model_version: the version of the model to use, defaults to
195 |             None (latest).
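        Example (sketch; the model name, label list and Triton URI are
        assumptions):

            >>> detector = ObjectDetector(  # doctest: +SKIP
            ...     model_name="nutrition_table", label_names=["nutrition-table"]
            ... )
            >>> result = detector.detect_from_image(  # doctest: +SKIP
            ...     image, triton_uri="localhost:8001"
            ... )
            >>> result.to_list()  # doctest: +SKIP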
196 | :return: the detection result 197 | """ 198 | metrics: dict[str, float] = {} 199 | 200 | with PerfTimer("preprocess_time", metrics): 201 | image_array = self.preprocess(image_array=image) 202 | 203 | with PerfTimer("grpc_request_build_time", metrics): 204 | request = service_pb2.ModelInferRequest() 205 | request.model_name = self.model_name 206 | if model_version: 207 | request.model_version = model_version 208 | add_triton_infer_input_tensor( 209 | request, name="images", data=image_array, datatype="FP32" 210 | ) 211 | 212 | with PerfTimer("triton_inference_time", metrics): 213 | grpc_stub = get_triton_inference_stub(triton_uri) 214 | response = grpc_stub.ModelInfer(request) 215 | 216 | with PerfTimer("postprocess_time", metrics): 217 | original_shape = typing.cast(tuple[int, int], image.shape[:2]) 218 | response = self.postprocess( 219 | response, 220 | threshold=threshold, 221 | original_shape=original_shape, 222 | nms_threshold=nms_threshold, 223 | nms_eta=nms_eta, 224 | ) 225 | 226 | metrics.update(response.metrics) 227 | metrics["total_inference_time"] = ( 228 | metrics["preprocess_time"] 229 | + metrics["grpc_request_build_time"] 230 | + metrics["triton_inference_time"] 231 | + metrics["postprocess_time"] 232 | ) 233 | response.metrics = metrics 234 | return response 235 | 236 | def preprocess(self, image_array: np.ndarray) -> np.ndarray: 237 | # Apply the transform to the image 238 | image_array = object_detection_transform(image_size=self.image_size)( 239 | image=image_array 240 | )["image"] 241 | image_array = np.transpose(image_array, (2, 0, 1))[np.newaxis, :] # HWC to CHW 242 | return image_array 243 | 244 | def postprocess( 245 | self, 246 | response, 247 | threshold: float, 248 | original_shape: tuple[int, int], 249 | nms_threshold: float | None = None, 250 | nms_eta: float | None = None, 251 | ) -> ObjectDetectionRawResult: 252 | """Postprocess the output of the object detection model. 253 | 254 | :param response: the Triton Inference response 255 | :param threshold: the minimum score for a detection to be considered 256 | :param original_shape: the original shape of the image (height, width) 257 | :param nms_threshold: the NMS (Non Maximum Suppression) threshold to 258 | use, defaults to None (0.7 will be used). 259 | :param nms_eta: the NMS eta parameter to use, defaults to None (1.0 260 | will be used). 
261 |         :return: the detection result
262 |         """
263 |         if len(response.outputs) != 1:
264 |             raise ValueError(f"expected 1 output, got {len(response.outputs)}")
265 |
266 |         if len(response.raw_output_contents) != 1:
267 |             raise ValueError(
268 |                 f"expected 1 raw output content, got {len(response.raw_output_contents)}"
269 |             )
270 |
271 |         if nms_threshold is None:
272 |             nms_threshold = 0.7
273 |         if nms_eta is None:
274 |             nms_eta = 1.0
275 |
276 |         output_index = {output.name: i for i, output in enumerate(response.outputs)}
277 |         output = np.frombuffer(
278 |             response.raw_output_contents[output_index["output0"]],
279 |             dtype=np.float32,
280 |         ).reshape((1, len(self.label_names) + 4, -1))[0]
281 |
282 |         # output is of shape (num_classes + 4, num_candidates)
283 |         rows = output.shape[1]
284 |         raw_detection_classes = np.zeros(rows, dtype=int)
285 |         raw_detection_scores = np.zeros(rows, dtype=np.float32)
286 |         raw_detection_boxes = np.zeros((rows, 4), dtype=np.float32)
287 |
288 |         for i in range(rows):
289 |             classes_scores = output[4:, i]
290 |             max_cls_idx = np.argmax(classes_scores)
291 |             max_score = classes_scores[max_cls_idx]
292 |             if max_score < threshold:
293 |                 continue
294 |             raw_detection_classes[i] = max_cls_idx
295 |             raw_detection_scores[i] = max_score
296 |
297 |             # The bounding box is in the format (x, y, width, height) in
298 |             # relative coordinates
299 |             # x and y are the coordinates of the center of the bounding box
300 |             bbox_width = output[2, i]
301 |             bbox_height = output[3, i]
302 |             x_min = output[0, i] - 0.5 * bbox_width
303 |             y_min = output[1, i] - 0.5 * bbox_height
304 |             x_max = x_min + bbox_width
305 |             y_max = y_min + bbox_height
306 |
307 |             # We save the bounding box in the format
308 |             # (y_min, x_min, y_max, x_max) in relative coordinates
309 |             # Scale the bounding boxes back to the original image size
310 |
311 |             reversed_bboxes = reverse_bbox_transform(
312 |                 augmented_bbox=[y_min, x_min, y_max, x_max],
313 |                 original_shape=original_shape,
314 |                 image_size=self.image_size,
315 |             )
316 |             raw_detection_boxes[i, 0] = max(0.0, min(1.0, reversed_bboxes[0]))
317 |             raw_detection_boxes[i, 1] = max(0.0, min(1.0, reversed_bboxes[1]))
318 |             raw_detection_boxes[i, 2] = max(0.0, min(1.0, reversed_bboxes[2]))
319 |             raw_detection_boxes[i, 3] = max(0.0, min(1.0, reversed_bboxes[3]))
320 |
321 |         metrics: dict[str, float] = {}
322 |         with PerfTimer("postprocess_nms_time", metrics):
323 |             # Perform NMS (Non Maximum Suppression)
324 |             detection_box_indices = dnn.NMSBoxes(
325 |                 raw_detection_boxes,  # type: ignore
326 |                 raw_detection_scores,  # type: ignore
327 |                 score_threshold=threshold,
328 |                 # the following values are copied from Ultralytics settings
329 |                 nms_threshold=nms_threshold,
330 |                 eta=nms_eta,
331 |             )
332 |         detection_classes = np.zeros(len(detection_box_indices), dtype=int)
333 |         detection_scores = np.zeros(len(detection_box_indices), dtype=np.float32)
334 |         detection_boxes = np.zeros((len(detection_box_indices), 4), dtype=np.float32)
335 |
336 |         for i, idx in enumerate(detection_box_indices):
337 |             detection_classes[i] = raw_detection_classes[idx]
338 |             detection_scores[i] = raw_detection_scores[idx]
339 |             detection_boxes[i] = raw_detection_boxes[idx]
340 |
341 |         result = ObjectDetectionRawResult(
342 |             num_detections=len(detection_box_indices),  # detections kept after NMS
343 |             detection_classes=detection_classes,
344 |             detection_boxes=detection_boxes,
345 |             detection_scores=detection_scores,
346 |             label_names=self.label_names,
347 |             metrics=metrics,
348 |         )
349 |         return result
350 |
-------------------------------------------------------------------------------- /tests/unit/test_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | import pytest 5 | import requests_mock 6 | 7 | import openfoodfacts 8 | 9 | TEST_USER_AGENT = "test_off_python" 10 | 11 | 12 | class TestProducts: 13 | def test_get_product(self): 14 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 15 | code = "1223435" 16 | response_data = { 17 | "product": {"code": "1223435"}, 18 | "status": 1, 19 | "status_verbose": "product found", 20 | } 21 | with requests_mock.mock() as mock: 22 | mock.get( 23 | f"https://world.openfoodfacts.org/api/v2/product/{code}", 24 | text=json.dumps(response_data), 25 | ) 26 | res = api.product.get(code) 27 | assert res == response_data["product"] 28 | 29 | def test_get_product_missing(self): 30 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 31 | code = "1223435" 32 | response_data = { 33 | "status": 0, 34 | "status_verbose": "product not found", 35 | } 36 | with requests_mock.mock() as mock: 37 | mock.get( 38 | f"https://world.openfoodfacts.org/api/v2/product/{code}", 39 | text=json.dumps(response_data), 40 | status_code=404, 41 | ) 42 | res = api.product.get(code) 43 | assert res is None 44 | 45 | def test_get_product_with_fields(self): 46 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 47 | code = "1223435" 48 | response_data = { 49 | "product": {"code": "1223435"}, 50 | "status": 1, 51 | "status_verbose": "product found", 52 | } 53 | with requests_mock.mock() as mock: 54 | mock.get( 55 | f"https://world.openfoodfacts.org/api/v2/product/{code}", 56 | text=json.dumps(response_data), 57 | ) 58 | res = api.product.get(code, fields=["code"]) 59 | assert res == response_data["product"] 60 | assert mock.last_request.qs["fields"] == ["code"] 61 | 62 | def test_get_product_invalid_code(self): 63 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 64 | code = "84800002930392025252502520502" 65 | response_data = { 66 | "status": 0, 67 | "status_verbose": "no code or invalid code", 68 | } 69 | with requests_mock.mock() as mock: 70 | mock.get( 71 | f"https://world.openfoodfacts.org/api/v2/product/{code}", 72 | text=json.dumps(response_data), 73 | status_code=200, 74 | ) 75 | res = api.product.get(code) 76 | assert res is None 77 | 78 | with pytest.raises( 79 | ValueError, 80 | match="invalid barcode: 84800002930392025252502520502", 81 | ): 82 | api.product.get(code, raise_if_invalid=True) 83 | 84 | def test_text_search(self): 85 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 86 | with requests_mock.mock() as mock: 87 | response_data = {"products": ["kinder bueno"], "count": 1} 88 | mock.get( 89 | "https://world.openfoodfacts.org/cgi/search.pl?" 90 | + "search_terms=kinder+bueno&json=1&page=" 91 | + "1&page_size=20", 92 | text=json.dumps(response_data), 93 | ) 94 | res = api.product.text_search("kinder bueno") 95 | assert res["products"] == ["kinder bueno"] 96 | response_data = {"products": ["banania", "banania big"], "count": 2} 97 | mock.get( 98 | "https://world.openfoodfacts.org/cgi/search.pl?" 
99 | + "search_terms=banania&json=1&page=" 100 | + "2&page_size=10&sort_by=unique_scans", 101 | text=json.dumps(response_data), 102 | ) 103 | res = api.product.text_search( 104 | "banania", page=2, page_size=10, sort_by="unique_scans" 105 | ) 106 | assert res["products"] == ["banania", "banania big"] 107 | 108 | def test_parse_ingredients(self): 109 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 110 | ingredients_data = [ 111 | { 112 | "ciqual_food_code": "18066", 113 | "ecobalyse_code": "tap-water", 114 | "id": "en:water", 115 | "is_in_taxonomy": 1, 116 | "percent_estimate": 75, 117 | "percent_max": 100, 118 | "percent_min": 50, 119 | "text": "eau", 120 | "vegan": "yes", 121 | "vegetarian": "yes", 122 | }, 123 | { 124 | "ciqual_proxy_food_code": "31016", 125 | "ecobalyse_code": "sugar", 126 | "id": "en:sugar", 127 | "is_in_taxonomy": 1, 128 | "percent_estimate": 25, 129 | "percent_max": 50, 130 | "percent_min": 0, 131 | "text": "sucre", 132 | "vegan": "yes", 133 | "vegetarian": "yes", 134 | }, 135 | ] 136 | with requests_mock.mock() as mock: 137 | response_data = { 138 | "product": {"ingredients": ingredients_data}, 139 | "status": "success", 140 | } 141 | mock.patch( 142 | "https://world.openfoodfacts.org/api/v3/product/test", 143 | text=json.dumps(response_data), 144 | ) 145 | res = api.product.parse_ingredients("eau, sucre", lang="fr") 146 | assert res == ingredients_data 147 | 148 | def test_parse_ingredients_fail(self): 149 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 150 | with requests_mock.mock() as mock: 151 | response_data = { 152 | "status": "fail", 153 | } 154 | mock.patch( 155 | "https://world.openfoodfacts.org/api/v3/product/test", 156 | text=json.dumps(response_data), 157 | ) 158 | 159 | with pytest.raises( 160 | RuntimeError, 161 | match="Unable to parse ingredients: {'status': 'fail'}", 162 | ): 163 | api.product.parse_ingredients("eau, sucre", lang="fr") 164 | 165 | def test_parse_ingredients_fail_non_HTTP_200(self): 166 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 167 | with requests_mock.mock() as mock: 168 | mock.patch( 169 | "https://world.openfoodfacts.org/api/v3/product/test", 170 | status_code=400, 171 | text='{"error": "Bad Request"}', 172 | ) 173 | 174 | with pytest.raises( 175 | RuntimeError, 176 | match=re.escape( 177 | 'Unable to parse ingredients (non-200 status code): 400, {"error": "Bad Request"}' 178 | ), 179 | ): 180 | api.product.parse_ingredients("eau, sucre", lang="fr") 181 | 182 | def test_upload_image_success(self): 183 | api = openfoodfacts.API( 184 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 185 | ) 186 | code = "1223435" 187 | response_data = { 188 | "code": "1223435", 189 | "errors": [], 190 | "product": { 191 | "images": { 192 | "uploaded": { 193 | "1": { 194 | "imgid": 1, 195 | "sizes": { 196 | "100": {"h": 100, "w": 62}, 197 | "400": {"h": 400, "w": 248}, 198 | "full": {"h": 400, "w": 248}, 199 | }, 200 | "uploaded_t": 1758793764, 201 | "uploader": "test", 202 | } 203 | } 204 | } 205 | }, 206 | "result": { 207 | "id": "image_uploaded", 208 | "lc_name": "Image uploaded", 209 | "name": "Image uploaded", 210 | }, 211 | "status": "success", 212 | "warnings": [], 213 | } 214 | with requests_mock.mock() as mock: 215 | mock.post( 216 | f"https://world.openfoodfacts.org/api/v3/product/{code}/images", 217 | text=json.dumps(response_data), 218 | status_code=200, 219 | ) 220 | res = api.product.upload_image(code, image_data_base64="dGVzdA==") 221 | assert 
res.status_code == 200 222 | assert mock.last_request.json() == { 223 | "image_data_base64": "dGVzdA==", 224 | "user_id": "test", 225 | "password": "test", 226 | } 227 | 228 | def test_upload_image_with_selected(self): 229 | api = openfoodfacts.API( 230 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 231 | ) 232 | code = "1223435" 233 | response_data = { 234 | "code": "1223435", 235 | "errors": [], 236 | "product": { 237 | "images": { 238 | "selected": { 239 | "front": { 240 | "en": { 241 | "generation": {}, 242 | "imgid": 1, 243 | "rev": 2, 244 | "sizes": { 245 | "100": {"h": 100, "w": 62}, 246 | "200": {"h": 200, "w": 124}, 247 | "400": {"h": 400, "w": 248}, 248 | "full": {"h": 400, "w": 248}, 249 | }, 250 | } 251 | } 252 | }, 253 | "uploaded": { 254 | "1": { 255 | "imgid": 1, 256 | "sizes": { 257 | "100": {"h": 100, "w": 62}, 258 | "400": {"h": 400, "w": 248}, 259 | "full": {"h": 400, "w": 248}, 260 | }, 261 | "uploaded_t": 1758793852, 262 | "uploader": "test", 263 | } 264 | }, 265 | } 266 | }, 267 | "result": { 268 | "id": "image_uploaded", 269 | "lc_name": "Image uploaded", 270 | "name": "Image uploaded", 271 | }, 272 | "status": "success", 273 | "warnings": [], 274 | } 275 | with requests_mock.mock() as mock: 276 | mock.post( 277 | f"https://world.openfoodfacts.org/api/v3/product/{code}/images", 278 | text=json.dumps(response_data), 279 | status_code=200, 280 | ) 281 | res = api.product.upload_image( 282 | code, image_data_base64="dGVzdA==", selected={"front": {"en": {}}} 283 | ) 284 | assert res.status_code == 200 285 | assert mock.last_request.json() == { 286 | "image_data_base64": "dGVzdA==", 287 | "user_id": "test", 288 | "password": "test", 289 | "selected": {"front": {"en": {}}}, 290 | } 291 | 292 | def test_upload_image_no_auth(self): 293 | api = openfoodfacts.API(user_agent=TEST_USER_AGENT, version="v2") 294 | code = "1223435" 295 | with pytest.raises( 296 | ValueError, 297 | match="a password or a session cookie is required to upload an image", 298 | ): 299 | api.product.upload_image(code, image_data_base64="dGVzdA==") 300 | 301 | def test_upload_image_invalid_code(self): 302 | api = openfoodfacts.API( 303 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 304 | ) 305 | code = "invalidcode" 306 | with pytest.raises( 307 | ValueError, 308 | match="code must be a numeric string", 309 | ): 310 | api.product.upload_image(code, image_data_base64="dGVzdA==") 311 | 312 | def test_upload_image_no_data(self): 313 | api = openfoodfacts.API( 314 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 315 | ) 316 | code = "1223435" 317 | with pytest.raises( 318 | ValueError, 319 | match="one of image_path or image_data_base64 must be provided", 320 | ): 321 | api.product.upload_image(code) 322 | 323 | def test_upload_image_both_data(self): 324 | api = openfoodfacts.API( 325 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 326 | ) 327 | code = "1223435" 328 | with pytest.raises( 329 | ValueError, 330 | match="only one of image_path or image_data_base64 must be provided", 331 | ): 332 | api.product.upload_image( 333 | code, image_path="path/to/image.jpg", image_data_base64="dGVzdA==" 334 | ) 335 | 336 | def test_upload_image_invalid_selected(self): 337 | api = openfoodfacts.API( 338 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 339 | ) 340 | code = "1223435" 341 | with pytest.raises( 342 | ValueError, 343 | match=re.escape( 344 | "invalid image field name 
in selected: wrong (must be one of front, ingredients, nutrition, packaging)" 345 | ), 346 | ): 347 | api.product.upload_image( 348 | code, image_data_base64="dGVzdA==", selected={"wrong": {}} 349 | ) 350 | 351 | def test_upload_image_with_path(self, tmp_path): 352 | api = openfoodfacts.API( 353 | user_agent=TEST_USER_AGENT, version="v2", username="test", password="test" 354 | ) 355 | code = "1223435" 356 | response_data = { 357 | "code": "1223435", 358 | "errors": [], 359 | "product": { 360 | "images": { 361 | "uploaded": { 362 | "1": { 363 | "imgid": 1, 364 | "sizes": { 365 | "100": {"h": 100, "w": 62}, 366 | "400": {"h": 400, "w": 248}, 367 | "full": {"h": 400, "w": 248}, 368 | }, 369 | "uploaded_t": 1758793764, 370 | "uploader": "test", 371 | } 372 | } 373 | } 374 | }, 375 | "result": { 376 | "id": "image_uploaded", 377 | "lc_name": "Image uploaded", 378 | "name": "Image uploaded", 379 | }, 380 | "status": "success", 381 | "warnings": [], 382 | } 383 | image_path = tmp_path / "test_image.jpg" 384 | image_path.write_bytes(b"test") 385 | with requests_mock.mock() as mock: 386 | mock.post( 387 | f"https://world.openfoodfacts.org/api/v3/product/{code}/images", 388 | text=json.dumps(response_data), 389 | status_code=200, 390 | ) 391 | res = api.product.upload_image(code, image_path=image_path) 392 | assert res.status_code == 200 393 | assert mock.last_request.json() == { 394 | "image_data_base64": "dGVzdA==", 395 | "user_id": "test", 396 | "password": "test", 397 | } 398 | -------------------------------------------------------------------------------- /openfoodfacts/taxonomy.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Dict, Iterable, List, Optional, Set, Union 3 | 4 | import requests 5 | 6 | from openfoodfacts.utils.text import get_tag, replace_lang_prefix 7 | 8 | from .types import Environment, Flavor, JSONType, TaxonomyType 9 | from .utils import ( 10 | URLBuilder, 11 | download_file, 12 | get_logger, 13 | http_session, 14 | load_json, 15 | should_download_file, 16 | ) 17 | 18 | logger = get_logger(__name__) 19 | 20 | 21 | DEFAULT_CACHE_DIR = Path("~/.cache/openfoodfacts/taxonomy").expanduser() 22 | 23 | 24 | # Only available for Open Food Facts for now (not other flavors) 25 | TAXONOMY_URLS = { 26 | TaxonomyType.category: URLBuilder.static(Flavor.off, Environment.org) 27 | + "/data/taxonomies/categories.full.json", 28 | TaxonomyType.ingredient: URLBuilder.static(Flavor.off, Environment.org) 29 | + "/data/taxonomies/ingredients.full.json", 30 | TaxonomyType.label: URLBuilder.static(Flavor.off, Environment.org) 31 | + "/data/taxonomies/labels.full.json", 32 | TaxonomyType.brand: URLBuilder.static(Flavor.off, Environment.org) 33 | + "/data/taxonomies/brands.full.json", 34 | TaxonomyType.packaging_shape: URLBuilder.static(Flavor.off, Environment.org) 35 | + "/data/taxonomies/packaging_shapes.full.json", 36 | TaxonomyType.packaging_material: URLBuilder.static(Flavor.off, Environment.org) 37 | + "/data/taxonomies/packaging_materials.full.json", 38 | TaxonomyType.packaging_recycling: URLBuilder.static(Flavor.off, Environment.org) 39 | + "/data/taxonomies/packaging_recycling.full.json", 40 | TaxonomyType.country: URLBuilder.static(Flavor.off, Environment.org) 41 | + "/data/taxonomies/countries.full.json", 42 | TaxonomyType.store: URLBuilder.static(Flavor.off, Environment.org) 43 | + "/data/taxonomies/stores.full.json", 44 | TaxonomyType.nova_group: URLBuilder.static(Flavor.off, Environment.org) 45 
| + "/data/taxonomies/nova_groups.full.json", 46 | TaxonomyType.additive: URLBuilder.static(Flavor.off, Environment.org) 47 | + "/data/taxonomies/additives.full.json", 48 | TaxonomyType.vitamin: URLBuilder.static(Flavor.off, Environment.org) 49 | + "/data/taxonomies/vitamins.full.json", 50 | TaxonomyType.mineral: URLBuilder.static(Flavor.off, Environment.org) 51 | + "/data/taxonomies/minerals.full.json", 52 | TaxonomyType.amino_acid: URLBuilder.static(Flavor.off, Environment.org) 53 | + "/data/taxonomies/amino_acids.full.json", 54 | TaxonomyType.nucleotide: URLBuilder.static(Flavor.off, Environment.org) 55 | + "/data/taxonomies/nucleotides.full.json", 56 | TaxonomyType.allergen: URLBuilder.static(Flavor.off, Environment.org) 57 | + "/data/taxonomies/allergens.full.json", 58 | TaxonomyType.state: URLBuilder.static(Flavor.off, Environment.org) 59 | + "/data/taxonomies/states.full.json", 60 | TaxonomyType.data_quality: URLBuilder.static(Flavor.off, Environment.org) 61 | + "/data/taxonomies/data_quality.full.json", 62 | TaxonomyType.origin: URLBuilder.static(Flavor.off, Environment.org) 63 | + "/data/taxonomies/origins.full.json", 64 | TaxonomyType.language: URLBuilder.static(Flavor.off, Environment.org) 65 | + "/data/taxonomies/languages.full.json", 66 | TaxonomyType.other_nutritional_substance: URLBuilder.static( 67 | Flavor.off, Environment.org 68 | ) 69 | + "/data/taxonomies/other_nutritional_substances.full.json", 70 | } 71 | 72 | 73 | class TaxonomyNode: 74 | """A taxonomy element. 75 | 76 | Each node has 0+ parents and 0+ children. Each node has the following 77 | attributes: 78 | 79 | - `id`: the node identifier, it starts with a language prefix (ex: `en:`) 80 | - `names`: a dict mapping language 2-letter code to the node name for this 81 | language 82 | - `parents`: the list of the node parents 83 | - `children`: the list of the node children 84 | - `properties`: additional properties of the node (taxonomy-dependent) 85 | - `synonyms`: a dict mapping language 2-letter code to a list of synonyms 86 | for this language 87 | """ 88 | 89 | __slots__ = ("id", "names", "parents", "children", "synonyms", "properties") 90 | 91 | def __init__( 92 | self, 93 | identifier: str, 94 | names: Dict[str, str], 95 | synonyms: Optional[Dict[str, List[str]]], 96 | properties: Optional[Dict[str, Any]] = None, 97 | ): 98 | self.id: str = identifier 99 | self.names: Dict[str, str] = names 100 | self.parents: List["TaxonomyNode"] = [] 101 | self.children: List["TaxonomyNode"] = [] 102 | self.properties = properties or {} 103 | 104 | if synonyms: 105 | self.synonyms = synonyms 106 | else: 107 | self.synonyms = {} 108 | 109 | def is_child_of(self, item: "TaxonomyNode") -> bool: 110 | """Return True if `item` is a child of `self` in the taxonomy.""" 111 | if not self.parents: 112 | return False 113 | 114 | if item in self.parents: 115 | return True 116 | 117 | for parent in self.parents: 118 | is_parent = parent.is_child_of(item) 119 | 120 | if is_parent: 121 | return True 122 | 123 | return False 124 | 125 | def is_parent_of(self, candidate: "TaxonomyNode") -> bool: 126 | """Return True if `self` is parent of `candidate`, False otherwise. 127 | 128 | :param candidate: a TaxonomyNode of the same Taxonomy 129 | """ 130 | return candidate.is_child_of(self) 131 | 132 | def is_parent_of_any(self, candidates: Iterable["TaxonomyNode"]) -> bool: 133 | """Return True if `self` is a parent of any of `candidates`, False 134 | otherwise. 
135 | 136 | :param candidates: an iterable of TaxonomyNodes of the same Taxonomy 137 | """ 138 | for candidate in candidates: 139 | if candidate.is_child_of(self): 140 | return True 141 | 142 | return False 143 | 144 | def get_parents_hierarchy(self) -> List["TaxonomyNode"]: 145 | """Return the list of all parent nodes (direct and indirect).""" 146 | all_parents = [] 147 | seen: Set[str] = set() 148 | 149 | if not self.parents: 150 | return [] 151 | 152 | for self_parent in self.parents: 153 | if self_parent.id not in seen: 154 | all_parents.append(self_parent) 155 | seen.add(self_parent.id) 156 | 157 | for parent_parent in self_parent.get_parents_hierarchy(): 158 | if parent_parent.id not in seen: 159 | all_parents.append(parent_parent) 160 | seen.add(parent_parent.id) 161 | 162 | return all_parents 163 | 164 | def get_localized_name(self, lang: str) -> str: 165 | """Return the localized name of the node. 166 | 167 | We first check if there is an entry in `names` under the provided 168 | `lang`. Otherwise, we check the existence of an international name 169 | (`xx`). We eventually return the node ID if none of the previous 170 | checks were successful. 171 | 172 | :param lang: the language code 173 | """ 174 | if lang in self.names: 175 | return self.names[lang] 176 | 177 | if "xx" in self.names: 178 | # Return international name if it exists 179 | return self.names["xx"] 180 | 181 | return self.id 182 | 183 | def get_synonyms(self, lang: str) -> List[str]: 184 | return self.synonyms.get(lang, []) 185 | 186 | def add_parents(self, parents: Iterable["TaxonomyNode"]): 187 | for parent in parents: 188 | if parent not in self.parents: 189 | self.parents.append(parent) 190 | parent.children.append(self) 191 | 192 | def to_dict(self) -> JSONType: 193 | return {"name": self.names, "parents": [p.id for p in self.parents]} 194 | 195 | def __repr__(self): 196 | return "<TaxonomyNode %s>" % self.id 197 | 198 | 199 | class Taxonomy: 200 | """A class representing a taxonomy. 201 | 202 | For more information about taxonomies, see 203 | https://wiki.openfoodfacts.org/Global_taxonomies. 204 | 205 | A Taxonomy instance has only a single `nodes` attribute, which maps the 206 | node identifier to a `TaxonomyNode`. 207 | """ 208 | 209 | def __init__(self) -> None: 210 | self.nodes: Dict[str, TaxonomyNode] = {} 211 | 212 | def add(self, key: str, node: TaxonomyNode) -> None: 213 | """Add a node to the taxonomy under the id `key`. 214 | 215 | :param key: The node id 216 | :param node: the TaxonomyNode 217 | """ 218 | self.nodes[key] = node 219 | 220 | def __contains__(self, item: str): 221 | """Return True if `item` (a taxonomy id) is in the taxonomy, False 222 | otherwise.""" 223 | return item in self.nodes 224 | 225 | def __getitem__(self, item: str): 226 | return self.nodes.get(item) 227 | 228 | def __len__(self) -> int: 229 | """Return the number of items in the taxonomy.""" 230 | return len(self.nodes) 231 | 232 | def iter_nodes(self) -> Iterable[TaxonomyNode]: 233 | """Iterate over the nodes of the taxonomy.""" 234 | return iter(self.nodes.values()) 235 | 236 | def keys(self) -> Iterable[str]: 237 | """Return all node IDs from the taxonomy.""" 238 | return self.nodes.keys() 239 | 240 | def find_deepest_nodes(self, nodes: List[TaxonomyNode]) -> List[TaxonomyNode]: 241 | """Given a list of nodes, return the list of nodes where all the 242 | parents within the list have been removed.
243 | 244 | For example, for a taxonomy 'fish' -> 'salmon' -> 'smoked-salmon': 245 | 246 | ['fish', 'salmon'] -> ['salmon'] 247 | ['fish', 'smoked-salmon'] -> ['smoked-salmon'] 248 | """ 249 | excluded: Set[str] = set() 250 | 251 | for node in nodes: 252 | for second_node in ( 253 | n for n in nodes if n.id not in excluded and n.id != node.id 254 | ): 255 | if node.is_child_of(second_node): 256 | excluded.add(second_node.id) 257 | 258 | return [node for node in nodes if node.id not in excluded] 259 | 260 | def is_parent_of_any( 261 | self, item: str, candidates: Iterable[str], raises: bool = True 262 | ) -> bool: 263 | """Return True if `item` is a parent of any candidate, False otherwise. 264 | 265 | If the item is not in the taxonomy and raises is False, return False. 266 | 267 | :param item: The item to compare 268 | :param candidates: A list of candidates 269 | :param raises: if True, raises a ValueError if item is not in the 270 | taxonomy, defaults to True. 271 | """ 272 | node: Optional[TaxonomyNode] = self[item] 273 | 274 | if node is None: 275 | if raises: 276 | raise ValueError("unknown id in taxonomy: %s" % item) 277 | else: 278 | return False 279 | 280 | to_check_nodes: Set[TaxonomyNode] = set() 281 | 282 | for candidate in candidates: 283 | candidate_node = self[candidate] 284 | 285 | if candidate_node is not None: 286 | to_check_nodes.add(candidate_node) 287 | 288 | return node.is_parent_of_any(to_check_nodes) 289 | 290 | def get_localized_name(self, key: str, lang: str) -> str: 291 | """Return the name of a taxonomy element in a given language. 292 | 293 | If `key` is not in the taxonomy or if no name is available for the 294 | requested language, return `key`. 295 | 296 | :param key: the taxonomy element id 297 | :param lang: the 2-letter language code 298 | :return: the localized name 299 | """ 300 | if key not in self.nodes: 301 | return key 302 | 303 | return self.nodes[key].get_localized_name(lang) 304 | 305 | def to_dict(self) -> JSONType: 306 | """Generate a dict from the Taxonomy.""" 307 | export = {} 308 | 309 | for key, node in self.nodes.items(): 310 | export[key] = node.to_dict() 311 | 312 | return export 313 | 314 | @classmethod 315 | def from_dict(cls, data: JSONType) -> "Taxonomy": 316 | """Create a Taxonomy from `data`. 317 | 318 | :param data: the taxonomy as a dict 319 | :return: a Taxonomy 320 | """ 321 | taxonomy = Taxonomy() 322 | 323 | for key, key_data in data.items(): 324 | if key not in taxonomy: 325 | node = TaxonomyNode( 326 | identifier=key, 327 | names=key_data.get("name", {}), 328 | synonyms=key_data.get("synonyms", None), 329 | properties={ 330 | k: v 331 | for k, v in key_data.items() 332 | if k not in {"parents", "name", "synonyms", "children"} 333 | }, 334 | ) 335 | taxonomy.add(key, node) 336 | 337 | for key, key_data in data.items(): 338 | node = taxonomy[key] 339 | parents = [taxonomy[ref] for ref in key_data.get("parents", [])] 340 | node.add_parents(parents) 341 | 342 | return taxonomy 343 | 344 | @classmethod 345 | def from_path(cls, file_path: Union[str, Path]) -> "Taxonomy": 346 | """Create a Taxonomy from a JSON file. 347 | 348 | :param file_path: a JSON file, gzipped (.json.gz) files are supported 349 | :return: a Taxonomy 350 | """ 351 | return cls.from_dict(load_json(file_path)) # type: ignore 352 | 353 | @classmethod 354 | def from_url( 355 | cls, url: str, session: Optional[requests.Session] = None, timeout: int = 120 356 | ) -> "Taxonomy": 357 | """Create a Taxonomy from a taxonomy file hosted at `url`.
358 | 359 | :param url: the URL of the taxonomy 360 | :param session: the requests session, use a default session if None 361 | :param timeout: the request timeout, defaults to 120 362 | :return: a Taxonomy 363 | """ 364 | session = http_session if session is None else session 365 | r = session.get(url, timeout=timeout) 366 | data = r.json() 367 | return cls.from_dict(data) 368 | 369 | @classmethod 370 | def from_type(cls, taxonomy_type: TaxonomyType) -> "Taxonomy": 371 | """Create a Taxonomy from the taxonomy file hosted online for the 372 | given taxonomy type. 373 | 374 | :param taxonomy_type: the taxonomy type 375 | :return: a Taxonomy 376 | """ 377 | url = TAXONOMY_URLS[TaxonomyType[taxonomy_type]] 378 | return cls.from_url(url) 379 | 380 | 381 | def get_taxonomy( 382 | taxonomy_type: Union[TaxonomyType, str], 383 | force_download: bool = False, 384 | download_newer: bool = False, 385 | cache_dir: Optional[Path] = None, 386 | ) -> Taxonomy: 387 | """Return the taxonomy of the provided type. 388 | 389 | The taxonomy file is downloaded and cached locally. 390 | 391 | :param taxonomy_type: the requested taxonomy type 392 | :param force_download: if True, (re)download the taxonomy even if it was 393 | cached, defaults to False 394 | :param download_newer: if True, download the taxonomy if a more recent 395 | version compared to the cached version is available (based on the file 396 | ETag). This parameter is ignored if force_download is True, defaults 397 | to False. 398 | :param cache_dir: the cache directory to use, defaults to 399 | ~/.cache/openfoodfacts/taxonomy 400 | :return: a Taxonomy 401 | """ 402 | taxonomy_type = TaxonomyType[taxonomy_type] 403 | filename = f"{taxonomy_type.name}.json" 404 | 405 | cache_dir = DEFAULT_CACHE_DIR if cache_dir is None else cache_dir 406 | taxonomy_path = cache_dir / filename 407 | url = TAXONOMY_URLS[taxonomy_type] 408 | 409 | if not should_download_file(url, taxonomy_path, force_download, download_newer): 410 | return Taxonomy.from_path(taxonomy_path) 411 | 412 | cache_dir.mkdir(parents=True, exist_ok=True) 413 | logger.info("Downloading taxonomy, saving it in %s", taxonomy_path) 414 | download_file(url, taxonomy_path) 415 | return Taxonomy.from_path(taxonomy_path) 416 | 417 | 418 | def create_taxonomy_mapping(taxonomy: Taxonomy) -> Dict[str, str]: 419 | """From a taxonomy, create a mapping of tags to taxonomy node ids. 420 | 421 | The mapping is created by iterating over the nodes of the taxonomy and 422 | creating a tag from the name and synonyms of each node. 423 | 424 | The taxonomy mapping has the following format: 425 | { 426 | "fr:noix": "en:nuts", 427 | "en:nuts": "en:nuts", 428 | ...
429 | } 430 | 431 | :param taxonomy: the taxonomy to use 432 | :return: a dict mapping tags (with language prefix) to taxonomy node ids 433 | """ 434 | mapping = {} 435 | for node in taxonomy.iter_nodes(): 436 | for lang, name in node.names.items(): 437 | tag = get_tag(name) 438 | tag_id = f"{lang}:{tag}".lower() 439 | mapping[tag_id] = node.id 440 | 441 | for lang, synonyms in node.synonyms.items(): 442 | for synonym in synonyms: 443 | tag = get_tag(synonym) 444 | tag_id = f"{lang}:{tag}".lower() 445 | mapping[tag_id] = node.id 446 | return mapping 447 | 448 | 449 | def is_prefixed_value(value: str) -> bool: 450 | """Return True if the given value has a language prefix (en:, fr:,...), 451 | False otherwise.""" 452 | return len(value) > 3 and value[2] == ":" 453 | 454 | 455 | def create_brand_taxonomy_mapping(taxonomy: Taxonomy) -> Dict[str, str]: 456 | """From a brand taxonomy, create a mapping of tags to taxonomy brand names. 457 | 458 | The mapping generated is different than the mapping generated by the 459 | `create_taxonomy_mapping` function, as it maps an unprefixed value 460 | (ex: `nestle`) to a brand name, with capitalization and accents 461 | (ex: `Nestlé`). 462 | 463 | The taxonomy mapping has the following format: 464 | { 465 | "alva": "Alva", 466 | "benecop": "Bénécop", 467 | ... 468 | } 469 | 470 | :param taxonomy: the taxonomy to use (brand taxonomy) 471 | :return: a dict mapping tags (*without* language prefix) to brand values 472 | (capitalized) 473 | """ 474 | mapping = {} 475 | for node in taxonomy.iter_nodes(): 476 | unprefixed_key = node.id 477 | if is_prefixed_value(node.id): 478 | prefix = node.id[:2] 479 | unprefixed_key = node.id[3:] 480 | mapping[unprefixed_key] = node.names.get( 481 | "xx", node.names.get("en", node.names.get(prefix, unprefixed_key)) 482 | ) 483 | return mapping 484 | 485 | 486 | def map_to_canonical_id( 487 | taxonomy_mapping: Dict[str, str], values: List[str] 488 | ) -> Dict[str, str]: 489 | """Map a list of values to their canonical taxonomy id. 490 | 491 | Each value should be a tag in the form `lang:tag`. If a value is not found 492 | in the taxonomy mapping, it is returned as is, in its tag form. 493 | 494 | :param taxonomy_mapping: a mapping of tags to taxonomy node ids, generated 495 | by `create_taxonomy_mapping` 496 | :param values: a list of string values 497 | :return: a dict mapping values to their canonical taxonomy id 498 | """ 499 | for value in values: 500 | if len(value) < 3 or value[2] != ":": 501 | raise ValueError( 502 | f"Invalid value: '{value}', expected value to be in 'lang:tag' format" 503 | ) 504 | 505 | output = {} 506 | for value in values: 507 | tag = get_tag(value) 508 | output[value] = ( 509 | # Look for a direct match first 510 | taxonomy_mapping.get(tag) 511 | # Then look for a match with the xx prefix (language-independent 512 | # entry) 513 | or taxonomy_mapping.get(replace_lang_prefix(tag, "xx")) 514 | # If no match is found, return the original taggified value 515 | or tag 516 | ) 517 | 518 | return output 519 | --------------------------------------------------------------------------------
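Usage sketch for the classes above (illustrative only, not part of the library). The three-node hierarchy is invented data; it exercises the TaxonomyNode/Taxonomy traversal API defined in openfoodfacts/taxonomy.py.

from openfoodfacts.taxonomy import Taxonomy

# Invented hierarchy: en:fish <- en:salmon <- en:smoked-salmon
data = {
    "en:fish": {"name": {"en": "Fish", "fr": "Poisson"}},
    "en:salmon": {"name": {"en": "Salmon"}, "parents": ["en:fish"]},
    "en:smoked-salmon": {"name": {"en": "Smoked salmon"}, "parents": ["en:salmon"]},
}
taxonomy = Taxonomy.from_dict(data)

fish = taxonomy["en:fish"]
smoked = taxonomy["en:smoked-salmon"]
assert smoked.is_child_of(fish)  # transitive, through en:salmon
assert fish.is_parent_of(smoked)
# direct parent first, then indirect ones
assert [n.id for n in smoked.get_parents_hierarchy()] == ["en:salmon", "en:fish"]
assert taxonomy.get_localized_name("en:fish", "fr") == "Poisson"
assert taxonomy.get_localized_name("en:salmon", "fr") == "en:salmon"  # no fr/xx name -> id
assert taxonomy.find_deepest_nodes([fish, smoked]) == [smoked]  # en:fish is a parent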
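A second sketch, for get_taxonomy and its local cache. This assumes network access on the first call; the cached file lives under the module's DEFAULT_CACHE_DIR (~/.cache/openfoodfacts/taxonomy) unless cache_dir is passed.

from openfoodfacts.taxonomy import get_taxonomy

# First call downloads categories.full.json and writes it to the cache
# directory; later calls read the cached copy unless force_download is set
# or download_newer detects a newer remote version via the file's ETag.
categories = get_taxonomy("category")  # a str or a TaxonomyType both work
print(len(categories))  # number of nodes in the taxonomy

# Force a re-download, e.g. after a taxonomy update upstream:
categories = get_taxonomy("category", force_download=True)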
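A sketch of the tag-canonicalization helpers. The "fr:noix" -> "en:nuts" pair comes from the docstring above; the actual output depends on the live taxonomy content, and "en:some-unknown-tag" is an invented value showing the fallback behavior.

from openfoodfacts.taxonomy import (
    create_taxonomy_mapping,
    get_taxonomy,
    map_to_canonical_id,
)

ingredient_taxonomy = get_taxonomy("ingredient")
mapping = create_taxonomy_mapping(ingredient_taxonomy)
# Input values must be "lang:tag" strings; anything else raises ValueError.
canonical = map_to_canonical_id(mapping, ["fr:noix", "en:some-unknown-tag"])
# Expected shape (exact ids depend on the taxonomy):
# {"fr:noix": "en:nuts", "en:some-unknown-tag": "en:some-unknown-tag"}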
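Finally, a sketch of the brand mapping. The "nestle" -> "Nestlé" pair is the docstring's own example; real output depends on the current brand taxonomy.

from openfoodfacts.taxonomy import create_brand_taxonomy_mapping, get_taxonomy

brand_mapping = create_brand_taxonomy_mapping(get_taxonomy("brand"))
# Keys are unprefixed tags; values keep capitalization and accents.
print(brand_mapping.get("nestle"))  # e.g. "Nestlé"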