├── .github
    ├── FUNDING.yml
    ├── workflows
    │   ├── pre-commit.yml
    │   └── build-publish.yml
    ├── ISSUE_TEMPLATE
    │   ├── question.md
    │   ├── bug.md
    │   └── proposal.md
    ├── PULL_REQUEST_TEMPLATE.md
    └── stale.yml
├── tinyscaler-text.png
├── MANIFEST.in
├── .pre-commit-config.yaml
├── src
    ├── scaler.h
    ├── tinyscaler.pyx
    └── scaler.c
├── setup.py
├── LICENSE
├── pyproject.toml
├── examples
    ├── benchmark_area.py
    └── benchmark_bilinear.py
└── README.md


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: Farama-Foundation
2 | 


--------------------------------------------------------------------------------
/tinyscaler-text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Farama-Foundation/TinyScaler/HEAD/tinyscaler-text.png


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE
2 | include CMakeLists.txt
3 | include src/tinyscaler.pyx src/scaler.h src/scaler.c
4 | recursive-exclude examples *
5 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | repos:
 3 |   - repo: https://github.com/psf/black
 4 |     rev: 23.3.0
 5 |     hooks:
 6 |       - id: black
 7 |   - repo: https://github.com/PyCQA/flake8
 8 |     rev: 6.0.0
 9 |     hooks:
10 |       - id: flake8
11 |         args:
12 |           - --max-complexity=30
13 |           - --max-line-length=456
14 |           - --show-source
15 |           - --statistics
16 |   - repo: https://github.com/PyCQA/isort
17 |     rev: 5.12.0
18 |     hooks:
19 |       - id: isort
20 |         args: ["--profile", "black"]
21 | 


--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
 1 | # https://pre-commit.com
 2 | # This GitHub Action assumes that the repo contains a valid .pre-commit-config.yaml file.
 3 | name: pre-commit
 4 | on:
 5 |   pull_request:
 6 |   push:
 7 |     branches: [main]
 8 | 
 9 | permissions:
10 |   contents: read # to fetch code (actions/checkout)
11 | 
12 | jobs:
13 |   pre-commit:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - uses: actions/checkout@v3
17 |       - uses: actions/setup-python@v4
18 |       - run: pip install pre-commit
19 |       - run: pre-commit --version
20 |       - run: pre-commit install
21 |       - run: pre-commit run --all-files
22 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Question
 3 | about: Ask a question
 4 | title: "[Question] Question title"
 5 | ---
 6 | 
 7 | 
 8 | ### Question
 9 | 
10 | If you're a beginner and have basic questions, please ask on [r/reinforcementlearning](https://www.reddit.com/r/reinforcementlearning/) or in the [RL Discord](https://discord.com/invite/xhfNqQv) (if you're new please use the beginners channel). Basic questions that are not bugs or feature requests will be closed without reply, because GitHub issues are not an appropriate venue for these.
11 | 
12 | Advanced/nontrivial questions, especially in areas where documentation is lacking, are very much welcome.
13 | 


--------------------------------------------------------------------------------
/src/scaler.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <stdlib.h>
 4 | #include <math.h>
 5 | #include <memory.h>
 6 | 
 7 | #if defined(__x86_64__)
 8 | #include <pmmintrin.h> // SSE3 intrinsics header (x86_64 builds only)
 9 | #elif defined(__arm__)
10 | #include <arm_neon.h> // Neon intrinsics. NOTE(review): 64-bit Arm compilers define __aarch64__ rather than __arm__ - confirm whether this branch is meant to be taken there
11 | #endif
12 | 
13 | #define RGBA32F_SIZE 16 // Byte size of one RGBA pixel stored as four f32 channels (assumes a 4-byte float)
14 | 
15 | #ifndef max
16 | #define max(a,b) (((a) > (b)) ? (a) : (b))
17 | #define min(a,b) (((a) < (b)) ? (a) : (b))
18 | #endif
19 | 
20 | typedef unsigned char u8;
21 | typedef int i32;
22 | typedef float f32;
23 | typedef double f64;
24 | 
25 | // Scalers: each reads a src image of src_width x src_height and writes a dst image of dst_width x dst_height (callers pass 4 f32 channels per pixel)
26 | void scale_nearest_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height);
27 | void scale_bilinear_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height);
28 | void scale_area_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height);
29 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import platform
 2 | 
 3 | from Cython.Build import cythonize
 4 | from setuptools import Extension, setup
 5 | 
 6 | ext_modules = []  # replaced by exactly one of the platform branches below
 7 | 
 8 | if platform.system() == "Windows":  # Windows: no extra compiler/linker flags are passed
 9 |     ext_modules = [Extension("tinyscaler", ["src/*.pyx"])]
10 | else:  # Not Windows
11 |     if platform.machine() in ["x86_64", "arm64", "aarch64"]:  # 64-bit x86/Arm: no extra flags are passed
12 |         ext_modules = [Extension("tinyscaler", ["src/*.pyx"])]
13 |     else:  # Any other machine is assumed to be Arm, so Neon is enabled explicitly via -mfpu=neon
14 |         ext_modules = [
15 |             Extension(
16 |                 "tinyscaler",
17 |                 ["src/*.pyx"],
18 |                 extra_compile_args=["-mfpu=neon"],
19 |                 extra_link_args=["-mfpu=neon"],
20 |             )
21 |         ]
22 | 
23 | setup(
24 |     name="tinyscaler",
25 |     ext_modules=cythonize(  # translate the .pyx sources to C with Python 3 semantics
26 |         ext_modules, language_level=3, compiler_directives={"annotation_typing": False}
27 |     ),
28 | )
29 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug Report
 3 | about: Submit a bug report
 4 | title: "[Bug Report] Bug title"
 5 | 
 6 | ---
 7 | 
 8 | If you are submitting a bug report, please fill in the following details and use the tag [bug].
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **Code example**
14 | Please try to provide a minimal example to reproduce the bug. Error messages and stack traces are also helpful.
15 | 
16 | **System Info**
17 | Describe the characteristics of your environment:
18 |  * Describe how TinyScaler was installed (pip, docker, source, ...)
19 |  * What OS/version you're using. 
20 |  * Python version
21 | 
22 | **Additional context**
23 | Add any other context about the problem here.
24 | 
25 | ### Checklist
26 | 
27 | - [ ] I have checked that there is no similar [issue](https://github.com/Farama-Foundation/TinyScaler/issues) in the repo (**required**)
28 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/proposal.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Proposal
 3 | about: Propose changes that are not bug fixes
 4 | title: "[Proposal] Proposal title"
 5 | ---
 6 | 
 7 | 
 8 | 
 9 | ### Proposal 
10 | 
11 | A clear and concise description of the proposal.
12 | 
13 | ### Motivation
14 | 
15 | Please outline the motivation for the proposal.
16 | Is your feature request related to a problem? e.g.,"I'm always frustrated when [...]".
17 | If this is related to another GitHub issue, please link here too.
18 | 
19 | ### Pitch
20 | 
21 | A clear and concise description of what you want to happen.
22 | 
23 | ### Alternatives
24 | 
25 | A clear and concise description of any alternative solutions or features you've considered, if any.
26 | 
27 | ### Additional context
28 | 
29 | Add any other context or screenshots about the feature request here.
30 | 
31 | ### Checklist
32 | 
33 | - [ ] I have checked that there is no similar [issue](https://github.com/Farama-Foundation/TinyScaler/issues) in the repo (**required**)
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2021 Farama Foundation
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | # Package ######################################################################
 2 | 
 3 | [build-system]
 4 | requires = ["setuptools", "wheel", "Cython"]
 5 | build-backend = "setuptools.build_meta"
 6 | 
 7 | [project]
 8 | name = "tinyscaler"
 9 | description = "A tiny, simple image scaler."
10 | readme = "README.md"
11 | requires-python = ">= 3.8"
12 | authors = [{ name = "Farama Foundation", email = "contact@farama.org" }]
13 | license = { text = "MIT License" }
14 | keywords = ["Reinforcement Learning", "Gymnasium", "PettingZoo"]
15 | classifiers = [
16 |     "Development Status :: 4 - Beta",  # change to `5 - Production/Stable` when ready
17 |     "License :: OSI Approved :: MIT License",
18 |     "Programming Language :: Python :: 3",
19 |     "Programming Language :: Python :: 3.8",
20 |     "Programming Language :: Python :: 3.9",
21 |     "Programming Language :: Python :: 3.10",
22 |     "Programming Language :: Python :: 3.11",
23 |     "Programming Language :: Python :: 3.12",
24 |     'Intended Audience :: Science/Research',
25 |     'Topic :: Scientific/Engineering :: Artificial Intelligence',
26 | ]
27 | version="1.2.8"
28 | dependencies = [
29 |     "numpy >=1.21.0",
30 | ]
31 | 
32 | [project.urls]
33 | Homepage = "https://farama.org"
34 | Repository = "https://github.com/Farama-Foundation/TinyScaler"
35 | Documentation = "https://github.com/Farama-Foundation/TinyScaler"
36 | "Bug Report" = "https://github.com/Farama-Foundation/TinyScaler/issues"
37 | 


--------------------------------------------------------------------------------
/examples/benchmark_area.py:
--------------------------------------------------------------------------------
 1 | # Benchmark between tinyscaler and OpenCV using area filtering
 2 | import time
 3 | 
 4 | import cv2
 5 | import numpy as np
 6 | import tinyscaler
 7 | 
 8 | # Disable multithreading and GPU support for OpenCV for a single-threaded CPU comparison
 9 | cv2.setNumThreads(1)
10 | cv2.ocl.setUseOpenCL(False)
11 | 
12 | # Number of scales to perform
13 | numScales = 100
14 | 
15 | # Loading this image: https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png
16 | img8 = cv2.cvtColor(cv2.imread("nasa-4928x3279.png"), cv2.COLOR_BGR2RGBA)
17 | img = (img8 / 255.0).astype(np.float32)  # Preferred format
18 | 
19 | targetSize = (852, 567)
20 | 
21 | dst = np.empty((targetSize[1], targetSize[0], 4), dtype=np.float32)
22 | 
23 | start = time.perf_counter()
24 | 
25 | for t in range(numScales):
26 |     tinyscaler.scale(img, targetSize, mode="area", dst=dst)
27 | 
28 | end = time.perf_counter()
29 | 
30 | print("Time elapsed for tinyscaler: " + str(end - start))
31 | 
32 | # Save the result from tinyscaler for viewing
33 | cv2.imwrite(
34 |     "result.png", cv2.cvtColor((dst * 255.0).astype(np.uint8), cv2.COLOR_RGBA2BGR)
35 | )
36 | 
37 | start = time.perf_counter()
38 | 
39 | for t in range(numScales):
40 |     cv2.resize(img, targetSize, dst=dst, interpolation=cv2.INTER_AREA)
41 | 
42 | end = time.perf_counter()
43 | 
44 | cv2.imwrite(
45 |     "result_cv.png", cv2.cvtColor((dst * 255.0).astype(np.uint8), cv2.COLOR_RGBA2BGR)
46 | )
47 | 
48 | print("Time elapsed for OpenCV: " + str(end - start))
49 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | # Description
 2 | 
 3 | Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.
 4 | 
 5 | Fixes # (issue)
 6 | 
 7 | ## Type of change
 8 | 
 9 | Please delete options that are not relevant.
10 | 
11 | - [ ] Bug fix (non-breaking change which fixes an issue)
12 | - [ ] New feature (non-breaking change which adds functionality)
13 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
14 | - [ ] This change requires a documentation update
15 | 
16 | ### Screenshots
17 | Please attach before and after screenshots of the change if applicable.
18 | 
19 | <!--
20 | Example:
21 | 
22 | | Before | After |
23 | | ------ | ----- |
24 | | _gif/png before_ | _gif/png after_ |
25 | 
26 | 
27 | To upload images to a PR -- simply drag and drop an image while in edit mode and it should upload the image directly. You can then paste that source into the above before/after sections.
28 | -->
29 | 
30 | # Checklist:
31 | 
32 | - [ ] I have run the [`pre-commit` checks](https://pre-commit.com/) with `pre-commit run --all-files` (see `CONTRIBUTING.md` instructions to set it up)
33 | - [ ] I have commented my code, particularly in hard-to-understand areas
34 | - [ ] I have made corresponding changes to the documentation
35 | - [ ] My changes generate no new warnings
36 | - [ ] I have added tests that prove my fix is effective or that my feature works
37 | - [ ] New and existing unit tests pass locally with my changes
38 | 
39 | <!--
40 | As you go through the checklist above, you can mark something as done by putting an x character in it
41 | 
42 | For example,
43 | - [x] I have done this task
44 | - [ ] I have not done this task
45 | -->
46 | 


--------------------------------------------------------------------------------
/examples/benchmark_bilinear.py:
--------------------------------------------------------------------------------
 1 | # Benchmark between tinyscaler, OpenCV, Pillow, and skImage using bilinear filtering
 2 | import time
 3 | 
 4 | import cv2
 5 | import numpy as np
 6 | import tinyscaler
 7 | from PIL import Image
 8 | from skimage.transform import resize
 9 | 
10 | # Disable multithreading and GPU support for OpenCV for a single-threaded CPU comparison
11 | cv2.setNumThreads(1)
12 | cv2.ocl.setUseOpenCL(False)
13 | 
14 | # Number of scales to perform
15 | numScales = 100
16 | 
17 | # Loading this image: https://github.com/Cykooz/fast_image_resize/blob/main/data/nasa-4928x3279.png
18 | img8 = cv2.cvtColor(cv2.imread("nasa-4928x3279.png"), cv2.COLOR_BGR2RGBA)
19 | img = (img8 / 255.0).astype(np.float32)  # Preferred format
20 | 
21 | targetSize = (852, 567)
22 | 
23 | dst = np.empty((targetSize[1], targetSize[0], 4), dtype=np.float32)
24 | 
25 | start = time.perf_counter()
26 | 
27 | for t in range(numScales):
28 |     tinyscaler.scale(img, targetSize, mode="bilinear", dst=dst)
29 | 
30 | end = time.perf_counter()
31 | 
32 | print("Time elapsed for tinyscaler: " + str(end - start))
33 | 
34 | # Save the result from tinyscaler for viewing
35 | cv2.imwrite(
36 |     "result.png", cv2.cvtColor((dst * 255.0).astype(np.uint8), cv2.COLOR_RGBA2BGR)
37 | )
38 | 
39 | start = time.perf_counter()
40 | 
41 | for t in range(numScales):
42 |     cv2.resize(img, targetSize, dst=dst, interpolation=cv2.INTER_LINEAR)
43 | 
44 | end = time.perf_counter()
45 | 
46 | cv2.imwrite(
47 |     "result_cv.png", cv2.cvtColor((dst * 255.0).astype(np.uint8), cv2.COLOR_RGBA2BGR)
48 | )
49 | 
50 | print("Time elapsed for OpenCV: " + str(end - start))
51 | 
52 | pimg = Image.fromarray(img8)
53 | 
54 | start = time.perf_counter()
55 | 
56 | for t in range(numScales):
57 |     pimg.resize(targetSize, Image.Resampling.BILINEAR)
58 | 
59 | end = time.perf_counter()
60 | 
61 | print("Time elapsed for Pillow: " + str(end - start))
62 | 
63 | start = time.perf_counter()
64 | 
65 | for t in range(numScales):
66 |     resize(img, targetSize)
67 | 
68 | end = time.perf_counter()
69 | 
70 | print("Time elapsed for skimage: " + str(end - start))
71 | 


--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
 1 | # Configuration for probot-stale - https://github.com/probot/stale
 2 | 
 3 | # Number of days of inactivity before an Issue or Pull Request becomes stale
 4 | daysUntilStale: 60
 5 | 
 6 | # Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
 7 | # Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
 8 | daysUntilClose: 14
 9 | 
10 | # Only issues or pull requests with all of these labels are checked for staleness. Defaults to `[]` (disabled)
11 | onlyLabels:
12 |   - more-information-needed
13 | 
14 | # Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
15 | exemptLabels:
16 |   - pinned
17 |   - security
18 |   - "[Status] Maybe Later"
19 | 
20 | # Set to true to ignore issues in a project (defaults to false)
21 | exemptProjects: true
22 | 
23 | # Set to true to ignore issues in a milestone (defaults to false)
24 | exemptMilestones: true
25 | 
26 | # Set to true to ignore issues with an assignee (defaults to false)
27 | exemptAssignees: true
28 | 
29 | # Label to use when marking as stale
30 | staleLabel: stale
31 | 
32 | # Comment to post when marking as stale. Set to `false` to disable
33 | markComment: >
34 |   This issue has been automatically marked as stale because it has not had
35 |   recent activity. It will be closed if no further activity occurs. Thank you
36 |   for your contributions.
37 | 
38 | # Comment to post when removing the stale label.
39 | # unmarkComment: >
40 | #   Your comment here.
41 | 
42 | # Comment to post when closing a stale Issue or Pull Request.
43 | # closeComment: >
44 | #   Your comment here.
45 | 
46 | # Limit the number of actions per hour, from 1-30. Default is 30
47 | limitPerRun: 30
48 | 
49 | # Limit to only `issues` or `pulls`
50 | only: issues
51 | 
52 | # Optionally, specify configuration settings that are specific to just 'issues' or 'pulls':
53 | # pulls:
54 | #   daysUntilStale: 30
55 | #   markComment: >
56 | #     This pull request has been automatically marked as stale because it has not had
57 | #     recent activity. It will be closed if no further activity occurs. Thank you
58 | #     for your contributions.
59 | 
60 | # issues:
61 | #   exemptLabels:
62 | #     - confirmed


--------------------------------------------------------------------------------
/.github/workflows/build-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will build and (if release) publish Python distributions to PyPI
 2 | # For more information see:
 3 | #   - https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 4 | #   - https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
 5 | #
 6 | # derived from https://github.com/Farama-Foundation/PettingZoo/blob/e230f4d80a5df3baf9bd905149f6d4e8ce22be31/.github/workflows/build-publish.yml
 7 | name: build-publish
 8 | 
 9 | on:
10 |   push:
11 |     branches: [main]
12 |   pull_request:
13 |     branches: [main]
14 |   release:
15 |     types: [published]
16 | 
17 | jobs:
18 |   matrix-build:
19 |     runs-on: ${{ matrix.os }}
20 |     strategy:
21 |       matrix:
22 |         os: [macos-latest, windows-latest]
23 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
24 |     steps:
25 |     - uses: actions/checkout@v3
26 |     - name: Set up Python
27 |       uses: actions/setup-python@v4
28 |       with:
29 |         python-version: ${{ matrix.python-version }}
30 |     - name: Install dependencies
31 |       run: python -m pip install --upgrade setuptools wheel cython build
32 |     - name: Build distribution
33 |       shell: bash
34 |       run: python -m build .
35 |     - name: Store
36 |       uses: actions/upload-artifact@v3
37 |       with:
38 |         name: artifact
39 |         path: dist/*
40 |         if-no-files-found: error
41 | 
42 |   linux-build:
43 |     runs-on: ubuntu-latest
44 |     steps:
45 |     - uses: actions/checkout@v3
46 |     - name: Set up Python
47 |       uses: actions/setup-python@v4
48 |       with:
49 |         python-version: "3.x"
50 |     - name: Install dependencies
51 |       run: python -m pip install --upgrade setuptools wheel cython build
52 |     - name: Build manylinux Python wheels
53 |       uses: RalfG/python-wheels-manylinux-build@v0.5.0
54 |       with:
55 |         python-versions: 'cp38-cp38 cp39-cp39 cp310-cp310 cp311-cp311 cp312-cp312'
56 |         pip-wheel-args: '-w ./dist --no-deps'
57 |     - name: Store
58 |       uses: actions/upload-artifact@v3
59 |       with:
60 |         name: artifact
61 |         path: dist/*-manylinux*.whl
62 |         if-no-files-found: error
63 | 
64 |   source-build:
65 |     runs-on: ubuntu-latest
66 |     steps:
67 |     - uses: actions/checkout@v3
68 |     - name: Set up Python
69 |       uses: actions/setup-python@v4
70 |       with:
71 |         python-version: "3.x"
72 |     - name: Install dependencies
73 |       run: python -m pip install --upgrade setuptools wheel cython build
74 |     - name: Build source distribution
75 |       shell: bash
76 |       run: python -m build . --sdist
77 |     - name: Store
78 |       uses: actions/upload-artifact@v3
79 |       with:
80 |         name: artifact
81 |         path: dist/*
82 |         if-no-files-found: error
83 | 
84 |   publish:
85 |     runs-on: ubuntu-latest
86 |     needs: [matrix-build, linux-build, source-build]
87 |     if: github.event_name == 'release' && github.event.action == 'published'
88 |     steps:
89 |     - name: Download dists
90 |       uses: actions/download-artifact@v3
91 |       with:
92 |         name: artifact
93 |         path: dist
94 |     - name: Publish
95 |       uses: pypa/gh-action-pypi-publish@release/v1
96 |       with:
97 |         password: ${{ secrets.PYPI_API_TOKEN }}
98 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <p align="center">
 2 |     <img src="https://raw.githubusercontent.com/Farama-Foundation/TinyScaler/main/tinyscaler-text.png" width="500px"/>
 3 | </p>
 4 | 
 5 | **Aug 11, 2025: This project has been deprecated due to a lack of widespread community use, and is no longer planned to receive any additional updates or support.**
 6 | 
 7 | Tinyscaler was a small CPU image scaling library with SIMD support on x86_64 and Arm (Neon). This project aimed to replace OpenCV for image resizing, resolving installation inconveniences and compatibility issues. We developed this for future use in Gymnasium and PettingZoo wrappers.
 8 | 
 9 | ## Installation
10 | You can install from PyPI using `pip install tinyscaler`. Linux and macOS with Python >=3.8 are supported.
11 | 
12 | ## Usage
13 | Tinyscaler contains a single external function, `scale`, which takes a numpy array containing the image and the new size, and returns the resized image.
14 | 
15 | ```python
16 | import numpy as np
17 | import tinyscaler
18 | 
19 | img = np.random.rand(64, 64, 4).astype(np.float32)
20 | 
21 | resize_img = tinyscaler.scale(img, (32, 32))
22 | print(resize_img.shape, resize_img.dtype)  # (32, 32, 4) float32
23 | ```
24 | 
25 | TinyScaler supports mode='area', mode='bilinear', and mode='nearest' filtering. It also allows one to pass a destination buffer in order to avoid duplicate memory allocations.
26 | 
27 | Area filtering is only really useful for downscaling; when upscaling, bilinear filtering is used even if area filtering was requested. Area filtering is also likely not worth it when downscaling by 2x or less.
28 | 
29 | TinyScaler is used through a single function. The full signature is:
30 | 
31 | ```python
32 | scale(src : np.ndarray, size : tuple, mode='area', dst : np.ndarray = None)
33 | ```
34 | 
35 | Note that the `size` tuple parameter is (width, height). However, the numpy arrays have dimensions ordered as (height, width, channels). This is similar to OpenCV.
36 | 
37 | TinyScaler expects a contiguous numpy array. If it is not contiguous, it will throw an error. You can make a non-contiguous numpy array contiguous by calling `np.ascontiguousarray`. Usually a numpy array will already be contiguous.
38 | 
39 | If the final array dimension is not 4 (RGBA), it will automatically be converted to 4 channels. Further, if the array is uint8, it will be converted to float32. So the preferred array has a shape `(height, width, 4)` and `dtype=np.float32`.
40 | 
41 | Finally, downscaling is the focus of TinyScaler. It can also upscale, but it will not be as fast as a more complex separable algorithm in that case.
42 | 
43 | ## Performance
44 | In a [simple benchmark](./examples/benchmark_bilinear.py), we resized the same image (4928x3279) down to (852x567) 100 times using bilinear filtering with several libraries. Here are the times (in seconds) spent (measured with Python's perf_counter) on an AMD 1950x:
45 | 
46 | ```
47 | Time elapsed for tinyscaler: 0.7968465110002398
48 | Time elapsed for OpenCV: 0.48667862100001
49 | Time elapsed for Pillow: 12.672875003999707
50 | Time elapsed for skimage: 164.45401711399973
51 | ```
52 | 
53 | And with area filtering (just TinyScaler and OpenCV):
54 | 
55 | ```
56 | Time elapsed for tinyscaler: 4.34793155800071
57 | Time elapsed for OpenCV: 8.118138265999733
58 | ```
59 | 
60 | All methods were forced to use a single thread. OpenCV is slightly faster than TinyScaler for bilinear filtering, but TinyScaler remains very fast regardless.
61 | 
62 | Interestingly, for area filtering, TinyScaler is faster (almost 2x).
63 | 
64 | 


--------------------------------------------------------------------------------
/src/tinyscaler.pyx:
--------------------------------------------------------------------------------
  1 | import cython
  2 | 
  3 | 
  4 | cdef extern from "scaler.c":
  5 |     pass
  6 | 
  7 | cdef extern from "scaler.h":
  8 |     ctypedef unsigned char u8;
  9 |     ctypedef int i32;
 10 |     ctypedef float f32;
 11 |     ctypedef double f64;
 12 | 
 13 |     void scale_nearest_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height)
 14 |     void scale_bilinear_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height)
 15 |     void scale_area_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height)
 16 | 
 17 | import numpy as np
 18 | 
 19 | auto_convert = True # Global controlling whether automatic channel/type conversions take place
 20 | 
 21 | def _scale_4f32(src: np.ndarray, size: tuple, mode: str = 'area', dst: np.ndarray = None) -> np.ndarray:  # Resize a 4-channel image with the C scalers; size is (width, height)
 22 |     assert(len(src.shape) == 3 and src.shape[2] == 4) # src must be 3-D with exactly 4 channels
 23 | 
 24 |     if not src.flags['C_CONTIGUOUS']:  # the typed memoryview below is declared C-contiguous
 25 |         src = np.ascontiguousarray(src)
 26 | 
 27 |     cdef float[:, :, ::1] src_memview = src  # typed as C float, i.e. src is expected to be float32 here
 28 | 
 29 |     if dst is None:
 30 |         dst = np.empty((size[1], size[0], 4), dtype=np.float32)  # allocate output as (height, width, 4)
 31 |     else:
 32 |         if len(dst.shape) != 3 or dst.shape[0] != size[1] or dst.shape[1] != size[0] or dst.shape[2] != 4:  # dst must be (size[1], size[0], 4)
 33 |             raise Exception('Incorrect dst size!')
 34 |         elif dst.dtype != np.float32:
 35 |             raise Exception('Incorrect dst type (must be float32)!')
 36 | 
 37 |     if not dst.flags['C_CONTIGUOUS']:  # NOTE(review): ascontiguousarray returns a copy here, so a non-contiguous caller-supplied dst would not itself receive the pixels - confirm
 38 |         dst = np.ascontiguousarray(dst)
 39 | 
 40 |     cdef float[:, :, ::1] dst_memview = dst
 41 | 
 42 |     if mode == 'bilinear':  # the C scalers take (src_width, src_height, dst_width, dst_height), hence shape[1] before shape[0]
 43 |         scale_bilinear_4f32(&src_memview[0][0][0], &dst_memview[0][0][0], src.shape[1], src.shape[0], size[0], size[1])
 44 |     elif mode == 'area':
 45 |         scale_area_4f32(&src_memview[0][0][0], &dst_memview[0][0][0], src.shape[1], src.shape[0], size[0], size[1])
 46 |     else:  # any other mode value (including 'nearest') uses the nearest-neighbour scaler
 47 |         scale_nearest_4f32(&src_memview[0][0][0], &dst_memview[0][0][0], src.shape[1], src.shape[0], size[0], size[1])
 48 | 
 49 |     return dst.reshape((size[1], size[0], 4))
 50 | 
 51 | @cython.binding(True)
 52 | def scale(src: np.ndarray, size: tuple, mode: str = 'area', dst: np.ndarray = None) -> np.ndarray:
 53 |     '''
 54 |     scale (resize) a source image to a specified size
 55 | 
 56 |     Parameters
 57 |     ----------
 58 |     src : numpy.ndarray
 59 |         the source array, must have len(src.shape) == 2 or len(src.shape) == 3. Must be contiguous (using numpy.ascontiguousarray if not already).
 60 |         Ideally (most efficient) the shape is (width, height, 4) with dtype=numpy.float32 (others will cause a conversion to occur)
 61 |     size :
 62 |         target size, a tuple of two positive integers (width, height)
 63 |     mode : {'area', 'bilinear', 'nearest'}, optional
 64 |         interpolation method to use. Defaults to area, can also be bilinear or nearest
 65 |     dst : {None}, optional
 66 |         destination buffer to put resized image in. Leaving this = None will result in an allocation.
 67 |         For efficient code, set dst to a buffer of shape (size[0], size[1], 4) with dtype=numpy.float32
 68 | 
 69 |     Returns
 70 |     -------
 71 |     numpy.ndarray
 72 |         the scaled image
 73 | 
 74 |     Raises
 75 |     ------
 76 |     Exception
 77 |         when src is not contiguous or the wrong shape
 78 |     '''
 79 | 
 80 |     if not src.data.contiguous:
 81 |         raise Exception('Input image must be contiguous! Use np.ascontiguousarray(image) maybe?')
 82 | 
 83 |     src_dims = len(src.shape)
 84 |     src_channels = 4
 85 |     src_type = src.dtype
 86 | 
 87 |     if mode == 'area' and (src.shape[1] < size[0] or src.shape[0] < size[1]):
 88 |         mode = 'bilinear' # Switch to bilinear if upscaling with area filter, as area filter only applies to downscaling
 89 | 
 90 |     # Automatic conversion
 91 |     if auto_convert:
 92 |         if len(src.shape) != 2 and len(src.shape) != 3:
 93 |             raise Exception('Incorrect number of dimensions - need 2 or 3, received ' + str(len(src.shape)))
 94 | 
 95 |         if src.dtype != np.float32:
 96 |             if src.dtype == np.uint8:
 97 |                 src = src.astype(np.float32)
 98 | 
 99 |                 src *= float(1.0 / 255.0)
100 |             else:
101 |                 src = src.astype(np.float32)
102 | 
103 |         if len(src.shape) == 2 or src.shape[2] == 1: # Gray
104 |             if len(src.shape) == 3:
105 |                 src = src.reshape((src.shape[0], src.shape[1]))
106 | 
107 |             src_channels = 1
108 | 
109 |             src_new = np.empty((src.shape[0], src.shape[1], 4), dtype=np.float32)
110 |             src_new[:, :, 0] = src
111 |             src_new[:, :, 1] = src
112 |             src_new[:, :, 2] = src
113 |             src_new[:, :, 3] = src
114 | 
115 |             src = src_new
116 |         elif src.shape[2] == 3: # RGB
117 |             src_channels = 3
118 | 
119 |             src_new = np.empty((src.shape[0], src.shape[1], 4), dtype=np.float32)
120 |             src_new[:, :, :3] = src
121 |             src_new[:, :, 3] = np.ones((src.shape[0], src.shape[1]))
122 | 
123 |             src = src_new
124 |         elif src.shape[2] != 4:
125 |             raise Exception('Passed an invalid number of channels, must be 1, 3, or 4 (received ' + str(src.shape[2]) + ')!')
126 |         
127 |     result = None
128 | 
129 |     try:
130 |         result = _scale_4f32(src, size, mode, dst)
131 |     except Exception as e:
132 |         raise e
133 | 
134 |     # Convert back
135 |     if auto_convert:
136 |         if src_type == np.uint8:
137 |             result = (result * float(255.0)).astype(np.uint8)[:, :, :src_channels]
138 |         else:
139 |             result = result[:, :, :src_channels].astype(src_type)
140 | 
141 |         if src_dims == 2:
142 |             result = result.reshape((size[1], size[0]))
143 | 
144 |     return result
145 | 


--------------------------------------------------------------------------------
/src/scaler.c:
--------------------------------------------------------------------------------
  1 | #include "scaler.h"
  2 | 
  3 | // Nearest-neighbour resize of a 4-channel f32 image; does not use SIMD
  4 | void scale_nearest_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height) {
  5 |     // Source pixels advanced per destination pixel along each axis
  6 |     f32 step_x = (f32)src_width / (f32)dst_width;
  7 |     f32 step_y = (f32)src_height / (f32)dst_height;
  8 |     i32 src_stride = src_width << 2; // floats per source row, 4 per pixel
  9 |     i32 dst_stride = dst_width << 2; // floats per destination row
 10 | 
 11 |     for (i32 row = 0; row < dst_height; row++) {
 12 |         // Sample at the centre of the destination row and truncate to a source row
 13 |         i32 from_row = (i32)((row + 0.5f) * step_y);
 14 |         f32* src_line = src + src_stride * from_row;
 15 |         f32* dst_line = dst + dst_stride * row;
 16 | 
 17 |         for (i32 col = 0; col < dst_width; col++) {
 18 |             i32 from_col = (i32)((col + 0.5f) * step_x);
 19 |             memcpy(dst_line + (col << 2), src_line + (from_col << 2), RGBA32F_SIZE); // copies RGBA32F_SIZE bytes
 20 |         }
 21 |     }
 22 | }
 23 | 
 24 | #if defined(__x86_64__) // SSE implementation
 25 | 
 26 | // Bilinear resize of a 4-channel f32 image using SSE.
 27 | // Neighbour offsets are clamped at the last column/row so that a source which is
 28 | // one pixel wide or tall is never read past the end of its buffer.
 29 | void scale_bilinear_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height) {
 30 |     f32 ratio_x = (f32)(src_width - 1) / (f32)dst_width;
 31 |     f32 ratio_y = (f32)(src_height - 1) / (f32)dst_height;
 32 |     i32 dst_width4 = dst_width << 2; // floats per destination row
 33 |     i32 src_width4 = src_width << 2; // floats per source row
 34 | 
 35 |     if ((((size_t)src | (size_t)dst) & 0x0f) == 0) { // Aligned memory
 36 |         for (i32 dst_y = 0; dst_y < dst_height; dst_y++) {
 37 |             f32 src_y_f = (dst_y + 0.5f) * ratio_y;
 38 |             i32 src_y = (i32)src_y_f;
 39 |             f32 interp_y = src_y_f - src_y;
 40 |             // Offset to the row below, or 0 when src_y is already the last row
 41 |             i32 next_row = (src_y + 1 < src_height) ? src_width4 : 0;
 42 | 
 43 |             __m128 iy = _mm_set1_ps(interp_y);
 44 |             __m128 iy1 = _mm_set1_ps(1.0f - interp_y);
 45 |             i32 dst_offset4 = dst_width4 * dst_y;
 46 |             i32 src_offset4 = src_width4 * src_y;
 47 | 
 48 |             for (i32 dst_x = 0; dst_x < dst_width; dst_x++) {
 49 |                 f32 src_x_f = (dst_x + 0.5f) * ratio_x;
 50 |                 i32 src_x = (i32)src_x_f;
 51 |                 f32 interp_x = src_x_f - src_x;
 52 |                 // Offset to the pixel on the right, or 0 on the last column
 53 |                 i32 next_col = (src_x + 1 < src_width) ? 4 : 0;
 54 | 
 55 |                 i32 dst_start = (dst_x << 2) + dst_offset4;
 56 |                 f32* src_start00 = src + (src_x << 2) + src_offset4;
 57 |                 __m128 ix = _mm_set1_ps(interp_x);
 58 |                 __m128 ix1 = _mm_set1_ps(1.0f - interp_x);
 59 | 
 60 |                 __m128 p00 = _mm_load_ps(src_start00);
 61 |                 __m128 p01 = _mm_load_ps(src_start00 + next_col);
 62 |                 __m128 p10 = _mm_load_ps(src_start00 + next_row);
 63 |                 __m128 p11 = _mm_load_ps(src_start00 + next_row + next_col);
 64 |                 // Blend vertically first, then horizontally
 65 |                 p00 = _mm_add_ps(_mm_mul_ps(p00, iy1), _mm_mul_ps(p10, iy));
 66 |                 p01 = _mm_add_ps(_mm_mul_ps(p01, iy1), _mm_mul_ps(p11, iy));
 67 |                 p00 = _mm_add_ps(_mm_mul_ps(p00, ix1), _mm_mul_ps(p01, ix));
 68 |                 _mm_store_ps(dst + dst_start, p00);
 69 |             }
 70 |         }
 71 |     }
 72 |     else { // Unaligned memory
 73 |         for (i32 dst_y = 0; dst_y < dst_height; dst_y++) {
 74 |             f32 src_y_f = (dst_y + 0.5f) * ratio_y;
 75 |             i32 src_y = (i32)src_y_f;
 76 |             f32 interp_y = src_y_f - src_y;
 77 |             // Offset to the row below, or 0 when src_y is already the last row
 78 |             i32 next_row = (src_y + 1 < src_height) ? src_width4 : 0;
 79 | 
 80 |             __m128 iy = _mm_set1_ps(interp_y);
 81 |             __m128 iy1 = _mm_set1_ps(1.0f - interp_y);
 82 |             i32 dst_offset4 = dst_width4 * dst_y;
 83 |             i32 src_offset4 = src_width4 * src_y;
 84 | 
 85 |             for (i32 dst_x = 0; dst_x < dst_width; dst_x++) {
 86 |                 f32 src_x_f = (dst_x + 0.5f) * ratio_x;
 87 |                 i32 src_x = (i32)src_x_f;
 88 |                 f32 interp_x = src_x_f - src_x;
 89 |                 // Offset to the pixel on the right, or 0 on the last column
 90 |                 i32 next_col = (src_x + 1 < src_width) ? 4 : 0;
 91 | 
 92 |                 i32 dst_start = (dst_x << 2) + dst_offset4;
 93 |                 f32* src_start00 = src + (src_x << 2) + src_offset4;
 94 |                 __m128 ix = _mm_set1_ps(interp_x);
 95 |                 __m128 ix1 = _mm_set1_ps(1.0f - interp_x);
 96 | 
 97 |                 __m128 p00 = _mm_loadu_ps(src_start00);
 98 |                 __m128 p01 = _mm_loadu_ps(src_start00 + next_col);
 99 |                 __m128 p10 = _mm_loadu_ps(src_start00 + next_row);
100 |                 __m128 p11 = _mm_loadu_ps(src_start00 + next_row + next_col);
101 |                 // Blend vertically first, then horizontally
102 |                 p00 = _mm_add_ps(_mm_mul_ps(p00, iy1), _mm_mul_ps(p10, iy));
103 |                 p01 = _mm_add_ps(_mm_mul_ps(p01, iy1), _mm_mul_ps(p11, iy));
104 |                 p00 = _mm_add_ps(_mm_mul_ps(p00, ix1), _mm_mul_ps(p01, ix));
105 |                 _mm_storeu_ps(dst + dst_start, p00);
106 |             }
107 |         }
108 |     }
109 | }
110 | 
111 | // Area (box filter) resize of a 4-channel f32 image using SSE.
112 | // Destination pixel (x, y) covers the source rectangle [x * ratio_x, (x + 1) * ratio_x) by
113 | // [y * ratio_y, (y + 1) * ratio_y). Every source pixel touching that rectangle is weighted by
114 | // the area of its overlap with it, and the weighted sum is divided by the total weight.
115 | // Source pixels that only touch the rectangle's edge (zero overlap) are skipped, so an exact
116 | // 2x reduction averages a 2x2 block rather than a 3x3 one.
117 | void scale_area_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height) {
118 |     f32 ratio_x = (f32)src_width / (f32)dst_width;
119 |     f32 ratio_y = (f32)src_height / (f32)dst_height;
120 |     i32 src_width4 = src_width << 2;
121 |     i32 dst_width4 = dst_width << 2;
122 | 
123 |     if ((((size_t)src | (size_t)dst) & 0x0f) == 0) { // Aligned memory
124 |         for (i32 dst_y = 0; dst_y < dst_height; dst_y++) {
125 |             // Vertical extent of this destination row in source coordinates
126 |             f32 src_lower_y_f = dst_y * ratio_y;
127 |             f32 src_upper_y_f = (dst_y + 1.0f) * ratio_y;
128 |             i32 src_lower_y = max(0, (i32)src_lower_y_f);
129 |             i32 src_upper_y = min(src_height, (i32)src_upper_y_f + 1);
130 |             i32 dst_offset4 = dst_width4 * dst_y;
131 | 
132 |             for (i32 dst_x = 0; dst_x < dst_width; dst_x++) {
133 |                 f32 src_lower_x_f = dst_x * ratio_x;
134 |                 f32 src_upper_x_f = (dst_x + 1.0f) * ratio_x;
135 |                 i32 src_lower_x = max(0, (i32)src_lower_x_f);
136 |                 i32 src_upper_x = min(src_width, (i32)src_upper_x_f + 1);
137 |                 i32 dst_start = (dst_x << 2) + dst_offset4;
138 | 
139 |                 __m128 res = _mm_set1_ps(0.0f);
140 |                 f32 weight_total = 0.0f;
141 | 
142 |                 for (i32 area_y = src_lower_y; area_y < src_upper_y; area_y++) {
143 |                     // Height of the overlap between source row area_y and the footprint
144 |                     f32 top = ((f32)area_y > src_lower_y_f) ? (f32)area_y : src_lower_y_f;
145 |                     f32 bottom = ((f32)(area_y + 1) < src_upper_y_f) ? (f32)(area_y + 1) : src_upper_y_f;
146 |                     f32 cover_y = bottom - top;
147 |                     i32 src_offset4 = src_width4 * area_y;
148 | 
149 |                     for (i32 area_x = src_lower_x; area_x < src_upper_x; area_x++) {
150 |                         // Width of the overlap between source column area_x and the footprint
151 |                         f32 left = ((f32)area_x > src_lower_x_f) ? (f32)area_x : src_lower_x_f;
152 |                         f32 right = ((f32)(area_x + 1) < src_upper_x_f) ? (f32)(area_x + 1) : src_upper_x_f;
153 |                         f32 weight = cover_y * (right - left);
154 | 
155 |                         // Pixel only touches the footprint's edge: contributes nothing
156 |                         if (weight <= 0.0f)
157 |                             continue;
158 | 
159 |                         __m128 p = _mm_load_ps(src + (area_x << 2) + src_offset4);
160 |                         res = _mm_add_ps(res, _mm_mul_ps(p, _mm_set1_ps(weight)));
161 |                         weight_total += weight;
162 |                     }
163 |                 }
164 | 
165 |                 // Normalise; leave zeros if nothing was accumulated to avoid dividing by zero
166 |                 if (weight_total > 0.0f)
167 |                     res = _mm_mul_ps(res, _mm_set1_ps(1.0f / weight_total));
168 | 
169 |                 _mm_store_ps(dst + dst_start, res);
170 |             }
171 |         }
172 |     }
173 |     else { // Unaligned memory
174 |         for (i32 dst_y = 0; dst_y < dst_height; dst_y++) {
175 |             // Vertical extent of this destination row in source coordinates
176 |             f32 src_lower_y_f = dst_y * ratio_y;
177 |             f32 src_upper_y_f = (dst_y + 1.0f) * ratio_y;
178 |             i32 src_lower_y = max(0, (i32)src_lower_y_f);
179 |             i32 src_upper_y = min(src_height, (i32)src_upper_y_f + 1);
180 |             i32 dst_offset4 = dst_width4 * dst_y;
181 | 
182 |             for (i32 dst_x = 0; dst_x < dst_width; dst_x++) {
183 |                 f32 src_lower_x_f = dst_x * ratio_x;
184 |                 f32 src_upper_x_f = (dst_x + 1.0f) * ratio_x;
185 |                 i32 src_lower_x = max(0, (i32)src_lower_x_f);
186 |                 i32 src_upper_x = min(src_width, (i32)src_upper_x_f + 1);
187 |                 i32 dst_start = (dst_x << 2) + dst_offset4;
188 | 
189 |                 __m128 res = _mm_set1_ps(0.0f);
190 |                 f32 weight_total = 0.0f;
191 | 
192 |                 for (i32 area_y = src_lower_y; area_y < src_upper_y; area_y++) {
193 |                     // Height of the overlap between source row area_y and the footprint
194 |                     f32 top = ((f32)area_y > src_lower_y_f) ? (f32)area_y : src_lower_y_f;
195 |                     f32 bottom = ((f32)(area_y + 1) < src_upper_y_f) ? (f32)(area_y + 1) : src_upper_y_f;
196 |                     f32 cover_y = bottom - top;
197 |                     i32 src_offset4 = src_width4 * area_y;
198 | 
199 |                     for (i32 area_x = src_lower_x; area_x < src_upper_x; area_x++) {
200 |                         // Width of the overlap between source column area_x and the footprint
201 |                         f32 left = ((f32)area_x > src_lower_x_f) ? (f32)area_x : src_lower_x_f;
202 |                         f32 right = ((f32)(area_x + 1) < src_upper_x_f) ? (f32)(area_x + 1) : src_upper_x_f;
203 |                         f32 weight = cover_y * (right - left);
204 | 
205 |                         // Pixel only touches the footprint's edge: contributes nothing
206 |                         if (weight <= 0.0f)
207 |                             continue;
208 | 
209 |                         __m128 p = _mm_loadu_ps(src + (area_x << 2) + src_offset4);
210 |                         res = _mm_add_ps(res, _mm_mul_ps(p, _mm_set1_ps(weight)));
211 |                         weight_total += weight;
212 |                     }
213 |                 }
214 | 
215 |                 // Normalise; leave zeros if nothing was accumulated to avoid dividing by zero
216 |                 if (weight_total > 0.0f)
217 |                     res = _mm_mul_ps(res, _mm_set1_ps(1.0f / weight_total));
218 | 
219 |                 _mm_storeu_ps(dst + dst_start, res);
220 |             }
221 |         }
222 |     }
223 | }
224 | 
225 | #elif defined(__arm__) // ARM Neon implementation
226 | 
227 | // Bilinear resize of a 4-channel f32 image using ARM NEON. Neighbour offsets are clamped at the
228 | // last column/row so a source one pixel wide or tall is never read past the end of its buffer.
229 | void scale_bilinear_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height) {
230 |     f32 ratio_x = (f32)(src_width - 1) / (f32)dst_width;
231 |     f32 ratio_y = (f32)(src_height - 1) / (f32)dst_height;
232 |     i32 dst_width4 = dst_width << 2; // floats per destination row
233 |     i32 src_width4 = src_width << 2; // floats per source row
234 | 
235 |     for (i32 dst_y = 0; dst_y < dst_height; dst_y++) {
236 |         f32 src_y_f = (dst_y + 0.5f) * ratio_y;
237 |         i32 src_y = (i32)src_y_f;
238 |         f32 interp_y = src_y_f - src_y;
239 |         // Offset to the row below, or 0 when src_y is already the last row
240 |         i32 next_row = (src_y + 1 < src_height) ? src_width4 : 0;
241 | 
242 |         float32x4_t iy = vdupq_n_f32(interp_y);
243 |         float32x4_t iy1 = vdupq_n_f32(1.0f - interp_y);
244 |         i32 dst_offset4 = dst_width4 * dst_y;
245 |         i32 src_offset4 = src_width4 * src_y;
246 | 
247 |         for (i32 dst_x = 0; dst_x < dst_width; dst_x++) {
248 |             f32 src_x_f = (dst_x + 0.5f) * ratio_x;
249 |             i32 src_x = (i32)src_x_f;
250 |             f32 interp_x = src_x_f - src_x;
251 |             // Offset to the pixel on the right, or 0 on the last column
252 |             i32 next_col = (src_x + 1 < src_width) ? 4 : 0;
253 | 
254 |             i32 dst_start = (dst_x << 2) + dst_offset4;
255 |             f32* src_start00 = src + (src_x << 2) + src_offset4;
256 |             float32x4_t ix = vdupq_n_f32(interp_x); // Use q versions (128 bit)
257 |             float32x4_t ix1 = vdupq_n_f32(1.0f - interp_x);
258 | 
259 |             float32x4_t p00 = vld1q_f32(src_start00);
260 |             float32x4_t p01 = vld1q_f32(src_start00 + next_col);
261 |             float32x4_t p10 = vld1q_f32(src_start00 + next_row);
262 |             float32x4_t p11 = vld1q_f32(src_start00 + next_row + next_col);
263 |             // Blend vertically first, then horizontally
264 |             p00 = vaddq_f32(vmulq_f32(p00, iy1), vmulq_f32(p10, iy));
265 |             p01 = vaddq_f32(vmulq_f32(p01, iy1), vmulq_f32(p11, iy));
266 |             p00 = vaddq_f32(vmulq_f32(p00, ix1), vmulq_f32(p01, ix));
267 |             vst1q_f32(dst + dst_start, p00);
268 |         }
269 |     }
270 | }
271 | 
272 | // Area (box filter) resize of a 4-channel f32 image using ARM NEON. Each source pixel is weighted
273 | // by the area of its overlap with the destination pixel's footprint in source coordinates;
274 | // pixels with zero overlap are skipped, so an exact 2x reduction averages a 2x2 block.
275 | void scale_area_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height) {
276 |     f32 ratio_x = (f32)src_width / (f32)dst_width;
277 |     f32 ratio_y = (f32)src_height / (f32)dst_height;
278 |     i32 src_width4 = src_width << 2;
279 |     i32 dst_width4 = dst_width << 2;
280 | 
281 |     for (i32 dst_y = 0; dst_y < dst_height; dst_y++) {
282 |         // Vertical extent of this destination row in source coordinates
283 |         f32 src_lower_y_f = dst_y * ratio_y;
284 |         f32 src_upper_y_f = (dst_y + 1.0f) * ratio_y;
285 |         i32 src_lower_y = max(0, (i32)src_lower_y_f);
286 |         i32 src_upper_y = min(src_height, (i32)src_upper_y_f + 1);
287 |         i32 dst_offset4 = dst_width4 * dst_y;
288 | 
289 |         for (i32 dst_x = 0; dst_x < dst_width; dst_x++) {
290 |             f32 src_lower_x_f = dst_x * ratio_x;
291 |             f32 src_upper_x_f = (dst_x + 1.0f) * ratio_x;
292 |             i32 src_lower_x = max(0, (i32)src_lower_x_f);
293 |             i32 src_upper_x = min(src_width, (i32)src_upper_x_f + 1);
294 |             i32 dst_start = (dst_x << 2) + dst_offset4;
295 | 
296 |             float32x4_t res = vdupq_n_f32(0.0f);
297 |             f32 weight_total = 0.0f;
298 | 
299 |             for (i32 area_y = src_lower_y; area_y < src_upper_y; area_y++) {
300 |                 // Height of the overlap between source row area_y and the footprint
301 |                 f32 top = ((f32)area_y > src_lower_y_f) ? (f32)area_y : src_lower_y_f;
302 |                 f32 bottom = ((f32)(area_y + 1) < src_upper_y_f) ? (f32)(area_y + 1) : src_upper_y_f;
303 |                 f32 cover_y = bottom - top;
304 |                 i32 src_offset4 = src_width4 * area_y;
305 | 
306 |                 for (i32 area_x = src_lower_x; area_x < src_upper_x; area_x++) {
307 |                     // Width of the overlap between source column area_x and the footprint
308 |                     f32 left = ((f32)area_x > src_lower_x_f) ? (f32)area_x : src_lower_x_f;
309 |                     f32 right = ((f32)(area_x + 1) < src_upper_x_f) ? (f32)(area_x + 1) : src_upper_x_f;
310 |                     f32 weight = cover_y * (right - left);
311 | 
312 |                     // Pixel only touches the footprint's edge: contributes nothing
313 |                     if (weight <= 0.0f)
314 |                         continue;
315 | 
316 |                     float32x4_t p = vld1q_f32(src + (area_x << 2) + src_offset4);
317 |                     res = vaddq_f32(res, vmulq_f32(p, vdupq_n_f32(weight)));
318 |                     weight_total += weight;
319 |                 }
320 |             }
321 | 
322 |             // Normalise; leave zeros if nothing was accumulated to avoid dividing by zero
323 |             if (weight_total > 0.0f)
324 |                 res = vmulq_f32(res, vdupq_n_f32(1.0f / weight_total));
325 | 
326 |             vst1q_f32(dst + dst_start, res);
327 |         }
328 |     }
329 | }
330 | 
331 | #else // No SIMD implementation
332 | 
333 | // Bilinear resize of a 4-channel f32 image (portable fallback without SIMD).
334 | //
335 | // src:        source image, src_width * src_height pixels of 4 floats each
336 | // dst:        destination image, dst_width * dst_height pixels of 4 floats each
337 | // src_width:  source width in pixels
338 | // src_height: source height in pixels
339 | // dst_width:  destination width in pixels
340 | // dst_height: destination height in pixels
341 | //
342 | // Destination pixel (x, y) samples the source at ((x + 0.5) * ratio_x, (y + 0.5) * ratio_y)
343 | // and blends the four surrounding pixels. The right and lower neighbours are clamped at
344 | // the last column/row so that a source which is one pixel wide or tall is never read
345 | // past the end of its buffer.
346 | void scale_bilinear_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height) {
347 |     f32 ratio_x = (f32)(src_width - 1) / (f32)dst_width;
348 |     f32 ratio_y = (f32)(src_height - 1) / (f32)dst_height;
349 |     i32 dst_width4 = dst_width << 2; // floats per destination row
350 |     i32 src_width4 = src_width << 2; // floats per source row
351 | 
352 |     for (i32 dst_y = 0; dst_y < dst_height; dst_y++) {
353 |         f32 src_y_f = (dst_y + 0.5f) * ratio_y;
354 |         i32 src_y = (i32)src_y_f;
355 |         f32 interp_y = src_y_f - src_y;
356 |         f32 interp_y1 = 1.0f - interp_y;
357 |         // Offset to the row below, or 0 when src_y is already the last row
358 |         i32 next_row = (src_y + 1 < src_height) ? src_width4 : 0;
359 | 
360 |         i32 dst_offset4 = dst_width4 * dst_y;
361 |         i32 src_offset4 = src_width4 * src_y;
362 | 
363 |         for (i32 dst_x = 0; dst_x < dst_width; dst_x++) {
364 |             f32 src_x_f = (dst_x + 0.5f) * ratio_x;
365 |             i32 src_x = (i32)src_x_f;
366 |             f32 interp_x = src_x_f - src_x;
367 |             f32 interp_x1 = 1.0f - interp_x;
368 |             // Offset to the pixel on the right, or 0 on the last column
369 |             i32 next_col = (src_x + 1 < src_width) ? 4 : 0;
370 | 
371 |             i32 dst_start = (dst_x << 2) + dst_offset4;
372 |             i32 src_start00 = (src_x << 2) + src_offset4;
373 | 
374 |             // Per channel: blend vertically on the left and right columns, then horizontally
375 |             for (i32 c = 0; c < 4; c++) {
376 |                 f32 left = interp_y1 * src[src_start00 + c] + interp_y * src[src_start00 + next_row + c];
377 |                 f32 right = interp_y1 * src[src_start00 + next_col + c] + interp_y * src[src_start00 + next_row + next_col + c];
378 |                 dst[dst_start + c] = interp_x1 * left + interp_x * right;
379 |             }
380 |         }
381 |     }
382 | }
383 | 
384 | // Area (box filter) resize of a 4-channel f32 image (portable fallback without SIMD). Each source
385 | // pixel is weighted by the area of its overlap with the destination pixel's footprint in source
386 | // coordinates; pixels with zero overlap are skipped, so an exact 2x reduction averages a 2x2 block.
387 | void scale_area_4f32(f32 src[], f32 dst[], i32 src_width, i32 src_height, i32 dst_width, i32 dst_height) {
388 |     f32 ratio_x = (f32)src_width / (f32)dst_width;
389 |     f32 ratio_y = (f32)src_height / (f32)dst_height;
390 |     i32 src_width4 = src_width << 2;
391 |     i32 dst_width4 = dst_width << 2;
392 | 
393 |     for (i32 dst_y = 0; dst_y < dst_height; dst_y++) {
394 |         // Vertical extent of this destination row in source coordinates
395 |         f32 src_lower_y_f = dst_y * ratio_y;
396 |         f32 src_upper_y_f = (dst_y + 1.0f) * ratio_y;
397 |         i32 src_lower_y = max(0, (i32)src_lower_y_f);
398 |         i32 src_upper_y = min(src_height, (i32)src_upper_y_f + 1);
399 |         i32 dst_offset4 = dst_width4 * dst_y;
400 | 
401 |         for (i32 dst_x = 0; dst_x < dst_width; dst_x++) {
402 |             f32 src_lower_x_f = dst_x * ratio_x;
403 |             f32 src_upper_x_f = (dst_x + 1.0f) * ratio_x;
404 |             i32 src_lower_x = max(0, (i32)src_lower_x_f);
405 |             i32 src_upper_x = min(src_width, (i32)src_upper_x_f + 1);
406 |             i32 dst_start = (dst_x << 2) + dst_offset4;
407 | 
408 |             f32 r = 0.0f;
409 |             f32 g = 0.0f;
410 |             f32 b = 0.0f;
411 |             f32 a = 0.0f;
412 |             f32 weight_total = 0.0f;
413 | 
414 |             for (i32 area_y = src_lower_y; area_y < src_upper_y; area_y++) {
415 |                 // Height of the overlap between source row area_y and the footprint
416 |                 f32 top = ((f32)area_y > src_lower_y_f) ? (f32)area_y : src_lower_y_f;
417 |                 f32 bottom = ((f32)(area_y + 1) < src_upper_y_f) ? (f32)(area_y + 1) : src_upper_y_f;
418 |                 f32 cover_y = bottom - top;
419 |                 i32 src_offset4 = src_width4 * area_y;
420 | 
421 |                 for (i32 area_x = src_lower_x; area_x < src_upper_x; area_x++) {
422 |                     // Width of the overlap between source column area_x and the footprint
423 |                     f32 left = ((f32)area_x > src_lower_x_f) ? (f32)area_x : src_lower_x_f;
424 |                     f32 right = ((f32)(area_x + 1) < src_upper_x_f) ? (f32)(area_x + 1) : src_upper_x_f;
425 |                     f32 weight = cover_y * (right - left);
426 |                     // Pixel only touches the footprint's edge: contributes nothing
427 |                     if (weight <= 0.0f)
428 |                         continue;
429 |                     i32 src_start = (area_x << 2) + src_offset4;
430 |                     r += weight * src[src_start    ];
431 |                     g += weight * src[src_start + 1];
432 |                     b += weight * src[src_start + 2];
433 |                     a += weight * src[src_start + 3];
434 |                     weight_total += weight;
435 |                 }
436 |             }
437 | 
438 |             // Normalise; write zeros if nothing was accumulated to avoid dividing by zero
439 |             f32 div = (weight_total > 0.0f) ? 1.0f / weight_total : 0.0f;
440 |             dst[dst_start    ] = r * div;
441 |             dst[dst_start + 1] = g * div;
442 |             dst[dst_start + 2] = b * div;
443 |             dst[dst_start + 3] = a * div;
444 |         }
445 |     }
446 | }
447 | 
448 | #endif
449 | 


--------------------------------------------------------------------------------