├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── links-checker.yml
    │   └── runtime-docker.yml
├── .gitignore
├── .gitlab-ci.yml
├── LICENSE
├── README.md
├── bench
    ├── README.md
    ├── client
    │   └── locustfile.py
    ├── fast-api
    │   └── main.py
    ├── flask-server
    │   └── app.py
    ├── go-http-server
    │   ├── go.mod
    │   └── main.go
    └── http-server
    │   └── server.py
├── deploy
    ├── README.md
    └── docker
    │   ├── Dockerfile
    │   ├── entrypoint.sh
    │   └── start.sh
├── docs
    └── README.md
├── examples
    ├── imagenet_from_disk.py
    ├── imagenet_in_memory.py
    ├── in_memory_notebook.ipynb
    └── remote_execution.ipynb
├── runtime
    ├── Makefile
    ├── README.md
    └── python
    │   ├── Dockerfile
    │   ├── Makefile
    │   ├── bootstrap.py
    │   ├── builder.sh
    │   ├── io-comm
    │       ├── Makefile
    │       ├── cmn.go
    │       ├── go.mod
    │       └── main.go
    │   └── server.py
└── transformers
    ├── FFmpeg
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── benchmark.py
        ├── fastapi_server.py
        ├── flask_server.py
        ├── http_server.py
        └── pod.yaml
    ├── Makefile
    ├── NeMo
        └── audio_split_consolidate
        │   ├── README.md
        │   ├── audio_manager
        │       ├── Dockerfile
        │       ├── Makefile
        │       ├── fastapi_server.py
        │       └── pod.yaml
        │   ├── audio_split_consolidate_diagram.png
        │   └── audio_splitter
        │       ├── Dockerfile
        │       ├── Makefile
        │       ├── fastapi_server.py
        │       └── pod.yaml
    ├── README.md
    ├── batch_rename
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── fastapi_server.py
        └── pod.yaml
    ├── benchmarks
        └── audio_split_consolidate.py
    ├── compress
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── pod.yaml
        ├── requirements.txt
        └── server.py
    ├── echo
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── fastapi_server.py
        ├── flask_server.py
        ├── http_server.py
        └── pod.yaml
    ├── face_detection
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── main.py
        ├── pod.yaml
        ├── requirements.txt
        └── sample
        │   └── output_face_detection.png
    ├── go_FFmpeg
        ├── Dockerfile
        ├── Makefile
        ├── pod.yaml
        └── src
        │   ├── go.mod
        │   ├── go.sum
        │   ├── main.go
        │   └── main_test.go
    ├── go_echo
        ├── Dockerfile
        ├── Makefile
        ├── pod.yaml
        └── src
        │   ├── go.mod
        │   ├── go.sum
        │   └── main.go
    ├── go_hello_world
        ├── Dockerfile
        ├── Makefile
        ├── pod.yaml
        └── src
        │   ├── go.mod
        │   ├── go.sum
        │   └── main.go
    ├── hash_with_args
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── fastapi_server.py
        ├── flask_server.py
        ├── http_server.py
        ├── pod.yaml
        ├── requirements.txt
        └── server.py
    ├── hello_world
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── fastapi_server.py
        ├── flask_server.py
        ├── http_server.py
        └── pod.yaml
    ├── keras_preprocess
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── flask-gunicorn
        │   ├── Dockerfile
        │   ├── Makefile
        │   ├── app.py
        │   ├── pod.yaml
        │   └── requirements.txt
        ├── http-multithreaded-server
        │   ├── Dockerfile
        │   ├── Makefile
        │   ├── pod.yaml
        │   ├── requirements.txt
        │   └── server.py
        ├── main.py
        ├── pod.yaml
        └── requirements.txt
    ├── md5
        ├── Dockerfile
        ├── Makefile
        ├── fastapi_server.py
        ├── flask_server.py
        ├── http_server.py
        └── pod.yaml
    ├── tar2tf
        ├── .dockerignore
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── pod.yaml
        └── src
        │   ├── cached.go
        │   ├── cmn
        │       ├── assert.go
        │       ├── cmn.go
        │       └── io.go
        │   ├── go.mod
        │   ├── go.sum
        │   ├── main.go
        │   ├── tar-single.tar
        │   ├── tar2tf_test.go
        │   └── transforms
        │       ├── job.go
        │       └── pipeline.go
    ├── tests
        ├── __init__.py
        ├── base.py
        ├── conftest.py
        ├── const.py
        ├── local_benchmark
        │   └── ffmpeg_benchmark.py
        ├── requirements.txt
        ├── resources
        │   ├── test-audio-flac.flac
        │   ├── test-audio-mp3.mp3
        │   ├── test-audio-wav.wav
        │   ├── test-face-detection.png
        │   ├── test-image.jpg
        │   ├── test-image.jpg.bz2
        │   ├── test-image.jpg.gz
        │   ├── test-manifest.jsonl
        │   ├── test-tar-single.tar
        │   ├── test-text.txt
        │   ├── test-text.txt.bz2
        │   └── test-text.txt.gz
        ├── test_audio_split.py
        ├── test_audio_split_consolidate.py
        ├── test_batch_rename.py
        ├── test_compress.py
        ├── test_echo.py
        ├── test_echo_stress.py
        ├── test_face_detection.py
        ├── test_face_detection_stress.py
        ├── test_ffmpeg.py
        ├── test_hash_with_args.py
        ├── test_hello_world.py
        ├── test_hello_world_stress.py
        ├── test_keras_stress.py
        ├── test_keras_transformer.py
        ├── test_md5.py
        ├── test_md5_stress.py
        ├── test_tar2tf.py
        ├── test_torchvision_transformer.py
        └── utils.py
    └── torchvision_preprocess
        ├── Dockerfile
        ├── Makefile
        ├── README.md
        ├── http-multithreaded-server
            ├── Dockerfile
            ├── Makefile
            ├── pod.yaml
            ├── requirements.txt
            └── server.py
        ├── main.py
        ├── pod.yaml
        └── requirements.txt


/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # Please see the documentation for all configuration options:
 2 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 3 | # and
 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
 5 | 
 6 | version: 2
 7 | updates:
 8 |   - package-ecosystem: "github-actions"
 9 |     directory: "/"
10 |     schedule:
11 |       interval: "weekly"
12 |   - package-ecosystem: "docker"
13 |     directory: "/"  # NOTE(review): no Dockerfile is visible at the repo root (they sit under deploy/, runtime/, transformers/) - confirm "/" is intended
14 |     schedule:
15 |       interval: "weekly"
16 |   - package-ecosystem: "pip"
17 |     directory: "/"  # NOTE(review): no requirements.txt is visible at the repo root - confirm "/" is intended
18 |     schedule:
19 |       interval: "weekly"
20 |   - package-ecosystem: "gomod"
21 |     directory: "/"  # NOTE(review): no go.mod is visible at the repo root - confirm "/" is intended
22 |     schedule:
23 |       interval: "weekly"


--------------------------------------------------------------------------------
/.github/workflows/links-checker.yml:
--------------------------------------------------------------------------------
 1 | name: Links Checker
 2 | 
 3 | on:
 4 |   repository_dispatch:
 5 |   workflow_dispatch:
 6 |   schedule:
 7 |     - cron: "00 12 * * 1"
 8 | 
 9 | jobs:
10 |   linkChecker:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v4
14 | 
15 |       - name: Link Checker
16 |         id: lychee
17 |         uses: lycheeverse/lychee-action@v2
18 |         with:
19 |             fail: true


--------------------------------------------------------------------------------
/.github/workflows/runtime-docker.yml:
--------------------------------------------------------------------------------
 1 | name: Python Runtime Docker Images
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |   push:
 6 |     paths:
 7 |       - 'runtime/python/**'
 8 | 
 9 | env:
10 |   RUNTIME_IMAGE: 'aistorage/runtime_python'
11 |   REGISTRY_URL: 'docker.io/aistorage'
12 | 
13 | jobs:
14 |   docker:
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |     - uses: actions/checkout@v4
18 |     - name: Login to DockerHub
19 |       uses: docker/login-action@v3
20 |       with:
21 |         username: ${{ secrets.DOCKERHUB_USERNAME }}
22 |         password: ${{ secrets.DOCKERHUB_TOKEN }}
23 |     - name: Python Runtime Images
24 |       run: |
25 |         pushd $GITHUB_WORKSPACE/runtime
26 |         make all
27 |         popd
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | cover/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | db.sqlite3-journal
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | .pybuilder/
 77 | target/
 78 | 
 79 | # Jupyter Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # IPython
 83 | profile_default/
 84 | ipython_config.py
 85 | 
 86 | # pyenv
 87 | #   For a library or package, you might want to ignore these files since the code is
 88 | #   intended to run in multiple environments; otherwise, check them in:
 89 | # .python-version
 90 | 
 91 | # pipenv
 92 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 93 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 94 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 95 | #   install all needed dependencies.
 96 | #Pipfile.lock
 97 | 
 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 99 | __pypackages__/
100 | 
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 | 
105 | # SageMath parsed files
106 | *.sage.py
107 | 
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 | 
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 | 
121 | # Rope project settings
122 | .ropeproject
123 | 
124 | # mkdocs documentation
125 | /site
126 | 
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 | 
132 | # Pyre type checker
133 | .pyre/
134 | 
135 | # pytype static type analyzer
136 | .pytype/
137 | 
138 | # Cython debug symbols
139 | cython_debug/
140 | 
141 | # JetBrains IDE 
142 | .idea/
143 | transformers/face_detection/model/weights.caffemodel
144 | transformers/face_detection/model/architecture.txt
145 | transformers/tests/metrics.txt
146 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 NVIDIA Corporation
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This repository contains:
2 | * [transformers](/transformers/README.md) - a set of ETL transformers that are ready to be deployed on an AIStore cluster.
3 | * [runtime](/runtime/README.md) - ETL runtime definitions that are used when starting transformers with code.
4 | 
5 | Please also see the main [AIStore repository](https://github.com/NVIDIA/aistore) and [AIStore documentation](https://aiatscale.org/docs).
6 | 


--------------------------------------------------------------------------------
/bench/README.md:
--------------------------------------------------------------------------------
 1 | # Benchmarking AIStore ETL
 2 | 
 3 | You have the flexibility to customize your own ETL pipelines in AIStore. You can choose the language (Python, Go, etc.) and web server implementation. With so many options, it can get complicated to select the right ones. 
 4 | 
 5 | This directory provides sample web server implementations and benchmarks their performance in terms of request handling capacity.
 6 | 
 7 | ## Web Servers
 8 | 
 9 | There are many frameworks available for running web servers. Below is a comparison of web servers, frameworks, languages, and locations of basic implementations that can run them.
10 | 
11 | | Language | Framework | Web Server | Location | Remarks |
12 | |-|-|-|-|-|  
13 | | Python | - | ThreadedHTTPServer | [/http-server](bench/http-server/) | Built-in to Python, very easy to implement, doesn't scale well |
14 | | Python | Flask | Flask Built-in Webserver | [/flask-server](bench/flask-server/) | Built-in flask webserver, not suited for production |
15 | | Python | Flask | [Gunicorn](https://gunicorn.org/) | [/flask-server](bench/flask-server/) | Python WSGI HTTP server, scales well |
16 | | Python | [FastAPI](https://fastapi.tiangolo.com/) | [Uvicorn](https://www.uvicorn.org/) | [/fast-api](bench/fast-api/) | ASGI web server implementation for Python |
17 | | Python | [FastAPI](https://fastapi.tiangolo.com/) | [Uvicorn](https://www.uvicorn.org/) + [Gunicorn](https://gunicorn.org/) | [/fast-api](bench/fast-api/) | Gunicorn manages multiple Uvicorn processes |
18 | | Go | Go | Net/HTTP Server | [/go-http-server](bench/go-http-server/) | Built-in to Go, easy to implement, scales well |
19 | 
20 | To benchmark these servers on your infrastructure, you can use the [client](bench/client). The client is based on [Locust](https://locust.io/), a simple open source load testing tool.
21 | 
22 | Here are sample results from a 12 core/16GB machine:
23 | 
24 | | Language | Framework | Web Server | Location | Avg. Requests Per Second |
25 | |-|-|-|-|-|
26 | | Python | - | ThreadedHTTPServer | [/http-server](bench/http-server/) | 1020 |  
27 | | Python | Flask | Flask Built-in Webserver | [/flask-server](bench/flask-server/) | 950 |
28 | | Python | Flask | [Gunicorn](https://gunicorn.org/) | [/flask-server](bench/flask-server/) | 1060 |
29 | | Python | [FastAPI](https://fastapi.tiangolo.com/) | [Uvicorn](https://www.uvicorn.org/) | [/fast-api](bench/fast-api/) | 1620 |
30 | | Python | [FastAPI](https://fastapi.tiangolo.com/) | [Uvicorn](https://www.uvicorn.org/) + [Gunicorn](https://gunicorn.org/) | [/fast-api](bench/fast-api/) | 1670 | 
31 | | Go | Go | Net/HTTP Server | [/go-http-server](bench/go-http-server/) | 1675 |
32 | 
33 | An important consideration is how your ETL container pods will communicate with the AIStore cluster. There are several [communication mechanisms](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms) to choose from depending on your needs. There's no one perfect solution - pick the mechanism that best fits your ETL workflow.
34 | 
35 | 


--------------------------------------------------------------------------------
/bench/client/locustfile.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Test client for all the webservers.
 3 | 
 4 | Steps to run:
 5 | $ pip install locust
 6 | $ locust
 7 | 
 8 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 9 | """
10 | 
11 | from locust import HttpUser, task
12 | 
13 | 
14 | class MyTestUser(HttpUser):
15 |     @task
16 |     def test_put_request(self):
17 |         self._perform_put_request()
18 | 
19 |     @task
20 |     def test_get_request(self):
21 |         self._perform_get_request()
22 | 
23 |     def _perform_put_request(self):
24 |         url = "/"
25 |         data = "test"
26 |         self.client.put(url=url, data=data)
27 | 
28 |     def _perform_get_request(self):
29 |         url = "/"
30 |         self.client.get(url=url)
31 | 


--------------------------------------------------------------------------------
/bench/fast-api/main.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A basic web server using FastAPI for demonstration purposes.
 3 | 
 4 | Steps to run: 
 5 | $ # with uvicorn
 6 | $ uvicorn main:app --reload 
 7 | $ # with multiple uvicorn processes managed by gunicorn
 8 | $ gunicorn main:app --workers 4 --worker-class uvicorn.workers.UvicornWorker --bind 0.0.0.0:8000 
 9 | 
10 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
11 | """
12 | from fastapi import FastAPI, Request
13 | 
14 | app = FastAPI()
15 | 
16 | @app.put("/")
17 | @app.put("/{full_path:path}")
18 | async def put_handler(request: Request, full_path: str):
19 |     """
20 |     Handles PUT requests.
21 |     Reads bytes from the request, performs byte transformation,
22 |     and returns the modified bytes.
23 |     """
24 |     # Read bytes from request (request.body)
25 |     # Transform the bytes
26 |     # Return the transformed bytes
27 |     return b"Hello World from PUT!"
28 | 
29 | @app.get("/")
30 | @app.get("/{full_path:path}")
31 | async def get_handler(request: Request, full_path: str):
32 |     """
33 |     Handles GET requests.
34 |     Retrieves the destination/name of the object from the URL or the full_path variable,
35 |     fetches the object from the AIS target based on the destination/name,
36 |     transforms the bytes, and returns the modified bytes.
37 |     """
38 |     # Get destination/name of object from URL or from full_path variable
39 |     # Fetch object from AIS target based on the destination/name
40 |     # Perform byte transformation
41 |     # Return the transformed bytes
42 |     return b"Hello World from GET!"
43 | 


--------------------------------------------------------------------------------
/bench/flask-server/app.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A basic web server using Flask for demonstration purposes.
 3 | 
 4 | Steps to run:
 5 | $ # with built-in flask server
 6 | $ flask --app app run
 7 | $ # with gunicorn
 8 | $ gunicorn -w 4 'app:app'
 9 | 
10 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
11 | """
12 | import logging
13 | from flask import Flask, request
14 | 
15 | app = Flask(__name__)
16 | 
17 | 
18 | @app.route("/", defaults={"path": ""}, methods=["PUT", "GET"])
19 | @app.route("/<path:path>", methods=["PUT", "GET"])
20 | def image_handler(path):
21 |     try:
22 |         if request.method == "PUT":
23 |             # Read the request body
24 |             # Transform the bytes
25 |             # Return the transformed bytes
26 |             transformed_data = b"Hello World!"
27 |             return transformed_data, 200
28 | 
29 |         elif request.method == "GET":
30 |             # Get the destination/name of the object from the URL or the path variable
31 |             # Fetch the object from the AIS target based on the destination/name
32 |             # Use request.get(ais_target_url + "/" + path).get to get the object
33 |             # Transform the bytes
34 |             # Return the transformed bytes
35 |             transformed_data = b"Hello World!"
36 |             return transformed_data, 200
37 | 
38 |     except Exception as exception:
39 |         logging.error("Error processing request: %s", str(exception))
40 |         return "Data processing failed", 500
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     app.run()
45 | 


--------------------------------------------------------------------------------
/bench/go-http-server/go.mod:
--------------------------------------------------------------------------------
1 | module main
2 | 
3 | go 1.21
4 | 


--------------------------------------------------------------------------------
/bench/go-http-server/main.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * A basic webserver using golang
 3 |  *
 4 |  * Steps to run:
 5 |  * $ go run main.go
 6 |  *
 7 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 8 |  */
 9 | package main
10 | 
11 | import (
12 | 	"flag"
13 | 	"fmt"
14 | 	"io"
15 | 	"log"
16 | 	"net/http"
17 | 	"os"
18 | )
19 | 
20 | var (
21 | 	endpoint string
22 | 
23 | 	logger *log.Logger
24 | )
25 | 
26 | func initVars(ipAddress string, port int) {
27 | 	endpoint = fmt.Sprintf("%s:%d", ipAddress, port)
28 | }
29 | 
30 | func main() { // entry point: parse flags, set up logging, then serve HTTP
31 | 	var (
32 | 		ipAddressArg = flag.String("l", "localhost", "Specify the IP address on which the server listens")
33 | 		portArg      = flag.Int("p", 8000, "Specify the port on which the server listens")
34 | 	)
35 | 
36 | 	flag.Parse()
37 | 
38 | 	initVars(*ipAddressArg, *portArg) // fills the package-level "host:port" endpoint
39 | 
40 | 	logger = log.New(os.Stdout, "[TestServer] ", log.LstdFlags|log.Lmicroseconds|log.Lshortfile) // date/time with microseconds plus file:line
41 | 
42 | 	http.HandleFunc("/", requestHandler) // registered on the default mux for the "/" pattern
43 | 
44 | 	logger.Printf("Starting hello world transformer at %s", endpoint)
45 | 	logger.Fatal(http.ListenAndServe(endpoint, nil)) // nil handler selects the default mux; Fatal exits once ListenAndServe returns
46 | }
47 | 
48 | // requestHandler dispatches PUT and GET requests to their handlers and
49 | // rejects every other method with 405 Method Not Allowed.
50 | func requestHandler(w http.ResponseWriter, r *http.Request) {
51 | 	switch r.Method {
52 | 	case http.MethodPut:
53 | 		putHandler(w, r)
54 | 	case http.MethodGet:
55 | 		geHandler(w, r)
56 | 	default:
57 | 		// 405 plus an Allow header is the standard reply to an unsupported
58 | 		// method; 400 would wrongly suggest a malformed request.
59 | 		w.Header().Set("Allow", http.MethodGet+", "+http.MethodPut)
60 | 		http.Error(w, fmt.Sprintf("Invalid HTTP method %q, expected %q or %q", r.Method, http.MethodPut, http.MethodGet), http.StatusMethodNotAllowed)
61 | 	}
62 | }
58 | 
59 | // PUT /
60 | // putHandler drains the request body and, only when that succeeded, replies
61 | // with the fixed payload.
62 | func putHandler(w http.ResponseWriter, r *http.Request) {
63 | 	escapePath := r.URL.EscapedPath()
64 | 	defer r.Body.Close()
65 | 	if !readContent(w, r.Body, r.ContentLength, escapePath) {
66 | 		// An error response has already been sent; do not append the payload.
67 | 		return
68 | 	}
69 | 	writeContent(w, escapePath)
70 | }
71 | 
72 | // GET /
73 | // geHandler replies with the fixed payload.
74 | func geHandler(w http.ResponseWriter, r *http.Request) {
75 | 	writeContent(w, r.URL.Path)
76 | }
77 | 
78 | // logAndRespondError logs msg together with err and sends msg to the client
79 | // with the given HTTP status.
80 | func logAndRespondError(w http.ResponseWriter, err error, msg string, status int) {
81 | 	logError(err, msg)
82 | 	http.Error(w, msg, status)
83 | }
84 | 
85 | // logError writes "msg: err" to the package logger, or just msg when err is nil.
86 | func logError(err error, msg string) {
87 | 	if err == nil {
88 | 		logger.Println(msg)
89 | 		return
90 | 	}
91 | 	logger.Printf("%s: %v\n", msg, err)
92 | }
93 | 
94 | // readContent reads and discards body. It reports true when the body was read
95 | // completely and, if contentLength is positive, its size matched. On failure
96 | // it sends a 400 response itself and reports false.
97 | func readContent(w http.ResponseWriter, body io.ReadCloser, contentLength int64, path string) bool {
98 | 	n, err := io.Copy(io.Discard, body)
99 | 	if err != nil {
100 | 		logAndRespondError(w, err, fmt.Sprintf("Error reading request body for %q", path), http.StatusBadRequest)
101 | 		return false
102 | 	}
103 | 	if contentLength > 0 && contentLength != n {
104 | 		logAndRespondError(w, nil, fmt.Sprintf("Content length mismatch for %q", path), http.StatusBadRequest)
105 | 		return false
106 | 	}
107 | 	return true
108 | }
109 | 
110 | // writeContent sends the fixed "Hello World!" payload; a write failure is only logged.
111 | func writeContent(w http.ResponseWriter, path string) {
112 | 	if _, err := w.Write([]byte("Hello World!")); err != nil {
113 | 		logError(err, fmt.Sprintf("Error writing response for %q", path))
114 | 	}
115 | }
99 | 


--------------------------------------------------------------------------------
/bench/http-server/server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Basic HTTP Multithreaded Server.
 3 | 
 4 | Steps to run:
 5 | $ python server.py
 6 | 
 7 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 8 | """
 9 | import argparse
10 | from http.server import HTTPServer, BaseHTTPRequestHandler
11 | from socketserver import ThreadingMixIn
12 | 
13 | 
14 | class Handler(BaseHTTPRequestHandler):
15 |     def log_request(self, code="-", size="-"):
16 |         # Don't log successful requests info. Unsuccessful logged by log_error().
17 |         pass
18 | 
19 |     def _set_headers(self):
20 |         self.send_response(200)
21 |         self.send_header("Content-Type", "text/plain")
22 |         self.end_headers()
23 | 
24 |     def do_PUT(self):
25 |         self._set_headers()
26 |         self.wfile.write(b"Hello World!")
27 | 
28 |     def do_GET(self):
29 |         if self.path == "/health":
30 |             self._set_headers()
31 |             self.wfile.write(b"Running")
32 |             return
33 |         self._set_headers()
34 |         self.wfile.write(b"Hello World!")
35 | 
36 | 
37 | class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
38 |     """Handle requests in a separate thread."""
39 | 
40 | 
41 | def run(addr="localhost", port=8000):
42 |     server = ThreadedHTTPServer((addr, port), Handler)
43 |     print(f"Starting HTTP server on {addr}:{port}")
44 |     server.serve_forever()
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     parser = argparse.ArgumentParser(description="Run a simple HTTP server")
49 |     parser.add_argument(
50 |         "-l",
51 |         "--listen",
52 |         default="localhost",
53 |         help="Specify the IP address on which the server listens",
54 |     )
55 |     parser.add_argument(
56 |         "-p",
57 |         "--port",
58 |         type=int,
59 |         default=8000,
60 |         help="Specify the port on which the server listens",
61 |     )
62 |     args = parser.parse_args()
63 |     run(addr=args.listen, port=args.port)
64 | 


--------------------------------------------------------------------------------
/deploy/README.md:
--------------------------------------------------------------------------------
 1 | ## tar2tf Demo - Docker
 2 | 
 3 | The tar2tf Docker deployment demonstrates the capabilities of the tar2tf module.
 4 | Inside a Docker container it creates a ready-to-use setup for interacting with tar2tf.
 5 | 
 6 | ```console
 7 | $ ./docker/start.sh
 8 | ```
 9 | 
10 | This command will build and start the Docker container, output logs to the current terminal window, deploy an AIS cluster with
11 | a `tar-bucket` bucket, upload the necessary data, and start a Jupyter notebook server.  
12 | 
13 | To begin the demo, go to `localhost:8888` or to the link displayed by Jupyter in the console.
14 | Go to `examples/in_memory_notebook.ipynb` and interact with it.
15 | 
16 | Please note that the first build might take a lot of time, as it has to fetch all necessary dependencies.
17 | Subsequent builds will be much faster, thanks to docker caching.
18 | 
19 | To stop the container, click the Jupyter `Shut Down` button in the browser or send `kill` to the console.
20 | 
21 | ### Datasets
22 | 
23 | By default, `gs://lpr-imagenet/imagenet_train-{0000..0002}.tgz` tars will be downloaded and uploaded to `tar-bucket`.
24 | 
25 | To use locally stored datasets, specify path to the directory in the command line with option `-v`.
26 | 
27 | ```console
28 | $ ./docker/start.sh -v=/home/user/dataset/
29 | ```


--------------------------------------------------------------------------------
/deploy/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Demo image: Go toolchain plus Python/Jupyter tooling and an AIStore checkout.
 2 | FROM golang:1.13-buster
 3 | 
 4 | RUN apt-get clean && apt-get update &&\
 5 |   set -eux &&\
 6 |   apt-get --no-install-recommends -y install curl git ca-certificates wget vim python3-setuptools python3 python3-pip \
 7 |    python3-venv sysstat attr net-tools iproute2 build-essential lsof iputils-ping fuse &&\
 8 |   apt-get -y clean all
 9 | 
10 | RUN pip3 install awscli
11 | 
12 | # Cloud provider selector read by entrypoint.sh (start.sh passes 0 by default, 1 for AWS, 2 for GCP).
13 | ARG cld_provider=0
14 | # key=value form; the space-separated "ENV key value" form is legacy syntax.
15 | ENV CLD_PROVIDER=${cld_provider}
16 | 
17 | # NOTE(review): requirements-jupyter.txt is not visible in this repository listing - confirm it exists in the build context.
18 | COPY requirements-jupyter.txt .
19 | RUN pip3 install virtualenv && virtualenv -p /usr/bin/python3 /venv && . /venv/bin/activate && \
20 |     pip3 install -r requirements-jupyter.txt && rm requirements-jupyter.txt
21 | 
22 | RUN mkdir -p $GOPATH/src/github.com/NVIDIA/ && git clone https://github.com/NVIDIA/aistore.git $GOPATH/src/github.com/NVIDIA/aistore && echo "$GOPATH/src/github.com/NVIDIA/" && ls $GOPATH/src/github.com/NVIDIA/
23 | COPY . $GOPATH/src/github.com/NVIDIA/ais-tar2tf/
24 | 
25 | WORKDIR $GOPATH/src/github.com/NVIDIA/ais-tar2tf/
26 | # Port used by the Jupyter server started in entrypoint.sh.
27 | EXPOSE 8888
28 | 
29 | ENTRYPOINT [ "sh", "-c", "$GOPATH/src/github.com/NVIDIA/ais-tar2tf/deploy/docker/entrypoint.sh" ]
25 | 


--------------------------------------------------------------------------------
/deploy/docker/entrypoint.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Container entrypoint: deploy a local AIS cluster, fill `tar-bucket` with
 4 | # tar archives (from the mounted data dir or a default download), then start
 5 | # Jupyter Lab on port 8888.
 6 | 
 7 | BUCKET=docker_local_bucket
 8 | AISTORE_PATH=$GOPATH/src/github.com/NVIDIA/aistore
 9 | DOCKER_DATA_DIR="/data"
10 | 
11 | if [[ ${CLD_PROVIDER} == 1 ]]; then
12 |     BUCKET=${HOSTNAME}
13 |     aws s3api create-bucket --bucket ${BUCKET} --region ${AWS_DEFAULT_REGION} --create-bucket-configuration LocationConstraint=${AWS_DEFAULT_REGION}
14 | elif [[ ${CLD_PROVIDER} == 2 ]]; then
15 |     BUCKET=smth # TODO:
16 | fi
17 | 
18 | # Remove the cloud bucket created above when the script exits.
19 | function cleanup {
20 |     if [[ ${CLD_PROVIDER} == 1 ]]; then
21 |         aws s3 rb s3://${BUCKET} --force
22 |     elif [[ ${CLD_PROVIDER} == 2 ]]; then
23 |         : # TODO: currently noop
24 |     fi
25 | }
26 | trap cleanup EXIT
27 | 
28 | pushd "$AISTORE_PATH" > /dev/null
29 | # The echoed lines answer the interactive prompts of `make deploy`.
30 | (echo -e "4\n4\n3\n${CLD_PROVIDER}" | make deploy) && make cli && sleep 5
31 | popd > /dev/null
32 | 
33 | ais create bucket tar-bucket
34 | if [[ -d "$DOCKER_DATA_DIR" ]]; then
35 |    # Dots are escaped so only real archive extensions match; with a bare "."
36 |    # any file whose name merely ends in e.g. "tar" would be uploaded too.
37 |    find "$DOCKER_DATA_DIR" -type f -regex '.*\.\(tar\.gz\|tar\|tar\.xz\|tgz\|txz\)' -exec ais put {} ais://tar-bucket --progress --verbose \;
38 | else
39 |    ais show download $(ais start download "gs://lpr-imagenet/imagenet_train-{0000..0002}.tgz" ais://tar-bucket) --progress
40 | fi
41 | source /venv/bin/activate && jupyter lab --port=8888 --no-browser --ip=0.0.0.0 --allow-root
42 | 
43 | exit
36 | 


--------------------------------------------------------------------------------
/deploy/docker/start.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | S_PATH=$(cd "$(dirname "$0")"; pwd -P)
 4 | AIS_TAR2TF_PATH=$(cd "$S_PATH/../.."; pwd -P)
 5 | CONTAINER_NAME=ais-tar2tf
 6 | TAG_NAME=ais-tar2tf
 7 | CLD_PROVIDER=0
 8 | RUN_FLAGS=""
 9 | MOUNT_FLAG=""
10 | DOCKER_DATA_DIR="/data/"
11 | 
12 | # Refuse to start when something already listens on the Jupyter port (8888).
13 | # The check is skipped on hosts that have no netstat.
14 | if command -v netstat >/dev/null 2>&1; then
15 |   if netstat -tuln 2>/dev/null | grep -q ':8888[[:space:]]'; then
16 |     echo "Make sure that nothing is listening on port 8888"
17 |     exit 1
18 |   fi
19 | fi
16 | 
17 | for i in "$@"; do
18 | case ${i} in
19 |     --name=*)
20 |         CONTAINER_NAME="${i#*=}"
21 |         ;;
22 | 
23 |     --v=*)
24 |         # Host path mounted at DOCKER_DATA_DIR inside the container.
25 |         MOUNT_FLAG="-v ${i#*=}:${DOCKER_DATA_DIR}"
26 |         ;;
27 | 
28 |     --aws=*)
29 |         ENV_DIR="${i#*=}"
30 |         ENV_DIR="${ENV_DIR/#\~/$HOME}" # expand a leading "~" (not expanded after "=")
31 |         if [[ ! -d ${ENV_DIR} ]]; then
32 |             echo "${ENV_DIR} is not directory"
33 |             exit 1
34 |         fi
35 | 
36 |         # Turn the AWS `credentials` and `config` files into a docker env-file:
37 |         # drop the "[default]" header, rewrite "key = value" as "key=value" and
38 |         # rename the keys to the corresponding environment variables.
39 |         # A single sed pass with output redirection avoids `sed -i`, whose
40 |         # syntax differs between GNU and BSD sed.
41 |         TMP_FILE="${ENV_DIR}/.aws.env"
42 |         sed -e 's/\[default\]//g' \
43 |             -e 's/ = /=/g' \
44 |             -e 's/aws_access_key_id/AWS_ACCESS_KEY_ID/g' \
45 |             -e 's/aws_secret_access_key/AWS_SECRET_ACCESS_KEY/g' \
46 |             -e 's/region/AWS_DEFAULT_REGION/g' \
47 |             "${ENV_DIR}/credentials" "${ENV_DIR}/config" > "${TMP_FILE}"
48 | 
49 |         RUN_FLAGS="${RUN_FLAGS} --env-file ${TMP_FILE}"
50 |         CLD_PROVIDER=1
51 |         ;;
52 | 
53 |     -g|--gcp)
54 |         CLD_PROVIDER=2
55 |         ;;
56 | 
57 |     *)
58 |         echo "Invalid usage"
59 |         exit 1
60 |         ;;
61 | esac
62 | done
63 | 
64 | if [[ -n $(docker ps -aq -f "name=${CONTAINER_NAME}") ]]; then # -a: a stopped container would also break `docker run --name`
65 |     echo "Container with ${CONTAINER_NAME} name already exists/running"
66 |     exit 1
67 | fi
68 | 
69 | function cleanup {
70 |     rm -f "${AIS_TAR2TF_PATH}/.dockerignore"
71 | }
72 | trap cleanup EXIT INT TERM
73 | 
74 | set -e # don't allow errors in build and volume creation
75 | echo ".git" > "${AIS_TAR2TF_PATH}/.dockerignore"
76 | docker volume create "${CONTAINER_NAME}" # mount filesystem for docker so AIS can see that
77 | docker build -t "${TAG_NAME}" -f "${S_PATH}/Dockerfile" "${AIS_TAR2TF_PATH}" \
78 |     --build-arg "cld_provider=${CLD_PROVIDER}"
79 | cleanup
80 | set +e # now we can allow fails
81 | 
82 | # RUN_FLAGS and MOUNT_FLAG hold several words each and are left unquoted on purpose.
83 | docker run -it ${RUN_FLAGS} \
84 |     ${MOUNT_FLAG} \
85 |     --ulimit nofile=100000:100000 \
86 |     --name="${CONTAINER_NAME}" \
87 |     --privileged \
88 |     -p 8888:8888 \
89 |     "${TAG_NAME}"
90 | 
91 | # Removing container and volume
92 | docker rm -f "${CONTAINER_NAME}" > /dev/null 2>&1
93 | docker volume rm "${CONTAINER_NAME}" > /dev/null 2>&1
94 | 


--------------------------------------------------------------------------------
/examples/imagenet_from_disk.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow import keras
 3 | from tensorflow.keras import layers
 4 | 
 5 | from aistore.tf import Dataset, default_record_parser
 6 | from aistore.tf.ops import Select, Decode, Convert, Resize
 7 | 
 8 | 
 9 | def path_generator():
10 |     i = 1
11 |     while True:
12 |         yield "train.record-{}".format(i)
13 |         i += 1
14 | 
15 | 
16 | EPOCHS = 10
17 | BATCH_SIZE = 20
18 | 
19 | # ADJUST Dataset PARAMETERS BELOW
20 | 
21 | BUCKET_NAME = "tar-bucket"
22 | PROXY_URL = "http://localhost:8080"
23 | 
24 | # Create Dataset.
25 | # The first list holds the conversions applied to every tar-record, in order: the bytes under "jpg" are decoded as an image,
26 | # converted to tf.float32 type and then resized to (224, 224).
27 | # The second list holds the selections: Select("jpg") picks the converted image as the value, Select("cls") the bytes under "cls" as the label.
28 | dataset = Dataset(BUCKET_NAME, PROXY_URL, [Decode("jpg"), Convert("jpg", tf.float32), Resize("jpg", (224, 224))], [Select("jpg"), Select("cls")])
29 | 
30 | # prepare your bucket, for example from `gsutil ls gs://lpr-gtc2020`
31 | # save multiple TFRecord files with max size 2MB to paths generated by path_generator
32 | train_records_files = dataset.load("train-{0..3}.tar", path=path_generator, max_shard_size="2MB", num_workers=4)
33 | # save TFRecord file to test.record path
34 | dataset.load("train-{4..7}.tar", path="test.record", num_workers=4)
35 | 
36 | train_dataset = tf.data.TFRecordDataset(filenames=train_records_files)
37 | train_dataset = train_dataset.map(default_record_parser)
38 | train_dataset = train_dataset.shuffle(buffer_size=1024).batch(BATCH_SIZE)
39 | 
40 | test_dataset = tf.data.TFRecordDataset(filenames=["test.record"])
41 | test_dataset = test_dataset.map(default_record_parser).batch(BATCH_SIZE)
42 | 
43 | # TRAINING PART BELOW
44 | 
45 | inputs = keras.Input(shape=(224, 224, 3), name="images")
46 | x = layers.Flatten()(inputs)
47 | x = layers.Dense(64, activation="relu", name="dense_1")(x)
48 | x = layers.Dense(64, activation="relu", name="dense_2")(x)
49 | outputs = layers.Dense(10, name="predictions")(x)
50 | model = keras.Model(inputs=inputs, outputs=outputs)
51 | 
52 | model.compile(optimizer=keras.optimizers.Adam(1e-4), loss=keras.losses.mean_squared_error, metrics=["acc"])
53 | 
54 | model.summary()
55 | 
56 | model.fit(train_dataset, epochs=EPOCHS)
57 | result = model.evaluate(test_dataset)
58 | print(dict(zip(model.metrics_names, result)))
59 | dataset.stop()
60 | 


--------------------------------------------------------------------------------
/examples/imagenet_in_memory.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from tensorflow import keras
 3 | from tensorflow.keras import layers
 4 | 
 5 | from aistore.tf import Dataset
 6 | from aistore.tf.ops import Decode, Convert, Resize
 7 | 
 8 | EPOCHS = 5
 9 | BATCH_SIZE = 20
10 | 
11 | # ADJUST Dataset PARAMETERS BELOW
12 | 
13 | BUCKET_NAME = "tar-bucket"
14 | PROXY_URL = "http://localhost:8080"
15 | 
16 | # Create Dataset.
17 | # Values will be extracted from tar-records according to Resize(Convert(Decode("jpg"), tf.float32), (224, 224)) operation,
18 | # meaning that bytes under "jpg" in tar-record will be decoded as an image, converted to tf.float32 type and then Resized to (224, 224)
19 | # Labels will be extracted from tar-records according to Select("cls") operation, meaning that bytes under "cls" will be treated as label.
20 | conversions = [Decode("jpg"), Convert("jpg", tf.float32), Resize("jpg", (224, 224))]
21 | selections = ["jpg", "cls"]
22 | dataset = Dataset(BUCKET_NAME, PROXY_URL, conversions, selections)
23 | 
24 | # prepare your bucket first with tars (for instance gsutil ls gs://lpr-gtc2020)
25 | train_dataset = dataset.load("train-{0..5}.tar", remote_exec=False,
26 |                          num_workers=4).prefetch(EPOCHS * BATCH_SIZE).shuffle(buffer_size=1024).batch(BATCH_SIZE)
27 | 
28 | test_dataset = dataset.load("train-{5..10}.tar", remote_exec=False, num_workers=4).prefetch(BATCH_SIZE).batch(BATCH_SIZE)
29 | 
30 | # TRAINING PART BELOW
31 | inputs = keras.Input(shape=(224, 224, 3), name="images")
32 | x = layers.Flatten()(inputs)
33 | x = layers.Dense(64, activation="relu", name="dense_1")(x)
34 | x = layers.Dense(64, activation="relu", name="dense_2")(x)
35 | outputs = layers.Dense(10, name="predictions")(x)
36 | model = keras.Model(inputs=inputs, outputs=outputs)
37 | 
38 | model.compile(optimizer=keras.optimizers.Adam(1e-4), loss=keras.losses.mean_squared_error, metrics=["acc"])
39 | model.summary()
40 | 
41 | model.fit(train_dataset, epochs=EPOCHS)
42 | result = model.evaluate(test_dataset)
43 | print(dict(zip(model.metrics_names, result)))
44 | 


--------------------------------------------------------------------------------
/runtime/Makefile:
--------------------------------------------------------------------------------
1 | # Every sub-directory (matched as "<dir>/.") is a target; building it runs `make` inside that directory.
2 | SUBDIRS := $(wildcard */.)
3 | 
4 | all: $(SUBDIRS)
5 | $(SUBDIRS):
6 | 	$(MAKE) -C $@
7 | 
8 | .PHONY: all $(SUBDIRS)
9 | 


--------------------------------------------------------------------------------
/runtime/README.md:
--------------------------------------------------------------------------------
 1 | This directory contains the Dockerfiles for the different runtimes used by ETL builds.
 2 | Each runtime package has a `Makefile` that should be used to build and push its images.
 3 | 
 4 | Current list of runtimes:
 5 | * Python:
 6 |   * `runtime_python:3.9v2` -> `python3.9` is used.
 7 |   * `runtime_python:3.10v2` -> `python3.10` is used.
 8 |   * `runtime_python:3.11v2` -> `python3.11` is used.
 9 |   * `runtime_python:3.12v2` -> `python3.12` is used.
10 |   * `runtime_python:3.13v2` -> `python3.13` is used.
11 | 


--------------------------------------------------------------------------------
/runtime/python/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG PYTHON_VERSION
 2 | 
 3 | FROM docker.io/library/python:${PYTHON_VERSION}-alpine
 4 | 
 5 | # The requirement is quoted: unquoted, the shell would treat ">=1.14.0" as an
 6 | # output redirection and the version constraint would be lost.
 7 | RUN pip3 install --upgrade "aistore[etl]>=1.14.0"
 8 | 
 9 | # Set working directory
10 | RUN mkdir /code
11 | WORKDIR /code
12 | 
13 | # Copy app code
14 | COPY bootstrap.py server.py ./
15 | 
16 | # Environment setup
17 | ENV PYTHONUNBUFFERED=1
18 | 
19 | # Expose the default port
20 | EXPOSE 8000
21 | 


--------------------------------------------------------------------------------
/runtime/python/Makefile:
--------------------------------------------------------------------------------
1 | # Runs builder.sh, passing REGISTRY_URL and RUNTIME_TAG_MODIFIER through its environment.
2 | .PHONY: all
3 | all:
4 | 	@REGISTRY_URL=$(REGISTRY_URL) RUNTIME_TAG_MODIFIER=$(RUNTIME_TAG_MODIFIER) bash builder.sh
5 | 


--------------------------------------------------------------------------------
/runtime/python/builder.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Builds and pushes one `runtime_python` image per supported Python version.
 4 | # Environment:
 5 | #   REGISTRY_URL          registry/namespace prefix of the image name (required)
 6 | #   RUNTIME_TAG_MODIFIER  optional suffix appended to every tag
 7 | 
 8 | set -e
 9 | 
10 | # Fail early with a clear message instead of letting docker reject a tag that starts with "/".
11 | : "${REGISTRY_URL:?REGISTRY_URL must be set}"
12 | 
13 | # Defines mapping: "runtime name" -> "python version".
14 | declare -A python_versions=(
15 |   [3.9]="3.9"
16 |   [3.10]="3.10"
17 |   [3.11]="3.11"
18 |   [3.12]="3.12"
19 |   [3.13]="3.13"
20 | )
21 | 
22 | for runtime_name in "${!python_versions[@]}"; do
23 |   image="${REGISTRY_URL}/runtime_python:${runtime_name}${RUNTIME_TAG_MODIFIER}"
24 |   python_version="${python_versions[${runtime_name}]}"
25 |   echo "BUILDING AND PUSHING ${image}"
26 |   echo "PYTHON_VERSION=${python_version}"
27 |   docker build --pull --no-cache \
28 |     -t "${image}" \
29 |     --build-arg PYTHON_VERSION="${python_version}" \
30 |     .
31 |   docker push "${image}"
32 | done
33 | 


--------------------------------------------------------------------------------
/runtime/python/io-comm/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: build
2 | build:
3 | 	go build -o server
4 | 


--------------------------------------------------------------------------------
/runtime/python/io-comm/cmn.go:
--------------------------------------------------------------------------------
 1 | // Package main is an entry point to ioComm server
 2 | /*
 3 |  * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 4 |  */
 5 | 
 6 | package main
 7 | 
 8 | import (
 9 | 	"errors"
10 | 	"fmt"
11 | 	"io/ioutil"
12 | 	"log"
13 | 	"net/http"
14 | 	"runtime/debug"
15 | 	"strconv"
16 | )
17 | 
18 | const (
19 | 	headerContentLength = "Content-Length"
20 | 	headerContentType   = "Content-Type"
21 | 
22 | 	getContentType = "binary/octet-stream"
23 | )
24 | 
25 | func invalidMsgHandler(w http.ResponseWriter, errCode int, format string, a ...interface{}) {
26 | 	logErrorf(format, a...) // log first; the same formatted message is then sent to the client
27 | 	w.Header().Set("Content-type", "text/plain")
28 | 	w.WriteHeader(errCode)
29 | 	fmt.Fprintf(w, format, a...) // same bytes as Sprintf followed by Write
30 | }
31 | 
32 | func setResponseHeaders(header http.Header, size int64) {
33 | 	header.Set(headerContentType, getContentType) // fixed content type for GET responses
34 | 	header.Set(headerContentLength, strconv.FormatInt(size, 10))
35 | }
36 | 
37 | // wrapHttpError converts an unsuccessful HTTP exchange into an error.
38 | // A transport error is returned unchanged; a response whose status code is
39 | // greater than 200 yields an error made of the status line and the body text.
40 | func wrapHttpError(resp *http.Response, err error) (*http.Response, error) {
41 | 	if err != nil || resp.StatusCode <= http.StatusOK {
42 | 		return resp, err // transport failure, or nothing to report (err is nil)
43 | 	}
44 | 
45 | 	if resp.Body == nil {
46 | 		return resp, errors.New(resp.Status)
47 | 	}
48 | 
49 | 	payload, readErr := ioutil.ReadAll(resp.Body)
50 | 	if readErr != nil {
51 | 		return resp, readErr
52 | 	}
53 | 
54 | 	return resp, fmt.Errorf("%s %s", resp.Status, string(payload))
55 | }
56 | 
57 | func logErrorf(format string, a ...interface{}) {
58 | 	log.Printf("%s : %s", debug.Stack(), fmt.Sprintf(format, a...)) // the stack trace is data: a '%' in it must not be parsed as a verb
59 | }
60 | 


--------------------------------------------------------------------------------
/runtime/python/io-comm/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/NVIDIA/ais-etl/runtime/python/io-comm
2 | 
3 | go 1.18
4 | 


--------------------------------------------------------------------------------
/runtime/python/io-comm/main.go:
--------------------------------------------------------------------------------
  1 | // Package main is an entry point to ioComm server
  2 | /*
  3 |  * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
  4 |  */
  5 | package main
  6 | 
  7 | import (
  8 | 	"flag"
  9 | 	"fmt"
 10 | 	"io"
 11 | 	"log"
 12 | 	"net/http"
 13 | 	"os"
 14 | 	"os/exec"
 15 | 	"strings"
 16 | )
 17 | 
 18 | var (
 19 | 	aisTargetURL string
 20 | 	endpoint     string
 21 | 
 22 | 	client *http.Client
 23 | )
 24 | 
 25 | func initVars(ipAddress string, port int) {
 26 | 	client = new(http.Client) // zero-value client, same as &http.Client{}
 27 | 	aisTargetURL = os.Getenv("AIS_TARGET_URL")
 28 | 	endpoint = fmt.Sprintf("%s:%d", ipAddress, port) // listen address in "host:port" form
 29 | }
 30 | 
 31 | func main() {
 32 | 	var (
 33 | 		ipAddressArg = flag.String("l", "0.0.0.0", "Specify the IP address on which the server listens")
 34 | 		portArg      = flag.Int("p", 80, "Specify the port on which the server listens")
 35 | 	)
 36 | 
 37 | 	flag.Parse()
 38 | 
 39 | 	initVars(*ipAddressArg, *portArg)
 40 | 
 41 | 	http.HandleFunc("/", ioHandler)
 42 | 	http.HandleFunc("/health", healthHandler)
 43 | 
 44 | 	log.Printf("Starting io comm server at %s", endpoint)
 45 | 	log.Fatal(http.ListenAndServe(endpoint, nil))
 46 | }
 47 | 
 48 | // healthHandler answers GET requests with 200 and the body "Running";
 49 | // every other method is rejected with 400.
 50 | func healthHandler(w http.ResponseWriter, r *http.Request) {
 51 | 	if r.Method != http.MethodGet {
 52 | 		invalidMsgHandler(w, http.StatusBadRequest, "invalid http method %s", r.Method)
 53 | 		return
 54 | 	}
 55 | 	w.WriteHeader(http.StatusOK)
 56 | 	w.Write([]byte("Running"))
 57 | }
 58 | 
 59 | // ioHandler routes PUT and GET to their handlers; any other method gets a 400.
 60 | func ioHandler(w http.ResponseWriter, r *http.Request) {
 61 | 	if r.Method == http.MethodPut {
 62 | 		ioPutHandler(w, r)
 63 | 	} else if r.Method == http.MethodGet {
 64 | 		ioGetHandler(w, r)
 65 | 	} else {
 66 | 		invalidMsgHandler(w, http.StatusBadRequest, "invalid http method %s", r.Method)
 67 | 	}
 68 | }
 69 | 
 70 | // PUT /?command=<prog>&command=<arg>...: runs the command with the request
 71 | // body as its stdin and streams its stdout back as the response body.
 72 | func ioPutHandler(w http.ResponseWriter, r *http.Request) {
 73 | 	command, ok := r.URL.Query()["command"]
 74 | 	if !ok || len(command) == 0 || command[0] == "" {
 75 | 		invalidMsgHandler(w, http.StatusBadRequest, "missing command to execute")
 76 | 		return
 77 | 	}
 78 | 
 79 | 	// TODO: validate command to execute (Security!) - it is taken verbatim from the URL.
 80 | 	cmd := exec.Command(command[0], command[1:]...)
 81 | 	stdin, err := cmd.StdinPipe()
 82 | 	if err != nil {
 83 | 		invalidMsgHandler(w, http.StatusInternalServerError, "failed to open stdin pipe, err: %v", err)
 84 | 		return
 85 | 	}
 86 | 	go func() {
 87 | 		io.Copy(stdin, r.Body)
 88 | 		stdin.Close() // signals EOF to the command
 89 | 	}()
 90 | 
 91 | 	// The content type belongs on the response, not on the incoming request.
 92 | 	w.Header().Set("Content-Type", "application/octet-stream")
 93 | 	// With a non-file Stdout, Run returns only after all output has been
 94 | 	// copied to w, so nothing writes to w once this handler has returned.
 95 | 	cmd.Stdout = w
 96 | 	cmd.Stderr = os.Stderr
 97 | 	if err = cmd.Run(); err != nil {
 98 | 		logErrorf("failed to exec command, err: %v", err)
 99 | 	}
100 | }
101 | 
102 | // GET /: rejects the request with 400 when AIS_TARGET_URL is not configured.
103 | func ioGetHandler(w http.ResponseWriter, r *http.Request) {
104 | 	if aisTargetURL == "" {
105 | 		invalidMsgHandler(w, http.StatusBadRequest, "missing env variable AIS_TARGET_URL")
106 | 		return
107 | 	}
108 | 	// NOTE(review): nothing is written below on either path, so the response stays empty - looks like a stub, confirm.
109 | 	path := strings.TrimPrefix(r.URL.EscapedPath(), "/")
110 | 	if path == "health" {
111 | 		return
112 | 	}
113 | }
114 | 


--------------------------------------------------------------------------------
/runtime/python/server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Entry point for launching a deserialized ETL server instance.
 3 | This module reads a base64-encoded ETL class definition from the
 4 | ETL_CLASS_PAYLOAD environment variable, deserializes it into a subclass
 5 | of `ETLServer`, and instantiates it.
 6 | 
 7 | This file is intended to be used by uvicorn/gunicorn like:
 8 |     uvicorn server:server.app --workers=4 ...
 9 | 
10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
11 | """
12 | 
13 | import os
14 | from typing import Type
15 | import logging
16 | 
17 | from aistore.sdk.etl.webserver.base_etl_server import ETLServer
18 | from aistore.sdk.etl.webserver.utils import deserialize_class
19 | 
20 | logging.basicConfig(
21 |     level=logging.INFO,
22 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
23 | )
24 | # ------------------------------------------------------------------------------
25 | # Load and validate payload (importing this module fails when it is not set)
26 | # ------------------------------------------------------------------------------
27 | ETL_CLASS_PAYLOAD: str = os.getenv("ETL_CLASS_PAYLOAD", "")
28 | if not ETL_CLASS_PAYLOAD:
29 |     raise RuntimeError("ETL_CLASS_PAYLOAD environment variable is not set")
30 | 
31 | # ------------------------------------------------------------------------------
32 | # Deserialize the ETL class and instantiate the server (`server.app` is what uvicorn loads)
33 | # ------------------------------------------------------------------------------
34 | try:
35 |     ETLClass: Type[ETLServer] = deserialize_class(ETL_CLASS_PAYLOAD)
36 | except Exception as e:
37 |     raise RuntimeError(f"Failed to deserialize ETL class: {e}") from e
38 | server = ETLClass()
39 | 


--------------------------------------------------------------------------------
/transformers/FFmpeg/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.13-alpine
 2 | 
 3 | # Install ffmpeg
 4 | RUN apk add --no-cache ffmpeg
 5 | 
 6 | # The requirement is quoted: unquoted, the shell would treat ">=1.13.6" as an
 7 | # output redirection and the version constraint would be lost.
 8 | RUN pip3 install --upgrade "aistore[etl]>=1.13.6"
 9 | 
10 | # Set working directory
11 | RUN mkdir /code
12 | WORKDIR /code
13 | 
14 | # Copy app code
15 | COPY flask_server.py fastapi_server.py http_server.py ./
16 | 
17 | # Environment setup
18 | ENV PYTHONUNBUFFERED=1
19 | 
20 | # Expose the default port
21 | EXPOSE 8000
22 | 


--------------------------------------------------------------------------------
/transformers/FFmpeg/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'; GIT_TEST=true switches it to 'test'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | # None of the targets below produces a file of the same name.
10 | .PHONY: all build push
11 | 
12 | all: build push
13 | 
14 | build:
15 | 	docker build -t $(REGISTRY_URL)/transformer_ffmpeg:$(TAG) .
16 | 
17 | push:
18 | 	docker push $(REGISTRY_URL)/transformer_ffmpeg:$(TAG)
19 | 


--------------------------------------------------------------------------------
/transformers/FFmpeg/fastapi_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | FFmpeg ETL Transformer (Fast-API)
 3 | 
 4 | This module implements an ETL transformer as a FastAPI-based server
 5 | that transforms audio files into WAV format with control over
 6 | Audio Channels (`AC`) and Audio Rate (`AR`) with the help of the FFmpeg utility.
 7 | 
 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 9 | """
10 | 
11 | import os
12 | import subprocess
13 | 
14 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer
15 | 
16 | 
17 | class FFmpegServer(FastAPIServer):
18 |     """
19 |     FastAPI-based server for FFmpeg-based ETL transformation.
20 |     """
21 | 
22 |     def __init__(self, host: str = "0.0.0.0", port: int = 8000):
23 |         super().__init__(host=host, port=port)
24 |         # configure from environment or defaults
25 |         self.channels = os.getenv("AC", "1")
26 |         self.samplerate = os.getenv("AR", "44100")
27 |         # base ffmpeg command, reading from stdin, writing WAV to stdout
28 |         self.ffmpeg_cmd = [
29 |             "ffmpeg",
30 |             "-nostdin",
31 |             "-loglevel",
32 |             "error",
33 |             "-i",
34 |             "pipe:0",
35 |             "-ac",
36 |             self.channels,
37 |             "-ar",
38 |             self.samplerate,
39 |             "-c:a",
40 |             "pcm_s16le",
41 |             "-f",
42 |             "wav",
43 |             "pipe:1",
44 |         ]
45 |         self.audio_exts = {".wav", ".flac", ".mp3", ".m4a", ".opus", ".ogg"}
46 | 
47 |     def transform(self, data: bytes, path: str, _etl_args: str) -> bytes:
48 |         """
49 |         Run FFmpeg to convert raw audio into WAV format.
50 |         Raises an RuntimeError on FFmpeg failure.
51 |         """
52 |         ext = os.path.splitext(path)[1].lower()
53 |         # If it doesn’t look like audio, just pass it back without processing it
54 |         if ext not in self.audio_exts:
55 |             return data
56 | 
57 |         with subprocess.Popen(
58 |             self.ffmpeg_cmd,
59 |             stdin=subprocess.PIPE,
60 |             stdout=subprocess.PIPE,
61 |             stderr=subprocess.PIPE,
62 |         ) as proc:
63 |             out, err = proc.communicate(input=data)
64 |             if proc.returncode != 0:
65 |                 msg = err.decode("utf-8", errors="ignore").strip()
66 |                 self.logger.error("FFmpeg error: %s", msg)
67 |                 raise RuntimeError(f"FFmpeg process failed: {msg}")
68 |             return out
69 | 
70 |     def get_mime_type(self) -> str:
71 |         """
72 |         Return the MIME type for the transformed data.
73 |         """
74 |         return "audio/wav"
75 | 
76 | 
77 | # Module-level instance: uvicorn serves it as `fastapi_server:fastapi_app` (see the `command` in pod.yaml)
78 | fastapi_server = FFmpegServer(port=8000)
79 | fastapi_server.logger.setLevel("DEBUG")
80 | fastapi_app = fastapi_server.app  # Expose the FastAPI app
81 | 


--------------------------------------------------------------------------------
/transformers/FFmpeg/flask_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | FFmpeg ETL Transformer (Flask)
 3 | 
 4 | This module implements an ETL transformer as a Flask-based server
 5 | that transforms audio files into WAV format with control over
 6 | Audio Channels (`AC`) and Audio Rate (`AR`) with the help of the FFmpeg utility.
 7 | 
 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 9 | """
10 | 
11 | import os
12 | import subprocess
13 | 
14 | from aistore.sdk.etl.webserver.flask_server import FlaskServer
15 | 
16 | 
17 | class FFmpegServer(FlaskServer):
18 |     """
19 |     Flask-based server for FFmpeg-based ETL transformation.
20 |     """
21 | 
22 |     def __init__(self, host: str = "0.0.0.0", port: int = 8000):
23 |         super().__init__(host=host, port=port)
24 |         # configure from environment or defaults
25 |         self.channels = os.getenv("AC", "1")
26 |         self.samplerate = os.getenv("AR", "44100")
27 |         # base ffmpeg command, reading from stdin, writing WAV to stdout
28 |         self.ffmpeg_cmd = [
29 |             "ffmpeg",
30 |             "-nostdin",
31 |             "-loglevel",
32 |             "error",
33 |             "-i",
34 |             "pipe:0",
35 |             "-ac",
36 |             self.channels,
37 |             "-ar",
38 |             self.samplerate,
39 |             "-c:a",
40 |             "pcm_s16le",
41 |             "-f",
42 |             "wav",
43 |             "pipe:1",
44 |         ]
45 |         self.audio_exts = {".wav", ".flac", ".mp3", ".m4a", ".opus", ".ogg"}
46 | 
47 |     def transform(self, data: bytes, path: str, _etl_args: str) -> bytes:
48 |         """
49 |         Run FFmpeg to convert raw audio into WAV format.
50 |         Raises an error on FFmpeg failure.
51 |         """
52 |         ext = os.path.splitext(path)[1].lower()
53 |         # If it doesn’t look like audio, just pass it back without processing it
54 |         if ext not in self.audio_exts:
55 |             return data
56 | 
57 |         with subprocess.Popen(
58 |             self.ffmpeg_cmd,
59 |             stdin=subprocess.PIPE,
60 |             stdout=subprocess.PIPE,
61 |             stderr=subprocess.PIPE,
62 |         ) as proc:
63 |             out, err = proc.communicate(input=data)
64 |             if proc.returncode != 0:
65 |                 msg = err.decode("utf-8", errors="ignore").strip()
66 |                 self.logger.error("FFmpeg error: %s", msg)
67 |                 raise RuntimeError(f"FFmpeg process failed: {msg}")
68 |             return out
69 | 
70 |     def get_mime_type(self) -> str:
71 |         """
72 |         Return the MIME type for the transformed data.
73 |         """
74 |         return "audio/wav"
75 | 
76 | 
77 | flask_server = FFmpegServer(port=8000)  # module-level instance, created on import
78 | flask_server.logger.setLevel("DEBUG")
79 | flask_app = flask_server.app  # app object referenced as "flask_server:flask_app" by the gunicorn command in pod.yaml
80 | 


--------------------------------------------------------------------------------
/transformers/FFmpeg/http_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | FFmpeg ETL Transformer (HTTP-based Server)
 3 | 
 4 | This module implements an ETL transformer as a multi-threaded HTTP server
 5 | that transforms audio files into WAV format with control over
 6 | Audio Channels (`AC`) and Audio Rate (`AR`) with the help of the FFmpeg utility.
 7 | 
 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 9 | """
10 | 
11 | import os
12 | import subprocess
13 | 
14 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer
15 | 
16 | 
17 | class FFmpegServer(HTTPMultiThreadedServer):
18 |     """
19 |     Multi-threaded HTTP server for FFmpeg-based ETL transformation.
20 |     """
21 | 
22 |     def __init__(self, host: str = "0.0.0.0", port: int = 8000):
23 |         super().__init__(host=host, port=port)
24 |         # configure from environment or defaults
25 |         self.channels = os.getenv("AC", "2")
26 |         self.samplerate = os.getenv("AR", "44100")
27 |         # base ffmpeg command, reading from stdin, writing WAV to stdout
28 |         self.ffmpeg_cmd = [
29 |             "ffmpeg",
30 |             "-nostdin",
31 |             "-loglevel",
32 |             "error",
33 |             "-i",
34 |             "pipe:0",
35 |             "-ac",
36 |             str(self.channels),
37 |             "-ar",
38 |             str(self.samplerate),
39 |             "-c:a",
40 |             "pcm_s16le",
41 |             "-f",
42 |             "wav",
43 |             "pipe:1",
44 |         ]
45 |         self.audio_exts = {".wav", ".flac", ".mp3", ".m4a", ".opus", ".ogg"}
46 | 
47 |     def transform(self, data: bytes, path: str, _etl_args: str) -> bytes:
48 |         """
49 |         Run FFmpeg to convert raw audio into WAV format.
50 |         Raises an error on FFmpeg failure.
51 |         """
52 |         ext = os.path.splitext(path)[1].lower()
53 |         # If it doesn’t look like audio, just pass it back without processing it
54 |         if ext not in self.audio_exts:
55 |             return data
56 | 
57 |         with subprocess.Popen(
58 |             self.ffmpeg_cmd,
59 |             stdin=subprocess.PIPE,
60 |             stdout=subprocess.PIPE,
61 |             stderr=subprocess.PIPE,
62 |         ) as proc:
63 |             out, err = proc.communicate(input=data)
64 |             if proc.returncode != 0:
65 |                 msg = err.decode("utf-8", errors="ignore").strip()
66 |                 self.logger.error("FFmpeg error: %s", msg)
67 |                 raise RuntimeError(f"FFmpeg process failed: {msg}")
68 |             return out
69 | 
70 |     def get_mime_type(self) -> str:
71 |         """
72 |         Return the MIME type for the transformed data.
73 |         """
74 |         return "audio/wav"
75 | 
76 | 
77 | if __name__ == "__main__":  # run directly, e.g. `python http_server.py` as in the commented command in pod.yaml
78 |     server = FFmpegServer()
79 |     server.logger.setLevel("DEBUG")
80 |     server.start()
81 | 


--------------------------------------------------------------------------------
/transformers/FFmpeg/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-nemo-ffmpeg
 5 |   annotations:
 6 |     # Values it can take ["hpull://","hpush://"]
 7 |     communication_type: "hpull://"
 8 |     wait_timeout: 5m
 9 |     support_direct_put: "true"
10 | spec:
11 |   containers:
12 |     - name: server
13 |       image: aistorage/transformer_ffmpeg:latest # image name built and pushed by the Makefile in this directory
14 |       imagePullPolicy: Always
15 |       ports:
16 |         - name: default
17 |           containerPort: 8000
18 |       # for flask based app
19 |       # command: ["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"]
20 |       # for http based app
21 |       # command: ["python", "http_server.py"]
22 |       # for fastapi based app
23 |       command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"]
24 |       readinessProbe:
25 |         httpGet:
26 |           path: /health
27 |           port: default
28 |       env:
29 |         - name: AR
30 |           value: "16000"
31 |         - name: AC
32 |           value: "1"
33 |       # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 
34 |       # where the objects are stored on AIStore targets. This allows the ETL container 
35 |       # to access the files directly by absolute path.
36 |   #     volumeMounts:
37 |   #       - name: sda
38 |   #         mountPath: /ais/sda
39 |   #       - name: sdb
40 |   #         mountPath: /ais/sdb
41 |   #       ...
42 |   # volumes:
43 |   #   - name: sda
44 |   #     hostPath:
45 |   #       path: /ais/sda
46 |   #       type: Directory
47 |   #   - name: sdb
48 |   #     hostPath:
49 |   #       path: /ais/sdb
50 |   #       type: Directory
51 |   #   ...


--------------------------------------------------------------------------------
/transformers/Makefile:
--------------------------------------------------------------------------------
1 | # Installs the packages listed in tests/requirements.txt.
2 | .PHONY: common_deps
3 | common_deps:
4 | 	pip install -r tests/requirements.txt
5 | 


--------------------------------------------------------------------------------
/transformers/NeMo/audio_split_consolidate/README.md:
--------------------------------------------------------------------------------
 1 | # AIStore Audio Split & Consolidate Transformer
 2 | 
 3 | This transformer splits and consolidates audio files using a [JSONL](https://jsonlines.org/) manifest file as input. It extracts segments specified in the manifest, consolidates them, and returns the result as a tarball.
 4 | 
 5 | This transformer consists of two components:
 6 | 
 7 | 1. **Audio Manager** – Processes the manifest and dispatches splitting tasks.
 8 | 2. **Audio Splitter** – Splits individual audio files based on instructions from the Audio Manager.
 9 | 
10 | ---
11 | 
12 | ## Why two separate transformers?
13 | 
14 | Using separate transformers ensures scalability through distributed processing. A single transformer combining both roles would not scale efficiently, as audio files might not reside on the same node, causing performance issues due to unnecessary data movement between nodes. Separating the roles allows efficient distributed processing across the AIStore cluster.
15 | 
16 | ![Audio Split Consolidate Overview](audio_split_consolidate_diagram.png)
17 | 
18 | ---
19 | 
20 | ## Example Input Manifest
21 | 
22 | `manifest.jsonl`:
23 | ```json
24 | {"id": "youtube_vid_id_1", "part": 1, "from_time": 0.36, "to_time": 2.36}
25 | {"id": "youtube_vid_id_1", "part": 2, "from_time": 3.36, "to_time": 9.36}
26 | {"id": "youtube_vid_id_2", "part": 1, "from_time": 0.0, "to_time": 4.0}
27 | ```
28 | 
29 | Output:
30 | - A tarball (`manifest.tar`) containing:
31 |   - `youtube_vid_id_1_1`
32 |   - `youtube_vid_id_1_2`
33 |   - `youtube_vid_id_2_1`
34 | 
35 | Each file will contain audio trimmed to the specified duration.
36 | 
37 | ---
38 | 
39 | ## How to Get Started
40 | 
41 | ### Step 1: Prepare the Manifest
42 | 
43 | Create a JSON Lines (`.jsonl`) file where each line contains:
44 | - `id`: Identifier of the audio file.
45 | - `part`: Part number.
46 | - `from_time` and `to_time`: Start and end times of the segment to extract.
47 | 
48 | ---
49 | 
50 | ## Deploy ETLs
51 | 
52 | ### Audio Splitter ETL
53 | 
54 | Review and edit the configuration ([`audio_splitter/pod.yaml`](audio_splitter/pod.yaml)) as needed.
55 | 
56 | ```bash
57 | ais etl init spec --from-file audio_splitter/pod.yaml --comm-type hpush --name audio-splitter
58 | ```
59 | 
60 | ### Audio Manager ETL
61 | 
62 | Review and edit the configuration ([`audio_manager/pod.yaml`](audio_manager/pod.yaml)), ensuring settings match your environment.
63 | 
64 | ```bash
65 | ais etl init spec --from-file audio_manager/pod.yaml --comm-type hpush --name audio-manager
66 | ```
67 | 
68 | Ensure the manifest file is accessible by the Audio Manager.
69 | 
70 | ---
71 | 
72 | ## Run Transformations
73 | 
74 | ### Single Manifest File
75 | 
76 | ```bash
77 | ais etl object audio-manager ais://manifests/manifest.jsonl manifest.tar
78 | ```
79 | 
80 | ### Batch Operation (Multiple Manifest Files - Bucket Transform)
81 | 
82 | ```bash
83 | ais etl bucket audio-manager ais://bench_manifests ais://output_bucket --ext "{jsonl:tar}"
84 | ```
85 | 
86 | This will process each `.jsonl` file in the source bucket and output consolidated audio tarballs (`.tar`) into the specified output bucket.
87 | 
88 | 
89 | ## Performance  
90 | 
91 | Our [benchmark](../../benchmarks/audio_split_consolidate.py) demonstrates that using our ETL can accelerate data processing by **up to 13x** compared to single-threaded local execution. Performance scales **linearly** with the number of targets and disks in the AIStore cluster.


--------------------------------------------------------------------------------
/transformers/NeMo/audio_split_consolidate/audio_manager/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.13-alpine
 2 | # Quote the requirement: unquoted, the shell treats `>=1.13.6` as an output redirect and the version floor is dropped.
 3 | RUN pip3 install --upgrade "aistore[etl]>=1.13.6"
 4 | 
 5 | # Set working directory
 6 | RUN mkdir /code
 7 | WORKDIR /code
 8 | 
 9 | # Copy app code
10 | COPY fastapi_server.py ./
11 | 
12 | # Environment setup
13 | ENV PYTHONUNBUFFERED=1
14 | 
15 | # Expose the default port
16 | EXPOSE 8000
17 | 


--------------------------------------------------------------------------------
/transformers/NeMo/audio_split_consolidate/audio_manager/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | 
 4 | REGISTRY_URL ?= docker.io/aistorage
 5 | 
 6 | ifeq ($(GIT_TEST), true)
 7 | 	TAG := test
 8 | endif
 9 | 
10 | all: build push
11 | 
12 | build:
13 | 	docker build -t $(REGISTRY_URL)/transformer_audio_manager:$(TAG) .
14 | 
15 | push:
16 | 	docker push $(REGISTRY_URL)/transformer_audio_manager:$(TAG)
17 | 


--------------------------------------------------------------------------------
/transformers/NeMo/audio_split_consolidate/audio_manager/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-audio-manager
 5 |   annotations:
 6 |     communication_type: "hpull://"
 7 |     wait_timeout: 10m
 8 | spec:
 9 |   containers:
10 |     - name: server
11 |       image: aistorage/transformer_audio_manager:latest
12 |       imagePullPolicy: Always
13 |       ports:
14 |         - name: default
15 |           containerPort: 8000
16 |       command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"]
17 |       readinessProbe:
18 |         httpGet:
19 |           path: /health
20 |           port: default
21 |       env:
22 |         # AIS endpoint
23 |         - name: AIS_ENDPOINT
24 |           value: "http://<ais-proxy>:51080"
25 |         # Bucket name of the audio files
26 |         - name: SRC_BUCKET
27 |           value: "<bucket-name>"
28 |         # Provider of the audio files (ais, gcp, aws) 
29 |         - name: SRC_PROVIDER
30 |           value: "ais"
31 |         # Prefix of the audio files
32 |         - name: OBJ_PREFIX
33 |           value: ""
34 |         # Extension of the audio files
35 |         - name: OBJ_EXTENSION
36 |           value: "wav"
37 |         # ETL Name of the Audio Splitter ETL you previously initialised
38 |         - name: ETL_NAME
39 |           value: "<etl-name>"
40 |       # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 
41 |       # where the objects are stored on AIStore targets. This allows the ETL container 
42 |       # to access the files directly by absolute path.
43 |   #     volumeMounts:
44 |   #       - name: ais
45 |   #         mountPath: /tmp/
46 |   # volumes:
47 |   #   - name: ais
48 |   #     hostPath:
49 |   #       path: /tmp/
50 |   #       type: Directory
51 | 


--------------------------------------------------------------------------------
/transformers/NeMo/audio_split_consolidate/audio_split_consolidate_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/NeMo/audio_split_consolidate/audio_split_consolidate_diagram.png


--------------------------------------------------------------------------------
/transformers/NeMo/audio_split_consolidate/audio_splitter/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.13-alpine
 2 | 
 3 | # need this for soundfile
 4 | RUN apk add --no-cache libsndfile-dev
 5 | # Install dependencies (requirement quoted so the shell does not treat `>=1.13.6` as an output redirect)
 6 | RUN pip3 install --upgrade "aistore[etl]>=1.13.6" soundfile
 7 | 
 8 | # Set working directory
 9 | RUN mkdir /code
10 | WORKDIR /code
11 | 
12 | # Copy app code
13 | COPY fastapi_server.py ./
14 | 
15 | # Environment setup
16 | ENV PYTHONUNBUFFERED=1
17 | 
18 | # Expose the default port
19 | EXPOSE 8000
20 | 


--------------------------------------------------------------------------------
/transformers/NeMo/audio_split_consolidate/audio_splitter/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | 
 4 | ifeq ($(GIT_TEST), true)
 5 | 	TAG := test
 6 | endif
 7 | 
 8 | REGISTRY_URL ?= docker.io/aistorage
 9 | 
10 | all: build push
11 | 
12 | build:
13 | 	docker build -t $(REGISTRY_URL)/transformer_audio_splitter:$(TAG) .
14 | 
15 | push:
16 | 	docker push $(REGISTRY_URL)/transformer_audio_splitter:$(TAG)
17 | 


--------------------------------------------------------------------------------
/transformers/NeMo/audio_split_consolidate/audio_splitter/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-audio-splitter
 5 |   annotations:
 6 |     # Values it can take ["hpull://","hpush://"]
 7 |     communication_type: "hpull://"
 8 |     wait_timeout: 10m
 9 | spec:
10 |   containers:
11 |     - name: server
12 |       image: aistorage/transformer_audio_splitter:latest
13 |       imagePullPolicy: Always
14 |       ports:
15 |         - name: default
16 |           containerPort: 8000
17 |       command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"]
18 |       readinessProbe:
19 |         httpGet:
20 |           path: /health
21 |           port: default
22 |       # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 
23 |       # where the objects are stored on AIStore targets. This allows the ETL container 
24 |       # to access the files directly by absolute path.
25 |   #     volumeMounts:
26 |   #       - name: ais
27 |   #         mountPath: /tmp/
28 |   # volumes:
29 |   #   - name: ais
30 |   #     hostPath:
31 |   #       path: /tmp/
32 |   #       type: Directory
33 | 


--------------------------------------------------------------------------------
/transformers/batch_rename/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.13-alpine
 2 | # Quote the requirement: unquoted, the shell treats `>=1.13.6` as an output redirect and the version floor is dropped.
 3 | RUN pip3 install --upgrade "aistore[etl]>=1.13.6"
 4 | 
 5 | # Set working directory
 6 | RUN mkdir /code
 7 | WORKDIR /code
 8 | 
 9 | # Copy app code
10 | COPY fastapi_server.py ./
11 | 
12 | # Environment setup
13 | ENV PYTHONUNBUFFERED=1
14 | 
15 | # Expose the default port
16 | EXPOSE 8000
17 | 


--------------------------------------------------------------------------------
/transformers/batch_rename/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build --no-cache -t $(REGISTRY_URL)/transformer_batch_rename:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_batch_rename:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/batch_rename/README.md:
--------------------------------------------------------------------------------
 1 | # Batch Rename Transformer
 2 | 
 3 | The **Batch Rename Transformer** reads objects from a source bucket, and if their path matches a given regex pattern, it writes them to a destination bucket under a new name (`DST_PREFIX` followed by the object's base name; any directory part of the original path is dropped). This is useful in ETL pipelines where data normalization, path restructuring, or archival tagging is needed.
 4 | 
 5 | Even if an object does not match the pattern, the transformer still returns the original object bytes to the caller. This allows it to support both inline and offline transformation modes seamlessly.
 6 | 
 7 | It's basically a copy operation: matching objects are copied to the new path and the originals are left in place. Users are responsible for deleting the old objects.
 8 | 
 9 | The transformer supports both `hpull` and `hpush` communication mechanisms, enabling seamless integration into AIStore-based pipelines.
10 | 
11 | > For more information on ETL communication mechanisms, see [AIStore ETL Documentation](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms).
12 | 
13 | ---
14 | 
15 | ### Environment Variables
16 | 
17 | | Variable              | Description                                               | Required |
18 | | --------------------- | --------------------------------------------------------- | -------- |
19 | | `AIS_ENDPOINT`        | URL of the AIStore proxy (e.g., `http://ais-proxy:51080`) | ✅ Yes    |
20 | | `DST_BUCKET`          | Name of the destination bucket                            | ✅ Yes    |
21 | | `DST_BUCKET_PROVIDER` | Provider for the destination bucket (default: `ais`)      | No       |
22 | | `FILE_PATTERN`        | Regex pattern to match source object names                | ✅ Yes    |
23 | | `DST_PREFIX`          | Prefix to prepend to renamed object paths                 | ✅ Yes    |
24 | 
25 | ---
26 | 
27 | ### Initializing ETL with AIStore CLI
28 | 
29 | Follow these steps to initialize the batch rename transformer using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md):
30 | 
31 | ```bash
32 | $ cd transformers/batch_rename
33 | 
34 | # Set communication type: either 'hpull://' or 'hpush://'
35 | $ export COMMUNICATION_TYPE='hpull://'
36 | 
37 | # Initialize the ETL with a chosen name
38 | $ ais etl init spec --from-file pod.yaml --name <etl-name> --comm-type "$COMMUNICATION_TYPE"
39 | 
40 | # Inline transformation (single object)
41 | # If the object matches the pattern, it will be renamed and saved to the destination bucket.
42 | # The content will also be returned to the caller.
43 | $ ais etl object <etl-name> ais://<src-bucket>/<object-name> -
44 | 
45 | # (Optional) Discard content if not needed
46 | $ ais etl object <etl-name> ais://<src-bucket>/<object-name> /dev/null
47 | 
48 | # To run transformation offline (bucket-to-bucket)
49 | $ ais etl bucket <etl-name> ais://<src-bucket> ais://<dst-bucket>
50 | ```
51 | 


--------------------------------------------------------------------------------
/transformers/batch_rename/fastapi_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A FastAPI-based ETL server that renames objects based on a regex pattern
 3 | and stores them to a destination bucket with a new prefix.
 4 | 
 5 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 6 | """
 7 | 
 8 | import os
 9 | import re
10 | 
11 | from aistore import Client
12 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer
13 | 
14 | 
15 | class BatchRenameServer(FastAPIServer):
16 |     """
17 |     ETL server that renames input objects based on a pattern match.
18 | 
19 |     If the object path matches the regex pattern defined by FILE_PATTERN,
20 |     the object is renamed by applying DST_PREFIX and written to DST_BUCKET.
21 | 
22 |     Environment Variables:
23 |         FILE_PATTERN         - Regex pattern to match object paths (required)
24 |         DST_PREFIX          - Prefix to apply to renamed objects (required)
25 |         DST_BUCKET           - Destination bucket name (required)
26 |         DST_BUCKET_PROVIDER  - Storage provider for the destination bucket (default: "ais")
27 |         AIS_ENDPOINT         - AIStore endpoint URL (required)
28 |     """
29 | 
30 |     def __init__(self, host: str = "0.0.0.0", port: int = 8000):
31 |         super().__init__(host=host, port=port)
32 |         self.pattern = os.getenv("FILE_PATTERN") or self._fatal("FILE_PATTERN")
33 |         self.prefix = os.getenv("DST_PREFIX") or self._fatal("DST_PREFIX")
34 |         self.dst_bucket = os.getenv("DST_BUCKET") or self._fatal("DST_BUCKET")
35 |         self.ais_endpoint = os.getenv("AIS_ENDPOINT") or self._fatal("AIS_ENDPOINT")
36 |         self.dst_provider = os.getenv("DST_BUCKET_PROVIDER", "ais")
37 |         self.ais_client = Client(self.ais_endpoint, timeout=None)
38 | 
39 |     @staticmethod
40 |     def _fatal(var: str) -> None:
41 |         """Raise an error for missing required environment variables."""
42 |         raise ValueError(f"Environment variable '{var}' is required")
43 | 
44 |     def transform(self, data: bytes, path: str, *_):
45 |         """
46 |         Rename and redirect matching input object to a new path in the destination bucket.
47 | 
48 |         Args:
49 |             data (bytes): Object content.
50 |             path (str): Original object path.
51 | 
52 |         Returns:
53 |             bytes: The original object content (unmodified).
54 |         """
55 |         if re.search(self.pattern, path):
56 |             new_path = f"{self.prefix}{os.path.basename(path)}"
57 |             # TODO: Add directly to target option
58 |             self.ais_client.bucket(self.dst_bucket, provider=self.dst_provider).object(
59 |                 new_path
60 |             ).get_writer().put_content(data)
61 |         return data
62 | 
63 | 
64 | # Initialize the ETL server and expose the FastAPI application
65 | fastapi_server = BatchRenameServer()
66 | fastapi_server.logger.setLevel("DEBUG")
67 | fastapi_app = fastapi_server.app
68 | 


--------------------------------------------------------------------------------
/transformers/batch_rename/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-batch-rename
 5 |   annotations:
 6 |     # Values it can take ["hpull://", "hpush://"]
 7 |     communication_type: "hpull://"
 8 |     wait_timeout: 5m
 9 |     support_direct_put: "true"
10 | spec:
11 |   containers:
12 |     - name: server
13 |       image: aistorage/transformer_batch_rename:latest
14 |       imagePullPolicy: Always
15 |       ports:
16 |         - name: default
17 |           containerPort: 8000
18 |       # Adjust the num of workers based on the number of CPU cores available
19 |       command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"]
20 |       readinessProbe:
21 |         httpGet:
22 |           path: /health
23 |           port: default
24 |       env:
25 |         # Required: AIStore endpoint
26 |         - name: AIS_ENDPOINT
27 |           value: "http://<proxy-lb-ip>:51080"
28 | 
29 |         # Required: Destination bucket to write renamed objects
30 |         - name: DST_BUCKET
31 |           value: "<dst-bucket-name>"
32 | 
33 |         # Optional: Provider for the destination bucket (default: ais)
34 |         - name: DST_BUCKET_PROVIDER
35 |           value: "ais"
36 | 
37 |         # Required: Regex pattern to match files
38 |         - name: FILE_PATTERN
39 |         # all .flac files
40 |           value: '.*\.flac$'
41 | 
42 |         # Required: Prefix to apply to renamed files
43 |         - name: DST_PREFIX
44 |           value: "renamed/"
45 | 


--------------------------------------------------------------------------------
/transformers/compress/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.11-alpine
 2 | 
 3 | COPY requirements.txt requirements.txt
 4 | RUN pip3 install -r requirements.txt
 5 | 
 6 | RUN mkdir /code
 7 | WORKDIR /code
 8 | COPY server.py server.py
 9 | # Use the `ENV key=value` form; the space-separated form is legacy syntax.
10 | ENV PYTHONUNBUFFERED=1
11 | 
12 | EXPOSE 80
13 | 


--------------------------------------------------------------------------------
/transformers/compress/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_compress:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_compress:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/compress/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-compress
 5 |   annotations:
 6 |     # Values `communication_type` can take are ["hpull://", "hpush://"].
 7 |     # Visit https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms 
 8 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
 9 |     wait_timeout: 5m
10 | spec:
11 |   containers:
12 |     - name: server
13 |       image: aistorage/transformer_compress:latest
14 |       imagePullPolicy: IfNotPresent
15 |       ports:
16 |         - name: default
17 |           containerPort: 80
18 |       # For more information on additional arguments, please refer to
19 |       # https://github.com/NVIDIA/ais-etl/blob/main/transformers/compress/README.md
20 |       command: ['/code/server.py', '--listen', '0.0.0.0', '--port', '80']
21 |       env:
22 |         # COMPRESS_OPTIONS is a dictionary of COMPRESS parameters, which includes `mode` and `compression`.
23 |         # For more information, refer to https://github.com/NVIDIA/ais-etl/blob/main/transformers/compress/README.md.
24 |         - name: COMPRESS_OPTIONS
25 |           value: ${COMPRESS_OPTIONS:-"{}"}
26 |       readinessProbe:
27 |         httpGet:
28 |           path: /health
29 |           port: default
30 | 


--------------------------------------------------------------------------------
/transformers/compress/requirements.txt:
--------------------------------------------------------------------------------
1 | requests


--------------------------------------------------------------------------------
/transformers/compress/server.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | #
  4 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
  5 | #
  6 | 
  7 | import argparse
  8 | import bz2
  9 | import gzip
 10 | import json
 11 | import logging
 12 | import os
 13 | 
 14 | from http.server import HTTPServer, BaseHTTPRequestHandler
 15 | from socketserver import ThreadingMixIn
 16 | 
 17 | import requests
 18 | 
 19 | host_target = os.environ["AIS_TARGET_URL"]
 20 | compress_options = json.loads(os.environ["COMPRESS_OPTIONS"])
 21 | 
 22 | if "mode" not in compress_options:
 23 |     mode = "compress"
 24 | else:
 25 |     mode = compress_options["mode"]
 26 | 
 27 | if "compression" not in compress_options:
 28 |     compression = "gzip"
 29 | else:
 30 |     compression = compress_options["compression"]
 31 | 
 32 | 
 33 | class Handler(BaseHTTPRequestHandler):
 34 |     # Overriding log_request to not log successful requests
 35 |     def log_request(self, code="-", size="-"):
 36 |         pass
 37 | 
 38 |     # Set standard headers for responses
 39 |     def _set_headers(self):
 40 |         self.send_response(200)
 41 |         self.send_header("Content-Type", "application/octet-stream")
 42 |         self.end_headers()
 43 | 
 44 |     def process_data(self, data):
 45 |         if mode == "compress" and compression == "gzip":
 46 |             return gzip.compress(data)
 47 |         if mode == "compress" and compression == "bz2":
 48 |             return bz2.compress(data)
 49 |         if mode == "decompress" and compression == "gzip":
 50 |             return gzip.decompress(data)
 51 |         if mode == "decompress" and compression == "bz2":
 52 |             return bz2.decompress(data)
 53 |         raise ValueError(
 54 |             f"Unsupported data processing mode ({mode}) or compression algorithm ({compression})"
 55 |         )
 56 | 
 57 |     # PUT handler supports `hpush` operation
 58 |     def do_PUT(self):
 59 |         try:
 60 |             content_length = int(self.headers["Content-Length"])
 61 |             post_data = self.rfile.read(content_length)
 62 |             processed_data = self.process_data(post_data)
 63 |             self._set_headers()
 64 |             self.wfile.write(processed_data)
 65 |         except Exception as exception:
 66 |             logging.error("Error processing PUT request: %s", str(exception))
 67 |             self.send_response(500)
 68 |             self.end_headers()
 69 |             self.wfile.write(b"Data processing failed")
 70 | 
 71 |     # GET handler supports `hpull` operation
 72 |     def do_GET(self):
 73 |         try:
 74 |             if self.path == "/health":
 75 |                 self._set_headers()
 76 |                 self.wfile.write(b"Running")
 77 |                 return
 78 | 
 79 |             response = requests.get(host_target + self.path)
 80 |             processed_data = self.process_data(response.content)
 81 | 
 82 |             self._set_headers()
 83 |             self.wfile.write(processed_data)
 84 | 
 85 |         except Exception as exception:
 86 |             logging.error("Error processing GET request: %s", str(exception))
 87 |             self.send_response(500)
 88 |             self.end_headers()
 89 |             self.wfile.write(b"Data processing failed")
 90 | 
 91 | 
 92 | class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
 93 |     """HTTPServer combined with ThreadingMixIn, so each request is handled in a separate thread."""
 94 | 
 95 | 
 96 | def run(addr, port):  # start a threaded HTTP server on (addr, port) using Handler
 97 |     server = ThreadedHTTPServer((addr, port), Handler)
 98 |     print(f"Starting HTTP server on {addr}:{port}")
 99 |     server.serve_forever()  # blocks and keeps serving requests
100 | 
101 | 
102 | if __name__ == "__main__":
103 |     parser = argparse.ArgumentParser(description="Run a simple HTTP server")
104 |     parser.add_argument(
105 |         "-l",
106 |         "--listen",
107 |         help="Specify the IP address on which the server listens",
108 |     )
109 |     parser.add_argument(
110 |         "-p",
111 |         "--port",
112 |         type=int,
113 |         help="Specify the port on which the server listens",
114 |     )
115 |     args = parser.parse_args()
116 |     run(addr=args.listen, port=args.port)
117 | 


--------------------------------------------------------------------------------
/transformers/echo/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.13-alpine
 2 | # Quote the requirement: unquoted, the shell treats `>=1.13.6` as an output redirect and the version floor is dropped.
 3 | RUN pip3 install --upgrade "aistore[etl]>=1.13.6"
 4 | 
 5 | # Set working directory
 6 | RUN mkdir /code
 7 | WORKDIR /code
 8 | 
 9 | # Copy app code
10 | COPY flask_server.py fastapi_server.py http_server.py ./
11 | 
12 | # Environment setup
13 | ENV PYTHONUNBUFFERED=1
14 | 
15 | # Expose the default port
16 | EXPOSE 8000
17 | 


--------------------------------------------------------------------------------
/transformers/echo/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build --no-cache -t $(REGISTRY_URL)/transformer_echo:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_echo:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/echo/README.md:
--------------------------------------------------------------------------------
 1 | # Echo Transformer
 2 | 
 3 | A simple echo transformer that takes objects (bytes) and simply echoes or repeats those bytes back as output. It's a simple and straightforward way to demonstrate or test the functionality of your container pod. An echo transformer might be used for debugging, understanding how data flows through a system, or verifying that certain processes are functioning as expected.
 4 | 
 5 | The transformer supports both `hpull` and `hpush` communication mechanisms for seamless integration.
 6 | 
 7 | > **Note:** By default this transformer uses [`FastAPI`](https://fastapi.tiangolo.com/) as the framework with [Uvicorn](https://www.uvicorn.org/) as the web server. Equivalent Flask ([`Gunicorn`](https://gunicorn.org/)) and multithreaded HTTP server implementations are provided in [`flask_server.py`](flask_server.py) and [`http_server.py`](http_server.py); select one via the `command` in [`pod.yaml`](pod.yaml).
 8 | 
 9 | > For more information on communication mechanisms, please refer to [this link](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms).
10 | 
11 | ### Initializing ETL with AIStore CLI
12 | 
13 | The following steps demonstrate how to initialize the echo transformer using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md):
14 | 
15 | ```bash
16 | $ cd transformers/echo
17 | 
18 | $ # Mention communication type b/w target and container
19 | $ export COMMUNICATION_TYPE='hpull://'
20 | 
21 | # Substitute env variables in spec file
22 | $ envsubst < pod.yaml > init_spec.yaml
23 | 
24 | $ # Initialize ETL
25 | $ ais etl init spec --from-file init_spec.yaml --name <etl-name> --comm-type "$COMMUNICATION_TYPE"
26 | 
27 | $ # Transform and retrieve objects from the bucket using this ETL
28 | $ # For inline transformation
29 | $ ais etl object <etl-name> ais://<bck-name>/<obj-name>.<ext> -
30 | 
31 | $ # Or, for offline (bucket-to-bucket) transformation
32 | $ ais etl bucket <etl-name> ais://src-bck ais://dst-bck 
33 | ```


--------------------------------------------------------------------------------
/transformers/echo/fastapi_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A FastAPI echo server that returns the input data as output.
 3 | 
 4 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 5 | """
 6 | 
 7 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer
 8 | 
 9 | 
10 | class EchoServerFastAPI(FastAPIServer):
11 |     """
12 |     FastAPI-based ETL server whose transform is the identity: input data is returned unchanged.
13 |     """
14 | 
15 |     def transform(self, data, *_args):  # extra positional arguments are accepted and ignored
16 |         return data
17 | 
18 | 
19 | # Create the server instance and expose the FastAPI app
20 | fastapi_server = EchoServerFastAPI(port=8000)
21 | fastapi_server.logger.setLevel("DEBUG")
22 | fastapi_app = fastapi_server.app  # Expose the FastAPI app
23 | 


--------------------------------------------------------------------------------
/transformers/echo/flask_server.py:
--------------------------------------------------------------------------------
 1 | """A simple echo server that returns the input data as output.
 2 | 
 3 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 4 | 
 5 | """
 6 | 
 7 | from aistore.sdk.etl.webserver.flask_server import FlaskServer
 8 | 
 9 | 
10 | class EchoServerFlask(FlaskServer):
11 |     """
12 |     Flask-based ETL server whose transform is the identity: input data is returned unchanged.
13 |     """
14 | 
15 |     def transform(self, data, *_args):  # extra positional arguments are accepted and ignored
16 |         return data
17 | 
18 | 
19 | flask_server = EchoServerFlask(port=8000)
20 | flask_server.logger.setLevel("DEBUG")
21 | flask_app = flask_server.app
22 | 


--------------------------------------------------------------------------------
/transformers/echo/http_server.py:
--------------------------------------------------------------------------------
 1 | """A simple echo server that returns the input data as output.
 2 | 
 3 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 4 | 
 5 | """
 6 | 
 7 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer
 8 | 
 9 | 
10 | class EchoServer(HTTPMultiThreadedServer):
11 |     """
12 |     Multithreaded-HTTP ETL server whose transform is the identity: input data is returned unchanged.
13 |     """
14 | 
15 |     def transform(self, data, *_args):  # extra positional arguments are accepted and ignored
16 |         return data
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     echo_server = EchoServer(port=8000)
21 |     echo_server.logger.setLevel("DEBUG")
22 |     echo_server.start()
23 | 


--------------------------------------------------------------------------------
/transformers/echo/pod.yaml:
--------------------------------------------------------------------------------
# Pod spec for the Python echo ETL transformer.
apiVersion: v1
kind: Pod
metadata:
  name: transformer-echo
  annotations:
    # Values it can take ["hpull://", "hpush://"]
    # NOTE(review): the `${VAR:-default}` form is shell-style default syntax;
    # confirm that whatever renders this template expands it.
    communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
    wait_timeout: 5m
    support_direct_put: "true"
spec:
  containers:
    - name: server
      image: aistorage/transformer_echo:latest
      imagePullPolicy: Always
      ports:
        - name: default
          containerPort: 8000
      # Exactly one `command` should be active; the FastAPI variant is enabled
      # and the Flask and plain-HTTP variants are kept commented out.
      # for flask based app
      # command: ["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"]
      # for http based app
      # command: ["python", "http_server.py"]
      # for fastapi based app
      command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"]
      # Readiness is probed over HTTP at /health on the named port above.
      readinessProbe:
        httpGet:
          path: /health
          port: default
  # Optional host-path mount, disabled by default.
  #     volumeMounts:
  #       - name: ais
  #         mountPath: /tmp/
  # volumes:
  #   - name: ais
  #     hostPath:
  #       path: /tmp/
  #       type: Directory
36 | 


--------------------------------------------------------------------------------
/transformers/face_detection/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Prior to building this image make you own kaggle_creds.json file
 2 | # containing kaggle keys to download dataset
 3 | FROM docker.io/library/python:3.8-slim
 4 | 
 5 | WORKDIR /
 6 | 
 7 | # install packages needed for open-cv to work
 8 | RUN apt-get update && apt-get -y install gcc ffmpeg libsm6 libxext6 unzip curl
 9 | 
10 | # install python dependencies
11 | COPY ./requirements.txt requirements.txt
12 | RUN pip3 install --no-cache-dir --upgrade -r requirements.txt
13 | 
14 | # Make .kaggle directory and copy creds
15 | RUN mkdir ~/.kaggle
16 | COPY kaggle_creds.json /root/.kaggle/kaggle.json
17 | 
18 | # Give read and write permissions to kaggle.json
19 | RUN chmod 600 /root/.kaggle/kaggle.json
20 | 
21 | # Create a directory to store the model
22 | RUN mkdir model
23 | 
24 | # Download the dataset
25 | RUN kaggle datasets download -d sambitmukherjee/caffe-face-detector-opencv-pretrained-model && \
26 |     unzip caffe-face-detector-opencv-pretrained-model.zip -d model/ && \
27 |     rm caffe-face-detector-opencv-pretrained-model.zip && \
28 |     rm /root/.kaggle/kaggle.json
29 | 
30 | COPY main.py main.py
31 | 
32 | ENV PYTHONUNBUFFERED 1
33 | 
34 | ENV LOG_LEVEL DEBUG
35 | 
36 | EXPOSE 8000
37 | 


--------------------------------------------------------------------------------
/transformers/face_detection/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_face_detection:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_face_detection:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/face_detection/README.md:
--------------------------------------------------------------------------------
 1 | # Face Detection Using Single Shot Multibox Detector (SSD) Model
 2 | 
 3 | This document outlines the process of utilizing the `Single Shot MultiBox Detector (SSD)` model for face detection in images. The SSD model predicts and places bounding boxes over faces in an image. For further reading on the SSD model, visit the [research paper](https://arxiv.org/abs/1512.02325).
 4 | 
 5 | ![output](sample/output_face_detection.png)
 6 | 
 7 | > **Note**: Due to size constraints, the model's weights and architecture are not included in this directory. They are pre-loaded in the transformer's Docker [image](https://hub.docker.com/r/aistorage/transformer_face_detection).
 8 | 
 9 | ## Image Format Specification
10 | 
11 | The image format (`jpeg`, `png`, etc.) used for processing and storing images is set through the `FORMAT` environment variable in the [`pod.yaml`](pod.yaml) file.
12 | 
13 | ## Transformer Communication Mechanisms
14 | 
15 | The transformer is compatible with `hpull` and `hpush` for seamless integration. Detailed information about these communication mechanisms can be found [here](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms).
16 | 
17 | ## Recommended Parameter Setting
18 | 
19 | For efficient transformation, use `fqn` as `ARG_TYPE` in the [`pod.yaml`](pod.yaml) file. This approach allows for local object reading from the target, reducing the time required for each transformation.
20 | 
21 | ## Web Server Framework
22 | 
23 | The transformer is built on the [`FastAPI`](https://fastapi.tiangolo.com/) framework and is served by [`Gunicorn`](https://gunicorn.org/) running [`Uvicorn`](https://www.uvicorn.org/) worker processes.
24 | 
25 | ## Configurable Parameters
26 | 
27 | Adjust the following parameters in the `pod.yaml` file as per your requirements:
28 | 
29 | | Argument   | Description                                                         | Default Value |
30 | |------------|---------------------------------------------------------------------|---------------|
31 | | `FORMAT`   | Image format for processing/storing (png, jpeg, etc.)                | "jpeg"        |
32 | | `ARG_TYPE` | Local object reading (`fqn`) vs. HTTP request for object retrieval   | ""            |
33 | | `FILE_FORMAT` | Configure as "tar" for processing datasets in the webdataset format or for handling batches of images packaged in a tarball   | ""            |
34 | 
35 | ### Setting Up the Face Detection Transformer with AIStore CLI
36 | 
37 | To initialize the `Face Detection Transformer` using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md), follow these steps:
38 | 
39 | ```bash
40 | # Navigate to the transformer directory
41 | cd transformers/face_detection
42 | 
43 | # Set FORMAT and ARG_TYPE environment variables
44 | export FORMAT="jpeg"
45 | export ARG_TYPE="" # Or use 'fqn' for local reading
46 | export FILE_FORMAT="" # or use "tar", if using webdataset format
47 | 
48 | # Define communication type
49 | export COMMUNICATION_TYPE="hpush://"
50 | 
51 | # Generate an initialization specification file
52 | envsubst < pod.yaml > init_spec.yaml
53 | 
54 | # Initialize the ETL process
55 | ais etl init spec --from-file init_spec.yaml --name <etl-name> --comm-type $COMMUNICATION_TYPE
56 | 
57 | # Use the ETL for transforming and retrieving objects
58 | # For inline transformation
59 | ais etl object <etl-name> ais://src/<image-name>.JPEG dst.JPEG
60 | 
61 | # For offline (bucket-to-bucket) transformation
62 | ais etl bucket <etl-name> ais://src-bck ais://dst-bck --ext="{jpg:jpg}"
63 | 
64 | # or, if using webdataset style format
65 | # ais etl bucket <etl-name> ais://src-bck ais://dst-bck --ext="{tar:tar}"
66 | ```


--------------------------------------------------------------------------------
/transformers/face_detection/pod.yaml:
--------------------------------------------------------------------------------
# Pod spec template for the face-detection ETL transformer.
# The ${...} placeholders are filled in from environment variables before use.
apiVersion: v1
kind: Pod
metadata:
  name: transformer-face-detection
  annotations:
    # Values it can take ["hpull://","hpush://"]
    communication_type: "${COMMUNICATION_TYPE}"
    wait_timeout: 5m
spec:
  containers:
    - name: server
      image: aistorage/transformer_face_detection:latest
      imagePullPolicy: Always
      ports:
        - name: default
          containerPort: 8000
      # Gunicorn with 5 Uvicorn workers serving `main:app` on port 8000;
      # worker timeout is 300 seconds.
      command:  ["gunicorn", "main:app", "--workers", "5", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"]
      env:
        # Image format. Expected values: png, jpeg, etc.
        - name: FORMAT
          value: "${FORMAT}"
        # Set to `fqn` to use the volume mounted below; may be left empty.
        - name: ARG_TYPE
          value: "${ARG_TYPE}"
        - name: FILE_FORMAT
          value: "${FILE_FORMAT}"
      # This is a health check endpoint which one should specify
      # for aistore to determine the health of the ETL container.
      readinessProbe:
        httpGet:
          path: /health
          port: default
      # volume mounts and volumes are needed if you are planning to use ARG_TYPE = `fqn`
      volumeMounts:
        - name: ais
          mountPath: /mnt/data/ais
  volumes:
    - name: ais
      hostPath:
        path: /mnt/data/ais
        type: Directory
41 | 


--------------------------------------------------------------------------------
/transformers/face_detection/requirements.txt:
--------------------------------------------------------------------------------
 1 | fastapi>=0.109.1
 2 | uvicorn[standard]
 3 | gunicorn
 4 | aiohttp>=3.9.2
 5 | numpy
 6 | opencv-python
 7 | aiofiles
 8 | kaggle==1.5.16
 9 | webdataset==0.2.86
10 | Pillow>=10.0.1


--------------------------------------------------------------------------------
/transformers/face_detection/sample/output_face_detection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/face_detection/sample/output_face_detection.png


--------------------------------------------------------------------------------
/transformers/go_FFmpeg/Dockerfile:
--------------------------------------------------------------------------------
# Two-stage build for the Go FFmpeg transformer: compile the binary in a Go
# toolchain image, then copy it into an Alpine image that has ffmpeg installed.

# Stage 1: Build the binary
FROM docker.io/library/golang:1.24-alpine AS builder

RUN apk add --no-cache git

WORKDIR /app

# Copy only the module files first and download dependencies before the
# sources are copied in.
COPY src/go.mod src/go.sum ./
RUN go mod download

COPY src/ ./
RUN go build -o go_ffmpeg

# Stage 2: Minimal runtime image
FROM alpine:3.19

# Install ffmpeg
RUN apk add --no-cache ffmpeg

WORKDIR /app
COPY --from=builder /app/go_ffmpeg .

# 8000 is the default of the binary's -p flag
EXPOSE 8000
CMD ["./go_ffmpeg"]
25 | 


--------------------------------------------------------------------------------
/transformers/go_FFmpeg/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_ffmpeg_go:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_ffmpeg_go:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/go_FFmpeg/pod.yaml:
--------------------------------------------------------------------------------
# Pod spec for the Go FFmpeg ETL transformer.
apiVersion: v1
kind: Pod
metadata:
  name: ffmpeg-go
  annotations:
    # Values it can take ["hpull://","hpush://"]
    # NOTE(review): the `${VAR:-default}` form is shell-style default syntax;
    # confirm that whatever renders this template expands it.
    communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
    wait_timeout: 5m
    support_direct_put: "true"
spec:
  containers:
    - name: server
      image: aistorage/transformer_ffmpeg_go:latest
      imagePullPolicy: Always
      ports:
        - name: default
          containerPort: 8000
      # -l is the listen address and -p the port, matching containerPort above.
      command: ['./go_ffmpeg', '-l', '0.0.0.0', '-p', '8000']
      # Readiness is probed over HTTP at /health on the named port above.
      readinessProbe:
        httpGet:
          path: /health
          port: default
23 | 


--------------------------------------------------------------------------------
/transformers/go_FFmpeg/src/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/NVIDIA/ais-etl/transformers/go_ffmpeg/src
 2 | 
 3 | go 1.24
 4 | 
 5 | require github.com/NVIDIA/aistore v1.3.29-0.20250514164659-82fcb58b08f3
 6 | 
 7 | require (
 8 | 	github.com/OneOfOne/xxhash v1.2.8 // indirect
 9 | 	github.com/beorn7/perks v1.0.1 // indirect
10 | 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
11 | 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
12 | 	github.com/emicklei/go-restful/v3 v3.12.2 // indirect
13 | 	github.com/fxamacker/cbor/v2 v2.8.0 // indirect
14 | 	github.com/go-logr/logr v1.4.2 // indirect
15 | 	github.com/go-openapi/jsonpointer v0.21.1 // indirect
16 | 	github.com/go-openapi/jsonreference v0.21.0 // indirect
17 | 	github.com/go-openapi/swag v0.23.1 // indirect
18 | 	github.com/gogo/protobuf v1.3.2 // indirect
19 | 	github.com/golang/protobuf v1.5.4 // indirect
20 | 	github.com/google/gnostic-models v0.6.9 // indirect
21 | 	github.com/google/go-cmp v0.7.0 // indirect
22 | 	github.com/google/gofuzz v1.2.0 // indirect
23 | 	github.com/google/uuid v1.6.0 // indirect
24 | 	github.com/gorilla/websocket v1.5.3 // indirect
25 | 	github.com/josharian/intern v1.0.0 // indirect
26 | 	github.com/json-iterator/go v1.1.12 // indirect
27 | 	github.com/karrick/godirwalk v1.17.0 // indirect
28 | 	github.com/lufia/iostat v1.2.1 // indirect
29 | 	github.com/mailru/easyjson v0.9.0 // indirect
30 | 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
31 | 	github.com/modern-go/reflect2 v1.0.2 // indirect
32 | 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
33 | 	github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c // indirect
34 | 	github.com/pierrec/lz4/v4 v4.1.22 // indirect
35 | 	github.com/pkg/errors v0.9.1 // indirect
36 | 	github.com/prometheus/client_golang v1.22.0 // indirect
37 | 	github.com/prometheus/client_model v0.6.2 // indirect
38 | 	github.com/prometheus/common v0.63.0 // indirect
39 | 	github.com/prometheus/procfs v0.16.0 // indirect
40 | 	github.com/spf13/pflag v1.0.6 // indirect
41 | 	github.com/teris-io/shortid v0.0.0-20220617161101-71ec9f2aa569 // indirect
42 | 	github.com/tinylib/msgp v1.2.5 // indirect
43 | 	github.com/x448/float16 v0.8.4 // indirect
44 | 	golang.org/x/net v0.39.0 // indirect
45 | 	golang.org/x/oauth2 v0.29.0 // indirect
46 | 	golang.org/x/sync v0.13.0 // indirect
47 | 	golang.org/x/sys v0.32.0 // indirect
48 | 	golang.org/x/term v0.31.0 // indirect
49 | 	golang.org/x/text v0.24.0 // indirect
50 | 	golang.org/x/time v0.11.0 // indirect
51 | 	google.golang.org/protobuf v1.36.6 // indirect
52 | 	gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
53 | 	gopkg.in/inf.v0 v0.9.1 // indirect
54 | 	gopkg.in/yaml.v3 v3.0.1 // indirect
55 | 	k8s.io/api v0.32.3 // indirect
56 | 	k8s.io/apimachinery v0.32.3 // indirect
57 | 	k8s.io/client-go v0.32.3 // indirect
58 | 	k8s.io/klog/v2 v2.130.1 // indirect
59 | 	k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
60 | 	k8s.io/metrics v0.32.3 // indirect
61 | 	k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e // indirect
62 | 	sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
63 | 	sigs.k8s.io/randfill v1.0.0 // indirect
64 | 	sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
65 | 	sigs.k8s.io/yaml v1.4.0 // indirect
66 | )
67 | 


--------------------------------------------------------------------------------
/transformers/go_FFmpeg/src/main.go:
--------------------------------------------------------------------------------
 1 | // Package main is implementation of FFmpeg transformation in golang.
 2 | /*
 3 |  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 4 |  */
 5 | package main
 6 | 
 7 | import (
 8 | 	"bytes"
 9 | 	"flag"
10 | 	"fmt"
11 | 	"io"
12 | 	"log"
13 | 	"os"
14 | 	"os/exec"
15 | 	"path/filepath"
16 | 	"strings"
17 | 
18 | 	"github.com/NVIDIA/aistore/cmn/cos"
19 | 	"github.com/NVIDIA/aistore/ext/etl/webserver"
20 | )
21 | 
// FFmpegServer is the ETL server for the FFmpeg audio transformation.
//
// It embeds webserver.ETLServer and carries the two output settings that
// Transform passes to ffmpeg on the command line.
type FFmpegServer struct {
	webserver.ETLServer
	channels   string // value of ffmpeg's -ac flag; main defaults it to "1"
	samplerate string // value of ffmpeg's -ar flag; main defaults it to "44100"
}
27 | 
28 | var audioExts = cos.NewStrSet(".wav", ".flac", ".mp3", ".m4a", ".opus", ".ogg")
29 | 
30 | func (fs *FFmpegServer) Transform(input io.ReadCloser, path, args string) (io.ReadCloser, error) {
31 | 	ext := strings.ToLower(filepath.Ext(path))
32 | 	if !audioExts.Contains(ext) {
33 | 		// If it's not an audio file we recognize, return as-is
34 | 		buf, err := io.ReadAll(input)
35 | 		if err != nil {
36 | 			return nil, fmt.Errorf("reading input: %w", err)
37 | 		}
38 | 		return io.NopCloser(bytes.NewReader(buf)), nil
39 | 	}
40 | 
41 | 	cmd := exec.Command("ffmpeg",
42 | 		"-nostdin",
43 | 		"-loglevel", "error",
44 | 		"-i", "pipe:0",
45 | 		"-ac", fs.channels,
46 | 		"-ar", fs.samplerate,
47 | 		"-c:a", "pcm_s16le",
48 | 		"-f", "wav",
49 | 		"pipe:1",
50 | 	)
51 | 	cmd.Stderr = &bytes.Buffer{}
52 | 	cmd.Stdin = input
53 | 	out, err := cmd.Output() // TODO: use cmd.StdoutPipe() to achieve better concurrency
54 | 	if err != nil {
55 | 		errMsg := cmd.Stderr.(*bytes.Buffer).String()
56 | 		return nil, fmt.Errorf("ffmpeg error: %s", strings.TrimSpace(errMsg))
57 | 	}
58 | 	return io.NopCloser(bytes.NewReader(out)), nil
59 | }
60 | 
61 | var _ webserver.ETLServer = (*FFmpegServer)(nil)
62 | 
63 | func main() {
64 | 	listenAddr := flag.String("l", "0.0.0.0", "IP address to listen on")
65 | 	port := flag.Int("p", 8000, "Port to listen on")
66 | 	flag.Parse()
67 | 
68 | 	svr := &FFmpegServer{}
69 | 	if svr.channels = os.Getenv("AC"); svr.channels == "" {
70 | 		svr.channels = "1"
71 | 	}
72 | 	if svr.samplerate = os.Getenv("AR"); svr.samplerate == "" {
73 | 		svr.samplerate = "44100"
74 | 	}
75 | 
76 | 	if err := webserver.Run(svr, *listenAddr, *port); err != nil {
77 | 		log.Fatalf("Failed to start server: %v", err)
78 | 	}
79 | }
80 | 


--------------------------------------------------------------------------------
/transformers/go_FFmpeg/src/main_test.go:
--------------------------------------------------------------------------------
 1 | // Package main is implementation of FFmpeg transformation in golang.
 2 | /*
 3 |  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 4 |  */
 5 | package main
 6 | 
 7 | import (
 8 | 	"bytes"
 9 | 	"io"
10 | 	"os"
11 | 	"testing"
12 | 
13 | 	"github.com/NVIDIA/aistore/tools/tassert"
14 | 	"github.com/NVIDIA/aistore/tools/tlog"
15 | )
16 | 
17 | // NOTE: This test requires ffmpeg to be installed and available in the PATH.
18 | func TestFFmpegTransform(t *testing.T) {
19 | 	filename := "../../tests/resources/test-audio-wav.wav"
20 | 	input, err := os.Open(filename)
21 | 	tassert.CheckError(t, err)
22 | 
23 | 	// Send it to the ETL server
24 | 	svr := &FFmpegServer{
25 | 		channels:   "1",
26 | 		samplerate: "44100",
27 | 	}
28 | 
29 | 	transformed, err := svr.Transform(input, filename, "")
30 | 	tassert.CheckError(t, err)
31 | 
32 | 	output, err := io.ReadAll(transformed)
33 | 	tlog.Logf("Transformed output size: %d\n", len(output))
34 | 	tassert.CheckError(t, err)
35 | 	tassert.Fatalf(t, bytes.HasPrefix(output, []byte("RIFF")), "Output is not a valid WAV file")
36 | }
37 | 
38 | // NOTE: This test requires ffmpeg to be installed and available in the PATH.
39 | func TestFFmpegTransformMP3(t *testing.T) {
40 | 	filename := "../../tests/resources/test-audio-mp3.mp3"
41 | 	input, err := os.Open(filename)
42 | 	tassert.CheckError(t, err)
43 | 
44 | 	svr := &FFmpegServer{
45 | 		channels:   "1",
46 | 		samplerate: "16000", // downsample to emphasize transformation
47 | 	}
48 | 
49 | 	// Run the transform
50 | 	transformed, err := svr.Transform(input, filename, "")
51 | 	tassert.CheckError(t, err)
52 | 
53 | 	// Read result
54 | 	output, err := io.ReadAll(transformed)
55 | 	tassert.CheckError(t, err)
56 | 
57 | 	tlog.Logf("Transformed output size: %d bytes\n", len(output))
58 | 	tlog.Logln(string(output[:10]))
59 | 
60 | 	// Assert basic WAV structure
61 | 	tassert.Fatalf(t, bytes.HasPrefix(output, []byte("RIFF")), "Missing RIFF header")
62 | 	tassert.Fatalf(t, bytes.Contains(output, []byte("WAVEfmt ")), "Missing WAVE format chunk")
63 | 	tassert.Fatalf(t, bytes.Contains(output, []byte("data")), "Missing data chunk")
64 | 
65 | 	// Make sure it's not identical to input (to verify it's transformed)
66 | 	input.Seek(0, io.SeekStart)
67 | 	original, err := io.ReadAll(input)
68 | 	tassert.CheckError(t, err)
69 | 	tassert.Fatalf(t, !bytes.Equal(output, original), "Output should not be identical to input")
70 | }
71 | 


--------------------------------------------------------------------------------
/transformers/go_echo/Dockerfile:
--------------------------------------------------------------------------------
# Two-stage build for the Go echo transformer: compile the binary in a Go
# toolchain image, then copy it into a bare Alpine image.

# Stage 1: Build the binary
FROM docker.io/library/golang:1.24-alpine AS builder

RUN apk add --no-cache git

WORKDIR /app

# Copy only the module files first and download dependencies before the
# sources are copied in.
COPY src/go.mod src/go.sum ./
RUN go mod download

COPY src/ ./
RUN go build -o echo

# Stage 2: Minimal runtime image
FROM alpine:3.19

WORKDIR /app
COPY --from=builder /app/echo .

# 8000 is the default of the binary's -p flag
EXPOSE 8000
CMD ["./echo"]
22 | 


--------------------------------------------------------------------------------
/transformers/go_echo/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_echo_go:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_echo_go:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/go_echo/pod.yaml:
--------------------------------------------------------------------------------
# Pod spec for the Go echo ETL transformer.
apiVersion: v1
kind: Pod
metadata:
  name: echo-go
  annotations:
    # Values it can take ["hpull://","hpush://"]
    # NOTE(review): the `${VAR:-default}` form is shell-style default syntax;
    # confirm that whatever renders this template expands it.
    communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
    wait_timeout: 5m
    support_direct_put: "true"
spec:
  containers:
    - name: server
      image: aistorage/transformer_echo_go:latest
      imagePullPolicy: Always
      ports:
        - name: default
          containerPort: 8000
      # -l is the listen address and -p the port, matching containerPort above.
      command: ['./echo', '-l', '0.0.0.0', '-p', '8000']
      # Readiness is probed over HTTP at /health on the named port above.
      readinessProbe:
        httpGet:
          path: /health
          port: default
23 | 


--------------------------------------------------------------------------------
/transformers/go_echo/src/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/NVIDIA/ais-etl/transformers/go_echo/src
 2 | 
 3 | go 1.24
 4 | 
 5 | require github.com/NVIDIA/aistore v1.3.29-0.20250514164659-82fcb58b08f3
 6 | 
 7 | require (
 8 | 	github.com/OneOfOne/xxhash v1.2.8 // indirect
 9 | 	github.com/beorn7/perks v1.0.1 // indirect
10 | 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
11 | 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
12 | 	github.com/emicklei/go-restful/v3 v3.12.2 // indirect
13 | 	github.com/fxamacker/cbor/v2 v2.8.0 // indirect
14 | 	github.com/go-logr/logr v1.4.2 // indirect
15 | 	github.com/go-openapi/jsonpointer v0.21.1 // indirect
16 | 	github.com/go-openapi/jsonreference v0.21.0 // indirect
17 | 	github.com/go-openapi/swag v0.23.1 // indirect
18 | 	github.com/gogo/protobuf v1.3.2 // indirect
19 | 	github.com/golang/protobuf v1.5.4 // indirect
20 | 	github.com/google/gnostic-models v0.6.9 // indirect
21 | 	github.com/google/go-cmp v0.7.0 // indirect
22 | 	github.com/google/gofuzz v1.2.0 // indirect
23 | 	github.com/google/uuid v1.6.0 // indirect
24 | 	github.com/gorilla/websocket v1.5.3 // indirect
25 | 	github.com/josharian/intern v1.0.0 // indirect
26 | 	github.com/json-iterator/go v1.1.12 // indirect
27 | 	github.com/karrick/godirwalk v1.17.0 // indirect
28 | 	github.com/lufia/iostat v1.2.1 // indirect
29 | 	github.com/mailru/easyjson v0.9.0 // indirect
30 | 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
31 | 	github.com/modern-go/reflect2 v1.0.2 // indirect
32 | 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
33 | 	github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c // indirect
34 | 	github.com/pierrec/lz4/v4 v4.1.22 // indirect
35 | 	github.com/pkg/errors v0.9.1 // indirect
36 | 	github.com/prometheus/client_golang v1.22.0 // indirect
37 | 	github.com/prometheus/client_model v0.6.2 // indirect
38 | 	github.com/prometheus/common v0.63.0 // indirect
39 | 	github.com/prometheus/procfs v0.16.0 // indirect
40 | 	github.com/spf13/pflag v1.0.6 // indirect
41 | 	github.com/teris-io/shortid v0.0.0-20220617161101-71ec9f2aa569 // indirect
42 | 	github.com/tinylib/msgp v1.2.5 // indirect
43 | 	github.com/x448/float16 v0.8.4 // indirect
44 | 	golang.org/x/net v0.39.0 // indirect
45 | 	golang.org/x/oauth2 v0.29.0 // indirect
46 | 	golang.org/x/sync v0.13.0 // indirect
47 | 	golang.org/x/sys v0.32.0 // indirect
48 | 	golang.org/x/term v0.31.0 // indirect
49 | 	golang.org/x/text v0.24.0 // indirect
50 | 	golang.org/x/time v0.11.0 // indirect
51 | 	google.golang.org/protobuf v1.36.6 // indirect
52 | 	gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
53 | 	gopkg.in/inf.v0 v0.9.1 // indirect
54 | 	gopkg.in/yaml.v3 v3.0.1 // indirect
55 | 	k8s.io/api v0.32.3 // indirect
56 | 	k8s.io/apimachinery v0.32.3 // indirect
57 | 	k8s.io/client-go v0.32.3 // indirect
58 | 	k8s.io/klog/v2 v2.130.1 // indirect
59 | 	k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
60 | 	k8s.io/metrics v0.32.3 // indirect
61 | 	k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e // indirect
62 | 	sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
63 | 	sigs.k8s.io/randfill v1.0.0 // indirect
64 | 	sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
65 | 	sigs.k8s.io/yaml v1.4.0 // indirect
66 | )
67 | 


--------------------------------------------------------------------------------
/transformers/go_echo/src/main.go:
--------------------------------------------------------------------------------
 1 | // Package main is implementation of ID (echo) transformation in golang.
 2 | /*
 3 |  * Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
 4 |  */
 5 | package main
 6 | 
 7 | import (
 8 | 	"bytes"
 9 | 	"flag"
10 | 	"io"
11 | 	"log"
12 | 
13 | 	"github.com/NVIDIA/aistore/ext/etl/webserver"
14 | )
15 | 
// EchoServer is the ETL server for the identity (echo) transformation.
// It embeds webserver.ETLServer and holds no state of its own.
type EchoServer struct {
	webserver.ETLServer
}
19 | 
20 | func (es *EchoServer) Transform(input io.ReadCloser, path, args string) (io.ReadCloser, error) {
21 | 	data, err := io.ReadAll(input)
22 | 	if err != nil {
23 | 		return nil, err
24 | 	}
25 | 	input.Close()
26 | 	return io.NopCloser(bytes.NewReader(data)), nil
27 | }
28 | 
29 | var _ webserver.ETLServer = (*EchoServer)(nil)
30 | 
31 | func main() {
32 | 	listenAddr := flag.String("l", "0.0.0.0", "IP address to listen on")
33 | 	port := flag.Int("p", 8000, "Port to listen on")
34 | 	flag.Parse()
35 | 
36 | 	svr := &EchoServer{}
37 | 	if err := webserver.Run(svr, *listenAddr, *port); err != nil {
38 | 		log.Fatalf("Failed to start server: %v", err)
39 | 	}
40 | }
41 | 


--------------------------------------------------------------------------------
/transformers/go_hello_world/Dockerfile:
--------------------------------------------------------------------------------
# Two-stage build for the Go hello-world transformer: compile the binary in a
# Go toolchain image, then copy it into a bare Alpine image.

# Stage 1: Build the binary
FROM docker.io/library/golang:1.24-alpine AS builder

RUN apk add --no-cache git

WORKDIR /app

# Copy only the module files first and download dependencies before the
# sources are copied in.
COPY src/go.mod src/go.sum ./
RUN go mod download

COPY src/ ./
RUN go build -o hello_world

# Stage 2: Minimal runtime image
FROM alpine:3.19

WORKDIR /app
COPY --from=builder /app/hello_world .

# 80 is the default of the binary's -p flag
EXPOSE 80
CMD ["./hello_world"]
22 | 


--------------------------------------------------------------------------------
/transformers/go_hello_world/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_hello_world_go:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_hello_world_go:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/go_hello_world/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: hello-world-go-transformer
 5 |   annotations:
 6 |     # Values it can take ["hpull://","hpush://"]
 7 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
 8 |     wait_timeout: 5m
 9 | spec:
10 |   containers:
11 |     - name: server
12 |       image: aistorage/transformer_hello_world_go:latest
13 |       imagePullPolicy: Always
14 |       ports:
15 |         - name: default
16 |           containerPort: 80
17 |       command: ['./echo', '-l', '0.0.0.0', '-p', '80']
18 |       readinessProbe:
19 |         httpGet:
20 |           path: /health
21 |           port: default
22 | 


--------------------------------------------------------------------------------
/transformers/go_hello_world/src/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/NVIDIA/ais-etl/transformers/go_echo/src
 2 | 
 3 | go 1.24
 4 | 
 5 | require github.com/NVIDIA/aistore v1.3.28-0.20250501012007-d85f26c3c672
 6 | 
 7 | require (
 8 | 	github.com/OneOfOne/xxhash v1.2.8 // indirect
 9 | 	github.com/beorn7/perks v1.0.1 // indirect
10 | 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
11 | 	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
12 | 	github.com/emicklei/go-restful/v3 v3.12.2 // indirect
13 | 	github.com/fxamacker/cbor/v2 v2.8.0 // indirect
14 | 	github.com/go-logr/logr v1.4.2 // indirect
15 | 	github.com/go-openapi/jsonpointer v0.21.1 // indirect
16 | 	github.com/go-openapi/jsonreference v0.21.0 // indirect
17 | 	github.com/go-openapi/swag v0.23.1 // indirect
18 | 	github.com/gogo/protobuf v1.3.2 // indirect
19 | 	github.com/golang/protobuf v1.5.4 // indirect
20 | 	github.com/google/gnostic-models v0.6.9 // indirect
21 | 	github.com/google/go-cmp v0.7.0 // indirect
22 | 	github.com/google/gofuzz v1.2.0 // indirect
23 | 	github.com/google/uuid v1.6.0 // indirect
24 | 	github.com/gorilla/websocket v1.5.3 // indirect
25 | 	github.com/josharian/intern v1.0.0 // indirect
26 | 	github.com/json-iterator/go v1.1.12 // indirect
27 | 	github.com/karrick/godirwalk v1.17.0 // indirect
28 | 	github.com/lufia/iostat v1.2.1 // indirect
29 | 	github.com/mailru/easyjson v0.9.0 // indirect
30 | 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
31 | 	github.com/modern-go/reflect2 v1.0.2 // indirect
32 | 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
33 | 	github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c // indirect
34 | 	github.com/pierrec/lz4/v4 v4.1.22 // indirect
35 | 	github.com/pkg/errors v0.9.1 // indirect
36 | 	github.com/prometheus/client_golang v1.22.0 // indirect
37 | 	github.com/prometheus/client_model v0.6.2 // indirect
38 | 	github.com/prometheus/common v0.63.0 // indirect
39 | 	github.com/prometheus/procfs v0.16.0 // indirect
40 | 	github.com/spf13/pflag v1.0.6 // indirect
41 | 	github.com/teris-io/shortid v0.0.0-20220617161101-71ec9f2aa569 // indirect
42 | 	github.com/tinylib/msgp v1.2.5 // indirect
43 | 	github.com/x448/float16 v0.8.4 // indirect
44 | 	golang.org/x/net v0.39.0 // indirect
45 | 	golang.org/x/oauth2 v0.29.0 // indirect
46 | 	golang.org/x/sync v0.13.0 // indirect
47 | 	golang.org/x/sys v0.32.0 // indirect
48 | 	golang.org/x/term v0.31.0 // indirect
49 | 	golang.org/x/text v0.24.0 // indirect
50 | 	golang.org/x/time v0.11.0 // indirect
51 | 	google.golang.org/protobuf v1.36.6 // indirect
52 | 	gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
53 | 	gopkg.in/inf.v0 v0.9.1 // indirect
54 | 	gopkg.in/yaml.v3 v3.0.1 // indirect
55 | 	k8s.io/api v0.32.3 // indirect
56 | 	k8s.io/apimachinery v0.32.3 // indirect
57 | 	k8s.io/client-go v0.32.3 // indirect
58 | 	k8s.io/klog/v2 v2.130.1 // indirect
59 | 	k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
60 | 	k8s.io/metrics v0.32.3 // indirect
61 | 	k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e // indirect
62 | 	sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
63 | 	sigs.k8s.io/randfill v1.0.0 // indirect
64 | 	sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect
65 | 	sigs.k8s.io/yaml v1.4.0 // indirect
66 | )
67 | 


--------------------------------------------------------------------------------
/transformers/go_hello_world/src/main.go:
--------------------------------------------------------------------------------
 1 | // Package main is implementation of a simple hello world transformation in golang.
 2 | /*
 3 |  * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 4 |  */
 5 | package main
 6 | 
 7 | import (
 8 | 	"bytes"
 9 | 	"flag"
10 | 	"io"
11 | 	"log"
12 | 
13 | 	"github.com/NVIDIA/aistore/ext/etl/webserver"
14 | )
15 | 
16 | type HelloWorldServer struct {
17 | 	response string
18 | 	webserver.ETLServer
19 | }
20 | 
21 | func (es *HelloWorldServer) Transform(input io.ReadCloser, path, args string) (io.ReadCloser, error) {
22 | 	input.Close()
23 | 	return io.NopCloser(bytes.NewReader([]byte(es.response))), nil
24 | }
25 | 
26 | var _ webserver.ETLServer = (*HelloWorldServer)(nil)
27 | 
28 | func main() {
29 | 	listenAddr := flag.String("l", "0.0.0.0", "IP address to listen on")
30 | 	port := flag.Int("p", 80, "Port to listen on")
31 | 	flag.Parse()
32 | 
33 | 	svr := &HelloWorldServer{
34 | 		response: "Hello World!",
35 | 	}
36 | 
37 | 	if err := webserver.Run(svr, *listenAddr, *port); err != nil {
38 | 		log.Fatalf("Failed to start server: %v", err)
39 | 	}
40 | }
41 | 


--------------------------------------------------------------------------------
/transformers/hash_with_args/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.13-alpine
 2 | 
 3 | # Install git and build dependencies
 4 | RUN apk update && apk add --no-cache git gcc musl-dev libffi-dev
 5 | 
 6 | # Set working directory
 7 | WORKDIR /code
 8 | 
 9 | # Install Python dependencies
10 | RUN pip install --no-cache-dir --upgrade \
11 |     "git+https://github.com/NVIDIA/aistore.git@etl-args-webserver#subdirectory=python" \
12 |     "fastapi>=0.109.1" \
13 |     "httpx>=0.28.0" \
14 |     "aiofiles>=23.2.1" \
15 |     "uvicorn[standard]>=0.32.0" \
16 |     "flask>=2.3.0" \
17 |     "gunicorn>=23.0.0"
18 | 
19 | # Copy application code
20 | COPY flask_server.py fastapi_server.py http_server.py ./
21 | 
22 | # Environment setup
23 | ENV PYTHONUNBUFFERED=1
24 | 
25 | # Expose default port
26 | EXPOSE 8000
27 | 


--------------------------------------------------------------------------------
/transformers/hash_with_args/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_hash_with_args:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_hash_with_args:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/hash_with_args/README.md:
--------------------------------------------------------------------------------
 1 | # Hash with Args Transformer
 2 | 
 3 | A simple hash transformer that processes objects (bytes) by extracting the ETL argument from an inline transform request and using it as the seed value to compute a seeded hash. This example demonstrates how to pass custom metadata for each individual object through an ETL inline transform and utilize it within your pod.
 4 | 
 5 | ### Initializing ETL with AIStore CLI
 6 | 
 7 | The following steps demonstrate how to initialize the `transformer-hash-with-args` using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md):
 8 | 
 9 | ```bash
10 | $ cd transformers/hash_with_args
11 | 
12 | $ # Mention communication type b/w target and container
13 | $ export COMMUNICATION_TYPE='hpull://'
14 | 
15 | $ # Substitute env variables in spec file
16 | $ envsubst < pod.yaml > init_spec.yaml
17 | 
18 | $ # Initialize ETL
19 | $ ais etl init spec --from-file init_spec.yaml --name <etl-name> --comm-type "hpull://"
20 | 
21 | $ # Put an object
22 | $ ais object put <your-file> ais://<bck-name>
23 | 
24 | $ # Transform and retrieve objects from the bucket using this ETL with arguments
25 | $ curl -L -X GET "${AIS_ENDPOINT}/v1/objects/<bck-name>/<your-file>?etl_name=<etl-name>&etl_args=100000"
26 | ```


--------------------------------------------------------------------------------
/transformers/hash_with_args/fastapi_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | HashWithArgs ETL transformer (FastAPI)
 3 | 
 4 | FastAPI-based ETL server that computes an XXHash64 digest of each request's payload,
 5 | optionally seeded via the `etl_args` query parameter.
 6 | 
 7 | Environment:
 8 |   SEED_DEFAULT default integer seed if etl_args is missing or invalid (default: 0)
 9 | 
10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
11 | """
12 | 
13 | import os
14 | import logging
15 | from typing import Optional
16 | 
17 | import xxhash
18 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer
19 | 
20 | 
class HashWithArgs(FastAPIServer):
    """
    ETL server that computes an XXHash64 digest of each payload.

    Supports an optional `etl_args` parameter (string) specifying the numeric seed.
    """

    def __init__(
        self,
        port: int = 8000,
        *,
        default_seed: Optional[int] = None,
    ) -> None:
        """
        Initialize the HashWithArgs server.

        Args:
            port: TCP port to listen on (default 8000).
            default_seed: fallback seed if ETL args absent/invalid.
                If None, reads `SEED_DEFAULT` env var (defaulting to 0).
        """
        super().__init__(port=port)
        self.logger.setLevel(logging.DEBUG)
        # Compare against None rather than truthiness: an explicit
        # default_seed=0 is a valid seed and must not be replaced by the
        # SEED_DEFAULT environment variable.
        if default_seed is not None:
            self.default_seed = default_seed
        else:
            raw_seed = os.getenv("SEED_DEFAULT", "0")
            try:
                self.default_seed = int(raw_seed)
            except ValueError:
                self.logger.warning(
                    "Invalid SEED_DEFAULT='%s', falling back to 0",
                    raw_seed,
                )
                self.default_seed = 0

    def transform(
        self,
        data: bytes,
        _path: str,
        etl_args: str,
    ) -> bytes:
        """
        Compute the XXHash64 digest of the input data.

        Args:
            data: Raw request payload.
            _path: Request path or object key (unused here).
            etl_args: optional seed passed via `?etl_args=<seed>`. An empty
                value, or one that is not an integer, selects `default_seed`.

        Returns:
            The hexadecimal digest as ASCII-encoded bytes.
        """
        seed = self.default_seed
        if etl_args:
            try:
                seed = int(etl_args)
            except ValueError:
                # Keep serving with the default seed instead of failing the request.
                self.logger.warning(
                    "Invalid etl_args seed=%r, using default_seed=%d",
                    etl_args,
                    self.default_seed,
                )
        hasher = xxhash.xxh64(seed=seed)
        hasher.update(data)
        # hexdigest() is str → encode to ASCII bytes
        return hasher.hexdigest().encode("ascii")


# instantiate and expose
fastapi_server = HashWithArgs()
fastapi_app = fastapi_server.app
92 | 


--------------------------------------------------------------------------------
/transformers/hash_with_args/flask_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | HashWithArgs ETL transformer (Flask)
 3 | 
 4 | Flask-based ETL server that computes an XXHash64 digest of each request's payload,
 5 | optionally seeded via the `etl_args` query parameter.
 6 | 
 7 | Environment:
 8 |   SEED_DEFAULT default integer seed if etl_args is missing or invalid (default: 0)
 9 | 
10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
11 | """
12 | 
13 | import os
14 | import logging
15 | from typing import Optional
16 | 
17 | import xxhash
18 | from aistore.sdk.etl.webserver.flask_server import FlaskServer
19 | 
20 | 
class HashWithArgs(FlaskServer):
    """
    ETL server that computes an XXHash64 digest of each payload.

    Supports an optional `etl_args` parameter (string) specifying the numeric seed.
    """

    def __init__(
        self,
        port: int = 8000,
        *,
        default_seed: Optional[int] = None,
    ) -> None:
        """
        Initialize the HashWithArgs server.

        Args:
            port: TCP port to listen on (default 8000).
            default_seed: fallback seed if ETL args absent/invalid.
                If None, reads `SEED_DEFAULT` env var (defaulting to 0).
        """
        super().__init__(port=port)
        self.logger.setLevel(logging.DEBUG)
        # Compare against None rather than truthiness: an explicit
        # default_seed=0 is a valid seed and must not be replaced by the
        # SEED_DEFAULT environment variable.
        if default_seed is not None:
            self.default_seed = default_seed
        else:
            raw_seed = os.getenv("SEED_DEFAULT", "0")
            try:
                self.default_seed = int(raw_seed)
            except ValueError:
                self.logger.warning(
                    "Invalid SEED_DEFAULT='%s', falling back to 0",
                    raw_seed,
                )
                self.default_seed = 0

    def transform(
        self,
        data: bytes,
        _path: str,
        etl_args: str,
    ) -> bytes:
        """
        Compute the XXHash64 digest of the input data.

        Args:
            data: Raw request payload.
            _path: Request path or object key (unused here).
            etl_args: optional seed passed via `?etl_args=<seed>`. An empty
                value, or one that is not an integer, selects `default_seed`.

        Returns:
            The hexadecimal digest as ASCII-encoded bytes.
        """
        seed = self.default_seed
        if etl_args:
            try:
                seed = int(etl_args)
            except ValueError:
                # Keep serving with the default seed instead of failing the request.
                self.logger.warning(
                    "Invalid etl_args seed=%r, using default_seed=%d",
                    etl_args,
                    self.default_seed,
                )
        hasher = xxhash.xxh64(seed=seed)
        hasher.update(data)
        # hexdigest() is str → encode to ASCII bytes
        return hasher.hexdigest().encode("ascii")


# instantiate and expose
flask_server = HashWithArgs()
flask_app = flask_server.app
93 | 


--------------------------------------------------------------------------------
/transformers/hash_with_args/http_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | HashWithArgs ETL transformer (HTTP Server)
 3 | 
 4 | HTTP-based ETL server that computes an XXHash64 digest of each request's payload,
 5 | optionally seeded via the `etl_args` query parameter.
 6 | 
 7 | Environment:
 8 |   SEED_DEFAULT default integer seed if etl_args is missing or invalid (default: 0)
 9 | 
10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
11 | """
12 | 
13 | import os
14 | import logging
15 | from typing import Optional
16 | 
17 | import xxhash
18 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer
19 | 
20 | 
class HashWithArgs(HTTPMultiThreadedServer):
    """
    ETL server that computes an XXHash64 digest of each payload.

    Supports an optional `etl_args` parameter (string) specifying the numeric seed.
    """

    def __init__(
        self,
        port: int = 8000,
        *,
        default_seed: Optional[int] = None,
    ) -> None:
        """
        Initialize the HashWithArgs server.

        Args:
            port: TCP port to listen on (default 8000).
            default_seed: fallback seed if ETL args absent/invalid.
                If None, reads `SEED_DEFAULT` env var (defaulting to 0).
        """
        super().__init__(port=port)
        self.logger.setLevel(logging.DEBUG)
        # Compare against None rather than truthiness: an explicit
        # default_seed=0 is a valid seed and must not be replaced by the
        # SEED_DEFAULT environment variable.
        if default_seed is not None:
            self.default_seed = default_seed
        else:
            raw_seed = os.getenv("SEED_DEFAULT", "0")
            try:
                self.default_seed = int(raw_seed)
            except ValueError:
                self.logger.warning(
                    "Invalid SEED_DEFAULT='%s', falling back to 0",
                    raw_seed,
                )
                self.default_seed = 0

    def transform(
        self,
        data: bytes,
        _path: str,
        etl_args: str,
    ) -> bytes:
        """
        Compute the XXHash64 digest of the input data.

        Args:
            data: Raw request payload.
            _path: Request path or object key (unused here).
            etl_args: optional seed passed via `?etl_args=<seed>`. An empty
                value, or one that is not an integer, selects `default_seed`.

        Returns:
            The hexadecimal digest as ASCII-encoded bytes.
        """
        seed = self.default_seed
        if etl_args:
            try:
                seed = int(etl_args)
            except ValueError:
                # Keep serving with the default seed instead of failing the request.
                self.logger.warning(
                    "Invalid etl_args seed=%r, using default_seed=%d",
                    etl_args,
                    self.default_seed,
                )
        hasher = xxhash.xxh64(seed=seed)
        hasher.update(data)
        # hexdigest() is str → encode to ASCII bytes
        return hasher.hexdigest().encode("ascii")


# instantiate and expose
if __name__ == "__main__":
    server = HashWithArgs()
    server.start()
93 | 


--------------------------------------------------------------------------------
/transformers/hash_with_args/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-hash-with-args
 5 |   annotations:
 6 |     # Values it can take ["hpull://","hpush://"]
 7 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
 8 |     wait_timeout: 5m
 9 | spec:
10 |   containers:
11 |     - name: server
12 |       image: aistorage/transformer_hash_with_args:latest
13 |       imagePullPolicy: Always
14 |       ports:
15 |         - name: default
16 |           containerPort: 8000
17 |       # for flask based app
18 |       # command: ["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"]
19 |       # for http based app
20 |       # command: ["python", "http_server.py"]
21 |       # for fastapi based app
22 |       command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"]
23 |       readinessProbe:
24 |         httpGet:
25 |           path: /health
26 |           port: default
27 |       env:
28 |         - name: SEED_DEFAULT
29 |           value: "0"
30 |       # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 
31 |       # where the objects are stored on AIStore targets. This allows the ETL container 
32 |       # to access the files directly by absolute path.
33 |   #     volumeMounts:
34 |   #       - name: ais
35 |   #         mountPath: /tmp/
36 |   # volumes:
37 |   #   - name: ais
38 |   #     hostPath:
39 |   #       path: /tmp/
40 |   #       type: Directory
41 | 


--------------------------------------------------------------------------------
/transformers/hash_with_args/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | xxhash


--------------------------------------------------------------------------------
/transformers/hash_with_args/server.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import argparse
  4 | import xxhash
  5 | import requests
  6 | import os
  7 | import logging
  8 | from urllib.parse import urlparse, parse_qs
  9 | from http.server import HTTPServer, BaseHTTPRequestHandler
 10 | from socketserver import ThreadingMixIn
 11 | 
 12 | host_target = os.environ["AIS_TARGET_URL"]
 13 | seed_default = int(os.getenv("SEED_DEFAULT", "0"))
 14 | 
 15 | # Configure logging
 16 | logging.basicConfig(
 17 |     level=logging.INFO,
 18 |     format="%(asctime)s - %(levelname)s - %(message)s",
 19 | )
 20 | 
 21 | 
 22 | class Handler(BaseHTTPRequestHandler):
 23 |     def log_request(self, code="-", size="-"):
 24 |         # Don't log successful requests info. Unsuccessful logged by log_error().
 25 |         pass
 26 | 
 27 |     def _set_headers(self):
 28 |         self.send_response(200)
 29 |         self.send_header("Content-Type", "text/plain")
 30 |         self.end_headers()
 31 | 
 32 |     def do_PUT(self):
 33 |         try:
 34 |             content_length = int(self.headers["Content-Length"])
 35 |             post_data = self.rfile.read(content_length)
 36 |             parsed_url = urlparse(self.path)
 37 |             seed = seed_default
 38 |             logging.info("PUT request received")
 39 |             params = parse_qs(parsed_url.query)
 40 |             if "etl_args" in params:
 41 |                 seed = int(params["etl_args"][0])
 42 |                 logging.info("PUT request with seed %d", seed)
 43 | 
 44 |             hash_result = self.calculate_xxhash(post_data, seed)
 45 |             self._set_headers()
 46 |             self.wfile.write(hash_result.encode())
 47 |         except Exception as e:
 48 |             logging.error("Error in PUT request: %s", e)
 49 |             self.send_error(500, f"Internal Server Error: {e}")
 50 | 
 51 |     def do_GET(self):
 52 |         if self.path == "/health":
 53 |             self._set_headers()
 54 |             self.wfile.write(b"Running")
 55 |             return
 56 | 
 57 |         try:
 58 |             parsed_url = urlparse(self.path)
 59 |             x = requests.get(host_target + self.path)
 60 | 
 61 |             seed = seed_default
 62 |             logging.info("GET request received")
 63 |             params = parse_qs(parsed_url.query)
 64 |             if "etl_args" in params:
 65 |                 seed = int(params["etl_args"][0])
 66 |                 logging.info("GET request with seed %d", seed)
 67 | 
 68 |             hash_result = self.calculate_xxhash(x.content, seed)
 69 |             self._set_headers()
 70 |             self.wfile.write(hash_result.encode())
 71 |         except requests.HTTPError as http_err:
 72 |             logging.error("HTTP error in GET request: %s", http_err)
 73 |             self.send_error(502, f"Bad Gateway: {http_err}")
 74 |         except Exception as e:
 75 |             logging.error("Error in GET request: %s", e)
 76 |             self.send_error(500, f"Internal Server Error: {e}")
 77 | 
 78 |     def calculate_xxhash(self, data, seed):
 79 |         hasher = xxhash.xxh64(seed=seed)
 80 |         hasher.update(data)
 81 |         return hasher.hexdigest()
 82 | 
 83 | 
 84 | class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
 85 |     """Handle requests in a separate thread."""
 86 | 
 87 | 
 88 | def run(addr="localhost", port=8000):
 89 |     """Start the threaded HTTP server."""
 90 |     logging.info("Starting HTTP server on %s:%s", addr, port)
 91 |     try:
 92 |         server = ThreadedHTTPServer((addr, port), Handler)
 93 |         server.serve_forever()
 94 |     except KeyboardInterrupt:
 95 |         logging.info("Shutting down the server.")
 96 |     except Exception as e:
 97 |         logging.error("Unexpected server error: %s", e)
 98 |     finally:
 99 |         logging.info("Server stopped.")
100 | 
101 | 
if __name__ == "__main__":
    # Command-line entry point: -l/--listen selects the bind address and
    # -p/--port the TCP port; both are passed straight to run().
    cli = argparse.ArgumentParser(description="Run a simple HTTP server")
    cli.add_argument(
        "-l",
        "--listen",
        help="Specify the IP address on which the server listens",
        default="localhost",
    )
    cli.add_argument(
        "-p",
        "--port",
        help="Specify the port on which the server listens",
        default=8000,
        type=int,
    )
    options = cli.parse_args()
    run(addr=options.listen, port=options.port)
119 | 


--------------------------------------------------------------------------------
/transformers/hello_world/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.13-alpine
 2 | 
 3 | RUN pip3 install --upgrade aistore[etl]>=1.13.6
 4 | 
 5 | # Set working directory
 6 | RUN mkdir /code
 7 | WORKDIR /code
 8 | 
 9 | # Copy app code
10 | COPY flask_server.py fastapi_server.py http_server.py ./
11 | 
12 | # Environment setup
13 | ENV PYTHONUNBUFFERED=1
14 | 
15 | # Expose the default port
16 | EXPOSE 8000
17 | 


--------------------------------------------------------------------------------
/transformers/hello_world/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build --no-cache -t $(REGISTRY_URL)/transformer_hello_world:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_hello_world:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/hello_world/README.md:
--------------------------------------------------------------------------------
 1 | # Simple Hello World Transformer
 2 | 
 3 | A simple hello world transformer that reads objects stored in AIStore and returns the bytes "Hello World!" for every object.
 4 | 
 5 | The transformer supports both `hpull` and `hpush` communication mechanisms for seamless integration.
 6 | 
 7 | > For more information on communication mechanisms, please refer to [this link](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms).
 8 | 
 9 | ### Initializing ETL with AIStore CLI
10 | 
11 | The following steps demonstrate how to initialize the `hello-world-transformer` using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md):
12 | 
13 | ```bash
14 | $ cd transformers/hello_world
15 | 
16 | $ # Mention communication type b/w target and container
17 | $ export COMMUNICATION_TYPE='hpull://'
18 | 
19 | $ # Substitute env variables in spec file
20 | $ envsubst < pod.yaml > init_spec.yaml
21 | 
22 | $ # Initialize ETL
23 | $ ais etl init spec --from-file init_spec.yaml --name <etl-name>
24 | 
25 | $ # Transform and retrieve objects from the bucket using this ETL
26 | $ # For inline transformation
27 | $ ais etl object <etl-name> ais://<bck-name>/<obj-name>.<ext> -
28 | 
29 | $ # Or, for offline (bucket-to-bucket) transformation
30 | $ ais etl bucket <etl-name> ais://src-bck ais://dst-bck 
31 | ```


--------------------------------------------------------------------------------
/transformers/hello_world/fastapi_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A FastAPI-based beginner-friendly "Hello World" web server.
 3 | 
 4 | Responds with "Hello World!" to any GET or PUT request.
 5 | 
 6 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 7 | """
 8 | 
 9 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer
10 | 
11 | 
class HelloWorldServerFastAPI(FastAPIServer):
    """
    Minimal FastAPI ETL transformer: whatever the request path or payload,
    the transformed output is always b"Hello World!".
    """

    def transform(self, *_args) -> bytes:
        """Return the fixed greeting; every positional argument is ignored."""
        greeting = b"Hello World!"
        return greeting


# Build the server on port 8000, enable debug logging, and publish the
# FastAPI application object for uvicorn to run.
fastapi_server = HelloWorldServerFastAPI(port=8000)
fastapi_server.logger.setLevel("DEBUG")
fastapi_app = fastapi_server.app
26 | 


--------------------------------------------------------------------------------
/transformers/hello_world/flask_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A Flask-based beginner-friendly "Hello World" web server.
 3 | 
 4 | Responds with "Hello World!" to any GET or PUT request.
 5 | 
 6 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 7 | 
 8 | """
 9 | 
10 | from aistore.sdk.etl.webserver.flask_server import FlaskServer
11 | 
12 | 
class HelloWorldServerFlask(FlaskServer):
    """
    Minimal Flask ETL transformer: whatever the request path or payload,
    the transformed output is always b"Hello World!".
    """

    def transform(self, *_args) -> bytes:
        """Return the fixed greeting; every positional argument is ignored."""
        greeting = b"Hello World!"
        return greeting


# Build the server on port 8000, enable debug logging, and publish the
# Flask application object as `flask_app`.
flask_server = HelloWorldServerFlask(port=8000)
flask_server.logger.setLevel("DEBUG")
flask_app = flask_server.app
26 | 


--------------------------------------------------------------------------------
/transformers/hello_world/http_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A HTTP-based beginner-friendly "Hello World" web server.
 3 | 
 4 | Responds with "Hello World!" to any GET or PUT request.
 5 | 
 6 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 7 | 
 8 | """
 9 | 
10 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer
11 | 
12 | 
class HelloWorldHTTPServer(HTTPMultiThreadedServer):
    """
    Minimal HTTP ETL transformer: whatever the request path or payload,
    the transformed output is always b"Hello World!".
    """

    def transform(self, *_args) -> bytes:
        """Return the fixed greeting; every positional argument is ignored."""
        greeting = b"Hello World!"
        return greeting


if __name__ == "__main__":
    # Script entry point: build the server on port 8000, enable debug
    # logging, then start it.
    http_server = HelloWorldHTTPServer(port=8000)
    http_server.logger.setLevel("DEBUG")
    http_server.start()
27 | 


--------------------------------------------------------------------------------
/transformers/hello_world/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-hello-world
 5 |   annotations:
 6 |     # Values it can take ["hpull://", "hpush://"]
 7 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpush://\""}
 8 |     wait_timeout: 5m
 9 |     support_direct_put: "true"
10 | spec:
11 |   containers:
12 |     - name: server
13 |       image: aistorage/transformer_hello_world:latest
14 |       imagePullPolicy: Always
15 |       ports:
16 |         - name: default
17 |           containerPort: 8000
18 |       # for flask based app
19 |       # command: ["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"]
20 |       # for http based app
21 |       # command: ["python", "http_server.py"]
22 |       # for fastapi based app
23 |       command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"]
24 |       readinessProbe:
25 |         httpGet:
26 |           path: /health
27 |           port: default
28 |       # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 
29 |       # where the objects are stored on AIStore targets. This allows the ETL container 
30 |       # to access the files directly by absolute path.
31 |       volumeMounts:
32 |         - name: ais
33 |           mountPath: /mnt/data/ais
34 |   volumes:
35 |     - name: ais
36 |       hostPath:
37 |         path: /mnt/data/ais
38 |         type: Directory
39 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.9-slim
 2 | 
 3 | WORKDIR /
 4 | 
 5 | COPY ./requirements.txt requirements.txt
 6 | 
 7 | RUN pip3 install --no-cache-dir --upgrade -r requirements.txt
 8 | 
 9 | COPY main.py main.py
10 | 
11 | ENV PYTHONUNBUFFERED 1
12 | ENV TF_ENABLE_ONEDNN_OPTS 0
13 | 
14 | EXPOSE 8000
15 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_keras:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_keras:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/README.md:
--------------------------------------------------------------------------------
 1 | # Keras Transformer - Image Data Augmentation and Preprocessing
 2 | 
 3 | The Keras Transformer is a powerful tool designed for image data preprocessing and data augmentation. Leveraging the `apply_transform` function from Keras (TensorFlow), this transformer allows users to define transformations by providing a JSON string with parameter-value pairs. Currently, the following parameters are supported:
 4 | 
 5 | | Parameter                | Description                                             |
 6 | |-------------------------|---------------------------------------------------------|
 7 | | 'theta'                 | Rotation angle in degrees.                              |
 8 | | 'tx'                     | Shift in the x direction.                                 |
 9 | | 'ty'                     | Shift in the y direction.                                 |
10 | | 'shear'                 | Shear angle in degrees.                                    |
11 | | 'zx'                     | Zoom in the x direction.                                  |
12 | | 'zy'                     | Zoom in the y direction.                                  |
13 | | 'flip_horizontal'    | Boolean. Enable horizontal flip.                          |
14 | | 'flip_vertical'        | Boolean. Enable vertical flip.                              |
15 | | 'channel_shift_intensity' | Float. Channel shift intensity.                          |
16 | | 'brightness'            | Float. Brightness shift intensity.                          |
17 | 
18 | The image format (JPEG, PNG, etc.) of the images to be processed or stored is specified in the `pod.yaml`.
19 | 
20 | The transformer supports both `hpull` and `hpush` communication mechanisms for seamless integration.
21 | 
22 | **Please Note:** This transformer utilizes the [`FastAPI`](https://fastapi.tiangolo.com/) framework alongside the [`Gunicorn`](https://gunicorn.org/) + [Uvicorn](https://www.uvicorn.org/) combination as its web server. Alternate implementations of the same functionality are provided using [`Flask`](https://flask.palletsprojects.com/en/2.3.x/) and [`Gunicorn`](https://gunicorn.org/) within the [`flask-gunicorn`](/flask-gunicorn) directory. Additionally, there's a version that employs a multithreaded HTTP server, which can be found in the [`http-multithreaded-server`](/http-multithreaded-server/) folder.
23 | 
24 | > For more information on communication mechanisms, please refer to [this link](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms).
25 | 
26 | ## Parameters
27 | Only two parameters need to be updated in the `pod.yaml` file.
28 | 
29 | | Argument    | Description                                                           | Default Value |
30 | | ----------- | --------------------------------------------------------------------- | ------------- |
31 | | `TRANSFORM`      | Specify a JSON string with operations to be performed | ``     |
 32 | | `FORMAT`    | Image format in which to process/store images (PNG, JPEG, etc.)       | `JPEG`        |
33 | 
34 | Please ensure to adjust these parameters according to your specific requirements.
35 | 
36 | ### Initializing ETL with AIStore CLI
37 | 
 38 | The following steps demonstrate how to initialize the `Keras Transformer` using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md):
39 | 
 40 | ```bash
 41 | $ cd transformers/keras_preprocess
42 | 
43 | $ # Set values for FORMAT and TRANSFORM
44 | $ export FORMAT="JPEG"
45 | $ export TRANSFORM='{"theta":40, "brightness":0.8, "zx":0.9, "zy":0.9}'
46 | 
47 | $ # Mention communication type b/w target and container
 48 | $ export COMMUNICATION_TYPE='hpull://'
49 | 
 50 | $ # Substitute env variables in spec file
51 | $ envsubst < pod.yaml > init_spec.yaml
52 | 
53 | $ # Initialize ETL
54 | $ ais etl init spec --from-file init_spec.yaml --name <etl-name>
55 | 
56 | $ # Transform and retrieve objects from the bucket using this ETL
57 | $ # For inline transformation
58 | $ ais etl object <etl-name> ais://src/<image-name>.JPEG dst.JPEG
59 | $ # Or, for offline (bucket-to-bucket) transformation
60 | $ ais etl bucket <etl-name> ais://src-bck ais://dst-bck --ext="{JPEG:JPEG}" 
61 | ```


--------------------------------------------------------------------------------
/transformers/keras_preprocess/flask-gunicorn/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:slim
 2 | 
 3 | COPY requirements.txt requirements.txt
 4 | RUN pip3 install --upgrade -r requirements.txt
 5 | 
 6 | COPY app.py app.py
 7 | 
 8 | ENV PYTHONUNBUFFERED 1
 9 | 
10 | ENV FLASK_APP=app.py
11 | 
12 | # Expose Gunicorn port
13 | EXPOSE 80
14 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/flask-gunicorn/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_keras:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_keras:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/flask-gunicorn/app.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # pylint: disable=missing-class-docstring, missing-function-docstring, missing-module-docstring, broad-exception-caught
 6 | 
 7 | import os
 8 | import json
 9 | import logging
10 | import io
11 | 
12 | import urllib
13 | import requests
14 | from flask import Flask, request
15 | from keras.preprocessing.image import (
16 |     ImageDataGenerator,
17 |     load_img,
18 |     array_to_img,
19 |     img_to_array,
20 | )
21 | 
app = Flask(__name__)

# Output image format and request argument type; both can be overridden
# through the environment.
FORMAT = os.environ.get("FORMAT", "JPEG")
ARG_TYPE = os.environ.get("ARG_TYPE", "bytes")

# Base URL that objects are fetched from when handling plain GET requests.
host_target = os.getenv("AIS_TARGET_URL")

logging.info(host_target)

if not host_target:
    raise EnvironmentError("AIS_TARGET_URL environment variable missing")

# JSON string of parameters handed to ImageDataGenerator.apply_transform.
TRANSFORM = os.getenv("TRANSFORM")
if not TRANSFORM:
    raise EnvironmentError(
        "TRANSFORM environment variable missing. Check documentation for examples (link)"
    )
transform_dict = json.loads(TRANSFORM)
41 | 
42 | 
def transform_image(data: bytes) -> bytes:
    """
    Apply the configured Keras transformation to an encoded image.

    Args:
        data: Raw bytes of the encoded input image.

    Returns:
        The transformed image, encoded in ``FORMAT``.

    Raises:
        Exception: Any failure while decoding, transforming or encoding is
            logged and then re-raised to the caller.
    """
    try:
        source = io.BytesIO(data)
        pixels = img_to_array(load_img(source))
        pixels = ImageDataGenerator().apply_transform(
            x=pixels, transform_parameters=transform_dict
        )
        encoded = io.BytesIO()
        array_to_img(pixels).save(encoded, format=FORMAT)
        return encoded.getvalue()
    except Exception as exp:
        logging.error("Error processing data in transform_image: %s", str(exp))
        raise exp
57 | 
58 | 
@app.route("/health")
def health_check():
    """Health endpoint: always replies with a fixed status string."""
    status = "Running"
    return status
62 | 
63 | 
@app.route("/", defaults={"path": ""}, methods=["PUT", "GET"])
@app.route("/<path:path>", methods=["PUT", "GET"])
def image_handler(path: str):
    """
    Transform an image delivered by PUT or fetched on GET.

    * PUT: the request body is the image to transform.
    * GET with ``ARG_TYPE == "url"``: the image is downloaded from the ``url``
      query parameter.
    * GET otherwise: the image is fetched from ``host_target`` using the
      request path as the object path.

    Args:
        path: Request path, used as the object path for plain GET requests.

    Returns:
        ``(transformed_bytes, 200)`` on success, or
        ``("Data processing failed", 500)`` if anything fails.
    """
    # Imported explicitly: a bare `import urllib` does not guarantee that the
    # `urllib.parse` submodule has been loaded.
    from urllib.parse import quote  # pylint: disable=import-outside-toplevel

    try:
        if request.method == "PUT":
            return transform_image(request.data), 200

        # The route only accepts PUT and GET, so this is a GET request.
        if ARG_TYPE == "url":
            # webdataset
            query_path = request.args.get("url")
            if not query_path:
                # Fail with a clear message instead of passing None to requests.
                raise ValueError("missing 'url' query parameter")
            source = requests.get(query_path, timeout=5).content
        else:
            # normal GET - hpull
            object_path = quote(path, safe="@")
            resp = requests.get(f"{host_target}/{object_path}", timeout=5)
            if resp.status_code != 200:
                raise FileNotFoundError(
                    f"Error getting '{path}' from '{host_target}'"
                )
            source = resp.content

        # transform_image either returns bytes or raises, so no None check is needed.
        return transform_image(source), 200
    except Exception as exp:
        logging.error("Error processing request: %s", str(exp))
        return "Data processing failed", 500
97 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/flask-gunicorn/pod.yaml:
--------------------------------------------------------------------------------
 1 | # https://github.com/NVIDIA/ais-etl/blob/main/transformers/keras_transformer/README.md
 2 | apiVersion: v1
 3 | kind: Pod
 4 | metadata:
 5 |   name: transformer-keras
 6 |   annotations:
 7 |     # Values `communication_type` can take are ["hpull://", "hpush://"].
 8 |     # Visit https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms 
 9 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
10 |     wait_timeout: 10m
11 | spec:
12 |   containers:
13 |     - name: server
14 |       image: aistorage/transformer_keras:latest
15 |       imagePullPolicy: Always
16 |       ports:
17 |         - name: default
18 |           containerPort: 80
19 |       command: ["gunicorn", "--bind", "0.0.0.0:80", "--workers", "12", "app:app"]
20 | 
21 |       env:
22 |         - name: FORMAT
23 |         # expected values - PNG, JPEG, etc
24 |           value: ${FORMAT:-"JPEG"}
25 |         - name: TRANSFORM
26 |         # MANDATORY: expected json string parameter-value pairs. 
27 |         # https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator#apply_transform
28 |         # e.g. '{"theta":40, "brightness":0.8, "zx":0.9, "zy":0.9}'
29 |           value:  ${TRANSFORM}
30 |       readinessProbe:
31 |         httpGet:
32 |           path: /health
33 |           port: default
34 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/flask-gunicorn/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | pillow
3 | scipy
4 | keras
5 | tensorflow
6 | Flask
7 | gunicorn


--------------------------------------------------------------------------------
/transformers/keras_preprocess/http-multithreaded-server/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:slim
 2 | 
 3 | COPY requirements.txt requirements.txt
 4 | RUN pip3 install --upgrade -r requirements.txt
 5 | 
 6 | RUN mkdir /code
 7 | WORKDIR /code
 8 | COPY server.py server.py
 9 | 
10 | ENV PYTHONUNBUFFERED 1
11 | 
12 | EXPOSE 80
13 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/http-multithreaded-server/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_keras:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_keras:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/http-multithreaded-server/pod.yaml:
--------------------------------------------------------------------------------
 1 | # https://github.com/NVIDIA/ais-etl/blob/main/transformers/keras_transformer/README.md
 2 | apiVersion: v1
 3 | kind: Pod
 4 | metadata:
 5 |   name: transformer-keras
 6 |   annotations:
 7 |     # Values `communication_type` can take are ["hpull://", "hpush://"].
 8 |     # Visit https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms 
 9 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
10 |     wait_timeout: 5m
11 | spec:
12 |   containers:
13 |     - name: server
14 |       image: aistorage/transformer_keras:latest
15 |       imagePullPolicy: Always
16 |       ports:
17 |         - name: default
18 |           containerPort: 80
19 |       command: ['/code/server.py', '--listen', '0.0.0.0', '--port', '80']
20 |       env:
21 |         - name: FORMAT
22 |         # expected values - PNG, JPEG, etc
23 |           value: ${FORMAT:-"JPEG"}
24 |         - name: TRANSFORM
25 |         # MANDATORY: expected json string parameter-value pairs. 
26 |         # https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator#apply_transform
27 |         # e.g. '{"theta":40, "brightness":0.8, "zx":0.9, "zy":0.9}'
28 |           value:  ${TRANSFORM}
29 |       readinessProbe:
30 |         httpGet:
31 |           path: /health
32 |           port: default
33 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/http-multithreaded-server/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | pillow
3 | scipy
4 | keras
5 | tensorflow


--------------------------------------------------------------------------------
/transformers/keras_preprocess/pod.yaml:
--------------------------------------------------------------------------------
 1 | # https://github.com/NVIDIA/ais-etl/blob/main/transformers/keras_transformer/README.md
 2 | apiVersion: v1
 3 | kind: Pod
 4 | metadata:
 5 |   name: transformer-keras
 6 |   annotations:
 7 |     # Values `communication_type` can take are ["hpull://", "hpush://"].
 8 |     # Visit https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms 
 9 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
10 |     wait_timeout: 10m
11 | spec:
12 |   containers:
13 |     - name: server
14 |       image: aistorage/transformer_keras:latest
15 |       imagePullPolicy: Always
16 |       ports:
17 |         - name: default
18 |           containerPort: 8000
19 |       # change worker nodes to x2 of number of cores (cpu) available
20 |       command:  ["gunicorn", "main:app", "--workers", "12", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000"]
21 |       env:
22 |         - name: FORMAT
23 |         # expected values - PNG, JPEG, etc
24 |           value: ${FORMAT:-"JPEG"}
25 |         - name: TRANSFORM
26 |         # MANDATORY: expected json string parameter-value pairs. 
27 |         # https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator#apply_transform
28 |         # e.g. '{"theta":40, "brightness":0.8, "zx":0.9, "zy":0.9}'
29 |           value:  ${TRANSFORM}
30 |         - name: ARG_TYPE
31 |           value: ${ARG_TYPE:-""}
32 |       readinessProbe:
33 |         httpGet:
34 |           path: /health
35 |           port: default
36 |       volumeMounts:
37 |         - name: ais
38 |           mountPath: /tmp/
39 |   volumes:
40 |     - name: ais
41 |       hostPath:
42 |         path: /tmp/
43 |         type: Directory
44 | 


--------------------------------------------------------------------------------
/transformers/keras_preprocess/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi>=0.109.1
2 | uvicorn==0.24.0.post1
3 | gunicorn==23.0.0
4 | aiohttp>=3.9.2
5 | pillow==10.3.0
6 | scipy==1.10.1
7 | tensorflow==2.18.0


--------------------------------------------------------------------------------
/transformers/md5/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.13-alpine
 2 | 
 3 | RUN pip3 install --upgrade aistore[etl]>=1.13.6
 4 | 
 5 | # Set working directory
 6 | RUN mkdir /code
 7 | WORKDIR /code
 8 | 
 9 | # Copy app code
10 | COPY flask_server.py fastapi_server.py http_server.py ./
11 | 
12 | # Environment setup
13 | ENV PYTHONUNBUFFERED=1
14 | 
15 | # Expose the default port
16 | EXPOSE 8000
17 | 


--------------------------------------------------------------------------------
/transformers/md5/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build --no-cache -t $(REGISTRY_URL)/transformer_md5:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_md5:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/md5/fastapi_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | MD5 Hashing ETL Transformer (Fast-API)
 3 | 
 4 | This module implements an ETL transformer as a FastAPI-based server
 5 | that computes the MD5 checksum of each incoming request's payload
 6 | and returns the hexadecimal digest in the response body.
 7 | 
 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 9 | """
10 | 
11 | import hashlib
12 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer
13 | 
14 | 
class Md5Server(FastAPIServer):
    """
    MD5 hashing transformer built on ``FastAPIServer``.
    """

    def transform(self, data: bytes, *_args) -> bytes:
        """
        Hash the payload with MD5.

        Args:
            data: Request payload to hash.
            *_args: Extra positional arguments; ignored.

        Returns:
            The hexadecimal MD5 digest of ``data``, encoded as bytes.
        """
        digest = hashlib.md5(data)
        hex_text = digest.hexdigest()
        return hex_text.encode()


# Build the server on port 8000, enable debug logging, and publish its
# FastAPI application object at module level.
fastapi_server = Md5Server(port=8000)
fastapi_server.logger.setLevel("DEBUG")
fastapi_app = fastapi_server.app
33 | 


--------------------------------------------------------------------------------
/transformers/md5/flask_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | MD5 Hashing ETL Transformer (Flask)
 3 | 
 4 | This module implements an ETL transformer as a Flask-based HTTP server
 5 | that computes the MD5 checksum of each incoming request's payload
 6 | and returns the hexadecimal digest in the response body.
 7 | 
 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 9 | """
10 | 
11 | import hashlib
12 | from aistore.sdk.etl.webserver.flask_server import FlaskServer
13 | 
14 | 
class Md5Server(FlaskServer):
    """
    MD5 hashing transformer built on ``FlaskServer``.
    """

    def transform(self, data: bytes, *_args) -> bytes:
        """
        Hash the payload with MD5.

        Args:
            data: Request payload to hash.
            *_args: Extra positional arguments; ignored.

        Returns:
            The hexadecimal MD5 digest of ``data``, encoded as bytes.
        """
        digest = hashlib.md5(data)
        hex_text = digest.hexdigest()
        return hex_text.encode()


# Build the server on port 8000, enable debug logging, and publish its
# Flask application object at module level.
flask_server = Md5Server(port=8000)
flask_server.logger.setLevel("DEBUG")
flask_app = flask_server.app
32 | 


--------------------------------------------------------------------------------
/transformers/md5/http_server.py:
--------------------------------------------------------------------------------
 1 | """
 2 | MD5 Hashing ETL Transformer
 3 | 
 4 | This module implements an ETL transformer as a multi-threaded HTTP server
 5 | that computes the MD5 checksum of each incoming request's payload
 6 | and returns the hexadecimal digest in the response body.
 7 | 
 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 9 | """
10 | 
11 | import hashlib
12 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer
13 | 
14 | 
class Md5Server(HTTPMultiThreadedServer):
    """
    MD5 hashing transformer built on ``HTTPMultiThreadedServer``.
    """

    def transform(self, data: bytes, *_args) -> bytes:
        """
        Hash the payload with MD5.

        Args:
            data: Request payload to hash.
            *_args: Extra positional arguments; ignored.

        Returns:
            The hexadecimal MD5 digest of ``data``, encoded as bytes.
        """
        digest = hashlib.md5(data)
        hex_text = digest.hexdigest()
        return hex_text.encode()


if __name__ == "__main__":
    # Run only when executed as a script: build the server with its default
    # settings, enable debug logging, and start it.
    server = Md5Server()
    server.logger.setLevel("DEBUG")
    server.start()
34 | 


--------------------------------------------------------------------------------
/transformers/md5/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-md5
 5 |   annotations:
 6 |     # Values it can take ["hpull://", "hpush://"]
 7 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpush://\""}
 8 |     wait_timeout: 5m
 9 |     support_direct_put: "true"
10 | spec:
11 |   containers:
12 |     - name: server
13 |       image: aistorage/transformer_md5:latest
14 |       imagePullPolicy: Always
15 |       ports:
16 |         - name: default
17 |           containerPort: 8000
18 |       # for flask based app
19 |       # command: ["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"]
20 |       # for http based app
21 |       # command: ["python", "http_server.py"]
22 |       # for fastapi based app
23 |       command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"]
24 |       readinessProbe:
25 |         httpGet:
26 |           path: /health
27 |           port: default
28 |       # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 
29 |       # where the objects are stored on AIStore targets. This allows the ETL container 
30 |       # to access the files directly by absolute path.
31 |   #     volumeMounts:
32 |   #       - name: ais
33 |   #         mountPath: /mnt/data/ais
34 |   # volumes:
35 |   #   - name: ais
36 |   #     hostPath:
37 |   #       path: /mnt/data/ais
38 |   #       type: Directory
39 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/.dockerignore:
--------------------------------------------------------------------------------
1 | src/tar2tf_test.go
2 | src/tar-single.tar
3 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/Dockerfile:
--------------------------------------------------------------------------------
# Image for the tar2tf transformer: copies the Go sources and builds the binary.
FROM docker.io/library/golang:1.21-alpine

# Install git without keeping the apk index cache in the layer.
RUN apk add --no-cache git

# Build inside $GOPATH/tar2tf; the resulting binary is ./tar2tf in that directory.
RUN mkdir $GOPATH/tar2tf
WORKDIR $GOPATH/tar2tf
COPY src/ ./
RUN go build -o tar2tf

# Port the container is expected to listen on.
EXPOSE 80
11 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/Makefile:
--------------------------------------------------------------------------------
 1 | # Default image tag is 'latest'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | 
 7 | REGISTRY_URL ?= docker.io/aistorage
 8 | 
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_tar2tf:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_tar2tf:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/README.md:
--------------------------------------------------------------------------------
  1 | # Tar2Tf transformer
  2 | 
  3 | Tar2Tf transforms TAR and TAR.GZ files to TFRecord format.
  4 | Additionally, it accepts optional parameters to apply conversions to TAR records and to select a subset of keys from a single TAR record.
  5 | 
  6 | ## Usage
  7 | 
  8 | ### Build
  9 | 
 10 | ```console
 11 | $ cd src && go build
 12 | ```
 13 | 
 14 | ### Run
 15 | 
 16 | #### Run without any conversions and selections, on localhost:80
 17 | ```console
 18 | $ ./tar2tf -l localhost -p 80
 19 | ```
 20 | 
 21 | #### Conversions and selections
 22 | 
 23 | Currently there are 4 available conversions to apply to TAR Record.
 24 | 
 25 | To specify conversions and selections, use `--spec` or `--spec-file` argument to `./tar2tf` command.
 26 | 
 27 | `--spec` argument accepts conversions and selections specification in form of a string.
 28 | `--spec-file` argument accepts conversions and selections in form of path to a file containing specification.
 29 | 
 30 | ##### Specification format
 31 | 
 32 | ```json
 33 | {
 34 |   "conversions": [
 35 |     conversionSpec1,
 36 |     conversionSpec2,
 37 |     ...
 38 |   ],
 39 |   "selections": [
 40 |     selectionSpec1,
 41 |     selectionSpec2,
 42 |     ...
 43 |   ]
 44 | }
 45 | ```
 46 | 
 47 | Conversions are applied in the order of occurrence in specification.
 48 | If there aren't any selections provided, all keys from TAR records, and relevant values, will be used.
 49 | 
 50 | ##### Decode Conversion
 51 | 
 52 | Decodes a PNG or JPEG image into an object, allowing further image transformations to be applied.
 53 | 
 54 | ```json
 55 | {
 56 |   "type": "Decode",
 57 |   "ext_name": "png"
 58 | }
 59 | ```
 60 | 
 61 | ##### Rotate Conversion
 62 | 
 63 | Rotates an image clockwise, according to the specified angle. If `angle == 0`, then a random rotation is applied.
 64 | 
 65 | ```json
 66 | {
 67 |   "type": "Rotate",
 68 |   "ext_name": "png",
 69 |   "angle": 90
 70 | }
 71 | ```
 72 | 
 73 | ##### Resize Conversion
 74 | 
 75 | Resizes an image according to the specified destination size.
 76 | 
 77 | ```json
 78 | {
 79 |   "type": "Resize",
 80 |   "ext_name": "png",
 81 |   "sizes": [28, 28]
 82 | }
 83 | ```
 84 | 
 85 | ##### Rename Conversion
 86 | 
 87 | Rename multiple keys into the specified key.
 88 | 
 89 | ```json
 90 | {
 91 |   "type": "Rename",
 92 |   "renames": {
 93 |     "img": ["png", "jpeg"],
 94 |     "video": ["mp4", "avi"]
 95 |   }
 96 | }
 97 | ```
 98 | 
 99 | > The conversion above renames "png" and "jpeg" to "img", and renames "mp4" and "avi" to "video".
100 | 
101 | ##### Selection
102 | 
103 | Select a single key from a TAR record.
104 | 
105 | ```json
106 | {
107 |   "ext_name": "png"
108 | }
109 | 
110 | ```
111 | 
112 | #### Run with Decode and Rotate conversions
113 | 
114 | ```console
115 | $ echo >spec.json '
116 | {
117 |     "conversions": [
118 |      {
119 |        "type": "Decode",
120 |        "ext_name": "png"
121 |      },
122 |      {
123 |        "type": "Rotate",
124 |        "ext_name": "png"
125 |      }
126 |     ],
127 |     "selections": [
128 |      {
129 |        "ext_name": "png"
130 |      },
131 |      {
132 |        "ext_name": "cls"
133 |      }
134 |     ]
135 | }
136 | '
137 | 
138 | $ ./tar2tf -l "0.0.0.0" -p 80 -spec-file spec.json
139 | ```
140 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: tar2tf
 5 |   annotations:
 6 |     # Values it can take ["hpull://","hpush://"]
 7 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
 8 |     wait_timeout: 5m
 9 | spec:
10 |   containers:
11 |     - name: server
12 |       image: aistorage/transformer_tar2tf:latest
13 |       imagePullPolicy: IfNotPresent
14 |       ports:
15 |         - name: default
16 |           containerPort: 80
17 |       # To enable conversion e.g.
18 |       command: ['./tar2tf', '-l', '0.0.0.0', '-p', '80', '${OPTION_KEY}', '${OPTION_VALUE}']
19 |       readinessProbe:
20 |         httpGet:
21 |           path: /health
22 |           port: default
23 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/src/cmn/assert.go:
--------------------------------------------------------------------------------
 1 | // Package cmn common low-level types and utilities
 2 | /*
 3 |  * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 4 |  */
 5 | package cmn
 6 | 
 7 | import (
 8 | 	"log"
 9 | )
10 | 
// Assert panics with msg when cond is false and does nothing otherwise.
func Assert(cond bool, msg string) {
	if cond {
		return
	}
	panic(msg)
}
16 | 
17 | func AssertNoErr(err error) {
18 | 	if err != nil {
19 | 		Assert(false, err.Error())
20 | 	}
21 | }
22 | 
// Exit is a no-op for a nil error; otherwise it reports the error text via
// log.Fatal.
func Exit(err error) {
	if err == nil {
		return
	}
	log.Fatal(err.Error())
}
28 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/src/cmn/io.go:
--------------------------------------------------------------------------------
 1 | // Package cmn common low-level types and utilities
 2 | /*
 3 |  * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 4 |  */
 5 | package cmn
 6 | 
 7 | import (
 8 | 	"bytes"
 9 | 	"io"
10 | 	"io/ioutil"
11 | 	"sync/atomic"
12 | )
13 | 
14 | type (
15 | 	OnCloseReader struct {
16 | 		R  io.Reader
17 | 		Cb func()
18 | 	}
19 | 
20 | 	WriteCounter struct {
21 | 		totalBytesWritten int64
22 | 	}
23 | 
24 | 	// ByteHandle is a byte buffer(made from []byte) that implements
25 | 	// ReadOpenCloser interface
26 | 	ByteHandle struct {
27 | 		b []byte
28 | 		*bytes.Reader
29 | 	}
30 | )
31 | 
32 | func (r *OnCloseReader) Read(p []byte) (int, error) {
33 | 	return r.R.Read(p)
34 | }
35 | 
36 | func (r *OnCloseReader) Close() {
37 | 	r.Cb()
38 | }
39 | 
40 | func (r *WriteCounter) Write(p []byte) (int, error) {
41 | 	atomic.AddInt64(&r.totalBytesWritten, int64(len(p)))
42 | 	return len(p), nil
43 | }
44 | 
45 | func (r *WriteCounter) Size() int64 {
46 | 	return atomic.LoadInt64(&r.totalBytesWritten)
47 | }
48 | 
49 | func CopySection(r io.Reader, w io.Writer, start, length int64) (n int64, err error) {
50 | 	// Discard first start bytes.
51 | 	n, err = io.CopyN(ioutil.Discard, r, start)
52 | 	if err != nil {
53 | 		return 0, err
54 | 	}
55 | 
56 | 	// Write only length bytes.
57 | 	return io.CopyN(w, r, length)
58 | }
59 | 
60 | func NewByteHandle(bt []byte) *ByteHandle {
61 | 	return &ByteHandle{bt, bytes.NewReader(bt)}
62 | }
63 | 
64 | func (b *ByteHandle) Close() error {
65 | 	return nil
66 | }
67 | func (b *ByteHandle) Open() (io.ReadCloser, error) {
68 | 	return ioutil.NopCloser(bytes.NewReader(b.b)), nil
69 | }
70 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/src/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/NVIDIA/ais-etl/transformers/tar2tf/src
 2 | 
 3 | go 1.21
 4 | 
 5 | require (
 6 | 	github.com/NVIDIA/go-tfdata v0.3.2-0.20200714114828-1432f6c70e3a
 7 | 	github.com/disintegration/imaging v1.6.2
 8 | 	github.com/json-iterator/go v1.1.12
 9 | )
10 | 
11 | require (
12 | 	github.com/golang/protobuf v1.5.3 // indirect
13 | 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
14 | 	github.com/modern-go/reflect2 v1.0.2 // indirect
15 | 	golang.org/x/image v0.24.0 // indirect
16 | 	google.golang.org/protobuf v1.33.0 // indirect
17 | )
18 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/src/tar-single.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tar2tf/src/tar-single.tar


--------------------------------------------------------------------------------
/transformers/tar2tf/src/tar2tf_test.go:
--------------------------------------------------------------------------------
 1 | // Package main is an entry point to Tar2Tf transformation
 2 | /*
 3 |  * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 4 |  */
 5 | package main
 6 | 
 7 | import (
 8 | 	"bytes"
 9 | 	"net/http"
10 | 	"net/url"
11 | 	"os"
12 | 	"testing"
13 | 
14 | 	"github.com/NVIDIA/go-tfdata/tfdata/core"
15 | )
16 | 
17 | const tarPath = "tar-single.tar"
18 | 
19 | func mockRequest(t *testing.T) (r *http.Request) {
20 | 	var err error
21 | 
22 | 	r = &http.Request{}
23 | 	r.Body, err = os.Open(tarPath)
24 | 	r.URL = &url.URL{}
25 | 	if err != nil {
26 | 		t.Fatal(err.Error())
27 | 	}
28 | 	return r
29 | }
30 | 
31 | func TestTar2TfSimple(t *testing.T) {
32 | 	initVars("localhost", 8080, nil)
33 | 
34 | 	var (
35 | 		req  = mockRequest(t)
36 | 		buff = bytes.NewBuffer(nil)
37 | 	)
38 | 
39 | 	err := onTheFlyTransformWholeObject(req, buff)
40 | 	if err != nil {
41 | 		t.Fatal(err.Error())
42 | 	}
43 | 
44 | 	r := core.NewTFRecordReader(buff)
45 | 	examples, err := r.ReadAllExamples(1)
46 | 	if err != nil {
47 | 		t.Fatal(err.Error())
48 | 	}
49 | 	if len(examples) != 1 {
50 | 		t.Fatalf("expected 1 example, got %d", len(examples))
51 | 	}
52 | }
53 | 
54 | func TestTar2TfConvTransform(t *testing.T) {
55 | 	var (
56 | 		req  = mockRequest(t)
57 | 		buff = bytes.NewBuffer(nil)
58 | 
59 | 		filterSpec = []byte(`
60 | 			{
61 | 			  "conversions": [
62 | 				{
63 | 				  "type": "Decode",
64 | 				  "ext_name": "png"
65 | 				},
66 | 				{
67 | 				  "type": "Rotate",
68 | 				  "ext_name": "png"
69 | 				}
70 | 			  ],
71 | 			  "selections": [
72 | 				{
73 | 				  "ext_name": "png"
74 | 				},
75 | 				{
76 | 				  "ext_name": "cls"
77 | 				}
78 | 			  ]
79 | 			}
80 | 		`)
81 | 	)
82 | 
83 | 	initVars("localhost", 8080, filterSpec)
84 | 	err := onTheFlyTransformWholeObject(req, buff)
85 | 	if err != nil {
86 | 		t.Fatal(err.Error())
87 | 	}
88 | 
89 | 	r := core.NewTFRecordReader(buff)
90 | 	examples, err := r.ReadAllExamples(1)
91 | 	if err != nil {
92 | 		t.Fatal(err.Error())
93 | 	}
94 | 	if len(examples) != 1 {
95 | 		t.Fatalf("expected 1 example, got %d", len(examples))
96 | 	}
97 | }
98 | 


--------------------------------------------------------------------------------
/transformers/tar2tf/src/transforms/pipeline.go:
--------------------------------------------------------------------------------
 1 | // Package transforms provides tools to transform TAR to TFRecords files
 2 | /*
 3 |  * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 4 |  */
 5 | package transforms
 6 | 
 7 | import (
 8 | 	"io"
 9 | 
10 | 	"github.com/NVIDIA/go-tfdata/tfdata/core"
11 | 	"github.com/NVIDIA/go-tfdata/tfdata/pipeline"
12 | 	"github.com/NVIDIA/go-tfdata/tfdata/transform"
13 | )
14 | 
15 | func CreatePipeline(r io.Reader, w io.Writer, isTarGz bool, job *TransformJob) *pipeline.DefaultPipeline {
16 | 	if job != nil {
17 | 		return transformPipeline(r, w, isTarGz, job)
18 | 	}
19 | 	return defaultPipeline(r, w, isTarGz)
20 | }
21 | 
22 | func defaultPipeline(r io.Reader, w io.Writer, isTarGz bool) *pipeline.DefaultPipeline {
23 | 	p := pipeline.NewPipeline()
24 | 	if isTarGz {
25 | 		p.FromTarGz(r)
26 | 	} else {
27 | 		p.FromTar(r)
28 | 	}
29 | 	return p.SampleToTFExample().ToTFRecord(w, 8)
30 | }
31 | 
32 | func transformPipeline(r io.Reader, w io.Writer, isTarGz bool, job *TransformJob) *pipeline.DefaultPipeline {
33 | 	p := pipeline.NewPipeline()
34 | 	if isTarGz {
35 | 		p.FromTarGz(r)
36 | 	} else {
37 | 		p.FromTar(r)
38 | 	}
39 | 
40 | 	var transformations []transform.SampleTransformation
41 | 	transformations = append(transformations, job.Conversions...)
42 | 	if len(job.Selections) > 0 { // Select everything by default.
43 | 		transformations = append(transformations, transform.SampleSelections(job.Selections...))
44 | 	}
45 | 	p.TransformSamples(transformations...).WithSample2TFExampleStage(func(sr core.SampleReader) core.TFExampleReader {
46 | 		return &SampleToTFExampleReader{SampleReader: sr}
47 | 	}).ToTFRecord(w)
48 | 	return p
49 | }
50 | 


--------------------------------------------------------------------------------
/transformers/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/__init__.py


--------------------------------------------------------------------------------
/transformers/tests/base.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # pylint: disable=missing-class-docstring, missing-function-docstring, missing-module-docstring
 5 | 
 6 | import os
 7 | import unittest
 8 | from tests.utils import generate_random_string, log_etl
 9 | from aistore.sdk.errors import ErrETLNotFound
10 | from aistore import Client
11 | 
12 | 
class TestBase(unittest.TestCase):
    """
    Shared fixture for ETL tests.

    Each test gets a client for the cluster at ``AIS_ENDPOINT`` (with a
    hard-coded fallback address), a freshly created bucket with a random
    name suffix, and an ``etls`` list; subclasses append the names of ETLs
    they start so that ``tearDown`` can stop and delete them.
    """

    def setUp(self):
        """Create the client, a randomly named test bucket, and the ETL registry."""
        self.endpoint = os.environ.get("AIS_ENDPOINT", "http://192.168.49.2:8080")
        # Kept as the raw environment string ("false" by default), not a bool.
        self.git_test_mode = os.getenv("GIT_TEST", "false")
        self.client = Client(self.endpoint)
        self.test_bck = self.client.bucket(
            "test-bucket" + generate_random_string()
        ).create(exist_ok=True)
        self.etls = []

    def tearDown(self):
        """
        Delete the test bucket, then log, stop and delete every registered ETL.

        ETL cleanup runs in a ``finally`` clause so that a failure while
        deleting the bucket does not leak running ETLs; the bucket error
        still propagates afterwards.
        """
        try:
            self.test_bck.delete()
        finally:
            for etl_name in self.etls:
                try:
                    log_etl(self.client, etl_name)
                    self.client.etl(etl_name).stop()
                    self.client.etl(etl_name).delete()
                except ErrETLNotFound:
                    # ETL might be already deleted
                    pass
33 | 


--------------------------------------------------------------------------------
/transformers/tests/requirements.txt:
--------------------------------------------------------------------------------
 1 | aistore>=1.13.5
 2 | filetype
 3 | keras
 4 | numpy
 5 | pillow
 6 | pyyaml
 7 | requests
 8 | scikit-image
 9 | scipy
10 | keras
11 | pytest
12 | tensorflow
13 | opencv-python
14 | aiofiles
15 | kaggle
16 | typing-extensions>=4.3.0


--------------------------------------------------------------------------------
/transformers/tests/resources/test-audio-flac.flac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-audio-flac.flac


--------------------------------------------------------------------------------
/transformers/tests/resources/test-audio-mp3.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-audio-mp3.mp3


--------------------------------------------------------------------------------
/transformers/tests/resources/test-audio-wav.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-audio-wav.wav


--------------------------------------------------------------------------------
/transformers/tests/resources/test-face-detection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-face-detection.png


--------------------------------------------------------------------------------
/transformers/tests/resources/test-image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-image.jpg


--------------------------------------------------------------------------------
/transformers/tests/resources/test-image.jpg.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-image.jpg.bz2


--------------------------------------------------------------------------------
/transformers/tests/resources/test-image.jpg.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-image.jpg.gz


--------------------------------------------------------------------------------
/transformers/tests/resources/test-manifest.jsonl:
--------------------------------------------------------------------------------
1 | {"id":"test-audio-wav", "from_time":0, "to_time": 1, "part": 0}
2 | {"id":"test-audio-wav", "from_time":1, "to_time": 2, "part": 1}
3 | {"id":"test-audio-wav", "from_time":2, "to_time": 3, "part": 2}


--------------------------------------------------------------------------------
/transformers/tests/resources/test-tar-single.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-tar-single.tar


--------------------------------------------------------------------------------
/transformers/tests/resources/test-text.txt:
--------------------------------------------------------------------------------
 1 | Quod equidem non reprehendo;
 2 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quibus natura iure responderit non esse verum aliunde finem beate vivendi, a se principia rei gerendae peti; Quae enim adhuc protulisti, popularia sunt, ego autem a te elegantiora desidero. Duo Reges: constructio interrete. Tum Lucius: Mihi vero ista valde probata sunt, quod item fratri puto. Bestiarum vero nullum iudicium puto. Nihil enim iam habes, quod ad corpus referas; Deinde prima illa, quae in congressu solemus: Quid tu, inquit, huc? Et homini, qui ceteris animantibus plurimum praestat, praecipue a natura nihil datum esse dicemus?
 3 | 
 4 | Iam id ipsum absurdum, maximum malum neglegi. Quod ea non occurrentia fingunt, vincunt Aristonem; Atqui perspicuum est hominem e corpore animoque constare, cum primae sint animi partes, secundae corporis. Fieri, inquam, Triari, nullo pacto potest, ut non dicas, quid non probes eius, a quo dissentias. Equidem e Cn. An dubium est, quin virtus ita maximam partem optineat in rebus humanis, ut reliquas obruat?
 5 | 
 6 | Quis istum dolorem timet?
 7 | Summus dolor plures dies manere non potest? Dicet pro me ipsa virtus nec dubitabit isti vestro beato M. Tubulum fuisse, qua illum, cuius is condemnatus est rogatione, P. Quod si ita sit, cur opera philosophiae sit danda nescio.
 8 | 
 9 | Ex eorum enim scriptis et institutis cum omnis doctrina liberalis, omnis historia.
10 | Quod si ita est, sequitur id ipsum, quod te velle video, omnes semper beatos esse sapientes. Cum enim fertur quasi torrens oratio, quamvis multa cuiusque modi rapiat, nihil tamen teneas, nihil apprehendas, nusquam orationem rapidam coerceas. Ita redarguitur ipse a sese, convincunturque scripta eius probitate ipsius ac moribus. At quanta conantur! Mundum hunc omnem oppidum esse nostrum! Incendi igitur eos, qui audiunt, vides. Vide, ne magis, inquam, tuum fuerit, cum re idem tibi, quod mihi, videretur, non nova te rebus nomina inponere. Qui-vere falsone, quaerere mittimus-dicitur oculis se privasse; Si ista mala sunt, in quae potest incidere sapiens, sapientem esse non esse ad beate vivendum satis. At vero si ad vitem sensus accesserit, ut appetitum quendam habeat et per se ipsa moveatur, quid facturam putas?
11 | 
12 | Quem si tenueris, non modo meum Ciceronem, sed etiam me ipsum abducas licebit.
13 | Stulti autem malorum memoria torquentur, sapientes bona praeterita grata recordatione renovata delectant.
14 | Esse enim quam vellet iniquus iustus poterat inpune.
15 | Quae autem natura suae primae institutionis oblita est?
16 | Verum tamen cum de rebus grandioribus dicas, ipsae res verba rapiunt;
17 | Hoc est non modo cor non habere, sed ne palatum quidem.
18 | Voluptatem cum summum bonum diceret, primum in eo ipso parum vidit, deinde hoc quoque alienum; Sed tu istuc dixti bene Latine, parum plane. Nam haec ipsa mihi erunt in promptu, quae modo audivi, nec ante aggrediar, quam te ab istis, quos dicis, instructum videro. Fatebuntur Stoici haec omnia dicta esse praeclare, neque eam causam Zenoni desciscendi fuisse. Non autem hoc: igitur ne illud quidem. Ratio quidem vestra sic cogit. Cum audissem Antiochum, Brute, ut solebam, cum M. An quod ita callida est, ut optime possit architectari voluptates?
19 | 
20 | Idemne, quod iucunde?
21 | Haec mihi videtur delicatior, ut ita dicam, molliorque ratio, quam virtutis vis gravitasque postulat. Sed quoniam et advesperascit et mihi ad villam revertendum est, nunc quidem hactenus; Cuius ad naturam apta ratio vera illa et summa lex a philosophis dicitur. Neque solum ea communia, verum etiam paria esse dixerunt. Sed nunc, quod agimus; A mene tu?


--------------------------------------------------------------------------------
/transformers/tests/resources/test-text.txt.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-text.txt.bz2


--------------------------------------------------------------------------------
/transformers/tests/resources/test-text.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-text.txt.gz


--------------------------------------------------------------------------------
/transformers/tests/test_audio_split.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Pytest suite for the Audio Splitter ETL Transformer.
 3 | 
 4 | For each combination of communication mode and FQN-flag, this test:
 5 |   1. Uploads sample audio files into a fresh bucket.
 6 |   2. Initializes the Audio Splitter ETL with fixed from/to times.
 7 |   3. Fetches each transformed segment and compares it
 8 |      against a locally-trimmed version for bitwise equality.
 9 | """
10 | 
11 | import logging
12 | from io import BytesIO
13 | from itertools import product
14 | from pathlib import Path
15 | from typing import Dict
16 | 
17 | import pytest
18 | import soundfile as sf
19 | from aistore.sdk import Bucket
20 | from aistore.sdk.etl import ETLConfig
21 | 
22 | from tests.const import AUDIO_SPLITTER_TEMPLATE, COMM_TYPES, FQN_OPTIONS
23 | 
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Configure root logging at import time: INFO level with timestamped records.
# basicConfig does nothing if the root logger already has handlers configured.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
26 | 
27 | 
def trim_audio_bytes(buf: bytes, audio_format: str, start: float, end: float) -> bytes:
    """
    Cut the window between `start` and `end` seconds out of the audio in `buf`.

    The input is decoded with soundfile, the frames from `int(start * rate)`
    up to `int(end * rate)` are read, and they are re-encoded with the source
    sample rate and channel count in `audio_format`. Returns the encoded bytes.
    """
    with sf.SoundFile(BytesIO(buf), mode="r") as reader:
        rate = reader.samplerate
        channels = reader.channels
        first_frame = int(start * rate)
        last_frame = int(end * rate)
        reader.seek(first_frame)
        samples = reader.read(last_frame - first_frame)

    sink = BytesIO()
    writer_options = {
        "mode": "w",
        "samplerate": rate,
        "channels": channels,
        "format": audio_format,
    }
    with sf.SoundFile(sink, **writer_options) as writer:
        writer.write(samples)
    return sink.getvalue()
46 | 
47 | 
@pytest.mark.parametrize("comm_type,use_fqn", product(COMM_TYPES, FQN_OPTIONS))
def test_audio_splitter_transform(
    test_bck: Bucket,
    local_audio_files: Dict[str, Path],
    etl_factory,
    comm_type: str,
    use_fqn: bool,
) -> None:
    """
    End-to-end check of the Audio Splitter ETL on a single WAV file.

    The file is uploaded, the ETL is initialized for the given communication
    type / FQN flag, the object is read through the ETL with a fixed
    from/to window, and the result must equal a locally trimmed copy.

    Args:
        test_bck:          bucket fixture the object is uploaded to
        local_audio_files: map of filename -> local Path of the inputs
        etl_factory:       fixture that initializes an ETL and returns its name
        comm_type:         one of COMM_TYPES
        use_fqn:           one of FQN_OPTIONS, forwarded to the factory
    """
    wav_name = "test-audio-wav.wav"
    wav_path = local_audio_files[wav_name]

    # Upload the source object.
    test_bck.object(wav_name).get_writer().put_file(wav_path)

    # Fixed trim window, passed to the ETL as two-decimal strings.
    window_start, window_end = 1.0, 2.0
    etl_args = {
        "from_time": f"{window_start:.2f}",
        "to_time": f"{window_end:.2f}",
    }

    etl_name = etl_factory(
        tag="audio-splitter",
        server_type="fastapi",
        template=AUDIO_SPLITTER_TEMPLATE,
        communication_type=comm_type,
        use_fqn=use_fqn,
        direct_put=True,
    )
    logger.info("Initialized ETL %s (comm=%s, fqn=%s)", etl_name, comm_type, use_fqn)

    # Read through the ETL and compare with the local reference trim.
    etl_config = ETLConfig(etl_name, args=etl_args)
    transformed = test_bck.object(wav_name).get_reader(etl=etl_config).read_all()
    source_bytes = Path(wav_path).read_bytes()
    expected = trim_audio_bytes(source_bytes, "wav", window_start, window_end)

    assert transformed == expected, f"{wav_name}: payload mismatch (ETL={etl_name})"
91 | 


--------------------------------------------------------------------------------
/transformers/tests/test_batch_rename.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
  3 | """
  4 | 
  5 | import logging
  6 | import re
  7 | from pathlib import Path
  8 | from typing import Dict
  9 | from itertools import product
 10 | 
 11 | import pytest
 12 | from aistore.sdk import Bucket
 13 | from aistore.sdk.etl import ETLConfig
 14 | 
 15 | from tests.const import (
 16 |     BATCH_RENAME_TEMPLATE,
 17 |     COMM_TYPES,
 18 |     FQN_OPTIONS,
 19 | )
 20 | 
# Configure module-level logger
# (logger is named after this module; basicConfig sets INFO level and a
# timestamped format on the root logger, and does nothing if the root logger
# already has handlers configured).
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
 26 | 
 27 | 
 28 | def _verify_renamed_files(
 29 |     bucket: Bucket,
 30 |     local_files: Dict[str, Path],
 31 |     etl_name: str,
 32 |     pattern: str,
 33 |     prefix: str,
 34 | ) -> None:
 35 |     """
 36 |     Verifies the output of the ETL transformer:
 37 |     - Ensures transformed objects match original content.
 38 |     - If a filename matches the pattern, it should also appear under a new prefixed name.
 39 |     """
 40 |     for filename, path in local_files.items():
 41 |         original_data = Path(path).read_bytes()
 42 |         output_data = (
 43 |             bucket.object(filename).get_reader(etl=ETLConfig(etl_name)).read_all()
 44 |         )
 45 |         assert (
 46 |             output_data == original_data
 47 |         ), f"{filename} was not echoed correctly by ETL '{etl_name}'"
 48 | 
 49 |         if re.match(pattern, filename):
 50 |             renamed_path = f"{prefix}{filename}"
 51 |             renamed_data = bucket.object(renamed_path).get_reader().read_all()
 52 |             assert (
 53 |                 renamed_data == original_data
 54 |             ), f"{filename} was not renamed correctly to {renamed_path}"
 55 | 
 56 | 
 57 | @pytest.mark.parametrize("comm_type, use_fqn", product(COMM_TYPES, FQN_OPTIONS))
 58 | def test_batch_rename_transformer(
 59 |     test_bck: Bucket,
 60 |     local_audio_files: Dict[str, Path],
 61 |     etl_factory,
 62 |     endpoint: str,
 63 |     comm_type: str,
 64 |     use_fqn: bool,
 65 | ) -> None:
 66 |     """
 67 |     Integration test for the Batch Rename ETL transformer.
 68 |     Uploads audio files to a bucket, initializes the transformer,
 69 |     and verifies renaming behavior using ETL output.
 70 |     """
 71 |     pattern = r".*\.flac$"
 72 |     prefix = "renamed_"
 73 | 
 74 |     # Upload input files to the test bucket
 75 |     for fname, fpath in local_audio_files.items():
 76 |         test_bck.object(fname).get_writer().put_file(str(fpath))
 77 | 
 78 |     # Build transformer spec
 79 |     transformer_spec = BATCH_RENAME_TEMPLATE.format(
 80 |         communication_type="{communication_type}",
 81 |         direct_put="{direct_put}",
 82 |         command="{command}",
 83 |         ais_endpoint=endpoint,
 84 |         bck_name=test_bck.name,
 85 |         regex_pattern=pattern,
 86 |         dst_prefix=prefix,
 87 |     )
 88 | 
 89 |     # Initialize transformer
 90 |     etl_name = etl_factory(
 91 |         tag="batch-rename",
 92 |         server_type="fastapi",
 93 |         template=transformer_spec,
 94 |         communication_type=comm_type,
 95 |         use_fqn=use_fqn,
 96 |         direct_put="true",
 97 |     )
 98 |     logger.info(
 99 |         "Initialized ETL '%s' (server=fastapi, comm=%s, fqn=%s)",
100 |         etl_name,
101 |         comm_type,
102 |         use_fqn,
103 |     )
104 | 
105 |     # Validate output
106 |     _verify_renamed_files(test_bck, local_audio_files, etl_name, pattern, prefix)
107 | 


--------------------------------------------------------------------------------
/transformers/tests/test_face_detection_stress.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Stress testing Face Detection Transformer for 1 Million objects across all communication types.
  3 | 
  4 | Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
  5 | """
  6 | 
  7 | import logging
  8 | from datetime import datetime
  9 | 
 10 | from aistore.sdk.etl.etl_const import ETL_COMM_HPULL, ETL_COMM_HPUSH
 11 | from aistore.sdk.etl.etl_templates import FACE_DETECTION_TRANSFORMER
 12 | 
 13 | from tests.base import TestBase
 14 | from tests.utils import (
 15 |     format_image_tag_for_git_test_mode,
 16 |     cases,
 17 |     generate_random_string,
 18 | )
 19 | 
# Configure logging
# basicConfig sets INFO level and a timestamped format on the root logger at
# import time; it does nothing if the root logger already has handlers.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
 26 | 
 27 | 
 28 | class TestFaceDetectionStress(TestBase):
 29 |     """Stress test for AIStore ETL Face Detection transformation on a large dataset."""
 30 | 
 31 |     def setUp(self):
 32 |         """Sets up the test environment by defining the source bucket for face detection."""
 33 |         super().setUp()
 34 |         self.images_bck = self.client.bucket(bck_name="stress-test-face-detection")
 35 | 
 36 |     @cases(
 37 |         (ETL_COMM_HPUSH, "hpush_fastapi", ""),
 38 |         (ETL_COMM_HPULL, "hpull_fastapi", ""),
 39 |         "",
 40 |         (ETL_COMM_HPULL, "hpull_fastapi_fqn", "fqn"),
 41 |         (ETL_COMM_HPUSH, "hpush_fastapi_fqn", "fqn"),
 42 |     )
 43 |     def test_face_detection(self, test_case):
 44 |         comm_type, test_suffix, arg_type = test_case
 45 |         """Stress test face detection ETL transformation using various communication types."""
 46 |         test_name = f"test_face_detection_{test_suffix}"
 47 |         etl_name = f"face-detect-{generate_random_string(5)}-{test_suffix}"
 48 |         self.etls.append(etl_name)
 49 | 
 50 |         self.initialize_etl(comm_type, etl_name, arg_type)
 51 |         self.execute_etl_job(test_name, etl_name)
 52 | 
 53 |     def initialize_etl(self, comm_type: str, etl_name: str, arg_type: str):
 54 |         """Initializes the ETL transformation with the specified parameters."""
 55 |         template = FACE_DETECTION_TRANSFORMER.format(
 56 |             communication_type=comm_type, format="jpg", arg_type=arg_type
 57 |         )
 58 | 
 59 |         # Adjust template for Git test mode
 60 |         template = format_image_tag_for_git_test_mode(template, "face_detection")
 61 | 
 62 |         # Initialize ETL transformation
 63 |         self.client.etl(etl_name).init_spec(
 64 |             template=template, communication_type=comm_type, arg_type=arg_type
 65 |         )
 66 | 
 67 |         logger.info(
 68 |             "Initialized ETL: %s\n%s", etl_name, self.client.etl(etl_name).view()
 69 |         )
 70 | 
 71 |     def execute_etl_job(self, test_name: str, etl_name: str):
 72 |         """Executes the ETL transformation job and validates results."""
 73 |         start_time = datetime.now()
 74 | 
 75 |         # Start the transformation job
 76 |         job_id = self.images_bck.transform(
 77 |             etl_name=etl_name, timeout="5m", to_bck=self.test_bck
 78 |         )
 79 | 
 80 |         # Wait for job completion
 81 |         self.client.job(job_id).wait(timeout=600, verbose=False)
 82 | 
 83 |         # Calculate time taken
 84 |         time_elapsed = datetime.now() - start_time
 85 | 
 86 |         # Verify job status
 87 |         job_status = self.client.job(job_id).status()
 88 |         self.assertEqual(
 89 |             job_status.err, "", f"ETL Job {job_id} failed with error: {job_status.err}"
 90 |         )
 91 | 
 92 |         # Ensure object count matches between source and destination
 93 |         src_objects = len(self.images_bck.list_all_objects())
 94 |         dest_objects = len(self.test_bck.list_all_objects())
 95 |         self.assertEqual(
 96 |             src_objects,
 97 |             dest_objects,
 98 |             f"Mismatch in object count: {src_objects} vs {dest_objects}",
 99 |         )
100 | 
101 |         logger.info("Test: %s | Duration: %s", test_name, time_elapsed)
102 | 
103 |         # Log results to metrics file
104 |         with open("metrics.txt", "a+", encoding="utf-8") as file:
105 |             file.write(f"{test_name} {time_elapsed}\n")
106 | 


--------------------------------------------------------------------------------
/transformers/tests/test_hash_with_args.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Pytest suite for the HashWithArgs ETL transformer.
  3 | 
  4 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
  5 | """
  6 | 
  7 | import random
  8 | import logging
  9 | from pathlib import Path
 10 | from typing import Dict
 11 | 
 12 | import pytest
 13 | import xxhash
 14 | from aistore.sdk.etl import ETLConfig
 15 | from aistore.sdk import Bucket
 16 | 
 17 | from tests.const import (
 18 |     INLINE_PARAM_COMBINATIONS,
 19 |     HASH_WITH_ARGS_TEMPLATE,
 20 | )
 21 | 
# Configure module-level logger
# (logger is named after this module; basicConfig sets INFO level and a
# timestamped format on the root logger, and does nothing if the root logger
# already has handlers configured).
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
 27 | 
 28 | 
 29 | def _upload_test_files(test_bck: Bucket, local_files: Dict[str, Path]) -> None:
 30 |     """
 31 |     Upload files to the specified bucket.
 32 |     """
 33 |     for filename, path in local_files.items():
 34 |         logger.debug("Uploading %s to bucket %s", filename, test_bck.name)
 35 |         test_bck.object(filename).get_writer().put_file(str(path))
 36 | 
 37 | 
 38 | def _calculate_hash(data, seed):
 39 |     """Computes the seeded hash of a given file."""
 40 |     hasher = xxhash.xxh64(seed=seed)
 41 |     hasher.update(data)
 42 |     return hasher.hexdigest().encode()
 43 | 
 44 | 
 45 | def _verify_test_files(
 46 |     test_bck: Bucket,
 47 |     local_files: Dict[str, Path],
 48 |     etl_name: str,
 49 | ) -> None:
 50 |     """
 51 |     Verify that the files in the bucket match the hash.
 52 |     """
 53 |     for filename, path in local_files.items():
 54 |         seed = random.randint(0, 1000)
 55 |         reader = test_bck.object(filename).get_reader(
 56 |             etl=ETLConfig(etl_name, args=str(seed))
 57 |         )
 58 |         transformed = reader.read_all()
 59 |         original = Path(path).read_bytes()
 60 |         original_hash = _calculate_hash(original, seed)
 61 |         assert (
 62 |             transformed == original_hash
 63 |         ), f"Hash mismatch for {filename}: expected {original_hash}, got {transformed}"
 64 | 
 65 | 
 66 | # pylint: disable=too-many-arguments
 67 | @pytest.mark.parametrize("server_type, comm_type, use_fqn", INLINE_PARAM_COMBINATIONS)
 68 | def test_echo_transformer(
 69 |     test_bck: Bucket,
 70 |     local_files: Dict[str, Path],
 71 |     etl_factory,
 72 |     server_type: str,
 73 |     comm_type: str,
 74 |     use_fqn: bool,
 75 | ) -> None:
 76 |     """
 77 |     Validate the Python-based Hash With Args ETL transformer.
 78 |     Upload sample files, initialize the ETL, then assert hash.
 79 |     """
 80 |     # Upload inputs
 81 |     _upload_test_files(test_bck, local_files)
 82 | 
 83 |     # Build and initialize ETL
 84 |     etl_name = etl_factory(
 85 |         tag="hash-with-args",
 86 |         server_type=server_type,
 87 |         template=HASH_WITH_ARGS_TEMPLATE,
 88 |         communication_type=comm_type,
 89 |         use_fqn=use_fqn,
 90 |     )
 91 |     logger.info(
 92 |         "Initialized HashWithArgs ETL '%s' (server=%s, comm=%s, fqn=%s)",
 93 |         etl_name,
 94 |         server_type,
 95 |         comm_type,
 96 |         use_fqn,
 97 |     )
 98 | 
 99 |     _verify_test_files(
100 |         test_bck,
101 |         local_files,
102 |         etl_name,
103 |     )
104 | 


--------------------------------------------------------------------------------
/transformers/tests/test_hello_world.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Pytest suite for the Hello-World ETL transformer.
 3 | 
 4 | For each combination of server framework (Flask, FastAPI, HTTP), communication mode (hpull/hpush),
 5 | and argument style (FQN vs relative), this test:
 6 |   1. Uploads two sample files into a fresh bucket.
 7 |   2. Creates an ETL job via `etl_factory`.
 8 |   3. Transforms each file and asserts the output equals `b"Hello World!"`.
 9 | 
10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
11 | """
12 | 
13 | import logging
14 | from pathlib import Path
15 | 
16 | import pytest
17 | from aistore.sdk.etl import ETLConfig
18 | from aistore.sdk import Bucket
19 | 
20 | from tests.const import HELLO_WORLD_TEMPLATE, INLINE_PARAM_COMBINATIONS
21 | 
# Configure module-level logger
# (logger is named after this module; basicConfig sets INFO level and a
# timestamped format on the root logger, and does nothing if the root logger
# already has handlers configured).
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
27 | 
28 | 
29 | # pylint: disable=too-many-arguments
@pytest.mark.parametrize("server_type, comm_type, use_fqn", INLINE_PARAM_COMBINATIONS)
def test_hello_world_transformer(
    test_bck: Bucket,
    local_files: dict[str, Path],
    etl_factory: callable,
    server_type: str,
    comm_type: str,
    use_fqn: bool,
) -> None:
    """
    Run every sample file through the Hello-World ETL and check the output.

    Args:
        test_bck: bucket fixture the sample files are uploaded to.
        local_files: mapping filename -> local Path of sample inputs.
        etl_factory: fixture that initializes an ETL and returns its name.
        server_type: server framework entry from INLINE_PARAM_COMBINATIONS.
        comm_type: communication type entry from INLINE_PARAM_COMBINATIONS.
        use_fqn: FQN flag entry from INLINE_PARAM_COMBINATIONS.
    """
    # Populate the bucket with the inputs.
    for object_name, local_path in local_files.items():
        logger.debug("Uploading %s to bucket %s", object_name, test_bck.name)
        writer = test_bck.object(object_name).get_writer()
        writer.put_file(local_path)

    factory_options = {
        "tag": "hello-world",
        "server_type": server_type,
        "template": HELLO_WORLD_TEMPLATE,
        "communication_type": comm_type,
        "use_fqn": use_fqn,
    }
    etl_name = etl_factory(**factory_options)
    logger.info(
        "Initialized Hello-World ETL '%s' (server=%s, comm=%s, fqn=%s)",
        etl_name,
        server_type,
        comm_type,
        use_fqn,
    )

    # Whatever the input, the transformer is expected to answer with this payload.
    expected = b"Hello World!"
    for object_name in local_files:
        etl_reader = test_bck.object(object_name).get_reader(etl=ETLConfig(etl_name))
        output = etl_reader.read_all()
        assert (
            output == expected
        ), f"ETL {etl_name} produced unexpected output for '{object_name}': {output!r}"
79 | 


--------------------------------------------------------------------------------
/transformers/tests/test_hello_world_stress.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Pytest-based stress suite for the Hello-World ETL transformer.
 3 | 
 4 | This module:
 5 |   - Uses a pre-populated `stress_bucket` with 10,000 objects (session-scoped fixture).
 6 |   - Creates a fresh `test_bck` destination bucket per test.
 7 |   - Runs the Hello-World ETL across all server/comm/FQN combinations in parallel.
 8 |   - Verifies object counts and payload correctness on a random sample.
 9 |   - Records per-test durations into `metrics.txt`.
10 | 
11 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
12 | """
13 | 
14 | import random
15 | import logging
16 | 
17 | import pytest
18 | from aistore.sdk import Bucket
19 | 
20 | from tests.const import PARAM_COMBINATIONS, HELLO_WORLD_TEMPLATE, LABEL_FMT
21 | 
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Configure root logging at import time: INFO level with timestamped records.
# basicConfig does nothing if the root logger already has handlers configured.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
)
26 | 
27 | 
28 | # pylint: disable=too-many-arguments, too-many-locals
29 | @pytest.mark.stress
30 | @pytest.mark.parametrize(
31 |     "server_type, comm_type, use_fqn, direct_put", PARAM_COMBINATIONS
32 | )
33 | def test_hello_world_stress(
34 |     stress_client,
35 |     stress_bucket: Bucket,
36 |     test_bck: Bucket,
37 |     etl_factory,
38 |     stress_metrics,
39 |     stress_object_count,
40 |     server_type: str,
41 |     comm_type: str,
42 |     use_fqn: bool,
43 |     direct_put: str,
44 | ):
45 |     """
46 |     Stress test for Hello-World ETL: copy 10k objects with transformation.
47 |     """
48 |     # 1) Initialize ETL
49 |     label = LABEL_FMT.format(
50 |         name="HELLO WORLD",
51 |         server=server_type,
52 |         comm=comm_type,
53 |         arg="fqn" if use_fqn else "",
54 |         direct=direct_put,
55 |     )
56 |     etl_name = etl_factory(
57 |         tag="hello-world",
58 |         server_type=server_type,
59 |         template=HELLO_WORLD_TEMPLATE,
60 |         communication_type=comm_type,
61 |         use_fqn=use_fqn,
62 |         direct_put=direct_put,
63 |     )
64 | 
65 |     # 2) Run transform job
66 |     job_id = stress_bucket.transform(
67 |         etl_name=etl_name,
68 |         to_bck=test_bck,
69 |         num_workers=24,
70 |         timeout="10m",
71 |     )
72 |     job = stress_client.job(job_id)
73 |     job.wait(timeout=600)
74 |     duration = job.get_total_time()
75 | 
76 |     logger.info(
77 |         "ETL '%s' completed in %ss (srv=%s, comm=%s, fqn=%s)",
78 |         etl_name,
79 |         duration,
80 |         server_type,
81 |         comm_type,
82 |         use_fqn,
83 |     )
84 | 
85 |     # 3) Verify counts
86 |     objs = list(test_bck.list_all_objects())
87 |     assert (
88 |         len(objs) == stress_object_count
89 |     ), f"Expected {stress_object_count} objects, got {len(objs)}"
90 | 
91 |     # 4) Sample and verify payload
92 |     samples = random.sample(objs, 10)
93 |     for entry in samples:
94 |         data = test_bck.object(entry.name).get_reader().read_all()
95 |         assert data == b"Hello World!", f"Mismatch in object {entry.name}"
96 | 
97 |     # 5) Record metric
98 |     stress_metrics.append((label, duration))
99 | 


--------------------------------------------------------------------------------
/transformers/tests/test_keras_stress.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Stress testing Keras Transformer for 50K images across all communication types.
  3 | 
  4 | Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
  5 | """
  6 | 
  7 | import logging
  8 | from datetime import datetime
  9 | from aistore.sdk.etl.etl_const import ETL_COMM_HPULL, ETL_COMM_HPUSH
 10 | from aistore.sdk.etl.etl_templates import KERAS_TRANSFORMER
 11 | 
 12 | from tests.base import TestBase
 13 | from tests.utils import cases, generate_random_string
 14 | 
 15 | # Configure logging
 16 | logging.basicConfig(
 17 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
 18 |     level=logging.INFO,
 19 | )
 20 | logger = logging.getLogger(__name__)
 21 | 
 22 | 
 23 | class TestKerasStress(TestBase):
 24 |     """Stress test for Keras Transformer with 50K images using different communication types."""
 25 | 
 26 |     def setUp(self):
 27 |         """Sets up the test environment by defining the source bucket for images."""
 28 |         super().setUp()
 29 |         self.images_bck = self.client.bucket(bck_name="stress-test-images")
 30 | 
 31 |     def run_test(self, comm_type: str, test_name: str, fqn_flag: bool = False):
 32 |         """
 33 |         Runs a Keras transformation stress test using AIStore ETL.
 34 | 
 35 |         Args:
 36 |             comm_type (str): ETL communication type (HPULL, HPUSH).
 37 |             test_name (str): Name of the test case for logging.
 38 |             fqn_flag (bool, optional): Whether to use fully qualified names (FQN). Defaults to False.
 39 |         """
 40 |         arg_type = "fqn" if fqn_flag else ""
 41 | 
 42 |         # Generate a unique ETL name
 43 |         etl_name = f"keras-transformer-{generate_random_string(5)}"
 44 |         self.etls.append(etl_name)
 45 | 
 46 |         # Generate the ETL template
 47 |         template = KERAS_TRANSFORMER.format(
 48 |             communication_type=comm_type,
 49 |             format="JPEG",
 50 |             transform='{"theta":40, "brightness":0.8, "zx":0.9, "zy":0.9}',
 51 |             arg_type=arg_type,
 52 |         )
 53 | 
 54 |         # Initialize ETL transformation
 55 |         self.client.etl(etl_name).init_spec(
 56 |             template=template, communication_type=comm_type, arg_type=arg_type
 57 |         )
 58 | 
 59 |         logger.info(
 60 |             "Starting ETL test: %s (ETL: %s)\n%s",
 61 |             test_name,
 62 |             etl_name,
 63 |             self.client.etl(etl_name).view(),
 64 |         )
 65 | 
 66 |         start_time = datetime.now()
 67 | 
 68 |         # Start transformation job
 69 |         job_id = self.images_bck.transform(
 70 |             etl_name=etl_name,
 71 |             timeout="30m",
 72 |             to_bck=self.test_bck,
 73 |             ext={"JPEG": "JPEG"},
 74 |         )
 75 | 
 76 |         # Wait for the job to complete
 77 |         self.client.job(job_id).wait(timeout=1800)
 78 |         time_elapsed = datetime.now() - start_time
 79 | 
 80 |         # Check job status
 81 |         job_status = self.client.job(job_id).status()
 82 |         self.assertEqual(
 83 |             job_status.err, "", f"ETL Job {job_id} failed with error: {job_status.err}"
 84 |         )
 85 | 
 86 |         # Ensure all images were transformed correctly
 87 |         self.assertEqual(
 88 |             len(self.images_bck.list_all_objects()),
 89 |             len(self.test_bck.list_all_objects()),
 90 |             "Mismatch in number of transformed images.",
 91 |         )
 92 | 
 93 |         logger.info("Test: %s | Duration: %s", test_name, time_elapsed)
 94 | 
 95 |         # Log results to a metrics file
 96 |         with open("metrics.txt", "a+", encoding="utf-8") as file:
 97 |             file.write(f"{test_name} {time_elapsed}\n")
 98 | 
 99 |     @cases(
100 |         (ETL_COMM_HPUSH, "test_keras_hpush_fastapi", False),
101 |         (ETL_COMM_HPULL, "test_keras_hpull_fastapi", False),
102 |         (ETL_COMM_HPULL, "test_keras_hpull_fastapi_fqn", True),
103 |         (ETL_COMM_HPUSH, "test_keras_hpush_fastapi_fqn", True),
104 |     )
105 |     def test_keras_transformer(self, test_case):
106 |         """Stress tests Keras ETL transformation using different communication types."""
107 |         comm_type, test_name, fqn_flag = test_case
108 |         self.run_test(comm_type, test_name, fqn_flag)
109 | 


--------------------------------------------------------------------------------
/transformers/tests/test_md5.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Pytest suite for the MD5 ETL transformer.
 3 | 
 4 | For each combination of server backend (Flask, FastAPI, HTTP),
 5 | communication mode (HPULL/HPUSH), and argument style (FQN vs relative), this test:
 6 |   1. Uploads sample image and text files into a fresh bucket.
 7 |   2. Creates an MD5 ETL job via `etl_factory`.
 8 |   3. Transforms each file and asserts the output matches the MD5 checksum.
 9 | 
10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
11 | """
12 | 
13 | import logging
14 | import hashlib
15 | from pathlib import Path
16 | from typing import Dict
17 | 
18 | import pytest
19 | from aistore.sdk.etl import ETLConfig
20 | from aistore.sdk import Bucket
21 | 
22 | from tests.const import MD5_TEMPLATE, INLINE_PARAM_COMBINATIONS
23 | 
24 | # Configure module‐level logging
25 | logging.basicConfig(
26 |     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
27 | )
28 | 
29 | 
30 | # pylint: disable=too-many-arguments
31 | @pytest.mark.parametrize("server_type, comm_type, use_fqn", INLINE_PARAM_COMBINATIONS)
32 | def test_md5_transformer(
33 |     test_bck: Bucket,
34 |     local_files: Dict[str, Path],
35 |     etl_factory,
36 |     server_type: str,
37 |     comm_type: str,
38 |     use_fqn: bool,
39 | ) -> None:
40 |     """
41 |     Validate the MD5 ETL transformer across runtimes and communication modes.
42 | 
43 |     Args:
44 |         test_bck:    fresh bucket fixture
45 |         local_files: mapping of filename -> Path for inputs
46 |         etl_factory: factory fixture to create ETL jobs
47 |         server_type: 'flask' | 'fastapi' | 'http'
48 |         comm_type:   ETL_COMM_HPULL | ETL_COMM_HPUSH
49 |         use_fqn:     whether to pass FQN or relative paths
50 |     """
51 |     # 1) Upload inputs
52 |     for filename, path in local_files.items():
53 |         logging.debug("Uploading %s to %s", filename, test_bck.name)
54 |         test_bck.object(filename).get_writer().put_file(str(path))
55 | 
56 |     # 2) Initialize ETL
57 |     etl_name = etl_factory(
58 |         tag="md5",
59 |         server_type=server_type,
60 |         template=MD5_TEMPLATE,
61 |         communication_type=comm_type,
62 |         use_fqn=use_fqn,
63 |     )
64 |     logging.info(
65 |         "Initialized MD5 ETL '%s' (server=%s, comm=%s, fqn=%s)",
66 |         etl_name,
67 |         server_type,
68 |         comm_type,
69 |         use_fqn,
70 |     )
71 | 
72 |     # 3) Run transform and assert checksum
73 |     for filename, path in local_files.items():
74 |         # compute expected MD5 of original file
75 |         expected = hashlib.md5(Path(path).read_bytes()).hexdigest().encode()
76 | 
77 |         # fetch transformed result
78 |         result_bytes = (
79 |             test_bck.object(filename).get_reader(etl=ETLConfig(etl_name)).read_all()
80 |         )
81 | 
82 |         assert (
83 |             result_bytes == expected
84 |         ), f"ETL {etl_name} MD5 mismatch for {filename}: expected {expected!r}, got {result_bytes!r}"
85 | 


--------------------------------------------------------------------------------
/transformers/tests/test_md5_stress.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Pytest-based stress suite for the MD5 ETL transformer.
  3 | 
  4 | This module:
  5 |   - Uses a pre-populated `stress_bucket` with 10,000 objects (session-scoped fixture).
  6 |   - Creates a fresh `test_bck` destination bucket per test.
  7 |   - Runs the MD5 ETL across all server/comm/FQN combinations in parallel.
  8 |   - Verifies object counts and payload correctness on a random sample.
  9 |   - Records per-test durations into `metrics.txt`.
 10 | 
 11 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 12 | """
 13 | 
 14 | import random
 15 | import logging
 16 | import hashlib
 17 | 
 18 | import pytest
 19 | from aistore.sdk import Bucket
 20 | 
 21 | from tests.const import PARAM_COMBINATIONS, MD5_TEMPLATE, LABEL_FMT
 22 | 
 23 | logger = logging.getLogger(__name__)
 24 | logging.basicConfig(
 25 |     level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s"
 26 | )
 27 | 
 28 | 
 29 | # pylint: disable=too-many-arguments, too-many-locals
 30 | @pytest.mark.stress
 31 | @pytest.mark.parametrize(
 32 |     "server_type, comm_type, use_fqn, direct_put", PARAM_COMBINATIONS
 33 | )
 34 | def test_md5_stress(
 35 |     stress_client,
 36 |     stress_bucket: Bucket,
 37 |     test_bck: Bucket,
 38 |     etl_factory,
 39 |     stress_metrics,
 40 |     stress_object_count,
 41 |     server_type: str,
 42 |     comm_type: str,
 43 |     use_fqn: bool,
 44 |     direct_put: str,
 45 | ):
 46 |     """
 47 |     Stress test for MD5 ETL: copy 10k objects with transformation.
 48 |     """
 49 |     # 1) Initialize ETL
 50 |     label = LABEL_FMT.format(
 51 |         name="MD5",
 52 |         server=server_type,
 53 |         comm=comm_type,
 54 |         arg="fqn" if use_fqn else "",
 55 |         direct=direct_put,
 56 |     )
 57 |     etl_name = etl_factory(
 58 |         tag="md5",
 59 |         server_type=server_type,
 60 |         template=MD5_TEMPLATE,
 61 |         communication_type=comm_type,
 62 |         use_fqn=use_fqn,
 63 |         direct_put=direct_put,
 64 |     )
 65 | 
 66 |     # 2) Run transform job
 67 |     job_id = stress_bucket.transform(
 68 |         etl_name=etl_name,
 69 |         to_bck=test_bck,
 70 |         num_workers=24,
 71 |         timeout="10m",
 72 |     )
 73 |     job = stress_client.job(job_id)
 74 |     job.wait(timeout=600)
 75 |     duration = job.get_total_time()
 76 | 
 77 |     logger.info(
 78 |         "ETL '%s' completed in %ss (srv=%s, comm=%s, fqn=%s)",
 79 |         etl_name,
 80 |         duration,
 81 |         server_type,
 82 |         comm_type,
 83 |         use_fqn,
 84 |     )
 85 | 
 86 |     # 3) Verify counts
 87 |     objs = list(test_bck.list_all_objects())
 88 |     assert (
 89 |         len(objs) == stress_object_count
 90 |     ), f"Expected {stress_object_count} objects, got {len(objs)}"
 91 | 
 92 |     # 4) Sample and verify payload
 93 |     samples = random.sample(objs, 10)
 94 |     for entry in samples:
 95 |         data = test_bck.object(entry.name).get_reader().read_all()
 96 |         acutal_obj = stress_bucket.object(entry.name).get_reader().read_all()
 97 |         expected = hashlib.md5(acutal_obj).hexdigest().encode()
 98 |         assert data == expected, f"MD5 checksum not matching for {entry.name}"
 99 | 
100 |     # 5) Record metric
101 |     stress_metrics.append((label, duration))
102 | 


--------------------------------------------------------------------------------
/transformers/tests/test_torchvision_transformer.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
  3 | #
  4 | 
  5 | import io
  6 | from PIL import Image
  7 | from torchvision import transforms
  8 | 
  9 | from tests.base import TestBase
 10 | from tests.utils import (
 11 |     format_image_tag_for_git_test_mode,
 12 |     cases,
 13 |     generate_random_string,
 14 | )
 15 | from aistore.sdk.etl.etl_const import ETL_COMM_HPULL, ETL_COMM_HPUSH
 16 | from aistore.sdk.etl.etl_templates import TORCHVISION_TRANSFORMER
 17 | from aistore.sdk.etl import ETLConfig
 18 | 
 19 | 
 20 | class TestTorchVisionTransformer(TestBase):
 21 |     """Unit tests for TorchVision-based image transformations using AIStore ETL."""
 22 | 
 23 |     def setUp(self):
 24 |         """Set up test environment by uploading a test image to the bucket."""
 25 |         super().setUp()
 26 |         self.test_image_filename = "test-image.jpg"
 27 |         self.test_image_source = "./resources/test-image.jpg"
 28 | 
 29 |         self.test_bck.object(self.test_image_filename).get_writer().put_file(
 30 |             self.test_image_source
 31 |         )
 32 | 
 33 |     def run_torchvision_test(self, communication_type):
 34 |         """
 35 |         Compares AIStore ETL-transformed images with locally transformed images.
 36 | 
 37 |         Args:
 38 |             communication_type (str): The ETL communication type (HPULL, HPUSH).
 39 |         """
 40 |         etl_name = f"torchvision-transformer-{generate_random_string(5)}"
 41 |         self.etls.append(etl_name)
 42 | 
 43 |         # Define AIStore ETL transformation template
 44 |         template = TORCHVISION_TRANSFORMER.format(
 45 |             communication_type=communication_type,
 46 |             transform='{"Resize": {"size": [100, 100]}, "Grayscale": {"num_output_channels": 1}}',
 47 |             format="JPEG",
 48 |         )
 49 | 
 50 |         # Modify template for Git test mode
 51 |         if self.git_test_mode:
 52 |             template = format_image_tag_for_git_test_mode(template, "torchvision")
 53 | 
 54 |         # Initialize ETL and apply transformation via AIStore
 55 |         self.client.etl(etl_name).init_spec(
 56 |             template=template, communication_type=communication_type, timeout="10m"
 57 |         )
 58 | 
 59 |         etl_transformed_image_bytes = (
 60 |             self.test_bck.object(self.test_image_filename)
 61 |             .get_reader(etl=ETLConfig(etl_name))
 62 |             .read_all()
 63 |         )
 64 | 
 65 |         # Perform the same transformation locally using TorchVision
 66 |         transformed_image_bytes = self.get_transformed_image_local()
 67 | 
 68 |         # Assert that AIStore ETL and local transformations produce identical outputs
 69 |         self.assertEqual(transformed_image_bytes, etl_transformed_image_bytes)
 70 | 
 71 |     def get_transformed_image_local(self) -> bytes:
 72 |         """
 73 |         Applies the same transformation locally using TorchVision to compare against AIStore ETL output.
 74 | 
 75 |         Returns:
 76 |             bytes: The locally transformed image in JPEG format.
 77 |         """
 78 |         transform = transforms.Compose(
 79 |             [
 80 |                 transforms.Resize((100, 100)),  # Resize to 100x100 pixels
 81 |                 transforms.Grayscale(num_output_channels=1),  # Convert to grayscale
 82 |             ]
 83 |         )
 84 |         image = Image.open(self.test_image_source)
 85 |         transformed_tensor = transform(transforms.ToTensor()(image))
 86 |         transformed_image = transforms.ToPILImage()(transformed_tensor)
 87 | 
 88 |         # Convert transformed image to bytes
 89 |         byte_arr = io.BytesIO()
 90 |         transformed_image.save(byte_arr, format="JPEG")
 91 |         return byte_arr.getvalue()
 92 | 
 93 |     @cases(
 94 |         ETL_COMM_HPULL,
 95 |         ETL_COMM_HPUSH,
 96 |     )
 97 |     def test_torchvision_transform(self, communication_type):
 98 |         """Runs the TorchVision ETL transformation for different communication types."""
 99 |         self.run_torchvision_test(communication_type)
100 | 


--------------------------------------------------------------------------------
/transformers/tests/utils.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | 
 5 | import os
 6 | import random
 7 | import string
 8 | import base64
 9 | import logging
10 | import json
11 | import yaml
12 | 
13 | from aistore import Client
14 | from aistore.sdk.const import URL_PATH_ETL, HTTP_METHOD_GET
15 | 
16 | 
17 | def generate_random_string(length: int = 5) -> str:
18 |     """Generates a random lowercase string of the specified length."""
19 |     return "".join(random.choices(string.ascii_lowercase, k=length))
20 | 
21 | 
22 | def format_image_tag_for_git_test_mode(template: str, image_name: str) -> str:
23 |     """
24 |     Modifies the container image in the given YAML template to use a test-specific image tag.
25 | 
26 |     Args:
27 |         template (str): YAML template as a string.
28 |         image_name (str): Name of the image to be formatted.
29 | 
30 |     Returns:
31 |         str: Updated YAML template as a string.
32 |     """
33 |     parsed_template = yaml.safe_load(template)
34 |     parsed_template["spec"]["containers"][0][
35 |         "image"
36 |     ] = f"aistorage/transformer_{image_name}:test"
37 |     return yaml.dump(parsed_template)
38 | 
39 | 
40 | def cases(*args):
41 |     """
42 |     Decorator for running a test function with multiple test cases.
43 | 
44 |     Args:
45 |         *args: Arguments to be passed to the test function.
46 | 
47 |     Returns:
48 |         Function wrapper.
49 |     """
50 | 
51 |     def decorator(func):
52 |         def wrapper(self, *inner_args, **kwargs):
53 |             for arg in args:
54 |                 with self.subTest(arg=arg):
55 |                     func(self, arg, *inner_args, **kwargs)
56 | 
57 |         return wrapper
58 | 
59 |     return decorator
60 | 
61 | 
62 | # pylint: disable=protected-access
63 | def log_etl(client: Client, etl_name: str) -> None:
64 |     """
65 |     Fetches and saves the logs of a specified ETL job.
66 |     """
67 |     logs_dir = os.path.join(os.getcwd(), "logs")
68 |     os.makedirs(logs_dir, exist_ok=True)
69 |     log_path = os.path.join(logs_dir, f"{etl_name}.log")
70 | 
71 |     try:
72 |         resp = client._request_client.request(
73 |             HTTP_METHOD_GET,
74 |             f"/{URL_PATH_ETL}/{etl_name}/logs",
75 |             timeout=20,
76 |         )
77 |         entries = json.loads(resp.content.decode("utf-8"))
78 | 
79 |         with open(log_path, "w", encoding="utf-8") as f:
80 |             for entry in entries:
81 |                 tid = entry.get("target_id", "unknown")
82 |                 b64 = entry.get("logs", "").strip()
83 | 
84 |                 raw = base64.b64decode(b64)
85 |                 decoded = raw.decode("utf-8", errors="replace")
86 | 
87 |                 f.write(f"Target ID: {tid}\n")
88 |                 f.write(decoded)
89 |                 if not decoded.endswith("\n"):
90 |                     f.write("\n")
91 |                 f.write("\n")
92 | 
93 |     except Exception as e:
94 |         logging.error(
95 |             "Warning: failed to fetch or write logs for ETL '%s': %s", etl_name, e
96 |         )
97 | 


--------------------------------------------------------------------------------
/transformers/torchvision_preprocess/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:3.9-slim
 2 | 
 3 | WORKDIR /
 4 | 
 5 | COPY ./requirements.txt requirements.txt
 6 | # Dependencies are installed before main.py is copied, in their own layer
 7 | RUN pip3 install --no-cache-dir --upgrade -r requirements.txt
 8 | 
 9 | COPY main.py main.py
10 | # Unbuffered Python output (key=value form; the space-separated ENV form is legacy)
11 | ENV PYTHONUNBUFFERED=1
12 | 
13 | EXPOSE 8000
14 | 


--------------------------------------------------------------------------------
/transformers/torchvision_preprocess/Makefile:
--------------------------------------------------------------------------------
 1 | # Image tag: 'latest' by default, 'test' when GIT_TEST is 'true'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | REGISTRY_URL ?= docker.io/aistorage
 7 | 
 8 | .PHONY: all build push
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_torchvision:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_torchvision:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/torchvision_preprocess/http-multithreaded-server/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM docker.io/library/python:slim
 2 | 
 3 | COPY requirements.txt requirements.txt
 4 | RUN pip3 install --no-cache-dir -r requirements.txt
 5 | 
 6 | RUN mkdir /code
 7 | WORKDIR /code
 8 | COPY server.py server.py
 9 | # Unbuffered Python output (key=value form; the space-separated ENV form is legacy)
10 | ENV PYTHONUNBUFFERED=1
11 | 
12 | EXPOSE 80
13 | 


--------------------------------------------------------------------------------
/transformers/torchvision_preprocess/http-multithreaded-server/Makefile:
--------------------------------------------------------------------------------
 1 | # Image tag: 'latest' by default, 'test' when GIT_TEST is 'true'
 2 | TAG := latest
 3 | ifeq ($(GIT_TEST), true)
 4 | 	TAG := test
 5 | endif
 6 | REGISTRY_URL ?= docker.io/aistorage
 7 | # NOTE(review): tags the same image name as ../Makefile; confirm this is intended
 8 | .PHONY: all build push
 9 | all: build push
10 | 
11 | build:
12 | 	docker build -t $(REGISTRY_URL)/transformer_torchvision:$(TAG) .
13 | 
14 | push:
15 | 	docker push $(REGISTRY_URL)/transformer_torchvision:$(TAG)
16 | 


--------------------------------------------------------------------------------
/transformers/torchvision_preprocess/http-multithreaded-server/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-torchvision
 5 |   annotations:
 6 |     # Values it can take ["hpull://","hpush://"]
 7 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
 8 |     wait_timeout: 5m
 9 | spec:
10 |   containers:
11 |     - name: server
12 |       image: aistorage/transformer_torchvision:latest
13 |       imagePullPolicy: Always
14 |       ports:
15 |         - name: default
16 |           containerPort: 80
17 |       command: ['/code/server.py', '--listen', '0.0.0.0', '--port', '80']
18 |       env:
19 |         - name: FORMAT
20 |         # Expected Values - PNG, JPEG, etc.
21 |           value: ${FORMAT}
22 |         - name: TRANSFORM
23 |         # MANDATORY: JSON string that maps each transform name to its parameters.
24 |         # https://pytorch.org/vision/0.9/transforms.html
25 |         # e.g. '{"ColorJitter": {"brightness": 0.8, "contrast": 0.4}, "RandomRotation": {"degrees": 30}}'
26 |           value:  ${TRANSFORM}
27 |       # This is a health check endpoint which one should specify
28 |       # for aistore to determine the health of the ETL container.
29 |       readinessProbe:
30 |         httpGet:
31 |           path: /health
32 |           port: default
33 | 


--------------------------------------------------------------------------------
/transformers/torchvision_preprocess/http-multithreaded-server/requirements.txt:
--------------------------------------------------------------------------------
1 | pillow
2 | requests
3 | torchvision


--------------------------------------------------------------------------------
/transformers/torchvision_preprocess/pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: transformer-torchvision
 5 |   annotations:
 6 |     # Values it can take ["hpull://","hpush://"]
 7 |     communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""}
 8 |     wait_timeout: 5m
 9 | spec:
10 |   containers:
11 |     - name: server
12 |       image: aistorage/transformer_torchvision:latest
13 |       imagePullPolicy: Always
14 |       ports:
15 |         - name: default
16 |           containerPort: 8000
17 |       command:  ["gunicorn", "main:app", "--workers", "4", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000"]
18 |       env:
19 |         - name: FORMAT
20 |         # Expected Values - PNG, JPEG, etc.
21 |           value: ${FORMAT}
22 |         - name: TRANSFORM
23 |         # MANDATORY: JSON string that maps each transform name to its parameters.
24 |         # https://pytorch.org/vision/0.9/transforms.html
25 |         # e.g. '{"ColorJitter": {"brightness": 0.8, "contrast": 0.4}, "RandomRotation": {"degrees": 30}}'
26 |           value:  ${TRANSFORM}
27 |       # This is a health check endpoint which one should specify
28 |       # for aistore to determine the health of the ETL container.
29 |       readinessProbe:
30 |         httpGet:
31 |           path: /health
32 |           port: default
33 | 


--------------------------------------------------------------------------------
/transformers/torchvision_preprocess/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi>=0.109.1
2 | uvicorn==0.24.0.post1
3 | gunicorn==23.0.0
4 | aiohttp>=3.9.2
5 | pillow==10.3.0
6 | torchvision==0.21.0


--------------------------------------------------------------------------------