├── .github ├── dependabot.yml └── workflows │ ├── links-checker.yml │ └── runtime-docker.yml ├── .gitignore ├── .gitlab-ci.yml ├── LICENSE ├── README.md ├── bench ├── README.md ├── client │ └── locustfile.py ├── fast-api │ └── main.py ├── flask-server │ └── app.py ├── go-http-server │ ├── go.mod │ └── main.go └── http-server │ └── server.py ├── deploy ├── README.md └── docker │ ├── Dockerfile │ ├── entrypoint.sh │ └── start.sh ├── docs └── README.md ├── examples ├── imagenet_from_disk.py ├── imagenet_in_memory.py ├── in_memory_notebook.ipynb └── remote_execution.ipynb ├── runtime ├── Makefile ├── README.md └── python │ ├── Dockerfile │ ├── Makefile │ ├── bootstrap.py │ ├── builder.sh │ ├── io-comm │ ├── Makefile │ ├── cmn.go │ ├── go.mod │ └── main.go │ └── server.py └── transformers ├── FFmpeg ├── Dockerfile ├── Makefile ├── README.md ├── benchmark.py ├── fastapi_server.py ├── flask_server.py ├── http_server.py └── pod.yaml ├── Makefile ├── NeMo └── audio_split_consolidate │ ├── README.md │ ├── audio_manager │ ├── Dockerfile │ ├── Makefile │ ├── fastapi_server.py │ └── pod.yaml │ ├── audio_split_consolidate_diagram.png │ └── audio_splitter │ ├── Dockerfile │ ├── Makefile │ ├── fastapi_server.py │ └── pod.yaml ├── README.md ├── batch_rename ├── Dockerfile ├── Makefile ├── README.md ├── fastapi_server.py └── pod.yaml ├── benchmarks └── audio_split_consolidate.py ├── compress ├── Dockerfile ├── Makefile ├── README.md ├── pod.yaml ├── requirements.txt └── server.py ├── echo ├── Dockerfile ├── Makefile ├── README.md ├── fastapi_server.py ├── flask_server.py ├── http_server.py └── pod.yaml ├── face_detection ├── Dockerfile ├── Makefile ├── README.md ├── main.py ├── pod.yaml ├── requirements.txt └── sample │ └── output_face_detection.png ├── go_FFmpeg ├── Dockerfile ├── Makefile ├── pod.yaml └── src │ ├── go.mod │ ├── go.sum │ ├── main.go │ └── main_test.go ├── go_echo ├── Dockerfile ├── Makefile ├── pod.yaml └── src │ ├── go.mod │ ├── go.sum │ └── main.go ├── 
go_hello_world ├── Dockerfile ├── Makefile ├── pod.yaml └── src │ ├── go.mod │ ├── go.sum │ └── main.go ├── hash_with_args ├── Dockerfile ├── Makefile ├── README.md ├── fastapi_server.py ├── flask_server.py ├── http_server.py ├── pod.yaml ├── requirements.txt └── server.py ├── hello_world ├── Dockerfile ├── Makefile ├── README.md ├── fastapi_server.py ├── flask_server.py ├── http_server.py └── pod.yaml ├── keras_preprocess ├── Dockerfile ├── Makefile ├── README.md ├── flask-gunicorn │ ├── Dockerfile │ ├── Makefile │ ├── app.py │ ├── pod.yaml │ └── requirements.txt ├── http-multithreaded-server │ ├── Dockerfile │ ├── Makefile │ ├── pod.yaml │ ├── requirements.txt │ └── server.py ├── main.py ├── pod.yaml └── requirements.txt ├── md5 ├── Dockerfile ├── Makefile ├── fastapi_server.py ├── flask_server.py ├── http_server.py └── pod.yaml ├── tar2tf ├── .dockerignore ├── Dockerfile ├── Makefile ├── README.md ├── pod.yaml └── src │ ├── cached.go │ ├── cmn │ ├── assert.go │ ├── cmn.go │ └── io.go │ ├── go.mod │ ├── go.sum │ ├── main.go │ ├── tar-single.tar │ ├── tar2tf_test.go │ └── transforms │ ├── job.go │ └── pipeline.go ├── tests ├── __init__.py ├── base.py ├── conftest.py ├── const.py ├── local_benchmark │ └── ffmpeg_benchmark.py ├── requirements.txt ├── resources │ ├── test-audio-flac.flac │ ├── test-audio-mp3.mp3 │ ├── test-audio-wav.wav │ ├── test-face-detection.png │ ├── test-image.jpg │ ├── test-image.jpg.bz2 │ ├── test-image.jpg.gz │ ├── test-manifest.jsonl │ ├── test-tar-single.tar │ ├── test-text.txt │ ├── test-text.txt.bz2 │ └── test-text.txt.gz ├── test_audio_split.py ├── test_audio_split_consolidate.py ├── test_batch_rename.py ├── test_compress.py ├── test_echo.py ├── test_echo_stress.py ├── test_face_detection.py ├── test_face_detection_stress.py ├── test_ffmpeg.py ├── test_hash_with_args.py ├── test_hello_world.py ├── test_hello_world_stress.py ├── test_keras_stress.py ├── test_keras_transformer.py ├── test_md5.py ├── test_md5_stress.py ├── test_tar2tf.py 
├── test_torchvision_transformer.py └── utils.py └── torchvision_preprocess ├── Dockerfile ├── Makefile ├── README.md ├── http-multithreaded-server ├── Dockerfile ├── Makefile ├── pod.yaml ├── requirements.txt └── server.py ├── main.py ├── pod.yaml └── requirements.txt /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Please see the documentation for all configuration options: 2 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | # and 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | - package-ecosystem: "docker" 13 | directory: "/" 14 | schedule: 15 | interval: "weekly" 16 | - package-ecosystem: "pip" 17 | directory: "/" 18 | schedule: 19 | interval: "weekly" 20 | - package-ecosystem: "gomod" 21 | directory: "/" 22 | schedule: 23 | interval: "weekly" -------------------------------------------------------------------------------- /.github/workflows/links-checker.yml: -------------------------------------------------------------------------------- 1 | name: Links Checker 2 | 3 | on: 4 | repository_dispatch: 5 | workflow_dispatch: 6 | schedule: 7 | - cron: "00 12 * * 1" 8 | 9 | jobs: 10 | linkChecker: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Link Checker 16 | id: lychee 17 | uses: lycheeverse/lychee-action@v2 18 | with: 19 | fail: true -------------------------------------------------------------------------------- /.github/workflows/runtime-docker.yml: -------------------------------------------------------------------------------- 1 | name: Python Runtime Docker Images 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | paths: 7 | - 'runtime/python/**' 8 | 9 | env: 10 | 
RUNTIME_IMAGE: 'aistorage/runtime_python' 11 | REGISTRY_URL: 'docker.io/aistorage' 12 | 13 | jobs: 14 | docker: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Login to DockerHub 19 | uses: docker/login-action@v3 20 | with: 21 | username: ${{ secrets.DOCKERHUB_USERNAME }} 22 | password: ${{ secrets.DOCKERHUB_TOKEN }} 23 | - name: Python Runtime Images 24 | run: | 25 | pushd $GITHUB_WORKSPACE/runtime 26 | make all 27 | popd 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # JetBrains IDE 142 | .idea/ 143 | transformers/face_detection/model/weights.caffemodel 144 | transformers/face_detection/model/architecture.txt 145 | transformers/tests/metrics.txt 146 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 NVIDIA Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository contains: 2 | * [transformers](/transformers/README.md) - set of ETL transformers which are ready to be deployed on AIStore cluster. 3 | * [runtime](/runtime/README.md) - ETL runtimes definition that are used when starting transformers with code. 4 | 5 | Please also see the main [AIStore repository](https://github.com/NVIDIA/aistore) and [AIStore documentation](https://aiatscale.org/docs). 6 | -------------------------------------------------------------------------------- /bench/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking AIStore ETL 2 | 3 | You have the flexibility to customize your own ETL pipelines in AIStore. You can choose the language (Python, Go, etc.) and web server implementation. With so many options, it can get complicated to select the right ones. 4 | 5 | This directory provides sample web server implementations and benchmarks their performance in terms of request handling capacity. 6 | 7 | ## Web Servers 8 | 9 | There are many frameworks available for running web servers. Below is a comparison of web servers, frameworks, languages, and locations of basic implementations that can run them. 
10 | 11 | | Language | Framework | Web Server | Location | Remarks | 12 | |-|-|-|-|-| 13 | | Python | - | ThreadedHTTPServer | [/http-server](bench/http-server/) | Built-in to Python, very easy to implement, doesn't scale well | 14 | | Python | Flask | Flask Built-in Webserver | [/flask-server](bench/flask-server/) | Built-in flask webserver, not suited for production | 15 | | Python | Flask | [Gunicorn](https://gunicorn.org/) | [/flask-server](bench/flask-server/) | Python WSGI HTTP server, scales well | 16 | | Python | [FastAPI](https://fastapi.tiangolo.com/) | [Uvicorn](https://www.uvicorn.org/) | [/fast-api](bench/fast-api/) | ASGI web server implementation for Python | 17 | | Python | [FastAPI](https://fastapi.tiangolo.com/) | [Uvicorn](https://www.uvicorn.org/) + [Gunicorn](https://gunicorn.org/) | [/fast-api](bench/fast-api/) | Gunicorn manages multiple Uvicorn processes | 18 | | Go | Go | Net/HTTP Server | [/go-http-server](bench/go-http-server/) | Built-in to Go, easy to implement, scales well | 19 | 20 | To benchmark these servers on your infrastructure, you can use the [client](bench/client). The client is based on [Locust](https://locust.io/), a simple open source load testing tool. 21 | 22 | Here are sample results from a 12 core/16GB machine: 23 | 24 | | Language | Framework | Web Server | Location | Avg. 
Requests Per Second | 25 | |-|-|-|-|-| 26 | | Python | - | ThreadedHTTPServer | [/http-server](bench/http-server/) | 1020 | 27 | | Python | Flask | Flask Built-in Webserver | [/flask-server](bench/flask-server/) | 950 | 28 | | Python | Flask | [Gunicorn](https://gunicorn.org/) | [/flask-server](bench/flask-server/) | 1060 | 29 | | Python | [FastAPI](https://fastapi.tiangolo.com/) | [Uvicorn](https://www.uvicorn.org/) | [/fast-api](bench/fast-api/) | 1620 | 30 | | Python | [FastAPI](https://fastapi.tiangolo.com/) | [Uvicorn](https://www.uvicorn.org/) + [Gunicorn](https://gunicorn.org/) | [/fast-api](bench/fast-api/) | 1670 | 31 | | Go | Go | Net/HTTP Server | [/go-http-server](bench/go-http-server/) | 1675 | 32 | 33 | An important consideration is how your ETL container pods will communicate with the AIStore cluster. There are several [communication mechanisms](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms) to choose from depending on your needs. There's no one perfect solution - pick the mechanism that best fits your ETL workflow. 34 | 35 | -------------------------------------------------------------------------------- /bench/client/locustfile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test client for all the webservers. 3 | 4 | Steps to run: 5 | $ pip install locust 6 | $ locust 7 | 8 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
9 | """ 10 | 11 | from locust import HttpUser, task 12 | 13 | 14 | class MyTestUser(HttpUser): 15 | @task 16 | def test_put_request(self): 17 | self._perform_put_request() 18 | 19 | @task 20 | def test_get_request(self): 21 | self._perform_get_request() 22 | 23 | def _perform_put_request(self): 24 | url = "/" 25 | data = "test" 26 | self.client.put(url=url, data=data) 27 | 28 | def _perform_get_request(self): 29 | url = "/" 30 | self.client.get(url=url) 31 | -------------------------------------------------------------------------------- /bench/fast-api/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | A basic web server using FastAPI for demonstration purposes. 3 | 4 | Steps to run: 5 | $ # with uvicorn 6 | $ uvicorn main:app --reload 7 | $ # with multiple uvicorn processes managed by gunicorn 8 | $ gunicorn main:app --workers 4 --worker-class uvicorn.workers.UvicornWorker --bind 0.0.0.0:8000 9 | 10 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 11 | """ 12 | from fastapi import FastAPI, Request 13 | 14 | app = FastAPI() 15 | 16 | @app.put("/") 17 | @app.put("/{full_path:path}") 18 | async def put_handler(request: Request, full_path: str): 19 | """ 20 | Handles PUT requests. 21 | Reads bytes from the request, performs byte transformation, 22 | and returns the modified bytes. 23 | """ 24 | # Read bytes from request (request.body) 25 | # Transform the bytes 26 | # Return the transformed bytes 27 | return b"Hello World from PUT!" 28 | 29 | @app.get("/") 30 | @app.get("/{full_path:path}") 31 | async def get_handler(request: Request, full_path: str): 32 | """ 33 | Handles GET requests. 34 | Retrieves the destination/name of the object from the URL or the full_path variable, 35 | fetches the object from the AIS target based on the destination/name, 36 | transforms the bytes, and returns the modified bytes. 
37 | """ 38 | # Get destination/name of object from URL or from full_path variable 39 | # Fetch object from AIS target based on the destination/name 40 | # Perform byte transformation 41 | # Return the transformed bytes 42 | return b"Hello World from GET!" 43 | -------------------------------------------------------------------------------- /bench/flask-server/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | A basic web server using Flask for demonstration purposes. 3 | 4 | Steps to run: 5 | $ # with built-in flask server 6 | $ flask --app app run 7 | $ # with gunicorn 8 | $ gunicorn -w 4 'app:app' 9 | 10 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 11 | """ 12 | import logging 13 | from flask import Flask, request 14 | 15 | app = Flask(__name__) 16 | 17 | 18 | @app.route("/", defaults={"path": ""}, methods=["PUT", "GET"]) 19 | @app.route("/", methods=["PUT", "GET"]) 20 | def image_handler(path): 21 | try: 22 | if request.method == "PUT": 23 | # Read the request body 24 | # Transform the bytes 25 | # Return the transformed bytes 26 | transformed_data = b"Hello World!" 27 | return transformed_data, 200 28 | 29 | elif request.method == "GET": 30 | # Get the destination/name of the object from the URL or the path variable 31 | # Fetch the object from the AIS target based on the destination/name 32 | # Use request.get(ais_target_url + "/" + path).get to get the object 33 | # Transform the bytes 34 | # Return the transformed bytes 35 | transformed_data = b"Hello World!" 
36 | return transformed_data, 200 37 | 38 | except Exception as exception: 39 | logging.error("Error processing request: %s", str(exception)) 40 | return "Data processing failed", 500 41 | 42 | 43 | if __name__ == "__main__": 44 | app.run() 45 | -------------------------------------------------------------------------------- /bench/go-http-server/go.mod: -------------------------------------------------------------------------------- 1 | module main 2 | 3 | go 1.21 4 | -------------------------------------------------------------------------------- /bench/go-http-server/main.go: -------------------------------------------------------------------------------- 1 | /* 2 | * A basic webserver using golang 3 | * 4 | * Steps to run: 5 | * $ go run main.go 6 | * 7 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 8 | */ 9 | package main 10 | 11 | import ( 12 | "flag" 13 | "fmt" 14 | "io" 15 | "log" 16 | "net/http" 17 | "os" 18 | ) 19 | 20 | var ( 21 | endpoint string 22 | 23 | logger *log.Logger 24 | ) 25 | 26 | func initVars(ipAddress string, port int) { 27 | endpoint = fmt.Sprintf("%s:%d", ipAddress, port) 28 | } 29 | 30 | func main() { 31 | var ( 32 | ipAddressArg = flag.String("l", "localhost", "Specify the IP address on which the server listens") 33 | portArg = flag.Int("p", 8000, "Specify the port on which the server listens") 34 | ) 35 | 36 | flag.Parse() 37 | 38 | initVars(*ipAddressArg, *portArg) 39 | 40 | logger = log.New(os.Stdout, "[TestServer] ", log.LstdFlags|log.Lmicroseconds|log.Lshortfile) 41 | 42 | http.HandleFunc("/", requestHandler) 43 | 44 | logger.Printf("Starting hello world transformer at %s", endpoint) 45 | logger.Fatal(http.ListenAndServe(endpoint, nil)) 46 | } 47 | 48 | func requestHandler(w http.ResponseWriter, r *http.Request) { 49 | switch r.Method { 50 | case http.MethodPut: 51 | putHandler(w, r) 52 | case http.MethodGet: 53 | geHandler(w, r) 54 | default: 55 | http.Error(w, fmt.Sprintf("Invalid HTTP method %q, expected %q or 
%q", r.Method, http.MethodPut, http.MethodGet), http.StatusBadRequest) 56 | } 57 | } 58 | 59 | // PUT / 60 | // putHandler drains the request body and replies with the sample payload only when the body was read successfully. 61 | func putHandler(w http.ResponseWriter, r *http.Request) { 62 | escapePath := r.URL.EscapedPath() 63 | defer r.Body.Close() 64 | // readContent has already sent the error response when it returns false. 65 | if !readContent(w, r.Body, r.ContentLength, escapePath) { 66 | return 67 | } 68 | writeContent(w, escapePath) 69 | } 70 | 71 | // GET / 72 | func geHandler(w http.ResponseWriter, r *http.Request) { 73 | writeContent(w, r.URL.Path) 74 | } 75 | 76 | func logAndRespondError(w http.ResponseWriter, err error, msg string, status int) { 77 | logError(err, msg) 78 | http.Error(w, msg, status) 79 | } 80 | 81 | func logError(err error, msg string) { 82 | logger.Printf("%s: %v\n", msg, err) 83 | } 84 | 85 | // readContent discards the request body and, when a Content-Length was declared, checks that it matches the bytes read. 86 | // It returns true on success; on failure it writes a 400 response and returns false, so the caller must not write again. 87 | func readContent(w http.ResponseWriter, body io.ReadCloser, contentLength int64, path string) bool { 88 | n, err := io.Copy(io.Discard, body) 89 | 90 | if err != nil { 91 | logAndRespondError(w, err, fmt.Sprintf("Error reading request body for %q", path), http.StatusBadRequest) 92 | return false 93 | } 94 | if contentLength > 0 && contentLength != int64(n) { 95 | logAndRespondError(w, nil, fmt.Sprintf("Content length mismatch for %q", path), http.StatusBadRequest) 96 | return false 97 | } 98 | return true 99 | } 100 | 101 | func writeContent(w http.ResponseWriter, path string) { 102 | if _, err := w.Write([]byte("Hello World!")); err != nil { 103 | logError(err, fmt.Sprintf("Error writing response for %q", path)) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /bench/http-server/server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic HTTP Multithreaded Server. 3 | 4 | Steps to run: 5 | $ python server.py 6 | 7 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
8 | """ 9 | import argparse 10 | from http.server import HTTPServer, BaseHTTPRequestHandler 11 | from socketserver import ThreadingMixIn 12 | 13 | 14 | class Handler(BaseHTTPRequestHandler): 15 | def log_request(self, code="-", size="-"): 16 | # Don't log successful requests info. Unsuccessful logged by log_error(). 17 | pass 18 | 19 | def _set_headers(self): 20 | self.send_response(200) 21 | self.send_header("Content-Type", "text/plain") 22 | self.end_headers() 23 | 24 | def do_PUT(self): 25 | self._set_headers() 26 | self.wfile.write(b"Hello World!") 27 | 28 | def do_GET(self): 29 | if self.path == "/health": 30 | self._set_headers() 31 | self.wfile.write(b"Running") 32 | return 33 | self._set_headers() 34 | self.wfile.write(b"Hello World!") 35 | 36 | 37 | class ThreadedHTTPServer(ThreadingMixIn, HTTPServer): 38 | """Handle requests in a separate thread.""" 39 | 40 | 41 | def run(addr="localhost", port=8000): 42 | server = ThreadedHTTPServer((addr, port), Handler) 43 | print(f"Starting HTTP server on {addr}:{port}") 44 | server.serve_forever() 45 | 46 | 47 | if __name__ == "__main__": 48 | parser = argparse.ArgumentParser(description="Run a simple HTTP server") 49 | parser.add_argument( 50 | "-l", 51 | "--listen", 52 | default="localhost", 53 | help="Specify the IP address on which the server listens", 54 | ) 55 | parser.add_argument( 56 | "-p", 57 | "--port", 58 | type=int, 59 | default=8000, 60 | help="Specify the port on which the server listens", 61 | ) 62 | args = parser.parse_args() 63 | run(addr=args.listen, port=args.port) 64 | -------------------------------------------------------------------------------- /deploy/README.md: -------------------------------------------------------------------------------- 1 | ## tar2tf Demo - Docker 2 | 3 | tar2tf Docker deployment shows capabilities of tar2tf module. 4 | Within a docker instance it creates ready to use setup to interact with tar2tf. 
5 | 6 | ```console 7 | $ ./docker/start.sh 8 | ``` 9 | 10 | This command will build and start Docker container, output logs to the current terminal window, deploy AIS cluster with 11 | `tar-bucket` bucket, put necessary data and start Jupyter notebook server. 12 | 13 | To begin the demo, go to `localhost:8888` or to the link displayed by Jupyter in the console. 14 | Go to `examples/in_memory_notebook.ipynb` and interact with it. 15 | 16 | Please note that the first build might take a lot of time, as it has to fetch all necessary dependencies. 17 | Subsequent builds will be much faster, thanks to docker caching. 18 | 19 | To kill the docker, click Jupyter `Shut Down` button in the browser or send `kill` to the console. 20 | 21 | ### Datasets 22 | 23 | By default, `gs://lpr-imagenet/imagenet_train-{0000..0002}.tgz` tars will be downloaded and uploaded to `tar-bucket`. 24 | 25 | To use locally stored datasets, specify path to the directory in the command line with option `-v`. 26 | 27 | ```console 28 | $ ./docker/start.sh -v=/home/user/dataset/ 29 | ``` -------------------------------------------------------------------------------- /deploy/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.13-buster 2 | 3 | RUN apt-get clean && apt-get update &&\ 4 | set -eux &&\ 5 | apt-get --no-install-recommends -y install curl git ca-certificates wget vim python3-setuptools python3 python3-pip \ 6 | python3-venv sysstat attr net-tools iproute2 build-essential lsof iputils-ping fuse &&\ 7 | apt-get -y clean all 8 | 9 | RUN pip3 install awscli 10 | 11 | ARG cld_provider=0 12 | ENV CLD_PROVIDER ${cld_provider} 13 | 14 | COPY requirements-jupyter.txt . 15 | RUN pip3 install virtualenv && virtualenv -p /usr/bin/python3 /venv && . 
/venv/bin/activate && \ 16 | pip3 install -r requirements-jupyter.txt && rm requirements-jupyter.txt 17 | 18 | RUN mkdir -p $GOPATH/src/github.com/NVIDIA/ && git clone https://github.com/NVIDIA/aistore.git $GOPATH/src/github.com/NVIDIA/aistore && echo "$GOPATH/src/github.com/NVIDIA/" && ls $GOPATH/src/github.com/NVIDIA/ 19 | COPY . $GOPATH/src/github.com/NVIDIA/ais-tar2tf/ 20 | 21 | WORKDIR $GOPATH/src/github.com/NVIDIA/ais-tar2tf/ 22 | EXPOSE 8888 23 | 24 | ENTRYPOINT [ "sh", "-c", "$GOPATH/src/github.com/NVIDIA/ais-tar2tf/deploy/docker/entrypoint.sh" ] 25 | -------------------------------------------------------------------------------- /deploy/docker/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BUCKET=docker_local_bucket 4 | AISTORE_PATH=$GOPATH/src/github.com/NVIDIA/aistore 5 | DOCKER_DATA_DIR="/data" 6 | 7 | if [[ ${CLD_PROVIDER} == 1 ]]; then 8 | BUCKET=${HOSTNAME} 9 | aws s3api create-bucket --bucket ${BUCKET} --region ${AWS_DEFAULT_REGION} --create-bucket-configuration LocationConstraint=${AWS_DEFAULT_REGION} 10 | elif [[ ${CLD_PROVIDER} == 2 ]]; then 11 | BUCKET=smth # TODO: 12 | fi 13 | 14 | function cleanup { 15 | if [[ ${CLD_PROVIDER} == 1 ]]; then 16 | aws s3 rb s3://${BUCKET} --force 17 | elif [[ ${CLD_PROVIDER} == 2 ]]; then 18 | : # TODO: currently noop 19 | fi 20 | } 21 | trap cleanup EXIT 22 | 23 | pushd $AISTORE_PATH > /dev/null 24 | (echo -e "4\n4\n3\n${CLD_PROVIDER}" | make deploy) && make cli && sleep 5 25 | popd > /dev/null 26 | 27 | ais create bucket tar-bucket 28 | if [[ -d $DOCKER_DATA_DIR ]]; then 29 | find $DOCKER_DATA_DIR -type f -regex ".*\(.tar.gz\|.tar\|.tar.xz\|.tgz\|.txz\)" -exec ais put {} ais://tar-bucket --progress --verbose \; 30 | else 31 | ais show download $(ais start download "gs://lpr-imagenet/imagenet_train-{0000..0002}.tgz" ais://tar-bucket) --progress 32 | fi 33 | source /venv/bin/activate && jupyter lab --port=8888 --no-browser --ip=0.0.0.0 
--allow-root 34 | 35 | exit 36 | -------------------------------------------------------------------------------- /deploy/docker/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | S_PATH=$(cd "$(dirname "$0")"; pwd -P) 4 | AIS_TAR2TF_PATH=$(cd "$S_PATH/../.."; pwd -P) 5 | CONTAINER_NAME=ais-tar2tf 6 | TAG_NAME=ais-tar2tf 7 | CLD_PROVIDER=0 8 | RUN_FLAGS="" 9 | MOUNT_FLAG="" 10 | DOCKER_DATA_DIR="/data/" 11 | # Refuse to start when something already listens on the Jupyter port (check is skipped if netstat is unavailable). 12 | if command -v netstat >/dev/null 2>&1 && netstat -tln 2>/dev/null | grep -q ':8888[[:space:]]'; then 13 | echo "Make sure that nothing is listening on port 8888" 14 | exit 1 15 | fi 16 | 17 | for i in "$@"; do 18 | case ${i} in 19 | --name=*) 20 | CONTAINER_NAME="${i#*=}" 21 | shift # past argument=value 22 | ;; 23 | 24 | -v=*|--v=*) 25 | MOUNT_FLAG="-v ${i#*=}:${DOCKER_DATA_DIR}" 26 | shift 27 | ;; 28 | 29 | --aws=*) 30 | ENV_DIR="${i#*=}" 31 | ENV_DIR="${ENV_DIR/#\~/$HOME}" 32 | if [[ ! -d ${ENV_DIR} ]]; then 33 | echo "${ENV_DIR} is not directory" 34 | exit 1 35 | fi 36 | 37 | TMP_FILE=${ENV_DIR}/.aws.env 38 | cat ${ENV_DIR}/credentials > ${TMP_FILE} 39 | cat ${ENV_DIR}/config >> ${TMP_FILE} 40 | 41 | sed -i 's/\[default\]//g' ${TMP_FILE} 42 | sed -i 's/ = /=/g' ${TMP_FILE} 43 | sed -i 's/aws_access_key_id/AWS_ACCESS_KEY_ID/g' ${TMP_FILE} 44 | sed -i 's/aws_secret_access_key/AWS_SECRET_ACCESS_KEY/g' ${TMP_FILE} 45 | sed -i 's/region/AWS_DEFAULT_REGION/g' ${TMP_FILE} 46 | 47 | RUN_FLAGS="${RUN_FLAGS} --env-file ${TMP_FILE}" 48 | CLD_PROVIDER=1 49 | 50 | shift # past argument=value 51 | ;; 52 | 53 | -g|--gcp) 54 | CLD_PROVIDER=2 55 | shift # past argument 56 | ;; 57 | 58 | *) 59 | echo "Invalid usage" 60 | exit 1 61 | esac 62 | done 63 | 64 | if [[ -n $(docker ps -q -f name=${CONTAINER_NAME}) ]]; then 65 | echo "Container with ${CONTAINER_NAME} name already exists/running" 66 | exit 1 67 | fi 68 | 69 | function cleanup { 70 | rm -f ${AIS_TAR2TF_PATH}/.dockerignore 71 | } 72 | trap cleanup EXIT 
INT TERM 73 | 74 | set -e # don't allow errors in build and volume creation 75 | echo ".git" > ${AIS_TAR2TF_PATH}/.dockerignore 76 | docker volume create ${CONTAINER_NAME} # mount filesystem for docker so AIS can see that 77 | docker build -t $TAG_NAME -f ${S_PATH}/Dockerfile ${AIS_TAR2TF_PATH} \ 78 | --build-arg cld_provider=${CLD_PROVIDER} 79 | cleanup 80 | set +e # now we can allow fails 81 | 82 | docker run -it ${RUN_FLAGS} \ 83 | $MOUNT_FLAG \ 84 | --ulimit nofile=100000:100000 \ 85 | --name=${CONTAINER_NAME} \ 86 | --privileged \ 87 | -p 8888:8888 \ 88 | $TAG_NAME 89 | 90 | 91 | # Removing container and volume 92 | docker rm -f ${CONTAINER_NAME} > /dev/null 2>&1 93 | docker volume rm ${CONTAINER_NAME} > /dev/null 2>&1 94 | -------------------------------------------------------------------------------- /examples/imagenet_from_disk.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | from tensorflow.keras import layers 4 | 5 | from aistore.tf import Dataset, default_record_parser 6 | from aistore.tf.ops import Select, Decode, Convert, Resize 7 | 8 | 9 | def path_generator(): 10 | i = 1 11 | while True: 12 | yield "train.record-{}".format(i) 13 | i += 1 14 | 15 | 16 | EPOCHS = 10 17 | BATCH_SIZE = 20 18 | 19 | # ADJUST Dataset PARAMETERS BELOW 20 | 21 | BUCKET_NAME = "tar-bucket" 22 | PROXY_URL = "http://localhost:8080" 23 | 24 | # Create Dataset. 25 | # Values will be extracted from tar-records according to Resize(Convert(Decode("jpg"), tf.float32), (224, 224)) operation, 26 | # meaning that bytes under "jpg" in tar-record will be decoded as an image, converted to tf.float32 type and then Resized to (224, 224) 27 | # Labels will be extracted from tar-records according to Select("cls") operation, meaning that bytes under "cls" will be treated as label. 
28 | dataset = Dataset(BUCKET_NAME, PROXY_URL, [Decode("jpg"), Convert("jpg", tf.float32), Resize("jpg", (224, 224))], [Select("jpg"), Select("cls")]) 29 | 30 | # prepare your bucket, for example from `gsutil ls gs://lpr-gtc2020` 31 | # save multiple TFRecord files with max size 2MB to paths generated by path_generator 32 | train_records_files = dataset.load("train-{0..3}.tar", path=path_generator, max_shard_size="2MB", num_workers=4) 33 | # save TFRecord file to test.record path 34 | dataset.load("train-{4..7}.tar", path="test.record", num_workers=4) 35 | 36 | train_dataset = tf.data.TFRecordDataset(filenames=train_records_files) 37 | train_dataset = train_dataset.map(default_record_parser) 38 | train_dataset = train_dataset.shuffle(buffer_size=1024).batch(BATCH_SIZE) 39 | 40 | test_dataset = tf.data.TFRecordDataset(filenames=["test.record"]) 41 | test_dataset = test_dataset.map(default_record_parser).batch(BATCH_SIZE) 42 | 43 | # TRAINING PART BELOW 44 | 45 | inputs = keras.Input(shape=(224, 224, 3), name="images") 46 | x = layers.Flatten()(inputs) 47 | x = layers.Dense(64, activation="relu", name="dense_1")(x) 48 | x = layers.Dense(64, activation="relu", name="dense_2")(x) 49 | outputs = layers.Dense(10, name="predictions")(x) 50 | model = keras.Model(inputs=inputs, outputs=outputs) 51 | 52 | model.compile(optimizer=keras.optimizers.Adam(1e-4), loss=keras.losses.mean_squared_error, metrics=["acc"]) 53 | 54 | model.summary() 55 | 56 | model.fit(train_dataset, epochs=EPOCHS) 57 | result = model.evaluate(test_dataset) 58 | print(dict(zip(model.metrics_names, result))) 59 | dataset.stop() 60 | -------------------------------------------------------------------------------- /examples/imagenet_in_memory.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | from tensorflow.keras import layers 4 | 5 | from aistore.tf import Dataset 6 | from aistore.tf.ops import Decode, 
Convert, Resize 7 | 8 | EPOCHS = 5 9 | BATCH_SIZE = 20 10 | 11 | # ADJUST Dataset PARAMETERS BELOW 12 | 13 | BUCKET_NAME = "tar-bucket" 14 | PROXY_URL = "http://localhost:8080" 15 | 16 | # Create Dataset. 17 | # Values will be extracted from tar-records according to Resize(Convert(Decode("jpg"), tf.float32), (224, 224)) operation, 18 | # meaning that bytes under "jpg" in tar-record will be decoded as an image, converted to tf.float32 type and then Resized to (224, 224) 19 | # Labels will be extracted from tar-records according to Select("cls") operation, meaning that bytes under "cls" will be treated as label. 20 | conversions = [Decode("jpg"), Convert("jpg", tf.float32), Resize("jpg", (224, 224))] 21 | selections = ["jpg", "cls"] 22 | dataset = Dataset(BUCKET_NAME, PROXY_URL, conversions, selections) 23 | 24 | # prepare your bucket first with tars (for instance gsutil ls gs://lpr-gtc2020) 25 | train_dataset = dataset.load("train-{0..5}.tar", remote_exec=False, 26 | num_workers=4).prefetch(EPOCHS * BATCH_SIZE).shuffle(buffer_size=1024).batch(BATCH_SIZE) 27 | 28 | test_dataset = dataset.load("train-{5..10}.tar", remote_exec=False, num_workers=4).prefetch(BATCH_SIZE).batch(BATCH_SIZE) 29 | 30 | # TRAINING PART BELOW 31 | inputs = keras.Input(shape=(224, 224, 3), name="images") 32 | x = layers.Flatten()(inputs) 33 | x = layers.Dense(64, activation="relu", name="dense_1")(x) 34 | x = layers.Dense(64, activation="relu", name="dense_2")(x) 35 | outputs = layers.Dense(10, name="predictions")(x) 36 | model = keras.Model(inputs=inputs, outputs=outputs) 37 | 38 | model.compile(optimizer=keras.optimizers.Adam(1e-4), loss=keras.losses.mean_squared_error, metrics=["acc"]) 39 | model.summary() 40 | 41 | model.fit(train_dataset, epochs=EPOCHS) 42 | result = model.evaluate(test_dataset) 43 | print(dict(zip(model.metrics_names, result))) 44 | -------------------------------------------------------------------------------- /runtime/Makefile: 
-------------------------------------------------------------------------------- 1 | SUBDIRS := $(wildcard */.) 2 | 3 | all: $(SUBDIRS) 4 | $(SUBDIRS): 5 | $(MAKE) -C $@ 6 | 7 | .PHONY: all $(SUBDIRS) 8 | -------------------------------------------------------------------------------- /runtime/README.md: -------------------------------------------------------------------------------- 1 | Here are placed Dockerfiles for different runtimes used by ETL build. 2 | In each runtime package there is `Makefile` that should be used to build and push the images. 3 | 4 | Current list of runtimes: 5 | * Python: 6 | * `runtime_python:3.9v2` -> `python3.9` is used. 7 | * `runtime_python:3.10v2` -> `python3.10` is used. 8 | * `runtime_python:3.11v2` -> `python3.11` is used. 9 | * `runtime_python:3.12v2` -> `python3.12` is used. 10 | * `runtime_python:3.13v2` -> `python3.13` is used. 11 | -------------------------------------------------------------------------------- /runtime/python/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON_VERSION 2 | 3 | FROM docker.io/library/python:${PYTHON_VERSION}-alpine 4 | 5 | # Quote the requirement: unquoted, the shell parses ">=1.14.0" as an output redirection 6 | RUN pip3 install --upgrade "aistore[etl]>=1.14.0" 7 | 8 | # Set working directory 9 | RUN mkdir /code 10 | WORKDIR /code 11 | 12 | # Copy app code 13 | COPY bootstrap.py server.py ./ 14 | 15 | # Environment setup 16 | ENV PYTHONUNBUFFERED=1 17 | 18 | # Expose the default port 19 | EXPOSE 8000 20 | -------------------------------------------------------------------------------- /runtime/python/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | all: 3 | @REGISTRY_URL=$(REGISTRY_URL) RUNTIME_TAG_MODIFIER=$(RUNTIME_TAG_MODIFIER) bash builder.sh 4 | -------------------------------------------------------------------------------- /runtime/python/builder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 
4 | 5 | # Defines mapping: "runtime name" -> "python version". 6 | declare -A python_versions=( 7 | [3.9]="3.9" 8 | [3.10]="3.10" 9 | [3.11]="3.11" 10 | [3.12]="3.12" 11 | [3.13]="3.13" 12 | ) 13 | 14 | for runtime_name in "${!python_versions[@]}"; do 15 | echo "BUILDING AND PUSHING ${REGISTRY_URL}/runtime_python:${runtime_name}${RUNTIME_TAG_MODIFIER}" 16 | echo "PYTHON_VERSION=${python_versions[${runtime_name}]}" 17 | docker build --pull --no-cache \ 18 | -t "${REGISTRY_URL}/runtime_python:${runtime_name}${RUNTIME_TAG_MODIFIER}" \ 19 | --build-arg PYTHON_VERSION="${python_versions[${runtime_name}]}" \ 20 | . 21 | docker push "${REGISTRY_URL}/runtime_python:${runtime_name}${RUNTIME_TAG_MODIFIER}" 22 | done 23 | -------------------------------------------------------------------------------- /runtime/python/io-comm/Makefile: -------------------------------------------------------------------------------- 1 | 2 | build: 3 | go build -o server 4 | -------------------------------------------------------------------------------- /runtime/python/io-comm/cmn.go: -------------------------------------------------------------------------------- 1 | // Package main is an entry point to ioComm server 2 | /* 3 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | 6 | package main 7 | 8 | import ( 9 | "errors" 10 | "fmt" 11 | "io/ioutil" 12 | "log" 13 | "net/http" 14 | "runtime/debug" 15 | "strconv" 16 | ) 17 | 18 | const ( 19 | headerContentLength = "Content-Length" 20 | headerContentType = "Content-Type" 21 | 22 | getContentType = "binary/octet-stream" 23 | ) 24 | 25 | func invalidMsgHandler(w http.ResponseWriter, errCode int, format string, a ...interface{}) { 26 | logErrorf(format, a...) 
27 | w.Header().Set("Content-type", "text/plain") 28 | w.WriteHeader(errCode) 29 | w.Write([]byte(fmt.Sprintf(format, a...))) 30 | } 31 | 32 | func setResponseHeaders(header http.Header, size int64) { 33 | header.Set(headerContentLength, strconv.FormatInt(size, 10)) 34 | header.Set(headerContentType, getContentType) 35 | } 36 | 37 | // Returns an error with message if status code was > 200 38 | func wrapHttpError(resp *http.Response, err error) (*http.Response, error) { 39 | if err != nil { 40 | return resp, err 41 | } 42 | 43 | if resp.StatusCode > http.StatusOK { 44 | if resp.Body == nil { 45 | return resp, errors.New(resp.Status) 46 | } 47 | b, err := ioutil.ReadAll(resp.Body) 48 | if err != nil { 49 | return resp, err 50 | } 51 | return resp, fmt.Errorf("%s %s", resp.Status, string(b)) 52 | } 53 | 54 | return resp, nil 55 | } 56 | 57 | // logErrorf logs the formatted message prefixed with the current stack trace. 58 | // The trace is passed as an argument, never spliced into the format string, so 59 | // a '%' inside it cannot be misread as a formatting verb. 60 | func logErrorf(format string, a ...interface{}) { 61 | log.Printf("%s : "+format, append([]interface{}{debug.Stack()}, a...)...) 62 | } 63 | -------------------------------------------------------------------------------- /runtime/python/io-comm/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/ais-etl/runtime/python/io-comm 2 | 3 | go 1.18 4 | -------------------------------------------------------------------------------- /runtime/python/io-comm/main.go: -------------------------------------------------------------------------------- 1 | // Package main is an entry point to ioComm server 2 | /* 3 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
4 | */ 5 | package main 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "io" 11 | "log" 12 | "net/http" 13 | "os" 14 | "os/exec" 15 | "strings" 16 | ) 17 | 18 | var ( 19 | aisTargetURL string 20 | endpoint string 21 | 22 | client *http.Client 23 | ) 24 | 25 | func initVars(ipAddress string, port int) { 26 | endpoint = fmt.Sprintf("%s:%d", ipAddress, port) 27 | aisTargetURL = os.Getenv("AIS_TARGET_URL") 28 | client = &http.Client{} 29 | } 30 | 31 | func main() { 32 | var ( 33 | ipAddressArg = flag.String("l", "0.0.0.0", "Specify the IP address on which the server listens") 34 | portArg = flag.Int("p", 80, "Specify the port on which the server listens") 35 | ) 36 | 37 | flag.Parse() 38 | 39 | initVars(*ipAddressArg, *portArg) 40 | 41 | http.HandleFunc("/", ioHandler) 42 | http.HandleFunc("/health", healthHandler) 43 | 44 | log.Printf("Starting io comm server at %s", endpoint) 45 | log.Fatal(http.ListenAndServe(endpoint, nil)) 46 | } 47 | 48 | func healthHandler(w http.ResponseWriter, r *http.Request) { 49 | switch r.Method { 50 | case http.MethodGet: 51 | w.WriteHeader(http.StatusOK) 52 | w.Write([]byte("Running")) 53 | default: 54 | invalidMsgHandler(w, http.StatusBadRequest, "invalid http method %s", r.Method) 55 | } 56 | 57 | } 58 | 59 | func ioHandler(w http.ResponseWriter, r *http.Request) { 60 | switch r.Method { 61 | case http.MethodPut: 62 | ioPutHandler(w, r) 63 | case http.MethodGet: 64 | ioGetHandler(w, r) 65 | default: 66 | invalidMsgHandler(w, http.StatusBadRequest, "invalid http method %s", r.Method) 67 | } 68 | } 69 | 70 | // ioPutHandler handles PUT /: it runs the command given by the `command` query values, pipes the request body to its stdin and streams its stdout back as the response. 71 | func ioPutHandler(w http.ResponseWriter, r *http.Request) { 72 | command, ok := r.URL.Query()["command"] 73 | if !ok || command[0] == "" { 74 | invalidMsgHandler(w, http.StatusBadRequest, "missing command to execute") 75 | return 76 | } 77 | 78 | w.Header().Set("Content-Type", "application/octet-stream") 79 | // TODO: validate command to execute (Security!) 80 | cmd := exec.Command(command[0], command[1:]...) 
81 | stdin, err := cmd.StdinPipe() 82 | if err != nil { 83 | invalidMsgHandler(w, http.StatusInternalServerError, "failed to create stdin pipe, err: %v", err) 84 | return 85 | } 86 | 87 | // Send stdout straight to the response: Run does not return until that copy is done, 88 | // so nothing writes to w after the handler returns. 89 | cmd.Stdout = w 90 | cmd.Stderr = os.Stderr 91 | go func() { 92 | io.Copy(stdin, r.Body) 93 | stdin.Close() 94 | }() 95 | 96 | // The response may already be partially written, so a failure can only be logged. 97 | if err = cmd.Run(); err != nil { 98 | logErrorf("failed to exec command, err: %v", err) 99 | } 100 | } 101 | 102 | // GET / 103 | func ioGetHandler(w http.ResponseWriter, r *http.Request) { 104 | if aisTargetURL == "" { 105 | invalidMsgHandler(w, http.StatusBadRequest, "missing env variable AIS_TARGET_URL") 106 | return 107 | } 108 | 109 | path := strings.TrimPrefix(r.URL.EscapedPath(), "/") 110 | if path == "health" { 111 | return 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /runtime/python/server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entry point for launching a deserialized ETL server instance. 3 | This module reads a base64-encoded ETL class definition from the 4 | ETL_CLASS_PAYLOAD environment variable, deserializes it into a subclass 5 | of `ETLServer`, and instantiates it. 6 | 7 | This file is intended to be used by uvicorn/gunicorn like: 8 | uvicorn server:server.app --workers=4 ... 9 | 10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
11 | """ 12 | 13 | import os 14 | from typing import Type 15 | import logging 16 | 17 | from aistore.sdk.etl.webserver.base_etl_server import ETLServer 18 | from aistore.sdk.etl.webserver.utils import deserialize_class 19 | 20 | logging.basicConfig( 21 | level=logging.INFO, 22 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 23 | ) 24 | # ------------------------------------------------------------------------------ 25 | # Load and validate payload 26 | # ------------------------------------------------------------------------------ 27 | ETL_CLASS_PAYLOAD: str = os.getenv("ETL_CLASS_PAYLOAD", "") 28 | if not ETL_CLASS_PAYLOAD: 29 | raise RuntimeError("ETL_CLASS_PAYLOAD environment variable is not set") 30 | 31 | # ------------------------------------------------------------------------------ 32 | # Deserialize the ETL class and instantiate the server 33 | # ------------------------------------------------------------------------------ 34 | try: 35 | ETLClass: Type[ETLServer] = deserialize_class(ETL_CLASS_PAYLOAD) 36 | except Exception as e: 37 | raise RuntimeError(f"Failed to deserialize ETL class: {e}") from e 38 | server = ETLClass() 39 | -------------------------------------------------------------------------------- /transformers/FFmpeg/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/python:3.13-alpine 2 | 3 | # Install ffmpeg 4 | RUN apk add --no-cache ffmpeg 5 | 6 | # Quote the requirement: unquoted, the shell parses ">=1.13.6" as an output redirection 7 | RUN pip3 install --upgrade "aistore[etl]>=1.13.6" 8 | 9 | # Set working directory 10 | RUN mkdir /code 11 | WORKDIR /code 12 | 13 | # Copy app code 14 | COPY flask_server.py fastapi_server.py http_server.py ./ 15 | 16 | # Environment setup 17 | ENV PYTHONUNBUFFERED=1 18 | 19 | # Expose the default port 20 | EXPOSE 8000 21 | -------------------------------------------------------------------------------- /transformers/FFmpeg/Makefile: -------------------------------------------------------------------------------- 
1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_ffmpeg:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_ffmpeg:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/FFmpeg/fastapi_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | FFmpeg ETL Transformer (Fast-API) 3 | 4 | This module implements an ETL transformer as a FastAPI-based server 5 | that transform audio files into WAV format with control over 6 | Audio Channels (`AC`) and Audio Rate (`AR`) with help of FFmpeg utility. 7 | 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 9 | """ 10 | 11 | import os 12 | import subprocess 13 | 14 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer 15 | 16 | 17 | class FFmpegServer(FastAPIServer): 18 | """ 19 | FastAPI-based server for FFmpeg-based ETL transformation. 20 | """ 21 | 22 | def __init__(self, host: str = "0.0.0.0", port: int = 8000): 23 | super().__init__(host=host, port=port) 24 | # configure from environment or defaults 25 | self.channels = os.getenv("AC", "1") 26 | self.samplerate = os.getenv("AR", "44100") 27 | # base ffmpeg command, reading from stdin, writing WAV to stdout 28 | self.ffmpeg_cmd = [ 29 | "ffmpeg", 30 | "-nostdin", 31 | "-loglevel", 32 | "error", 33 | "-i", 34 | "pipe:0", 35 | "-ac", 36 | self.channels, 37 | "-ar", 38 | self.samplerate, 39 | "-c:a", 40 | "pcm_s16le", 41 | "-f", 42 | "wav", 43 | "pipe:1", 44 | ] 45 | self.audio_exts = {".wav", ".flac", ".mp3", ".m4a", ".opus", ".ogg"} 46 | 47 | def transform(self, data: bytes, path: str, _etl_args: str) -> bytes: 48 | """ 49 | Run FFmpeg to convert raw audio into WAV format. 50 | Raises an RuntimeError on FFmpeg failure. 
51 | """ 52 | ext = os.path.splitext(path)[1].lower() 53 | # If it doesn’t look like audio, just pass it back without processing it 54 | if ext not in self.audio_exts: 55 | return data 56 | 57 | with subprocess.Popen( 58 | self.ffmpeg_cmd, 59 | stdin=subprocess.PIPE, 60 | stdout=subprocess.PIPE, 61 | stderr=subprocess.PIPE, 62 | ) as proc: 63 | out, err = proc.communicate(input=data) 64 | if proc.returncode != 0: 65 | msg = err.decode("utf-8", errors="ignore").strip() 66 | self.logger.error("FFmpeg error: %s", msg) 67 | raise RuntimeError(f"FFmpeg process failed: {msg}") 68 | return out 69 | 70 | def get_mime_type(self) -> str: 71 | """ 72 | Return the MIME type for the transformed data. 73 | """ 74 | return "audio/wav" 75 | 76 | 77 | # Create the server instance and expose the FastAPI app 78 | fastapi_server = FFmpegServer(port=8000) 79 | fastapi_server.logger.setLevel("DEBUG") 80 | fastapi_app = fastapi_server.app # Expose the FastAPI app 81 | -------------------------------------------------------------------------------- /transformers/FFmpeg/flask_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | FFmpeg ETL Transformer (Flask) 3 | 4 | This module implements an ETL transformer as a Flask-based server 5 | that transforms audio files into WAV format with control over 6 | Audio Channels (`AC`) and Audio Rate (`AR`) with the help of the FFmpeg utility. 7 | 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 9 | """ 10 | 11 | import os 12 | import subprocess 13 | 14 | from aistore.sdk.etl.webserver.flask_server import FlaskServer 15 | 16 | 17 | class FFmpegServer(FlaskServer): 18 | """ 19 | Flask-based server for FFmpeg-based ETL transformation. 
20 | """ 21 | 22 | def __init__(self, host: str = "0.0.0.0", port: int = 8000): 23 | super().__init__(host=host, port=port) 24 | # configure from environment or defaults 25 | self.channels = os.getenv("AC", "1") 26 | self.samplerate = os.getenv("AR", "44100") 27 | # base ffmpeg command, reading from stdin, writing WAV to stdout 28 | self.ffmpeg_cmd = [ 29 | "ffmpeg", 30 | "-nostdin", 31 | "-loglevel", 32 | "error", 33 | "-i", 34 | "pipe:0", 35 | "-ac", 36 | self.channels, 37 | "-ar", 38 | self.samplerate, 39 | "-c:a", 40 | "pcm_s16le", 41 | "-f", 42 | "wav", 43 | "pipe:1", 44 | ] 45 | self.audio_exts = {".wav", ".flac", ".mp3", ".m4a", ".opus", ".ogg"} 46 | 47 | def transform(self, data: bytes, path: str, _etl_args: str) -> bytes: 48 | """ 49 | Run FFmpeg to convert raw audio into WAV format. 50 | Raises an error on FFmpeg failure. 51 | """ 52 | ext = os.path.splitext(path)[1].lower() 53 | # If it doesn’t look like audio, just pass it back without processing it 54 | if ext not in self.audio_exts: 55 | return data 56 | 57 | with subprocess.Popen( 58 | self.ffmpeg_cmd, 59 | stdin=subprocess.PIPE, 60 | stdout=subprocess.PIPE, 61 | stderr=subprocess.PIPE, 62 | ) as proc: 63 | out, err = proc.communicate(input=data) 64 | if proc.returncode != 0: 65 | msg = err.decode("utf-8", errors="ignore").strip() 66 | self.logger.error("FFmpeg error: %s", msg) 67 | raise RuntimeError(f"FFmpeg process failed: {msg}") 68 | return out 69 | 70 | def get_mime_type(self) -> str: 71 | """ 72 | Return the MIME type for the transformed data. 
73 | """ 74 | return "audio/wav" 75 | 76 | 77 | flask_server = FFmpegServer(port=8000) 78 | flask_server.logger.setLevel("DEBUG") 79 | flask_app = flask_server.app 80 | -------------------------------------------------------------------------------- /transformers/FFmpeg/http_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | FFmpeg ETL Transformer (HTTP-based Server) 3 | 4 | This module implements an ETL transformer as a multi-threaded HTTP server 5 | that transforms audio files into WAV format with control over 6 | Audio Channels (`AC`) and Audio Rate (`AR`) with the help of the FFmpeg utility. 7 | 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 9 | """ 10 | 11 | import os 12 | import subprocess 13 | 14 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer 15 | 16 | 17 | class FFmpegServer(HTTPMultiThreadedServer): 18 | """ 19 | Multi-threaded HTTP server for FFmpeg-based ETL transformation. 20 | """ 21 | 22 | def __init__(self, host: str = "0.0.0.0", port: int = 8000): 23 | super().__init__(host=host, port=port) 24 | # configure from environment or defaults; NOTE(review): AC defaults to "2" here but "1" in the Flask/FastAPI variants - confirm this is intended 25 | self.channels = os.getenv("AC", "2") 26 | self.samplerate = os.getenv("AR", "44100") 27 | # base ffmpeg command, reading from stdin, writing WAV to stdout 28 | self.ffmpeg_cmd = [ 29 | "ffmpeg", 30 | "-nostdin", 31 | "-loglevel", 32 | "error", 33 | "-i", 34 | "pipe:0", 35 | "-ac", 36 | str(self.channels), 37 | "-ar", 38 | str(self.samplerate), 39 | "-c:a", 40 | "pcm_s16le", 41 | "-f", 42 | "wav", 43 | "pipe:1", 44 | ] 45 | self.audio_exts = {".wav", ".flac", ".mp3", ".m4a", ".opus", ".ogg"} 46 | 47 | def transform(self, data: bytes, path: str, _etl_args: str) -> bytes: 48 | """ 49 | Run FFmpeg to convert raw audio into WAV format. 50 | Raises an error on FFmpeg failure. 
51 | """ 52 | ext = os.path.splitext(path)[1].lower() 53 | # If it doesn’t look like audio, just pass it back without processing it 54 | if ext not in self.audio_exts: 55 | return data 56 | 57 | with subprocess.Popen( 58 | self.ffmpeg_cmd, 59 | stdin=subprocess.PIPE, 60 | stdout=subprocess.PIPE, 61 | stderr=subprocess.PIPE, 62 | ) as proc: 63 | out, err = proc.communicate(input=data) 64 | if proc.returncode != 0: 65 | msg = err.decode("utf-8", errors="ignore").strip() 66 | self.logger.error("FFmpeg error: %s", msg) 67 | raise RuntimeError(f"FFmpeg process failed: {msg}") 68 | return out 69 | 70 | def get_mime_type(self) -> str: 71 | """ 72 | Return the MIME type for the transformed data. 73 | """ 74 | return "audio/wav" 75 | 76 | 77 | if __name__ == "__main__": 78 | server = FFmpegServer() 79 | server.logger.setLevel("DEBUG") 80 | server.start() 81 | -------------------------------------------------------------------------------- /transformers/FFmpeg/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-nemo-ffmpeg 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: "hpull://" 8 | wait_timeout: 5m 9 | support_direct_put: "true" 10 | spec: 11 | containers: 12 | - name: server 13 | image: aistorage/transformer_nemo_ffmpeg:latest 14 | imagePullPolicy: Always 15 | ports: 16 | - name: default 17 | containerPort: 8000 18 | # for flask based app 19 | # command: ["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"] 20 | # for http based app 21 | # command: ["python", "http_server.py"] 22 | # for fastapi based app 23 | command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"] 24 | readinessProbe: 25 | httpGet: 26 | path: /health 27 | port: default 28 | env: 29 | - name: AR 30 | value: "16000" 31 | - name: AC 32 | 
value: "1" 33 | # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 34 | # where the objects are stored on AIStore targets. This allows the ETL container 35 | # to access the files directly by absolute path. 36 | # volumeMounts: 37 | # - name: sda 38 | # mountPath: /ais/sda 39 | # - name: sdb 40 | # mountPath: /ais/sdb 41 | # ... 42 | # volumes: 43 | # - name: sda 44 | # hostPath: 45 | # path: /ais/sda 46 | # type: Directory 47 | # - name: sdb 48 | # hostPath: 49 | # path: /ais/sdb 50 | # type 51 | # ... -------------------------------------------------------------------------------- /transformers/Makefile: -------------------------------------------------------------------------------- 1 | common_deps: 2 | pip install -r tests/requirements.txt 3 | -------------------------------------------------------------------------------- /transformers/NeMo/audio_split_consolidate/README.md: -------------------------------------------------------------------------------- 1 | # AIStore Audio Split & Consolidate Transformer 2 | 3 | This transformer splits and consolidates audio files using a [JSONL](https://jsonlines.org/) manifest file as input. It extracts segments specified in the manifest, consolidates them, and returns the result as a tarball. 4 | 5 | This transformer consists of two components: 6 | 7 | 1. **Audio Manager** – Processes the manifest and dispatches splitting tasks. 8 | 2. **Audio Splitter** – Splits individual audio files based on instructions from the Audio Manager. 9 | 10 | --- 11 | 12 | ## Why two separate transformers? 13 | 14 | Using separate transformers ensures scalability through distributed processing. A single transformer combining both roles would not scale efficiently, as audio files might not reside on the same node, causing performance issues due to unnecessary data movement between nodes. Separating the roles allows efficient distributed processing across the AIStore cluster. 
15 | 16 | ![Audio Split Consolidate Overview](audio_split_consolidate_diagram.png) 17 | 18 | --- 19 | 20 | ## Example Input Manifest 21 | 22 | `manifest.jsonl`: 23 | ```json 24 | {"id": "youtube_vid_id_1", "part": 1, "from_time": 0.36, "to_time": 2.36} 25 | {"id": "youtube_vid_id_1", "part": 2, "from_time": 3.36, "to_time": 9.36} 26 | {"id": "youtube_vid_id_2", "part": 1, "from_time": 0.0, "to_time": 4.0} 27 | ``` 28 | 29 | Output: 30 | - A tarball (`manifest.tar`) containing: 31 | - `youtube_vid_id_1_1` 32 | - `youtube_vid_id_1_2` 33 | - `youtube_vid_id_2_1` 34 | 35 | Each file will contain audio trimmed to the specified duration. 36 | 37 | --- 38 | 39 | ## How to Get Started 40 | 41 | ### Step 1: Prepare the Manifest 42 | 43 | Create a JSON Lines (`.jsonl`) file where each line contains: 44 | - `id`: Identifier of the audio file. 45 | - `part`: Part number. 46 | - `from_time` and `to_time`: Segment duration. 47 | 48 | --- 49 | 50 | ## Deploy ETLs 51 | 52 | ### Audio Splitter ETL 53 | 54 | Review and edit the configuration ([`audio_splitter/pod.yaml`](audio_splitter/pod.yaml)) as needed. 55 | 56 | ```bash 57 | ais etl init spec --from-file audio_splitter/pod.yaml --comm-type hpush --name audio-splitter 58 | ``` 59 | 60 | ### Audio Manager ETL 61 | 62 | Review and edit the configuration ([`audio_manager/pod.yaml`](audio_manager/pod.yaml)), ensuring settings match your environment. 63 | 64 | ```bash 65 | ais etl init spec --from-file audio_manager/pod.yaml --comm-type hpush --name audio-manager 66 | ``` 67 | 68 | Ensure the manifest file is accessible by the Audio Manager. 
69 | 70 | --- 71 | 72 | ## Run Transformations 73 | 74 | ### Single Manifest File 75 | 76 | ```bash 77 | ais etl object audio-manager ais://manifests/manifest.jsonl manifest.tar 78 | ``` 79 | 80 | ### Batch Operation (Multiple Manifest Files - Bucket Transform) 81 | 82 | ```bash 83 | ais etl bucket audio-manager ais://bench_manifests ais://output_bucket --ext "{jsonl:tar}" 84 | ``` 85 | 86 | This will process each `.jsonl` file in the source bucket and output consolidated audio tarballs (`.tar`) into the specified output bucket. 87 | 88 | 89 | ## Performance 90 | 91 | Our [benchmark](../../benchmarks/audio_split_consolidate.py) demonstrates that using our ETL can accelerate data processing by **up to 13x** compared to single-threaded local execution. Performance scales **linearly** with the number of targets and disks in the AIStore cluster. -------------------------------------------------------------------------------- /transformers/NeMo/audio_split_consolidate/audio_manager/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/python:3.13-alpine 2 | 3 | # Quote the requirement: unquoted, the shell parses ">=1.13.6" as an output redirection 4 | RUN pip3 install --upgrade "aistore[etl]>=1.13.6" 5 | 6 | # Set working directory 7 | RUN mkdir /code 8 | WORKDIR /code 9 | 10 | # Copy app code 11 | COPY fastapi_server.py ./ 12 | 13 | # Environment setup 14 | ENV PYTHONUNBUFFERED=1 15 | 16 | # Expose the default port 17 | EXPOSE 8000 18 | -------------------------------------------------------------------------------- /transformers/NeMo/audio_split_consolidate/audio_manager/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | 4 | REGISTRY_URL ?= docker.io/aistorage 5 | 6 | ifeq ($(GIT_TEST), true) 7 | TAG := test 8 | endif 9 | 10 | all: build push 11 | 12 | build: 13 | docker build -t $(REGISTRY_URL)/transformer_audio_manager:$(TAG) . 
14 | 15 | push: 16 | docker push $(REGISTRY_URL)/transformer_audio_manager:$(TAG) 17 | -------------------------------------------------------------------------------- /transformers/NeMo/audio_split_consolidate/audio_manager/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-audio-manager 5 | annotations: 6 | communication_type: "hpull://" 7 | wait_timeout: 10m 8 | spec: 9 | containers: 10 | - name: server 11 | image: aistorage/transformer_audio_manager:latest 12 | imagePullPolicy: Always 13 | ports: 14 | - name: default 15 | containerPort: 8000 16 | command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"] 17 | readinessProbe: 18 | httpGet: 19 | path: /health 20 | port: default 21 | env: 22 | # AIS endpoint 23 | - name: AIS_ENDPOINT 24 | value: "http://:51080" 25 | # Bucket name of the audio files 26 | - name: SRC_BUCKET 27 | value: "" 28 | # Provider of the audio files (ais, gcp, aws) 29 | - name: SRC_PROVIDER 30 | value: "ais" 31 | # Prefix of the audio files 32 | - name: OBJ_PREFIX 33 | value: "" 34 | # Extension of the audio files 35 | - name: OBJ_EXTENSION 36 | value: "wav" 37 | # ETL Name of the Audio Splitter ETL you previously initialised 38 | - name: ETL_NAME 39 | value: "" 40 | # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 41 | # where the objects are stored on AIStore targets. This allows the ETL container 42 | # to access the files directly by absolute path. 
43 | # volumeMounts: 44 | # - name: ais 45 | # mountPath: /tmp/ 46 | # volumes: 47 | # - name: ais 48 | # hostPath: 49 | # path: /tmp/ 50 | # type: Directory 51 | -------------------------------------------------------------------------------- /transformers/NeMo/audio_split_consolidate/audio_split_consolidate_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/NeMo/audio_split_consolidate/audio_split_consolidate_diagram.png -------------------------------------------------------------------------------- /transformers/NeMo/audio_split_consolidate/audio_splitter/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/python:3.13-alpine 2 | 3 | # need this for soundfile 4 | RUN apk add --no-cache libsndfile-dev 5 | # Install dependencies (the requirement is quoted so the shell does not parse ">=1.13.6" as an output redirection) 6 | RUN pip3 install --upgrade "aistore[etl]>=1.13.6" soundfile 7 | 8 | # Set working directory 9 | RUN mkdir /code 10 | WORKDIR /code 11 | 12 | # Copy app code 13 | COPY fastapi_server.py ./ 14 | 15 | # Environment setup 16 | ENV PYTHONUNBUFFERED=1 17 | 18 | # Expose the default port 19 | EXPOSE 8000 20 | -------------------------------------------------------------------------------- /transformers/NeMo/audio_split_consolidate/audio_splitter/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | 4 | ifeq ($(GIT_TEST), true) 5 | TAG := test 6 | endif 7 | 8 | REGISTRY_URL ?= docker.io/aistorage 9 | 10 | all: build push 11 | 12 | build: 13 | docker build -t $(REGISTRY_URL)/transformer_audio_splitter:$(TAG) . 
14 | 15 | push: 16 | docker push $(REGISTRY_URL)/transformer_audio_splitter:$(TAG) 17 | -------------------------------------------------------------------------------- /transformers/NeMo/audio_split_consolidate/audio_splitter/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-audio-splitter 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: "hpull://" 8 | wait_timeout: 10m 9 | spec: 10 | containers: 11 | - name: server 12 | image: aistorage/transformer_audio_splitter:latest 13 | imagePullPolicy: Always 14 | ports: 15 | - name: default 16 | containerPort: 8000 17 | command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"] 18 | readinessProbe: 19 | httpGet: 20 | path: /health 21 | port: default 22 | # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 23 | # where the objects are stored on AIStore targets. This allows the ETL container 24 | # to access the files directly by absolute path. 
25 | # volumeMounts: 26 | # - name: ais 27 | # mountPath: /tmp/ 28 | # volumes: 29 | # - name: ais 30 | # hostPath: 31 | # path: /tmp/ 32 | # type: Directory 33 | -------------------------------------------------------------------------------- /transformers/batch_rename/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/python:3.13-alpine 2 | 3 | # Quote the requirement: unquoted, the shell parses ">=1.13.6" as an output redirection 4 | RUN pip3 install --upgrade "aistore[etl]>=1.13.6" 5 | 6 | # Set working directory 7 | RUN mkdir /code 8 | WORKDIR /code 9 | 10 | # Copy app code 11 | COPY fastapi_server.py ./ 12 | 13 | # Environment setup 14 | ENV PYTHONUNBUFFERED=1 15 | 16 | # Expose the default port 17 | EXPOSE 8000 18 | -------------------------------------------------------------------------------- /transformers/batch_rename/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build --no-cache -t $(REGISTRY_URL)/transformer_batch_rename:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_batch_rename:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/batch_rename/README.md: -------------------------------------------------------------------------------- 1 | # Batch Rename Transformer 2 | 3 | The **Batch Rename Transformer** reads objects from a source bucket, and if their path matches a given regex pattern, it writes them to a destination bucket with a modified name (prefixed path). This is useful in ETL pipelines where data normalization, path restructuring, or archival tagging is needed. 4 | 5 | Even if an object does not match the pattern, the transformer still returns the original object bytes to the caller. 
This allows it to support both inline and offline transformation modes seamlessly. 6 | 7 | It's basically a copy operation: your data will be copied to the new path. Users are responsible for deleting the old objects. 8 | 9 | The transformer supports both `hpull` and `hpush` communication mechanisms, enabling seamless integration into AIStore-based pipelines. 10 | 11 | > For more information on ETL communication mechanisms, see [AIStore ETL Documentation](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms). 12 | 13 | --- 14 | 15 | ### Environment Variables 16 | 17 | | Variable | Description | Required | 18 | | --------------------- | --------------------------------------------------------- | -------- | 19 | | `AIS_ENDPOINT` | URL of the AIStore proxy (e.g., `http://ais-proxy:51080`) | ✅ Yes | 20 | | `DST_BUCKET` | Name of the destination bucket | ✅ Yes | 21 | | `DST_BUCKET_PROVIDER` | Provider for the destination bucket (default: `ais`) | No | 22 | | `FILE_PATTERN` | Regex pattern to match source object names | ✅ Yes | 23 | | `DST_PREFIX` | Prefix to prepend to renamed object paths | ✅ Yes | 24 | 25 | --- 26 | 27 | ### Initializing ETL with AIStore CLI 28 | 29 | Follow these steps to initialize the batch rename transformer using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md): 30 | 31 | ```bash 32 | $ cd transformers/batch_rename 33 | 34 | # Set communication type: either 'hpull://' or 'hpush://' 35 | $ export COMMUNICATION_TYPE='hpull://' 36 | 37 | # Initialize the ETL with a chosen name 38 | $ ais etl init spec --from-file init_spec.yaml --name <etl-name> --comm-type "$COMMUNICATION_TYPE" 39 | 40 | # Inline transformation (single object) 41 | # If the object matches the pattern, it will be renamed and saved to the destination bucket. 42 | # The content will also be returned to the caller.
43 | $ ais etl object <etl-name> ais://<src-bucket>/<object-name> - 44 | 45 | # (Optional) Discard content if not needed 46 | $ ais etl object <etl-name> ais://<src-bucket>/<object-name> /dev/null 47 | 48 | # To run transformation offline (bucket-to-bucket) 49 | $ ais etl bucket <etl-name> ais://<src-bucket> ais://<dst-bucket> 50 | ``` 51 | -------------------------------------------------------------------------------- /transformers/batch_rename/fastapi_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | A FastAPI-based ETL server that renames objects based on a regex pattern 3 | and stores them to a destination bucket with a new prefix. 4 | 5 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 6 | """ 7 | 8 | import os 9 | import re 10 | 11 | from aistore import Client 12 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer 13 | 14 | 15 | class BatchRenameServer(FastAPIServer): 16 | """ 17 | ETL server that renames input objects based on a pattern match. 18 | 19 | If the object path matches the regex pattern defined by FILE_PATTERN, 20 | the object is renamed by applying DST_PREFIX and written to DST_BUCKET.
21 | 22 | Environment Variables: 23 | FILE_PATTERN - Regex pattern to match object paths (required) 24 | DST_PREFIX - Prefix to apply to renamed objects (required) 25 | DST_BUCKET - Destination bucket name (required) 26 | DST_BUCKET_PROVIDER - Storage provider for the destination bucket (default: "ais") 27 | AIS_ENDPOINT - AIStore endpoint URL (required) 28 | """ 29 | 30 | def __init__(self, host: str = "0.0.0.0", port: int = 8000): 31 | super().__init__(host=host, port=port) 32 | self.pattern = os.getenv("FILE_PATTERN") or self._fatal("FILE_PATTERN") 33 | self.prefix = os.getenv("DST_PREFIX") or self._fatal("DST_PREFIX") 34 | self.dst_bucket = os.getenv("DST_BUCKET") or self._fatal("DST_BUCKET") 35 | self.ais_endpoint = os.getenv("AIS_ENDPOINT") or self._fatal("AIS_ENDPOINT") 36 | self.dst_provider = os.getenv("DST_BUCKET_PROVIDER", "ais") 37 | self.ais_client = Client(self.ais_endpoint, timeout=None) 38 | 39 | @staticmethod 40 | def _fatal(var: str) -> None: 41 | """Raise an error for missing required environment variables.""" 42 | raise ValueError(f"Environment variable '{var}' is required") 43 | 44 | def transform(self, data: bytes, path: str, *_): 45 | """ 46 | Rename and redirect matching input object to a new path in the destination bucket. 47 | 48 | Args: 49 | data (bytes): Object content. 50 | path (str): Original object path. 51 | 52 | Returns: 53 | bytes: The original object content (unmodified). 
54 | """ 55 | if re.search(self.pattern, path): 56 | new_path = f"{self.prefix}{os.path.basename(path)}" 57 | # TODO: Add directly to target option 58 | self.ais_client.bucket(self.dst_bucket, provider=self.dst_provider).object( 59 | new_path 60 | ).get_writer().put_content(data) 61 | return data 62 | 63 | 64 | # Initialize the ETL server and expose the FastAPI application 65 | fastapi_server = BatchRenameServer() 66 | fastapi_server.logger.setLevel("DEBUG") 67 | fastapi_app = fastapi_server.app 68 | -------------------------------------------------------------------------------- /transformers/batch_rename/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-batch-rename 5 | annotations: 6 | # Values it can take ["hpull://", "hpush://"] 7 | communication_type: "hpull://" 8 | wait_timeout: 5m 9 | support_direct_put: "true" 10 | spec: 11 | containers: 12 | - name: server 13 | image: aistorage/transformer_batch_rename:latest 14 | imagePullPolicy: Always 15 | ports: 16 | - name: default 17 | containerPort: 8000 18 | # Adjust the num of workers based on the number of CPU cores available 19 | command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"] 20 | readinessProbe: 21 | httpGet: 22 | path: /health 23 | port: default 24 | env: 25 | # Required: AIStore endpoint 26 | - name: AIS_ENDPOINT 27 | value: "http://:51080" 28 | 29 | # Required: Destination bucket to write renamed objects 30 | - name: DST_BUCKET 31 | value: "" 32 | 33 | # Optional: Provider for the destination bucket (default: ais) 34 | - name: DST_BUCKET_PROVIDER 35 | value: "ais" 36 | 37 | # Required: Regex pattern to match files 38 | - name: FILE_PATTERN 39 | # all .flac files 40 | value: '.*\.flac$' 41 | 42 | # Required: Prefix to apply to renamed files 43 | - name: DST_PREFIX 44 | value: "renamed/" 45 | 
-------------------------------------------------------------------------------- /transformers/compress/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/python:3.11-alpine 2 | 3 | COPY requirements.txt requirements.txt 4 | RUN pip3 install -r requirements.txt 5 | 6 | RUN mkdir /code 7 | WORKDIR /code 8 | COPY server.py server.py 9 | 10 | ENV PYTHONUNBUFFERED 1 11 | 12 | EXPOSE 80 13 | -------------------------------------------------------------------------------- /transformers/compress/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_compress:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_compress:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/compress/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-compress 5 | annotations: 6 | # Values `communication_type` can take are ["hpull://", "hpush://"]. 
7 | # Visit https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms 8 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 9 | wait_timeout: 5m 10 | spec: 11 | containers: 12 | - name: server 13 | image: aistorage/transformer_compress:latest 14 | imagePullPolicy: IfNotPresent 15 | ports: 16 | - name: default 17 | containerPort: 80 18 | # For more information on additional arguments, please refer to 19 | # https://github.com/NVIDIA/ais-etl/blob/main/transformers/compress/README.md 20 | command: ['/code/server.py', '--listen', '0.0.0.0', '--port', '80'] 21 | env: 22 | # COMPRESS_OPTIONS is a dictionary of COMPRESS parameters, which includes `mode` and `compression`. 23 | # For more information, refer to https://github.com/NVIDIA/ais-etl/blob/main/transformers/compress/README.md. 24 | - name: COMPRESS_OPTIONS 25 | value: ${COMPRESS_OPTIONS:-"{}"} 26 | readinessProbe: 27 | httpGet: 28 | path: /health 29 | port: default 30 | -------------------------------------------------------------------------------- /transformers/compress/requirements.txt: -------------------------------------------------------------------------------- 1 | requests -------------------------------------------------------------------------------- /transformers/compress/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
5 | # 6 | 7 | import argparse 8 | import bz2 9 | import gzip 10 | import json 11 | import logging 12 | import os 13 | 14 | from http.server import HTTPServer, BaseHTTPRequestHandler 15 | from socketserver import ThreadingMixIn 16 | 17 | import requests 18 | 19 | host_target = os.environ["AIS_TARGET_URL"] 20 | compress_options = json.loads(os.environ["COMPRESS_OPTIONS"]) 21 | 22 | if "mode" not in compress_options: 23 | mode = "compress" 24 | else: 25 | mode = compress_options["mode"] 26 | 27 | if "compression" not in compress_options: 28 | compression = "gzip" 29 | else: 30 | compression = compress_options["compression"] 31 | 32 | 33 | class Handler(BaseHTTPRequestHandler): 34 | # Overriding log_request to not log successful requests 35 | def log_request(self, code="-", size="-"): 36 | pass 37 | 38 | # Set standard headers for responses 39 | def _set_headers(self): 40 | self.send_response(200) 41 | self.send_header("Content-Type", "application/octet-stream") 42 | self.end_headers() 43 | 44 | def process_data(self, data): 45 | if mode == "compress" and compression == "gzip": 46 | return gzip.compress(data) 47 | if mode == "compress" and compression == "bz2": 48 | return bz2.compress(data) 49 | if mode == "decompress" and compression == "gzip": 50 | return gzip.decompress(data) 51 | if mode == "decompress" and compression == "bz2": 52 | return bz2.decompress(data) 53 | raise ValueError( 54 | f"Unsupported data processing mode ({mode}) or compression algorithm ({compression})" 55 | ) 56 | 57 | # PUT handler supports `hpush` operation 58 | def do_PUT(self): 59 | try: 60 | content_length = int(self.headers["Content-Length"]) 61 | post_data = self.rfile.read(content_length) 62 | processed_data = self.process_data(post_data) 63 | self._set_headers() 64 | self.wfile.write(processed_data) 65 | except Exception as exception: 66 | logging.error("Error processing PUT request: %s", str(exception)) 67 | self.send_response(500) 68 | self.end_headers() 69 | self.wfile.write(b"Data 
processing failed") 70 | 71 | # GET handler supports `hpull` operation 72 | def do_GET(self): 73 | try: 74 | if self.path == "/health": 75 | self._set_headers() 76 | self.wfile.write(b"Running") 77 | return 78 | 79 | response = requests.get(host_target + self.path) 80 | processed_data = self.process_data(response.content) 81 | 82 | self._set_headers() 83 | self.wfile.write(processed_data) 84 | 85 | except Exception as exception: 86 | logging.error("Error processing GET request: %s", str(exception)) 87 | self.send_response(500) 88 | self.end_headers() 89 | self.wfile.write(b"Data processing failed") 90 | 91 | 92 | class ThreadedHTTPServer(ThreadingMixIn, HTTPServer): 93 | """Handle requests in a separate thread.""" 94 | 95 | 96 | def run(addr, port): 97 | server = ThreadedHTTPServer((addr, port), Handler) 98 | print(f"Starting HTTP server on {addr}:{port}") 99 | server.serve_forever() 100 | 101 | 102 | if __name__ == "__main__": 103 | parser = argparse.ArgumentParser(description="Run a simple HTTP server") 104 | parser.add_argument( 105 | "-l", 106 | "--listen", 107 | help="Specify the IP address on which the server listens", 108 | ) 109 | parser.add_argument( 110 | "-p", 111 | "--port", 112 | type=int, 113 | help="Specify the port on which the server listens", 114 | ) 115 | args = parser.parse_args() 116 | run(addr=args.listen, port=args.port) 117 | -------------------------------------------------------------------------------- /transformers/echo/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/python:3.13-alpine 2 | 3 | RUN pip3 install --upgrade aistore[etl]>=1.13.6 4 | 5 | # Set working directory 6 | RUN mkdir /code 7 | WORKDIR /code 8 | 9 | # Copy app code 10 | COPY flask_server.py fastapi_server.py http_server.py ./ 11 | 12 | # Environment setup 13 | ENV PYTHONUNBUFFERED=1 14 | 15 | # Expose the default port 16 | EXPOSE 8000 17 | 
-------------------------------------------------------------------------------- /transformers/echo/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build --no-cache -t $(REGISTRY_URL)/transformer_echo:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_echo:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/echo/README.md: -------------------------------------------------------------------------------- 1 | # Echo Transformer 2 | 3 | A simple echo transformer that takes objects (bytes) and simply echoes or repeats those bytes back as output. It's a simple and straightforward way to demonstrate or test the functionality of your container pod. An echo transformer might be used for debugging, understanding how data flows through a system, or verifying that certain processes are functioning as expected. 4 | 5 | The transformer supports both `hpull` and `hpush` communication mechanisms for seamless integration. 6 | 7 | > **Note:** This transformer uses [`FastAPI`](https://fastapi.tiangolo.com/) as the framework and [`Gunicorn`](https://gunicorn.org/) + [Uvicorn](https://www.uvicorn.org/) as the web server; a multithreaded HTTP server version of the same implementation is under the [`http-multithreaded-server`](/http-multithreaded-server/) folder. 8 | 9 | > For more information on communication mechanisms, please refer to [this link](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms).
10 | 11 | ### Initializing ETL with AIStore CLI 12 | 13 | The following steps demonstrate how to initialize the echo transformer using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md): 14 | 15 | ```bash 16 | $ cd transformers/echo 17 | 18 | $ # Mention communication type b/w target and container 19 | $ export COMMUNICATION_TYPE='hpull://' 20 | 21 | # Substitute env variables in spec file 22 | $ envsubst < pod.yaml > init_spec.yaml 23 | 24 | $ # Initialize ETL 25 | $ ais etl init spec --from-file init_spec.yaml --name <etl-name> --comm-type "hpull://" 26 | 27 | $ # Transform and retrieve objects from the bucket using this ETL 28 | $ # For inline transformation 29 | $ ais etl object <etl-name> ais://<bucket-name>/<object-name>.<ext> - 30 | 31 | $ # Or, for offline (bucket-to-bucket) transformation 32 | $ ais etl bucket <etl-name> ais://src-bck ais://dst-bck 33 | ``` -------------------------------------------------------------------------------- /transformers/echo/fastapi_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | A FastAPI echo server that returns the input data as output. 3 | 4 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 5 | """ 6 | 7 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer 8 | 9 | 10 | class EchoServerFastAPI(FastAPIServer): 11 | """ 12 | A simple echo server using FastAPI that returns the input data as output. 13 | """ 14 | 15 | def transform(self, data, *_args): 16 | return data 17 | 18 | 19 | # Create the server instance and expose the FastAPI app 20 | fastapi_server = EchoServerFastAPI(port=8000) 21 | fastapi_server.logger.setLevel("DEBUG") 22 | fastapi_app = fastapi_server.app # Expose the FastAPI app 23 | -------------------------------------------------------------------------------- /transformers/echo/flask_server.py: -------------------------------------------------------------------------------- 1 | """A simple echo server that returns the input data as output.
2 | 3 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | 5 | """ 6 | 7 | from aistore.sdk.etl.webserver.flask_server import FlaskServer 8 | 9 | 10 | class EchoServerFlask(FlaskServer): 11 | """ 12 | A simple echo server that returns the input data as output. 13 | """ 14 | 15 | def transform(self, data, *_args): 16 | return data 17 | 18 | 19 | flask_server = EchoServerFlask(port=8000) 20 | flask_server.logger.setLevel("DEBUG") 21 | flask_app = flask_server.app 22 | -------------------------------------------------------------------------------- /transformers/echo/http_server.py: -------------------------------------------------------------------------------- 1 | """A simple echo server that returns the input data as output. 2 | 3 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | 5 | """ 6 | 7 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer 8 | 9 | 10 | class EchoServer(HTTPMultiThreadedServer): 11 | """ 12 | A simple echo server that returns the input data as output. 
13 | """ 14 | 15 | def transform(self, data, *_args): 16 | return data 17 | 18 | 19 | if __name__ == "__main__": 20 | echo_server = EchoServer(port=8000) 21 | echo_server.logger.setLevel("DEBUG") 22 | echo_server.start() 23 | -------------------------------------------------------------------------------- /transformers/echo/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-echo 5 | annotations: 6 | # Values it can take ["hpull://", "hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 8 | wait_timeout: 5m 9 | support_direct_put: "true" 10 | spec: 11 | containers: 12 | - name: server 13 | image: aistorage/transformer_echo:latest 14 | imagePullPolicy: Always 15 | ports: 16 | - name: default 17 | containerPort: 8000 18 | # for flask based app 19 | # command: ["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"] 20 | # for http based app 21 | # command: ["python", "http_server.py"] 22 | # for fastapi based app 23 | command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"] 24 | readinessProbe: 25 | httpGet: 26 | path: /health 27 | port: default 28 | # volumeMounts: 29 | # - name: ais 30 | # mountPath: /tmp/ 31 | # volumes: 32 | # - name: ais 33 | # hostPath: 34 | # path: /tmp/ 35 | # type: Directory 36 | -------------------------------------------------------------------------------- /transformers/face_detection/Dockerfile: -------------------------------------------------------------------------------- 1 | # Prior to building this image make you own kaggle_creds.json file 2 | # containing kaggle keys to download dataset 3 | FROM docker.io/library/python:3.8-slim 4 | 5 | WORKDIR / 6 | 7 | # install packages needed for open-cv to work 8 | RUN apt-get update && apt-get -y install gcc ffmpeg libsm6 libxext6 unzip curl 9 | 10 
| # install python dependencies 11 | COPY ./requirements.txt requirements.txt 12 | RUN pip3 install --no-cache-dir --upgrade -r requirements.txt 13 | 14 | # Make .kaggle directory and copy creds 15 | RUN mkdir ~/.kaggle 16 | COPY kaggle_creds.json /root/.kaggle/kaggle.json 17 | # NOTE(security): this COPY bakes the credentials into an image layer; the `rm` in the download step does not remove them from that layer. Prefer a BuildKit secret mount (`RUN --mount=type=secret`). 18 | # Give read and write permissions to kaggle.json 19 | RUN chmod 600 /root/.kaggle/kaggle.json 20 | 21 | # Create a directory to store the model 22 | RUN mkdir model 23 | 24 | # Download the dataset 25 | RUN kaggle datasets download -d sambitmukherjee/caffe-face-detector-opencv-pretrained-model && \ 26 | unzip caffe-face-detector-opencv-pretrained-model.zip -d model/ && \ 27 | rm caffe-face-detector-opencv-pretrained-model.zip && \ 28 | rm /root/.kaggle/kaggle.json 29 | 30 | COPY main.py main.py 31 | 32 | ENV PYTHONUNBUFFERED 1 33 | 34 | ENV LOG_LEVEL DEBUG 35 | 36 | EXPOSE 8000 37 | -------------------------------------------------------------------------------- /transformers/face_detection/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_face_detection:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_face_detection:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/face_detection/README.md: -------------------------------------------------------------------------------- 1 | # Face Detection Using Single Shot Multibox Detector (SSD) Model 2 | 3 | This document outlines the process of utilizing the `Single Shot MultiBox Detector (SSD)` model for face detection in images. The SSD model predicts and places bounding boxes over faces in an image.
For further reading on the SSD model, visit the [research paper](https://arxiv.org/abs/1512.02325). 4 | 5 | ![output](sample/output_face_detection.png) 6 | 7 | > **Note**: Due to size constraints, the model's weights and architecture are not included in this directory. They are pre-loaded in the transformer's Docker [image](https://hub.docker.com/r/aistorage/transformer_face_detection). 8 | 9 | ## Image Format Specification 10 | 11 | The image formats (`jpeg`, `png`, etc.) for processing or storage are defined in the [`pod.yaml`](pod.yaml) file. 12 | 13 | ## Transformer Communication Mechanisms 14 | 15 | The transformer is compatible with `hpull` and `hpush` for seamless integration. Detailed information about these communication mechanisms can be found [here](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms). 16 | 17 | ## Recommended Parameter Setting 18 | 19 | For efficient transformation, use `fqn` as `ARG_TYPE` in the [`pod.yaml`](pod.yaml) file. This approach allows for local object reading from the target, reducing the time required for each transformation. 20 | 21 | ## Web Server Framework 22 | 23 | The transformer employs the [`FastAPI`](https://fastapi.tiangolo.com/) framework, and uses [`Gunicorn`](https://gunicorn.org/) and [Uvicorn](https://www.uvicorn.org/) as the web server combination. 24 | 25 | ## Configurable Parameters 26 | 27 | Adjust the following parameters in the `pod.yaml` file as per your requirements: 28 | 29 | | Argument | Description | Default Value | 30 | |------------|---------------------------------------------------------------------|---------------| 31 | | `FORMAT` | Image format for processing/storing (png, jpeg, etc.) | "jpeg" | 32 | | `ARG_TYPE` | Local object reading (`fqn`) vs. 
HTTP request for object retrieval | "" | 33 | | `FILE_FORMAT` | Configure as "tar" for processing datasets in the webdataset format or for handling batches of images packaged in a tarball | "" | 34 | 35 | ### Setting Up the Face Detection Transformer with AIStore CLI 36 | 37 | To initialize the `Face Detection Transformer` using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md), follow these steps: 38 | 39 | ```bash 40 | # Navigate to the transformer directory 41 | cd transformers/face_detection 42 | 43 | # Set FORMAT, ARG_TYPE and FILE_FORMAT environment variables 44 | export FORMAT="jpeg" 45 | export ARG_TYPE="" # Or use 'fqn' for local reading 46 | export FILE_FORMAT="" # or use "tar", if using webdataset format 47 | 48 | # Define communication type 49 | export COMMUNICATION_TYPE="hpush://" 50 | 51 | # Generate an initialization specification file 52 | envsubst < pod.yaml > init_spec.yaml 53 | 54 | # Initialize the ETL process 55 | ais etl init spec --from-file init_spec.yaml --name <etl-name> --comm-type $COMMUNICATION_TYPE 56 | 57 | # Use the ETL for transforming and retrieving objects 58 | # For inline transformation 59 | ais etl object <etl-name> ais://src/<image-name>.JPEG dst.JPEG 60 | 61 | # For offline (bucket-to-bucket) transformation 62 | ais etl bucket <etl-name> ais://src-bck ais://dst-bck --ext="{jpg:jpg}" 63 | 64 | # or, if using webdataset style format 65 | # ais etl bucket <etl-name> ais://src-bck ais://dst-bck --ext="{tar:tar}" 66 | ``` -------------------------------------------------------------------------------- /transformers/face_detection/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-face-detection 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: "${COMMUNICATION_TYPE}" 8 | wait_timeout: 5m 9 | spec: 10 | containers: 11 | - name: server 12 | image: aistorage/transformer_face_detection:latest 13 | imagePullPolicy: Always 14 | 
ports: 15 | - name: default 16 | containerPort: 8000 17 | command: ["gunicorn", "main:app", "--workers", "5", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"] 18 | env: 19 | - name: FORMAT 20 | # Expected Values - png, jpeg, etc. 21 | value: "${FORMAT}" 22 | - name: ARG_TYPE 23 | value: "${ARG_TYPE}" 24 | - name: FILE_FORMAT 25 | value: "${FILE_FORMAT}" 26 | # This is a health check endpoint which one should specify 27 | # for aistore to determine the health of the ETL container. 28 | readinessProbe: 29 | httpGet: 30 | path: /health 31 | port: default 32 | # volume mounts and volumes are needed if you are planning to use ARG_TYPE = `fqn` 33 | volumeMounts: 34 | - name: ais 35 | mountPath: /mnt/data/ais 36 | volumes: 37 | - name: ais 38 | hostPath: 39 | path: /mnt/data/ais 40 | type: Directory 41 | -------------------------------------------------------------------------------- /transformers/face_detection/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.109.1 2 | uvicorn[standard] 3 | gunicorn 4 | aiohttp>=3.9.2 5 | numpy 6 | opencv-python 7 | aiofiles 8 | kaggle==1.5.16 9 | webdataset==0.2.86 10 | Pillow>=10.0.1 -------------------------------------------------------------------------------- /transformers/face_detection/sample/output_face_detection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/face_detection/sample/output_face_detection.png -------------------------------------------------------------------------------- /transformers/go_FFmpeg/Dockerfile: -------------------------------------------------------------------------------- 1 | # Stage 1: Build the binary 2 | FROM docker.io/library/golang:1.24-alpine AS builder 3 | 4 | RUN apk add --no-cache git 5 | 6 | WORKDIR /app 7 | 8 | COPY src/go.mod src/go.sum 
./ 9 | RUN go mod download 10 | 11 | COPY src/ ./ 12 | RUN go build -o go_ffmpeg 13 | 14 | # Stage 2: Minimal runtime image 15 | FROM alpine:3.19 16 | 17 | # Install ffmpeg 18 | RUN apk add --no-cache ffmpeg 19 | 20 | WORKDIR /app 21 | COPY --from=builder /app/go_ffmpeg . 22 | 23 | EXPOSE 8000 24 | CMD ["./go_ffmpeg"] 25 | -------------------------------------------------------------------------------- /transformers/go_FFmpeg/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_ffmpeg_go:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_ffmpeg_go:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/go_FFmpeg/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: ffmpeg-go 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 8 | wait_timeout: 5m 9 | support_direct_put: "true" 10 | spec: 11 | containers: 12 | - name: server 13 | image: aistorage/transformer_ffmpeg_go:latest 14 | imagePullPolicy: Always 15 | ports: 16 | - name: default 17 | containerPort: 8000 18 | command: ['./go_ffmpeg', '-l', '0.0.0.0', '-p', '8000'] 19 | readinessProbe: 20 | httpGet: 21 | path: /health 22 | port: default 23 | -------------------------------------------------------------------------------- /transformers/go_FFmpeg/src/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/ais-etl/transformers/go_ffmpeg/src 2 | 3 | go 1.24 4 | 5 | require github.com/NVIDIA/aistore v1.3.29-0.20250514164659-82fcb58b08f3 6 | 7 | 
require ( 8 | github.com/OneOfOne/xxhash v1.2.8 // indirect 9 | github.com/beorn7/perks v1.0.1 // indirect 10 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 11 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 12 | github.com/emicklei/go-restful/v3 v3.12.2 // indirect 13 | github.com/fxamacker/cbor/v2 v2.8.0 // indirect 14 | github.com/go-logr/logr v1.4.2 // indirect 15 | github.com/go-openapi/jsonpointer v0.21.1 // indirect 16 | github.com/go-openapi/jsonreference v0.21.0 // indirect 17 | github.com/go-openapi/swag v0.23.1 // indirect 18 | github.com/gogo/protobuf v1.3.2 // indirect 19 | github.com/golang/protobuf v1.5.4 // indirect 20 | github.com/google/gnostic-models v0.6.9 // indirect 21 | github.com/google/go-cmp v0.7.0 // indirect 22 | github.com/google/gofuzz v1.2.0 // indirect 23 | github.com/google/uuid v1.6.0 // indirect 24 | github.com/gorilla/websocket v1.5.3 // indirect 25 | github.com/josharian/intern v1.0.0 // indirect 26 | github.com/json-iterator/go v1.1.12 // indirect 27 | github.com/karrick/godirwalk v1.17.0 // indirect 28 | github.com/lufia/iostat v1.2.1 // indirect 29 | github.com/mailru/easyjson v0.9.0 // indirect 30 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 31 | github.com/modern-go/reflect2 v1.0.2 // indirect 32 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 33 | github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c // indirect 34 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 35 | github.com/pkg/errors v0.9.1 // indirect 36 | github.com/prometheus/client_golang v1.22.0 // indirect 37 | github.com/prometheus/client_model v0.6.2 // indirect 38 | github.com/prometheus/common v0.63.0 // indirect 39 | github.com/prometheus/procfs v0.16.0 // indirect 40 | github.com/spf13/pflag v1.0.6 // indirect 41 | github.com/teris-io/shortid v0.0.0-20220617161101-71ec9f2aa569 // indirect 42 | github.com/tinylib/msgp v1.2.5 // indirect 43 | 
github.com/x448/float16 v0.8.4 // indirect 44 | golang.org/x/net v0.39.0 // indirect 45 | golang.org/x/oauth2 v0.29.0 // indirect 46 | golang.org/x/sync v0.13.0 // indirect 47 | golang.org/x/sys v0.32.0 // indirect 48 | golang.org/x/term v0.31.0 // indirect 49 | golang.org/x/text v0.24.0 // indirect 50 | golang.org/x/time v0.11.0 // indirect 51 | google.golang.org/protobuf v1.36.6 // indirect 52 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 53 | gopkg.in/inf.v0 v0.9.1 // indirect 54 | gopkg.in/yaml.v3 v3.0.1 // indirect 55 | k8s.io/api v0.32.3 // indirect 56 | k8s.io/apimachinery v0.32.3 // indirect 57 | k8s.io/client-go v0.32.3 // indirect 58 | k8s.io/klog/v2 v2.130.1 // indirect 59 | k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect 60 | k8s.io/metrics v0.32.3 // indirect 61 | k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e // indirect 62 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 63 | sigs.k8s.io/randfill v1.0.0 // indirect 64 | sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect 65 | sigs.k8s.io/yaml v1.4.0 // indirect 66 | ) 67 | -------------------------------------------------------------------------------- /transformers/go_FFmpeg/src/main.go: -------------------------------------------------------------------------------- 1 | // Package main is implementation of FFmpeg transformation in golang. 2 | /* 3 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
4 | */ 5 | package main 6 | 7 | import ( 8 | "bytes" 9 | "flag" 10 | "fmt" 11 | "io" 12 | "log" 13 | "os" 14 | "os/exec" 15 | "path/filepath" 16 | "strings" 17 | 18 | "github.com/NVIDIA/aistore/cmn/cos" 19 | "github.com/NVIDIA/aistore/ext/etl/webserver" 20 | ) 21 | // FFmpegServer is an ETL server that re-encodes audio through ffmpeg using the configured channel count and sample rate. 22 | type FFmpegServer struct { 23 | webserver.ETLServer 24 | channels string 25 | samplerate string 26 | } 27 | 28 | var audioExts = cos.NewStrSet(".wav", ".flac", ".mp3", ".m4a", ".opus", ".ogg") 29 | // Transform pipes recognized audio (by file extension of path) through ffmpeg and returns 16-bit PCM WAV; any other input is returned unchanged. args is unused and input is not closed here. 30 | func (fs *FFmpegServer) Transform(input io.ReadCloser, path, args string) (io.ReadCloser, error) { 31 | ext := strings.ToLower(filepath.Ext(path)) 32 | if !audioExts.Contains(ext) { 33 | // If it's not an audio file we recognize, return as-is 34 | buf, err := io.ReadAll(input) 35 | if err != nil { 36 | return nil, fmt.Errorf("reading input: %w", err) 37 | } 38 | return io.NopCloser(bytes.NewReader(buf)), nil 39 | } 40 | 41 | cmd := exec.Command("ffmpeg", 42 | "-nostdin", 43 | "-loglevel", "error", 44 | "-i", "pipe:0", 45 | "-ac", fs.channels, 46 | "-ar", fs.samplerate, 47 | "-c:a", "pcm_s16le", 48 | "-f", "wav", 49 | "pipe:1", 50 | ) 51 | cmd.Stderr = &bytes.Buffer{} 52 | cmd.Stdin = input 53 | out, err := cmd.Output() // TODO: use cmd.StdoutPipe() to achieve better concurrency 54 | if err != nil { 55 | errMsg := cmd.Stderr.(*bytes.Buffer).String() 56 | return nil, fmt.Errorf("ffmpeg error: %w: %s", err, strings.TrimSpace(errMsg)) // keep err: stderr is empty when ffmpeg cannot be started at all 57 | } 58 | return io.NopCloser(bytes.NewReader(out)), nil 59 | } 60 | 61 | var _ webserver.ETLServer = (*FFmpegServer)(nil) 62 | 63 | func main() { 64 | listenAddr := flag.String("l", "0.0.0.0", "IP address to listen on") 65 | port := flag.Int("p", 8000, "Port to listen on") 66 | flag.Parse() 67 | 68 | svr := &FFmpegServer{} 69 | if svr.channels = os.Getenv("AC"); svr.channels == "" { 70 | svr.channels = "1" 71 | } 72 | if svr.samplerate = os.Getenv("AR"); svr.samplerate == "" { 73 | svr.samplerate = "44100" 74 | } 75 | 76 | if err := webserver.Run(svr, *listenAddr, *port); err != 
nil { 77 | log.Fatalf("Failed to start server: %v", err) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /transformers/go_FFmpeg/src/main_test.go: -------------------------------------------------------------------------------- 1 | // Package main is implementation of FFmpeg transformation in golang. 2 | /* 3 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package main 6 | 7 | import ( 8 | "bytes" 9 | "io" 10 | "os" 11 | "testing" 12 | 13 | "github.com/NVIDIA/aistore/tools/tassert" 14 | "github.com/NVIDIA/aistore/tools/tlog" 15 | ) 16 | 17 | // NOTE: This test requires ffmpeg to be installed and available in the PATH. 18 | func TestFFmpegTransform(t *testing.T) { 19 | filename := "../../tests/resources/test-audio-wav.wav" 20 | input, err := os.Open(filename) 21 | tassert.CheckError(t, err) 22 | 23 | // Send it to the ETL server 24 | svr := &FFmpegServer{ 25 | channels: "1", 26 | samplerate: "44100", 27 | } 28 | 29 | transformed, err := svr.Transform(input, filename, "") 30 | tassert.CheckError(t, err) 31 | 32 | output, err := io.ReadAll(transformed) 33 | tlog.Logf("Transformed output size: %d\n", len(output)) 34 | tassert.CheckError(t, err) 35 | tassert.Fatalf(t, bytes.HasPrefix(output, []byte("RIFF")), "Output is not a valid WAV file") 36 | } 37 | 38 | // NOTE: This test requires ffmpeg to be installed and available in the PATH. 
39 | func TestFFmpegTransformMP3(t *testing.T) { 40 | filename := "../../tests/resources/test-audio-mp3.mp3" 41 | input, err := os.Open(filename) 42 | tassert.CheckError(t, err) 43 | 44 | svr := &FFmpegServer{ 45 | channels: "1", 46 | samplerate: "16000", // downsample to emphasize transformation 47 | } 48 | 49 | // Run the transform 50 | transformed, err := svr.Transform(input, filename, "") 51 | tassert.CheckError(t, err) 52 | 53 | // Read result 54 | output, err := io.ReadAll(transformed) 55 | tassert.CheckError(t, err) 56 | 57 | tlog.Logf("Transformed output size: %d bytes\n", len(output)) 58 | tlog.Logln(string(output[:10])) 59 | 60 | // Assert basic WAV structure 61 | tassert.Fatalf(t, bytes.HasPrefix(output, []byte("RIFF")), "Missing RIFF header") 62 | tassert.Fatalf(t, bytes.Contains(output, []byte("WAVEfmt ")), "Missing WAVE format chunk") 63 | tassert.Fatalf(t, bytes.Contains(output, []byte("data")), "Missing data chunk") 64 | 65 | // Make sure it's not identical to input (to verify it's transformed) 66 | input.Seek(0, io.SeekStart) 67 | original, err := io.ReadAll(input) 68 | tassert.CheckError(t, err) 69 | tassert.Fatalf(t, !bytes.Equal(output, original), "Output should not be identical to input") 70 | } 71 | -------------------------------------------------------------------------------- /transformers/go_echo/Dockerfile: -------------------------------------------------------------------------------- 1 | # Stage 1: Build the binary 2 | FROM docker.io/library/golang:1.24-alpine AS builder 3 | 4 | RUN apk add --no-cache git 5 | 6 | WORKDIR /app 7 | 8 | COPY src/go.mod src/go.sum ./ 9 | RUN go mod download 10 | 11 | COPY src/ ./ 12 | RUN go build -o echo 13 | 14 | # Stage 2: Minimal runtime image 15 | FROM alpine:3.19 16 | 17 | WORKDIR /app 18 | COPY --from=builder /app/echo . 
19 | 20 | EXPOSE 8000 21 | CMD ["./echo"] 22 | -------------------------------------------------------------------------------- /transformers/go_echo/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_echo_go:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_echo_go:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/go_echo/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: echo-go 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 8 | wait_timeout: 5m 9 | support_direct_put: "true" 10 | spec: 11 | containers: 12 | - name: server 13 | image: aistorage/transformer_echo_go:latest 14 | imagePullPolicy: Always 15 | ports: 16 | - name: default 17 | containerPort: 8000 18 | command: ['./echo', '-l', '0.0.0.0', '-p', '8000'] 19 | readinessProbe: 20 | httpGet: 21 | path: /health 22 | port: default 23 | -------------------------------------------------------------------------------- /transformers/go_echo/src/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/ais-etl/transformers/go_echo/src 2 | 3 | go 1.24 4 | 5 | require github.com/NVIDIA/aistore v1.3.29-0.20250514164659-82fcb58b08f3 6 | 7 | require ( 8 | github.com/OneOfOne/xxhash v1.2.8 // indirect 9 | github.com/beorn7/perks v1.0.1 // indirect 10 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 11 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 12 | github.com/emicklei/go-restful/v3 v3.12.2 // 
indirect 13 | github.com/fxamacker/cbor/v2 v2.8.0 // indirect 14 | github.com/go-logr/logr v1.4.2 // indirect 15 | github.com/go-openapi/jsonpointer v0.21.1 // indirect 16 | github.com/go-openapi/jsonreference v0.21.0 // indirect 17 | github.com/go-openapi/swag v0.23.1 // indirect 18 | github.com/gogo/protobuf v1.3.2 // indirect 19 | github.com/golang/protobuf v1.5.4 // indirect 20 | github.com/google/gnostic-models v0.6.9 // indirect 21 | github.com/google/go-cmp v0.7.0 // indirect 22 | github.com/google/gofuzz v1.2.0 // indirect 23 | github.com/google/uuid v1.6.0 // indirect 24 | github.com/gorilla/websocket v1.5.3 // indirect 25 | github.com/josharian/intern v1.0.0 // indirect 26 | github.com/json-iterator/go v1.1.12 // indirect 27 | github.com/karrick/godirwalk v1.17.0 // indirect 28 | github.com/lufia/iostat v1.2.1 // indirect 29 | github.com/mailru/easyjson v0.9.0 // indirect 30 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 31 | github.com/modern-go/reflect2 v1.0.2 // indirect 32 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 33 | github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c // indirect 34 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 35 | github.com/pkg/errors v0.9.1 // indirect 36 | github.com/prometheus/client_golang v1.22.0 // indirect 37 | github.com/prometheus/client_model v0.6.2 // indirect 38 | github.com/prometheus/common v0.63.0 // indirect 39 | github.com/prometheus/procfs v0.16.0 // indirect 40 | github.com/spf13/pflag v1.0.6 // indirect 41 | github.com/teris-io/shortid v0.0.0-20220617161101-71ec9f2aa569 // indirect 42 | github.com/tinylib/msgp v1.2.5 // indirect 43 | github.com/x448/float16 v0.8.4 // indirect 44 | golang.org/x/net v0.39.0 // indirect 45 | golang.org/x/oauth2 v0.29.0 // indirect 46 | golang.org/x/sync v0.13.0 // indirect 47 | golang.org/x/sys v0.32.0 // indirect 48 | golang.org/x/term v0.31.0 // indirect 49 | golang.org/x/text v0.24.0 // indirect 
50 | golang.org/x/time v0.11.0 // indirect 51 | google.golang.org/protobuf v1.36.6 // indirect 52 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 53 | gopkg.in/inf.v0 v0.9.1 // indirect 54 | gopkg.in/yaml.v3 v3.0.1 // indirect 55 | k8s.io/api v0.32.3 // indirect 56 | k8s.io/apimachinery v0.32.3 // indirect 57 | k8s.io/client-go v0.32.3 // indirect 58 | k8s.io/klog/v2 v2.130.1 // indirect 59 | k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect 60 | k8s.io/metrics v0.32.3 // indirect 61 | k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e // indirect 62 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 63 | sigs.k8s.io/randfill v1.0.0 // indirect 64 | sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect 65 | sigs.k8s.io/yaml v1.4.0 // indirect 66 | ) 67 | -------------------------------------------------------------------------------- /transformers/go_echo/src/main.go: -------------------------------------------------------------------------------- 1 | // Package main is implementation of ID (echo) transformation in golang. 2 | /* 3 | * Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. 
4 | */ 5 | package main 6 | 7 | import ( 8 | "bytes" 9 | "flag" 10 | "io" 11 | "log" 12 | 13 | "github.com/NVIDIA/aistore/ext/etl/webserver" 14 | ) 15 | 16 | type EchoServer struct { 17 | webserver.ETLServer 18 | } 19 | 20 | func (es *EchoServer) Transform(input io.ReadCloser, path, args string) (io.ReadCloser, error) { 21 | data, err := io.ReadAll(input) 22 | if err != nil { 23 | return nil, err 24 | } 25 | input.Close() 26 | return io.NopCloser(bytes.NewReader(data)), nil 27 | } 28 | 29 | var _ webserver.ETLServer = (*EchoServer)(nil) 30 | 31 | func main() { 32 | listenAddr := flag.String("l", "0.0.0.0", "IP address to listen on") 33 | port := flag.Int("p", 8000, "Port to listen on") 34 | flag.Parse() 35 | 36 | svr := &EchoServer{} 37 | if err := webserver.Run(svr, *listenAddr, *port); err != nil { 38 | log.Fatalf("Failed to start server: %v", err) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /transformers/go_hello_world/Dockerfile: -------------------------------------------------------------------------------- 1 | # Stage 1: Build the binary 2 | FROM docker.io/library/golang:1.24-alpine AS builder 3 | 4 | RUN apk add --no-cache git 5 | 6 | WORKDIR /app 7 | 8 | COPY src/go.mod src/go.sum ./ 9 | RUN go mod download 10 | 11 | COPY src/ ./ 12 | RUN go build -o hello_world 13 | 14 | # Stage 2: Minimal runtime image 15 | FROM alpine:3.19 16 | 17 | WORKDIR /app 18 | COPY --from=builder /app/hello_world . 19 | 20 | EXPOSE 80 21 | CMD ["./hello_world"] 22 | -------------------------------------------------------------------------------- /transformers/go_hello_world/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_hello_world_go:$(TAG) . 
13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_hello_world_go:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/go_hello_world/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: hello-world-go-transformer 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 8 | wait_timeout: 5m 9 | spec: 10 | containers: 11 | - name: server 12 | image: aistorage/transformer_hello_world_go:latest 13 | imagePullPolicy: Always 14 | ports: 15 | - name: default 16 | containerPort: 80 17 | command: ['./hello_world', '-l', '0.0.0.0', '-p', '80'] 18 | readinessProbe: 19 | httpGet: 20 | path: /health 21 | port: default 22 | -------------------------------------------------------------------------------- /transformers/go_hello_world/src/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/ais-etl/transformers/go_hello_world/src 2 | 3 | go 1.24 4 | 5 | require github.com/NVIDIA/aistore v1.3.28-0.20250501012007-d85f26c3c672 6 | 7 | require ( 8 | github.com/OneOfOne/xxhash v1.2.8 // indirect 9 | github.com/beorn7/perks v1.0.1 // indirect 10 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 11 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 12 | github.com/emicklei/go-restful/v3 v3.12.2 // indirect 13 | github.com/fxamacker/cbor/v2 v2.8.0 // indirect 14 | github.com/go-logr/logr v1.4.2 // indirect 15 | github.com/go-openapi/jsonpointer v0.21.1 // indirect 16 | github.com/go-openapi/jsonreference v0.21.0 // indirect 17 | github.com/go-openapi/swag v0.23.1 // indirect 18 | github.com/gogo/protobuf v1.3.2 // indirect 19 | github.com/golang/protobuf v1.5.4 // indirect 20 | github.com/google/gnostic-models v0.6.9 // indirect 21 | github.com/google/go-cmp v0.7.0 // indirect 22 | 
github.com/google/gofuzz v1.2.0 // indirect 23 | github.com/google/uuid v1.6.0 // indirect 24 | github.com/gorilla/websocket v1.5.3 // indirect 25 | github.com/josharian/intern v1.0.0 // indirect 26 | github.com/json-iterator/go v1.1.12 // indirect 27 | github.com/karrick/godirwalk v1.17.0 // indirect 28 | github.com/lufia/iostat v1.2.1 // indirect 29 | github.com/mailru/easyjson v0.9.0 // indirect 30 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 31 | github.com/modern-go/reflect2 v1.0.2 // indirect 32 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 33 | github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c // indirect 34 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 35 | github.com/pkg/errors v0.9.1 // indirect 36 | github.com/prometheus/client_golang v1.22.0 // indirect 37 | github.com/prometheus/client_model v0.6.2 // indirect 38 | github.com/prometheus/common v0.63.0 // indirect 39 | github.com/prometheus/procfs v0.16.0 // indirect 40 | github.com/spf13/pflag v1.0.6 // indirect 41 | github.com/teris-io/shortid v0.0.0-20220617161101-71ec9f2aa569 // indirect 42 | github.com/tinylib/msgp v1.2.5 // indirect 43 | github.com/x448/float16 v0.8.4 // indirect 44 | golang.org/x/net v0.39.0 // indirect 45 | golang.org/x/oauth2 v0.29.0 // indirect 46 | golang.org/x/sync v0.13.0 // indirect 47 | golang.org/x/sys v0.32.0 // indirect 48 | golang.org/x/term v0.31.0 // indirect 49 | golang.org/x/text v0.24.0 // indirect 50 | golang.org/x/time v0.11.0 // indirect 51 | google.golang.org/protobuf v1.36.6 // indirect 52 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 53 | gopkg.in/inf.v0 v0.9.1 // indirect 54 | gopkg.in/yaml.v3 v3.0.1 // indirect 55 | k8s.io/api v0.32.3 // indirect 56 | k8s.io/apimachinery v0.32.3 // indirect 57 | k8s.io/client-go v0.32.3 // indirect 58 | k8s.io/klog/v2 v2.130.1 // indirect 59 | k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect 60 | k8s.io/metrics 
v0.32.3 // indirect 61 | k8s.io/utils v0.0.0-20250321185631-1f6e0b77f77e // indirect 62 | sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect 63 | sigs.k8s.io/randfill v1.0.0 // indirect 64 | sigs.k8s.io/structured-merge-diff/v4 v4.6.0 // indirect 65 | sigs.k8s.io/yaml v1.4.0 // indirect 66 | ) 67 | -------------------------------------------------------------------------------- /transformers/go_hello_world/src/main.go: -------------------------------------------------------------------------------- 1 | // Package main is implementation of a simple hello world transformation in golang. 2 | /* 3 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package main 6 | 7 | import ( 8 | "bytes" 9 | "flag" 10 | "io" 11 | "log" 12 | 13 | "github.com/NVIDIA/aistore/ext/etl/webserver" 14 | ) 15 | 16 | type HelloWorldServer struct { 17 | response string 18 | webserver.ETLServer 19 | } 20 | 21 | func (es *HelloWorldServer) Transform(input io.ReadCloser, path, args string) (io.ReadCloser, error) { 22 | input.Close() 23 | return io.NopCloser(bytes.NewReader([]byte(es.response))), nil 24 | } 25 | 26 | var _ webserver.ETLServer = (*HelloWorldServer)(nil) 27 | 28 | func main() { 29 | listenAddr := flag.String("l", "0.0.0.0", "IP address to listen on") 30 | port := flag.Int("p", 80, "Port to listen on") 31 | flag.Parse() 32 | 33 | svr := &HelloWorldServer{ 34 | response: "Hello World!", 35 | } 36 | 37 | if err := webserver.Run(svr, *listenAddr, *port); err != nil { 38 | log.Fatalf("Failed to start server: %v", err) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /transformers/hash_with_args/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.13-alpine 2 | 3 | # Install git and build dependencies 4 | RUN apk update && apk add --no-cache git gcc musl-dev libffi-dev 5 | 6 | # Set working directory 7 | WORKDIR /code 8 | 9 | # Install Python 
dependencies 10 | RUN pip install --no-cache-dir --upgrade \ 11 | "git+https://github.com/NVIDIA/aistore.git@etl-args-webserver#subdirectory=python" \ 12 | "fastapi>=0.109.1" \ 13 | "httpx>=0.28.0" \ 14 | "aiofiles>=23.2.1" \ 15 | "uvicorn[standard]>=0.32.0" \ 16 | "flask>=2.3.0" \ 17 | "gunicorn>=23.0.0" 18 | 19 | # Copy application code 20 | COPY flask_server.py fastapi_server.py http_server.py ./ 21 | 22 | # Environment setup 23 | ENV PYTHONUNBUFFERED=1 24 | 25 | # Expose default port 26 | EXPOSE 8000 27 | -------------------------------------------------------------------------------- /transformers/hash_with_args/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_hash_with_args:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_hash_with_args:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/hash_with_args/README.md: -------------------------------------------------------------------------------- 1 | # Hash with Args Transformer 2 | 3 | A simple hash transformer that processes objects (bytes) by extracting ETL arguments from an inline transform request and using it as a seed value to compute a seeded hash. This example demonstrates how to pass custom metadata for each individual object through an ETL inline transform and utilize it within your pod. 
4 | 5 | ### Initializing ETL with AIStore CLI 6 | 7 | The following steps demonstrate how to initialize the `transformer-hash-with-args` using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md): 8 | 9 | ```bash 10 | $ cd transformers/hash_with_args 11 | 12 | $ # Mention communication type b/w target and container 13 | $ export COMMUNICATION_TYPE='hpull://' 14 | 15 | $ # Substitute env variables in spec file 16 | $ envsubst < pod.yaml > init_spec.yaml 17 | 18 | $ # Initialize ETL 19 | $ ais etl init spec --from-file init_spec.yaml --name <etl-name> --comm-type "hpull://" 20 | 21 | $ # Put an object 22 | $ ais object put <file> ais://<bucket-name> 23 | 24 | $ # Transform and retrieve objects from the bucket using this ETL with arguments 25 | $ curl -L -X GET "${AIS_ENDPOINT}/v1/objects/<bucket-name>/<object-name>?etl_name=<etl-name>&etl_args=100000" 26 | ``` -------------------------------------------------------------------------------- /transformers/hash_with_args/fastapi_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | HashWithArgs ETL transformer (FastAPI) 3 | 4 | FastAPI-based ETL server that computes an XXHash64 digest of each request's payload, 5 | optionally seeded via the `etl_args` query parameter. 6 | 7 | Environment: 8 | SEED_DEFAULT default integer seed if etl_args is missing or invalid (default: 0) 9 | 10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 11 | """ 12 | 13 | import os 14 | import logging 15 | from typing import Optional 16 | 17 | import xxhash 18 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer 19 | 20 | 21 | class HashWithArgs(FastAPIServer): 22 | """ 23 | ETL server that computes an XXHash64 digest of each payload. 24 | 25 | Supports an optional `etl_args` parameter (string) specifying the numeric seed. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | port: int = 8000, 31 | *, 32 | default_seed: Optional[int] = None, 33 | ) -> None: 34 | """ 35 | Initialize the HashWithArgs server. 
36 | 37 | Args: 38 | port: TCP port to listen on (default 8000). 39 | default_seed: fallback seed if ETL args absent/invalid. 40 | If None, reads `SEED_DEFAULT` env var (defaulting to 0). 41 | """ 42 | super().__init__(port=port) 43 | self.logger.setLevel(logging.DEBUG) 44 | if default_seed: 45 | self.default_seed = default_seed 46 | else: 47 | try: 48 | self.default_seed = int(os.getenv("SEED_DEFAULT", "0")) 49 | except ValueError: 50 | self.logger.warning( 51 | "Invalid SEED_DEFAULT='%s', falling back to 0", 52 | os.getenv("SEED_DEFAULT"), 53 | ) 54 | self.default_seed = 0 55 | 56 | def transform( 57 | self, 58 | data: bytes, 59 | _path: str, 60 | etl_args: str, 61 | ) -> bytes: 62 | """ 63 | Compute the XXHash64 digest of the input data. 64 | 65 | Args: 66 | data: Raw request payload. 67 | path: Request path or object key (unused here). 68 | etl_args: optional seed passed via `?etl_args=`. 69 | 70 | Returns: 71 | The lowercase hexadecimal digest as ASCII-encoded bytes. 72 | """ 73 | seed = self.default_seed 74 | if etl_args: 75 | try: 76 | seed = int(etl_args) 77 | except ValueError: 78 | self.logger.warning( 79 | "Invalid etl_args seed=%r, using default_seed=%d", 80 | etl_args, 81 | self.default_seed, 82 | ) 83 | hasher = xxhash.xxh64(seed=seed) 84 | hasher.update(data) 85 | # hexdigest() is str → encode to ASCII bytes 86 | return hasher.hexdigest().encode("ascii") 87 | 88 | 89 | # instantiate and expose 90 | fastapi_server = HashWithArgs() 91 | fastapi_app = fastapi_server.app 92 | -------------------------------------------------------------------------------- /transformers/hash_with_args/flask_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | HashWithArgs ETL transformer (Flask) 3 | 4 | Flask-based ETL server that computes an XXHash64 digest of each request's payload, 5 | optionally seeded via the `etl_args` query parameter. 
6 | 7 | Environment: 8 | SEED_DEFAULT default integer seed if etl_args is missing or invalid (default: 0) 9 | 10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 11 | """ 12 | 13 | import os 14 | import logging 15 | from typing import Optional 16 | 17 | import xxhash 18 | from aistore.sdk.etl.webserver.flask_server import FlaskServer 19 | 20 | 21 | class HashWithArgs(FlaskServer): 22 | """ 23 | ETL server that computes an XXHash64 digest of each payload. 24 | 25 | Supports an optional `etl_args` parameter (string) specifying the numeric seed. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | port: int = 8000, 31 | *, 32 | default_seed: Optional[int] = None, 33 | ) -> None: 34 | """ 35 | Initialize the HashWithArgs server. 36 | 37 | Args: 38 | host: interface to bind on (default "0.0.0.0"). 39 | port: TCP port to listen on (default 8000). 40 | default_seed: fallback seed if ETL args absent/invalid. 41 | If None, reads `SEED_DEFAULT` env var (defaulting to 0). 42 | """ 43 | super().__init__(port=port) 44 | self.logger.setLevel(logging.DEBUG) 45 | if default_seed: 46 | self.default_seed = default_seed 47 | else: 48 | try: 49 | self.default_seed = int(os.getenv("SEED_DEFAULT", "0")) 50 | except ValueError: 51 | self.logger.warning( 52 | "Invalid SEED_DEFAULT='%s', falling back to 0", 53 | os.getenv("SEED_DEFAULT"), 54 | ) 55 | self.default_seed = 0 56 | 57 | def transform( 58 | self, 59 | data: bytes, 60 | _path: str, 61 | etl_args: str, 62 | ) -> bytes: 63 | """ 64 | Compute the XXHash64 digest of the input data. 65 | 66 | Args: 67 | data: Raw request payload. 68 | path: Request path or object key (unused here). 69 | etl_args: optional seed passed via `?etl_args=`. 70 | 71 | Returns: 72 | The lowercase hexadecimal digest as ASCII-encoded bytes. 
73 | """ 74 | seed = self.default_seed 75 | if etl_args: 76 | try: 77 | seed = int(etl_args) 78 | except ValueError: 79 | self.logger.warning( 80 | "Invalid etl_args seed=%r, using default_seed=%d", 81 | etl_args, 82 | self.default_seed, 83 | ) 84 | hasher = xxhash.xxh64(seed=seed) 85 | hasher.update(data) 86 | # hexdigest() is str → encode to ASCII bytes 87 | return hasher.hexdigest().encode("ascii") 88 | 89 | 90 | # instantiate and expose 91 | flask_server = HashWithArgs() 92 | flask_app = flask_server.app 93 | -------------------------------------------------------------------------------- /transformers/hash_with_args/http_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | HashWithArgs ETL transformer (HTTP Server) 3 | 4 | HTTP-based ETL server that computes an XXHash64 digest of each request's payload, 5 | optionally seeded via the `etl_args` query parameter. 6 | 7 | Environment: 8 | SEED_DEFAULT default integer seed if etl_args is missing or invalid (default: 0) 9 | 10 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 11 | """ 12 | 13 | import os 14 | import logging 15 | from typing import Optional 16 | 17 | import xxhash 18 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer 19 | 20 | 21 | class HashWithArgs(HTTPMultiThreadedServer): 22 | """ 23 | ETL server that computes an XXHash64 digest of each payload. 24 | 25 | Supports an optional `etl_args` parameter (string) specifying the numeric seed. 26 | """ 27 | 28 | def __init__( 29 | self, 30 | port: int = 8000, 31 | *, 32 | default_seed: Optional[int] = None, 33 | ) -> None: 34 | """ 35 | Initialize the HashWithArgs server. 36 | 37 | Args: 38 | port: TCP port to listen on (default 8000). 39 | default_seed: fallback seed if ETL args absent/invalid. 40 | If None, reads `SEED_DEFAULT` env var (defaulting to 0). 
41 | """ 42 | super().__init__(port=port) 43 | self.logger.setLevel(logging.DEBUG) 44 | if default_seed: 45 | self.default_seed = default_seed 46 | else: 47 | try: 48 | self.default_seed = int(os.getenv("SEED_DEFAULT", "0")) 49 | except ValueError: 50 | self.logger.warning( 51 | "Invalid SEED_DEFAULT='%s', falling back to 0", 52 | os.getenv("SEED_DEFAULT"), 53 | ) 54 | self.default_seed = 0 55 | 56 | def transform( 57 | self, 58 | data: bytes, 59 | _path: str, 60 | etl_args: str, 61 | ) -> bytes: 62 | """ 63 | Compute the XXHash64 digest of the input data. 64 | 65 | Args: 66 | data: Raw request payload. 67 | path: Request path or object key (unused here). 68 | etl_args: optional seed passed via `?etl_args=`. 69 | 70 | Returns: 71 | The lowercase hexadecimal digest as ASCII-encoded bytes. 72 | """ 73 | seed = self.default_seed 74 | if etl_args: 75 | try: 76 | seed = int(etl_args) 77 | except ValueError: 78 | self.logger.warning( 79 | "Invalid etl_args seed=%r, using default_seed=%d", 80 | etl_args, 81 | self.default_seed, 82 | ) 83 | hasher = xxhash.xxh64(seed=seed) 84 | hasher.update(data) 85 | # hexdigest() is str → encode to ASCII bytes 86 | return hasher.hexdigest().encode("ascii") 87 | 88 | 89 | # instantiate and expose 90 | if __name__ == "__main__": 91 | server = HashWithArgs() 92 | server.start() 93 | -------------------------------------------------------------------------------- /transformers/hash_with_args/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-hash-with-args 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 8 | wait_timeout: 5m 9 | spec: 10 | containers: 11 | - name: server 12 | image: aistorage/transformer_hash_with_args:latest 13 | imagePullPolicy: Always 14 | ports: 15 | - name: default 16 | containerPort: 8000 17 | # for flask based app 18 | # command: 
["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"] 19 | # for http based app 20 | # command: ["python", "http_server.py"] 21 | # for fastapi based app 22 | command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"] 23 | readinessProbe: 24 | httpGet: 25 | path: /health 26 | port: default 27 | env: 28 | - name: SEED_DEFAULT 29 | value: "0" 30 | # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 31 | # where the objects are stored on AIStore targets. This allows the ETL container 32 | # to access the files directly by absolute path. 33 | # volumeMounts: 34 | # - name: ais 35 | # mountPath: /tmp/ 36 | # volumes: 37 | # - name: ais 38 | # hostPath: 39 | # path: /tmp/ 40 | # type: Directory 41 | -------------------------------------------------------------------------------- /transformers/hash_with_args/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | xxhash -------------------------------------------------------------------------------- /transformers/hash_with_args/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import xxhash 5 | import requests 6 | import os 7 | import logging 8 | from urllib.parse import urlparse, parse_qs 9 | from http.server import HTTPServer, BaseHTTPRequestHandler 10 | from socketserver import ThreadingMixIn 11 | 12 | host_target = os.environ["AIS_TARGET_URL"] 13 | seed_default = int(os.getenv("SEED_DEFAULT", "0")) 14 | 15 | # Configure logging 16 | logging.basicConfig( 17 | level=logging.INFO, 18 | format="%(asctime)s - %(levelname)s - %(message)s", 19 | ) 20 | 21 | 22 | class Handler(BaseHTTPRequestHandler): 23 | def log_request(self, code="-", size="-"): 24 | # Don't log successful requests info. Unsuccessful logged by log_error(). 
25 | pass 26 | 27 | def _set_headers(self): 28 | self.send_response(200) 29 | self.send_header("Content-Type", "text/plain") 30 | self.end_headers() 31 | 32 | def do_PUT(self): 33 | try: 34 | content_length = int(self.headers["Content-Length"]) 35 | post_data = self.rfile.read(content_length) 36 | parsed_url = urlparse(self.path) 37 | seed = seed_default 38 | logging.info("PUT request received") 39 | params = parse_qs(parsed_url.query) 40 | if "etl_args" in params: 41 | seed = int(params["etl_args"][0]) 42 | logging.info("PUT request with seed %d", seed) 43 | 44 | hash_result = self.calculate_xxhash(post_data, seed) 45 | self._set_headers() 46 | self.wfile.write(hash_result.encode()) 47 | except Exception as e: 48 | logging.error("Error in PUT request: %s", e) 49 | self.send_error(500, f"Internal Server Error: {e}") 50 | 51 | def do_GET(self): 52 | if self.path == "/health": 53 | self._set_headers() 54 | self.wfile.write(b"Running") 55 | return 56 | 57 | try: 58 | parsed_url = urlparse(self.path) 59 | x = requests.get(host_target + self.path) 60 | 61 | seed = seed_default 62 | logging.info("GET request received") 63 | params = parse_qs(parsed_url.query) 64 | if "etl_args" in params: 65 | seed = int(params["etl_args"][0]) 66 | logging.info("GET request with seed %d", seed) 67 | 68 | hash_result = self.calculate_xxhash(x.content, seed) 69 | self._set_headers() 70 | self.wfile.write(hash_result.encode()) 71 | except requests.HTTPError as http_err: 72 | logging.error("HTTP error in GET request: %s", http_err) 73 | self.send_error(502, f"Bad Gateway: {http_err}") 74 | except Exception as e: 75 | logging.error("Error in GET request: %s", e) 76 | self.send_error(500, f"Internal Server Error: {e}") 77 | 78 | def calculate_xxhash(self, data, seed): 79 | hasher = xxhash.xxh64(seed=seed) 80 | hasher.update(data) 81 | return hasher.hexdigest() 82 | 83 | 84 | class ThreadedHTTPServer(ThreadingMixIn, HTTPServer): 85 | """Handle requests in a separate thread.""" 86 | 87 | 88 | 
def run(addr="localhost", port=8000):
    """Start the threaded HTTP server and block until it stops.

    Args:
        addr: Interface or IP address to bind to.
        port: TCP port to listen on.

    A ``KeyboardInterrupt`` stops the server quietly; any other exception is
    logged. In every case the listening socket is released before returning.
    """
    logging.info("Starting HTTP server on %s:%s", addr, port)
    server = None
    try:
        server = ThreadedHTTPServer((addr, port), Handler)
        server.serve_forever()
    except KeyboardInterrupt:
        logging.info("Shutting down the server.")
    except Exception as e:
        logging.error("Unexpected server error: %s", e)
    finally:
        # `server` stays None when binding itself failed; otherwise close the
        # listening socket so the port is freed even after an error.
        if server is not None:
            server.server_close()
        logging.info("Server stopped.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run a simple HTTP server")
    parser.add_argument(
        "-l",
        "--listen",
        default="localhost",
        help="Specify the IP address on which the server listens",
    )
    parser.add_argument(
        "-p",
        "--port",
        type=int,
        default=8000,
        help="Specify the port on which the server listens",
    )
    args = parser.parse_args()
    run(addr=args.listen, port=args.port)
$(REGISTRY_URL)/transformer_hello_world:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_hello_world:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/hello_world/README.md: -------------------------------------------------------------------------------- 1 | # Simple Hello World Transformer 2 | 3 | A simple hello world transformer that reads objects stored in AIStore and returns "Hello World!" in bytes for every object stored. 4 | 5 | The transformer supports both `hpull` and `hpush` communication mechanisms for seamless integration. 6 | 7 | > For more information on communication mechanisms, please refer to [this link](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms). 8 | 9 | ### Initializing ETL with AIStore CLI 10 | 11 | The following steps demonstrate how to initialize the `hello-world-transformer` using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md): 12 | 13 | ```bash 14 | $ cd transformers/hello_world 15 | 16 | $ # Set the communication type between the target and the container 17 | $ export COMMUNICATION_TYPE='hpull://' 18 | 19 | $ # Substitute env variables in spec file 20 | $ envsubst < pod.yaml > init_spec.yaml 21 | 22 | $ # Initialize ETL 23 | $ ais etl init spec --from-file init_spec.yaml --name <etl-name> 24 | 25 | $ # Transform and retrieve objects from the bucket using this ETL 26 | $ # For inline transformation 27 | $ ais etl object <etl-name> ais://<bucket-name>/<object-name>.<ext> - 28 | 29 | $ # Or, for offline (bucket-to-bucket) transformation 30 | $ ais etl bucket <etl-name> ais://src-bck ais://dst-bck 31 | ``` -------------------------------------------------------------------------------- /transformers/hello_world/fastapi_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | A FastAPI-based beginner-friendly "Hello World" web server. 3 | 4 | Responds with "Hello World!" to any GET or PUT request.
5 | 6 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 7 | """ 8 | 9 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer 10 | 11 | 12 | class HelloWorldServerFastAPI(FastAPIServer): 13 | """ 14 | A simple FastAPI-based ETL transformer that returns b"Hello World!" as output 15 | for any incoming data, regardless of the request path or content. 16 | """ 17 | 18 | def transform(self, *_args) -> bytes: 19 | return b"Hello World!" 20 | 21 | 22 | # Instantiate the server and expose its FastAPI app 23 | fastapi_server = HelloWorldServerFastAPI(port=8000) 24 | fastapi_server.logger.setLevel("DEBUG") 25 | fastapi_app = fastapi_server.app # This is what uvicorn will run 26 | -------------------------------------------------------------------------------- /transformers/hello_world/flask_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | A Flask-based beginner-friendly "Hello World" web server. 3 | 4 | Responds with "Hello World!" to any GET or PUT request. 5 | 6 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 7 | 8 | """ 9 | 10 | from aistore.sdk.etl.webserver.flask_server import FlaskServer 11 | 12 | 13 | class HelloWorldServerFlask(FlaskServer): 14 | """ 15 | A simple Flask-based ETL transformer that returns b"Hello World!" as output 16 | for any incoming data, regardless of the request path or content. 17 | """ 18 | 19 | def transform(self, *_args) -> bytes: 20 | return b"Hello World!" 21 | 22 | 23 | flask_server = HelloWorldServerFlask(port=8000) 24 | flask_server.logger.setLevel("DEBUG") 25 | flask_app = flask_server.app 26 | -------------------------------------------------------------------------------- /transformers/hello_world/http_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | A HTTP-based beginner-friendly "Hello World" web server. 3 | 4 | Responds with "Hello World!" to any GET or PUT request. 
5 | 6 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 7 | 8 | """ 9 | 10 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer 11 | 12 | 13 | class HelloWorldHTTPServer(HTTPMultiThreadedServer): 14 | """ 15 | A simple HTTP-based ETL transformer that returns b"Hello World!" as output 16 | for any incoming data, regardless of the request path or content. 17 | """ 18 | 19 | def transform(self, *_args) -> bytes: 20 | return b"Hello World!" 21 | 22 | 23 | if __name__ == "__main__": 24 | http_server = HelloWorldHTTPServer(port=8000) 25 | http_server.logger.setLevel("DEBUG") 26 | http_server.start() 27 | -------------------------------------------------------------------------------- /transformers/hello_world/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-hello-world 5 | annotations: 6 | # Values it can take ["hpull://", "hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpush://\""} 8 | wait_timeout: 5m 9 | support_direct_put: "true" 10 | spec: 11 | containers: 12 | - name: server 13 | image: aistorage/transformer_hello_world:latest 14 | imagePullPolicy: Always 15 | ports: 16 | - name: default 17 | containerPort: 8000 18 | # for flask based app 19 | # command: ["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"] 20 | # for http based app 21 | # command: ["python", "http_server.py"] 22 | # for fastapi based app 23 | command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"] 24 | readinessProbe: 25 | httpGet: 26 | path: /health 27 | port: default 28 | # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 29 | # where the objects are stored on AIStore targets. This allows the ETL container 30 | # to access the files directly by absolute path. 
31 | volumeMounts: 32 | - name: ais 33 | mountPath: /mnt/data/ais 34 | volumes: 35 | - name: ais 36 | hostPath: 37 | path: /mnt/data/ais 38 | type: Directory 39 | -------------------------------------------------------------------------------- /transformers/keras_preprocess/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/python:3.9-slim 2 | 3 | WORKDIR / 4 | 5 | COPY ./requirements.txt requirements.txt 6 | 7 | RUN pip3 install --no-cache-dir --upgrade -r requirements.txt 8 | 9 | COPY main.py main.py 10 | 11 | ENV PYTHONUNBUFFERED 1 12 | ENV TF_ENABLE_ONEDNN_OPTS 0 13 | 14 | EXPOSE 8000 15 | -------------------------------------------------------------------------------- /transformers/keras_preprocess/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_keras:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_keras:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/keras_preprocess/README.md: -------------------------------------------------------------------------------- 1 | # Keras Transformer - Image Data Augmentation and Preprocessing 2 | 3 | The Keras Transformer is a powerful tool designed for image data preprocessing and data augmentation. Leveraging the `apply_transform` function from Keras (TensorFlow), this transformer allows users to define transformations by providing a JSON string with parameter-value pairs. Currently, the following parameters are supported: 4 | 5 | | Parameter | Description | 6 | |-------------------------|---------------------------------------------------------| 7 | | 'theta' | Rotation angle in degrees. 
| 8 | | 'tx' | Shift in the x direction. | 9 | | 'ty' | Shift in the y direction. | 10 | | 'shear' | Shear angle in degrees. | 11 | | 'zx' | Zoom in the x direction. | 12 | | 'zy' | Zoom in the y direction. | 13 | | 'flip_horizontal' | Boolean. Enable horizontal flip. | 14 | | 'flip_vertical' | Boolean. Enable vertical flip. | 15 | | 'channel_shift_intensity' | Float. Channel shift intensity. | 16 | | 'brightness' | Float. Brightness shift intensity. | 17 | 18 | The image format (JPEG, PNG, etc.) of the images to be processed or stored is specified in the `spec.yaml`. 19 | 20 | The transformer supports both `hpull` and `hpush` communication mechanisms for seamless integration. 21 | 22 | **Please Note:** This transformer utilizes the [`FastAPI`](https://fastapi.tiangolo.com/) framework alongside the [`Gunicorn`](https://gunicorn.org/) + [Uvicorn](https://www.uvicorn.org/) combination as its web server. Alternate implementations of the same functionality are provided using [`Flask`](https://flask.palletsprojects.com/en/2.3.x/) and [`Gunicorn`](https://gunicorn.org/) within the [`flask-gunicorn`](/flask-gunicorn) directory. Additionally, there's a version that employs a multithreaded HTTP server, which can be found in the [`http-multithreaded-server`](/http-multithreaded-server/) folder. 23 | 24 | > For more information on communication mechanisms, please refer to [this link](https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms). 25 | 26 | ## Parameters 27 | Only two parameters need to be updated in the `pod.yaml` file. 28 | 29 | | Argument | Description | Default Value | 30 | | ----------- | --------------------------------------------------------------------- | ------------- | 31 | | `TRANSFORM` | Specify a JSON string with operations to be performed | `` | 32 | | `FORMAT`| To process/store images in which image format (PNG, JPEG,etc) | `JPEG` | 33 | 34 | Please ensure to adjust these parameters according to your specific requirements. 
35 | 36 | ### Initializing ETL with AIStore CLI 37 | 38 | The following steps demonstrate how to initialize the `Keras Transformer` using the [AIStore CLI](https://github.com/NVIDIA/aistore/blob/main/docs/cli.md): 39 | 40 | ```bash 41 | $ cd transformers/keras_preprocess 42 | 43 | $ # Set values for FORMAT and TRANSFORM 44 | $ export FORMAT="JPEG" 45 | $ export TRANSFORM='{"theta":40, "brightness":0.8, "zx":0.9, "zy":0.9}' 46 | 47 | $ # Set the communication type between the target and the container 48 | $ export COMMUNICATION_TYPE='hpull://' 49 | 50 | $ # Substitute env variables in spec file 51 | $ envsubst < pod.yaml > init_spec.yaml 52 | 53 | $ # Initialize ETL 54 | $ ais etl init spec --from-file init_spec.yaml --name <etl-name> 55 | 56 | $ # Transform and retrieve objects from the bucket using this ETL 57 | $ # For inline transformation 58 | $ ais etl object <etl-name> ais://src/<image-name>.JPEG dst.JPEG 59 | $ # Or, for offline (bucket-to-bucket) transformation 60 | $ ais etl bucket <etl-name> ais://src-bck ais://dst-bck --ext="{JPEG:JPEG}" 61 | ``` -------------------------------------------------------------------------------- /transformers/keras_preprocess/flask-gunicorn/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/python:slim 2 | 3 | COPY requirements.txt requirements.txt 4 | RUN pip3 install --upgrade -r requirements.txt 5 | 6 | COPY app.py app.py 7 | 8 | ENV PYTHONUNBUFFERED 1 9 | 10 | ENV FLASK_APP=app.py 11 | 12 | # Expose Gunicorn port 13 | EXPOSE 80 14 | -------------------------------------------------------------------------------- /transformers/keras_preprocess/flask-gunicorn/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_keras:$(TAG) .
#!/usr/bin/env python
#
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# pylint: disable=missing-class-docstring, missing-function-docstring, missing-module-docstring, broad-exception-caught

import os
import json
import logging
import io

# `import urllib` alone does not guarantee the `parse` submodule is loaded.
import urllib.parse
import requests
from flask import Flask, request
from keras.preprocessing.image import (
    ImageDataGenerator,
    load_img,
    array_to_img,
    img_to_array,
)

app = Flask(__name__)

# Constants
FORMAT = os.getenv("FORMAT", "JPEG")
ARG_TYPE = os.getenv("ARG_TYPE", "bytes")

# Environment Variables
host_target = os.environ.get("AIS_TARGET_URL")

logging.info(host_target)

TRANSFORM = os.environ.get("TRANSFORM")
if not host_target:
    raise EnvironmentError("AIS_TARGET_URL environment variable missing")
if not TRANSFORM:
    raise EnvironmentError(
        "TRANSFORM environment variable missing. Check documentation for examples (link)"
    )
transform_dict = json.loads(TRANSFORM)


def transform_image(data: bytes) -> bytes:
    """Apply the configured Keras transform to an encoded image.

    Args:
        data: Encoded image bytes (any format `load_img` can read).

    Returns:
        The transformed image re-encoded in ``FORMAT``.

    Raises:
        Exception: whatever decoding, transforming or encoding raised; the
            error is logged and re-raised with its original traceback.
    """
    try:
        img = load_img(io.BytesIO(data))
        img = img_to_array(img)
        datagen = ImageDataGenerator()
        img = datagen.apply_transform(x=img, transform_parameters=transform_dict)
        img = array_to_img(img)
        buf = io.BytesIO()
        img.save(buf, format=FORMAT)
        return buf.getvalue()
    except Exception as exp:
        logging.error("Error processing data in transform_image: %s", str(exp))
        raise  # bare raise keeps the original traceback


@app.route("/health")
def health_check():
    """Readiness probe endpoint."""
    return "Running"


# The second rule needs the `path` converter: without it the view would be
# invoked with no `path` argument and object names containing "/" could not match.
@app.route("/", defaults={"path": ""}, methods=["PUT", "GET"])
@app.route("/<path:path>", methods=["PUT", "GET"])
def image_handler(path: str):
    """Transform an image pushed in the body (PUT) or pulled by path/URL (GET).

    * ``PUT``: the request body is the image.
    * ``GET`` with ``ARG_TYPE == "url"``: the image is fetched from the ``url``
      query argument.
    * ``GET`` otherwise: the image is fetched from ``host_target/<path>``.

    Returns the transformed bytes with 200, 400 when the ``url`` argument is
    missing, or 500 when fetching or processing fails.
    """
    try:
        if request.method == "PUT":
            return transform_image(request.data), 200

        # Only PUT and GET are routed here, so this is the GET branch.
        if ARG_TYPE == "url":
            # webdataset
            query_path = request.args.get("url")
            if not query_path:
                return "Missing 'url' query parameter", 400
            result = transform_image(requests.get(query_path, timeout=5).content)
        else:
            # normal GET - hpull
            object_path = urllib.parse.quote(path, safe="@")
            object_url = f"{host_target}/{object_path}"
            resp = requests.get(object_url, timeout=5)
            if resp.status_code != 200:
                raise FileNotFoundError(
                    f"Error getting '{path}' from '{host_target}'"
                )
            result = transform_image(resp.content)

        return result, 200
    except Exception as exp:
        logging.error("Error processing request: %s", str(exp))
        return "Data processing failed", 500
-------------------------------------------------------------------------------- /transformers/keras_preprocess/flask-gunicorn/pod.yaml: -------------------------------------------------------------------------------- 1 | # https://github.com/NVIDIA/ais-etl/blob/main/transformers/keras_transformer/README.md 2 | apiVersion: v1 3 | kind: Pod 4 | metadata: 5 | name: transformer-keras 6 | annotations: 7 | # Values `communication_type` can take are ["hpull://", "hpush://"]. 8 | # Visit https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms 9 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 10 | wait_timeout: 10m 11 | spec: 12 | containers: 13 | - name: server 14 | image: aistorage/transformer_keras:latest 15 | imagePullPolicy: Always 16 | ports: 17 | - name: default 18 | containerPort: 80 19 | command: ["gunicorn", "--bind", "0.0.0.0:80", "--workers", "12", "app:app"] 20 | 21 | env: 22 | - name: FORMAT 23 | # expected values - PNG, JPEG, etc 24 | value: ${FORMAT:-"JPEG"} 25 | - name: TRANSFORM 26 | # MANDATORY: expected json string parameter-value pairs. 27 | # https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator#apply_transform 28 | # e.g. 
'{"theta":40, "brightness":0.8, "zx":0.9, "zy":0.9}' 29 | value: ${TRANSFORM} 30 | readinessProbe: 31 | httpGet: 32 | path: /health 33 | port: default 34 | -------------------------------------------------------------------------------- /transformers/keras_preprocess/flask-gunicorn/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | pillow 3 | scipy 4 | keras 5 | tensorflow 6 | Flask 7 | gunicorn -------------------------------------------------------------------------------- /transformers/keras_preprocess/http-multithreaded-server/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/python:slim 2 | 3 | COPY requirements.txt requirements.txt 4 | RUN pip3 install --upgrade -r requirements.txt 5 | 6 | RUN mkdir /code 7 | WORKDIR /code 8 | COPY server.py server.py 9 | 10 | ENV PYTHONUNBUFFERED 1 11 | 12 | EXPOSE 80 13 | -------------------------------------------------------------------------------- /transformers/keras_preprocess/http-multithreaded-server/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_keras:$(TAG) . 
13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_keras:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/keras_preprocess/http-multithreaded-server/pod.yaml: -------------------------------------------------------------------------------- 1 | # https://github.com/NVIDIA/ais-etl/blob/main/transformers/keras_transformer/README.md 2 | apiVersion: v1 3 | kind: Pod 4 | metadata: 5 | name: transformer-keras 6 | annotations: 7 | # Values `communication_type` can take are ["hpull://", "hpush://"]. 8 | # Visit https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms 9 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 10 | wait_timeout: 5m 11 | spec: 12 | containers: 13 | - name: server 14 | image: aistorage/transformer_keras:latest 15 | imagePullPolicy: Always 16 | ports: 17 | - name: default 18 | containerPort: 80 19 | command: ['/code/server.py', '--listen', '0.0.0.0', '--port', '80'] 20 | env: 21 | - name: FORMAT 22 | # expected values - PNG, JPEG, etc 23 | value: ${FORMAT:-"JPEG"} 24 | - name: TRANSFORM 25 | # MANDATORY: expected json string parameter-value pairs. 26 | # https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator#apply_transform 27 | # e.g. 
'{"theta":40, "brightness":0.8, "zx":0.9, "zy":0.9}' 28 | value: ${TRANSFORM} 29 | readinessProbe: 30 | httpGet: 31 | path: /health 32 | port: default 33 | -------------------------------------------------------------------------------- /transformers/keras_preprocess/http-multithreaded-server/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | pillow 3 | scipy 4 | keras 5 | tensorflow -------------------------------------------------------------------------------- /transformers/keras_preprocess/pod.yaml: -------------------------------------------------------------------------------- 1 | # https://github.com/NVIDIA/ais-etl/blob/main/transformers/keras_transformer/README.md 2 | apiVersion: v1 3 | kind: Pod 4 | metadata: 5 | name: transformer-keras 6 | annotations: 7 | # Values `communication_type` can take are ["hpull://", "hpush://"]. 8 | # Visit https://github.com/NVIDIA/aistore/blob/main/docs/etl.md#communication-mechanisms 9 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 10 | wait_timeout: 10m 11 | spec: 12 | containers: 13 | - name: server 14 | image: aistorage/transformer_keras:latest 15 | imagePullPolicy: Always 16 | ports: 17 | - name: default 18 | containerPort: 8000 19 | # change worker nodes to x2 of number of cores (cpu) available 20 | command: ["gunicorn", "main:app", "--workers", "12", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000"] 21 | env: 22 | - name: FORMAT 23 | # expected values - PNG, JPEG, etc 24 | value: ${FORMAT:-"JPEG"} 25 | - name: TRANSFORM 26 | # MANDATORY: expected json string parameter-value pairs. 27 | # https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator#apply_transform 28 | # e.g. 
FROM docker.io/library/python:3.13-alpine

# The requirement must be quoted: RUN uses /bin/sh, which would otherwise parse
# ">=1.13.6" as an output redirection, dropping the version floor and writing
# pip's output to a file named "=1.13.6".
RUN pip3 install --upgrade "aistore[etl]>=1.13.6"

# Set working directory
RUN mkdir /code
WORKDIR /code

# Copy app code
COPY flask_server.py fastapi_server.py http_server.py ./

# Environment setup
ENV PYTHONUNBUFFERED=1

# Expose the default port
EXPOSE 8000
13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_md5:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/md5/fastapi_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | MD5 Hashing ETL Transformer (Fast-API) 3 | 4 | This module implements an ETL transformer as a FastAPI-based server 5 | that computes the MD5 checksum of each incoming request's payload 6 | and returns the hexadecimal digest in the response body. 7 | 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 9 | """ 10 | 11 | import hashlib 12 | from aistore.sdk.etl.webserver.fastapi_server import FastAPIServer 13 | 14 | 15 | class Md5Server(FastAPIServer): 16 | """ 17 | FastAPI-based HTTP server for MD5 hashing. 18 | 19 | Inherits from FastAPIServer to handle concurrent transform requests. 20 | """ 21 | 22 | def transform(self, data: bytes, *_args) -> bytes: 23 | """ 24 | Compute the MD5 digest of the request payload. 25 | """ 26 | return hashlib.md5(data).hexdigest().encode() 27 | 28 | 29 | # Create the server instance and expose the FastAPI app 30 | fastapi_server = Md5Server(port=8000) 31 | fastapi_server.logger.setLevel("DEBUG") 32 | fastapi_app = fastapi_server.app # Expose the FastAPI app 33 | -------------------------------------------------------------------------------- /transformers/md5/flask_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | MD5 Hashing ETL Transformer (Flask) 3 | 4 | This module implements an ETL transformer as a Flask-based HTTP server 5 | that computes the MD5 checksum of each incoming request's payload 6 | and returns the hexadecimal digest in the response body. 7 | 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
9 | """ 10 | 11 | import hashlib 12 | from aistore.sdk.etl.webserver.flask_server import FlaskServer 13 | 14 | 15 | class Md5Server(FlaskServer): 16 | """ 17 | Flask-based HTTP server for MD5 hashing. 18 | 19 | Inherits from FlaskServer to handle concurrent transform requests. 20 | """ 21 | 22 | def transform(self, data: bytes, *_args) -> bytes: 23 | """ 24 | Compute the MD5 digest of the request payload. 25 | """ 26 | return hashlib.md5(data).hexdigest().encode() 27 | 28 | 29 | flask_server = Md5Server(port=8000) 30 | flask_server.logger.setLevel("DEBUG") 31 | flask_app = flask_server.app 32 | -------------------------------------------------------------------------------- /transformers/md5/http_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | MD5 Hashing ETL Transformer 3 | 4 | This module implements an ETL transformer as a multi-threaded HTTP server 5 | that computes the MD5 checksum of each incoming request's payload 6 | and returns the hexadecimal digest in the response body. 7 | 8 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 9 | """ 10 | 11 | import hashlib 12 | from aistore.sdk.etl.webserver.http_multi_threaded_server import HTTPMultiThreadedServer 13 | 14 | 15 | class Md5Server(HTTPMultiThreadedServer): 16 | """ 17 | Multi-threaded HTTP server for MD5 hashing. 18 | 19 | Inherits from HTTPMultiThreadedServer to handle concurrent transform 20 | requests. Each request body is hashed independently. 21 | """ 22 | 23 | def transform(self, data: bytes, *_args) -> bytes: 24 | """ 25 | Compute the MD5 digest of the request payload. 
26 | """ 27 | return hashlib.md5(data).hexdigest().encode() 28 | 29 | 30 | if __name__ == "__main__": 31 | server = Md5Server() 32 | server.logger.setLevel("DEBUG") 33 | server.start() 34 | -------------------------------------------------------------------------------- /transformers/md5/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-md5 5 | annotations: 6 | # Values it can take ["hpull://", "hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpush://\""} 8 | wait_timeout: 5m 9 | support_direct_put: "true" 10 | spec: 11 | containers: 12 | - name: server 13 | image: aistorage/transformer_md5:latest 14 | imagePullPolicy: Always 15 | ports: 16 | - name: default 17 | containerPort: 8000 18 | # for flask based app 19 | # command: ["gunicorn", "flask_server:flask_app", "--bind", "0.0.0.0:8000", "--workers", "4", "--log-level", "debug"] 20 | # for http based app 21 | # command: ["python", "http_server.py"] 22 | # for fastapi based app 23 | command: ["uvicorn", "fastapi_server:fastapi_app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--no-access-log"] 24 | readinessProbe: 25 | httpGet: 26 | path: /health 27 | port: default 28 | # If using `arg_type=fqn`, ensure the `mountPath` matches the file system path 29 | # where the objects are stored on AIStore targets. This allows the ETL container 30 | # to access the files directly by absolute path. 
31 | # volumeMounts: 32 | # - name: ais 33 | # mountPath: /mnt/data/ais 34 | # volumes: 35 | # - name: ais 36 | # hostPath: 37 | # path: /mnt/data/ais 38 | # type: Directory 39 | -------------------------------------------------------------------------------- /transformers/tar2tf/.dockerignore: -------------------------------------------------------------------------------- 1 | src/tar2tf_test.go 2 | src/tar-single.tar 3 | -------------------------------------------------------------------------------- /transformers/tar2tf/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/library/golang:1.21-alpine 2 | 3 | RUN apk add --no-cache git 4 | 5 | RUN mkdir $GOPATH/tar2tf 6 | WORKDIR $GOPATH/tar2tf 7 | COPY src/ ./ 8 | RUN go build -o tar2tf 9 | 10 | EXPOSE 80 11 | -------------------------------------------------------------------------------- /transformers/tar2tf/Makefile: -------------------------------------------------------------------------------- 1 | # Default image tag is 'latest' 2 | TAG := latest 3 | ifeq ($(GIT_TEST), true) 4 | TAG := test 5 | endif 6 | 7 | REGISTRY_URL ?= docker.io/aistorage 8 | 9 | all: build push 10 | 11 | build: 12 | docker build -t $(REGISTRY_URL)/transformer_tar2tf:$(TAG) . 13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_tar2tf:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/tar2tf/README.md: -------------------------------------------------------------------------------- 1 | # Tar2Tf transformer 2 | 3 | Tar2Tf transforms TAR and TAR.GZ files to TFRecord format. 4 | Additionally, it accepts optional parameters to apply conversions to TAR records and select subset of keys from a single TAR record. 
5 | 6 | ## Usage 7 | 8 | ### Build 9 | 10 | ```console 11 | $ cd src && go build 12 | ``` 13 | 14 | ### Run 15 | 16 | #### Run without any conversions and selections, on localhost:80 17 | ```console 18 | $ ./tar2tf -l localhost -p 80 19 | ``` 20 | 21 | #### Conversions and selections 22 | 23 | Currently there are 4 conversions available to apply to a TAR record. 24 | 25 | To specify conversions and selections, pass the `--spec` or `--spec-file` argument to the `./tar2tf` command. 26 | 27 | The `--spec` argument accepts the conversions and selections specification as a string. 28 | The `--spec-file` argument accepts the path to a file containing the conversions and selections specification. 29 | 30 | ##### Specification format 31 | 32 | ```json 33 | { 34 | "conversions": [ 35 | conversionSpec1, 36 | conversionSpec2, 37 | ... 38 | ], 39 | "selections": [ 40 | selectionSpec1, 41 | selectionSpec2, 42 | ... 43 | ] 44 | } 45 | ``` 46 | 47 | Conversions are applied in the order in which they occur in the specification. 48 | If no selections are provided, all keys from the TAR records, and their values, will be used. 49 | 50 | ##### Decode Conversion 51 | 52 | Decodes a PNG or JPEG image into an object, so that further image transformations can be applied. 53 | 54 | ```json 55 | { 56 | "type": "Decode", 57 | "ext_name": "png" 58 | } 59 | ``` 60 | 61 | ##### Rotate Conversion 62 | 63 | Rotates an image clockwise, according to the specified angle. If `angle == 0`, then a random rotation is applied. 64 | 65 | ```json 66 | { 67 | "type": "Rotate", 68 | "ext_name": "png", 69 | "angle": 90 70 | } 71 | ``` 72 | 73 | ##### Resize Conversion 74 | 75 | Resizes an image according to the specified destination size. 76 | 77 | ```json 78 | { 79 | "type": "Resize", 80 | "ext_name": "png", 81 | "sizes": [28, 28] 82 | } 83 | ``` 84 | 85 | ##### Rename Conversion 86 | 87 | Renames multiple keys to the specified key.
88 | 89 | ```json 90 | { 91 | "type": "Rename", 92 | "renames": { 93 | "img": ["png", "jpeg"], 94 | "video": ["mp4", "avi"] 95 | } 96 | } 97 | ``` 98 | 99 | > The specification above renames "png" and "jpeg" to "img", and renames "mp4" and "avi" to "video". 100 | 101 | ##### Selection 102 | 103 | Selects a single key from a TAR record. 104 | 105 | ```json 106 | { 107 | "ext_name": "png" 108 | } 109 | 110 | ``` 111 | 112 | #### Run with Decode and Rotate conversions and selections 113 | 114 | ```console 115 | $ echo >spec.json ' 116 | { 117 | "conversions": [ 118 | { 119 | "type": "Decode", 120 | "ext_name": "png" 121 | }, 122 | { 123 | "type": "Rotate", 124 | "ext_name": "png" 125 | } 126 | ], 127 | "selections": [ 128 | { 129 | "ext_name": "png" 130 | }, 131 | { 132 | "ext_name": "cls" 133 | } 134 | ] 135 | } 136 | ' 137 | 138 | $ ./tar2tf -l "0.0.0.0" -p 80 --spec-file spec.json 139 | ``` 140 | -------------------------------------------------------------------------------- /transformers/tar2tf/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: tar2tf 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 8 | wait_timeout: 5m 9 | spec: 10 | containers: 11 | - name: server 12 | image: aistorage/transformer_tar2tf:latest 13 | imagePullPolicy: IfNotPresent 14 | ports: 15 | - name: default 16 | containerPort: 80 17 | # To enable conversions, set OPTION_KEY to e.g. '--spec' or '--spec-file' and OPTION_VALUE to the specification (or the path to it). 18 | command: ['./tar2tf', '-l', '0.0.0.0', '-p', '80', '${OPTION_KEY}', '${OPTION_VALUE}'] 19 | readinessProbe: 20 | httpGet: 21 | path: /health 22 | port: default 23 | -------------------------------------------------------------------------------- /transformers/tar2tf/src/cmn/assert.go: -------------------------------------------------------------------------------- 1 | // Package cmn common low-level types and utilities 2 | /* 3 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. 
All rights reserved. 4 | */ 5 | package cmn 6 | 7 | import ( 8 | "log" 9 | ) 10 | 11 | func Assert(cond bool, msg string) { 12 | if !cond { 13 | panic(msg) 14 | } 15 | } 16 | 17 | func AssertNoErr(err error) { 18 | if err != nil { 19 | Assert(false, err.Error()) 20 | } 21 | } 22 | 23 | func Exit(err error) { 24 | if err != nil { 25 | log.Fatal(err.Error()) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /transformers/tar2tf/src/cmn/io.go: -------------------------------------------------------------------------------- 1 | // Package cmn common low-level types and utilities 2 | /* 3 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package cmn 6 | 7 | import ( 8 | "bytes" 9 | "io" 10 | "io/ioutil" 11 | "sync/atomic" 12 | ) 13 | 14 | type ( 15 | OnCloseReader struct { 16 | R io.Reader 17 | Cb func() 18 | } 19 | 20 | WriteCounter struct { 21 | totalBytesWritten int64 22 | } 23 | 24 | // ByteHandle is a byte buffer(made from []byte) that implements 25 | // ReadOpenCloser interface 26 | ByteHandle struct { 27 | b []byte 28 | *bytes.Reader 29 | } 30 | ) 31 | 32 | func (r *OnCloseReader) Read(p []byte) (int, error) { 33 | return r.R.Read(p) 34 | } 35 | 36 | func (r *OnCloseReader) Close() { 37 | r.Cb() 38 | } 39 | 40 | func (r *WriteCounter) Write(p []byte) (int, error) { 41 | atomic.AddInt64(&r.totalBytesWritten, int64(len(p))) 42 | return len(p), nil 43 | } 44 | 45 | func (r *WriteCounter) Size() int64 { 46 | return atomic.LoadInt64(&r.totalBytesWritten) 47 | } 48 | 49 | func CopySection(r io.Reader, w io.Writer, start, length int64) (n int64, err error) { 50 | // Discard first start bytes. 51 | n, err = io.CopyN(ioutil.Discard, r, start) 52 | if err != nil { 53 | return 0, err 54 | } 55 | 56 | // Write only length bytes. 
57 | return io.CopyN(w, r, length) 58 | } 59 | 60 | func NewByteHandle(bt []byte) *ByteHandle { 61 | return &ByteHandle{bt, bytes.NewReader(bt)} 62 | } 63 | 64 | func (b *ByteHandle) Close() error { 65 | return nil 66 | } 67 | func (b *ByteHandle) Open() (io.ReadCloser, error) { 68 | return ioutil.NopCloser(bytes.NewReader(b.b)), nil 69 | } 70 | -------------------------------------------------------------------------------- /transformers/tar2tf/src/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/ais-etl/transformers/tar2tf/src 2 | 3 | go 1.21 4 | 5 | require ( 6 | github.com/NVIDIA/go-tfdata v0.3.2-0.20200714114828-1432f6c70e3a 7 | github.com/disintegration/imaging v1.6.2 8 | github.com/json-iterator/go v1.1.12 9 | ) 10 | 11 | require ( 12 | github.com/golang/protobuf v1.5.3 // indirect 13 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 14 | github.com/modern-go/reflect2 v1.0.2 // indirect 15 | golang.org/x/image v0.24.0 // indirect 16 | google.golang.org/protobuf v1.33.0 // indirect 17 | ) 18 | -------------------------------------------------------------------------------- /transformers/tar2tf/src/tar-single.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tar2tf/src/tar-single.tar -------------------------------------------------------------------------------- /transformers/tar2tf/src/tar2tf_test.go: -------------------------------------------------------------------------------- 1 | // Package main is an entry point to Tar2Tf transformation 2 | /* 3 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 
4 | */ 5 | package main 6 | 7 | import ( 8 | "bytes" 9 | "net/http" 10 | "net/url" 11 | "os" 12 | "testing" 13 | 14 | "github.com/NVIDIA/go-tfdata/tfdata/core" 15 | ) 16 | 17 | const tarPath = "tar-single.tar" 18 | 19 | func mockRequest(t *testing.T) (r *http.Request) { 20 | var err error 21 | 22 | r = &http.Request{} 23 | r.Body, err = os.Open(tarPath) 24 | r.URL = &url.URL{} 25 | if err != nil { 26 | t.Fatal(err.Error()) 27 | } 28 | return r 29 | } 30 | 31 | func TestTar2TfSimple(t *testing.T) { 32 | initVars("localhost", 8080, nil) 33 | 34 | var ( 35 | req = mockRequest(t) 36 | buff = bytes.NewBuffer(nil) 37 | ) 38 | 39 | err := onTheFlyTransformWholeObject(req, buff) 40 | if err != nil { 41 | t.Fatal(err.Error()) 42 | } 43 | 44 | r := core.NewTFRecordReader(buff) 45 | examples, err := r.ReadAllExamples(1) 46 | if err != nil { 47 | t.Fatal(err.Error()) 48 | } 49 | if len(examples) != 1 { 50 | t.Fatalf("expected 1 example, got %d", len(examples)) 51 | } 52 | } 53 | 54 | func TestTar2TfConvTransform(t *testing.T) { 55 | var ( 56 | req = mockRequest(t) 57 | buff = bytes.NewBuffer(nil) 58 | 59 | filterSpec = []byte(` 60 | { 61 | "conversions": [ 62 | { 63 | "type": "Decode", 64 | "ext_name": "png" 65 | }, 66 | { 67 | "type": "Rotate", 68 | "ext_name": "png" 69 | } 70 | ], 71 | "selections": [ 72 | { 73 | "ext_name": "png" 74 | }, 75 | { 76 | "ext_name": "cls" 77 | } 78 | ] 79 | } 80 | `) 81 | ) 82 | 83 | initVars("localhost", 8080, filterSpec) 84 | err := onTheFlyTransformWholeObject(req, buff) 85 | if err != nil { 86 | t.Fatal(err.Error()) 87 | } 88 | 89 | r := core.NewTFRecordReader(buff) 90 | examples, err := r.ReadAllExamples(1) 91 | if err != nil { 92 | t.Fatal(err.Error()) 93 | } 94 | if len(examples) != 1 { 95 | t.Fatalf("expected 1 example, got %d", len(examples)) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /transformers/tar2tf/src/transforms/pipeline.go: 
-------------------------------------------------------------------------------- 1 | // Package transforms provides tools to transform TAR to TFRecords files 2 | /* 3 | * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. 4 | */ 5 | package transforms 6 | 7 | import ( 8 | "io" 9 | 10 | "github.com/NVIDIA/go-tfdata/tfdata/core" 11 | "github.com/NVIDIA/go-tfdata/tfdata/pipeline" 12 | "github.com/NVIDIA/go-tfdata/tfdata/transform" 13 | ) 14 | 15 | func CreatePipeline(r io.Reader, w io.Writer, isTarGz bool, job *TransformJob) *pipeline.DefaultPipeline { 16 | if job != nil { 17 | return transformPipeline(r, w, isTarGz, job) 18 | } 19 | return defaultPipeline(r, w, isTarGz) 20 | } 21 | 22 | func defaultPipeline(r io.Reader, w io.Writer, isTarGz bool) *pipeline.DefaultPipeline { 23 | p := pipeline.NewPipeline() 24 | if isTarGz { 25 | p.FromTarGz(r) 26 | } else { 27 | p.FromTar(r) 28 | } 29 | return p.SampleToTFExample().ToTFRecord(w, 8) 30 | } 31 | 32 | func transformPipeline(r io.Reader, w io.Writer, isTarGz bool, job *TransformJob) *pipeline.DefaultPipeline { 33 | p := pipeline.NewPipeline() 34 | if isTarGz { 35 | p.FromTarGz(r) 36 | } else { 37 | p.FromTar(r) 38 | } 39 | 40 | var transformations []transform.SampleTransformation 41 | transformations = append(transformations, job.Conversions...) 42 | if len(job.Selections) > 0 { // Select everything by default. 
43 | transformations = append(transformations, transform.SampleSelections(job.Selections...)) 44 | } 45 | p.TransformSamples(transformations...).WithSample2TFExampleStage(func(sr core.SampleReader) core.TFExampleReader { 46 | return &SampleToTFExampleReader{SampleReader: sr} 47 | }).ToTFRecord(w) 48 | return p 49 | } 50 | -------------------------------------------------------------------------------- /transformers/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/__init__.py -------------------------------------------------------------------------------- /transformers/tests/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # pylint: disable=missing-class-docstring, missing-function-docstring, missing-module-docstring 5 | 6 | import os 7 | import unittest 8 | from tests.utils import generate_random_string, log_etl 9 | from aistore.sdk.errors import ErrETLNotFound 10 | from aistore import Client 11 | 12 | 13 | class TestBase(unittest.TestCase): 14 | def setUp(self): 15 | self.endpoint = os.environ.get("AIS_ENDPOINT", "http://192.168.49.2:8080") 16 | self.git_test_mode = os.getenv("GIT_TEST", "false") 17 | self.client = Client(self.endpoint) 18 | self.test_bck = self.client.bucket( 19 | "test-bucket" + generate_random_string() 20 | ).create(exist_ok=True) 21 | self.etls = [] 22 | 23 | def tearDown(self): 24 | self.test_bck.delete() 25 | for etl_name in self.etls: 26 | try: 27 | log_etl(self.client, etl_name) 28 | self.client.etl(etl_name).stop() 29 | self.client.etl(etl_name).delete() 30 | except ErrETLNotFound: 31 | # ETL might be already deleted 32 | pass 33 | -------------------------------------------------------------------------------- /transformers/tests/requirements.txt: 
-------------------------------------------------------------------------------- 1 | aistore>=1.13.5 2 | filetype 3 | keras 4 | numpy 5 | pillow 6 | pyyaml 7 | requests 8 | scikit-image 9 | scipy 10 | keras 11 | pytest 12 | tensorflow 13 | opencv-python 14 | aiofiles 15 | kaggle 16 | typing-extensions>=4.3.0 -------------------------------------------------------------------------------- /transformers/tests/resources/test-audio-flac.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-audio-flac.flac -------------------------------------------------------------------------------- /transformers/tests/resources/test-audio-mp3.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-audio-mp3.mp3 -------------------------------------------------------------------------------- /transformers/tests/resources/test-audio-wav.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-audio-wav.wav -------------------------------------------------------------------------------- /transformers/tests/resources/test-face-detection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-face-detection.png -------------------------------------------------------------------------------- /transformers/tests/resources/test-image.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-image.jpg -------------------------------------------------------------------------------- /transformers/tests/resources/test-image.jpg.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-image.jpg.bz2 -------------------------------------------------------------------------------- /transformers/tests/resources/test-image.jpg.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-image.jpg.gz -------------------------------------------------------------------------------- /transformers/tests/resources/test-manifest.jsonl: -------------------------------------------------------------------------------- 1 | {"id":"test-audio-wav", "from_time":0, "to_time": 1, "part": 0} 2 | {"id":"test-audio-wav", "from_time":1, "to_time": 2, "part": 1} 3 | {"id":"test-audio-wav", "from_time":2, "to_time": 3, "part": 2} -------------------------------------------------------------------------------- /transformers/tests/resources/test-tar-single.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-tar-single.tar -------------------------------------------------------------------------------- /transformers/tests/resources/test-text.txt: -------------------------------------------------------------------------------- 1 | Quod equidem non reprehendo; 2 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Quibus natura iure responderit non esse verum aliunde finem beate vivendi, a se principia rei gerendae peti; Quae enim adhuc protulisti, popularia sunt, ego autem a te elegantiora desidero. Duo Reges: constructio interrete. Tum Lucius: Mihi vero ista valde probata sunt, quod item fratri puto. Bestiarum vero nullum iudicium puto. Nihil enim iam habes, quod ad corpus referas; Deinde prima illa, quae in congressu solemus: Quid tu, inquit, huc? Et homini, qui ceteris animantibus plurimum praestat, praecipue a natura nihil datum esse dicemus? 3 | 4 | Iam id ipsum absurdum, maximum malum neglegi. Quod ea non occurrentia fingunt, vincunt Aristonem; Atqui perspicuum est hominem e corpore animoque constare, cum primae sint animi partes, secundae corporis. Fieri, inquam, Triari, nullo pacto potest, ut non dicas, quid non probes eius, a quo dissentias. Equidem e Cn. An dubium est, quin virtus ita maximam partem optineat in rebus humanis, ut reliquas obruat? 5 | 6 | Quis istum dolorem timet? 7 | Summus dolor plures dies manere non potest? Dicet pro me ipsa virtus nec dubitabit isti vestro beato M. Tubulum fuisse, qua illum, cuius is condemnatus est rogatione, P. Quod si ita sit, cur opera philosophiae sit danda nescio. 8 | 9 | Ex eorum enim scriptis et institutis cum omnis doctrina liberalis, omnis historia. 10 | Quod si ita est, sequitur id ipsum, quod te velle video, omnes semper beatos esse sapientes. Cum enim fertur quasi torrens oratio, quamvis multa cuiusque modi rapiat, nihil tamen teneas, nihil apprehendas, nusquam orationem rapidam coerceas. Ita redarguitur ipse a sese, convincunturque scripta eius probitate ipsius ac moribus. At quanta conantur! Mundum hunc omnem oppidum esse nostrum! Incendi igitur eos, qui audiunt, vides. Vide, ne magis, inquam, tuum fuerit, cum re idem tibi, quod mihi, videretur, non nova te rebus nomina inponere. 
Qui-vere falsone, quaerere mittimus-dicitur oculis se privasse; Si ista mala sunt, in quae potest incidere sapiens, sapientem esse non esse ad beate vivendum satis. At vero si ad vitem sensus accesserit, ut appetitum quendam habeat et per se ipsa moveatur, quid facturam putas? 11 | 12 | Quem si tenueris, non modo meum Ciceronem, sed etiam me ipsum abducas licebit. 13 | Stulti autem malorum memoria torquentur, sapientes bona praeterita grata recordatione renovata delectant. 14 | Esse enim quam vellet iniquus iustus poterat inpune. 15 | Quae autem natura suae primae institutionis oblita est? 16 | Verum tamen cum de rebus grandioribus dicas, ipsae res verba rapiunt; 17 | Hoc est non modo cor non habere, sed ne palatum quidem. 18 | Voluptatem cum summum bonum diceret, primum in eo ipso parum vidit, deinde hoc quoque alienum; Sed tu istuc dixti bene Latine, parum plane. Nam haec ipsa mihi erunt in promptu, quae modo audivi, nec ante aggrediar, quam te ab istis, quos dicis, instructum videro. Fatebuntur Stoici haec omnia dicta esse praeclare, neque eam causam Zenoni desciscendi fuisse. Non autem hoc: igitur ne illud quidem. Ratio quidem vestra sic cogit. Cum audissem Antiochum, Brute, ut solebam, cum M. An quod ita callida est, ut optime possit architectari voluptates? 19 | 20 | Idemne, quod iucunde? 21 | Haec mihi videtur delicatior, ut ita dicam, molliorque ratio, quam virtutis vis gravitasque postulat. Sed quoniam et advesperascit et mihi ad villam revertendum est, nunc quidem hactenus; Cuius ad naturam apta ratio vera illa et summa lex a philosophis dicitur. Neque solum ea communia, verum etiam paria esse dixerunt. Sed nunc, quod agimus; A mene tu? 
-------------------------------------------------------------------------------- /transformers/tests/resources/test-text.txt.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-text.txt.bz2 -------------------------------------------------------------------------------- /transformers/tests/resources/test-text.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ais-etl/5387018437c7908d34b425003bee14c793d2ad3d/transformers/tests/resources/test-text.txt.gz -------------------------------------------------------------------------------- /transformers/tests/test_audio_split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pytest suite for the Audio Splitter ETL Transformer. 3 | 4 | For each combination of communication mode and FQN-flag, this test: 5 | 1. Uploads sample audio files into a fresh bucket. 6 | 2. Initializes the Audio Splitter ETL with fixed from/to times. 7 | 3. Fetches each transformed segment and compares it 8 | against a locally-trimmed version for bitwise equality. 9 | """ 10 | 11 | import logging 12 | from io import BytesIO 13 | from itertools import product 14 | from pathlib import Path 15 | from typing import Dict 16 | 17 | import pytest 18 | import soundfile as sf 19 | from aistore.sdk import Bucket 20 | from aistore.sdk.etl import ETLConfig 21 | 22 | from tests.const import AUDIO_SPLITTER_TEMPLATE, COMM_TYPES, FQN_OPTIONS 23 | 24 | logger = logging.getLogger(__name__) 25 | logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s") 26 | 27 | 28 | def trim_audio_bytes(buf: bytes, audio_format: str, start: float, end: float) -> bytes: 29 | """ 30 | Trim `buf` audio between `start` and `end` seconds and return WAV bytes. 
31 | """ 32 | bio = BytesIO(buf) 33 | with sf.SoundFile(bio, mode="r") as src: 34 | sr, ch = src.samplerate, src.channels 35 | start_frame = int(start * sr) 36 | end_frame = int(end * sr) 37 | src.seek(start_frame) 38 | frames = src.read(end_frame - start_frame) 39 | 40 | out = BytesIO() 41 | with sf.SoundFile( 42 | out, mode="w", samplerate=sr, channels=ch, format=audio_format 43 | ) as dst: 44 | dst.write(frames) 45 | return out.getvalue() 46 | 47 | 48 | @pytest.mark.parametrize("comm_type,use_fqn", product(COMM_TYPES, FQN_OPTIONS)) 49 | def test_audio_splitter_transform( 50 | test_bck: Bucket, 51 | local_audio_files: Dict[str, Path], 52 | etl_factory, 53 | comm_type: str, 54 | use_fqn: bool, 55 | ) -> None: 56 | """ 57 | Validate the Audio Splitter ETL transformer. 58 | 59 | Args: 60 | test_bck: fresh bucket fixture 61 | local_audio_files: map of filename -> Path for inputs 62 | etl_factory: factory to init & cleanup ETLs 63 | comm_type: one of COMM_TYPES 64 | use_fqn: whether to pass FQN as argument 65 | """ 66 | # 1) upload 67 | file_name = "test-audio-wav.wav" 68 | path = local_audio_files[file_name] 69 | test_bck.object(file_name).get_writer().put_file(path) 70 | 71 | # 2) init with fixed times 72 | from_t, to_t = 1.0, 2.0 73 | args = {"from_time": f"{from_t:.2f}", "to_time": f"{to_t:.2f}"} 74 | etl_name = etl_factory( 75 | tag="audio-splitter", 76 | server_type="fastapi", 77 | template=AUDIO_SPLITTER_TEMPLATE, 78 | communication_type=comm_type, 79 | use_fqn=use_fqn, 80 | direct_put=True, 81 | ) 82 | logger.info("Initialized ETL %s (comm=%s, fqn=%s)", etl_name, comm_type, use_fqn) 83 | 84 | # 3) fetch & compare 85 | reader = test_bck.object(file_name).get_reader(etl=ETLConfig(etl_name, args=args)) 86 | transformed = reader.read_all() 87 | original = Path(path).read_bytes() 88 | expected = trim_audio_bytes(original, "wav", from_t, to_t) 89 | 90 | assert transformed == expected, f"{file_name}: payload mismatch (ETL={etl_name})" 91 | 
-------------------------------------------------------------------------------- /transformers/tests/test_batch_rename.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | """ 4 | 5 | import logging 6 | import re 7 | from pathlib import Path 8 | from typing import Dict 9 | from itertools import product 10 | 11 | import pytest 12 | from aistore.sdk import Bucket 13 | from aistore.sdk.etl import ETLConfig 14 | 15 | from tests.const import ( 16 | BATCH_RENAME_TEMPLATE, 17 | COMM_TYPES, 18 | FQN_OPTIONS, 19 | ) 20 | 21 | # Configure module-level logger 22 | logger = logging.getLogger(__name__) 23 | logging.basicConfig( 24 | level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s" 25 | ) 26 | 27 | 28 | def _verify_renamed_files( 29 | bucket: Bucket, 30 | local_files: Dict[str, Path], 31 | etl_name: str, 32 | pattern: str, 33 | prefix: str, 34 | ) -> None: 35 | """ 36 | Verifies the output of the ETL transformer: 37 | - Ensures transformed objects match original content. 38 | - If a filename matches the pattern, it should also appear under a new prefixed name. 
39 | """ 40 | for filename, path in local_files.items(): 41 | original_data = Path(path).read_bytes() 42 | output_data = ( 43 | bucket.object(filename).get_reader(etl=ETLConfig(etl_name)).read_all() 44 | ) 45 | assert ( 46 | output_data == original_data 47 | ), f"{filename} was not echoed correctly by ETL '{etl_name}'" 48 | 49 | if re.match(pattern, filename): 50 | renamed_path = f"{prefix}{filename}" 51 | renamed_data = bucket.object(renamed_path).get_reader().read_all() 52 | assert ( 53 | renamed_data == original_data 54 | ), f"{filename} was not renamed correctly to {renamed_path}" 55 | 56 | 57 | @pytest.mark.parametrize("comm_type, use_fqn", product(COMM_TYPES, FQN_OPTIONS)) 58 | def test_batch_rename_transformer( 59 | test_bck: Bucket, 60 | local_audio_files: Dict[str, Path], 61 | etl_factory, 62 | endpoint: str, 63 | comm_type: str, 64 | use_fqn: bool, 65 | ) -> None: 66 | """ 67 | Integration test for the Batch Rename ETL transformer. 68 | Uploads audio files to a bucket, initializes the transformer, 69 | and verifies renaming behavior using ETL output. 
70 | """ 71 | pattern = r".*\.flac$" 72 | prefix = "renamed_" 73 | 74 | # Upload input files to the test bucket 75 | for fname, fpath in local_audio_files.items(): 76 | test_bck.object(fname).get_writer().put_file(str(fpath)) 77 | 78 | # Build transformer spec 79 | transformer_spec = BATCH_RENAME_TEMPLATE.format( 80 | communication_type="{communication_type}", 81 | direct_put="{direct_put}", 82 | command="{command}", 83 | ais_endpoint=endpoint, 84 | bck_name=test_bck.name, 85 | regex_pattern=pattern, 86 | dst_prefix=prefix, 87 | ) 88 | 89 | # Initialize transformer 90 | etl_name = etl_factory( 91 | tag="batch-rename", 92 | server_type="fastapi", 93 | template=transformer_spec, 94 | communication_type=comm_type, 95 | use_fqn=use_fqn, 96 | direct_put="true", 97 | ) 98 | logger.info( 99 | "Initialized ETL '%s' (server=fastapi, comm=%s, fqn=%s)", 100 | etl_name, 101 | comm_type, 102 | use_fqn, 103 | ) 104 | 105 | # Validate output 106 | _verify_renamed_files(test_bck, local_audio_files, etl_name, pattern, prefix) 107 | -------------------------------------------------------------------------------- /transformers/tests/test_face_detection_stress.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stress testing Face Detection Transformer for 1 Million objects across all communication types. 3 | 4 | Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. 
5 | """ 6 | 7 | import logging 8 | from datetime import datetime 9 | 10 | from aistore.sdk.etl.etl_const import ETL_COMM_HPULL, ETL_COMM_HPUSH 11 | from aistore.sdk.etl.etl_templates import FACE_DETECTION_TRANSFORMER 12 | 13 | from tests.base import TestBase 14 | from tests.utils import ( 15 | format_image_tag_for_git_test_mode, 16 | cases, 17 | generate_random_string, 18 | ) 19 | 20 | # Configure logging 21 | logging.basicConfig( 22 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 23 | level=logging.INFO, 24 | ) 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | class TestFaceDetectionStress(TestBase): 29 | """Stress test for AIStore ETL Face Detection transformation on a large dataset.""" 30 | 31 | def setUp(self): 32 | """Sets up the test environment by defining the source bucket for face detection.""" 33 | super().setUp() 34 | self.images_bck = self.client.bucket(bck_name="stress-test-face-detection") 35 | 36 | @cases( 37 | (ETL_COMM_HPUSH, "hpush_fastapi", ""), 38 | (ETL_COMM_HPULL, "hpull_fastapi", ""), 39 | "", 40 | (ETL_COMM_HPULL, "hpull_fastapi_fqn", "fqn"), 41 | (ETL_COMM_HPUSH, "hpush_fastapi_fqn", "fqn"), 42 | ) 43 | def test_face_detection(self, test_case): 44 | comm_type, test_suffix, arg_type = test_case 45 | """Stress test face detection ETL transformation using various communication types.""" 46 | test_name = f"test_face_detection_{test_suffix}" 47 | etl_name = f"face-detect-{generate_random_string(5)}-{test_suffix}" 48 | self.etls.append(etl_name) 49 | 50 | self.initialize_etl(comm_type, etl_name, arg_type) 51 | self.execute_etl_job(test_name, etl_name) 52 | 53 | def initialize_etl(self, comm_type: str, etl_name: str, arg_type: str): 54 | """Initializes the ETL transformation with the specified parameters.""" 55 | template = FACE_DETECTION_TRANSFORMER.format( 56 | communication_type=comm_type, format="jpg", arg_type=arg_type 57 | ) 58 | 59 | # Adjust template for Git test mode 60 | template = 
format_image_tag_for_git_test_mode(template, "face_detection") 61 | 62 | # Initialize ETL transformation 63 | self.client.etl(etl_name).init_spec( 64 | template=template, communication_type=comm_type, arg_type=arg_type 65 | ) 66 | 67 | logger.info( 68 | "Initialized ETL: %s\n%s", etl_name, self.client.etl(etl_name).view() 69 | ) 70 | 71 | def execute_etl_job(self, test_name: str, etl_name: str): 72 | """Executes the ETL transformation job and validates results.""" 73 | start_time = datetime.now() 74 | 75 | # Start the transformation job 76 | job_id = self.images_bck.transform( 77 | etl_name=etl_name, timeout="5m", to_bck=self.test_bck 78 | ) 79 | 80 | # Wait for job completion 81 | self.client.job(job_id).wait(timeout=600, verbose=False) 82 | 83 | # Calculate time taken 84 | time_elapsed = datetime.now() - start_time 85 | 86 | # Verify job status 87 | job_status = self.client.job(job_id).status() 88 | self.assertEqual( 89 | job_status.err, "", f"ETL Job {job_id} failed with error: {job_status.err}" 90 | ) 91 | 92 | # Ensure object count matches between source and destination 93 | src_objects = len(self.images_bck.list_all_objects()) 94 | dest_objects = len(self.test_bck.list_all_objects()) 95 | self.assertEqual( 96 | src_objects, 97 | dest_objects, 98 | f"Mismatch in object count: {src_objects} vs {dest_objects}", 99 | ) 100 | 101 | logger.info("Test: %s | Duration: %s", test_name, time_elapsed) 102 | 103 | # Log results to metrics file 104 | with open("metrics.txt", "a+", encoding="utf-8") as file: 105 | file.write(f"{test_name} {time_elapsed}\n") 106 | -------------------------------------------------------------------------------- /transformers/tests/test_hash_with_args.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pytest suite for the HashWithArgs ETL transformer. 3 | 4 | Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
5 | """ 6 | 7 | import random 8 | import logging 9 | from pathlib import Path 10 | from typing import Dict 11 | 12 | import pytest 13 | import xxhash 14 | from aistore.sdk.etl import ETLConfig 15 | from aistore.sdk import Bucket 16 | 17 | from tests.const import ( 18 | INLINE_PARAM_COMBINATIONS, 19 | HASH_WITH_ARGS_TEMPLATE, 20 | ) 21 | 22 | # Configure module-level logger 23 | logger = logging.getLogger(__name__) 24 | logging.basicConfig( 25 | level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s: %(message)s" 26 | ) 27 | 28 | 29 | def _upload_test_files(test_bck: Bucket, local_files: Dict[str, Path]) -> None: 30 | """ 31 | Upload files to the specified bucket. 32 | """ 33 | for filename, path in local_files.items(): 34 | logger.debug("Uploading %s to bucket %s", filename, test_bck.name) 35 | test_bck.object(filename).get_writer().put_file(str(path)) 36 | 37 | 38 | def _calculate_hash(data, seed): 39 | """Computes the seeded hash of a given file.""" 40 | hasher = xxhash.xxh64(seed=seed) 41 | hasher.update(data) 42 | return hasher.hexdigest().encode() 43 | 44 | 45 | def _verify_test_files( 46 | test_bck: Bucket, 47 | local_files: Dict[str, Path], 48 | etl_name: str, 49 | ) -> None: 50 | """ 51 | Verify that the files in the bucket match the hash. 
52 | """ 53 | for filename, path in local_files.items(): 54 | seed = random.randint(0, 1000) 55 | reader = test_bck.object(filename).get_reader( 56 | etl=ETLConfig(etl_name, args=str(seed)) 57 | ) 58 | transformed = reader.read_all() 59 | original = Path(path).read_bytes() 60 | original_hash = _calculate_hash(original, seed) 61 | assert ( 62 | transformed == original_hash 63 | ), f"Hash mismatch for {filename}: expected {original_hash}, got {transformed}" 64 | 65 | 66 | # pylint: disable=too-many-arguments 67 | @pytest.mark.parametrize("server_type, comm_type, use_fqn", INLINE_PARAM_COMBINATIONS) 68 | def test_echo_transformer( 69 | test_bck: Bucket, 70 | local_files: Dict[str, Path], 71 | etl_factory, 72 | server_type: str, 73 | comm_type: str, 74 | use_fqn: bool, 75 | ) -> None: 76 | """ 77 | Validate the Python-based Hash With Args ETL transformer. 78 | Upload sample files, initialize the ETL, then assert hash. 79 | """ 80 | # Upload inputs 81 | _upload_test_files(test_bck, local_files) 82 | 83 | # Build and initialize ETL 84 | etl_name = etl_factory( 85 | tag="hash-with-args", 86 | server_type=server_type, 87 | template=HASH_WITH_ARGS_TEMPLATE, 88 | communication_type=comm_type, 89 | use_fqn=use_fqn, 90 | ) 91 | logger.info( 92 | "Initialized HashWithArgs ETL '%s' (server=%s, comm=%s, fqn=%s)", 93 | etl_name, 94 | server_type, 95 | comm_type, 96 | use_fqn, 97 | ) 98 | 99 | _verify_test_files( 100 | test_bck, 101 | local_files, 102 | etl_name, 103 | ) 104 | -------------------------------------------------------------------------------- /transformers/tests/test_hello_world.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pytest suite for the Hello-World ETL transformer. 3 | 4 | For each combination of server framework (Flask, FastAPI, HTTP), communication mode (hpull/hpush), 5 | and argument style (FQN vs relative), this test: 6 | 1. Uploads two sample files into a fresh bucket. 7 | 2. 
# pylint: disable=too-many-arguments
@pytest.mark.parametrize("server_type, comm_type, use_fqn", INLINE_PARAM_COMBINATIONS)
def test_hello_world_transformer(
    test_bck: Bucket,
    local_files: dict[str, Path],
    etl_factory: callable,
    server_type: str,
    comm_type: str,
    use_fqn: bool,
) -> None:
    """
    Transform local_files via the Hello-World ETL and verify output.

    Args:
        test_bck: bucket fixture the sample files are uploaded to.
        local_files: mapping filename -> local Path of sample inputs.
        etl_factory: fixture called to create the ETL; returns the ETL name.
        server_type: framework to use ('flask', 'fastapi', 'http').
        comm_type: ETL_COMM_HPULL or ETL_COMM_HPUSH.
        use_fqn: whether to pass objects by fully-qualified name.
    """
    # Upload sample files (path passed as str, matching the other test modules)
    for filename, path in local_files.items():
        logger.debug("Uploading %s to bucket %s", filename, test_bck.name)
        test_bck.object(filename).get_writer().put_file(str(path))

    # Build and initialize ETL
    etl_name = etl_factory(
        tag="hello-world",
        server_type=server_type,
        template=HELLO_WORLD_TEMPLATE,
        communication_type=comm_type,
        use_fqn=use_fqn,
    )
    logger.info(
        "Initialized Hello-World ETL '%s' (server=%s, comm=%s, fqn=%s)",
        etl_name,
        server_type,
        comm_type,
        use_fqn,
    )

    # Execute transform and assert on each file: the transformer is expected to
    # return the same constant payload regardless of the input object.
    for filename in local_files:
        reader = test_bck.object(filename).get_reader(etl=ETLConfig(etl_name))
        output = reader.read_all()
        assert (
            output == b"Hello World!"
        ), f"ETL {etl_name} produced unexpected output for '{filename}': {output!r}"
# pylint: disable=too-many-arguments, too-many-locals
@pytest.mark.stress
@pytest.mark.parametrize(
    "server_type, comm_type, use_fqn, direct_put", PARAM_COMBINATIONS
)
def test_hello_world_stress(
    stress_client,
    stress_bucket: Bucket,
    test_bck: Bucket,
    etl_factory,
    stress_metrics,
    stress_object_count,
    server_type: str,
    comm_type: str,
    use_fqn: bool,
    direct_put: str,
):
    """
    Stress test for Hello-World ETL: transform `stress_bucket` into `test_bck`.

    Steps: initialize the ETL, run a bucket-to-bucket transform job, check that
    the destination holds `stress_object_count` objects, verify the payload of
    a random sample, and append `(label, duration)` to `stress_metrics`.
    """
    # 1) Initialize ETL
    label = LABEL_FMT.format(
        name="HELLO WORLD",
        server=server_type,
        comm=comm_type,
        arg="fqn" if use_fqn else "",
        direct=direct_put,
    )
    etl_name = etl_factory(
        tag="hello-world",
        server_type=server_type,
        template=HELLO_WORLD_TEMPLATE,
        communication_type=comm_type,
        use_fqn=use_fqn,
        direct_put=direct_put,
    )

    # 2) Run transform job (the 600 s wait matches the job's "10m" timeout)
    job_id = stress_bucket.transform(
        etl_name=etl_name,
        to_bck=test_bck,
        num_workers=24,
        timeout="10m",
    )
    job = stress_client.job(job_id)
    job.wait(timeout=600)
    duration = job.get_total_time()

    logger.info(
        "ETL '%s' completed in %ss (srv=%s, comm=%s, fqn=%s)",
        etl_name,
        duration,
        server_type,
        comm_type,
        use_fqn,
    )

    # 3) Verify counts
    objs = list(test_bck.list_all_objects())
    assert (
        len(objs) == stress_object_count
    ), f"Expected {stress_object_count} objects, got {len(objs)}"

    # 4) Sample and verify payload
    # Cap the sample size: random.sample raises ValueError when asked for more
    # items than the population holds (e.g. a run configured with < 10 objects).
    samples = random.sample(objs, min(10, len(objs)))
    for entry in samples:
        data = test_bck.object(entry.name).get_reader().read_all()
        assert data == b"Hello World!", f"Mismatch in object {entry.name}"

    # 5) Record metric
    stress_metrics.append((label, duration))
39 | """ 40 | arg_type = "fqn" if fqn_flag else "" 41 | 42 | # Generate a unique ETL name 43 | etl_name = f"keras-transformer-{generate_random_string(5)}" 44 | self.etls.append(etl_name) 45 | 46 | # Generate the ETL template 47 | template = KERAS_TRANSFORMER.format( 48 | communication_type=comm_type, 49 | format="JPEG", 50 | transform='{"theta":40, "brightness":0.8, "zx":0.9, "zy":0.9}', 51 | arg_type=arg_type, 52 | ) 53 | 54 | # Initialize ETL transformation 55 | self.client.etl(etl_name).init_spec( 56 | template=template, communication_type=comm_type, arg_type=arg_type 57 | ) 58 | 59 | logger.info( 60 | "Starting ETL test: %s (ETL: %s)\n%s", 61 | test_name, 62 | etl_name, 63 | self.client.etl(etl_name).view(), 64 | ) 65 | 66 | start_time = datetime.now() 67 | 68 | # Start transformation job 69 | job_id = self.images_bck.transform( 70 | etl_name=etl_name, 71 | timeout="30m", 72 | to_bck=self.test_bck, 73 | ext={"JPEG": "JPEG"}, 74 | ) 75 | 76 | # Wait for the job to complete 77 | self.client.job(job_id).wait(timeout=1800) 78 | time_elapsed = datetime.now() - start_time 79 | 80 | # Check job status 81 | job_status = self.client.job(job_id).status() 82 | self.assertEqual( 83 | job_status.err, "", f"ETL Job {job_id} failed with error: {job_status.err}" 84 | ) 85 | 86 | # Ensure all images were transformed correctly 87 | self.assertEqual( 88 | len(self.images_bck.list_all_objects()), 89 | len(self.test_bck.list_all_objects()), 90 | "Mismatch in number of transformed images.", 91 | ) 92 | 93 | logger.info("Test: %s | Duration: %s", test_name, time_elapsed) 94 | 95 | # Log results to a metrics file 96 | with open("metrics.txt", "a+", encoding="utf-8") as file: 97 | file.write(f"{test_name} {time_elapsed}\n") 98 | 99 | @cases( 100 | (ETL_COMM_HPUSH, "test_keras_hpush_fastapi", False), 101 | (ETL_COMM_HPULL, "test_keras_hpull_fastapi", False), 102 | (ETL_COMM_HPULL, "test_keras_hpull_fastapi_fqn", True), 103 | (ETL_COMM_HPUSH, "test_keras_hpush_fastapi_fqn", True), 104 | 
# pylint: disable=too-many-arguments
@pytest.mark.parametrize("server_type, comm_type, use_fqn", INLINE_PARAM_COMBINATIONS)
def test_md5_transformer(
    test_bck: Bucket,
    local_files: Dict[str, Path],
    etl_factory,
    server_type: str,
    comm_type: str,
    use_fqn: bool,
) -> None:
    """
    Check that the MD5 ETL returns the MD5 hex digest of every uploaded file.

    Args:
        test_bck: bucket fixture the inputs are uploaded to
        local_files: mapping of filename -> Path for inputs
        etl_factory: factory fixture used to create the ETL; returns its name
        server_type: 'flask' | 'fastapi' | 'http'
        comm_type: ETL_COMM_HPULL | ETL_COMM_HPUSH
        use_fqn: whether to pass FQN or relative paths
    """
    # Step 1: push every sample file into the bucket
    for obj_name, src_path in local_files.items():
        logging.debug("Uploading %s to %s", obj_name, test_bck.name)
        writer = test_bck.object(obj_name).get_writer()
        writer.put_file(str(src_path))

    # Step 2: create the ETL for this server/comm/FQN combination
    factory_kwargs = {
        "tag": "md5",
        "server_type": server_type,
        "template": MD5_TEMPLATE,
        "communication_type": comm_type,
        "use_fqn": use_fqn,
    }
    etl_name = etl_factory(**factory_kwargs)
    logging.info(
        "Initialized MD5 ETL '%s' (server=%s, comm=%s, fqn=%s)",
        etl_name,
        server_type,
        comm_type,
        use_fqn,
    )

    # Step 3: compare the ETL output with the locally computed checksum
    for filename, path in local_files.items():
        # reference digest of the untouched local file, as bytes
        local_digest = hashlib.md5(Path(path).read_bytes()).hexdigest()
        expected = local_digest.encode()

        # same object, read back through the ETL
        reader = test_bck.object(filename).get_reader(etl=ETLConfig(etl_name))
        result_bytes = reader.read_all()

        assert (
            result_bytes == expected
        ), f"ETL {etl_name} MD5 mismatch for {filename}: expected {expected!r}, got {result_bytes!r}"
# pylint: disable=too-many-arguments, too-many-locals
@pytest.mark.stress
@pytest.mark.parametrize(
    "server_type, comm_type, use_fqn, direct_put", PARAM_COMBINATIONS
)
def test_md5_stress(
    stress_client,
    stress_bucket: Bucket,
    test_bck: Bucket,
    etl_factory,
    stress_metrics,
    stress_object_count,
    server_type: str,
    comm_type: str,
    use_fqn: bool,
    direct_put: str,
):
    """
    Stress test for MD5 ETL: transform `stress_bucket` into `test_bck`.

    Steps: initialize the ETL, run a bucket-to-bucket transform job, check that
    the destination holds `stress_object_count` objects, compare a random
    sample against locally computed MD5 digests of the source objects, and
    append `(label, duration)` to `stress_metrics`.
    """
    # 1) Initialize ETL
    label = LABEL_FMT.format(
        name="MD5",
        server=server_type,
        comm=comm_type,
        arg="fqn" if use_fqn else "",
        direct=direct_put,
    )
    etl_name = etl_factory(
        tag="md5",
        server_type=server_type,
        template=MD5_TEMPLATE,
        communication_type=comm_type,
        use_fqn=use_fqn,
        direct_put=direct_put,
    )

    # 2) Run transform job (the 600 s wait matches the job's "10m" timeout)
    job_id = stress_bucket.transform(
        etl_name=etl_name,
        to_bck=test_bck,
        num_workers=24,
        timeout="10m",
    )
    job = stress_client.job(job_id)
    job.wait(timeout=600)
    duration = job.get_total_time()

    logger.info(
        "ETL '%s' completed in %ss (srv=%s, comm=%s, fqn=%s)",
        etl_name,
        duration,
        server_type,
        comm_type,
        use_fqn,
    )

    # 3) Verify counts
    objs = list(test_bck.list_all_objects())
    assert (
        len(objs) == stress_object_count
    ), f"Expected {stress_object_count} objects, got {len(objs)}"

    # 4) Sample and verify payload
    # Cap the sample size: random.sample raises ValueError when asked for more
    # items than the population holds (e.g. a run configured with < 10 objects).
    samples = random.sample(objs, min(10, len(objs)))
    for entry in samples:
        # transformed object in the destination bucket
        data = test_bck.object(entry.name).get_reader().read_all()
        # untransformed object with the same name in the source bucket
        source_bytes = stress_bucket.object(entry.name).get_reader().read_all()
        expected = hashlib.md5(source_bytes).hexdigest().encode()
        assert data == expected, f"MD5 checksum not matching for {entry.name}"

    # 5) Record metric
    stress_metrics.append((label, duration))
class TestTorchVisionTransformer(TestBase):
    """Unit tests for TorchVision-based image transformations using AIStore ETL."""

    def setUp(self):
        """Set up test environment by uploading a test image to the bucket."""
        super().setUp()
        self.test_image_filename = "test-image.jpg"
        # NOTE(review): relative path, so this resolves against the current
        # working directory the tests are launched from.
        self.test_image_source = "./resources/test-image.jpg"

        self.test_bck.object(self.test_image_filename).get_writer().put_file(
            self.test_image_source
        )

    def run_torchvision_test(self, communication_type):
        """
        Compares AIStore ETL-transformed images with locally transformed images.

        Args:
            communication_type (str): The ETL communication type (HPULL, HPUSH).
        """
        etl_name = f"torchvision-transformer-{generate_random_string(5)}"
        # Registered on self.etls; presumably cleaned up by TestBase — verify.
        self.etls.append(etl_name)

        # Define AIStore ETL transformation template.
        # The transform spec must stay in sync with get_transformed_image_local().
        template = TORCHVISION_TRANSFORMER.format(
            communication_type=communication_type,
            transform='{"Resize": {"size": [100, 100]}, "Grayscale": {"num_output_channels": 1}}',
            format="JPEG",
        )

        # Modify template for Git test mode
        if self.git_test_mode:
            template = format_image_tag_for_git_test_mode(template, "torchvision")

        # Initialize ETL and apply transformation via AIStore
        self.client.etl(etl_name).init_spec(
            template=template, communication_type=communication_type, timeout="10m"
        )

        etl_transformed_image_bytes = (
            self.test_bck.object(self.test_image_filename)
            .get_reader(etl=ETLConfig(etl_name))
            .read_all()
        )

        # Perform the same transformation locally using TorchVision
        transformed_image_bytes = self.get_transformed_image_local()

        # Assert that AIStore ETL and local transformations produce identical outputs
        self.assertEqual(transformed_image_bytes, etl_transformed_image_bytes)

    def get_transformed_image_local(self) -> bytes:
        """
        Applies the same transformation locally using TorchVision to compare against AIStore ETL output.

        Returns:
            bytes: The locally transformed image in JPEG format.
        """
        transform = transforms.Compose(
            [
                transforms.Resize((100, 100)),  # Resize to 100x100 pixels
                transforms.Grayscale(num_output_channels=1),  # Convert to grayscale
            ]
        )
        # Open through a context manager so the file handle is released as soon
        # as the pixel data has been converted to a tensor.
        with Image.open(self.test_image_source) as image:
            source_tensor = transforms.ToTensor()(image)
        transformed_tensor = transform(source_tensor)
        transformed_image = transforms.ToPILImage()(transformed_tensor)

        # Convert transformed image to bytes
        byte_arr = io.BytesIO()
        transformed_image.save(byte_arr, format="JPEG")
        return byte_arr.getvalue()

    @cases(
        ETL_COMM_HPULL,
        ETL_COMM_HPUSH,
    )
    def test_torchvision_transform(self, communication_type):
        """Runs the TorchVision ETL transformation for different communication types."""
        self.run_torchvision_test(communication_type)
def cases(*args):
    """
    Decorator for running a test method once per test case, each as a subtest.

    The decorated method is invoked as ``func(self, case, ...)`` for every value
    in ``args``, inside ``self.subTest(arg=case)``, so ``self`` must provide a
    ``subTest`` context manager (as ``unittest.TestCase`` does).

    Args:
        *args: Test cases; each one is passed to the test method as its first
            argument after ``self``.

    Returns:
        A decorator that wraps the test method.
    """
    # Local import: this module does not import functools at file level.
    import functools

    def decorator(func):
        # Preserve the wrapped test's __name__/__doc__ so reports and
        # introspection show the real test instead of "wrapper".
        @functools.wraps(func)
        def wrapper(self, *inner_args, **kwargs):
            for arg in args:
                with self.subTest(arg=arg):
                    func(self, arg, *inner_args, **kwargs)

        return wrapper

    return decorator
# Image for the torchvision transformer's multithreaded HTTP server
# (transformers/torchvision_preprocess/http-multithreaded-server).
# NOTE(review): the base tag is unpinned ("slim"), unlike the sibling
# torchvision_preprocess/Dockerfile which uses python:3.9-slim.
FROM docker.io/library/python:slim

COPY requirements.txt requirements.txt
# --no-cache-dir keeps pip's download cache out of the image layer
# (same flag as the sibling torchvision_preprocess/Dockerfile).
RUN pip3 install --no-cache-dir -r requirements.txt

RUN mkdir /code
WORKDIR /code
COPY server.py server.py

# key=value form; the whitespace-separated "ENV key value" form is legacy syntax.
ENV PYTHONUNBUFFERED=1

EXPOSE 80
13 | 14 | push: 15 | docker push $(REGISTRY_URL)/transformer_torchvision:$(TAG) 16 | -------------------------------------------------------------------------------- /transformers/torchvision_preprocess/http-multithreaded-server/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-torchvision 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 8 | wait_timeout: 5m 9 | spec: 10 | containers: 11 | - name: server 12 | image: aistorage/transformer_torchvision:latest 13 | imagePullPolicy: Always 14 | ports: 15 | - name: default 16 | containerPort: 80 17 | command: ['/code/server.py', '--listen', '0.0.0.0', '--port', '80'] 18 | env: 19 | - name: FORMAT 20 | # Expected Values - PNG, JPEG, etc. 21 | value: ${FORMAT} 22 | - name: TRANSFORM 23 | # MANDATORY: Expected JSON string parameter-value pairs. 24 | # https://pytorch.org/vision/0.9/transforms.html 25 | # e.g. '{"ColorJitter": {"brightness": 0.8, "contrast": 0.4}, "RandomRotation": {"degrees": 30}}' 26 | value: ${TRANSFORM} 27 | # This is a health check endpoint which one should specify 28 | # for aistore to determine the health of the ETL container. 
29 | readinessProbe: 30 | httpGet: 31 | path: /health 32 | port: default 33 | -------------------------------------------------------------------------------- /transformers/torchvision_preprocess/http-multithreaded-server/requirements.txt: -------------------------------------------------------------------------------- 1 | pillow 2 | requests 3 | torchvision -------------------------------------------------------------------------------- /transformers/torchvision_preprocess/pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: transformer-torchvision 5 | annotations: 6 | # Values it can take ["hpull://","hpush://"] 7 | communication_type: ${COMMUNICATION_TYPE:-"\"hpull://\""} 8 | wait_timeout: 5m 9 | spec: 10 | containers: 11 | - name: server 12 | image: aistorage/transformer_torchvision:latest 13 | imagePullPolicy: Always 14 | ports: 15 | - name: default 16 | containerPort: 8000 17 | command: ["gunicorn", "main:app", "--workers", "4", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000"] 18 | env: 19 | - name: FORMAT 20 | # Expected Values - PNG, JPEG, etc. 21 | value: ${FORMAT} 22 | - name: TRANSFORM 23 | # MANDATORY: Expected JSON string parameter-value pairs. 24 | # https://pytorch.org/vision/0.9/transforms.html 25 | # e.g. '{"ColorJitter": {"brightness": 0.8, "contrast": 0.4}, "RandomRotation": {"degrees": 30}}' 26 | value: ${TRANSFORM} 27 | # This is a health check endpoint which one should specify 28 | # for aistore to determine the health of the ETL container. 
29 | readinessProbe: 30 | httpGet: 31 | path: /health 32 | port: default 33 | -------------------------------------------------------------------------------- /transformers/torchvision_preprocess/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.109.1 2 | uvicorn==0.24.0.post1 3 | gunicorn==23.0.0 4 | aiohttp>=3.9.2 5 | pillow==10.3.0 6 | torchvision==0.21.0 --------------------------------------------------------------------------------