├── .devcontainer
    ├── Dockerfile
    └── devcontainer.json
├── .dockerignore
├── .env
├── .gitattributes
├── .github
    ├── pull_request_template.md
    └── workflows
    │   ├── build_ecr_image.yaml
    │   ├── build_on_merge.yaml
    │   ├── publish_executor_containers.yaml
    │   ├── publish_indexify_pypi.yaml
    │   ├── publish_indexify_server.yaml
    │   ├── tests.yaml
    │   ├── ui.yaml
    │   └── wf_build_indexify_server_release_packages.yaml
├── .gitignore
├── .gitmodules
├── .prettierignore
├── .prettierrc.yaml
├── .repo
    └── conf
    │   └── distributions
├── .vscode
    └── settings.json
├── LICENSE
├── README.md
├── docs
    ├── README.md
    ├── api-reference
    │   ├── documents
    │   │   ├── extract
    │   │   │   ├── extract-file-async.mdx
    │   │   │   └── extract-file-sync.mdx
    │   │   ├── process
    │   │   │   ├── parse-file-async.mdx
    │   │   │   └── parse-file-sync.mdx
    │   │   └── retrieve
    │   │   │   └── retrieve-result.mdx
    │   ├── openapi.json
    │   ├── root.mdx
    │   └── upload
    │   │   └── upload-file.mdx
    ├── architecture.mdx
    ├── cli-ui
    │   └── ui.mdx
    ├── comparisons.mdx
    ├── develop_extractors.mdx
    ├── development.mdx
    ├── download
    │   └── index.mdx
    ├── examples
    │   └── index.mdx
    ├── extractors.mdx
    ├── favicon.svg
    ├── images
    │   ├── Content_AI_Content.png
    │   ├── Extractor_Transformation_Concept.png
    │   ├── GS_Content.png
    │   ├── GS_ExtractionGraph.png
    │   ├── GS_Vector_Indexes.png
    │   ├── Homepage_Diagram.png
    │   ├── Indexify_Architecture_Distributed.png
    │   ├── Indexify_Architecture_Extractors.png
    │   ├── Indexify_Architecture_Ingestion_Server.png
    │   ├── Indexify_Architecture_Local.png
    │   ├── Indexify_Architecture_Server.png
    │   ├── Indexify_Home_Diagram.gif
    │   ├── Indexify_KAT.gif
    │   ├── Indexify_Logo_Wordmark.png
    │   ├── Indexify_Logo_Wordmark_Dark.png
    │   ├── PDF_Extraction_Demo-VEED.gif
    │   ├── PDF_Usecase.png
    │   ├── System_Architecture_Diagram.png
    │   ├── Tensorlake_Logo_LG.png
    │   ├── content_extractor_concept.jpg
    │   ├── content_extractor_concept.png
    │   ├── docs_intro_diagram.png
    │   ├── extraction_graph_getting_started.png
    │   ├── extraction_graph_key_concept.png
    │   ├── extractors_list.png
    │   ├── grafana
    │   │   └── total_requests.png
    │   ├── indexify_high_level.svg
    │   ├── indexify_high_level_abstract.png
    │   ├── indexify_high_level_idea.png
    │   ├── jaeger
    │   │   ├── detailed.png
    │   │   ├── flamegraph.png
    │   │   ├── stats.png
    │   │   └── traces.png
    │   ├── key_concepts_block_diagram.png
    │   ├── key_concepts_embeddings.png
    │   ├── key_concepts_extraction_policy.png
    │   ├── key_concepts_feature_extraction.png
    │   ├── key_concepts_transform.png
    │   ├── logo.png
    │   ├── logo
    │   │   ├── TL-Color.svg
    │   │   ├── TL-Dark.svg
    │   │   ├── TL-Light.svg
    │   │   └── favicon.png
    │   ├── ui_compute_graph_definition.png
    │   ├── ui_compute_graphs.png
    │   ├── ui_executors.png
    │   ├── ui_invocation_outputs.png
    │   ├── ui_invocation_tasks.png
    │   ├── ui_invocations.png
    │   └── ui_namespaces.png
    ├── inc-adoption.mdx
    ├── integrations
    │   ├── dspy.mdx
    │   └── langchain.mdx
    ├── introduction.mdx
    ├── key-concepts.mdx
    ├── mint.json
    ├── operations
    │   ├── configuration.mdx
    │   ├── deployment.mdx
    │   ├── develop.mdx
    │   ├── gpu-deployment.mdx
    │   ├── kubernetes.mdx
    │   └── metrics.mdx
    ├── package-lock.json
    ├── package.json
    ├── packaging-dependencies.mdx
    ├── sdks
    │   ├── compute_graphs.mdx
    │   ├── functions.mdx
    │   ├── index.mdx
    │   ├── namespaces.mdx
    │   └── retrieval.mdx
    └── usecases
    │   ├── audio_extraction.mdx
    │   ├── image_retrieval.mdx
    │   ├── pdf_extraction.mdx
    │   ├── rag.mdx
    │   └── video_rag.mdx
├── examples
    ├── basic_embedding
    │   └── workflow.py
    ├── container_images
    │   └── transformers.py
    ├── contextual_rag
    │   ├── README.md
    │   ├── __init__.py
    │   └── workflow.py
    ├── knowledge_graph
    │   ├── README.md
    │   ├── docker-compose.yml
    │   ├── requirements.txt
    │   └── workflow.py
    ├── object_detection
    │   ├── .gitignore
    │   ├── README.md
    │   ├── docker-compose.yaml
    │   └── workflow.py
    ├── pdf_document_extraction
    │   ├── .gitignore
    │   ├── README.md
    │   ├── __init__.py
    │   ├── chroma_db_retrieve.py
    │   ├── chromadb_writer.py
    │   ├── common_objects.py
    │   ├── docker-compose.yaml
    │   ├── elastic_writer.py
    │   ├── embedding.py
    │   ├── es_retrieve.py
    │   ├── images.py
    │   ├── lancedb_functions.py
    │   ├── pdf_parser_docling.py
    │   ├── requirements.txt
    │   └── workflow.py
    ├── pdf_structured_extraction
    │   ├── .env
    │   ├── docker-compose.yaml
    │   ├── document_ai_api_version_workflow.py
    │   └── workflow.py
    ├── readme
    │   ├── distributed_map.py
    │   ├── map_reduce_example.py
    │   ├── text_embedder.py
    │   └── website.py
    ├── tweetsgenerator
    │   ├── README.md
    │   ├── docker-compose.yml
    │   ├── requirements.txt
    │   └── workflow.py
    ├── video_summarization
    │   ├── README.md
    │   ├── docker-compose.yml
    │   ├── requirements.txt
    │   └── workflow.py
    └── website_audio_summary
    │   ├── README.md
    │   ├── docker-compose.yml
    │   ├── requirements.txt
    │   └── workflow.py
├── indexify
    ├── Makefile
    ├── README.md
    ├── poetry.lock
    ├── pyproject.toml
    ├── src
    │   └── indexify
    │   │   ├── cli
    │   │       ├── __init__.py
    │   │       ├── build_image.py
    │   │       ├── deploy.py
    │   │       └── executor.py
    │   │   ├── executor
    │   │       ├── README.md
    │   │       ├── blob_store
    │   │       │   ├── blob_store.py
    │   │       │   ├── local_fs_blob_store.py
    │   │       │   ├── metrics
    │   │       │   │   └── blob_store.py
    │   │       │   └── s3_blob_store.py
    │   │       ├── channel_manager.py
    │   │       ├── executor.py
    │   │       ├── function_allowlist.py
    │   │       ├── function_executor
    │   │       │   ├── function_executor.py
    │   │       │   ├── health_checker.py
    │   │       │   ├── invocation_state_client.py
    │   │       │   ├── metrics
    │   │       │   │   ├── function_executor.py
    │   │       │   │   ├── health_checker.py
    │   │       │   │   └── invocation_state_client.py
    │   │       │   └── server
    │   │       │   │   ├── client_configuration.py
    │   │       │   │   ├── function_executor_server.py
    │   │       │   │   ├── function_executor_server_factory.py
    │   │       │   │   ├── subprocess_function_executor_server.py
    │   │       │   │   └── subprocess_function_executor_server_factory.py
    │   │       ├── function_executor_controller
    │   │       │   ├── __init__.py
    │   │       │   ├── completed_task_metrics.py
    │   │       │   ├── create_function_executor.py
    │   │       │   ├── debug_event_loop.py
    │   │       │   ├── destroy_function_executor.py
    │   │       │   ├── downloads.py
    │   │       │   ├── events.py
    │   │       │   ├── function_executor_controller.py
    │   │       │   ├── loggers.py
    │   │       │   ├── message_validators.py
    │   │       │   ├── metrics
    │   │       │   │   ├── completed_task_metrics.py
    │   │       │   │   ├── downloads.py
    │   │       │   │   ├── function_executor_controller.py
    │   │       │   │   ├── run_task.py
    │   │       │   │   └── upload_task_output.py
    │   │       │   ├── prepare_task.py
    │   │       │   ├── run_task.py
    │   │       │   ├── task_info.py
    │   │       │   ├── task_output.py
    │   │       │   └── upload_task_output.py
    │   │       ├── host_resources
    │   │       │   ├── host_resources.py
    │   │       │   ├── nvidia_gpu.py
    │   │       │   └── nvidia_gpu_allocator.py
    │   │       ├── metrics
    │   │       │   ├── channel_manager.py
    │   │       │   ├── executor.py
    │   │       │   ├── state_reconciler.py
    │   │       │   └── state_reporter.py
    │   │       ├── monitoring
    │   │       │   ├── handler.py
    │   │       │   ├── health_check_handler.py
    │   │       │   ├── health_checker
    │   │       │   │   ├── generic_health_checker.py
    │   │       │   │   └── health_checker.py
    │   │       │   ├── metrics.py
    │   │       │   ├── prometheus_metrics_handler.py
    │   │       │   ├── server.py
    │   │       │   └── startup_probe_handler.py
    │   │       ├── state_reconciler.py
    │   │       └── state_reporter.py
    │   │   └── proto
    │   │       ├── executor_api.proto
    │   │       ├── executor_api_pb2.py
    │   │       ├── executor_api_pb2.pyi
    │   │       └── executor_api_pb2_grpc.py
    └── tests
    │   ├── .gitignore
    │   ├── README.md
    │   ├── cli
    │       ├── test_environment_variables.py
    │       ├── test_function_allowlist.py
    │       ├── test_invoke_duration.py
    │       ├── test_metrics.py
    │       ├── test_server_task_distribution.py
    │       ├── test_startup_probe.py
    │       └── testing.py
    │   ├── executor
    │       ├── constants.py
    │       ├── test_function_executor_health_checker.py
    │       ├── test_function_executor_routing.py
    │       ├── test_graph_timeouts.py
    │       ├── test_metrics.py
    │       └── testing.py
    │   └── run_tests.sh
├── operations
    └── k8s
    │   ├── README.md
    │   └── helm
    │       ├── .helmignore
    │       ├── Chart.lock
    │       ├── Chart.yaml
    │       ├── local.yaml
    │       ├── templates
    │           ├── _helpers.tpl
    │           ├── config.yaml
    │           ├── executor.yaml
    │           ├── extra.yaml
    │           ├── ingress.yaml
    │           ├── secret.yaml
    │           └── server.yaml
    │       └── values.yaml
└── server
    ├── .dockerignore
    ├── .repo
        └── conf
        │   └── distributions
    ├── Cargo.lock
    ├── Cargo.toml
    ├── Cross.toml
    ├── Makefile
    ├── blob_store
        ├── Cargo.toml
        └── src
        │   └── lib.rs
    ├── build.rs
    ├── data_model
        ├── Cargo.toml
        └── src
        │   ├── filter.rs
        │   ├── host_resources_tests.rs
        │   ├── lib.rs
        │   └── test_objects.rs
    ├── dockerfiles
        ├── Dockerfile.builder_linux_aarch64
        ├── Dockerfile.builder_linux_x86
        ├── Dockerfile.local
        └── Dockerfile.release_server
    ├── indexify_ui
        ├── Cargo.toml
        ├── build.rs
        └── src
        │   └── lib.rs
    ├── metrics
        ├── Cargo.toml
        └── src
        │   └── lib.rs
    ├── processor
        ├── Cargo.toml
        └── src
        │   ├── gc.rs
        │   ├── graph_processor.rs
        │   ├── lib.rs
        │   ├── task_allocator.rs
        │   ├── task_cache.rs
        │   ├── task_creator.rs
        │   └── task_scheduler.rs
    ├── proto
        └── executor_api.proto
    ├── rust-toolchain.toml
    ├── rustfmt.toml
    ├── sample_config.yaml
    ├── src
        ├── config.rs
        ├── executor_api.rs
        ├── executors.rs
        ├── gc_test.rs
        ├── http_objects.rs
        ├── index.html
        ├── integration_test.rs
        ├── main.rs
        ├── openapi.rs
        ├── reconciliation_test.rs
        ├── routes.rs
        ├── routes
        │   ├── download.rs
        │   ├── internal_ingest.rs
        │   ├── invoke.rs
        │   └── logs.rs
        ├── service.rs
        └── testing.rs
    ├── state_store
        ├── Cargo.toml
        └── src
        │   ├── in_memory_state.rs
        │   ├── invocation_events.rs
        │   ├── kv.rs
        │   ├── lib.rs
        │   ├── migration_runner.rs
        │   ├── migrations
        │       ├── contexts.rs
        │       ├── migration_trait.rs
        │       ├── mod.rs
        │       ├── registry.rs
        │       ├── testing.rs
        │       ├── v1_task_status.rs
        │       ├── v2_invocation_ctx_timestamps.rs
        │       ├── v3_invocation_ctx_secondary_index.rs
        │       ├── v4_drop_executors.rs
        │       ├── v5_allocation_keys.rs
        │       ├── v6_clean_orphaned_tasks.rs
        │       ├── v7_reset_allocated_tasks.rs
        │       └── v8_rebuild_invocation_ctx_secondary_index.rs
        │   ├── requests.rs
        │   ├── scanner.rs
        │   ├── serializer.rs
        │   ├── state_changes.rs
        │   ├── state_machine.rs
        │   └── test_state_store.rs
    ├── ui
        ├── .dockerignore
        ├── .gitignore
        ├── package-lock.json
        ├── package.json
        ├── public
        │   ├── favicon.ico
        │   ├── index.html
        │   ├── logo.svg
        │   ├── manifest.json
        │   └── robots.txt
        ├── src
        │   ├── App.css
        │   ├── components
        │   │   ├── CopyText.tsx
        │   │   ├── CopyTextPopover.tsx
        │   │   ├── Footer.tsx
        │   │   ├── InfoBox.tsx
        │   │   ├── TruncatedDescription.tsx
        │   │   ├── TruncatedText.tsx
        │   │   ├── VersionDisplay.tsx
        │   │   ├── cards
        │   │   │   ├── ComputeGraphsCard.tsx
        │   │   │   ├── ExecutorsCard.tsx
        │   │   │   └── NamespacesCard.tsx
        │   │   └── tables
        │   │   │   ├── ComputeGraphTable.tsx
        │   │   │   ├── InvocationOutputTable.tsx
        │   │   │   ├── InvocationTasksTable.tsx
        │   │   │   └── InvocationsTable.tsx
        │   ├── error-page.tsx
        │   ├── index.css
        │   ├── index.tsx
        │   ├── react-app-env.d.ts
        │   ├── routes
        │   │   ├── Namespace
        │   │   │   ├── ComputeGraphsPage.tsx
        │   │   │   ├── ExecutorsPage.tsx
        │   │   │   ├── IndividualComputeGraphPage.tsx
        │   │   │   ├── IndividualInvocationPage.tsx
        │   │   │   ├── NamespacesPage.tsx
        │   │   │   ├── index.ts
        │   │   │   └── types.ts
        │   │   └── root.tsx
        │   ├── setupTests.ts
        │   ├── theme.ts
        │   ├── types.ts
        │   └── utils
        │   │   ├── helpers.ts
        │   │   └── loaders.ts
        ├── tsconfig.json
        └── vscode
        │   └── editor.json
    └── utils
        ├── Cargo.toml
        └── src
            ├── dynamic_sleep.rs
            └── lib.rs


/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | ENV DEBIAN_FRONTEND=noninteractive
 3 | 
 4 | RUN apt update
 5 | RUN apt -y install software-properties-common
 6 | RUN add-apt-repository ppa:deadsnakes/ppa -y
 7 | RUN apt -y remove python3.10
 8 | 
 9 | RUN apt -y install build-essential make sudo cmake g++ sqlite3 python3.11 python3.11-distutils python3.11-venv python3.11-dev protobuf-compiler protobuf-compiler-grpc ca-certificates curl gnupg pkg-config libssl-dev vim docker-compose libclang-dev librocksdb-dev postgresql
10 | 
11 | RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
12 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain stable -y
13 | # rustfmt nightly is used for formatting
14 | RUN /root/.cargo/bin/rustup toolchain install nightly
15 | 
16 | RUN curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash
17 | RUN bash -c 'export NVM_DIR="$([ -z "${XDG_CONFIG_HOME-}" ] && printf %s "${HOME}/.nvm" || printf %s "${XDG_CONFIG_HOME}/nvm")" && \. "$NVM_DIR/nvm.sh" && nvm install stable'
18 | 
19 | # Start PostgreSQL and create the database in ONE layer: a server started in its own RUN is no longer running when the next RUN begins.
20 | RUN service postgresql start && bash -c 'for attempt in $(seq 1 30); do sudo -u postgres psql -c "CREATE DATABASE indexify" && exit 0; sleep 2; done; echo "could not create the indexify database" >&2; exit 1' && service postgresql stop
21 | 
22 | RUN echo 'alias python=python3.11' >> ~/.bashrc
23 | RUN echo 'export PYTHON=python3.11' >> ~/.bashrc
24 | RUN echo 'export PYO3_CROSS_PYTHON_VERSION=3.11' >> ~/.bashrc
25 | RUN echo 'export PYTHONPATH=${PYTHONPATH}:/workspaces/indexify/extractors' >> ~/.bashrc
26 | 


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the
 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
 3 | {
 4 | 	"name": "Development Container",
 5 | 	"build": {
 6 | 		// Sets the run context to one level up instead of the .devcontainer folder.
 7 | 		"context": "..",
 8 | 		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
 9 | 		"dockerfile": "Dockerfile"
10 | 	},
11 | 	// Features to add to the dev container. More info: https://containers.dev/features.
12 | 	// "features": {},
13 | 	"features": {
14 | 		"ghcr.io/devcontainers/features/docker-in-docker:2": {
15 | 			"dockerDashComposeVersion": "v2"
16 | 		}
17 | 	},
18 | 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
19 | 	"forwardPorts": [
20 | 		8900
21 | 	],
22 | 	"mounts": [
23 | 		"source=${localEnv:HOME}/.cache/huggingface,target=/root/.cache/huggingface,type=bind,consistency=cached"
24 | 	],
25 | 	"customizations": {
26 | 		"vscode": {
27 | 			"extensions": [
28 | 				"rust-lang.rust-analyzer",
29 | 				"tamasfe.even-better-toml", // for Cargo.toml
30 | 				"eamodio.gitlens", // IDE Git information
31 | 				"davidanson.vscode-markdownlint",
32 | 				"ms-azuretools.vscode-docker", // Docker integration and linting
33 | 				"shardulm94.trailing-spaces", // Show trailing spaces
34 | 				"Gruntfuggly.todo-tree", // Highlights TODO comments
35 | 				"bierner.emojisense", // Emoji sense for markdown
36 | 				"stkb.rewrap", // rewrap comments after n characters on one line
37 | 				"vscode-icons-team.vscode-icons", // Better file extension icons
38 | 				// Colorize your output/test logs
39 | 				"IBM.output-colorizer"
40 | 			],
41 | 			"settings": {
42 | 				"editor.formatOnSave": true,
43 | 				"rust-analyzer.rustfmt.extraArgs": [
44 | 					"+nightly"
45 | 				]
46 | 			}
47 | 		}
48 | 	}
49 | 	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
50 | 	//"remoteUser": "devcontainer"
51 | }


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | docs
 2 | target
 3 | .vscode
 4 | *.db
 5 | ve
 6 | site
 7 | *.pyc
 8 | sdk-py/dist
 9 | sdk-typescript/node_modules
10 | sdk-typescript/lib
11 | *.tgz
12 | 
13 | 


--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
1 | DATABASE_URL=postgres://postgres:postgres@localhost/indexify
2 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | docs/images/PDF_Extraction_Demo-VEED.gif filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | ## Context
 2 | 
 3 | <!--
 4 | In a few sentences or less, please explain the context behind this change to help answer why this change is needed.
 5 | 
 6 | If this is a bug fix, make sure to include "fixes #xxxx", or
 7 | "closes #xxxx".
 8 | 
 9 | Screenshots, logs, code or other visual aids are greatly appreciated.
10 |  -->
11 | 
12 | ## What
13 | 
14 | <!--
15 | In a few sentences, please summarize the change to help reviewers.
16 | 
17 | Consider providing screenshots, logs, code or other visual aids to help the reviewer understand the approach taken.
18 | -->
19 | 
20 | ## Testing
21 | 
22 | <!--
23 | Please include steps used to verify the change.
24 | 
25 | Consider providing screenshots, logs, code or other visual aids to help the reviewer in their testing.
26 | -->
27 | 
28 | ## Contribution Checklist
29 | 
30 | - [ ] If a Python package was changed, please run `make fmt` in the package directory.
31 | - [ ] If the server was changed, please run `make fmt` in `server/`.
32 | - [ ] Make sure all PR Checks are passing.
33 | <!--
34 | Notes:
35 | 
36 | Tests of a Python package can be run manually. Start a Server and an Executor then
37 | run `make test` in the Python package directory.
38 | 
39 | To test if changes to the server are backward compatible with the latest
40 | release, label the PR with `ci_compat_test`. This might report failures
41 | unrelated to your change if previous incompatible changes were pushed without
42 | being released yet.
43 | -->
44 | 


--------------------------------------------------------------------------------
/.github/workflows/build_ecr_image.yaml:
--------------------------------------------------------------------------------
 1 | name: Build ECR images
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |     inputs:
 6 |       environment:
 7 |         description: "Environment to post the image to"
 8 |         required: true
 9 |         type: string
10 | 
11 |   workflow_call:
12 |     inputs:
13 |       environment:
14 |         description: "Environment to post the image to"
15 |         required: true
16 |         type: string
17 |         
18 | permissions:
19 |   id-token: write
20 |   contents: read
21 | 
22 | defaults:
23 |   run:
24 |     working-directory: ./server
25 | 
26 | jobs:
27 |   build-and-push:
28 |     name: Build indexify server and push to ${{ inputs.environment }}
29 |     environment: ${{ inputs.environment }}
30 |     runs-on: ubuntu-latest
31 |     steps:
32 |       - name: Checkout
33 |         uses: actions/checkout@v4
34 | 
35 |       - name: Configure AWS credentials
36 |         uses: aws-actions/configure-aws-credentials@v4
37 |         with:
38 |           role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
39 |           role-session-name: github-actions-platform-api
40 |           aws-region: ${{ secrets.AWS_REGION }}
41 |         
42 |       - name: Login to Amazon ECR
43 |         id: login-ecr
44 |         uses: aws-actions/amazon-ecr-login@v2
45 | 
46 |       - name: Set up QEMU
47 |         uses: docker/setup-qemu-action@v3
48 | 
49 |       - name: Set up Docker Buildx
50 |         uses: docker/setup-buildx-action@v3
51 | 
52 |       - run: |
53 |           docker buildx build --platform=linux/amd64 --push . -t ${{ steps.login-ecr.outputs.registry }}/indexify-server:${{ github.sha }} -t ${{ steps.login-ecr.outputs.registry }}/indexify-server:latest -f dockerfiles/Dockerfile.local
54 | 


--------------------------------------------------------------------------------
/.github/workflows/build_on_merge.yaml:
--------------------------------------------------------------------------------
 1 | name: Build on merge
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - 'main'
 7 |       - 'server-sse-stream-stable'
 8 | 
 9 | permissions:
10 |   id-token: write
11 |   contents: read
12 |   pull-requests: write
13 | 
14 | jobs:
15 |   build_dev_ecr:
16 |     uses: ./.github/workflows/build_ecr_image.yaml
17 |     with: 
18 |       environment: dev
19 |     secrets: inherit
20 | 
21 |   build_prod_ecr:
22 |     uses: ./.github/workflows/build_ecr_image.yaml
23 |     with: 
24 |       environment: prod
25 |     secrets: inherit
26 | 


--------------------------------------------------------------------------------
/.github/workflows/publish_executor_containers.yaml:
--------------------------------------------------------------------------------
 1 | name: Publish Executor Containers
 2 | 
 3 | on:
 4 |   workflow_call:
 5 |     inputs:
 6 |       indexify_version:
 7 |         type: string
 8 |         description: |
 9 |           Indexify version to use to build the executor containers.
10 |           Note: Should ideally match the tag used for this workflow.
11 |         required: true
12 |   workflow_dispatch:
13 |     inputs:
14 |       indexify_version:
15 |         type: string
16 |         description: |
17 |           Indexify version to use to build the executor containers.
18 |           Note: Should ideally match the tag used for this workflow.
19 |         required: true
20 | 
21 | jobs:
22 |   build-and-push-docker-images:
23 |     name: Build and Push example Executor Docker images
24 |     runs-on: ubuntu-latest-xlarge
25 |     steps:
26 |       - uses: actions/checkout@v4
27 |         with:
28 |           ref: main
29 |       - uses: actions/setup-python@v4
30 |         with:
31 |           python-version: "3.11"
32 |       - name: Set up Docker Buildx
33 |         uses: docker/setup-buildx-action@v3
34 |       - name: Login to Docker Hub
35 |         uses: docker/login-action@v3
36 |         with:
37 |           username: ${{ secrets.DOCKERHUB_USERNAME }}
38 |           password: ${{ secrets.DOCKERHUB_TOKEN }}
39 |       - run: |
40 |           curl -LsSf https://astral.sh/uv/install.sh | sh  # install uv
41 |           uv pip install --system 'indexify==${{ inputs.indexify_version }}' tensorlake -U
42 |           indexify-cli build-image examples/pdf_document_extraction/images.py
43 |           indexify-cli build-image examples/pdf_structured_extraction/workflow.py
44 |           indexify-cli build-image examples/pdf_structured_extraction/document_ai_api_version_workflow.py
45 | 
46 |           docker image list --format json | jq '.Repository+":"+.Tag' | grep 'tensorlake/' | xargs -I {} docker image push {}
47 | 


--------------------------------------------------------------------------------
/.github/workflows/publish_indexify_pypi.yaml:
--------------------------------------------------------------------------------
 1 | name: Release Indexify python package
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |     inputs:
 6 |       release_message:
 7 |         type: string
 8 |         description: Release message
 9 |         required: true
10 |       prerelease:
11 |         type: boolean
12 |         description: Is this a pre-release version?
13 |         required: false
14 |         default: false
15 | 
16 | permissions:
17 |   contents: write
18 |   actions: write
19 |   packages: write
20 | 
21 | 
22 | defaults:
23 |   run:
24 |     working-directory: ./indexify
25 | 
26 | jobs:
27 |   extract-version:
28 |     name: Extract Version Number
29 |     runs-on: ubuntu-latest
30 |     outputs:
31 |       version: ${{ steps.version_extraction.outputs.version }}
32 |     steps:
33 |       - uses: actions/checkout@v4
34 |         with:
35 |           submodules: true
36 |       - uses: actions/setup-python@v4
37 |         with:
38 |           python-version: '3.10'
39 |       - name: Install toml
40 |         run: pip install toml
41 |       - id: version_extraction
42 |         run: echo "version=$(python -c 'import toml; print(toml.load("pyproject.toml")["tool"]["poetry"]["version"])')" >> $GITHUB_OUTPUT
43 |           
44 |   publish-indexify-to-pypi:
45 |     name: Publish Indexify package
46 |     runs-on: ubuntu-latest
47 |     environment:
48 |       name: pypi
49 |       url: https://pypi.org/p/indexify
50 |     permissions:
51 |       id-token: write
52 |     steps:
53 |       - uses: actions/checkout@v4
54 |         with:
55 |           submodules: true
56 |       - uses: actions/setup-python@v4
57 |         with:
58 |           python-version: '3.10'
59 |       - name: Install Poetry
60 |         run: pipx install --force 'poetry==2.0.0'
61 |       - name: Build python-sdk
62 |         run: make build
63 |       - name: Publish Indexify to PyPI
64 |         uses: pypa/gh-action-pypi-publish@release/v1
65 |         with:
66 |           packages-dir: indexify/dist/
67 | 
68 |   publish-containers:
69 |     name: Publish Containers
70 |     needs:
71 |       - extract-version
72 |       - publish-indexify-to-pypi
73 |     uses: ./.github/workflows/publish_executor_containers.yaml
74 |     with:
75 |       indexify_version: ${{ needs.extract-version.outputs.version }}
76 |     secrets: inherit
77 | 


--------------------------------------------------------------------------------
/.github/workflows/ui.yaml:
--------------------------------------------------------------------------------
 1 | name: UI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - 'main'
 7 |   pull_request:
 8 |     branches:
 9 |       - 'main'
10 |     paths:  # only run on PRs that touch the UI sources or this workflow
11 |       - 'server/ui/**'
12 |       - '.github/workflows/ui.yaml'  # this workflow file itself
13 | 
14 | jobs:
15 |   lint:
16 |     name: Lint Checks
17 |     runs-on: ubuntu-latest
18 |     steps:
19 |       - uses: actions/checkout@v4
20 | 
21 |       - name: Setup Node.js
22 |         uses: actions/setup-node@v3
23 |         with:
24 |           node-version: '20'
25 | 
26 |       - name: Npm Install
27 |         run: cd server/ui && npm ci
28 | 
29 |       - name: UI Lint Check
30 |         run: cd server/ui && npm run lint
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # IDE and editor files
 2 | .vscode
 3 | .idea
 4 | 
 5 | # Python-related files
 6 | __pycache__
 7 | *.pyc
 8 | *.egg-info
 9 | .venv
10 | .python-version
11 | venv
12 | ve
13 | 
14 | # JavaScript and TypeScript-related files
15 | node_modules
16 | sdk-typescript/node_modules
17 | sdk-typescript/lib
18 | ui/node_modules
19 | 
20 | # Build and distribution directories
21 | dist
22 | build
23 | site
24 | sdk-py/dist
25 | charts/*.tgz
26 | 
27 | # Local development and cache directories
28 | indexify_storage
29 | indexify_local_runner_cache
30 | server/indexify_storage
31 | local_cache
32 | .dev-tls
33 | src/state/store/snapshots/*
34 | .ipynb_checkpoints/
35 | 
36 | # Data and media files
37 | *.db
38 | *.pdf
39 | *.mp4
40 | *.mp3
41 | data/
42 | upload.pdf
43 | *.wav
44 | *.tgz
45 | 
46 | # MacOS-specific files
47 | .DS_Store
48 | 
49 | # Specific project files and directories
50 | target
51 | indexify-server
52 | indexify-extractor/**/*
53 | rag/**/*
54 | docs/docs/example_code/
55 | local_server_config_*.yaml
56 | sqlite*
57 | *.trace
58 | 
59 | # logs
60 | *.log
61 | 
62 | # Miscellaneous
63 | /executor-py/~
64 | openapi.yaml
65 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tensorlake"]
2 | 	path = tensorlake
3 | 	url = git@github.com:tensorlakeai/tensorlake.git
4 | 


--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | operations/k8s/helm/templates
2 | 


--------------------------------------------------------------------------------
/.prettierrc.yaml:
--------------------------------------------------------------------------------
1 | # @format
2 | 
3 | semi: false
4 | singleQuote: true
5 | proseWrap: always
6 | 


--------------------------------------------------------------------------------
/.repo/conf/distributions:
--------------------------------------------------------------------------------
 1 | Codename: buster
 2 | Suite: stable
 3 | Components: main
 4 | Architectures: amd64 arm64
 5 | 
 6 | Codename: jammy
 7 | Suite: stable
 8 | Components: main restricted universe multiverse
 9 | Architectures: amd64 arm64
10 | 
11 | Codename: focal
12 | Suite: stable
13 | Components: main restricted universe multiverse
14 | Architectures: amd64 arm64
15 | 
16 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "python.analysis.extraPaths": [
3 |         "./indexify/src",
4 |         "./tensorlake/src"
5 |     ]
6 | }
7 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # Indexify Docs
 2 | 
 3 | ### Development
 4 | 
 5 | Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. To install, use the following command
 6 | 
 7 | ```
 8 | npm i -g mintlify
 9 | ```
10 | 
11 | Run the following command at the root of your documentation (where mint.json is)
12 | 
13 | ```
14 | mintlify dev
15 | ```
16 | 
17 | ### Publishing Changes
18 | 
19 | Install our GitHub App to automatically propagate changes from your repo to your deployment. Changes will be deployed to production automatically after pushing to the default branch. Find the link to install on your dashboard.
20 | 
21 | #### Troubleshooting
22 | 
23 | - Mintlify dev isn't running - Run `mintlify install`; it will re-install the dependencies.
24 | - Page loads as a 404 - Make sure you are running in a folder with `mint.json`
25 | 
26 | 


--------------------------------------------------------------------------------
/docs/api-reference/documents/extract/extract-file-async.mdx:
--------------------------------------------------------------------------------
1 | ---
2 | openapi: post /documents/v1/extract_async
3 | ---


--------------------------------------------------------------------------------
/docs/api-reference/documents/extract/extract-file-sync.mdx:
--------------------------------------------------------------------------------
1 | ---
2 | openapi: post /documents/v1/extract
3 | ---


--------------------------------------------------------------------------------
/docs/api-reference/documents/process/parse-file-async.mdx:
--------------------------------------------------------------------------------
1 | ---
2 | openapi: post /documents/v1/parse_async
3 | ---


--------------------------------------------------------------------------------
/docs/api-reference/documents/process/parse-file-sync.mdx:
--------------------------------------------------------------------------------
1 | ---
2 | openapi: post /documents/v1/parse
3 | ---


--------------------------------------------------------------------------------
/docs/api-reference/documents/retrieve/retrieve-result.mdx:
--------------------------------------------------------------------------------
1 | ---
2 | openapi: get /documents/v1/parse/retrieve/{job_id}/
3 | ---


--------------------------------------------------------------------------------
/docs/api-reference/root.mdx:
--------------------------------------------------------------------------------
1 | ---
2 | openapi: get /
3 | ---


--------------------------------------------------------------------------------
/docs/api-reference/upload/upload-file.mdx:
--------------------------------------------------------------------------------
1 | ---
2 | openapi: post /documents/v1/upload
3 | ---


--------------------------------------------------------------------------------
/docs/cli-ui/ui.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: UI
 3 | ---
 4 | 
 5 | Indexify offers a built-in user interface (UI) that simplifies debugging and content visualization. This UI works seamlessly with Indexify's APIs and consists of three main sections:
 6 | 
 7 | 1. Compute Graphs
 8 | 2. Namespaces
 9 | 3. Executors
10 | 
11 | To access the UI after running Indexify locally, visit: http://localhost:8900/ui
12 | 
13 | ## Compute Graphs
14 | 
15 | The home page displays all Compute Graphs within your namespaces (default name is "default").
16 | 
17 | ![Compute Graphs](/images/ui_compute_graphs.png)
18 | 
19 | Click on any Compute Graph to view its detailed information, including:
20 | - Creation time
21 | - Edges
22 | - Nodes
23 | - Description
24 | - Other relevant details
25 | 
26 | ![Compute Graph Definition](/images/ui_compute_graph_definition.png)
27 | 
28 | ### Invocations
29 | 
30 | Below the Compute Graph definitions, you'll find all associated Invocation IDs. This section shows:
31 | - Invocation IDs
32 | - Payload sizes
33 | - Option to delete an Invocation (via the Delete Icon)
34 | 
35 | ![Invocations](/images/ui_invocations.png)
36 | 
37 | Clicking an Invocation ID reveals two key features:
38 | 
39 | #### Invocation Outputs
40 | 
41 | This section displays:
42 | - Outputs for the selected Invocation ID
43 | - Associated Compute Functions
44 | - Log access via "View stdout" and "View stderr" buttons
45 | - Search functionality for outputs
46 | - Collapsible Compute Function tables for better visibility
47 | 
48 | ![Invocation Outputs](/images/ui_invocation_outputs.png)
49 | 
50 | #### Invocation Tasks
51 | 
52 | Below the Outputs, you'll see the Tasks that ran for your Invocation, including:
53 | - Task ID
54 | - Compute Function
55 | - Input Key
56 | - Task Outcome
57 | - Search functionality for tasks by ID
58 | 
59 | ![Invocation Tasks](/images/ui_invocation_tasks.png)
60 | 
61 | ## Namespaces
62 | 
63 | This section provides an overview of all namespaces created in your Indexify instance.
64 | 
65 | ![Namespaces](/images/ui_namespaces.png)
66 | 
67 | ## Executors
68 | 
69 | The Executors section shows all executors connected to Indexify, displaying:
70 | - Executor name
71 | - Executor ID
72 | - Address
73 | - Associated labels
74 | 
75 | ![Executors](/images/ui_executors.png)
76 | 


--------------------------------------------------------------------------------
/docs/comparisons.mdx:
--------------------------------------------------------------------------------
 1 | Indexify offers a versatile platform for developing Generative AI applications, featuring a flexible workflow compute engine that accommodates various extractors and data transformation processes. This means there are a number of existing tools that overlap with the capabilities of Indexify. It should be noted that Indexify is not mutually exclusive with other tools in the AI Infrastructure landscape.
 2 | 
 3 | ## Indexify vs LlamaIndex
 4 | 
 5 | Indexify is the distributed data framework and compute engine. Your extraction and data processing workflows will run asynchronously and reliably in Indexify. LlamaIndex is an LLM application framework for querying data from vector stores and for response synthesis with LLMs. It doesn't include a fault tolerant and reliable distributed orchestration engine in the open source library. LlamaIndex also doesn't include a deletion framework or a robust incremental compute engine for when data sources are updated or deleted.
 6 | 
 7 | LlamaIndex and Indexify are complementary: you can use LlamaIndex's query engine and other components, such as data loaders, to ingest content for transformation and extraction using Indexify.
 8 | 
 9 | ## Indexify vs Spark
10 | 
11 | Spark works well with tabular data and with compute functions written in Java. Indexify is faster than Spark as it doesn't rely on an external scheduler like Kubernetes or Mesos for task scheduling. Spark, being only a compute engine, doesn't remember where the extracted features are written, so you will also have to build a control plane to track data if deleting or updating it is necessary for your use case. Indexify tracks data lineage and updates extracted content when the source changes.
12 | 


--------------------------------------------------------------------------------
/docs/development.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Development'
 3 | description: 'Learn how to preview changes locally'
 4 | ---
 5 | 
 6 | <Info>
 7 |   **Prerequisite** You should have installed Node.js (version 18.10.0 or
 8 |   higher).
 9 | </Info>
10 | 
11 | Step 1. Install Mintlify on your OS:
12 | 
13 | <CodeGroup>
14 | 
15 | ```bash npm
16 | npm i -g mintlify
17 | ```
18 | 
19 | ```bash yarn
20 | yarn global add mintlify
21 | ```
22 | 
23 | </CodeGroup>
24 | 
25 | Step 2. Go to where the docs are located (where you can find `mint.json`) and run the following command:
26 | 
27 | ```bash
28 | mintlify dev
29 | ```
30 | 
31 | The documentation website is now available at `http://localhost:3000`.
32 | 
33 | ### Custom Ports
34 | 
35 | Mintlify uses port 3000 by default. You can use the `--port` flag to customize the port Mintlify runs on. For example, use this command to run on port 3333:
36 | 
37 | ```bash
38 | mintlify dev --port 3333
39 | ```
40 | 
41 | You will see an error like this if you try to run Mintlify on a port that's already taken:
42 | 
43 | ```md
44 | Error: listen EADDRINUSE: address already in use :::3000
45 | ```
46 | 
47 | ## Mintlify Versions
48 | 
49 | Each CLI is linked to a specific version of Mintlify. Please update the CLI if your local website looks different than production.
50 | 
51 | <CodeGroup>
52 | 
53 | ```bash npm
54 | npm i -g mintlify@latest
55 | ```
56 | 
57 | ```bash yarn
58 | yarn global upgrade mintlify
59 | ```
60 | 
61 | </CodeGroup>
62 | 
63 | ## Deployment
64 | 
65 | <Tip>
66 |   Unlimited editors available under the [Startup
67 |   Plan](https://mintlify.com/pricing)
68 | </Tip>
69 | 
70 | You should see the following if the deploy successfully went through:
71 | 
72 | <Frame>
73 |   <img src="/images/checks-passed.png" style={{ borderRadius: '0.5rem' }} />
74 | </Frame>
75 | 
76 | ## Troubleshooting
77 | 
78 | Here's how to solve some common problems when working with the CLI.
79 | 
80 | <AccordionGroup>
81 |   <Accordion title="Mintlify is not loading">
82 |     Update to Node v18. Run `mintlify install` and try again.
83 |   </Accordion>
84 |   <Accordion title="No such file or directory on Windows">
85 | Go to the `C:/Users/Username/.mintlify/` directory and remove the `mint`
86 | folder. Then Open the Git Bash in this location and run `git clone
87 | https://github.com/mintlify/mint.git`.
88 | 
89 | Repeat step 3.
90 | 
91 |   </Accordion>
92 |   <Accordion title="Getting an unknown error">
93 |     Try navigating to the root of your device and delete the ~/.mintlify folder.
94 |     Then run `mintlify dev` again.
95 |   </Accordion>
96 | </AccordionGroup>
97 | 
98 | Curious about what changed in a CLI version? [Check out the CLI changelog.](/changelog/command-line)
99 | 


--------------------------------------------------------------------------------
/docs/download/index.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Download
 3 | ---
 4 | 
 5 | ## Server
 6 | 
 7 | #### Binary
 8 | 
 9 | <Tabs>
10 |   <Tab title="cURL">
11 |     ```bash
12 |   curl https://www.tensorlake.ai | sh
13 |   ```
14 |   </Tab>
15 |   <Tab title="Docker">
16 |     ```bash
17 |   docker pull tensorlake/indexify-server
18 |   ```
19 |   </Tab>
20 |   <Tab title="Github Releases">
21 |     🔗 https://github.com/tensorlakeai/indexify/releases
22 |   </Tab>
23 | </Tabs>
24 | 
25 | ### Executor CLI and SDK
26 | 
27 | The SDK and executor CLI can be installed using pip.
28 | 
29 | ```bash For Python
30 | pip install indexify
31 | ```
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/docs/examples/index.mdx:
--------------------------------------------------------------------------------
 1 | # All Examples
 2 | 
 3 | Here are some examples of use cases you can accomplish with Indexify.
 4 | 
 5 | ## PDF Extraction
 6 | 
 7 | - [Image Extraction](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf/image)
 8 | - [Table Extraction](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf/table_extraction)
 9 | - [Markdown Extraction](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf/pdf_to_markdown)
10 | - [Chunk Extraction](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf/chunking)
11 | - [Multi-Modal RAG](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf/indexing_and_rag)
12 | 
13 | ## Audio Extraction
14 | - [Transcription](https://github.com/tensorlakeai/indexify/tree/main/examples/audio/transcription)
15 | - [Summarization](https://github.com/tensorlakeai/indexify/tree/main/examples/audio/summarization)
16 | - [Topic Extraction](https://github.com/tensorlakeai/indexify/tree/main/examples/audio/topic_extraction)
17 | - [Indexing and RAG](https://github.com/tensorlakeai/indexify/tree/main/examples/audio/chunking_and_indexing)
18 | 
19 | ## Image & Video Extraction
20 | - [Object Detection using YOLO](https://github.com/tensorlakeai/indexify/tree/main/examples/image/detect)
21 | - [Video Transcript with Diarization](https://github.com/tensorlakeai/indexify/tree/main/examples/video/transcript)
22 | - [Image Captioning, Object Detection & Segmentation with Florence 2](https://github.com/tensorlakeai/indexify/tree/main/examples/image/florence)
23 | 
24 | ## Invoice Extraction
25 | - [Structured Extraction using GPT4](https://github.com/tensorlakeai/indexify/tree/main/examples/invoices/structured_extraction)
26 | 
27 | 
28 | ## LLM Integrations
29 | #### Mistral
30 | - [Summarization](https://github.com/tensorlakeai/indexify/tree/main/examples/llm_integrations/mistral/pdf-summarization)
31 | - [Entity Extraction](https://github.com/tensorlakeai/indexify/tree/main/examples/llm_integrations/mistral/pdf-entity-extraction)
32 | 
33 | #### OpenAI
34 | - [PDF Translation](https://github.com/tensorlakeai/indexify/tree/main/examples/llm_integrations/openai_pdf_translation)
35 | 
36 | ## Framework Integrations
37 | #### LangChain
38 | - [PDF Question Answering (Uses LangChain)](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf/langchain)
39 | - [Adaptive RAG using LangGraph](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf/langgraph)
40 | 


--------------------------------------------------------------------------------
/docs/extractors.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Indexify CLI'
 3 | ---
 4 | 
 5 | Indexify CLI is a command line tool to start Executors that run functions, and to build docker images with dependencies for running them.
 6 | 
 7 | ## Start Executor 
 8 | 
 9 | ```bash
10 | indexify-cli executor --server-addr <server-ip>:<server-port>
11 | ```
12 | 
13 | This will start an executor that connects to the server at the given address. The server address can be found from the Indexify server's configuration.
14 | 
15 | **Default Server Address:** `localhost:8900`
16 | 
17 | ## Install the SDK 
18 | The tools to build, test and package extractors are available through the Python package `indexify-extractor-sdk`. A TypeScript SDK is also under development.
19 | 
20 | ```bash
21 | pip install indexify-extractor-sdk
22 | ```
23 | 
24 | ## Running Extractors
25 | 
26 | ### Running extractor locally
27 | 
28 | ```bash
29 | indexify-extractor run-local <extractor_file:ClassNameOfExtractor> --text "hello world"
30 | ```
31 | 
32 | This will invoke the extractor in `extractor_file.py` and create a `Content` with *hello world* as the data.
33 | 
34 | If you want to pass in the contents of a file into the payload use `--file </path/to/file>`
35 | 
36 | ### Running the extractor to continuously extract ingested content
37 | You can run the extractor as a long-running process that continuously receives a stream of content from the Indexify control plane and runs extraction on it. You can run as many extractor instances as you want to achieve scalability.
38 | 
39 | ```bash
40 | indexify-extractor join-server --coordinator-addr localhost:8950 --ingestion-addr localhost:8900
41 | ```
42 | 
43 | The addresses here can be found from Indexify server's configuration.
44 | 
45 | ### From Packaged Containers
46 | 
47 | Extractors can be deployed in production with ease using docker along with Indexify server in a cluster. You can test an extractor packaged with docker by running docker locally.
48 | 
49 | ```bash
50 | docker run tensorlake/minilm-l6 run-local --text hello
51 | ```
52 | 
53 | Passing a local file from your laptop to docker requires bind-mounting the file into the container 
54 | 
55 | ```bash
56 |  docker run -v /path/to/local/file:/tmp/image.jpg tensorlake/yolo-extractor run-local yolo_extractor:YoloExtractor --file /tmp/image.jpg
57 | ```
58 | 
59 | The name of the extractor module is retrieved from an environment variable of the container `EXTRACTOR_PATH`


--------------------------------------------------------------------------------
/docs/images/Content_AI_Content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Content_AI_Content.png


--------------------------------------------------------------------------------
/docs/images/Extractor_Transformation_Concept.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Extractor_Transformation_Concept.png


--------------------------------------------------------------------------------
/docs/images/GS_Content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/GS_Content.png


--------------------------------------------------------------------------------
/docs/images/GS_ExtractionGraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/GS_ExtractionGraph.png


--------------------------------------------------------------------------------
/docs/images/GS_Vector_Indexes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/GS_Vector_Indexes.png


--------------------------------------------------------------------------------
/docs/images/Homepage_Diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Homepage_Diagram.png


--------------------------------------------------------------------------------
/docs/images/Indexify_Architecture_Distributed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Indexify_Architecture_Distributed.png


--------------------------------------------------------------------------------
/docs/images/Indexify_Architecture_Extractors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Indexify_Architecture_Extractors.png


--------------------------------------------------------------------------------
/docs/images/Indexify_Architecture_Ingestion_Server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Indexify_Architecture_Ingestion_Server.png


--------------------------------------------------------------------------------
/docs/images/Indexify_Architecture_Local.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Indexify_Architecture_Local.png


--------------------------------------------------------------------------------
/docs/images/Indexify_Architecture_Server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Indexify_Architecture_Server.png


--------------------------------------------------------------------------------
/docs/images/Indexify_Home_Diagram.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Indexify_Home_Diagram.gif


--------------------------------------------------------------------------------
/docs/images/Indexify_KAT.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Indexify_KAT.gif


--------------------------------------------------------------------------------
/docs/images/Indexify_Logo_Wordmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Indexify_Logo_Wordmark.png


--------------------------------------------------------------------------------
/docs/images/Indexify_Logo_Wordmark_Dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Indexify_Logo_Wordmark_Dark.png


--------------------------------------------------------------------------------
/docs/images/PDF_Extraction_Demo-VEED.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/PDF_Extraction_Demo-VEED.gif


--------------------------------------------------------------------------------
/docs/images/PDF_Usecase.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/PDF_Usecase.png


--------------------------------------------------------------------------------
/docs/images/System_Architecture_Diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/System_Architecture_Diagram.png


--------------------------------------------------------------------------------
/docs/images/Tensorlake_Logo_LG.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/Tensorlake_Logo_LG.png


--------------------------------------------------------------------------------
/docs/images/content_extractor_concept.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/content_extractor_concept.jpg


--------------------------------------------------------------------------------
/docs/images/content_extractor_concept.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/content_extractor_concept.png


--------------------------------------------------------------------------------
/docs/images/docs_intro_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/docs_intro_diagram.png


--------------------------------------------------------------------------------
/docs/images/extraction_graph_getting_started.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/extraction_graph_getting_started.png


--------------------------------------------------------------------------------
/docs/images/extraction_graph_key_concept.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/extraction_graph_key_concept.png


--------------------------------------------------------------------------------
/docs/images/extractors_list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/extractors_list.png


--------------------------------------------------------------------------------
/docs/images/grafana/total_requests.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/grafana/total_requests.png


--------------------------------------------------------------------------------
/docs/images/indexify_high_level_abstract.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/indexify_high_level_abstract.png


--------------------------------------------------------------------------------
/docs/images/indexify_high_level_idea.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/indexify_high_level_idea.png


--------------------------------------------------------------------------------
/docs/images/jaeger/detailed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/jaeger/detailed.png


--------------------------------------------------------------------------------
/docs/images/jaeger/flamegraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/jaeger/flamegraph.png


--------------------------------------------------------------------------------
/docs/images/jaeger/stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/jaeger/stats.png


--------------------------------------------------------------------------------
/docs/images/jaeger/traces.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/jaeger/traces.png


--------------------------------------------------------------------------------
/docs/images/key_concepts_block_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/key_concepts_block_diagram.png


--------------------------------------------------------------------------------
/docs/images/key_concepts_embeddings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/key_concepts_embeddings.png


--------------------------------------------------------------------------------
/docs/images/key_concepts_extraction_policy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/key_concepts_extraction_policy.png


--------------------------------------------------------------------------------
/docs/images/key_concepts_feature_extraction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/key_concepts_feature_extraction.png


--------------------------------------------------------------------------------
/docs/images/key_concepts_transform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/key_concepts_transform.png


--------------------------------------------------------------------------------
/docs/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/logo.png


--------------------------------------------------------------------------------
/docs/images/logo/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/logo/favicon.png


--------------------------------------------------------------------------------
/docs/images/ui_compute_graph_definition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/ui_compute_graph_definition.png


--------------------------------------------------------------------------------
/docs/images/ui_compute_graphs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/ui_compute_graphs.png


--------------------------------------------------------------------------------
/docs/images/ui_executors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/ui_executors.png


--------------------------------------------------------------------------------
/docs/images/ui_invocation_outputs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/ui_invocation_outputs.png


--------------------------------------------------------------------------------
/docs/images/ui_invocation_tasks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/ui_invocation_tasks.png


--------------------------------------------------------------------------------
/docs/images/ui_invocations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/ui_invocations.png


--------------------------------------------------------------------------------
/docs/images/ui_namespaces.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/docs/images/ui_namespaces.png


--------------------------------------------------------------------------------
/docs/integrations/dspy.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'DSPy'
 3 | ---
 4 | Indexify complements DSPy by providing a robust platform for indexing large volumes of multi-modal content such as PDFs, raw text, audio and video. It provides a retriever API to retrieve context for LLMs.
 5 | 
 6 | We provide an Indexify Retrieval Module that works with DSPy.
 7 | 
 8 | It will act as a wrapper that internally uses your indexify client for indexing and querying but externally leverages the modularity and design of DSPy.  
 9 | 
10 | 
11 | ### Install the Indexify DSPy retriever package - 
12 | ```bash
13 | pip install indexify-dspy indexify
14 | ```
15 | 
16 | ### Import the necessary libraries
17 | 
18 | ```python
19 | import dspy
20 | from indexify import IndexifyClient
21 | from indexify_dspy import IndexifyRM
22 | 
23 | ```
24 | 
25 | ### Instantiate the Retrieval Model
26 | You can create a Retrieval Model to retrieve from an index maintained by Indexify. Use the DSPy settings to configure the retriever model.
27 | 
28 | ```python
29 | turbo = dspy.OpenAI(model="gpt-3.5-turbo")
30 | indexify_client = IndexifyClient()
31 | indexify_retriever_model = IndexifyRM("index_name", indexify_client, k=3)
32 | 
33 | dspy.settings.configure(lm=turbo, rm=indexify_retriever_model)
34 | ```
35 | 
36 | Using the Retrieval Model is very simple
37 | ```python
38 | retrieve = dspy.Retrieve(k=3)
39 | question = "Who are the NBA Finals MVPs"
40 | topK_passages = retrieve(question).passages
41 | ```
42 | 
43 | ### Create an indexify client and populate it with some documents
44 | 
45 | ```python
46 | indexify_client = IndexifyClient()
47 | 
48 | extraction_graph_spec = """
49 | name: 'myextractiongraph'
50 | extraction_policies:
51 |   - extractor: 'tensorlake/minilm-l6'
52 |     name: 'minilml6'
53 | """
54 | extraction_graph = ExtractionGraph.from_yaml(extraction_graph_spec)
55 | indexify_client.create_extraction_graph(extraction_graph)
56 | 
57 | indexify_client.add_documents(
58 |     "myextractiongraph",
59 |     [
60 |         "Indexify is amazing!",
61 |         "Indexify is a retrieval service for LLM agents!",
62 |         "Steph Curry is the best basketball player in the world.",
63 |     ],
64 | )
65 | ```
66 | 
67 | Initialize the IndexifyRM class
68 | 
69 | ### Using the RM class 
70 | ```python
71 | retrieve = IndexifyRM(indexify_client)
72 | topk_passages = retrieve("Sports", "myextractiongraph.minilml6.embedding", k=2).passages
73 | print(topk_passages)
74 | ```
75 | 
76 | ### Setting up DSPy Module with Indexify
77 | 
78 | You can use IndexifyRM like any other DSPy module or build your own wrapper for retrieval using the Indexify client following this example.  
79 | 
80 | ```python
81 | class RAG(dspy.Module):
82 |     def __init__(self, num_passages=2):
83 |         super().__init__()
84 | 
85 |         self.retrieve = dspy.Retrieve(k=num_passages)
86 |         ...
87 | 
88 |     def forward(self, question):
89 |         context = self.retrieve(question).passages
90 |         ...
91 | ```


--------------------------------------------------------------------------------
/docs/integrations/langchain.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Langchain'
 3 | ---
 4 | 
 5 | Indexify complements LangChain by providing a robust ingestion engine for indexing large volumes of multi-modal content such as PDFs, raw text, audio and video. 
 6 | It provides a Langchain retriever to retrieve context for LLMs.
 7 | 
 8 | ## Install
 9 | <Tabs>
10 |   <Tab title="Python">
11 |     ```bash
12 |     pip install indexify-langchain
13 |     ```
14 |   </Tab>
15 | </Tabs>
16 | 
17 | ## Initiate the retriever
18 | <Tabs>
19 |   <Tab title="Python">
20 |     ```python
21 |     params = {"name": "minilml6.embedding", "top_k": 9}
22 |     retriever = IndexifyRetriever(client=client, params=params)
23 |     ```
24 |   </Tab>
25 | </Tabs>
26 | 
27 | ## Complete Examples
28 | 
29 | We developed a few examples to demonstrate the integration of LangChain with Indexify.
30 | 
31 | - [LangChain Integration with Indexify for PDF QA](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf/langchain)
32 | - [Adaptive RAG with LangGraph](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf/langgraph)
33 | 
34 | 


--------------------------------------------------------------------------------
/docs/operations/deployment.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Deployment Options 
 3 | ---
 4 | Indexify can be deployed in the following ways -
 5 | * Bare Metal and VMs
 6 | * Docker Compose
 7 | * Kubernetes (or any other container orchestrator)
 8 | 
 9 | ## Bare Metal
10 | 
11 | Indexify doesn't depend on Kubernetes or Docker; you can run the server and executors on any VM or
12 | bare metal machines.
13 | 
14 | #### Start Server
15 | Start the server on one machine. Read the configuration reference to understand how to customize the server to use blob stores for storing function outputs.
16 | ```bash
17 | indexify-server
18 | ```
19 | <Note>
20 | We have a replicated mode for the server, based on Raft consensus protocol. It's not public yet because
21 | we are still figuring out how to make it easy to configure, operate and use by developers.
22 | If you are interested in using it, please reach out to us.
23 | </Note>
24 | 
25 | #### Start Executor
26 | Start as many executors as you want on different machines.
27 | ```bash
28 | indexify-cli executor --server-addr <server-ip>:<server-port>
29 | ```
30 | 
31 | 
32 | ## Docker Compose
33 | You can spin up the server and executor using docker compose, and deploy and run in a production-like environment. Copy the [docker-compose.yaml file from here](https://raw.githubusercontent.com/tensorlakeai/indexify/refs/heads/main/docker-compose.yaml).
34 | 
35 | ```bash
36 | docker compose up
37 | ```
38 | 
39 | This starts the server and two replicas of the executor in separate containers.
40 | Change the `replicas` field for the executor in docker compose to add more executors (i.e., parallelism) to the workflow.
41 | 
42 | This uses a default executor container based on Debian and a vanilla Python installation.
43 | We generally provide docker compose files for local testing of every example project in the repository.
44 | 
45 | ## Kubernetes
46 | 
47 | We provide some basic Helm charts to deploy Indexify on Kubernetes.
48 | If you'd like to try with your own cluster, check out the
49 | [instructions][operations/k8s].
50 | 
51 | [operations/k8s]:
52 |   https://github.com/tensorlakeai/indexify/tree/main/operations/k8s
53 | 


--------------------------------------------------------------------------------
/docs/operations/gpu-deployment.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'GPUs'
 3 | description: 'Running Extractor on GPU'
 4 | ---
 5 | In this guide, we will show you how to run your extractor on an AWS EC2 instance running Ubuntu. We will skip the steps on how to spin up an EC2 instance as there are already many guides available online on how to do that.
 6 | 
 7 | ## Step 1: Install NVIDIA Drivers
 8 | 
 9 | First of all, try running the command below to see if you have NVIDIA drivers installed:
10 | 
11 | ```bash
12 | nvidia-smi
13 | ```
14 | 
15 | If you are able to see the NVIDIA driver version, then you are good to go and you can skip this step. 
16 | If not, you need to install the NVIDIA drivers by following this guide: 
17 | [Installing NVIDIA Drivers on Ubuntu](https://ubuntu.com/server/docs/nvidia-drivers-installation).
18 | 
19 | You might want to choose the driver dedicated for a server instead of the desktop version. 
20 | After installing the drivers, you might need to reboot your machine.
21 | 
22 | ## Step 2: Install NVIDIA Container Toolkit
23 | 
24 | Next, you need to install the NVIDIA Container Toolkit. This is required to run Docker containers on your GPU. 
25 | You can install it by following this guide: [Installing NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
26 | 
27 | Make sure to follow the guide to configure the toolkit with Docker.
28 | 
29 | ## Step 3: Run Executor with GPU
30 | 
31 | ```sh
32 | docker run \
33 |     --gpus all \
34 |     tensorlake/default-executor-image \
35 |     indexify-cli\
36 |     executor \
37 |     --server-addr $ADDRESS:8900
38 | ```
39 | 


--------------------------------------------------------------------------------
/docs/operations/metrics.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Telemetry'
 3 | ---
 4 | 
 5 | Indexify servers and coordinators export the metrics in Prometheus format on the following urls:
 6 | 
 7 | `coordinator:8960/metrics` - cluster metrics for content upload and extraction 
 8 | 
 9 | `server:8900/metrics` - http api metrics on this node
10 | 
11 | `server:8900/metrics/ingest` - metrics for content upload and extraction on this node
12 | 
13 | ### The following metrics are specific to Indexify cluster operation:
14 | 
15 | - indexify_coordinator_executors_online
16 | - indexify_coordinator_tasks_in_progress 
17 | - indexify_coordinator_content_uploads_total
18 | - indexify_coordinator_content_bytes_uploaded_total
19 | - indexify_coordinator_content_extracted_total
20 | - indexify_coordinator_content_bytes_extracted_total
21 | - indexify_coordinator_tasks_completed_total
22 | - indexify_coordinator_tasks_errored_total
23 | 
24 | This is an example of prometheus configuration to collect metrics from server and coordinator hosts:
25 | 
26 | ```yaml
27 | scrape_configs:
28 |   - job_name: 'indexify-server-ingest'
29 |     metrics_path: /metrics/ingest
30 |     static_configs:
31 |       - targets: ['server:8900']
32 | 
33 |   - job_name: 'indexify-server'
34 |     metrics_path: /metrics
35 |     static_configs:
36 |       - targets: ['server:8900']
37 | 
38 |   - job_name: 'indexify-coordinator'
39 |     metrics_path: /metrics
40 |     static_configs:
41 |       - targets: ['coordinator:8960']
42 | ```
43 | 


--------------------------------------------------------------------------------
/docs/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "dependencies": {
3 |     "mintlify": "^4.0.459"
4 |   }
5 | }
6 | 


--------------------------------------------------------------------------------
/docs/packaging-dependencies.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Packaging Dependencies
 3 | ---
 4 | 
 5 | Python and System dependencies of functions can be packaged into images. 
 6 | 
 7 | ## Custom Docker Images
 8 | 
 9 | Specify the commands to install dependencies in a custom Docker image. You can choose any base image, and install any system or python dependencies.
10 | 
11 | An image can be used to run multiple functions. You can specify the image name in the function decorator.
12 | 
13 | ### Step 1: Define the Image
14 | ```python
15 | from indexify import Image
16 | 
17 | image = (
18 |     Image()
19 |     .name("my-pdf-parser-image")
20 |     .base_image("ubuntu:22.04")
21 |     .run("apt update")
22 |     .run("apt install -y libgl1-mesa-glx git g++")
23 |     .run("pip install torch")
24 |     .run("pip install numpy")
25 |     .run("pip install langchain")
26 |     .run("pip install git+https://github.com/facebookresearch/detectron2.git@v0.6")
27 |     .run("apt install -y tesseract-ocr")
28 |     .run("apt install -y libtesseract-dev")
29 | )
30 | ```
31 | 
32 | This defines an `Image` object and specifies the name of the image. We then run commands to install the dependencies.
33 | You can use any base image, the default being `python:3.11.10-slim-bookworm`.
34 | <Note>
35 | The Indexify `executor` process is automatically installed in the image. You don't need to install it manually. The executor is responsible
36 | for running the functions in the image.
37 | </Note>
38 | 
39 | ### Step 2: Use the Image in a Function
40 | ```python
41 | from indexify import indexify_function
42 | 
43 | @indexify_function(image=image)
44 | def parse_pdf(pdf_path: str) -> str:
45 |     ...
46 | ```
47 | 
48 | In the function decorator, we pass the `image` object. This tells Indexify to run the function in the specified image.
49 | 
50 | ### Step 3: Build the Image
51 | You can build the Docker image using the `indexify build-image` command.
52 | 
53 | Assuming the function is in a file named `pdf_parser.py`, you can run:
54 | 
55 | ```bash
56 | indexify build-image pdf_parser.py my-pdf-parser-image
57 | ```
58 | 
59 | This will build the Docker image, named `my-pdf-parser-image`. You can push the image to your container registry or Docker Hub.
60 | 
61 | ### Step 4: Deploying Functions
62 | 
63 | When you create a graph that references the `parse_pdf` function, Indexify will automatically route the function to the specified image.
64 | 
65 | It does so based on the association you made in the function decorator. `@indexify_function(image=image)`
66 | 
67 | <Note>
68 | You would need to use Kubernetes, ECS, or any other container orchestration engine to deploy your images.
69 | If the image is not running, Indexify will simply queue up the function internally, and execute it when an executor container 
70 | with the image is available.
71 | </Note>
72 | 


--------------------------------------------------------------------------------
/docs/sdks/index.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Python SDK'
 3 | ---
 4 | 
 5 | We provide a Python SDK at the moment. A TypeScript SDK is in the works.
 6 | 
 7 | ## Install 
 8 | 
 9 | <Tabs>
10 |   <Tab title="Python">
11 |     ```bash
12 |     pip install indexify
13 |     ```
14 | 
15 |     PyPi - [https://pypi.org/project/indexify/](https://pypi.org/project/indexify)
16 |   </Tab>
17 | </Tabs>
18 | 


--------------------------------------------------------------------------------
/docs/sdks/namespaces.mdx:
--------------------------------------------------------------------------------
 1 | Namespaces are used to isolate content uploaded by applications.
 2 | 
 3 | <Note>
 4 |   A default namespace, named `default` is created when Indexify is started.
 5 | </Note>
 6 |     
 7 | 
 8 | ## Create a Namespace
 9 | A namespace can be created by specifying a unique name, and any additional labels or extraction policies.
10 | 
11 | ```python Python
12 | from indexify import IndexifyClient
13 | 
14 | IndexifyClient.create_namespace(
15 |     name="research",
16 | )
17 | ```    


--------------------------------------------------------------------------------
/docs/sdks/retrieval.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Retrieval APIs'
 3 | ---
 4 | 
 5 | Retrieval APIs allow querying the indexes, derived from the content added. Currently there are two types of indexes supported:
 6 | 
 7 | - Vector Indexes for Semantic Search 
 8 | - Content Attribute Indexes
 9 | 
10 | ## Vector Indexes
11 | 
12 | Vector Indexes are created by running embedding models on content. They allow doing semantic search on the indexes. The search results contain the chunks of text which matched the query and their corresponding scores.
13 | 
14 | The following example searches the index `minilml6.embedding` for the query `good` and returns the top 3 results.
15 | 
16 | <CodeGroup>
17 |       ```python Python
18 |       client.search_index("minilml6.embedding", "good", 3)
19 |       ```
20 | </CodeGroup>
21 |       
22 | 
23 | #### Output
24 | 
25 | ```json JSON
26 | {
27 | "results":[{
28 |       "text":"Indexify is amazing!",
29 |       "metadata":{
30 |             "key":"k1"
31 |             }
32 |       }
33 | ]}
34 | ```
35 | 
36 | ## Metadata Indexes
37 | Metadata Indexes are created by extractors powered by AI models which produce structured data. The outputs of such extractors are JSON documents, which are stored in a document store. 
38 | 
39 | The schema of such indexes is defined by the extractors. The retrieval API for metadata indexes allows querying all the metadata in the index or the ones of a specific content id. 
40 | 
41 | In the future we will add support for searching these indexes as well using sparse vectors, or add them to knowledge graphs.
42 | 
43 | The following example queries the index `entities` and returns the metadata for the content id `foo`.
44 | 
45 | <CodeGroup>
46 |       ```python Python
47 |       client.query_metadata(index_name="entities", content_id="foo")
48 |       ```
49 | </CodeGroup>


--------------------------------------------------------------------------------
/docs/usecases/audio_extraction.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Audio Processing'
 3 | description: 'Real Time Speech Recognition Pipelines'
 4 | ---
 5 | 
 6 | You can build real-time pipelines with Indexify that incorporate speech, and build applications that retrieve information from audio.
 7 | 
 8 | ## Speech to Text
 9 | 
10 | Generally, audio processing pipelines start off by converting audio to text.
11 | 
12 | * **Automatic Speech Recognition (ASR)** - Converting speech to text. If all you want is a plain transcription, you can use a model like **Whisper** in a function that
13 | extracts transcriptions from audio. The entire text and chunks with timestamps are represented as metadata of your function's output.
14 | ```python
15 | @indexify_function()
16 | def transcribe_audio(audio: bytes) -> str:
17 |    # Invoke an ASR model to transcribe the audio
18 |    ...
19 | ```
20 | * **Speaker Diarization** - Applications such as meeting transcription often require identifying who-said-what. Speaker Diarization is the process of 
21 | segmenting and clustering the audio into speaker segments. This can be done using a model like **pyannote**.
22 | ```python
23 | class SpeakerSegment(BaseModel):
24 |     speaker: str
25 |     transcript: str
26 |     start: float
27 |     end: float
28 | 
29 | @indexify_function()
30 | def diarize_audio(audio: bytes) -> List[SpeakerSegment]:
31 |    # Invoke a Speaker Diarization model to segment the audio
32 |    ...
33 | ```
34 | 
35 | <Note>
36 | You can use commercial ASR and Speaker Diarization services with Indexify as well. They often perform better for accented speech.
37 | </Note>
38 | 
39 | ## Speech to Speech and Text to Speech
40 | 
41 | * **Voice Interfaces** - You can build voice interfaces that take in speech and generate speech. You would be using a TTS model in a function
42 | that accepts text and generates speech. The function would return a byte array of the audio.
43 | 
44 | ## Dynamic Routing 
45 | 
46 | Speech processing is complex, and often a single model doesn't perform well on all accents and languages. You can insert a dynamic Router
47 | in your pipeline, which routes the audio to different ASR models by classifying accent, language or other features in the audio.
48 | 
49 | 
50 | ## Examples 
51 | [Meeting Transcription and Summarization](https://github.com/tensorlakeai/indexify/tree/main/examples/video_summarization)
52 | * Speaker Diarization
53 | * Classification of meeting intent 
54 | * Dynamic Routing between summarization models
55 | 


--------------------------------------------------------------------------------
/docs/usecases/image_retrieval.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Image Understanding'
 3 | ---
 4 | 
 5 | Image Retrieval based on natural language is typically done in the following manner -
 6 | 
 7 | #### Embedding Based Retrieval
 8 | 1. Embed images with CLIP
 9 | 2. Embed the query using the same model and do KNN search to retrieve semantically similar images.
10 | 
11 | #### Visual LLM Based Retrieval
12 | 1. Describe an image using a Visual LLM such as LLaVA or GPT-4V.
13 | 2. Index the description and retrieve images by searching the descriptions.
14 | 
15 | #### SQL Based Retrieval
16 | 
17 | Semantic search on descriptions or CLIP based algorithms retrieve semantically similar images so they can be less accurate. 
18 | Structured Extraction by object detection enables querying by object name classes using SQL.
19 | 1. Run object detection models like YoloV9 or Grounding Dino on the images to extract objects.
20 | 2. Write the object names, bounding boxes and other image metadata to a structured table.
21 | 3. Query the structured table using SQL to retrieve images. 
22 | 
23 | ### Examples
24 | Visual Search Engine for E-commerce


--------------------------------------------------------------------------------
/docs/usecases/pdf_extraction.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'PDF Extraction'
 3 | ---
 4 | PDF is a widely used file format for sharing documents. Often, Enterprise LLM applications need to derive information that is locked
 5 | inside PDF documents.
 6 | You can build workflows with Indexify that use any PDF Extraction Model to extract tables, images and text from PDFs. 
 7 | 
 8 | You can use many different PDF models or APIs within a single workflow. Dynamic routing can be used to route the PDF to different 
 9 | models based on the document layout.
10 | 
11 | ## Examples
12 | [Multi-Modal RAG from PDFs using Inkwell](https://github.com/tensorlakeai/indexify/tree/main/examples/pdf_document_extraction)
13 | * Table, Text and Image Extraction 
14 | * Chunking Text
15 | * Embedding of Image, Text and Tables using Sentence Transformers
16 | * Using LanceDB for indexing and retrieval


--------------------------------------------------------------------------------
/docs/usecases/rag.mdx:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Multi-Modal RAG'
 3 | ---
 4 | 
 5 | Retrieval-Augmented Generation (RAG) lets general purpose LLMs access your private data sources. Many LLM use-cases use RAG
 6 | under the hood to ground the LLMs on accurate and up-to-date data. 
 7 | 
 8 | RAG is usually comprised of the following stages -
 9 | 1. **Indexing** - The process of loading your data from various sources and converting it into forms that can be queried easily by LLM applications.
10 | 2. **Querying and Generation** - The process of querying the indexed data and generating responses. The querying part is usually done by a retriever that retrieves the most relevant data from the indexed data.
11 | 
12 | <Note>
13 |     Indexing and Querying in the real world are usually done in parallel and continuously. This means that beyond the core RAG algorithms,
14 |     you need to build a system that can continuously and reliably index and query data.
15 | </Note>
16 | 
17 | ### RAG using Indexify
18 | 
19 | You can perform data loading, and implement indexing and other data transformation algorithms, as workflows. Indexify makes it easy to build and operationalize pipelines that can process
20 | and index data continuously.
21 | 
22 | You can migrate from one RAG algorithm to another, re-index already processed data if you migrate embedding models by running data migrations. Create namespaces for different 
23 | security sensitive data, and manage access control.
24 | 
25 | You have full control over how you want to index and query data. Which one you choose depends on your use-case, data and LLMs you use for generation.
26 | 
27 | ### Example RAG Algorithms
28 | We show some examples of RAG algorithms that can be implemented using Indexify.
29 | 
30 | * Adaptive RAG
31 | * [Contextual RAG](https://github.com/tensorlakeai/indexify/tree/main/examples/contextual_rag)
32 | 
33 | 


--------------------------------------------------------------------------------
/examples/basic_embedding/workflow.py:
--------------------------------------------------------------------------------
 1 | from tensorlake import RemoteGraph, Graph, Image
 2 | from tensorlake.functions_sdk.functions import (
 3 |     TensorlakeCompute,
 4 | )
 5 | from pydantic import BaseModel
 6 | from typing import List
 7 | from sentence_transformers import SentenceTransformer
 8 | 
 9 | 
10 | tf_image = (  # Image definition assigned to EmbeddingFunction.image further down in this file
11 |     Image()
12 |     .base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-runtime")  # PyTorch 2.4.1 / CUDA 11.8 runtime base
13 |     .name("tensorlake/common-torch-deps-indexify-executor")
14 |     .run("pip install transformers")  # each .run() adds one install command to the image definition
15 |     .run("pip install sentence_transformers")
16 |     .run("pip install langchain")
17 | )
18 | 
19 | class Embedding(BaseModel):  # Output schema returned by EmbeddingFunction.run
20 |     embedding: List[List[float]]  # list of embedding vectors, each a list of floats
21 | 
22 | class Sentences(BaseModel):  # Input schema accepted by EmbeddingFunction.run
23 |     sentences: List[str]  # raw text sentences
24 | 
25 | class EmbeddingFunction(TensorlakeCompute):
26 |     name = "sentence_embedder"
27 |     image = tf_image 
28 | 
29 |     def __init__(self):
30 |         super().__init__()
31 |         self.model = SentenceTransformer("all-MiniLM-L6-v2")
32 | 
33 |     def run(self, sentences: Sentences) -> Embedding:
34 |         #embeddings = self.model.encode(sentences.sentences)
35 |         #embeddings = [embedding.tolist() for embedding in embeddings]
36 |         return Embedding(embedding=[[1.0, 2.0], [3.0, 4.0]])
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     import sys
41 |     g = Graph(name="basic_embedding", start_node=EmbeddingFunction, additional_modules=[sys.modules[__name__]])
42 |     g = RemoteGraph.deploy(g)
43 |     sentences = Sentences(sentences=["hello world", "how are you"])
44 |     invocation_id = g.run(block_until_done=True, img=sentences)
45 |     output = g.output(invocation_id, "sentence_embedder")
46 |     print(output)
47 | 


--------------------------------------------------------------------------------
/examples/container_images/transformers.py:
--------------------------------------------------------------------------------
 1 | from tensorlake import Image
 2 | 
 3 | 
 4 | tf_image = (  # Image definition bundling torch, transformers, sentence_transformers and langchain
 5 |     Image()
 6 |     .base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-runtime")  # PyTorch 2.4.1 / CUDA 11.8 runtime base
 7 |     .name("tensorlake/common-torch-deps-indexify-executor")
 8 |     .run("pip install transformers")  # each .run() adds one install command to the image definition
 9 |     .run("pip install sentence_transformers")
10 |     .run("pip install langchain")
11 | )
12 | 


--------------------------------------------------------------------------------
/examples/contextual_rag/README.md:
--------------------------------------------------------------------------------
 1 | # Contextual RAG
 2 | 
 3 | This example demonstrates setting up a Contextual RAG pipeline, introduced in [this](https://www.anthropic.com/news/contextual-retrieval) Anthropic blogpost.
 4 | 
 5 | We will be building a simple contextual chunker which uses the Prompt Caching feature supported by Anthropic's Claude
 6 | (and OpenAI's) APIs, and write the data, for querying, to a local lancedb vector store.
 7 | 
 8 | This pipeline can be run locally for quick testing and the lancedb vector store can be queried from any application.
 9 | 
10 | ## Run Application Locally
11 | 
12 | 1. Simply create a new virtual environment for python dependencies.
13 | 2. Use the requirements file to install them, `pip install -r requirements.txt`.
14 | 3. Run the python workflow file, `python workflow.py`
15 | 
16 | ## Explanation of the Workflow
17 | The workflow involves the following steps,
18 | 1. `generate_chunk_contexts` - Chunks the input document, and generates contexts using Anthropic's model API.
19 | 2. `TextEmbeddingExtractor` - Computes an embedding for the chunk contexts.
20 | 3. `LanceDBWriter` - Writes the embeddings to the vector store. 
21 | 
22 | ## Customization
23 | 
24 | ### Modifying the Example
25 | 
26 | Copy the folder, modify the code as you like and simply test the new Graph.
27 | 
28 | This example calls for a local lancedb vector store to write the contextual embedding data. We use a local instance (written to disk) to
29 | demonstrate how Indexify pipelines work. For a production deployment we would replace the `LanceDBWriter` to call the production
30 | deployment of the vector store.
31 | 
32 | This example also relies on `sentence-transformers/all-MiniLM-L6-v2` to perform the embedding. We can replace this to use services like
33 | OpenAI or Amazon Bedrock, or run a custom executor that can be deployed as needed on a GPU accelerated resource.
34 | 
35 | ### Images
36 | 
37 | The example is also set up with a single `image` for all the functions. This image installs all the dependencies needed 
38 | for this example. However, for production deployments it might make sense to have an individual image for each function
39 | that has a smaller set of dependencies.


--------------------------------------------------------------------------------
/examples/contextual_rag/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/examples/contextual_rag/__init__.py


--------------------------------------------------------------------------------
/examples/knowledge_graph/docker-compose.yml:
--------------------------------------------------------------------------------
  1 | networks:
  2 |   server:
  3 | services:
  4 |   indexify:
  5 |     image: tensorlake/indexify-server
  6 |     ports:
  7 |       - 8900:8900
  8 |     networks:
  9 |       server:
 10 |         aliases:
 11 |           - indexify-server
 12 |     volumes:
 13 |       - data:/tmp/indexify-blob-storage
 14 | 
 15 |   nlp-executor:
 16 |     image: tensorlake/nlp-image:latest
 17 |     command:
 18 |       [
 19 |         "indexify-cli",
 20 |         "executor",
 21 |         "--server-addr",
 22 |         "indexify:8900"
 23 |       ]
 24 |     networks:
 25 |       server:
 26 |     volumes:
 27 |       - data:/tmp/indexify-blob-storage
 28 |     deploy:
 29 |       mode: replicated
 30 |       replicas: 1
 31 | 
 32 |   embedding-executor:
 33 |     image: tensorlake/embedding-image:latest
 34 |     command:
 35 |       [
 36 |         "indexify-cli",
 37 |         "executor",
 38 |         "--server-addr",
 39 |         "indexify:8900"
 40 |       ]
 41 |     networks:
 42 |       server:
 43 |     volumes:
 44 |       - data:/tmp/indexify-blob-storage
 45 |     deploy:
 46 |       mode: replicated
 47 |       replicas: 1
 48 | 
 49 |   neo4j-executor:
 50 |     image: tensorlake/neo4j-image:latest
 51 |     environment:
 52 |       - NEO4J_URI=bolt://neo4j-server:7687
 53 |       - NEO4J_USER=neo4j
 54 |       - NEO4J_PASSWORD=indexify
 55 |     command:
 56 |       [
 57 |         "indexify-cli",
 58 |         "executor",
 59 |         "--server-addr",
 60 |         "indexify:8900"
 61 |       ]
 62 |     networks:
 63 |       server:
 64 |     volumes:
 65 |       - data:/tmp/indexify-blob-storage
 66 |     deploy:
 67 |       mode: replicated
 68 |       replicas: 1
 69 | 
 70 |   gemini-executor:
 71 |     image: tensorlake/gemini-image:latest
 72 |     environment:
 73 |       - GOOGLE_API_KEY=${GOOGLE_API_KEY}
 74 |     command:
 75 |       [
 76 |         "indexify-cli",
 77 |         "executor",
 78 |         "--server-addr",
 79 |         "indexify:8900"
 80 |       ]
 81 |     networks:
 82 |       server:
 83 |     volumes:
 84 |       - data:/tmp/indexify-blob-storage
 85 |     deploy:
 86 |       mode: replicated
 87 |       replicas: 1
 88 |   
 89 |   base-executor:
 90 |     image: tensorlake/base-image:latest
 91 |     command:
 92 |       [
 93 |         "indexify-cli",
 94 |         "executor",
 95 |         "--server-addr",
 96 |         "indexify:8900"
 97 |       ]
 98 |     networks:
 99 |       server:
100 |     volumes:
101 |       - data:/tmp/indexify-blob-storage
102 |     deploy:
103 |       mode: replicated
104 |       replicas: 1
105 | 
106 |   neo4j-server:
107 |     image: neo4j:4.4
108 |     environment:
109 |       - NEO4J_AUTH=neo4j/indexify
110 |     ports:
111 |       - "7474:7474"
112 |       - "7687:7687"
113 |     networks:
114 |       server:
115 |     volumes:
116 |       - data:/tmp/indexify-blob-storage
117 |     deploy:
118 |       mode: replicated
119 |       replicas: 1
120 | 
121 | volumes:
122 |   data:
123 |   neo4j_data:
124 | 


--------------------------------------------------------------------------------
/examples/knowledge_graph/requirements.txt:
--------------------------------------------------------------------------------
1 | indexify
2 | neo4j
3 | spacy
4 | sentence-transformers
5 | google-generativeai


--------------------------------------------------------------------------------
/examples/object_detection/.gitignore:
--------------------------------------------------------------------------------
1 | yolo*


--------------------------------------------------------------------------------
/examples/object_detection/README.md:
--------------------------------------------------------------------------------
 1 | # Object Detection and Description Pipeline
 2 | 
 3 | This project is a pipeline for object detection and description. It uses Ultralytics YOLOv8 to detect objects in images. Visual Description is generated using a pre-trained moondream model.
 4 | 
 5 | ## How It Works
 6 | 
 7 | The pipeline has two compute classes:
 8 | 
 9 | 1. Object Detection
10 | 2. Visual Description
11 | 
12 | The output of the object detection is a list of bounding boxes and the class of the object. The original image and the result of the object detection are passed to the Visual Description model. The output of the Visual Description model has the description and the bounding boxes detected.
13 | 
14 | ## How to Run 
15 | 
16 | ### Locally on your Laptop
17 | 
18 | 1. Start the Indexify server and an executor container that has the functions' dependencies installed.
19 | 
20 | This example works only on GPU machines.
21 | 
22 | ```bash
23 | docker compose up
24 | ```
25 | 
26 | 2. Run the Workflow
27 | ```bash
28 | python workflow.py
29 | ```
30 | 
31 | Here is the output:
32 | ```
33 | [ImageDescription(description='The image captures a bustling street scene in Times Square, New York, teeming with yellow taxis and surrounded by a vibrant array of billboards and advertisements.', detections=[Detection(bbox=[588.925048828125, 468.69464111328125, 796.9473876953125, 619.639404296875], label='car', confidence=0.8861740827560425), Detection(bbox=[319.2535095214844, 480.70361328125, 454.46826171875, 559.7138671875], label='car', confidence=0.836341142654419), Detection(bbox=[746.5311889648438, 475.47247314453125, 918.6951293945312, 579.5167236328125], label='car', confidence=0.7883055806159973), Detection(bbox=[72.3926010131836, 517.8421630859375, 144.1722412109375, 592.4739990234375], label='potted plant', confidence=0.7109927535057068), Detection(bbox=[545.43994140625, 468.8044738769531, 593.8861083984375, 500.2347106933594], label='car', confidence=0.708862841129303), Detection(bbox=[907.5588989257812, 469.9573059082031, 924.8134765625, 513.6497192382812], label='person', confidence=0.4035480320453644), Detection(bbox=[148.98741149902344, 470.06207275390625, 197.73593139648438, 532.3275756835938], label='potted plant', confidence=0.30130401253700256), Detection(bbox=[519.849853515625, 471.2547912597656, 548.244140625, 497.1222229003906], label='car', confidence=0.29631689190864563), Detection(bbox=[778.8235473632812, 462.0633850097656, 826.0460205078125, 485.4345703125], label='car', confidence=0.28815868496894836), Detection(bbox=[967.5180053710938, 462.99041748046875, 979.6304321289062, 526.1993408203125], label='person', confidence=0.26838958263397217)])]
34 | ```
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/examples/object_detection/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | version: "3"
 2 | networks:
 3 |   server:
 4 | services:
 5 |   indexify:
 6 |     image: tensorlake/indexify-server
 7 |     ports:
 8 |       - 8900:8900
 9 |     networks:
10 |       server:
11 |         aliases:
12 |           - indexify-server
13 |     volumes:
14 |       - data:/tmp/indexify-blob-storage
15 |   ultralytics-image:
16 |     # Use this for GPU support
17 |     # image: tensorlake/pdf-blueprint-pdf-parser-gpu:latest
18 |     image: tensorlake/blueprints-ultralytics:latest
19 |     command:
20 |       [
21 |         "indexify-cli",
22 |         "executor",
23 |         "--server-addr",
24 |         "indexify:8900"
25 |       ]
26 |     networks:
27 |       server:
28 |     volumes:
29 |       - data:/tmp/indexify-blob-storage
30 |     deploy:
31 |       mode: replicated
32 |       replicas: 1
33 |       # GPU support (enabled): reserve an NVIDIA GPU for this executor
34 |       resources:
35 |         reservations:
36 |           devices:
37 |             - driver: nvidia
38 |               capabilities: [gpu]
39 | 
40 | volumes:
41 |   data:
42 | 


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/.gitignore:
--------------------------------------------------------------------------------
1 | *.lance
2 | indexify_local*
3 | 


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/examples/pdf_document_extraction/__init__.py


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/chroma_db_retrieve.py:
--------------------------------------------------------------------------------
 1 | from chromadb  import HttpClient, QueryResult
 2 | client = HttpClient(host="localhost", port=8000)
 3 | from sentence_transformers import SentenceTransformer
 4 | 
 5 | # Query VectorDB for similar text
 6 | text_collection = client.get_collection("text_embeddings")
 7 | query_embeddings = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True).encode(["transformers"])
 8 | result: QueryResult = text_collection.query(query_embeddings, n_results=5, )
 9 | documents = result["documents"][0]
10 | distances = result["distances"][0]
11 | for i, document in enumerate(documents):
12 |     print(f"document {document}, score: {distances[i]}")
13 | 


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/chromadb_writer.py:
--------------------------------------------------------------------------------
 1 | from tensorlake.functions_sdk.functions import TensorlakeCompute
 2 | from typing import Union
 3 | from common_objects import ImageWithEmbedding, TextChunk
 4 | from images import chroma_image
 5 | 
 6 | class ChromaDBWriter(TensorlakeCompute):
 7 |     name = "chroma_db_writer"
 8 |     image = chroma_image
 9 | 
10 |     def __init__(self):
11 |         import chromadb
12 |         super().__init__()
13 |         self._client = chromadb.HttpClient(host="chromadb", port=8000)
14 |         self._text_collection = self._client.create_collection(name="text_embeddings", metadata={"hnsw:space": "cosine"}, get_or_create=True)
15 |         self._image_collection = self._client.create_collection(name="image_embeddings", metadata={"hnsw:space": "cosine"}, get_or_create=True)
16 | 
17 |     def run(self, input: Union[ImageWithEmbedding, TextChunk]) -> bool:
18 |         import uuid
19 |         from PIL import Image
20 |         import io
21 |         import numpy as np
22 |         if type(input) == ImageWithEmbedding:
23 |             img_arr = np.array(Image.open(io.BytesIO(input.image_bytes)))
24 |             self._image_collection.upsert(
25 |                 ids=[str(uuid.uuid4())],
26 |                 embeddings=[input.embedding],
27 |                 metadatas=[{"page_number": input.page_number}],
28 |                 images=[img_arr]
29 |             )
30 |         elif type(input) == TextChunk:
31 |             self._text_collection.upsert(
32 |                 ids=[str(uuid.uuid4())],
33 |                 embeddings=[input.embeddings],
34 |                 metadatas=[{"page_number": input.page_number}],
35 |                 documents=[input.chunk]
36 |             )
37 |         return True
38 | 


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/common_objects.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional, Any
 2 | 
 3 | from pydantic import BaseModel
 4 | 
 5 | 
 6 | class TextChunk(BaseModel):
 7 |     chunk: str
 8 |     page_number: Optional[int] = None
 9 |     embeddings: Optional[List[float]] = None
10 | 
11 | 
12 | class ImageWithEmbedding(BaseModel):
13 |     embedding: List[float]
14 |     image_bytes: bytes
15 |     page_number: int
16 | 
17 | 
18 | # Docling Example Objects
19 | class PDFParserDoclingOutput(BaseModel):
20 |     texts: List[str]
21 |     images: List[str]


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/embedding.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | from typing import Any, List
 3 | 
 4 | from tensorlake.functions_sdk.functions import TensorlakeCompute, tensorlake_function
 5 | from common_objects import ImageWithEmbedding, TextChunk, PDFParserDoclingOutput
 6 | from images import st_image
 7 | 
 8 | @tensorlake_function(image=st_image)
 9 | def chunk_text_docling(document: PDFParserDoclingOutput) -> List[TextChunk]:
10 |     """
11 |     Extract chunks from documents
12 |     """
13 |     from langchain_text_splitters import RecursiveCharacterTextSplitter
14 | 
15 |     chunks = []
16 | 
17 |     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
18 |     for i, text in enumerate(document.texts):
19 |         splits = text_splitter.split_text(text)
20 |         for split in splits:
21 |             chunks.append(TextChunk(chunk=split, page_number=i+1))
22 | 
23 |     return chunks
24 | 
25 | 
26 | class TextEmbeddingExtractor(TensorlakeCompute):
27 |     name = "text-embedding-extractor"
28 |     description = "Extractor class that captures an embedding model"
29 |     system_dependencies = []
30 |     input_mime_types = ["text"]
31 |     image = st_image
32 | 
33 |     def __init__(self):
34 |         super().__init__()
35 |         from sentence_transformers import SentenceTransformer
36 |         self.model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
37 | 
38 |     def run(self, input: TextChunk) -> TextChunk:
39 |         embeddings = self.model.encode(input.chunk)
40 |         input.embeddings = embeddings.tolist()
41 |         return input
42 | 
43 | 
44 | class ImageEmbeddingDoclingExtractor(TensorlakeCompute):
45 |     name = "image-embedding-docling"
46 |     description = "Extractor class that captures an embedding model"
47 |     image=st_image
48 | 
49 |     def __init__(self):
50 |         super().__init__()
51 |         from sentence_transformers import SentenceTransformer
52 |         self.model = SentenceTransformer("clip-ViT-B-32")
53 | 
54 |     def run(self, document: PDFParserDoclingOutput) -> List[ImageWithEmbedding]:
55 |         import io
56 |         from PIL import Image as PILImage
57 | 
58 |         embeddings = []
59 |         for i, image_str in enumerate(document.images):
60 |             img_bytes = io.BytesIO(base64.b64decode(image_str))
61 |             img_bytes.seek(0)
62 |             img_emb = self.model.encode(PILImage.open(img_bytes))
63 |             img_bytes.seek(0)
64 |             embeddings.append(
65 |                 ImageWithEmbedding(
66 |                     embedding=img_emb,
67 |                     image_bytes=img_bytes.getvalue(),
68 |                     page_number=i+1,
69 |                 )
70 |             )
71 | 
72 |         return embeddings
73 | 


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/es_retrieve.py:
--------------------------------------------------------------------------------
 1 | from elasticsearch import Elasticsearch
 2 | import random
 3 | 
 4 | #### Connect from host machine, using the port exposed by Docker
 5 | es = Elasticsearch(["http://localhost:9200"])
 6 | print(es.cluster.health())
 7 | 
 8 | #### List all indices
 9 | indices = es.indices.get(index='*')
10 | for index_name in indices:
11 |     print(index_name)
12 | 
13 | #### With details
14 | indices_info = es.cat.indices(format='json')
15 | for index in indices_info:
16 |     print(f"{index['index']} doc count: {index['docs.count']}")
17 | 
18 | #### Print fields in index.
19 | # Indexes from this example are `image_embeddings` and `text_embeddings`.
20 | 
21 | INDEX_NAME = "text_embeddings"
22 | 
23 | mapping = es.indices.get_mapping(index=INDEX_NAME)
24 | index_name = list(mapping.body.keys())[0]
25 | fields = mapping.body[index_name]['mappings']['properties'].keys()
26 | 
27 | print("Fields in index:")
28 | for field in fields:
29 |     print(field)
30 | 
31 | #### Sanity test.
32 | # A random text embedding vector to test the pipeline. In production you would have to call the same model as the workflow to compute the embedding.
33 | # Text embedding as per the workflow has a size of 768. Change the seed or the uniform generator for different results.
34 | 
35 | #random.seed(42)
36 | random.seed(4)
37 | query_vec = [random.uniform(0.8, 1.) for _ in range(768)]
38 | #query_vec = [random.uniform(0.5, .6) for _ in range(768)]
39 | 
40 | # Query the documents with knn. Change k and num_candidates (k <= num_candidates)
41 | QUERY_FIELD_TEXT_INDEX = "embedding"
42 | response = es.search(
43 |     index=INDEX_NAME,
44 |     body={
45 |         "knn": {
46 |             "field": QUERY_FIELD_TEXT_INDEX,
47 |             "query_vector": query_vec,
48 |             "k": 2,
49 |             "num_candidates": 3
50 |         }
51 |     }
52 | )
53 | 
54 | #### Print document structure
55 | print("\n KNN Documents:")
56 | for hit in response['hits']['hits']:
57 |     print(f"Document ID: {hit['_id']}, Score: {hit['_score']}")
58 |     print(hit['_source']['chunk'])


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/images.py:
--------------------------------------------------------------------------------
 1 | from tensorlake import Image
 2 | 
 3 | chroma_image = (
 4 |     Image()
 5 |     .name("tensorlake/blueprints-chromadb")
 6 |     .base_image(f"python:3.11-slim-bookworm")
 7 |     .run("pip install chromadb")
 8 |     .run("pip install pillow")
 9 | )
10 | 
11 | st_image = (
12 |     Image()
13 |     .name("tensorlake/pdf-blueprint-st")
14 |     .base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-devel")
15 |     .run("pip install sentence-transformers")  # model loader used by the extractors in embedding.py
16 |     .run("pip install langchain")  # NOTE(review): presumably supplies langchain_text_splitters for chunk_text_docling - confirm
17 |     .run("pip install pillow")  # PIL is used to open images before they are embedded
18 |     .run("pip install opentelemetry-api")
19 |     .run("pip install elasticsearch")  # NOTE(review): presumably for the ElasticSearch writer used by workflow.py - confirm
20 |     .run("pip install elastic-transport")
21 | )
22 | 
23 | lance_image = (
24 |     Image()
25 |     .name("tensorlake/pdf-blueprint-lancdb")
26 |     .base_image(f"python:3.11-slim-bookworm")
27 |     .run("pip install lancedb")
28 | )
29 | 
30 | inkwell_image_gpu = (
31 |     Image()
32 |     .name("tensorlake/pdf-blueprint-pdf-parser-gpu")
33 |     .base_image("pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel")
34 |     .run("apt update")
35 |     .run("apt install -y libgl1-mesa-glx")
36 |     .run('pip install docling')
37 |     .run('pip install torch==2.5.1 torchvision==0.2.1')
38 | )
39 | 


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/lancedb_functions.py:
--------------------------------------------------------------------------------
 1 | from typing import Union
 2 | 
 3 | from tensorlake.functions_sdk.functions import TensorlakeCompute
 4 | from common_objects import ImageWithEmbedding, TextChunk
 5 | import lancedb
 6 | from lancedb.pydantic import LanceModel, Vector
 7 | from images import lance_image
 8 | 
 9 | 
10 | class ImageEmbeddingTable(LanceModel):
11 |     vector: Vector(512)  # NOTE(review): presumably the clip-ViT-B-32 embedding width - confirm
12 |     image_bytes: bytes  # filled from ImageWithEmbedding.image_bytes by LanceDBWriter
13 |     page_number: int
14 | 
15 | class TextEmbeddingTable(LanceModel):
16 |     vector: Vector(384)
17 |     text: str
18 |     page_number: int
19 | 
20 | class LanceDBWriter(TensorlakeCompute):
21 |     name = "lancedb_writer"
22 |     image = lance_image
23 | 
24 |     def __init__(self):
25 |         super().__init__()
26 |         self._client = lancedb.connect("vectordb.lance")
27 |         self._text_table = self._client.create_table(
28 |             "text_embeddings", schema=TextEmbeddingTable, exist_ok=True
29 |         )
30 |         self._clip_table = self._client.create_table(
31 |             "image_embeddings", schema=ImageEmbeddingTable, exist_ok=True
32 |         )
33 | 
34 |     def run(self, input: Union[ImageWithEmbedding, TextChunk]) -> bool:
35 |         if type(input) == ImageWithEmbedding:
36 |             self._clip_table.add(
37 |                 [
38 |                     ImageEmbeddingTable(
39 |                         vector=input.embedding,
40 |                         image_bytes=input.image_bytes,
41 |                         page_number=input.page_number,
42 |                     )
43 |                 ]
44 |             )
45 |         elif type(input) == TextChunk:
46 |             self._text_table.add(
47 |                 [
48 |                     TextEmbeddingTable(
49 |                         vector=input.embeddings,
50 |                         text=input.chunk,
51 |                         page_number=input.page_number,
52 |                     )
53 |                 ]
54 |             )
55 |         return True
56 | 


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/pdf_parser_docling.py:
--------------------------------------------------------------------------------
 1 | from common_objects import PDFParserDoclingOutput
 2 | from tensorlake.functions_sdk.data_objects import File
 3 | from tensorlake.functions_sdk.functions import TensorlakeCompute
 4 | 
 5 | from images import inkwell_image_gpu
 6 | 
 7 | 
 8 | class PDFParserDocling(TensorlakeCompute):
 9 |     name = "pdf-parse-docling"
10 |     description = "Parser class that captures a pdf file"
11 |     # Change to gpu_image to use GPU
12 |     image = inkwell_image_gpu
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 | 
17 |     def run(self, file: File) -> PDFParserDoclingOutput:
18 |         from docling.datamodel.pipeline_options import PdfPipelineOptions
19 |         IMAGE_RESOLUTION_SCALE = 2.0
20 |         pipeline_options = PdfPipelineOptions()
21 |         pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
22 |         pipeline_options.generate_page_images = True
23 | 
24 |         from docling.document_converter import DocumentConverter, PdfFormatOption
25 |         from docling.datamodel.base_models import InputFormat
26 | 
27 |         import tempfile
28 |         with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf") as f:
29 |             f.write(file.data)
30 |             converter = DocumentConverter(
31 |                 format_options={
32 |                     InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
33 |                 }
34 |             )
35 |             result = converter.convert(f.name)
36 | 
37 |             texts = []
38 |             for i in range(len(result.pages)):
39 |                 page_result = result.document.export_to_markdown(page_no=i+1)
40 |                 texts.append(page_result)
41 | 
42 |             images = []
43 |             for element, _level in result.document.iterate_items():
44 |                 from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
45 |                 if isinstance(element, PictureItem):
46 |                     pil_image = element.get_image(result.document)
47 | 
48 |                     # Using docling APIs to avoid confusion.
49 |                     b64 = element._image_to_base64(pil_image)
50 |                     images.append(b64)
51 | 
52 |             return PDFParserDoclingOutput(texts=texts, images=images)
53 | 


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/requirements.txt:
--------------------------------------------------------------------------------
 1 | indexify
 2 | pydantic
 3 | docling==2.14.0
 4 | docling-core
 5 | sentence-transformers
 6 | chromadb
 7 | elasticsearch
 8 | langchain-text-splitters
 9 | elastic-transport
10 | 


--------------------------------------------------------------------------------
/examples/pdf_document_extraction/workflow.py:
--------------------------------------------------------------------------------
 1 | from tensorlake import RemoteGraph
 2 | from elastic_writer import ElasticSearchWriter
 3 | from embedding import chunk_text_docling, ImageEmbeddingDoclingExtractor
 4 | from tensorlake.functions_sdk.data_objects import File
 5 | from tensorlake.functions_sdk.graph import Graph
 6 | from tensorlake.functions_sdk.functions import tensorlake_function
 7 | 
 8 | 
 9 | # This graph is the alternate approach.
10 | # This graph extracts text and image embeddings from the PDF using docling
11 | # and writes them to ElasticSearch.
12 | def create_graph() -> Graph:
13 |     from embedding import TextEmbeddingExtractor
14 |     from pdf_parser_docling import PDFParserDocling
15 | 
16 |     g = Graph(
17 |         "Extract_pages_tables_images_pdf_docling",
18 |         start_node=PDFParserDocling,
19 |         version="0.1",  # update when deploying to keep track of graph versions (param is defaulted in the sdk).
20 |         additional_modules=[common_objects, images],
21 |     )
22 | 
23 |     # Send the parse output to the text chunker and the image embedder.
24 |     g.add_edge(PDFParserDocling, chunk_text_docling)
25 |     g.add_edge(PDFParserDocling, ImageEmbeddingDoclingExtractor)
26 | 
27 |     ## Compute the text embedding vectors
28 |     g.add_edge(chunk_text_docling, TextEmbeddingExtractor)
29 | 
30 |     ## Write text and image embeddings to vectordb
31 |     g.add_edge(ImageEmbeddingDoclingExtractor, ElasticSearchWriter)
32 |     g.add_edge(TextEmbeddingExtractor, ElasticSearchWriter)
33 |     return g
34 | 
35 | 
36 | if __name__ == "__main__":
37 |     graph: Graph = create_graph()
38 | 
39 |     file_url = "https://arxiv.org/pdf/1706.03762"
40 |     import httpx
41 |     resp = httpx.get(url=file_url, follow_redirects=True)
42 |     resp.raise_for_status()
43 | 
44 |     file = File(data=resp.content, mime_type="application/pdf")
45 | 
46 |     # uncomment to run locally
47 |     #invocation_id = graph.run(file=file)
48 |     #exit(0)
49 | 
50 |     import common_objects
51 |     import images
52 | 
53 |     remote_graph = RemoteGraph.deploy(
54 |         graph,
55 |         server_url="http://localhost:8900",
56 |     )
57 | 
58 |     invocation_id = remote_graph.run(file=file)
59 |     print(f"Invocation ID: {invocation_id}")
60 | 


--------------------------------------------------------------------------------
/examples/pdf_structured_extraction/.env:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=xxxxxxxxxxxxxxxxxxxxxx
2 | 


--------------------------------------------------------------------------------
/examples/pdf_structured_extraction/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | version: "3"
 2 | networks:
 3 |   server:
 4 | services:
 5 |   indexify:
 6 |     image: tensorlake/indexify-server
 7 |     ports:
 8 |       - 8900:8900
 9 |     networks:
10 |       server:
11 |         aliases:
12 |           - indexify-server
13 |     volumes:
14 |       - data:/tmp/indexify-blob-storage
15 |   executor:
16 |     image: tensorlake/blueprint-pdf-structured-extraction:latest
17 |     env_file:
18 |       - .env
19 |     command:
20 |       [
21 |         "indexify-cli",
22 |         "executor",
23 |         "--server-addr",
24 |         "indexify:8900"
25 |       ]
26 |     networks:
27 |       server:
28 |     volumes:
29 |       - data:/tmp/indexify-blob-storage
30 |     deploy:
31 |       mode: replicated
32 |       replicas: 1
33 |   postgres:
34 |     image: postgres:13
35 |     environment:
36 |       POSTGRES_USER: db_user 
37 |       POSTGRES_PASSWORD: db_password 
38 |       POSTGRES_DB: indexify_demo
39 |     networks:
40 |       server:
41 | volumes:
42 |   data:
43 | 


--------------------------------------------------------------------------------
/examples/readme/distributed_map.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | from tensorlake import tensorlake_function, tensorlake_router, Graph
 3 | from typing import List, Union
 4 | 
 5 | @tensorlake_function()
 6 | def generate_sequence(a: int) -> List[int]:
 7 |     return [i for i in range(a)]
 8 | 
 9 | @tensorlake_function()
10 | def squared(x: int) -> int:
11 |     return x * x
12 | 
13 | if __name__ == '__main__':
14 |     g = Graph(name="sequence_summer", start_node=generate_sequence, description="Simple Sequence Summer")
15 |     g.add_edge(generate_sequence, squared)
16 | 
17 |     from tensorlake import RemoteGraph
18 |     graph = RemoteGraph.deploy(g)
19 | 
20 |     num_iter = 90
21 |     invocation_id = graph.run(block_until_done=True, a=num_iter)
22 |     result = graph.output(invocation_id, "squared")
23 |     if len(result) != num_iter:
24 |         raise Exception(f"Missing outputs - {len(result)} != {num_iter}")
25 |     else:
26 |         print(f"Success with {num_iter} outputs")
27 | 


--------------------------------------------------------------------------------
/examples/readme/map_reduce_example.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | from tensorlake import tensorlake_function, Graph
 3 | from typing import List
 4 | 
 5 | class Total(BaseModel):
 6 |     val: int = 0
 7 | 
 8 | @tensorlake_function()
 9 | def generate_numbers(a: int) -> List[int]:
10 |     return [i for i in range(a)]
11 | 
12 | @tensorlake_function()
13 | def square(x: int) -> int:
14 |     return x ** 2
15 | 
16 | @tensorlake_function(accumulate=Total)
17 | def add(total: Total, new: int) -> Total:
18 |     total.val += new
19 |     return total
20 | 
21 | g = Graph(name="sequence_summer", start_node=generate_numbers, description="Simple Sequence Summer")
22 | g.add_edge(generate_numbers, square)
23 | g.add_edge(square, add)
24 | 
25 | if __name__ == "__main__":
26 |     #invocation_id = g.run(a=10)
27 |     #result = g.get_output(invocation_id, "add")
28 |     #print(result)
29 | 
30 |     from tensorlake import RemoteGraph
31 |     graph = RemoteGraph.deploy(g)
32 |     invocation_id = graph.run(block_until_done=True, a=10)
33 |     result = graph.output(invocation_id, "add")
34 |     print(result)
35 | 
36 |     graph = RemoteGraph.by_name("sequence_summer")
37 |     invocation_id = graph.run(block_until_done=True, a=5)
38 |     print(graph.output(invocation_id, "add"))
39 | 


--------------------------------------------------------------------------------
/examples/readme/website.py:
--------------------------------------------------------------------------------
 1 | from tensorlake import tensorlake_function, Graph
 2 | from pydantic import BaseModel
 3 | 
 4 | class Audio(BaseModel):
 5 |     file: bytes
 6 | 
 7 | @tensorlake_function()
 8 | def scrape_website(url: str) -> str:
 9 |     import requests
10 |     return requests.get(f"http://r.jina.ai/{url}").text
11 | 
12 | @tensorlake_function()
13 | def summarize_text(text: str) -> str:
14 |     from openai import OpenAI
15 |     completion = OpenAI().chat.completions.create(
16 |         model="gpt-4o-mini",
17 |         messages=[
18 |             {"role": "system", "content": "Generate a summary of this website. Don't add asterisks or any other markdown to the text. Keep the summary short. Write something funny and light-hearted about the topic."},
19 |             {"role": "user", "content": text},
20 |         ],
21 |     )
22 |     return completion.choices[0].message.content
23 | 
24 | @tensorlake_function()
25 | def create_audio(summary: str) -> Audio:
26 |     import elevenlabs
27 |     from elevenlabs import save
28 | 
29 |     voice = "Rachel"  # You can choose a different voice if needed
30 |     client = elevenlabs.ElevenLabs()
31 |     audio = client.generate(text=summary, voice=voice)
32 |     save(audio, "tensorlake-daily.mp3")
33 |     with open("tensorlake-daily.mp3", "rb") as f:
34 |         return Audio(file=f.read())
35 |     return None
36 | 
37 | g = Graph(name="website-summarizer", start_node=scrape_website)
38 | g.add_edge(scrape_website, summarize_text)
39 | g.add_edge(summarize_text, create_audio)
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     #g.run(url="https://en.wikipedia.org/wiki/Golden_State_Warriors")
44 |     from tensorlake import RemoteGraph
45 |     RemoteGraph.deploy(g, server_url="http://localhost:8900")
46 |     graph = RemoteGraph.by_name(name="website-summarizer", server_url="http://localhost:8900")
47 |     invocation_id = graph.run(block_until_done=True, url="https://en.wikipedia.org/wiki/Golden_State_Warriors")
48 |     summary = graph.output(invocation_id, "summarize_text")
49 |     print(summary)
50 |     audio = graph.output(invocation_id, "create_audio")
51 |     from elevenlabs import play
52 |     play(audio[0].file)
53 | 


--------------------------------------------------------------------------------
/examples/tweetsgenerator/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3'
 2 | networks:
 3 |   server:
 4 | 
 5 | services:
 6 |   indexify:
 7 |     image: tensorlake/indexify-server
 8 |     ports:
 9 |       - 8900:8900
10 |     networks:
11 |       server:
12 |         aliases:
13 |           - indexify-server
14 |     volumes:
15 |       - data:/tmp/indexify-blob-storage
16 | 
17 |   openai:
18 |     image: tensorlake/openai-image:3.10
19 |     environment:
20 |       - OPENAI_API_KEY=${OPENAI_API_KEY}
21 |     command: ["indexify-cli", "executor", "--server-addr", "indexify:8900"]
22 |     networks:
23 |       server:
24 |     volumes:
25 |       - data:/tmp/indexify-blob-storage
26 | 
27 |   base-executor:
28 |     image: tensorlake/base-image:3.10
29 |     command:
30 |       [
31 |         "indexify-cli",
32 |         "executor",
33 |         "--server-addr",
34 |         "indexify:8900"
35 |       ]
36 |     networks:
37 |       server:
38 |     volumes:
39 |       - data:/tmp/indexify-blob-storage
40 | 
41 | volumes:
42 |   data:
43 | 


--------------------------------------------------------------------------------
/examples/tweetsgenerator/requirements.txt:
--------------------------------------------------------------------------------
1 | indexify


--------------------------------------------------------------------------------
/examples/video_summarization/docker-compose.yml:
--------------------------------------------------------------------------------
# Compose stack for the video_summarization example: one Indexify server and
# five executor containers, each started from a different image.
networks:
  # Single network joined by every service below.
  server:
services:
  # Indexify server; port 8900 is published on the host.
  indexify:
    image: tensorlake/indexify-server
    ports:
      - 8900:8900
    networks:
      server:
        aliases:
          # Additional DNS name for this service on the `server` network.
          - indexify-server
    volumes:
      # The `data` volume is mounted at /app in every service of this file.
      - data:/app

  # Each service below runs the same executor command against
  # `indexify:8900`; only the image differs.
  yt-downloader:
    image: tensorlake/yt-downloader:latest
    command: ["indexify-cli", "executor", "--server-addr", "indexify:8900"]
    networks:
      server:
    volumes:
      - data:/app

  audio-processor:
    image: tensorlake/audio-processor:latest
    command: ["indexify-cli", "executor", "--server-addr", "indexify:8900"]
    networks:
      server:
    volumes:
      - data:/app

  transcriber:
    image: tensorlake/transcriber:latest
    command: ["indexify-cli", "executor", "--server-addr", "indexify:8900"]
    networks:
      server:
    volumes:
      - data:/app

  llama-cpp:
    image: tensorlake/llama-cpp:latest
    command: ["indexify-cli", "executor", "--server-addr", "indexify:8900"]
    networks:
      server:
    volumes:
      - data:/app

  router:
    image: tensorlake/base-image:latest
    command: ["indexify-cli", "executor", "--server-addr", "indexify:8900"]
    networks:
      server:
    volumes:
      - data:/app

volumes:
  # Named volume mounted by all services.
  data:
57 | 


--------------------------------------------------------------------------------
/examples/video_summarization/requirements.txt:
--------------------------------------------------------------------------------
1 | indexify
2 | pytubefix
3 | pydub
4 | faster_whisper
5 | llama_cpp_python
6 | rich


--------------------------------------------------------------------------------
/examples/website_audio_summary/docker-compose.yml:
--------------------------------------------------------------------------------
# Compose stack for the website_audio_summary example: one Indexify server and
# three executor containers (scraper, openai, elevenlabs images).
version: '3'
networks:
  # Single network joined by every service below.
  server:

services:
  # Indexify server; port 8900 is published on the host.
  indexify:
    image: tensorlake/indexify-server
    ports:
      - 8900:8900
    networks:
      server:
        aliases:
          # Additional DNS name for this service on the `server` network.
          - indexify-server
    volumes:
      # Same named volume and mount path as the executors below.
      - data:/tmp/indexify-blob-storage

  # Executor started from the scraper image.
  scraper:
    image: tensorlake/scraper-image:latest
    command: ["indexify-cli", "executor", "--server-addr", "indexify:8900"]
    networks:
      server:
    volumes:
      - data:/tmp/indexify-blob-storage

  # Executor started from the OpenAI image; OPENAI_API_KEY is substituted by
  # Compose from the environment (or .env file) it is run with.
  openai:
    image: tensorlake/openai-image:latest
    environment:
      - OPENAI_API_KEY=${OPENAI_API_KEY}
    command: ["indexify-cli", "executor", "--server-addr", "indexify:8900"]
    networks:
      server:
    volumes:
      - data:/tmp/indexify-blob-storage

  # Executor started from the ElevenLabs image; ELEVENLABS_API_KEY is
  # substituted the same way as OPENAI_API_KEY above.
  elevenlabs:
    image: tensorlake/elevenlabs-image:latest
    environment:
      - ELEVENLABS_API_KEY=${ELEVENLABS_API_KEY}
    command: ["indexify-cli", "executor", "--server-addr", "indexify:8900"]
    networks:
      server:
    volumes:
      - data:/tmp/indexify-blob-storage

volumes:
  # Named volume mounted by all services.
  data:
47 | 


--------------------------------------------------------------------------------
/examples/website_audio_summary/requirements.txt:
--------------------------------------------------------------------------------
1 | indexify
2 | openai
3 | elevenlabs


--------------------------------------------------------------------------------
/indexify/Makefile:
--------------------------------------------------------------------------------
 1 | all: build
 2 | 
 3 | build: build_proto
 4 | 	@rm -rf dist
 5 | 	@poetry install
 6 | 	@poetry build
 7 | 
 8 | SERVER_API_PY_CLIENT_PROTO_DIR_PATH=indexify/proto
 9 | SERVER_API_PROTO_DIR_PATH=../server/proto
10 | 
11 | build_proto: ${SERVER_API_PROTO_DIR_PATH}/executor_api.proto
12 | 	@poetry install
13 | 	@# .proto file and generated Python files have to be in the same directory.
14 | 	@# See known issue https://github.com/grpc/grpc/issues/29459.
15 | 	@cp ${SERVER_API_PROTO_DIR_PATH}/executor_api.proto src/${SERVER_API_PY_CLIENT_PROTO_DIR_PATH}/executor_api.proto
16 | 	@cd src && poetry run python -m grpc_tools.protoc \
17 | 		--proto_path=. \
18 | 		--python_out=. \
19 | 		--pyi_out=. \
20 | 		--grpc_python_out=. \
21 | 		${SERVER_API_PY_CLIENT_PROTO_DIR_PATH}/executor_api.proto
22 | 	@#The generated proto files don't pass linter checks and need to get reformatted.
23 | 	@poetry run black src/${SERVER_API_PY_CLIENT_PROTO_DIR_PATH}
24 | 	@poetry run isort src/${SERVER_API_PY_CLIENT_PROTO_DIR_PATH} --profile black
25 | 
26 | fmt:
27 | 	@poetry run black .
28 | 	@poetry run isort . --profile black
29 | 
30 | check:
31 | 	@poetry run black --check .
32 | 	@poetry run isort . --check-only --profile black
33 | 
34 | test:
35 | 	cd tests && ./run_tests.sh
36 | 
37 | .PHONY: all build build_proto fmt lint test
38 | 


--------------------------------------------------------------------------------
/indexify/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | 
3 | This is a package with all Open Source Indexify components and helper tools
4 | available via a CLI.
5 | 
6 | The CLI allows you to:
7 | * Set up a local or a distributed Indexify cluster.
8 | * Build container images for Indexify functions.


--------------------------------------------------------------------------------
/indexify/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "indexify"
 3 | # Incremented if any of the components provided in this packages are updated.
 4 | version = "0.4.3"
 5 | description = "Open Source Indexify components and helper tools"
 6 | authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
 7 | license = "Apache 2.0"
 8 | readme = "README.md"
 9 | homepage = "https://github.com/tensorlakeai/indexify"
10 | repository = "https://github.com/tensorlakeai/indexify"
11 | 
12 | [tool.poetry.scripts]
13 | indexify-cli = "indexify.cli:cli"
14 | 
15 | [tool.poetry.dependencies]
16 | # Common dependencies
17 | python = "^3.10"
18 | # structlog is provided by tensorlake
19 | # pyyaml is provided by tensorlake
20 | # httpx is provided by tensorlake
21 | 
22 | # Executor only
23 | aiohttp = "^3.11.0"
24 | prometheus-client = "^0.21.1"
25 | psutil = "^7.0.0"
26 | # Adds function-executor binary and utils lib.
27 | tensorlake = ">=0.1"
28 | # Uncomment the next line to use local tensorlake package (only for development!)
29 | # tensorlake = { path = "../tensorlake", develop = true }
30 | # pydantic is provided by tensorlake
31 | # grpcio is provided by tensorlake
32 | # grpcio-tools is provided by tensorlake
33 | 
34 | # CLI only
35 | # nanoid is provided by tensorlake
36 | # click is provided by tensorlake
37 | boto3 = "^1.37.30"
38 | 
39 | [tool.poetry.group.dev.dependencies]
40 | black = "^24.10.0"
41 | pylint = "^3.3.0"
42 | parameterized = "^0.9.0"
43 | 
44 | [build-system]
45 | requires = ["poetry==2.0.0"]
46 | build-backend = "poetry.core.masonry.api"
47 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/cli/__init__.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | 
 3 | from . import build_image, deploy, executor
 4 | 
 5 | 
 6 | @click.group()
 7 | @click.version_option(package_name="indexify", prog_name="indexify-cli")
 8 | @click.pass_context
 9 | def cli(ctx: click.Context):
10 |     """
11 |     Indexify CLI to manage and deploy workflows to Indexify Server and run Indexify Executors.
12 |     """
13 |     pass
14 | 
15 | 
16 | cli.add_command(build_image.build_image)
17 | cli.add_command(deploy.deploy)
18 | cli.add_command(executor.executor)
19 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/cli/build_image.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | from tensorlake.functions_sdk.image import Image
 3 | from tensorlake.functions_sdk.workflow_module import (
 4 |     WorkflowModuleInfo,
 5 |     load_workflow_module_info,
 6 | )
 7 | 
 8 | 
 9 | @click.command(
10 |     short_help="Build images for graphs/workflows defined in the workflow file"
11 | )
12 | # Path to the file where the graphs/workflows are defined as global variables
13 | @click.argument(
14 |     "workflow-file-path",
15 |     type=click.Path(exists=True, file_okay=True, dir_okay=False),
16 | )
17 | @click.option(
18 |     "-i",
19 |     "--image-names",
20 |     multiple=True,
21 |     help="Names of images to build. Can be specified multiple times. If not provided, all images will be built.",
22 | )
23 | def build_image(
24 |     workflow_file_path: str,
25 |     image_names: tuple[str, ...] = None,
26 | ):
27 |     try:
28 |         workflow_module_info: WorkflowModuleInfo = load_workflow_module_info(
29 |             workflow_file_path
30 |         )
31 |     except Exception as e:
32 |         click.secho(
33 |             f"Failed loading workflow file, please check the error message: {e}",
34 |             fg="red",
35 |         )
36 |         raise click.Abort
37 | 
38 |     for image in workflow_module_info.images.keys():
39 |         image: Image
40 |         if image_names is not None and image.image_name not in image_names:
41 |             click.echo(
42 |                 f"Skipping image `{image.image_name}` as it is not in the provided image names."
43 |             )
44 |             continue
45 | 
46 |         click.echo(f"Building image `{image.image_name}`")
47 |         built_image, generator = image.build()
48 |         for output in generator:
49 |             click.secho(output)
50 | 
51 |         click.secho(f"built image: {built_image.tags[0]}", fg="green")
52 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/cli/deploy.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | from tensorlake import Graph
 3 | from tensorlake.functions_sdk.graph_serialization import graph_code_dir_path
 4 | from tensorlake.functions_sdk.workflow_module import (
 5 |     WorkflowModuleInfo,
 6 |     load_workflow_module_info,
 7 | )
 8 | from tensorlake.remote_graph import RemoteGraph
 9 | 
10 | 
@click.command(
    short_help="Deploy all graphs/workflows defined in the workflow file to Indexify"
)
# Path to the file where the graphs/workflows are defined as global variables
@click.argument(
    "workflow-file-path",
    type=click.Path(exists=True, file_okay=True, dir_okay=False),
)
@click.option(
    "-u",
    "--upgrade-queued-requests",
    # Explicit Python parameter name. Without it click derives
    # `upgrade_queued_requests` from the flag, which doesn't match the function
    # signature and makes the command fail with a TypeError when invoked.
    "upgrade_queued_invocations",
    is_flag=True,
    default=False,
    help="Upgrade invocations that are already queued or running to use the deployed version of the graphs/workflows",
)
def deploy(
    workflow_file_path: str,
    upgrade_queued_invocations: bool,
):
    """Deploys every graph/workflow defined in the workflow file.

    Args:
        workflow_file_path: path to the Python file that defines the graphs/workflows.
        upgrade_queued_invocations: value of the -u/--upgrade-queued-requests
            flag, forwarded to RemoteGraph.deploy as
            `upgrade_tasks_to_latest_version`.

    Raises:
        click.Abort: if the workflow file can't be loaded or a graph can't be
            deployed. Deployment stops at the first failing graph.
    """
    click.echo(f"Preparing deployment for {workflow_file_path}")
    try:
        workflow_module_info: WorkflowModuleInfo = load_workflow_module_info(
            workflow_file_path
        )
    except Exception as e:
        click.secho(
            f"Failed loading workflow file, please check the error message: {e}",
            fg="red",
        )
        raise click.Abort

    for graph in workflow_module_info.graphs:
        graph: Graph
        try:
            RemoteGraph.deploy(
                graph,
                code_dir_path=graph_code_dir_path(workflow_file_path),
                upgrade_tasks_to_latest_version=upgrade_queued_invocations,
            )
        except Exception as e:
            click.secho(
                f"Graph {graph.name} could not be deployed, please check the error message: {e}",
                fg="red",
            )
            raise click.Abort

        click.secho(f"Deployed {graph.name}", fg="green")
58 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/README.md:
--------------------------------------------------------------------------------
 1 | ## Overview
 2 | 
 3 | Executor registers at Indexify Server and continuously pulls tasks assigned to it from the Indexify Server
 4 | and executes them. While registering it shares its capabilities like available hardware with the Indexify
 5 | Server and periodically updates the Server about its current state. Executor spins up Function Executors
 6 | to run customer functions. Executor should never link with Tensorlake Python-SDK. It should not know anything
 7 | about programming languages and runtime environments used by Tensorlake Functions. Function Executor is
 8 | responsible for this.
 9 | 
10 | This subpackage doesn't provide an executable entry point that runs an Executor. This is intentional
11 | as Executor has many configurable sub-components. indexify cli subpackage provides `executor`
12 | command that runs Executor with functionality available in Open Source offering.
13 | 
14 | ## Deployment
15 | 
16 | ### Production setup
17 | 
18 | A single Executor runs in a Virtual Machine, a container, or on a bare metal host. An Indexify cluster
19 | is scaled by adding more Executor hosts. Open Source users manage and scale the hosts themselves e.g.
20 | using Kubernetes, any other orchestrator or even manually. E.g. the users provision secrets,
21 | persistent volumes to each host using the orchestrator or manually. Each Executor runs a single function.
22 | The function name and other qualifiers are defined in Executor arguments.
23 | 
24 | ### Development setup
25 | 
26 | To make Indexify development and testing easier an Executor in development mode can run any function.
27 | Running multiple Executors on the same host is supported too. In this case each Executor requires a
28 | unique port range passed to it in its arguments.
29 | 
30 | ## Threat model
31 | 
32 | A VM/container/bare metal host where an Executor is running is fully trusted. This works well for single
33 | tenant deployments where customer functions' code is fully trusted. If this is not the case then Function
34 | Executors that run customer functions need to get isolated from Executor using e.g. Virtual Machines.
35 | This functionality is not included in the Open Source offering.
36 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/blob_store/blob_store.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Optional
 2 | 
 3 | from .local_fs_blob_store import LocalFSBLOBStore
 4 | from .metrics.blob_store import (
 5 |     metric_get_blob_errors,
 6 |     metric_get_blob_latency,
 7 |     metric_get_blob_requests,
 8 |     metric_put_blob_errors,
 9 |     metric_put_blob_latency,
10 |     metric_put_blob_requests,
11 | )
12 | from .s3_blob_store import S3BLOBStore
13 | 
14 | 
class BLOBStore:
    """Routes generic BLOB store operations to the backend that owns the URI."""

    def __init__(
        self, local: Optional[LocalFSBLOBStore] = None, s3: Optional[S3BLOBStore] = None
    ):
        """Wraps the supplied backends; a backend left as None is unavailable."""
        self._s3: Optional[S3BLOBStore] = s3
        self._local: Optional[LocalFSBLOBStore] = local

    async def get(self, uri: str, logger: Any) -> bytes:
        """Reads and returns the bytes of the BLOB at the supplied URI.

        "file://" URIs go to the local file system backend, all other URIs to S3.
        Raises Exception on error. Raises KeyError if the BLOB doesn't exist.
        """
        with metric_get_blob_errors.count_exceptions(), metric_get_blob_latency.time():
            metric_get_blob_requests.inc()
            if not _is_file_uri(uri):
                self._check_s3_is_available()
                return await self._s3.get(uri, logger)

            self._check_local_is_available()
            return await self._local.get(uri, logger)

    async def put(self, uri: str, value: bytes, logger: Any) -> None:
        """Writes the supplied bytes into the BLOB at the supplied URI.

        "file://" URIs go to the local file system backend, all other URIs to S3.
        Overwrites existing BLOB. Raises Exception on error.
        """
        with metric_put_blob_errors.count_exceptions(), metric_put_blob_latency.time():
            metric_put_blob_requests.inc()
            if not _is_file_uri(uri):
                self._check_s3_is_available()
                await self._s3.put(uri, value, logger)
                return

            self._check_local_is_available()
            await self._local.put(uri, value, logger)

    def _check_local_is_available(self):
        """Raises RuntimeError when no local file system backend was supplied."""
        if self._local is not None:
            return
        raise RuntimeError("Local file system BLOB store is not available")

    def _check_s3_is_available(self):
        """Raises RuntimeError when no S3 backend was supplied."""
        if self._s3 is not None:
            return
        raise RuntimeError("S3 BLOB store is not available")
66 | 
67 | 
68 | def _is_file_uri(uri: str) -> bool:
69 |     return uri.startswith("file://")
70 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/blob_store/local_fs_blob_store.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import os
 3 | import os.path
 4 | from typing import Any
 5 | 
 6 | 
 7 | class LocalFSBLOBStore:
 8 |     """BLOB store that stores BLOBs in local file system."""
 9 | 
10 |     async def get(self, uri: str, logger: Any) -> bytes:
11 |         """Returns binary value stored in file at the supplied URI.
12 | 
13 |         The URI must be a file URI (starts with "file://"). The path must be absolute.
14 |         Raises Exception on error. Raises KeyError if the file doesn't exist.
15 |         """
16 |         # Run synchronous code in a thread to not block the event loop.
17 |         return await asyncio.to_thread(self._sync_get, _path_from_file_uri(uri))
18 | 
19 |     async def put(self, uri: str, value: bytes, logger: Any) -> None:
20 |         """Stores the supplied binary value in a file at the supplied URI.
21 | 
22 |         The URI must be a file URI (starts with "file://"). The path must be absolute.
23 |         Overwrites existing file. Raises Exception on error.
24 |         """
25 |         # Run synchronous code in a thread to not block the event loop.
26 |         return await asyncio.to_thread(self._sync_put, _path_from_file_uri(uri), value)
27 | 
28 |     def _sync_get(self, path: str) -> bytes:
29 |         if not os.path.isabs(path):
30 |             raise ValueError(f"Path {path} must be absolute")
31 | 
32 |         if os.path.exists(path):
33 |             with open(path, mode="rb") as blob_file:
34 |                 return blob_file.read()
35 |         else:
36 |             raise KeyError(f"File at {path} does not exist")
37 | 
38 |     def _sync_put(self, path: str, value: bytes) -> None:
39 |         if not os.path.isabs(path):
40 |             raise ValueError(f"Path {path} must be absolute")
41 | 
42 |         os.makedirs(os.path.dirname(path), exist_ok=True)
43 |         with open(path, mode="wb") as blob_file:
44 |             blob_file.write(value)
45 | 
46 | 
47 | def _path_from_file_uri(uri: str) -> str:
48 |     return uri[7:]  # strip "file://" prefix
49 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/blob_store/metrics/blob_store.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
 3 | from ...monitoring.metrics import latency_metric_for_fast_operation
 4 | 
 5 | metric_get_blob_requests: prometheus_client.Counter = prometheus_client.Counter(
 6 |     "get_blob_requests",
 7 |     "Number of get blob requests",
 8 | )
 9 | metric_get_blob_errors: prometheus_client.Counter = prometheus_client.Counter(
10 |     "get_blob_request_errors",
11 |     "Number of get blob request errors",
12 | )
13 | metric_get_blob_latency: prometheus_client.Histogram = (
14 |     latency_metric_for_fast_operation(
15 |         "get_blob_request",
16 |         "get blob request",
17 |     )
18 | )
19 | 
20 | metric_put_blob_requests: prometheus_client.Counter = prometheus_client.Counter(
21 |     "put_blob_requests",
22 |     "Number of put blob requests",
23 | )
24 | metric_put_blob_errors: prometheus_client.Counter = prometheus_client.Counter(
25 |     "put_blob_request_errors",
26 |     "Number of put blob request errors",
27 | )
28 | metric_put_blob_latency: prometheus_client.Histogram = (
29 |     latency_metric_for_fast_operation(
30 |         "put_blob_request",
31 |         "put blob request",
32 |     )
33 | )
34 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_allowlist.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Dict, List, Optional
 3 | 
 4 | 
 5 | @dataclass
 6 | class FunctionURI:
 7 |     namespace: str
 8 |     compute_graph: str
 9 |     compute_fn: str
10 |     version: Optional[str] = None
11 | 
12 | 
13 | def function_allowlist_to_indexed_dict(
14 |     function_allowlist: List[FunctionURI],
15 | ) -> Dict[str, str]:
16 |     """Returns a dictionary with each function URI in the allowlist as a key-value pair.
17 | 
18 |     The keys are prefixed indexes in function allowlist, and the values are the function URIs
19 |     """
20 |     indexed_dict = {}
21 |     counter = 0
22 |     for function_uri in function_allowlist:
23 |         function_uri: FunctionURI
24 |         indexed_dict[f"function_allowlist_{counter}"] = ":".join(
25 |             [
26 |                 function_uri.namespace,
27 |                 function_uri.compute_graph,
28 |                 function_uri.compute_fn,
29 |                 str(function_uri.version),
30 |             ]
31 |         )
32 |         counter += 1
33 |     return indexed_dict
34 | 
35 | 
36 | def parse_function_uris(function_uri_strs: List[str]) -> List[FunctionURI]:
37 |     """Parses a list of function URIs from strings to FunctionURI objects."""
38 |     uris: List[FunctionURI] = []
39 |     for uri_str in function_uri_strs:
40 |         tokens = uri_str.split(":")
41 |         if len(tokens) < 3 or len(tokens) > 4:
42 |             raise ValueError(
43 |                 "Function should be specified as <namespace>:<workflow>:<function>:<version> or"
44 |                 "<namespace>:<workflow>:<function>"
45 |             )
46 |         version: Optional[str] = None
47 |         if len(tokens) == 4:
48 |             version = tokens[3]
49 | 
50 |         uris.append(
51 |             FunctionURI(
52 |                 namespace=tokens[0],
53 |                 compute_graph=tokens[1],
54 |                 compute_fn=tokens[2],
55 |                 version=version,
56 |             )
57 |         )
58 | 
59 |     return uris
60 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor/metrics/health_checker.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
 3 | from ...monitoring.metrics import latency_metric_for_fast_operation
 4 | 
 5 | # This file contains all metrics used by HealthChecker.
 6 | 
 7 | metric_failed_health_checks = prometheus_client.Counter(
 8 |     "function_executor_failed_health_checks",
 9 |     "Number of health checks that were not successful",
10 | )
11 | metric_health_check_latency = latency_metric_for_fast_operation(
12 |     "function_executor_health_check",
13 |     "Function Executor health check",
14 | )
15 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor/metrics/invocation_state_client.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
 3 | from ...monitoring.metrics import latency_metric_for_fast_operation
 4 | 
 5 | # This file contains all metrics used by InvocationStateClient.
 6 | 
 7 | # General metrics.
 8 | metric_request_read_errors: prometheus_client.Counter = prometheus_client.Counter(
 9 |     "function_executor_invocation_state_client_request_read_errors",
10 |     "Number of failed request reads in Function Executor Invocation State client resulting in its early termination",
11 | )
12 | 
13 | # Get invocation state key-value Server API metrics.
14 | metric_server_get_state_requests: prometheus_client.Counter = prometheus_client.Counter(
15 |     "server_get_invocation_state_requests",
16 |     "Number of get invocation state requests sent to the Server on behalf of Function Executor",
17 | )
18 | metric_server_get_state_request_errors: prometheus_client.Counter = (
19 |     prometheus_client.Counter(
20 |         "server_get_invocation_state_request_errors",
21 |         "Server get invocation state request errors",
22 |     )
23 | )
24 | metric_server_get_state_request_latency: prometheus_client.Histogram = (
25 |     latency_metric_for_fast_operation(
26 |         "server_get_invocation_state_request", "Server get invocation state request"
27 |     )
28 | )
29 | 
30 | # Set invocation state key-value Server API metrics.
31 | metric_server_set_state_requests: prometheus_client.Counter = prometheus_client.Counter(
32 |     "server_set_invocation_state_requests",
33 |     "Number of set invocation state requests sent to the Server on behalf of Function Executor",
34 | )
35 | metric_server_set_state_request_errors: prometheus_client.Counter = (
36 |     prometheus_client.Counter(
37 |         "server_set_invocation_state_request_errors",
38 |         "Server set invocation state request errors",
39 |     )
40 | )
41 | metric_server_set_state_request_latency: prometheus_client.Histogram = (
42 |     latency_metric_for_fast_operation(
43 |         "server_set_invocation_state_request", "Server set invocation state request"
44 |     )
45 | )
46 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor/server/client_configuration.py:
--------------------------------------------------------------------------------
 1 | # We send function inputs and outputs over gRPC.
 2 | # -1 means unlimited. We don't want to limit the size of data customers are using.
 3 | # The effective max message size in this case is about 1.9 GB, see the max payload test.
 4 | # This is due to internal hard gRPC limits. When we want to increase the message sizes
 5 | # we'll have to implement chunking for large messages.
 6 | _MAX_GRPC_MESSAGE_LENGTH = -1
 7 | 
 8 | # Optimize the channels for low latency connection establishement as we are running on the same host.
 9 | _RECONNECT_BACKOFF_MS = 100
10 | 
11 | GRPC_CHANNEL_OPTIONS = [
12 |     ("grpc.max_receive_message_length", _MAX_GRPC_MESSAGE_LENGTH),
13 |     ("grpc.max_send_message_length", _MAX_GRPC_MESSAGE_LENGTH),
14 |     ("grpc.min_reconnect_backoff_ms", _RECONNECT_BACKOFF_MS),
15 |     ("grpc.max_reconnect_backoff_ms", _RECONNECT_BACKOFF_MS),
16 |     ("grpc.initial_reconnect_backoff_ms", _RECONNECT_BACKOFF_MS),
17 | ]
18 | 
19 | # If a health check takes more than this duration then the server is considered unhealthy.
20 | HEALTH_CHECK_TIMEOUT_SEC = 5
21 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor/server/function_executor_server.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | import grpc
 4 | 
 5 | # Timeout for Function Executor Server startup in seconds. The timeout is counted from
 6 | # the moment when a server just started.
 7 | FUNCTION_EXECUTOR_SERVER_READY_TIMEOUT_SEC = 5
 8 | 
 9 | 
10 | class FunctionExecutorServer:
11 |     """Abstract interface for a Function Executor Server.
12 | 
13 |     FunctionExecutorServer is a class that executes tasks for a particular function.
14 |     The communication with FunctionExecutorServer is typicall done via gRPC.
15 |     """
16 | 
17 |     async def create_channel(self, logger: Any) -> grpc.aio.Channel:
18 |         """Creates a new async gRPC channel to the Function Executor Server.
19 | 
20 |         The channel is in ready state. It can only be used in the same thread where the
21 |         function was called. Caller should close the channel when it's no longer needed.
22 | 
23 |         Raises Exception if an error occurred."""
24 |         raise NotImplementedError
25 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor/server/function_executor_server_factory.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Any, List, Optional
 3 | 
 4 | from .function_executor_server import FunctionExecutorServer
 5 | 
 6 | 
@dataclass
class FunctionExecutorServerConfiguration:
    """Configuration for creating a FunctionExecutorServer.

    This configuration only includes data that must be known
    during creation of the FunctionExecutorServer. If some data
    is not required during the creation then it shouldn't be here.

    A particular factory implementation might ignore certain
    configuration parameters or raise an exception if it can't implement
    them."""

    # Field order is part of the generated __init__ signature; don't reorder.
    executor_id: str
    function_executor_id: str
    # Presumably identify the function the server is created for — TODO confirm.
    namespace: str
    graph_name: str
    function_name: str
    graph_version: str
    # Optional; may be None.
    image_uri: Optional[str]
    secret_names: List[str]
    # Resource settings; units as implied by the field names — verify against
    # the factory implementations.
    cpu_ms_per_sec: int
    memory_bytes: int
    disk_bytes: int
    gpu_count: int
31 | 
32 | 
class FunctionExecutorServerFactory:
    """Abstract factory that creates and destroys FunctionExecutorServers."""

    async def create(
        self, config: FunctionExecutorServerConfiguration, logger: Any
    ) -> FunctionExecutorServer:
        """Creates and returns a new FunctionExecutorServer.

        Args:
            config: configuration of the FunctionExecutorServer.
            logger: logger to be used during the function call.

        Raises an exception if the creation failed or the configuration is not supported."""
        raise NotImplementedError

    async def destroy(self, server: FunctionExecutorServer, logger: Any) -> None:
        """Destroys the FunctionExecutorServer and releases all its resources.

        Neither the FunctionExecutorServer nor the customer code it's running
        is notified about the destruction.

        Args:
            server: the FunctionExecutorServer to destroy.
            logger: logger to be used during the function call.

        Never raises any Exceptions."""
        raise NotImplementedError()
55 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from typing import Any
 3 | 
 4 | import grpc
 5 | 
 6 | from .client_configuration import GRPC_CHANNEL_OPTIONS
 7 | from .function_executor_server import FunctionExecutorServer
 8 | 
 9 | 
class SubprocessFunctionExecutorServer(FunctionExecutorServer):
    """FunctionExecutorServer implementation whose server runs in a child process."""

    def __init__(
        self,
        process: asyncio.subprocess.Process,
        port: int,
        address: str,
    ):
        """Remembers the child process and the endpoint of its server.

        Args:
            process: the child process running the server.
            port: port associated with the server.
            address: address passed to gRPC when a channel is created.
        """
        self._address = address
        self._port = port
        self._proc = process

    async def create_channel(self, logger: Any) -> grpc.aio.Channel:
        """Returns a new insecure gRPC aio channel to the stored address.

        The `logger` argument is not used by this implementation.
        """
        channel: grpc.aio.Channel = grpc.aio.insecure_channel(
            self._address, options=GRPC_CHANNEL_OPTIONS
        )
        return channel
25 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/__init__.py:
--------------------------------------------------------------------------------
 1 | from .function_executor_controller import FunctionExecutorController
 2 | from .loggers import function_executor_logger, task_logger
 3 | from .message_validators import validate_function_executor_description, validate_task
 4 | from .task_output import TaskOutput
 5 | 
 6 | __all__ = [
 7 |     "function_executor_logger",
 8 |     "task_logger",
 9 |     "validate_function_executor_description",
10 |     "validate_task",
11 |     "FunctionExecutorController",
12 |     "TaskOutput",
13 | ]
14 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/debug_event_loop.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from typing import Any, List
 3 | 
 4 | from .events import BaseEvent
 5 | 
# True only when the INDEXIFY_FUNCTION_EXECUTOR_CONTROLLER_DEBUG_EVENT_LOOP
# environment variable is exactly "1" (it defaults to "0" when unset).
# The variable is read once, when this module is imported.
_DEBUG_EVENT_LOOP: bool = (
    os.getenv("INDEXIFY_FUNCTION_EXECUTOR_CONTROLLER_DEBUG_EVENT_LOOP", "0")
) == "1"
 9 | 
10 | 
def debug_print_processing_event(event: BaseEvent, logger: Any) -> None:
    """Logs at debug level the event that the control loop is processing.

    Does nothing unless event loop debugging is enabled.

    Args:
        event: the event being processed; logged as str(event).
        logger: logger that receives the debug record.
    """
    if not _DEBUG_EVENT_LOOP:
        return

    event_text = str(event)
    logger.debug("processing event in control loop", fe_event=event_text)
17 | 
18 | 
def debug_print_adding_event(event: BaseEvent, source: str, logger: Any) -> None:
    """Logs at debug level the event that is being added to the control loop.

    Does nothing unless event loop debugging is enabled.

    Args:
        event: the event being added; logged as str(event).
        source: caller-supplied label logged alongside the event.
        logger: logger that receives the debug record.
    """
    if not _DEBUG_EVENT_LOOP:
        return

    event_text = str(event)
    logger.debug("adding event to control loop", source=source, fe_event=event_text)
26 | 
27 | 
def debug_print_events(events: List[BaseEvent], logger: Any) -> None:
    """Logs at debug level all events currently held by the control loop.

    Does nothing unless event loop debugging is enabled.

    Args:
        events: the events to report; each one is logged as str(event).
        logger: logger that receives the debug record.
    """
    if not _DEBUG_EVENT_LOOP:
        return

    if len(events) == 0:
        # Fixed typo in the message: it used to read "no events n control loop".
        logger.debug("no events in control loop")
    else:
        logger.debug(
            "events in control loop",
            count=len(events),
            fe_events=[str(event) for event in events],
        )
38 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/destroy_function_executor.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Optional
 2 | 
 3 | from indexify.executor.function_executor.function_executor import FunctionExecutor
 4 | from indexify.proto.executor_api_pb2 import FunctionExecutorTerminationReason
 5 | 
 6 | from .events import FunctionExecutorDestroyed
 7 | 
 8 | 
 9 | async def destroy_function_executor(
10 |     function_executor: Optional[FunctionExecutor],
11 |     termination_reason: FunctionExecutorTerminationReason,
12 |     logger: Any,
13 | ) -> FunctionExecutorDestroyed:
14 |     """Destroys a function executor.
15 | 
16 |     Doesn't raise any exceptions.
17 |     """
18 |     logger = logger.bind(module=__name__)
19 | 
20 |     if function_executor is not None:
21 |         logger.info(
22 |             "destroying function executor",
23 |         )
24 |         await function_executor.destroy()
25 | 
26 |     return FunctionExecutorDestroyed(
27 |         is_success=True, termination_reason=termination_reason
28 |     )
29 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/loggers.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | from indexify.proto.executor_api_pb2 import (
 4 |     FunctionExecutorDescription,
 5 |     Task,
 6 | )
 7 | 
 8 | 
 9 | def function_executor_logger(
10 |     function_executor_description: FunctionExecutorDescription, logger: Any
11 | ) -> Any:
12 |     """Returns a logger bound with the FE's metadata.
13 | 
14 |     The function assumes that the FE might be invalid."""
15 |     return logger.bind(
16 |         function_executor_id=(
17 |             function_executor_description.id
18 |             if function_executor_description.HasField("id")
19 |             else None
20 |         ),
21 |         namespace=(
22 |             function_executor_description.namespace
23 |             if function_executor_description.HasField("namespace")
24 |             else None
25 |         ),
26 |         graph_name=(
27 |             function_executor_description.graph_name
28 |             if function_executor_description.HasField("graph_name")
29 |             else None
30 |         ),
31 |         graph_version=(
32 |             function_executor_description.graph_version
33 |             if function_executor_description.HasField("graph_version")
34 |             else None
35 |         ),
36 |         function_name=(
37 |             function_executor_description.function_name
38 |             if function_executor_description.HasField("function_name")
39 |             else None
40 |         ),
41 |     )
42 | 
43 | 
def task_logger(task: Task, logger: Any) -> Any:
    """Binds the task's metadata to the supplied logger and returns the result.

    The task is not assumed to be valid: every field that is not set is
    bound as None instead of being read.

    Args:
        task: task to take the metadata from.
        logger: logger to bind the metadata to.

    Returns:
        Whatever logger.bind() returns.
    """

    def _field_or_none(field_name: str) -> Any:
        # Only read the field once HasField() confirms that it is set.
        if task.HasField(field_name):
            return getattr(task, field_name)
        return None

    return logger.bind(
        task_id=_field_or_none("id"),
        namespace=_field_or_none("namespace"),
        graph_name=_field_or_none("graph_name"),
        graph_version=_field_or_none("graph_version"),
        function_name=_field_or_none("function_name"),
        graph_invocation_id=_field_or_none("graph_invocation_id"),
    )
58 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/message_validators.py:
--------------------------------------------------------------------------------
 1 | from tensorlake.function_executor.proto.message_validator import MessageValidator
 2 | 
 3 | from indexify.proto.executor_api_pb2 import (
 4 |     DataPayload,
 5 |     FunctionExecutorDescription,
 6 |     Task,
 7 | )
 8 | 
 9 | 
def validate_function_executor_description(
    function_executor_description: FunctionExecutorDescription,
) -> None:
    """Checks that the supplied FE description has all the required fields.

    The description itself, its graph payload, its resources and (only when
    set) its GPU resources are checked, in that order.

    Raises ValueError if the description is not valid.
    """
    # image_uri is optional.
    # secret_names can be empty.
    description_fields = (
        "id",
        "namespace",
        "graph_name",
        "graph_version",
        "function_name",
        "customer_code_timeout_ms",
        "graph",
        "resources",
    )
    description_validator = MessageValidator(function_executor_description)
    for field_name in description_fields:
        description_validator.required_field(field_name)

    _validate_data_payload(function_executor_description.graph)

    resources = function_executor_description.resources
    resources_validator = MessageValidator(resources)
    for field_name in ("cpu_ms_per_sec", "memory_bytes", "disk_bytes"):
        resources_validator.required_field(field_name)

    # GPU is optional, but when it's set it has to be fully specified.
    if resources.HasField("gpu"):
        gpu_validator = MessageValidator(resources.gpu)
        for field_name in ("count", "model"):
            gpu_validator.required_field(field_name)
40 | 
41 | 
def validate_task(task: Task) -> None:
    """Checks that the supplied Task has all the required fields.

    The task input payload is always checked; the reducer input payload is
    checked only when it is set.

    Raises ValueError if the Task is not valid.
    """
    required_fields = (
        "id",
        "namespace",
        "graph_name",
        "graph_version",
        "function_name",
        "graph_invocation_id",
        "timeout_ms",
        "input",
        "output_payload_uri_prefix",
        "retry_policy",
    )
    task_validator = MessageValidator(task)
    for field_name in required_fields:
        task_validator.required_field(field_name)

    _validate_data_payload(task.input)
    if task.HasField("reducer_input"):
        _validate_data_payload(task.reducer_input)
62 | 
63 | 
def _validate_data_payload(data_payload: DataPayload) -> None:
    """Checks that the supplied DataPayload has its "uri" and "encoding" fields.

    Raises ValueError if the DataPayload is not valid.
    """
    payload_validator = MessageValidator(data_payload)
    # required_field() returns an object that supports a further required_field()
    # call, so the second check goes through the result of the first one.
    after_uri = payload_validator.required_field("uri")
    after_uri.required_field("encoding")
70 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/metrics/function_executor_controller.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
 3 | from indexify.executor.monitoring.metrics import (
 4 |     latency_metric_for_customer_controlled_operation,
 5 |     latency_metric_for_fast_operation,
 6 | )
 7 | 
# Prometheus metrics of the Function Executor controller.
# Every metric is created once, when this module is imported.

# Latency of handling a single event in the control loop.
metric_control_loop_handle_event_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "handle_function_executor_control_loop_event",
        "Handle Function Executor control loop event",
    )
)

metric_tasks_fetched: prometheus_client.Counter = prometheus_client.Counter(
    "tasks_fetched", "Number of tasks that were fetched from Server"
)

# Task scheduling metrics.
metric_schedule_task_latency: prometheus_client.Histogram = (
    latency_metric_for_customer_controlled_operation(
        "schedule_task",
        "Schedule a task for execution after it got ready for execution",
    )
)
metric_runnable_tasks: prometheus_client.Gauge = prometheus_client.Gauge(
    "runnable_tasks",
    "Number of tasks that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
)
# Same description as metric_runnable_tasks, broken down by the "function_name" label.
metric_runnable_tasks_per_function_name: prometheus_client.Gauge = (
    prometheus_client.Gauge(
        "runnable_tasks_per_function_name",
        "Number of tasks that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
        ["function_name"],
    )
)

# Function Executor status metrics.
metric_function_executors_with_status: prometheus_client.Gauge = (
    prometheus_client.Gauge(
        "function_executors_with_status",
        "Number of Function Executors with a particular status",
        ["status"],
    )
)
# Values of the "status" label of metric_function_executors_with_status.
METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN = "unknown"
METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING = "pending"
METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING = "running"
METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED = "terminated"

# Request the labelled child of the gauge for every known status at import time;
# the returned objects are deliberately discarded.
# NOTE(review): presumably done so that every status series is exported before
# the first Function Executor reaches that status - confirm.
metric_function_executors_with_status.labels(
    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
)
metric_function_executors_with_status.labels(
    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
)
metric_function_executors_with_status.labels(
    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
)
metric_function_executors_with_status.labels(
    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
)
61 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/metrics/run_task.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
 3 | from indexify.executor.monitoring.metrics import (
 4 |     latency_metric_for_customer_controlled_operation,
 5 | )
 6 | 
# Prometheus metrics for the "run task" RPCs sent to Function Executors.
# Every metric is created once, when this module is imported.

# Count of run task RPCs and of those that resulted in an error.
metric_function_executor_run_task_rpcs: prometheus_client.Counter = (
    prometheus_client.Counter(
        "function_executor_run_task_rpcs", "Number of Function Executor run task RPCs"
    )
)
metric_function_executor_run_task_rpc_errors: prometheus_client.Counter = (
    prometheus_client.Counter(
        "function_executor_run_task_rpc_errors",
        "Number of Function Executor run task RPC errors",
    )
)
# Latency histogram created by the helper for customer controlled operations.
metric_function_executor_run_task_rpc_latency: prometheus_client.Histogram = (
    latency_metric_for_customer_controlled_operation(
        "function_executor_run_task_rpc", "Function Executor run task RPC"
    )
)
# Number of run task RPCs that are currently in flight.
metric_function_executor_run_task_rpcs_in_progress: prometheus_client.Gauge = (
    prometheus_client.Gauge(
        "function_executor_run_task_rpcs_in_progress",
        "Number of Function Executor run task RPCs in progress",
    )
)
29 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/metrics/upload_task_output.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
 3 | from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
 4 | 
# Task output upload metrics.
# Every metric is created once, when this module is imported.
metric_task_output_uploads: prometheus_client.Counter = prometheus_client.Counter(
    "task_output_uploads",
    "Number of task output uploads",
)
# Note: the exported metric name is "tasks_uploading_output" (singular "output"),
# it differs from the name of this variable.
metric_tasks_uploading_outputs: prometheus_client.Gauge = prometheus_client.Gauge(
    "tasks_uploading_output",
    "Number of tasks currently uploading their outputs",
)
metric_task_output_upload_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation("task_output_upload", "task output upload")
)
# Note: the exported metric name starts with "tasks_" while the variable name
# and the other metrics in this group start with "task_".
metric_task_output_upload_retries: prometheus_client.Counter = (
    prometheus_client.Counter(
        "tasks_output_upload_retries", "Number of task output upload retries"
    )
)

# Metrics for individual blob store operations.
metric_task_output_blob_store_uploads: prometheus_client.Counter = (
    prometheus_client.Counter(
        "task_output_blob_store_uploads", "Number of task output uploads to blob store"
    )
)
metric_task_output_blob_store_upload_errors: prometheus_client.Counter = (
    prometheus_client.Counter(
        "task_output_blob_store_upload_errors",
        "Number of failed task output uploads to blob store",
    )
)
metric_task_output_blob_store_upload_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "task_output_blob_store_upload", "Upload task output to blob store"
    )
)
40 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/prepare_task.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | from indexify.executor.blob_store.blob_store import BLOBStore
 4 | 
 5 | from .downloads import download_init_value, download_input
 6 | from .events import TaskPreparationFinished
 7 | from .task_info import TaskInfo
 8 | 
 9 | 
async def prepare_task(
    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
) -> TaskPreparationFinished:
    """Downloads everything the task needs before it can run.

    The task input is always downloaded and stored in task_info.input. The
    init value is downloaded and stored in task_info.init_value only when
    the task has its "reducer_input" field set.

    Args:
        task_info: the task to prepare; it is updated in place.
        blob_store: BLOB store passed to the download functions.
        logger: logger to use; it gets bound with this module's name.

    Returns:
        TaskPreparationFinished with is_success=True when all downloads
        succeeded, or with is_success=False when any exception was raised.

    Doesn't raise any exceptions.
    """
    log = logger.bind(module=__name__)
    try:
        task_info.input = await download_input(
            data_payload=task_info.task.input,
            blob_store=blob_store,
            logger=log,
        )

        is_reducer_call: bool = task_info.task.HasField("reducer_input")
        if is_reducer_call:
            task_info.init_value = await download_init_value(
                data_payload=task_info.task.reducer_input,
                blob_store=blob_store,
                logger=log,
            )
    except Exception as e:
        # Any failure is reported through the returned event, not re-raised.
        log.error("Failed to prepare task", exc_info=e)
        return TaskPreparationFinished(task_info=task_info, is_success=False)

    return TaskPreparationFinished(task_info=task_info, is_success=True)
39 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/function_executor_controller/task_info.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from dataclasses import dataclass
 3 | from typing import Optional
 4 | 
 5 | from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
 6 | 
 7 | from indexify.proto.executor_api_pb2 import Task
 8 | 
 9 | from .task_output import TaskOutput
10 | 
11 | 
@dataclass
class TaskInfo:
    """Object used to track a task during its full lifecycle in the FunctionExecutorController.

    Only task, allocation_id and start_time have to be supplied on creation;
    all the other fields have defaults and are meant to be filled in later.
    """

    # The task being tracked.
    task: Task
    # NOTE(review): presumably ID of the allocation this task belongs to - confirm with the caller.
    allocation_id: str
    # time.monotonic() timestamp
    # NOTE(review): the moment it refers to is not visible here, presumably when tracking of the task started - confirm.
    start_time: float
    # time.monotonic() timestamp when the task was prepared for execution
    # (0.0 until then).
    prepared_time: float = 0.0
    # True if the task was cancelled.
    is_cancelled: bool = False
    # aio task that is currently executing a lifecycle step of this task.
    aio_task: Optional[asyncio.Task] = None
    # Downloaded input if function was prepared successfully.
    input: Optional[SerializedObject] = None
    # Downloaded init value if function was prepared successfully and is a reducer.
    init_value: Optional[SerializedObject] = None
    # Output of the task.
    output: Optional[TaskOutput] = None
    # True if the task is fully completed and was added to state reporter.
    is_completed: bool = False
34 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/host_resources/nvidia_gpu_allocator.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, List
 2 | 
 3 | from .nvidia_gpu import NvidiaGPUInfo, fetch_nvidia_gpu_infos, nvidia_gpus_are_available
 4 | 
 5 | 
 6 | class NvidiaGPUAllocator:
 7 |     """NvidiaGPUAllocator is a class that manages the allocation and deallocation of GPUs."""
 8 | 
 9 |     def __init__(self, logger: Any):
10 |         gpu_infos: List[NvidiaGPUInfo] = []
11 | 
12 |         if nvidia_gpus_are_available():
13 |             gpu_infos = fetch_nvidia_gpu_infos(logger)
14 |             logger.bind(module=__name__).info(
15 |                 "Fetched information about NVIDIA GPUs:", info=gpu_infos
16 |             )
17 | 
18 |         self._all_gpus: List[NvidiaGPUInfo] = gpu_infos
19 |         self._free_gpus: List[NvidiaGPUInfo] = list(gpu_infos)
20 | 
21 |     def allocate(self, count: int, logger: Any) -> List[NvidiaGPUInfo]:
22 |         """
23 |         Allocates a specified number of GPUs.
24 | 
25 |         Args:
26 |             count (int): The number of GPUs to allocate.
27 | 
28 |         Returns:
29 |             List[NvidiaGPUInfo]: A list of allocated GPUs. The list is empty if count is 0.
30 | 
31 |         Raises:
32 |             ValueError: If the requested number of GPUs exceeds free GPUs.
33 |             Exception: If an error occurs during allocation.
34 |         """
35 |         if count > len(self._free_gpus):
36 |             raise ValueError(
37 |                 f"Not enough free GPUs available, requested={count}, available={len(self._free_gpus)}"
38 |             )
39 | 
40 |         allocated_gpus: List[NvidiaGPUInfo] = []
41 |         for _ in range(count):
42 |             allocated_gpus.append(self._free_gpus.pop())
43 | 
44 |         if len(allocated_gpus) > 0:
45 |             logger.bind(module=__name__).info("allocated GPUs:", gpus=allocated_gpus)
46 | 
47 |         return allocated_gpus
48 | 
49 |     def deallocate(self, gpus: List[NvidiaGPUInfo], logger: Any) -> None:
50 |         self._free_gpus.extend(gpus)
51 | 
52 |         if len(gpus) > 0:
53 |             logger.bind(module=__name__).info("deallocated GPUs:", gpus=gpus)
54 | 
55 |     def list_all(self) -> List[NvidiaGPUInfo]:
56 |         return list(self._all_gpus)  # Return a copy to avoid external modification
57 | 
58 |     def list_free(self) -> List[NvidiaGPUInfo]:
59 |         return list(self._free_gpus)  # Return a copy to avoid external modification
60 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/metrics/channel_manager.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
 3 | from ..monitoring.metrics import latency_metric_for_fast_operation
 4 | 
# Prometheus metrics for creation of channels to the gRPC Server.
# Every metric is created once, when this module is imported.
metric_grpc_server_channel_creations: prometheus_client.Counter = prometheus_client.Counter(
    "grpc_server_channel_creations",
    "Number of times a channel to gRPC Server was created",
)
metric_grpc_server_channel_creation_retries: prometheus_client.Counter = prometheus_client.Counter(
    "grpc_server_channel_creation_retries",
    "Number of retries during a channel creation to gRPC Server",
)
# Latency histogram created by the helper for fast operations.
metric_grpc_server_channel_creation_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "grpc_server_channel_creation",
        "gRPC server channel creation",
    )
)
19 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/metrics/executor.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
# This file contains the metrics that describe the Executor as a whole.

# Executor overview metrics.
# Info metric; the information itself is set elsewhere, not in this file.
metric_executor_info: prometheus_client.Info = prometheus_client.Info(
    "executor", "Executor information"
)
# Enum metric restricted to the three states listed below.
metric_executor_state: prometheus_client.Enum = prometheus_client.Enum(
    "executor_state",
    "Current Executor state",
    states=["starting", "running", "shutting_down"],
)
14 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/metrics/state_reconciler.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
 3 | from ..monitoring.metrics import latency_metric_for_fast_operation
 4 | 
# Prometheus metrics for Executor state reconciliation.
# Every metric is created once, when this module is imported.
metric_state_reconciliations: prometheus_client.Counter = prometheus_client.Counter(
    "state_reconciliations",
    "Number of Executor state reconciliations",
)
# Per its description this only counts errors that remained after all retries.
metric_state_reconciliation_errors: prometheus_client.Counter = prometheus_client.Counter(
    "state_reconciliation_errors",
    "Number of Executor state reconciliation errors after all retries",
)
# Latency histogram created by the helper for fast operations.
metric_state_reconciliation_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "state_reconciliation", "Executor state reconciliation"
    )
)
18 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/metrics/state_reporter.py:
--------------------------------------------------------------------------------
 1 | import prometheus_client
 2 | 
 3 | from ..monitoring.metrics import latency_metric_for_fast_operation
 4 | 
# Prometheus metrics for Executor state report RPCs to Server.
# Every metric is created once, when this module is imported.
metric_state_report_rpcs: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpcs",
    "Number of Executor state report RPCs to Server",
)
# Note: the exported metric name is "state_report_rpc_errors", it has an
# "rpc" part that the name of this variable doesn't have.
metric_state_report_errors: prometheus_client.Counter = prometheus_client.Counter(
    "state_report_rpc_errors",
    "Number of Executor state report RPC errors",
)
# Latency histogram created by the helper for fast operations.
metric_state_report_latency: prometheus_client.Histogram = (
    latency_metric_for_fast_operation(
        "state_report_rpc", "Executor state report rpc to Server"
    )
)
18 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/monitoring/handler.py:
--------------------------------------------------------------------------------
1 | from aiohttp import web
2 | 
3 | 
class Handler:
    """Base class that all request handlers derive from.

    It defines the handle() coroutine that subclasses have to override."""

    async def handle(self, request: web.Request) -> web.Response:
        """Handles the request and returns the response to send back.

        This base implementation always raises NotImplementedError.
        """
        message = "Subclasses must implement this method."
        raise NotImplementedError(message)
9 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/monitoring/health_check_handler.py:
--------------------------------------------------------------------------------
 1 | from aiohttp import web
 2 | 
 3 | from .handler import Handler
 4 | from .health_checker.health_checker import HealthChecker, HealthCheckResult
 5 | 
 6 | 
 7 | class HealthCheckHandler(Handler):
 8 |     def __init__(self, health_checker: HealthChecker):
 9 |         self._health_checker = health_checker
10 | 
11 |     async def handle(self, request: web.Request) -> web.Response:
12 |         result: HealthCheckResult = await self._health_checker.check()
13 |         return web.json_response(
14 |             {
15 |                 "status": "ok" if result.is_success else "nok",
16 |                 "message": result.status_message,
17 |                 "checker": result.checker_name,
18 |             },
19 |             status=200 if result.is_success else 503,
20 |         )
21 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/monitoring/health_checker/generic_health_checker.py:
--------------------------------------------------------------------------------
 1 | from .health_checker import HealthChecker, HealthCheckResult
 2 | 
 3 | HEALTH_CHECKER_NAME = "GenericHealthChecker"
 4 | 
 5 | 
 6 | class GenericHealthChecker(HealthChecker):
 7 |     """A generic health checker that doesn't depend on machine type and other features of the environment.
 8 | 
 9 |     The health checker uses software signals available in all environments like Function Executor failure rates.
10 |     """
11 | 
12 |     def __init__(self):
13 |         pass
14 | 
15 |     async def check(self) -> HealthCheckResult:
16 |         return HealthCheckResult(
17 |             is_success=True,
18 |             status_message="The health check is always successful",
19 |             checker_name=HEALTH_CHECKER_NAME,
20 |         )
21 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/monitoring/health_checker/health_checker.py:
--------------------------------------------------------------------------------
 1 | class HealthCheckResult:
 2 |     def __init__(self, checker_name: str, is_success: bool, status_message: str):
 3 |         self.checker_name = checker_name
 4 |         self.is_success = is_success
 5 |         self.status_message = status_message
 6 | 
 7 | 
 8 | class HealthChecker:
 9 |     """Abstract base class for health checkers."""
10 | 
11 |     async def check(self) -> HealthCheckResult:
12 |         raise NotImplementedError("Subclasses must implement this method.")
13 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/monitoring/prometheus_metrics_handler.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | from aiohttp import web
 4 | from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
 5 | 
 6 | from .handler import Handler
 7 | 
 8 | 
 9 | class PrometheusMetricsHandler(Handler):
10 |     async def handle(self, request: web.Request) -> web.Response:
11 |         # Run the synchronous metrics generation code in ThreadPool thread
12 |         # to not block the main asyncio loop.
13 |         return await asyncio.to_thread(self._handle_sync)
14 | 
15 |     def _handle_sync(self) -> web.Response:
16 |         return web.Response(
17 |             body=generate_latest(), headers={"Content-Type": CONTENT_TYPE_LATEST}
18 |         )
19 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/monitoring/server.py:
--------------------------------------------------------------------------------
 1 | from aiohttp import web
 2 | 
 3 | from .handler import Handler
 4 | 
 5 | 
 6 | class MonitoringServer:
 7 |     def __init__(
 8 |         self,
 9 |         host: str,
10 |         port: int,
11 |         startup_probe_handler: Handler,
12 |         health_probe_handler: Handler,
13 |         metrics_handler: Handler,
14 |     ):
15 |         self._host = host
16 |         self._port = port
17 |         self._app: web.Application = web.Application()
18 |         self._app.add_routes(
19 |             [
20 |                 web.get("/monitoring/startup", startup_probe_handler.handle),
21 |                 web.get("/monitoring/health", health_probe_handler.handle),
22 |                 web.get("/monitoring/metrics", metrics_handler.handle),
23 |             ]
24 |         )
25 |         self._app_runner: web.AppRunner = web.AppRunner(self._app)
26 | 
27 |     async def run(self):
28 |         await self._app_runner.setup()
29 |         site = web.TCPSite(
30 |             runner=self._app_runner,
31 |             host=self._host,
32 |             port=self._port,
33 |             # Allow to listen when there's a closed socket in TIME_WAIT state
34 |             reuse_address=True,
35 |             # Don't allow other TCP sockets to actively listen on this address
36 |             reuse_port=False,
37 |         )
38 |         await site.start()
39 | 
40 |     async def shutdown(self):
41 |         await self._app_runner.cleanup()
42 | 


--------------------------------------------------------------------------------
/indexify/src/indexify/executor/monitoring/startup_probe_handler.py:
--------------------------------------------------------------------------------
 1 | from aiohttp import web
 2 | 
 3 | from .handler import Handler
 4 | 
 5 | 
 6 | class StartupProbeHandler(Handler):
 7 |     def __init__(self):
 8 |         self._ready = False
 9 | 
10 |     def set_ready(self):
11 |         self._ready = True
12 | 
13 |     async def handle(self, request: web.Request) -> web.Response:
14 |         if self._ready:
15 |             return web.json_response({"status": "ok"})
16 |         else:
17 |             return web.json_response({"status": "nok"}, status=503)
18 | 


--------------------------------------------------------------------------------
/indexify/tests/.gitignore:
--------------------------------------------------------------------------------
1 | .run_tests_summary.txt


--------------------------------------------------------------------------------
/indexify/tests/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | 
3 | Put here tests for end-to-end integration of Open Source Indexify components
4 | and tools provided in this package.


--------------------------------------------------------------------------------
/indexify/tests/cli/test_environment_variables.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | import unittest
 4 | from typing import Dict
 5 | 
 6 | import pydantic
 7 | import testing
 8 | from tensorlake import Graph, RemoteGraph, tensorlake_function
 9 | from tensorlake.functions_sdk.graph_serialization import graph_code_dir_path
10 | from testing import (
11 |     ExecutorProcessContextManager,
12 |     executor_pid,
13 |     test_graph_name,
14 |     wait_executor_startup,
15 | )
16 | 
17 | 
class Response(pydantic.BaseModel):
    """Value returned by ``function_a``: where it ran and what environment it saw."""

    # Value of testing.executor_pid() at the time the function ran.
    executor_pid: int
    # Copy of os.environ taken inside the function.
    environment: Dict[str, str]
21 | 
22 | 
@tensorlake_function()
def function_a() -> Response:
    """Reports ``executor_pid()`` together with a copy of ``os.environ``."""
    pid = executor_pid()
    env_snapshot = os.environ.copy()
    return Response(executor_pid=pid, environment=env_snapshot)
26 | 
27 | 
class TestEnvironmentVariables(unittest.TestCase):
    """Checks that environment variables given to an Executor reach its functions."""

    def test_executor_env_variables_are_passed_to_functions(self):
        """Starts an extra Executor with custom env vars and verifies functions see them.

        The graph is run up to 10 times because an invocation may land on a
        different executor. The environment is only checked for an invocation
        whose reported PID matches ``executor_a``. If no invocation lands
        there, the test fails explicitly instead of passing without having
        verified anything.
        """
        with ExecutorProcessContextManager(
            [
                "--monitoring-server-port",
                "7001",
            ],
            keep_std_outputs=False,
            extra_env={
                "INDEXIFY_TEST_ENV_VAR": "test_value",
                "INDEXIFY_TEST_ENV_VAR_2": "test_value_2",
            },
        ) as executor_a:
            executor_a: subprocess.Popen
            print(f"Started Executor A with PID: {executor_a.pid}")
            wait_executor_startup(7001)

            graph = Graph(
                name=test_graph_name(self),
                description="test",
                start_node=function_a,
            )
            graph = RemoteGraph.deploy(
                graph=graph, code_dir_path=graph_code_dir_path(__file__)
            )

            # Run 10 times to have close to 100% chance of landing the functions on executor_a and not default test executor.
            for _ in range(10):
                invocation_id = graph.run(block_until_done=True)
                output = graph.output(invocation_id, "function_a")
                self.assertEqual(len(output), 1)
                response: Response = output[0]
                if response.executor_pid != executor_a.pid:
                    continue

                print(
                    "The invocation landed on executor_a, verifying environment variables."
                )
                self.assertIn("INDEXIFY_TEST_ENV_VAR", response.environment)
                self.assertEqual(
                    response.environment["INDEXIFY_TEST_ENV_VAR"], "test_value"
                )
                self.assertIn("INDEXIFY_TEST_ENV_VAR_2", response.environment)
                self.assertEqual(
                    response.environment["INDEXIFY_TEST_ENV_VAR_2"], "test_value_2"
                )
                break
            else:
                # Without this branch the test would pass even though no
                # assertion about the environment was ever executed.
                self.fail(
                    "None of the 10 invocations landed on executor_a; "
                    "environment variables were not verified."
                )
73 | 
74 | 
75 | if __name__ == "__main__":
76 |     unittest.main()
77 | 


--------------------------------------------------------------------------------
/indexify/tests/cli/test_startup_probe.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import unittest
 3 | 
 4 | import httpx
 5 | from testing import ExecutorProcessContextManager, wait_executor_startup
 6 | 
 7 | 
 8 | class TestStartupProbe(unittest.TestCase):
 9 |     def test_success(self):
10 |         with ExecutorProcessContextManager(
11 |             [
12 |                 "--monitoring-server-port",
13 |                 "7001",
14 |             ]
15 |         ) as executor_a:
16 |             executor_a: subprocess.Popen
17 |             print(f"Started Executor A with PID: {executor_a.pid}")
18 |             wait_executor_startup(7001)
19 |             response = httpx.get(f"http://localhost:7001/monitoring/startup")
20 |             self.assertEqual(response.status_code, 200)
21 |             self.assertEqual(response.json(), {"status": "ok"})
22 | 
23 |     def test_failure(self):
24 |         # There's currently no way to reliably slow down Executor startup so this test is empty for now.
25 |         pass
26 | 
27 | 
28 | if __name__ == "__main__":
29 |     unittest.main()
30 | 


--------------------------------------------------------------------------------
/indexify/tests/executor/constants.py:
--------------------------------------------------------------------------------
 1 | tls_config = {
 2 |     "use_tls": True,
 3 |     "tls_config": {
 4 |         "ca_bundle_path": "/path/to/ca_bundle.pem",
 5 |         "cert_path": "/path/to/cert.pem",
 6 |         "key_path": "/path/to/key.pem",
 7 |     },
 8 | }
 9 | 
10 | cert_path = tls_config["tls_config"]["cert_path"]
11 | key_path = tls_config["tls_config"]["key_path"]
12 | ca_bundle_path = tls_config["tls_config"]["ca_bundle_path"]
13 | server_address = "localhost:8900"
14 | config_path = "test/config/path"
15 | code_path = "test/code_path"
16 | 


--------------------------------------------------------------------------------
/indexify/tests/executor/testing.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import platform
 3 | import subprocess
 4 | import unittest
 5 | from typing import List, Optional
 6 | 
 7 | 
 8 | def test_graph_name(test_case: unittest.TestCase) -> str:
 9 |     """Converts a test case to a unique graph name.
10 | 
11 |     Example:
12 |     >>> class TestGraphReduce(unittest.TestCase):
13 |     ...     def test_simple(self):
14 |     ...         g = Graph(name=graph_name(self), start_node=generate_seq)
15 |     ...         # ...
16 |     ...         print(g.name)
17 |     ...         # test_graph_reduce_test_simple
18 |     """
19 |     return unittest.TestCase.id(test_case).replace(".", "_")
20 | 


--------------------------------------------------------------------------------
/indexify/tests/run_tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ -z "$INDEXIFY_URL" ]]; then
 4 |     echo "Please set INDEXIFY_URL environment variable to specify"\
 5 |     "Indexify Server you are testing." \
 6 |     "Example: 'export INDEXIFY_URL=http://localhost:8900'" 1>&2
 7 |     exit 1
 8 | fi
 9 | 
10 | tests_exit_code=0
11 | 
# Runs every test file of one suite, one after another.
#
# Arguments:
#   $1 - whitespace-separated list of test file paths
#   $2 - human-readable suite name used in log and summary messages
# Globals:
#   summary_file    - failures are appended to this file
#   tests_exit_code - set to non-zero if any test file fails
run_test_suite() {
  local test_files=$1
  local test_suite_name=$2
  local test_file
  local test_file_exit_code

  # Run each test file one by one sequentially. Set $tests_exit_code to non zero
  # value if any of the test commands return non zero status code. Don't
  # stop if a test command fails.
  # $test_files is intentionally unquoted: it is split into individual paths.
  for test_file in $test_files; do
    echo "Running $test_file for $test_suite_name test suite"
    poetry run python "$test_file"
    test_file_exit_code=$?
    if [ "$test_file_exit_code" -ne 0 ]; then
      echo "One or more tests failed in $test_file for $test_suite_name test suite." | tee -a "$summary_file"
    fi
    tests_exit_code=$((tests_exit_code || test_file_exit_code))
  done
}
30 | 
# cd to the script's directory. Abort if that fails: everything below
# (rm, find) uses paths relative to it and must not run somewhere else.
cd "$(dirname "$0")" || exit 1

summary_file=".run_tests_summary.txt"
rm -f "$summary_file"

# Indexify tests.
indexify_test_files=$(find . -name 'test_*.py')
# Tensorlake SDK tests verify user visible functionality end-to-end.
tensorlake_sdk_test_files=$(find ../../tensorlake/tests/tensorlake -name 'test_*.py')

run_test_suite "$indexify_test_files" "Indexify"
run_test_suite "$tensorlake_sdk_test_files" "Tensorlake SDK"

if [ "$tests_exit_code" -eq 0 ]; then
  echo "All tests passed!" >> "$summary_file"
else
  echo "One or more tests failed. Please check output log for details." >> "$summary_file"
fi

cat "$summary_file"
exit "$tests_exit_code"


--------------------------------------------------------------------------------
/operations/k8s/README.md:
--------------------------------------------------------------------------------
 1 | # Kubernetes
 2 | 
 3 | ## Cluster Creation
 4 | 
 5 | You'll need a k8s cluster first. While there are a lot of different ways to get
 6 | a cluster, if you're doing this locally, we recommend using [k3d][k3d].
 7 | 
 8 | [k3d]: https://k3d.io/v5.6.3/#releases
 9 | 
10 | Note: the local example includes a basic ingress -
11 | [components/ingress](kustomize/components/ingress). The ingress exposes the API
12 | server and is required to use Indexify. If you're doing a different setup,
13 | you'll want to make an ingress definition that is specific to your environment.
14 | 
15 | ### Local
16 | 
17 | One way to create a cluster is using [k3d][k3d]. This will run a lightweight
18 | version of Kubernetes entirely within docker on your local system.
19 | 
20 | ```bash
21 | k3d cluster create -p "8900:80@loadbalancer" indexify
22 | ```
23 | 
24 | When using this setup, Indexify will be exposed via k3d's ingress which will be
25 | [http://localhost:8900](http://localhost:8900). You'll want to configure
26 | `IndexifyClient(service_url="http://localhost:8900")`.
27 | 
28 | ## Installation
29 | 
30 | ### Helm
31 | 
32 | To run locally, you can install the chart using some
33 | [pre-configured values](helm/local.yaml) and then go through the getting started
34 | guide. To install, run:
35 | 
36 | ```bash
37 | helm install local helm -f helm/local.yaml -n indexify --create-namespace
38 | ```
39 | 
40 | The chart is configured to run in a local environment. To run in a production
41 | environment, you'll want to make sure to configure the following:
42 | 
43 | - Blob Store - We're using minio for local development via the [official
44 |   chart][minio]. `local.yaml` configures it to run without persistence. To use
45 |   S3, set `minio.enabled=false` and make sure IAM has added the correct
46 |   credentials for accessing S3. To use other blob stores that support S3's API,
47 |   look into setting `blobStore.endpoint` and `blobStore.credentialSecret`.
48 | 
49 | - Persistence - By default, the Indexify server is configured to use the
50 |   local filesystem as the stateful set storage backend. To use a cloud-based
51 |   storage solution, set `server.persistence.storageClassName` to your cloud
52 |   provider's storage class, e.g. `server.persistence.storageClassName: "ebs-csi-default-sc"`.
53 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/Chart.lock:
--------------------------------------------------------------------------------
1 | dependencies:
2 | - name: minio
3 |   repository: https://charts.min.io/
4 |   version: 5.2.0
5 | digest: sha256:d47eef3ed8adcdfbc9fa429591e081aa6f886d68cbb9b27e92e72c2de2aae871
6 | generated: "2024-10-15T20:15:37.274227+02:00"
7 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/Chart.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v2
 2 | name: helm
 3 | description: Indexify
 4 | type: application
 5 | version: 0.2.0
 6 | 
 7 | dependencies:
 8 |   - name: minio
 9 |     repository: https://charts.min.io/
10 |     condition: minio.enabled
11 |     version: 5.2.0
12 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/local.yaml:
--------------------------------------------------------------------------------
 1 | blobStore:
 2 |   allowHTTP: true
 3 |   endpoint: http://blob-store:9000
 4 |   credentialSecret: blob-creds
 5 |   config:
 6 |     backend: s3
 7 |     s3:
 8 |       bucket: indexify
 9 |       region: us-east-1
10 |       accessKey:
11 |       secretKey:
12 | 
13 | server:
14 |   image: tensorlake/indexify-server:latest
15 |   ingress:
16 |     enabled: true
17 |     annotations: {}
18 |   persistence:
19 |     storageClassName: 'local-path'
20 |     size: 1Gi
21 | 
22 | executors:
23 |   - name: indexify-executor
24 |     image: tensorlake/indexify-executor-default:latest
25 |     replicas: 1
26 | 
27 | minio:
28 |   enabled: true
29 |   fullnameOverride: blob-store
30 |   persistence:
31 |     enabled: false
32 |   mode: standalone
33 |   rootUser: minioadmin
34 |   rootPassword: minioadmin
35 |   buckets:
36 |     - name: indexify
37 |   resources:
38 |     requests:
39 |       memory: 128Mi
40 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/templates/_helpers.tpl:
--------------------------------------------------------------------------------
{{/*
labels: renders the common app.kubernetes.io/* label set.
Expects a dict with the keys:
  name      - rendered as app.kubernetes.io/name with all "-" removed
  component - rendered as app.kubernetes.io/component
  global    - root context; supplies Chart.Name, Release.Name, Release.Service
*/}}
{{- define "labels" -}}
app.kubernetes.io/name: {{ .name | replace "-" "" }}
app.kubernetes.io/component: {{ .component }}
app.kubernetes.io/part-of: {{ .global.Chart.Name }}
app.kubernetes.io/instance: {{ .global.Release.Name }}
app.kubernetes.io/managed-by: {{ .global.Release.Service }}
{{- end }}
 8 | 
{{/*
blobStore.env: renders container env entries from the .blobStore values.
  endpoint         - when set, emitted as both AWS_ENDPOINT and AWS_ENDPOINT_URL
  allowHTTP        - when truthy, emits AWS_ALLOW_HTTP="true"
  credentialSecret - when set, AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are
                     read from the same-named keys of that Secret
Nothing is rendered when .blobStore is empty.
*/}}
{{- define "blobStore.env" -}}
{{- with .blobStore -}}
{{- if .endpoint -}}
- name: AWS_ENDPOINT
  value: {{ .endpoint }}
- name: AWS_ENDPOINT_URL
  value: {{ .endpoint }}
{{- end }}
{{- if .allowHTTP }}
- name: AWS_ALLOW_HTTP
  value: "true"
{{- end }}
{{- if .credentialSecret }}
- name: AWS_ACCESS_KEY_ID
  valueFrom:
    secretKeyRef:
      name: {{ .credentialSecret }}
      key: AWS_ACCESS_KEY_ID
- name: AWS_SECRET_ACCESS_KEY
  valueFrom:
    secretKeyRef:
      name: {{ .credentialSecret }}
      key: AWS_SECRET_ACCESS_KEY
{{- end }}
{{- end }}
{{- end }}
35 | 
{{/*
quorum: takes a number as its context. Rendering fails with
"must be an odd number" when the number is divisible by 2; otherwise the
number itself is emitted unchanged.
*/}}
{{- define "quorum" -}}
{{- if eq (mod . 2) 0  }}
  {{- fail "must be an odd number" }}
{{- else }}
{{- . -}}
{{- end }}
{{- end }}
43 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/templates/config.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | apiVersion: v1
 3 | kind: ConfigMap
 4 | metadata:
 5 |   name: indexify-server
 6 |   labels:
 7 |     {{- include "labels" (dict "name" "indexify" "component" "config" "global" $) | nindent 4 }}
 8 | data:
 9 |   config.yaml: |-
10 |     listen_addr: 0.0.0.0:8900
11 |     state_store_path: /tmp/indexify/state
12 |     blob_storage:
13 |       {{- .Values.blobStore.config | toYaml | nindent 6 }}
14 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/templates/executor.yaml:
--------------------------------------------------------------------------------
 1 | {{- range .Values.executors }}
 2 | ---
 3 | apiVersion: v1
 4 | kind: Service
 5 | metadata:
 6 |   name: {{ .name | replace "-" "" }}
 7 |   labels:
 8 |     {{- include "labels" (dict "name" .name "component" "executors" "global" $) | nindent 4 }}
 9 | spec:
10 |   ports:
11 |     - port: 9501
12 |   selector:
13 |     {{- include "labels" (dict "name" .name "component" "executors" "global" $) | nindent 4 }}
14 | ---
15 | apiVersion: apps/v1
16 | kind: Deployment
17 | metadata:
18 |   name: {{ .name | replace "-" "" }}
19 |   labels:
20 |     {{- include "labels" (dict "name" .name "component" "executors" "global" $) | nindent 4 }}
21 | spec:
22 |   replicas: {{ default .replicas 1 }}
23 |   selector:
24 |     matchLabels:
25 |       {{- include "labels" (dict "name" .name "component" "executors" "global" $) | nindent 6 }}
26 |   template:
27 |     metadata:
28 |       labels:
29 |         {{- include "labels" (dict "name" .name "component" "executors" "global" $) | nindent 8 }}
30 |     spec:
31 |       {{- if .nodeSelector }}
32 |       nodeSelector:
33 |         {{- toYaml .nodeSelector | nindent 8 }}
34 |       {{- end }}
35 |       containers:
36 |         - name: executor
37 |           image: {{ .image }}
38 |           command:
39 |             - indexify-cli
40 |             - executor
41 |             - --server-addr
42 |             - indexify-server:8900
43 | 
44 | {{- end }}
45 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/templates/extra.yaml:
--------------------------------------------------------------------------------
1 | {{- range .Values.extraObjects -}}
2 | ---
3 | {{ . }}
4 | {{- end }}
5 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/templates/ingress.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.server.ingress.enabled }}
 2 | ---
 3 | apiVersion: networking.k8s.io/v1
 4 | kind: Ingress
 5 | metadata:
 6 |   name: ingress
 7 |   {{- if .Values.server.ingress.annotations}}
 8 |   annotations:
 9 |     {{- toYaml .Values.server.ingress.annotations | nindent 4 }}
10 |   {{- end }}
11 |   labels:
12 |     app.kubernetes.io/component: api
13 | spec:
14 |   rules:
15 |     - http:
16 |         paths:
17 |           - path: /
18 |             pathType: Prefix
19 |             backend:
20 |               service:
21 |                 name: indexify-server
22 |                 port:
23 |                   number: 8900
24 | {{- end }}
25 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/templates/secret.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.minio.enabled }}
 2 | ---
 3 | apiVersion: v1
 4 | kind: Secret
 5 | metadata:
 6 |   name: blob-creds
 7 | type: Opaque
 8 | stringData:
 9 |   AWS_ACCESS_KEY_ID: minioadmin
10 |   AWS_SECRET_ACCESS_KEY: minioadmin
11 | {{- else }}
12 | {{- if and .Values.blobStore.config.s3.accessKey .Values.blobStore.config.s3.secretKey }}
13 | ---
14 | apiVersion: v1
15 | kind: Secret
16 | metadata:
17 |   name: blob-creds
18 | type: Opaque
19 | stringData:
20 |   AWS_ACCESS_KEY_ID: {{ .Values.blobStore.config.s3.accessKey }}
21 |   AWS_SECRET_ACCESS_KEY: {{ .Values.blobStore.config.s3.secretKey }}
22 | {{- else}}
23 |   {{- fail "blob store credentials are not set" }}
24 | {{- end }}
25 | {{- end }}
26 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/templates/server.yaml:
--------------------------------------------------------------------------------
 1 | {{- with .Values.server }}
 2 | apiVersion: v1
 3 | kind: Service
 4 | metadata:
 5 |   name: indexify-server
 6 | spec:
 7 |   ports:
 8 |     - port: 8900
 9 |   selector:
10 |     {{- include "labels" (dict "name" "server" "component" "server" "global" $) | nindent 4 }}
11 | ---
12 | apiVersion: apps/v1
13 | kind: StatefulSet
14 | metadata:
15 |   name: indexify-server
16 |   labels:
17 |     {{- include "labels" (dict "name" "server" "component" "server" "global" $) | nindent 4 }}
18 | spec:
19 |   selector:
20 |     matchLabels:
21 |       {{- include "labels" (dict "name" "server" "component" "server" "global" $) | nindent 6 }}
22 |   template:
23 |     metadata:
24 |       labels:
25 |         {{- include "labels" (dict "name" "server" "component" "server" "global" $) | nindent 8 }}
26 |     spec:
27 |       {{- if .nodeSelector }}
28 |       nodeSelector:
29 |         {{- toYaml .nodeSelector | nindent 8 }}
30 |       {{- end }}
31 |       containers:
32 |         - name: indexify
33 |           image: {{ .image }}
34 |           command: ["indexify-server"]
35 |           args: ["--config", "/indexify/config/config.yaml"]
36 | 
37 |           volumeMounts:
38 |             - name: config
39 |               mountPath: /indexify/config
40 |               readOnly: true
41 |             - name: data
42 |               mountPath: /tmp/indexify/state
43 | 
44 |           env:
45 |             {{- include "blobStore.env" $.Values | nindent 12 }}
46 | 
47 |           livenessProbe:
48 |             httpGet:
49 |               path: /
50 |               port: 8900
51 | 
52 |           readinessProbe:
53 |             httpGet:
54 |               path: /
55 |               port: 8900
56 | 
57 |       volumes:
58 |         - name: config
59 |           configMap:
60 |             name: indexify-server
61 | 
62 |   {{- with .persistence }}
63 |   volumeClaimTemplates:
64 |   - metadata:
65 |       name: data
66 |       labels:
67 |         {{- include "labels" (dict "name" "server" "component" "server" "global" $) | nindent 8 }}
68 |     spec:
69 |       accessModes: [ "ReadWriteOnce" ]
70 |       storageClassName: {{ .storageClassName | default "standard" }}
71 |       resources:
72 |         requests:
73 |           storage: {{ .size | default "1Gi" }}
74 |   {{- end -}}
75 | {{- end }}
76 | 


--------------------------------------------------------------------------------
/operations/k8s/helm/values.yaml:
--------------------------------------------------------------------------------
 1 | blobStore:
 2 |   # allowHTTP: true
 3 |   # endpoint: http://blob-store:9000
 4 |   credentialSecret: blob-creds
 5 |   config:
 6 |     backend: s3
 7 |     s3: {}
 8 |     #  accessKey: null
 9 |     #  secretKey: null
10 | 
11 | server:
12 |   image: tensorlake/indexify-server:latest
13 |   ingress:
14 |     enabled: false
15 |     annotations: {}
16 |   persistence: {}
17 |     # storageClassName: 'local-path'
18 |     # size: 1Gi
19 | 
20 | executors:
21 |   # Executors is an array of executor configurations.
22 |   - name: indexify-executor
23 |     image: tensorlake/indexify-executor-default:latest
24 |     replicas: 1
25 | 
26 | minio:
27 |   enabled: false
28 | 


--------------------------------------------------------------------------------
/server/.dockerignore:
--------------------------------------------------------------------------------
1 | target
2 | indexify_storage


--------------------------------------------------------------------------------
/server/.repo/conf/distributions:
--------------------------------------------------------------------------------
 1 | Codename: buster
 2 | Suite: stable
 3 | Components: main
 4 | Architectures: amd64 arm64
 5 | 
 6 | Codename: jammy
 7 | Suite: stable
 8 | Components: main restricted universe multiverse
 9 | Architectures: amd64 arm64
10 | 
11 | Codename: focal
12 | Suite: stable
13 | Components: main restricted universe multiverse
14 | Architectures: amd64 arm64


--------------------------------------------------------------------------------
/server/Cross.toml:
--------------------------------------------------------------------------------
 1 | [target.aarch64-unknown-linux-gnu]
 2 | dockerfile = "dockerfiles/Dockerfile.builder_linux_aarch64"
 3 | pre-build = [
 4 |     "dpkg --add-architecture $CROSS_DEB_ARCH",
 5 |     "mkdir -p /.npm",
 6 |     "chown -R 1001:128 /.npm"
 7 | ]
 8 | 
 9 | [target.x86_64-unknown-linux-gnu]
10 | dockerfile = "dockerfiles/Dockerfile.builder_linux_x86"
11 | pre-build = [
12 |     "mkdir -p /.npm",
13 |     "chown -R 1001:128 /.npm"
14 | ]
15 | 


--------------------------------------------------------------------------------
/server/blob_store/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "blob_store"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | metrics = {workspace = true}
 8 | object_store = {workspace = true}
 9 | futures = {workspace = true}
10 | anyhow = {workspace = true}
11 | bytes = {workspace = true}
12 | async-trait = {workspace = true}
13 | serde = {workspace = true}
14 | tokio = {workspace = true}
15 | tokio-stream = {workspace = true}
16 | tracing = {workspace = true}
17 | reqwest = {workspace = true}
18 | async-stream = {workspace = true}
19 | sha2 = {workspace=true}
20 | url = {workspace=true}
21 | opentelemetry = {workspace=true}
22 | 
23 | [dev-dependencies]
24 | tempfile = {workspace = true}
25 | 


--------------------------------------------------------------------------------
/server/build.rs:
--------------------------------------------------------------------------------
 1 | use std::{env, error::Error, path::PathBuf};
 2 | 
 3 | use vergen::{BuildBuilder, Emitter, SysinfoBuilder};
 4 | 
 5 | fn main() -> Result<(), Box<dyn Error>> {
 6 |     let build = BuildBuilder::all_build()?;
 7 |     let si = SysinfoBuilder::all_sysinfo()?;
 8 | 
 9 |     Emitter::default()
10 |         .add_instructions(&build)?
11 |         .add_instructions(&si)?
12 |         .emit()?;
13 | 
14 |     let proto_files = ["./proto/executor_api.proto"];
15 |     let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
16 | 
17 |     tonic_build::configure()
18 |         .build_client(false) // Don't build client code as it's not needed for now
19 |         .build_server(true)
20 |         .file_descriptor_set_path(out_dir.join("executor_api_descriptor.bin"))
21 |         .protoc_arg("--experimental_allow_proto3_optional") // Required for building on Ubuntu 22.04
22 |         .compile_protos(&proto_files, &["proto"])?;
23 |     Ok(())
24 | }
25 | 


--------------------------------------------------------------------------------
/server/data_model/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "data_model"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | serde = { workspace = true }
10 | derive_builder = "0.20.1"
11 | im = { workspace = true }
12 | anyhow = { workspace = true }
13 | serde_json = { workspace = true }
14 | indexify_utils = { workspace = true }
15 | rand = { workspace = true }
16 | uuid = { workspace = true }
17 | nanoid = { workspace = true }
18 | sha2 = { workspace = true }
19 | strum = { workspace = true }
20 | tracing = { workspace = true }
21 | 


--------------------------------------------------------------------------------
/server/dockerfiles/Dockerfile.builder_linux_aarch64:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | 
 3 | RUN apt-get update && \
 4 |     apt-get install --assume-yes --no-install-recommends \
 5 |     ca-certificates \
 6 |     build-essential pkg-config clang \
 7 |     g++-aarch64-linux-gnu libc6-dev-arm64-cross \
 8 |     protobuf-compiler \
 9 |     git curl
10 | 
11 | RUN curl -fsSL https://deb.nodesource.com/setup_22.x -o nodesource_setup.sh && \
12 |     bash -E nodesource_setup.sh && \
13 |     apt-get install --assume-yes --no-install-recommends nodejs
14 | 
15 | ENV CROSS_TOOLCHAIN_PREFIX=aarch64-linux-gnu-
16 | ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER="$CROSS_TOOLCHAIN_PREFIX"gcc \
17 |     AR_aarch64_unknown_linux_gnu="$CROSS_TOOLCHAIN_PREFIX"ar \
18 |     CC_aarch64_unknown_linux_gnu="$CROSS_TOOLCHAIN_PREFIX"gcc \
19 |     CXX_aarch64_unknown_linux_gnu="$CROSS_TOOLCHAIN_PREFIX"g++ \
20 |     PKG_CONFIG_PATH="/usr/lib/aarch64-linux-gnu/pkgconfig/:${PKG_CONFIG_PATH}"


--------------------------------------------------------------------------------
/server/dockerfiles/Dockerfile.builder_linux_x86:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | 
 3 | RUN apt-get update && \
 4 |     apt-get install --assume-yes --no-install-recommends \
 5 |     ca-certificates \
 6 |     build-essential pkg-config clang \
 7 |     g++ \
 8 |     protobuf-compiler \
 9 |     git curl
10 | 
11 | RUN curl -fsSL https://deb.nodesource.com/setup_22.x -o nodesource_setup.sh && \
12 |     bash -E nodesource_setup.sh && \
13 |     apt-get install --assume-yes --no-install-recommends nodejs


--------------------------------------------------------------------------------
/server/dockerfiles/Dockerfile.local:
--------------------------------------------------------------------------------
 1 | FROM lukemathwalker/cargo-chef:latest-rust-slim-bookworm AS chef
 2 | WORKDIR /app
 3 | RUN apt-get update && apt-get install -y \
 4 |     software-properties-common unzip \
 5 |     build-essential make cmake ca-certificates \
 6 |     curl pkg-config git \
 7 |     sqlite3 clang gcc g++ \
 8 |     protobuf-compiler
 9 | RUN RUN curl -sL https://deb.nodesource.com/setup_22.x | bash && \
10 |     apt-get install -y \
11 |     nodejs \
12 |     npm
13 | 
14 | FROM chef AS planner
15 | COPY . .
16 | RUN cargo chef prepare --recipe-path recipe.json
17 | 
18 | FROM chef AS rust-builder
19 | COPY --from=planner /app/recipe.json recipe.json
20 | COPY rust-toolchain.toml .
21 | RUN cargo chef cook --release --recipe-path recipe.json
22 | COPY . .
23 | RUN cargo build --release --bin indexify-server
24 | 
# Final runtime stage: only the compiled server binary and its runtime
# dependencies.
FROM ubuntu:22.04 AS runner

# Refresh the package index and install in the same layer so a cached index
# can never be combined with a newer install step; the index is removed
# afterwards to keep the layer small.
RUN apt-get update && apt-get install -y \
    curl \
    libssl-dev \
    python3-dev \
    ca-certificates && \
    rm -rf /var/lib/apt/lists/*

RUN update-ca-certificates

WORKDIR /indexify

COPY --from=rust-builder /app/target/release/indexify-server ./

ENV PATH="/indexify:${PATH}"

ENTRYPOINT [ "/indexify/indexify-server" ]
45 | 


--------------------------------------------------------------------------------
/server/dockerfiles/Dockerfile.release_server:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:22.04
 2 | 
 3 | RUN apt update
 4 | 
 5 | RUN apt install -y lsb-release ca-certificates apt-transport-https
 6 | 
 7 | RUN update-ca-certificates
 8 | 
 9 | RUN echo "deb [trusted=yes] https://cf-repo.diptanu-6d5.workers.dev/repo $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/indexify-repo.list && \
10 |     apt-get update -y && \
11 |     apt-get install -y indexify-server && \
12 |     apt-get -y clean
13 | 
14 | WORKDIR /indexify
15 | 
16 | ENTRYPOINT [ "indexify-server" ]
17 | 


--------------------------------------------------------------------------------
/server/indexify_ui/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "indexify_ui"
 3 | version = "0.2.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | rust-embed.workspace = true
 8 | 
 9 | [build-dependencies]
10 | anyhow.workspace = true


--------------------------------------------------------------------------------
/server/indexify_ui/build.rs:
--------------------------------------------------------------------------------
 1 | use std::process::Command;
 2 | 
 3 | use anyhow::{anyhow, Result};
 4 | 
 5 | /// Build script entry point: emits the cargo re-run directives for the UI
 6 | /// sources and then runs `npm ci && npm run build` inside `../ui`.
 7 | ///
 8 | /// Returns an error when the shell cannot be spawned or the npm commands exit
 9 | /// with a non-zero status.
10 | fn main() -> Result<()> {
11 |     // Directives telling cargo which inputs should trigger a re-run.
12 |     for directive in [
13 |         "cargo:rerun-if-changed=../ui/src",
14 |         "cargo:rerun-if-changed=../ui/tsconfig.json",
15 |         "cargo:rerun-if-changed=../ui/package.json",
16 |         "cargo:rerun-if-changed=../ui/package-lock.json",
17 |         "cargo:rerun-if-changed=../ui/public",
18 |     ] {
19 |         println!("{}", directive);
20 |     }
21 | 
22 |     let npm_status = Command::new("sh")
23 |         .args(["-c", "cd ../ui && npm ci && npm run build"])
24 |         .status()?;
25 | 
26 |     if npm_status.success() {
27 |         Ok(())
28 |     } else {
29 |         Err(anyhow!(
30 |             "Failed to execute npm commands in the 'ui' directory"
31 |         ))
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/server/indexify_ui/src/lib.rs:
--------------------------------------------------------------------------------
1 | use rust_embed::RustEmbed;
2 | 
3 | #[derive(RustEmbed)]
4 | #[folder = "../ui/build"]
5 | pub struct Assets;
6 | 


--------------------------------------------------------------------------------
/server/metrics/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "metrics"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | anyhow = {workspace=true}
 8 | opentelemetry = {workspace=true}
 9 | opentelemetry-prometheus = {workspace=true}
10 | opentelemetry_sdk = { workspace=true }
11 | prometheus = {workspace=true}
12 | pin-project-lite = {workspace=true}
13 | serde={workspace = true}
14 | once_cell = {workspace=true}
15 | data_model = {workspace=true}
16 | tracing = {workspace=true}
17 | 


--------------------------------------------------------------------------------
/server/processor/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "processor"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | anyhow.workspace = true
 8 | serde.workspace = true
 9 | rand.workspace = true
10 | serde_json.workspace = true
11 | data_model.workspace = true
12 | state_store.workspace = true
13 | tracing.workspace = true
14 | metrics.workspace = true
15 | tokio.workspace = true
16 | dashmap.workspace = true
17 | blob_store.workspace = true
18 | opentelemetry.workspace = true
19 | async-trait.workspace = true
20 | itertools.workspace = true
21 | indexify_utils.workspace = true
22 | im.workspace = true
23 | nanoid.workspace = true
24 | 


--------------------------------------------------------------------------------
/server/processor/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod gc;
2 | pub mod graph_processor;
3 | pub mod task_allocator;
4 | pub mod task_cache;
5 | pub mod task_creator;
6 | pub mod task_scheduler;
7 | 


--------------------------------------------------------------------------------
/server/processor/src/task_scheduler.rs:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/server/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel = "1.82.0"
3 | 


--------------------------------------------------------------------------------
/server/rustfmt.toml:
--------------------------------------------------------------------------------
 1 | binop_separator = "Back"
 2 | combine_control_expr = false
 3 | condense_wildcard_suffixes = true
 4 | format_code_in_doc_comments = false
 5 | group_imports = "StdExternalCrate"
 6 | imports_granularity = "Crate"
 7 | imports_layout = "HorizontalVertical"
 8 | max_width = 100
 9 | normalize_doc_attributes = true
10 | reorder_impl_items = true
11 | use_field_init_shorthand = true
12 | wrap_comments = true
13 | 


--------------------------------------------------------------------------------
/server/sample_config.yaml:
--------------------------------------------------------------------------------
1 | state_store_path: indexify_server_state 
2 | listen_addr: 0.0.0.0:8900
3 | blob_storage:
4 |   path: "s3://indexifyblobs"
5 |   dynamodb_table: kvs_cas_table 
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/server/src/routes/logs.rs:
--------------------------------------------------------------------------------
 1 | use anyhow::anyhow;
 2 | use axum::{
 3 |     body::Body,
 4 |     extract::{Path, State},
 5 |     http::Response,
 6 | };
 7 | 
 8 | use super::RouteState;
 9 | use crate::http_objects::IndexifyAPIError;
10 | 
11 | // Serves one diagnostic log file recorded for a task.
12 | //
13 | // The payload metadata is looked up through the state reader. When no payload
14 | // is recorded the handler answers with an empty body and `Content-Length: 0`;
15 | // otherwise the blob at `payload.path` is streamed with `payload.size` as the
16 | // content length. Lookup, blob-store and response-building failures are all
17 | // reported as internal errors.
18 | //
19 | // Plain `//` comments are used on purpose: utoipa copies `///` doc comments
20 | // into the generated OpenAPI description.
21 | #[utoipa::path(
22 |     get,
23 |     path = "/namespaces/{namespace}/compute_graphs/{compute_graph}/invocations/{invocation_id}/fn/{fn_name}/tasks/{task_id}/logs/{file}",
24 |     tag = "operations",
25 |     responses(
26 |         (status = 200, description = "Log file for a given task"),
27 |         (status = INTERNAL_SERVER_ERROR, description = "Internal Server Error")
28 |     ),
29 | )]
30 | pub async fn download_task_logs(
31 |     Path((namespace, compute_graph, invocation_id, fn_name, task_id, file)): Path<(
32 |         String,
33 |         String,
34 |         String,
35 |         String,
36 |         String,
37 |         String,
38 |     )>,
39 |     State(state): State<RouteState>,
40 | ) -> Result<Response<Body>, IndexifyAPIError> {
41 |     let payload = state
42 |         .indexify_state
43 |         .reader()
44 |         .get_diagnostic_payload(
45 |             &namespace,
46 |             &compute_graph,
47 |             &invocation_id,
48 |             &fn_name,
49 |             &task_id,
50 |             &file,
51 |         )
52 |         .map_err(|e| {
53 |             IndexifyAPIError::internal_error(anyhow!(
54 |                 "failed to download diagnostic payload: {}",
55 |                 e
56 |             ))
57 |         })?;
58 | 
59 |     // Both outcomes share the content type; only the length and body differ.
60 |     let builder = Response::builder().header("Content-Type", "application/octet-stream");
61 |     let response = match payload {
62 |         // Nothing recorded for this task/file: reply with an empty body.
63 |         None => builder.header("Content-Length", 0).body(Body::empty()),
64 |         Some(payload) => {
65 |             let storage_reader = state
66 |                 .blob_storage
67 |                 .get(&payload.path)
68 |                 .await
69 |                 .map_err(IndexifyAPIError::internal_error)?;
70 |             builder
71 |                 .header("Content-Length", payload.size.to_string())
72 |                 .body(Body::from_stream(storage_reader))
73 |         }
74 |     };
75 | 
76 |     response.map_err(|e| IndexifyAPIError::internal_error_str(&e.to_string()))
77 | }
78 | 


--------------------------------------------------------------------------------
/server/state_store/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "state_store"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | rocksdb = { workspace = true }
10 | slatedb = { workspace = true }
11 | bytes = { workspace = true }
12 | data_model = { workspace = true }
13 | indexify_utils = { workspace = true }
14 | anyhow = { workspace = true }
15 | serde = { workspace = true }
16 | serde_json = { workspace = true }
17 | strum = { workspace = true }
18 | tracing = { workspace = true }
19 | tokio = { workspace = true }
20 | tokio-stream = { workspace = true }
21 | futures.workspace = true
22 | async-stream = "0.3.5"
23 | tempfile = { workspace = true }
24 | object_store.workspace = true
25 | blob_store = { version = "0.1.0", path = "../blob_store" }
26 | url = { workspace = true }
27 | metrics = { workspace = true }
28 | opentelemetry = { workspace = true }
29 | im = { workspace = true }
30 | 


--------------------------------------------------------------------------------
/server/state_store/src/migrations/migration_trait.rs:
--------------------------------------------------------------------------------
 1 | use anyhow::Result;
 2 | use rocksdb::TransactionDB;
 3 | 
 4 | use super::contexts::{MigrationContext, PrepareContext};
 5 | 
 6 | /// Trait defining a single database migration step
 7 | pub trait Migration {
 8 |     /// The version this migration upgrades TO
 9 |     fn version(&self) -> u64;
10 | 
11 |     /// Name for logging purposes
12 |     fn name(&self) -> &'static str;
13 | 
14 |     /// DB preparation - column family operations before transaction
15 |     /// Default implementation simply opens the DB via `ctx.open_db()`
16 |     fn prepare(&self, ctx: &PrepareContext) -> Result<TransactionDB> {
17 |         ctx.open_db()
18 |     }
19 | 
20 |     /// Apply migration using provided context; an `Err` reports failure
21 |     fn apply(&self, ctx: &MigrationContext) -> Result<()>;
22 | 
23 |     fn box_clone(&self) -> Box<dyn Migration>; // NOTE(review): presumably clones into a Box
24 | }
25 | 


--------------------------------------------------------------------------------
/server/state_store/src/migrations/mod.rs:
--------------------------------------------------------------------------------
 1 | pub mod contexts;
 2 | pub mod migration_trait;
 3 | pub mod registry;
 4 | #[cfg(test)]
 5 | mod testing;
 6 | 
 7 | // migrations
 8 | mod v1_task_status;
 9 | mod v2_invocation_ctx_timestamps;
10 | mod v3_invocation_ctx_secondary_index;
11 | mod v4_drop_executors;
12 | mod v5_allocation_keys;
13 | mod v6_clean_orphaned_tasks;
14 | mod v7_reset_allocated_tasks;
15 | mod v8_rebuild_invocation_ctx_secondary_index;
16 | // Add new migrations mod here
17 | 


--------------------------------------------------------------------------------
/server/state_store/src/serializer.rs:
--------------------------------------------------------------------------------
 1 | use std::{any::type_name, fmt::Debug};
 2 | 
 3 | use anyhow::Result;
 4 | use serde::de::DeserializeOwned;
 5 | 
 6 | /// Unit type that implements [`JsonEncode`] on top of `serde_json`.
 7 | pub struct JsonEncoder;
 8 | 
 9 | /// JSON (de)serialization between values and raw bytes.
10 | pub trait JsonEncode {
11 |     /// Serializes `value` into JSON bytes.
12 |     fn encode<T: serde::Serialize + Debug>(value: &T) -> Result<Vec<u8>>;
13 |     /// Deserializes a `T` from JSON bytes.
14 |     fn decode<T: DeserializeOwned>(bytes: &[u8]) -> Result<T>;
15 | }
16 | 
17 | impl JsonEncode for JsonEncoder {
18 |     /// Serializes `value` with `serde_json::to_vec`.
19 |     ///
20 |     /// On failure the error names the target type and includes the `Debug`
21 |     /// rendering of the value that could not be serialized.
22 |     fn encode<T: serde::Serialize + Debug>(value: &T) -> Result<Vec<u8>> {
23 |         serde_json::to_vec(value).map_err(|e| {
24 |             anyhow::anyhow!(
25 |                 "error serializing into json: {}, type: {}, value: {:?}",
26 |                 e,
27 |                 type_name::<T>(),
28 |                 value
29 |             )
30 |         })
31 |     }
32 | 
33 |     /// Deserializes a `T` with `serde_json::from_slice`.
34 |     ///
35 |     /// On failure the error names the type that was being decoded. The raw
36 |     /// bytes are not included in the message.
37 |     fn decode<T: DeserializeOwned>(bytes: &[u8]) -> Result<T> {
38 |         serde_json::from_slice(bytes).map_err(|e| {
39 |             // The placeholder carries the type name, so label it `type` (as
40 |             // `encode` does) instead of the misleading `value`.
41 |             anyhow::anyhow!(
42 |                 "error deserializing from json bytes, {}, type: {}",
43 |                 e,
44 |                 type_name::<T>()
45 |             )
46 |         })
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------
/server/ui/.dockerignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .DS_Store
3 | .git
4 | .gitignore
5 | .dockerignore
6 | .vscode


--------------------------------------------------------------------------------
/server/ui/.gitignore:
--------------------------------------------------------------------------------
 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
 2 | 
 3 | # dependencies
 4 | /node_modules
 5 | /.pnp
 6 | .pnp.js
 7 | 
 8 | # testing
 9 | /coverage
10 | 
11 | # production
12 | /build
13 | 
14 | # misc
15 | .DS_Store
16 | .env.local
17 | .env.development.local
18 | .env.test.local
19 | .env.production.local
20 | 
21 | npm-debug.log*
22 | yarn-debug.log*
23 | yarn-error.log*
24 | 


--------------------------------------------------------------------------------
/server/ui/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "indexify-ui-v2",
 3 |   "version": "0.2.0",
 4 |   "private": true,
 5 |   "homepage": "/ui/",
 6 |   "author": {
 7 |     "name": "Adithya Krishna",
 8 |     "email": "aadithya794@gmail.com"
 9 |   },
10 |   "license": "See License in <https://github.com/tensorlakeai/indexify/blob/main/LICENSE>",
11 |   "dependencies": {
12 |     "@emotion/react": "^11.11.3",
13 |     "@emotion/styled": "^11.11.0",
14 |     "@microlink/react-json-view": "^1.23.0",
15 |     "@mui/icons-material": "^5.15.7",
16 |     "@mui/material": "^5.15.7",
17 |     "@mui/x-data-grid": "^6.19.3",
18 |     "@testing-library/jest-dom": "^5.17.0",
19 |     "@testing-library/react": "^13.4.0",
20 |     "@testing-library/user-event": "^13.5.0",
21 |     "@types/jest": "^27.5.2",
22 |     "@types/node": "^16.18.78",
23 |     "@types/react": "^18.2.48",
24 |     "@types/react-dom": "^18.2.18",
25 |     "@types/react-router-dom": "^5.3.3",
26 |     "axios": "^1.8.4",
27 |     "crypto": "^1.0.1",
28 |     "date-fns": "^4.1.0",
29 |     "getindexify": "^0.1.5",
30 |     "iconsax-react": "^0.0.8",
31 |     "moment": "^2.30.1",
32 |     "react": "^18.2.0",
33 |     "react-dom": "^18.2.0",
34 |     "react-pdf": "^9.2.1",
35 |     "react-router-dom": "^6.26.2",
36 |     "react-toastify": "^10.0.5",
37 |     "react-virtuoso": "^4.10.4",
38 |     "serve": "^14.2.3",
39 |     "typescript": "^4.9.5",
40 |     "web-vitals": "^2.1.4"
41 |   },
42 |   "scripts": {
43 |     "start": "serve build --single",
44 |     "dev": "react-scripts start",
45 |     "build": "cross-env NODE_ENV=production react-scripts build",
46 |     "test": "react-scripts test",
47 |     "eject": "react-scripts eject",
48 |     "lint": "npx eslint --max-warnings 0 ./src"
49 |   },
50 |   "eslintConfig": {
51 |     "extends": [
52 |       "react-app",
53 |       "react-app/jest"
54 |     ]
55 |   },
56 |   "browserslist": {
57 |     "production": [
58 |       ">0.2%",
59 |       "not dead",
60 |       "not op_mini all"
61 |     ],
62 |     "development": [
63 |       "last 1 chrome version",
64 |       "last 1 firefox version",
65 |       "last 1 safari version"
66 |     ]
67 |   },
68 |   "devDependencies": {
69 |     "@babel/plugin-proposal-private-property-in-object": "^7.21.11",
70 |     "cross-env": "^7.0.3",
71 |     "eslint": "^8.56.0",
72 |     "react-scripts": "5.0.1"
73 |   }
74 | }
75 | 


--------------------------------------------------------------------------------
/server/ui/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlakeai/indexify/5d910ef31a95f3812f551e0428d2981579150f84/server/ui/public/favicon.ico


--------------------------------------------------------------------------------
/server/ui/public/logo.svg:
--------------------------------------------------------------------------------
 1 | <svg width="37" height="36" viewBox="0 0 37 36" fill="none" xmlns="http://www.w3.org/2000/svg">
 2 | <g id="Tensorlake Logo" clip-path="url(#clip0_224_1639)">
 3 | <path id="Vector" d="M36.3199 20.0618C36.069 22.4836 35.349 24.7636 34.2362 26.8036C33.1235 27.2618 31.7817 27.5781 30.0362 27.5781C26.3708 27.5781 24.418 26.16 22.8471 25.0254C21.5053 24.0545 20.5344 23.3563 18.3744 23.3563C16.2144 23.3563 15.2435 24.0545 13.9017 25.0254C12.3308 26.16 10.378 27.5781 6.71259 27.5781C4.97804 27.5781 3.62532 27.2618 2.51259 26.8036C1.39986 24.7636 0.679864 22.4836 0.428955 20.0618C1.11623 20.4436 1.69441 20.8691 2.23986 21.2618C3.58168 22.2327 4.54168 22.9309 6.71259 22.9309C8.87259 22.9309 9.8435 22.2327 11.1853 21.2618C12.7562 20.1272 14.709 18.72 18.3744 18.72C22.0399 18.72 23.9926 20.1272 25.5635 21.2618C26.9053 22.2327 27.8762 22.9309 30.0362 22.9309C32.1962 22.9309 33.1671 22.2327 34.509 21.2618C35.0435 20.8691 35.6326 20.4545 36.3199 20.0618Z" fill="#4AA4F4"/>
 4 | <path id="Vector_2" d="M29.8508 32.1382C28.2253 33.48 26.3708 34.5381 24.3417 35.2472C23.7962 34.92 23.3053 34.56 22.858 34.2327C21.5162 33.2618 20.5453 32.5636 18.3853 32.5636C16.2253 32.5636 15.2544 33.2618 13.9126 34.2327C13.4544 34.56 12.9744 34.9091 12.418 35.2472C10.3889 34.5381 8.53439 33.48 6.90894 32.1382C8.93803 32.1054 9.88712 31.4181 11.1853 30.469C12.7562 29.3345 14.7089 27.9272 18.3744 27.9272C22.0398 27.9272 23.9926 29.3345 25.5635 30.469C26.8617 31.4181 27.8108 32.0945 29.8508 32.1382Z" fill="#4AA4F4"/>
 5 | <path id="Top" d="M18.3746 0.185547C27.8294 0.185547 35.5836 7.46021 36.3458 16.7183H30.6226C29.8833 10.6085 24.685 5.88009 18.3746 5.88009C12.0642 5.88009 6.86579 10.6085 6.12649 16.7183H0.40332C1.16552 7.46021 8.91969 0.185547 18.3746 0.185547Z" fill="#060D3F"/>
 6 | </g>
 7 | <defs>
 8 | <clipPath id="clip0_224_1639">
 9 | <rect width="36" height="36" fill="white" transform="translate(0.341797)"/>
10 | </clipPath>
11 | </defs>
12 | </svg>
13 | 


--------------------------------------------------------------------------------
/server/ui/public/manifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "short_name": "Indexify UI",
 3 |   "name": "Indexify UI",
 4 |   "icons": [
 5 |     {
 6 |       "src": "favicon.ico",
 7 |       "sizes": "64x64 32x32 24x24 16x16",
 8 |       "type": "image/x-icon"
 9 |     }
10 |   ],
11 |   "start_url": ".",
12 |   "display": "standalone",
13 |   "theme_color": "#000000",
14 |   "background_color": "#ffffff"
15 | }
16 | 


--------------------------------------------------------------------------------
/server/ui/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 | 


--------------------------------------------------------------------------------
/server/ui/src/App.css:
--------------------------------------------------------------------------------
 1 | .App {
 2 |   text-align: center;
 3 | }
 4 | 
 5 | .App-logo {
 6 |   height: 40vmin;
 7 |   pointer-events: none;
 8 | }
 9 | 
10 | @media (prefers-reduced-motion: no-preference) {
11 |   .App-logo {
12 |     animation: App-logo-spin infinite 20s linear;
13 |   }
14 | }
15 | 
16 | .App-header {
17 |   background-color: #282c34;
18 |   min-height: 100vh;
19 |   display: flex;
20 |   flex-direction: column;
21 |   align-items: center;
22 |   justify-content: center;
23 |   font-size: calc(10px + 2vmin);
24 |   color: white;
25 | }
26 | 
27 | .App-link {
28 |   color: #61dafb;
29 | }
30 | 
31 | @keyframes App-logo-spin {
32 |   from {
33 |     transform: rotate(0deg);
34 |   }
35 |   to {
36 |     transform: rotate(360deg);
37 |   }
38 | }
39 | 


--------------------------------------------------------------------------------
/server/ui/src/components/CopyText.tsx:
--------------------------------------------------------------------------------
 1 | import { Box, IconButton, Tooltip } from "@mui/material";
 2 | import ContentCopy from "@mui/icons-material/ContentCopy";
 3 | import { useState } from "react";
 4 | 
 5 | interface CopyTextProps {
 6 |   text: string;
 7 |   color?: string;
 8 |   className?: string;
 9 |   tooltipTitle?: string;
10 |   copiedTooltipTitle?: string;
11 | }
12 | 
13 | /**
14 |  * Icon button that copies `text` to the clipboard. After a successful copy the
15 |  * tooltip shows `copiedTooltipTitle` for two seconds, then returns to
16 |  * `tooltipTitle`. Copy failures are logged to the console.
17 |  */
18 | export function CopyText({
19 |   text,
20 |   className,
21 |   tooltipTitle = "Copy to clipboard",
22 |   copiedTooltipTitle = "Copied!"
23 | }: CopyTextProps) {
24 |   const [isCopied, setIsCopied] = useState(false);
25 |   const activeTitle = isCopied ? copiedTooltipTitle : tooltipTitle;
26 | 
27 |   async function handleCopy() {
28 |     try {
29 |       await navigator.clipboard.writeText(text);
30 |       setIsCopied(true);
31 |       // Revert the tooltip once the confirmation has been visible for 2s.
32 |       setTimeout(() => {
33 |         setIsCopied(false);
34 |       }, 2000);
35 |     } catch (error) {
36 |       console.error('Failed to copy text:', error);
37 |     }
38 |   }
39 | 
40 |   return (
41 |     <Box className={className}>
42 |       <Tooltip title={activeTitle}>
43 |         <IconButton size="small" onClick={handleCopy}>
44 |           <ContentCopy sx={{ height: 20 }} />
45 |         </IconButton>
46 |       </Tooltip>
47 |     </Box>
48 |   );
49 | }
50 | 
51 | export default CopyText;
52 | 


--------------------------------------------------------------------------------
/server/ui/src/components/CopyTextPopover.tsx:
--------------------------------------------------------------------------------
 1 | import { Popover, Typography } from '@mui/material'
 2 | import { ContentCopy } from '@mui/icons-material'
 3 | import { useState, type ReactNode } from 'react'
 4 | 
 5 | interface CopyTextPopoverProps {
 6 |   text: string
 7 |   children: ReactNode
 8 | }
 9 | 
10 | /**
11 |  * Wraps `children` in a hover target that opens a small popover below it.
12 |  * Clicking the popover copies `text` to the clipboard, shows "Copied!" and
13 |  * closes the popover one second later. Copy failures are logged to the console.
14 |  */
15 | function CopyTextPopover({ text, children }: CopyTextPopoverProps) {
16 |   const [anchorEl, setAnchorEl] = useState<HTMLElement | null>(null)
17 |   const [isCopied, setIsCopied] = useState(false)
18 | 
19 |   function handlePopoverClose() {
20 |     setAnchorEl(null)
21 |   }
22 | 
23 |   function handlePopoverOpen(event: React.MouseEvent<HTMLElement>) {
24 |     setAnchorEl(event.currentTarget)
25 |   }
26 | 
27 |   async function handleCopy() {
28 |     try {
29 |       await navigator.clipboard.writeText(text)
30 |       setIsCopied(true)
31 |       // Keep the confirmation visible for a second, then reset and close.
32 |       setTimeout(() => {
33 |         setIsCopied(false)
34 |         handlePopoverClose()
35 |       }, 1000)
36 |     } catch (err) {
37 |       console.error('Failed to copy text:', err)
38 |     }
39 |   }
40 | 
41 |   // The popover is shown for as long as an anchor element is set.
42 |   const open = anchorEl !== null
43 |   const label = isCopied ? 'Copied!' : 'Click to copy'
44 | 
45 |   return (
46 |     <div
47 |       style={{ display: 'inline-block' }}
48 |       onMouseEnter={handlePopoverOpen}
49 |       onMouseLeave={handlePopoverClose}
50 |     >
51 |       {children}
52 |       <Popover
53 |         open={open}
54 |         anchorEl={anchorEl}
55 |         onClose={handlePopoverClose}
56 |         anchorOrigin={{ vertical: 'bottom', horizontal: 'center' }}
57 |         transformOrigin={{ vertical: 'top', horizontal: 'center' }}
58 |         sx={{
59 |           // Only the paper itself reacts to the pointer, not the popover root.
60 |           pointerEvents: 'none',
61 |           '& .MuiPopover-paper': {
62 |             pointerEvents: 'auto'
63 |           }
64 |         }}
65 |       >
66 |         <Typography
67 |           onClick={handleCopy}
68 |           sx={{
69 |             p: 1,
70 |             display: 'flex',
71 |             alignItems: 'center',
72 |             gap: 1,
73 |             cursor: 'pointer',
74 |             '&:hover': {
75 |               bgcolor: 'action.hover'
76 |             },
77 |             fontSize: 12
78 |           }}
79 |         >
80 |           <ContentCopy sx={{ fontSize: 12 }} />
81 |           {label}
82 |         </Typography>
83 |       </Popover>
84 |     </div>
85 |   )
86 | }
87 | 
88 | export default CopyTextPopover


--------------------------------------------------------------------------------
/server/ui/src/components/Footer.tsx:
--------------------------------------------------------------------------------
 1 | import { Box, Typography } from "@mui/material";
 2 | import Link from "@mui/material/Link";
 3 | 
 4 | /**
 5 |  * Page footer showing the copyright notice, a link to tensorlake.ai and the
 6 |  * current calendar year.
 7 |  */
 8 | export function Footer() {
 9 |   const currentYear = new Date().getFullYear();
10 | 
11 |   return (
12 |     <Box py={2} textAlign="center">
13 |       <Typography variant="caption" color="CaptionText" align="center">
14 |         {/* String expression on purpose: JSX trims trailing spaces before a
15 |             line break, which would glue the © sign to the link text. */}
16 |         {"Copyright © "}
17 |         <Link color="inherit" href="https://tensorlake.ai/">
18 |           Tensorlake
19 |         </Link>
20 |         {` ${currentYear}.`}
21 |       </Typography>
22 |     </Box>
23 |   );
24 | }
25 | 
26 | export default Footer;
27 | 


--------------------------------------------------------------------------------
/server/ui/src/components/InfoBox.tsx:
--------------------------------------------------------------------------------
 1 | import { Box, Typography } from '@mui/material';
 2 | import AccessTimeIcon from '@mui/icons-material/AccessTime';
 3 | 
 4 | interface InfoBoxProps {
 5 |     text?: string;
 6 | }
 7 | 
 8 | /**
 9 |  * Dashed, rounded box that shows a clock icon next to the optional `text`.
10 |  */
11 | export function InfoBox(props: InfoBoxProps) {
12 |   // Styling for the outer container.
13 |   const containerSx = {
14 |     border: '1px dashed #B0D1F7',
15 |     borderRadius: '8px',
16 |     padding: '26px',
17 |     display: 'flex',
18 |     alignItems: 'center',
19 |     justifyContent: 'center',
20 |     maxWidth: '500px',
21 |     margin: '10px',
22 |     boxShadow: '0px 1px 2px 0px #00000040 inset',
23 |   } as const;
24 | 
25 |   // Styling for the leading clock icon.
26 |   const iconSx = {
27 |     color: '#2196f3',
28 |     fontSize: '24px',
29 |     marginRight: '16px',
30 |     marginTop: '4px',
31 |   } as const;
32 | 
33 |   return (
34 |     <Box sx={containerSx}>
35 |       <AccessTimeIcon sx={iconSx} />
36 |       <Typography variant="subtitle2" color="text.secondary">
37 |         {props.text}
38 |       </Typography>
39 |     </Box>
40 |   );
41 | }
42 | 
43 | export default InfoBox;
44 | 


--------------------------------------------------------------------------------
/server/ui/src/components/TruncatedDescription.tsx:
--------------------------------------------------------------------------------
 1 | import { useEffect, useRef, useState } from 'react';
 2 | import { Typography, Tooltip } from '@mui/material';
 3 | 
 4 | /**
 5 |  * Renders `description` clamped to two lines. A tooltip with the full text is
 6 |  * shown only while the paragraph's content is taller than its visible box;
 7 |  * that check is re-run on every window resize and whenever `description`
 8 |  * changes.
 9 |  */
10 | export const TruncatedDescription = ({ description }: { description: string }) => {
11 |   const [isOverflowing, setIsOverflowing] = useState(false);
12 |   const textRef = useRef<HTMLParagraphElement>(null);
13 | 
14 |   useEffect(() => {
15 |     function measureOverflow() {
16 |       const node = textRef.current;
17 |       if (!node) {
18 |         return;
19 |       }
20 |       setIsOverflowing(node.scrollHeight > node.clientHeight);
21 |     }
22 | 
23 |     measureOverflow();
24 |     window.addEventListener('resize', measureOverflow);
25 |     return () => {
26 |       window.removeEventListener('resize', measureOverflow);
27 |     };
28 |   }, [description]);
29 | 
30 |   // An empty title keeps the tooltip from showing.
31 |   const tooltipTitle = isOverflowing ? description : '';
32 | 
33 |   return (
34 |     <Tooltip title={tooltipTitle} arrow>
35 |       <Typography
36 |         ref={textRef}
37 |         variant="subtitle2"
38 |         paragraph
39 |         sx={{
40 |           display: '-webkit-box',
41 |           WebkitLineClamp: 2,
42 |           WebkitBoxOrient: 'vertical',
43 |           overflow: 'hidden',
44 |           textOverflow: 'ellipsis',
45 |           marginBottom: 0,
46 |           marginLeft: { xs: 0, lg: 1 },
47 |         }}
48 |       >
49 |         {description}
50 |       </Typography>
51 |     </Tooltip>
52 |   );
53 | };
54 | 


--------------------------------------------------------------------------------
/server/ui/src/components/TruncatedText.tsx:
--------------------------------------------------------------------------------
 1 | import { Tooltip, Typography } from '@mui/material';
 2 | 
 3 | interface TruncatedTextProps {
 4 |   text: string;
 5 |   maxLength?: number;
 6 | }
 7 | 
 8 | /**
 9 |  * Heading that shows at most `maxLength` (default 25) characters of `text`,
10 |  * followed by "..." when it was cut. The full text is always available in the
11 |  * tooltip.
12 |  */
13 | export function TruncatedText({ text, maxLength = 25 }: TruncatedTextProps) {
14 |   let truncatedText = text;
15 |   if (text.length > maxLength) {
16 |     truncatedText = text.slice(0, maxLength) + '...';
17 |   }
18 | 
19 |   return (
20 |     <Tooltip title={text}>
21 |       <Typography className="cursor-default" component="div" variant="h6">
22 |         {truncatedText}
23 |       </Typography>
24 |     </Tooltip>
25 |   );
26 | }
27 | 
28 | export default TruncatedText;
29 | 


--------------------------------------------------------------------------------
/server/ui/src/error-page.tsx:
--------------------------------------------------------------------------------
 1 | import { isRouteErrorResponse, useRouteError } from "react-router-dom";
 2 | 
 3 | /**
 4 |  * Route error element. Route error responses show their status, status text
 5 |  * and optional `data.message`; `Error` instances show their message; any other
 6 |  * value renders an empty fragment.
 7 |  */
 8 | export function ErrorPage() {
 9 |   const error = useRouteError();
10 | 
11 |   if (isRouteErrorResponse(error)) {
12 |     const detail = error.data?.message;
13 |     return (
14 |       <div id="error-page">
15 |         <h1>Oops! {error.status}</h1>
16 |         <p>{error.statusText}</p>
17 |         {detail && (
18 |           <p>
19 |             <i>{detail}</i>
20 |           </p>
21 |         )}
22 |       </div>
23 |     );
24 |   }
25 | 
26 |   if (error instanceof Error) {
27 |     return (
28 |       <div id="error-page">
29 |         <h1>Oops! Unexpected Error</h1>
30 |         <p>Something went wrong.</p>
31 |         <p>
32 |           <i>{error.message}</i>
33 |         </p>
34 |       </div>
35 |     );
36 |   }
37 | 
38 |   return <></>;
39 | }
40 | 


--------------------------------------------------------------------------------
/server/ui/src/react-app-env.d.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="react-scripts" />
2 | 


--------------------------------------------------------------------------------
/server/ui/src/routes/Namespace/ComputeGraphsPage.tsx:
--------------------------------------------------------------------------------
 1 | import { Box, Alert } from "@mui/material";
 2 | import { useLoaderData } from "react-router-dom";
 3 | import { ComputeGraphsCard } from "../../components/cards/ComputeGraphsCard";
 4 | import type { ComputeGraphLoaderData } from "./types";
 5 | 
 6 | /**
 7 |  * Route page listing the compute graphs of a namespace. Shows an error alert
 8 |  * when the loader data lacks the client, the compute graphs or the namespace.
 9 |  */
10 | const ComputeGraphsPage = () => {
11 |   const loaderData = useLoaderData() as ComputeGraphLoaderData;
12 |   const { client, computeGraphs, namespace } = loaderData;
13 | 
14 |   if (client && computeGraphs && namespace) {
15 |     return (
16 |       <Box>
17 |         <ComputeGraphsCard
18 |           client={client}
19 |           namespace={namespace}
20 |           computeGraphs={computeGraphs}
21 |         />
22 |       </Box>
23 |     );
24 |   }
25 | 
26 |   // At least one piece of loader data is missing.
27 |   return (
28 |     <Box>
29 |       <Alert severity="error">
30 |         Failed to load compute graphs data. Please try again.
31 |       </Alert>
32 |     </Box>
33 |   );
34 | };
35 | 
36 | export default ComputeGraphsPage;
37 | 


--------------------------------------------------------------------------------
/server/ui/src/routes/Namespace/ExecutorsPage.tsx:
--------------------------------------------------------------------------------
 1 | import { Box } from '@mui/material';
 2 | import { useLoaderData } from 'react-router-dom';
 3 | import { ExecutorsCard } from '../../components/cards/ExecutorsCard';
 4 | import type { ExecutorsLoaderData } from './types';
 5 | 
 6 | /**
 7 |  * Route page that passes the `executors` from the route loader data to
 8 |  * `ExecutorsCard`.
 9 |  */
10 | const ExecutorsPage = () => {
11 |   const loaderData = useLoaderData() as ExecutorsLoaderData;
12 | 
13 |   return (
14 |     <Box>
15 |       <ExecutorsCard executors={loaderData.executors} />
16 |     </Box>
17 |   );
18 | };
19 | 
20 | export default ExecutorsPage;
21 | 


--------------------------------------------------------------------------------
/server/ui/src/routes/Namespace/IndividualInvocationPage.tsx:
--------------------------------------------------------------------------------
 1 | import {
 2 |   Box,
 3 |   Breadcrumbs,
 4 |   Typography,
 5 |   Stack,
 6 | } from '@mui/material';
 7 | import { TableDocument } from 'iconsax-react';
 8 | import NavigateNextIcon from '@mui/icons-material/NavigateNext'
 9 | import { Link, useLoaderData } from 'react-router-dom';
10 | import CopyText from '../../components/CopyText';
11 | import InvocationOutputTable from '../../components/tables/InvocationOutputTable';
12 | import InvocationTasksTable from '../../components/tables/InvocationTasksTable';
13 | import CopyTextPopover from '../../components/CopyTextPopover';
14 | 
15 | /**
16 |  * Detail page for one invocation of a compute graph.
17 |  *
18 |  * Takes the service URL, namespace, compute graph name and invocation id from
19 |  * the route loader, renders breadcrumbs linking back to the namespace's compute
20 |  * graphs, and shows the invocation's output table followed by its tasks table.
21 |  */
22 | const IndividualInvocationPage = () => {
23 |   // Cast to the shared loader type from ./types instead of an inline duplicate.
24 |   const { indexifyServiceURL, invocationId, computeGraph, namespace } =
25 |     useLoaderData() as IndividualInvocationLoaderData;
26 |   // Both tables below are keyed by the same four values.
27 |   const tableProps = { indexifyServiceURL, invocationId, namespace, computeGraph };
28 | 
29 |   return (
30 |     <Stack direction="column" spacing={3}>
31 |       <Breadcrumbs
32 |         aria-label="breadcrumb"
33 |         separator={<NavigateNextIcon fontSize="small" />}
34 |       >
35 |         <CopyTextPopover text={namespace}>
36 |           <Typography color="text.primary">{namespace}</Typography>
37 |         </CopyTextPopover>
38 |         <Link color="inherit" to={`/${namespace}/compute-graphs`}>
39 |           <CopyTextPopover text="Compute Graphs">
40 |             <Typography color="text.primary">Compute Graphs</Typography>
41 |           </CopyTextPopover>
42 |         </Link>
43 |         <Link color="inherit" to={`/${namespace}/compute-graphs/${computeGraph}`}>
44 |           <CopyTextPopover text={computeGraph}>
45 |             <Typography color="text.primary">{computeGraph}</Typography>
46 |           </CopyTextPopover>
47 |         </Link>
48 |         <CopyTextPopover text={invocationId}>
49 |           <Typography color="text.primary">{invocationId}</Typography>
50 |         </CopyTextPopover>
51 |       </Breadcrumbs>
52 |       <Box sx={{ p: 0 }}>
53 |         <Box sx={{ mb: 3 }}>
54 |           <div className="content-table-header">
55 |             <div className="heading-icon-container">
56 |               <TableDocument size="25" className="heading-icons" variant="Outline"/>
57 |             </div>
58 |             <Typography variant="h4" display={'flex'} flexDirection={'row'}>
59 |               Invocation - {invocationId} <CopyText text={invocationId} />
60 |             </Typography>
61 |           </div>
62 |           <InvocationOutputTable {...tableProps} />
63 |         </Box>
64 |         <InvocationTasksTable {...tableProps} />
65 |       </Box>
66 |     </Stack>
67 |   );
68 | };
69 | 
70 | export default IndividualInvocationPage;
71 | 


--------------------------------------------------------------------------------
/server/ui/src/routes/Namespace/NamespacesPage.tsx:
--------------------------------------------------------------------------------
 1 | import { Box } from '@mui/material';
 2 | import { useLoaderData } from 'react-router-dom';
 3 | import NamespacesCard from '../../components/cards/NamespacesCard';
 4 | import type { NamespacesLoaderData } from './types';
 5 | 
 6 | /** Route page that hands the namespaces returned by the route loader to NamespacesCard. */
 7 | function NamespacesPage() {
 8 |   const loaderData = useLoaderData() as NamespacesLoaderData;
 9 |   const namespaceList = loaderData.namespaces;
10 |   return (
11 |     <Box><NamespacesCard namespaces={namespaceList} /></Box>
12 |   );
13 | }
14 | 
15 | export default NamespacesPage;
16 | 


--------------------------------------------------------------------------------
/server/ui/src/routes/Namespace/index.ts:
--------------------------------------------------------------------------------
1 | // Barrel module: re-exports each Namespace route page's default export by name.
2 | export { default as NamespacesPage } from './NamespacesPage';
3 | export { default as ComputeGraphsPage } from './ComputeGraphsPage';
4 | export { default as IndividualComputeGraphPage } from './IndividualComputeGraphPage';
5 | export { default as IndividualInvocationPage } from './IndividualInvocationPage';
6 | export { default as ExecutorsPage } from './ExecutorsPage';


--------------------------------------------------------------------------------
/server/ui/src/routes/Namespace/types.ts:
--------------------------------------------------------------------------------
 1 | import { IndexifyClient } from 'getindexify'
 2 | import {
 3 |   ComputeGraph,
 4 |   Namespace,
 5 |   ExecutorMetadata,
 6 |   ComputeGraphsList,
 7 |   Invocation,
 8 | } from '../../types'
 9 | /** Base route-loader payload: a namespace name plus an optional IndexifyClient. */
10 | export interface NamespaceLoaderData {
11 |   client?: IndexifyClient;
12 |   namespace: string;
13 | }
14 | /** Namespace loader payload extended with a ComputeGraphsList. */
15 | export interface ComputeGraphLoaderData extends NamespaceLoaderData {
16 |   computeGraphs: ComputeGraphsList;
17 | }
18 | /** Namespace loader payload for one compute graph: the graph, its invocations, nullable prev/next cursors (presumably for paging; confirm against the loader) and an optional direction. */
19 | export interface IndividualComputeGraphLoaderData extends NamespaceLoaderData {
20 |   computeGraph: ComputeGraph;
21 |   invocationsList: Invocation[];
22 |   prevCursor: string | null;
23 |   nextCursor: string | null;
24 |   direction?: string;
25 | }
26 | /** Namespace loader payload for one invocation: service URL, compute graph name and invocation id. */
27 | export interface IndividualInvocationLoaderData extends NamespaceLoaderData {
28 |   computeGraph: string;
29 |   invocationId: string;
30 |   indexifyServiceURL: string;
31 | }
32 | /** Loader payload holding an array of ExecutorMetadata. */
33 | export interface ExecutorsLoaderData {
34 |   executors: Array<ExecutorMetadata>;
35 | }
36 | /** Loader payload holding an array of Namespace entries. */
37 | export interface NamespacesLoaderData {
38 |   namespaces: Array<Namespace>;
39 | }
40 | 


--------------------------------------------------------------------------------
/server/ui/src/setupTests.ts:
--------------------------------------------------------------------------------
1 | // jest-dom adds custom jest matchers for asserting on DOM nodes.
2 | // allows you to do things like:
3 | // expect(element).toHaveTextContent(/react/i)
4 | // learn more: https://github.com/testing-library/jest-dom
5 | import '@testing-library/jest-dom';
6 | 


--------------------------------------------------------------------------------
/server/ui/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "es5",
 4 |     "lib": [
 5 |       "dom",
 6 |       "dom.iterable",
 7 |       "esnext"
 8 |     ],
 9 |     "allowJs": true,
10 |     "skipLibCheck": true,
11 |     "esModuleInterop": true,
12 |     "allowSyntheticDefaultImports": true,
13 |     "strict": true,
14 |     "forceConsistentCasingInFileNames": true,
15 |     "noFallthroughCasesInSwitch": true,
16 |     "module": "esnext",
17 |     "moduleResolution": "node",
18 |     "resolveJsonModule": true,
19 |     "isolatedModules": true,
20 |     "noEmit": true,
21 |     "jsx": "react-jsx"
22 |   },
23 |   "include": [
24 |     "src"
25 |   ]
26 | }
27 | 


--------------------------------------------------------------------------------
/server/ui/vscode/editor.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "typescript.tsdk": "node_modules/typescript/lib",
 3 |   "editor.defaultFormatter": "esbenp.prettier-vscode",
 4 |   "editor.formatOnSave": true,
 5 |   "editor.codeActionsOnSave": {
 6 |     "source.fixAll": "explicit"
 7 |   },
 8 |   "eslint.validate": ["typescript", "typescriptreact", "javascript", "javascriptreact"],
 9 |   "javascript.preferences.importModuleSpecifier": "non-relative",
10 |   "javascript.preferences.useAliasesForRenames": false,
11 |   "typescript.enablePromptUseWorkspaceTsdk": true,
12 |   "files.eol": "\n",
13 |   "editor.tabSize": 2,
14 |   "editor.insertSpaces": true
15 | }
16 | 


--------------------------------------------------------------------------------
/server/utils/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "indexify_utils"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | futures = { workspace = true }
 8 | pin-project = { workspace = true }
 9 | serde_json = { workspace = true }
10 | ciborium = { workspace = true }
11 | anyhow = { workspace = true }
12 | tokio = { workspace = true }
13 | 


--------------------------------------------------------------------------------