├── .cargo └── config.toml ├── .env-example ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── README.md ├── assets └── db2vec_screenshot.png ├── build-cross-release.sh ├── config └── exclude.json ├── dockerfile ├── docs ├── COMPATIBLE.md ├── DOCKER_SETUP.md ├── OPTION.md └── TEI.md ├── samples ├── mssql_sample.sql ├── mysql_sample.sql ├── oracle_sample.sql ├── postgres_sample.sql ├── profile_sample.txt ├── sqlite_sample.sql └── surreal_sample.surql ├── src ├── cli │ └── mod.rs ├── db │ ├── chroma.rs │ ├── milvus.rs │ ├── mod.rs │ ├── pinecone.rs │ ├── qdrant.rs │ ├── redis.rs │ └── surreal.rs ├── embedding │ ├── embeding.rs │ ├── mod.rs │ └── models │ │ ├── google.rs │ │ ├── mod.rs │ │ ├── ollama.rs │ │ └── tei.rs ├── lib.rs ├── main.rs ├── parser │ ├── mod.rs │ └── parse_regex │ │ ├── mod.rs │ │ ├── mssql.rs │ │ ├── mysql.rs │ │ ├── oracle.rs │ │ ├── postgres.rs │ │ ├── sqlite.rs │ │ └── surreal.rs ├── util │ ├── exclude.rs │ ├── handle_tei.rs │ ├── mod.rs │ ├── spinner.rs │ └── utils.rs └── workflow.rs ├── tei ├── tei-linux-x86 ├── tei-metal-mac-arm └── tei-onnx-mac-arm ├── tei_timeout.log ├── tests └── integration_test.rs └── vector-export-scripts └── qdrant.sh /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.x86_64-pc-windows-gnu] 2 | linker = "x86_64-w64-mingw32-gcc" 3 | [target.x86_64-unknown-linux-gnu] 4 | linker = "x86_64-linux-gnu-gcc" -------------------------------------------------------------------------------- /.env-example: -------------------------------------------------------------------------------- 1 | # DB2VEC ENVIRONMENT CONFIGURATION 2 | # =============================== 3 | # This file contains environment variables used by db2vec 4 | # Copy this file to ".env" and customize as needed 5 | 6 | # INPUT/OUTPUT CONFIGURATION 7 | # -------------------------- 8 | # Path to the database dump file to process (.sql/.surql) 9 | DUMP_FILE=./surreal.surql 10 | 11 | # Target vector database type 12 | # Options: redis|chroma|milvus|qdrant|surrealdb|pinecone 13 | EXPORT_TYPE=redis 14 | 15 | # DEBUG MODE 16 | # ---------- 17 | # Print parsed JSON records before embedding 18 | DEBUG=false 19 | 20 | # VECTOR DATABASE CONNECTION 21 | # ------------------------- 22 | # Vector database URL/host endpoint 23 | VECTOR_HOST=redis://127.0.0.1:6379 24 | 25 | # Database authentication (user/password or API key) 26 | USER=root 27 | PASS= 28 | SECRET= 29 | AUTH=false 30 | 31 | # Database organization 32 | DATABASE=default_database 33 | TENANT=default_tenant 34 | NAMESPACE=default_namespace 35 | 36 | # Pinecone-specific settings 37 | INDEXES=default_indexes 38 | CLOUD=aws 39 | REGION=us-east-1 40 | 41 | # VECTOR CONFIGURATION 42 | # ------------------- 43 | # Vector dimension size (must match your embedding model) 44 | DIMENSION=768 45 | 46 | # Distance metric: l2|ip|cosine|euclidean|dotproduct 47 | METRIC=cosine 48 | 49 | # DATA HANDLING 50 | # ------------ 51 | # Max payload size (MB) per request 52 | PAYLOAD_SIZE_MB=12 53 | 54 | # Batch size for DB inserts 55 | CHUNK_SIZE=10 56 | 57 | # Group Redis records by table name (else use FT.CREATE/SEARCH) 58 | GROUP_REDIS=false 59 | 60 | # Use exclusion rules from config/exclude.json 61 | USE_EXCLUDE=false 62 | 63 | # EMBEDDING CONFIGURATION 64 | # --------------------- 65 | # Which embedding provider to use: ollama, tei, or google 66 | EMBEDDING_PROVIDER=ollama 67 | 68 | # Embedding model name/id 69 | # Examples: nomic-embed-text, text-embedding-004, nomic-embed-text-v2-moe 70 | 
EMBEDDING_MODEL=nomic-embed-text 71 | 72 | # API Key for Google Gemini (required if EMBEDDING_PROVIDER=google) 73 | # EMBEDDING_API_KEY= 74 | 75 | # URL endpoint for Ollama or Google embeddings (optional) 76 | # EMBEDDING_URL= 77 | 78 | # Embedding performance tuning 79 | EMBEDDING_MAX_CONCURRENCY=4 80 | EMBEDDING_BATCH_SIZE=16 81 | EMBEDDING_MAX_TOKENS=8000 82 | OLLAMA_TIMEOUT=60 83 | 84 | # Task type for Google Gemini 85 | EMBEDDING_TASK_TYPE=SEMANTIC_SIMILARITY 86 | 87 | # TEI (Text Embedding Inference) specific settings 88 | TEI_BINARY_PATH=tei/tei-metal 89 | TEI_LOCAL_PORT=8080 90 | 91 | # PERFORMANCE 92 | # ---------- 93 | # CPU threads for parallel tasks (0 = auto detect) 94 | NUM_THREADS=0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .env 3 | /volumes 4 | /root 5 | /chroma-data 6 | history.txt 7 | milvus.yaml 8 | qdrant_storage 9 | hero.surql 10 | docker-compose.yml 11 | /dist 12 | .DS_Store 13 | tests/volumes 14 | tei_failure.log -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "db2vec" 3 | version = "0.5.7" 4 | edition = "2024" 5 | authors = ["Thanon Aphithanawat"] 6 | description = "High-performance tool to parse database dumps, generate vector embeddings, and load them into vector databases" 7 | readme = "README.md" 8 | repository = "https://github.com/DevsHero/db2vec" 9 | license = "MIT" 10 | keywords = ["vector-database", "embedding", "ollama", "database-export", "vector-search"] 11 | categories = ["database", "command-line-utilities", "text-processing"] 12 | 13 | [dependencies] 14 | redis = "0.29" 15 | serde = { version = "1", features = ["derive"] } 16 | serde_json = "1" 17 | reqwest = { version = "0.11" ,default-features = false, features = ["rustls-tls", "blocking", "json"] } 18 | tokio = { version = "1", features = ["full"] } 19 | uuid = { version = "1", features = ["v4", "rng-getrandom"] } 20 | regex = "1.11" 21 | byteorder = "1.5.0" 22 | base64 = "0.22" 23 | html2text = "0.14" 24 | clap = { version = "4", features = ["derive", "env"] } 25 | dotenvy = "0.15" 26 | log = "0.4" 27 | env_logger = "0.11" 28 | encoding_rs = "0.8" 29 | encoding_rs_io = "0.1" 30 | once_cell = "1" 31 | rayon = "1" 32 | num_cpus = "1" 33 | lazy_static = "1.5" 34 | async-trait = "0.1" 35 | futures = "0.3" 36 | portpicker = "0.1.1" 37 | 38 | [dev-dependencies] 39 | db2vec = { path = "." } 40 | 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # db2vec: From Database Dumps to Vector Search at Speed 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 4 | 5 | Tired of waiting hours for Python scripts to embed large database exports, especially on machines without powerful GPUs? So was I. Processing millions of records demands performance, even on standard hardware. `db2vec` is a high‑performance Rust tool designed for efficient **CPU-based embedding generation**. 
It parses your database dumps, generates vector embeddings using local models (Ollama, text-embeddings-inference(TEI) ) or cloud APIs (Google Gemini), and loads them into your vector database of choice – all optimized for speed without requiring a dedicated GPU. 6 | 7 | ![db2vec CLI running](assets/db2vec_screenshot.png) 8 | 9 | --- 10 | 11 | ## Core Features 12 | 13 | * 🚀 **Blazing Fast:** Built in Rust for maximum throughput on large datasets, optimized for CPU. 14 | * 🔄 **Parallel Processing:** Adjustable concurrency and batch‑size for embedding generation (`--num‑threads`, `--embedding‑concurrency`, `--embedding‑batch-size`). 15 | * 📦 **Batch Inserts:** Configurable batch size (`-c, --chunk-size`) and payload limits (`-m, --max-payload-size-mb`) for efficient bulk loading into the target vector database. 16 | * 🛡️ **Data Filtering:** Exclude sensitive tables or fields via configuration for data privacy and reduced processing time. 17 | * 🔧 **Highly Configurable:** Fine-tune performance and behavior with extensive CLI arguments for embedding, database connections, batching, and more. 18 | * 📄 **Supported Dump Formats:** 19 | * `.sql` (MySQL, PostgreSQL, MSSQL, SQLite, Oracle) 20 | * **MSSQL:** 21 | ```bash 22 | sqlcmd -S server -U user -P pass -Q "SET NOCOUNT ON; SELECT * FROM dbo.TableName;" -o dump.sql 23 | ``` 24 | * *Oracle requires exporting via SQL Developer or similar into standard SQL.* 25 | * `.surql` (SurrealDB) 26 | * 🧠 **Flexible Embeddings:** Supports multiple providers: 27 | * **Ollama** – best for local CPU/GPU, extremely fast. 28 | * **TEI** – CPU-only Text Embeddings Inference (v1.7.0), slower than Ollama but faster than cloud. See [docs/TEI.md](docs/TEI.md) for details. 29 | * **Google Gemini** – cloud API, ideal if you have very limited local resources. Beware of rate limits; use small batch sizes to avoid throttling. 30 | * 💾 **Vector DB Targets:** Inserts vectors + metadata into: 31 | * Chroma 32 | * Milvus 33 | * Pinecone (Cloud & Local Dev Image) 34 | * Qdrant 35 | * Redis Stack 36 | * SurrealDB 37 | * ⚙️ **Pure Regex Parsing:** Fast, reliable record extraction (no AI). 38 | * 🔒 **Authentication:** Supports user/password, API key, tenants/namespaces per DB. 39 | * ☁️ **Pinecone Cloud Support:** Automatically creates/describes indexes, uses namespaces. 40 | * 🐞 **Debug Mode:** `--debug` prints parsed JSON records before embedding. 41 | 42 | --- 43 | 44 | ## Requirements 45 | 46 | * **Rust:** Latest stable (Edition 2021+). 47 | * **Embedding Provider:** One of the following configured: 48 | * **Ollama:** Running locally with your desired model(s) pulled (e.g., `ollama pull nomic-embed-text`). 49 | * **TEI:** Requires TEI binary (`tei-metal`) and compatible model (e.g., `nomic-embed-text-v2-moe`). See [docs/TEI.md](docs/TEI.md) for setup. 50 | * **Google Gemini:** A valid Google Cloud API key (`--secret` or `EMBEDDING_API_KEY`) with the Generative Language API enabled for your project. 51 | * **Target DB:** One of Chroma, Milvus, Pinecone, Qdrant, Redis Stack, SurrealDB (Docker recommended for local). 52 | * **(Optional) `.env`:** For setting default configuration values. 53 | 54 | --- 55 | 56 | ## Configuration 57 | 58 | Configuration can be set using CLI flags or by creating a `.env` file in the project root. CLI flags always override values set in the `.env` file. 59 | 60 | Refer to the `.env-example` file for a comprehensive list of available environment variables, their descriptions, and default values. 61 | 62 | --- 63 | 64 | ## How It Works 65 | 66 | 1. 
**Read & Detect:** Load dump (`.sql`/`.surql`), detect SQL dialect or SurrealDB. 67 | 2. **Parse (Regex):** Extract records and types. 68 | 3. **Apply Exclusions:** Skip tables or fields based on your exclusion rules (if enabled). 69 | 4. **Embed:** Call the selected embedding provider (`ollama`, `tei` on CPU, `google`) to get vectors. 70 | 5. **Auto-Schema:** Automatically create: 71 | * Target database if it doesn't exist 72 | * Collections/indices from table names in the dump 73 | * Proper dimension settings based on your `--dimension` parameter 74 | * Distance metrics using your specified `--metric` value 75 | 6. **Store:** Insert into your vector DB with metadata. 76 | 77 | --- 78 | 79 | ## Data Exclusion 80 | 81 | The exclusion feature allows you to skip entire tables or specific fields within records, which is useful for: 82 | 83 | * Protecting sensitive data (passwords, PII) 84 | * Improving performance by excluding large tables or fields not needed for search 85 | * Reducing storage costs in your vector database 86 | 87 | ### How to Use Exclusions 88 | 89 | 1. Create a `config/exclude.json` file with your exclusion rules 90 | 2. Enable exclusions with the `--use-exclude` flag 91 | 92 | ### Sample exclude.json 93 | 94 | ```json 95 | [ 96 | { 97 | "table": "users", 98 | "ignore_table": false, 99 | "exclude_fields": { 100 | "password": true, 101 | "email": true, 102 | "profile": ["ssn", "tax_id"] 103 | } 104 | }, 105 | { 106 | "table": "audit_logs", 107 | "ignore_table": true 108 | } 109 | ] 110 | ``` 111 | This configuration: 112 | 113 | Keeps the "users" table but removes password and email fields 114 | For the "profile" object field, only removes the "ssn" and "tax_id" subfields 115 | Completely skips the "audit_logs" table 116 | --- 117 | 118 | ## Automatic Collection Creation 119 | 120 | For each table in your source data dump, `db2vec` automatically: 121 | 122 | * Creates a corresponding collection/index in the target vector database 123 | * Names the collection after the source table name 124 | * Configures proper dimensions and metric type based on your CLI arguments 125 | * Creates the database first if it doesn't exist 126 | 127 | This zero-config schema creation means you don't need to manually set up your vector database structure before import. 128 | 129 | > **Note:** When using Redis with `--group-redis`, collections aren't created in the traditional sense. Instead, records are grouped by table name into Redis data structures (e.g., `table:profile` → [records]). Without this flag, Redis stores each record as an individual entry with a table label in the metadata. 130 | > 131 | > **Warning:** If collections already exist, their dimension must match the `--dimension` parameter you provide. Some databases like Pinecone will reject vectors with mismatched dimensions, causing the import to fail. 132 | 133 | --- 134 | 135 | ## Quick Start 136 | 137 | 1. **Clone & build** 138 | ```bash 139 | git clone https://github.com/DevsHero/db2vec.git 140 | cd db2vec 141 | cargo build --release 142 | ``` 143 | 2. **Prepare your dump** 144 | * MySQL/Postgres/Oracle: export `.sql` 145 | * MSSQL: `sqlcmd … > mssql_dump.sql` 146 | * SQLite: `sqlite3 mydb.db .dump > sqlite_dump.sql` 147 | * SurrealDB: `.surql` file 148 | 3. **(Optional) Create `.env`:** Copy `.env-example` to `.env` and customize defaults. 149 | 4. 
**Run** 150 | ```bash 151 | # MySQL → Milvus (using Ollama) 152 | ./target/release/db2vec \ 153 | -f samples/mysql_sample.sql \ 154 | -t milvus \ 155 | --vector-host http://127.0.0.1:19530 \ 156 | --database mydb \ 157 | --embedding-provider ollama \ 158 | --embedding-model nomic-embed-text \ 159 | --dimension 768 \ 160 | -u root -p secret --use-auth \ 161 | --debug 162 | 163 | # SurrealDB → Pinecone (using TEI) 164 | ./target/release/db2vec \ 165 | -f samples/surreal_sample.surql \ 166 | -t pinecone \ 167 | --vector-host https://index-123.svc.us-east-1.pinecone.io \ 168 | --namespace myns \ 169 | --embedding-provider tei \ 170 | --tei-binary-path tei/tei-metal \ 171 | --embedding-model nomic-embed-text-v2-moe \ 172 | --dimension 768 173 | 174 | # Oracle → Qdrant (using Google Gemini) 175 | ./target/release/db2vec \ 176 | -f samples/oracle_sample.sql \ 177 | -t qdrant \ 178 | --vector-host http://localhost:6333 \ 179 | --embedding-provider google \ 180 | --embedding-model text-embedding-004 \ 181 | --dimension 768 \ 182 | --embedding-api-key <YOUR_GEMINI_API_KEY> \ 183 | --embedding-task-type SEMANTIC_SIMILARITY \ 184 | --debug 185 | ``` 186 | 187 | --- 188 | 189 | ## Usage 190 | 191 | ```bash 192 | # Cargo 193 | cargo run -- [OPTIONS] 194 | 195 | # Binary 196 | ./target/release/db2vec [OPTIONS] 197 | 198 | # Logging 199 | RUST_LOG=info ./target/release/db2vec [OPTIONS] 200 | RUST_LOG=debug ./target/release/db2vec --debug [OPTIONS] 201 | ``` 202 | 203 | ## Compatibility 204 | 205 | See [docs/COMPATIBLE.md](docs/COMPATIBLE.md) for the full compatibility matrix of supported vector database versions and import file formats. 206 | 207 | 208 | --- 209 | 210 | ## Docker Setup 211 | 212 | Run supported vector DBs locally via Docker – see [DOCKER_SETUP.md](docs/DOCKER_SETUP.md) for commands. 213 | 214 | 215 | --- 216 | 217 | ## Target Environment 218 | 219 | Primarily developed and tested against Docker-hosted or cloud vector databases via RESTful APIs. Ensure your target is reachable from where you run `db2vec`. **Designed to run efficiently even on standard CPU hardware.** 220 | 221 | --- 222 | 223 | ## Testing 224 | 225 | ### Integration Tests 226 | 227 | db2vec includes comprehensive integration tests that verify functionality across all supported database types and embedding providers. 228 | 229 | #### Prerequisites 230 | 231 | - **Docker**: Required to run containerized instances of all supported vector databases 232 | - **Embedding Provider**: At least one of the supported embedding providers (Ollama/TEI/Google) 233 | 234 | #### Running Integration Tests 235 | 236 | The integration test suite will: 237 | 238 | 1. Spin up Docker containers for each supported vector database 239 | 2. Test all database import formats (MySQL, PostgreSQL, MSSQL, SQLite, Oracle, SurrealDB) 240 | 3. Generate embeddings using the specified provider 241 | 4. Verify proper storage and retrieval from each vector database 242 | 243 | ```bash 244 | # Test with Ollama (fastest, requires Ollama running locally) 245 | EMBEDDING_PROVIDER=ollama cargo test --test integration_test -- --nocapture 246 | 247 | # Test with TEI (CPU-based, no external dependencies) 248 | EMBEDDING_PROVIDER=tei cargo test --test integration_test -- --nocapture 249 | 250 | # Test with mock embeddings (no external provider required) 251 | EMBEDDING_PROVIDER=mock cargo test --test integration_test -- --nocapture 252 | ``` 253 | 254 | --- 255 | 256 | ## Contributing 257 | 258 | Issues, PRs, and feedback welcome! 259 | 260 | --- 261 | 262 | ## License 263 | 264 | MIT – see [LICENSE](LICENSE).
-------------------------------------------------------------------------------- /assets/db2vec_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/assets/db2vec_screenshot.png -------------------------------------------------------------------------------- /build-cross-release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # 1) Name of your Rust binary (as in Cargo.toml) 5 | BIN_NAME="db2vec" 6 | 7 | # 2) Ensure cross is installed 8 | if ! command -v cross &>/dev/null; then 9 | echo "❌ 'cross' not found – installing..." 10 | cargo install cross --git https://github.com/cross-rs/cross 11 | fi 12 | 13 | # 3) Output directory 14 | DIST_DIR="$(pwd)/dist" 15 | mkdir -p "$DIST_DIR" 16 | 17 | # 4) List of targets 18 | TARGETS=( 19 | "x86_64-unknown-linux-gnu" # Linux x86_64 20 | "aarch64-unknown-linux-gnu" # Linux ARM64 21 | "x86_64-pc-windows-gnu" # Windows x64 22 | ) 23 | 24 | # 5) Build loop 25 | for TARGET in "${TARGETS[@]}"; do 26 | echo "⏳ Building for $TARGET..." 27 | cross rustc --target "$TARGET" --release 28 | done 29 | 30 | # 6) Copy binaries into dist/ 31 | echo "📂 Collecting binaries into $DIST_DIR..." 32 | for TARGET in "${TARGETS[@]}"; do 33 | BIN_PATH="target/${TARGET}/release/${BIN_NAME}" 34 | # On Windows targets, add .exe 35 | if [[ "$TARGET" == *"windows"* ]]; then 36 | BIN_PATH+=".exe" 37 | fi 38 | 39 | if [[ -f "$BIN_PATH" ]]; then 40 | OUT_NAME="${BIN_NAME}-${TARGET}" 41 | # Preserve extension on Windows 42 | if [[ "$TARGET" == *"windows"* ]]; then 43 | OUT_NAME+=".exe" 44 | fi 45 | 46 | cp "$BIN_PATH" "$DIST_DIR/$OUT_NAME" 47 | echo "✅ $OUT_NAME" 48 | else 49 | echo "⚠️ Missing: $BIN_PATH" 50 | fi 51 | done 52 | 53 | echo "🎉 All done! Binaries are in $DIST_DIR." 54 | -------------------------------------------------------------------------------- /config/exclude.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "table": "users", 4 | "ignore_table": false, 5 | "exclude_fields": { 6 | "password": true, 7 | "email": true, 8 | "profile": ["ssn", "tax_id"] 9 | } 10 | }, 11 | { 12 | "table": "audit_logs", 13 | "ignore_table": true 14 | } 15 | ] -------------------------------------------------------------------------------- /dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 1.
Planner stage: generate the dependency recipe (with C++ compiler) 3 | ############################################################################### 4 | FROM rustlang/rust:nightly-bullseye-slim AS chef 5 | 6 | # Install system deps including C++ compiler 7 | RUN apt-get update && \ 8 | apt-get install -y --no-install-recommends \ 9 | build-essential \ 10 | pkg-config \ 11 | libssl-dev \ 12 | git \ 13 | ca-certificates && \ 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | # Install cargo-chef and sccache 17 | RUN cargo install --locked cargo-chef sccache 18 | ENV RUSTC_WRAPPER="sccache" \ 19 | SCCACHE_DIR="/sccache" 20 | 21 | WORKDIR /app 22 | 23 | # Copy manifests and dummy main for cargo-chef 24 | COPY Cargo.toml Cargo.lock ./ 25 | RUN mkdir src && echo 'fn main() {}' > src/main.rs && \ 26 | cargo chef prepare --recipe-path recipe.json 27 | 28 | ############################################################################### 29 | # 2. Builder stage: compile dependencies & your code 30 | ############################################################################### 31 | FROM chef AS builder 32 | WORKDIR /app 33 | 34 | # Rehydrate dependencies 35 | COPY --from=chef /app/recipe.json recipe.json 36 | RUN --mount=type=cache,target=/usr/local/cargo/registry \ 37 | --mount=type=cache,target=/usr/local/cargo/git \ 38 | --mount=type=cache,target=$SCCACHE_DIR,sharing=locked \ 39 | cargo chef cook --release --recipe-path recipe.json 40 | 41 | # Build the full application 42 | COPY . . 43 | RUN --mount=type=cache,target=/usr/local/cargo/registry \ 44 | --mount=type=cache,target=/usr/local/cargo/git \ 45 | --mount=type=cache,target=$SCCACHE_DIR,sharing=locked \ 46 | cargo build --release 47 | 48 | ############################################################################### 49 | # 3. Runtime stage: minimal Debian image 50 | ############################################################################### 51 | FROM debian:bullseye-slim AS runtime 52 | 53 | RUN apt-get update && \ 54 | apt-get install -y --no-install-recommends ca-certificates && \ 55 | rm -rf /var/lib/apt/lists/* 56 | 57 | COPY --from=builder /app/target/release/db2vec /usr/local/bin/db2vec 58 | 59 | # Drop privileges: non-root user 60 | RUN useradd --system --uid 10001 --shell /usr/sbin/nologin appuser 61 | USER appuser 62 | 63 | ENTRYPOINT ["/usr/local/bin/db2vec"] 64 | -------------------------------------------------------------------------------- /docs/COMPATIBLE.md: -------------------------------------------------------------------------------- 1 | # Compatibility Matrix 2 | 3 | ## Supported Vector Database Versions 4 | 5 | | Vector DB | API Version | Notes | 6 | |--------------|-------------------------------------|-------------------------------| 7 | | Pinecone | 2025-01 | Pinecone Cloud Control Plane | 8 | | Milvus | v2 | Milvus Server API v2 | 9 | | Chroma | v2 | Chroma HTTP API v2 | 10 | | Qdrant | v1.14.0 | Qdrant Server v1.14.0 | 11 | | Redis Stack | redis-stack:7.4.0-v3 (as of 30/4/2025) | Includes RedisJSON, RediSearch | 12 | | SurrealDB | v2.3.0 (as of 30/4/2025) | SurrealDB HTTP API v2.3.0 | 13 | 14 | --- 15 | 16 | ## Supported Import File Formats 17 | 18 | All sample dumps use the latest database‐specific dump format as of 30/4/2025. 
19 | 20 | | Format | Sample File | Notes | 21 | |--------------|-------------------------|---------------------------------| 22 | | MSSQL | `mssql_sample.sql` | SQLCMD export with `SET NOCOUNT ON` | 23 | | MySQL | `mysql_sample.sql` | mysqldump / standard SQL dump | 24 | | Oracle | `oracle_sample.sql` | SQL Developer / expdp format | 25 | | PostgreSQL | `postgres_sample.sql` | `pg_dump --format=plain` | 26 | | SQLite | `sqlite_sample.sql` | `sqlite3 .dump` | 27 | | SurrealDB | `surreal_sample.surql` | SurrealDB `.surql` export | 28 | 29 | --- 30 | 31 | ## Pinecone Cloud Support 32 | 33 | When `-t pinecone` is selected and `--host` is not a local URL: 34 | 35 | 1. **Create / Describe Index** 36 | * Uses the control plane `https://api.pinecone.io/indexes` 37 | * Requires `--indexes`, `--secret` (API key), `--cloud`, and `--region` 38 | * If the index does not exist, it is created with your `--dimension` and `--metric` 39 | * On `409 Conflict`, the existing index is described to retrieve its data‑plane host 40 | 41 | 2. **Data‑Plane Upserts** 42 | * Vectors are upserted to `https://` 43 | * Namespace = source table name (each table is a separate namespace) 44 | * Metadata includes a `"table": ""` field 45 | 46 | > **Note:** For local Pinecone dev images, index creation via API may not be supported. 47 | > Ensure your index exists or provide the full data‑plane URL with `--host`. 48 | 49 | ## Other Cloud-Hosted Vector Services (Untested) 50 | 51 | While we haven’t explicitly tested against managed cloud offerings beyond Pinecone, the same HTTP/API-key patterns should apply: 52 | 53 | - **Milvus Cloud** / Zilliz Cloud 54 | - **Qdrant Cloud** 55 | - **Redis Enterprise Cloud** 56 | - **Surreal Cloud** 57 | 58 | To try one of these services: 59 | 60 | 1. Set `--host` to your service’s HTTP endpoint. 61 | 2. Pass your API key or token via `--secret` and enable `--use-auth`. 62 | 3. Configure any provider-specific flags (e.g. `--indexes`, `--namespace`, etc.). 63 | 64 | db2vec uses standard REST calls and bearer-token auth under the hood, so you may find these services work out-of-the-box. Actual support may vary based on each provider’s API quirks. 65 | 66 | -------------------------------------------------------------------------------- /docs/DOCKER_SETUP.md: -------------------------------------------------------------------------------- 1 | # Local Vector Database Setup with Docker 2 | 3 | This guide provides quick‑start Docker commands for running supported vector databases locally with `db2vec`. For full details and advanced options, please refer to the official documentation links provided for each database. 
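As a quick sanity check before importing, you can confirm that the container you start below is reachable and then point `db2vec` at it. The snippet below is a minimal sketch using the Redis Stack container from this guide and one of the repository's bundled sample dumps; adjust the container name, port, and flags (`-f`, `-t`, `--vector-host`, as documented in docs/OPTION.md) to your own environment.

```bash
# Verify the local Redis Stack container accepts connections (expects PONG)
docker exec redis-stack redis-cli PING

# Then run db2vec against it with a bundled sample dump
./target/release/db2vec \
  -f samples/mysql_sample.sql \
  -t redis \
  --vector-host redis://127.0.0.1:6379
```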
4 | 5 | --- 6 | 7 | ## Pinecone (Local Development) 8 | 9 | Official docs: https://docs.pinecone.io/guides/operations/local-development#docker-cli 10 | 11 | ```bash 12 | docker run -d \ 13 | --name dense-index \ 14 | -e PORT=5081 \ 15 | -e INDEX_TYPE=serverless \ 16 | -e VECTOR_TYPE=dense \ 17 | -e DIMENSION=768 \ 18 | -e METRIC=cosine \ 19 | -p 5081:5081 \ 20 | --platform linux/amd64 \ 21 | ghcr.io/pinecone-io/pinecone-index:latest 22 | ``` 23 | 24 | --- 25 | 26 | ## SurrealDB 27 | 28 | Official docs: https://surrealdb.com/docs/surrealdb/installation/running/docker 29 | 30 | ```bash 31 | docker run -d --rm --pull always \ 32 | --name surreal \ 33 | -p 8000:8000 \ 34 | -v /mydata:/mydata \ 35 | surrealdb/surrealdb:latest \ 36 | start --user root --pass root 37 | ``` 38 | 39 | --- 40 | 41 | ## Milvus (Standalone) 42 | 43 | Official docs: https://milvus.io/docs/configure-docker.md?tab=component 44 | 45 | ```bash 46 | wget https://github.com/milvus-io/milvus/releases/download/v2.5.9/milvus-standalone-docker-compose.yml \ 47 | -O docker-compose.yml 48 | docker compose up -d 49 | ``` 50 | 51 | --- 52 | 53 | ## Redis Stack 54 | 55 | Official docs: https://hub.docker.com/r/redis/redis-stack 56 | 57 | ```bash 58 | docker run -d \ 59 | --name redis-stack \ 60 | -p 6379:6379 \ 61 | -p 8001:8001 \ 62 | redis/redis-stack:latest 63 | ``` 64 | 65 | --- 66 | 67 | ## Chroma 68 | 69 | Official docs: https://docs.trychroma.com/production/containers/docker 70 | 71 | ```bash 72 | docker run -d \ 73 | -v ./chroma-data:/data \ 74 | -p 8000:8000 \ 75 | chromadb/chroma 76 | ``` 77 | 78 | --- 79 | 80 | ## Qdrant 81 | 82 | Official docs: https://qdrant.tech/documentation/quickstart/ 83 | 84 | ```bash 85 | docker run -d \ 86 | --name qdrant \ 87 | -p 6333:6333 \ 88 | -p 6334:6334 \ 89 | -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \ 90 | qdrant/qdrant 91 | ``` 92 | 93 | --- 94 | 95 | > **Note:** Always consult the official documentation for each database for the latest setup instructions, environment variables, and recommended production configurations. 96 | > 97 | > Save this file as `DOCKER_SETUP.md` in your project root and copy the commands as needed. -------------------------------------------------------------------------------- /docs/OPTION.md: -------------------------------------------------------------------------------- 1 | # db2vec Command-Line Options 2 | 3 | Below is the full list of CLI flags, their environment-variable equivalents, defaults, and descriptions. 4 | (Note: `--tei-local-port` has been removed; only `--tei-binary-path` remains.) 5 | 6 | | Flag / Env Var | Default | Description | 7 | |-----------------------------------------------------|--------------------------|-----------------------------------------------------------------------------------------------| 8 | | -f, --data-file
DUMP_FILE | `./surreal.surql` | Path to the `.sql` / `.surql` database-dump file. | 9 | | -t, --vector-export-type
EXPORT_TYPE | `redis` | Target vector database: `redis` \| `chroma` \| `milvus` \| `qdrant` \| `surreal` \| `pinecone`.| 10 | | -u, --user
USER | `root` | Username for DB authentication (Milvus, SurrealDB). | 11 | | -p, --pass
PASS | `""` | Password for DB authentication (Milvus, SurrealDB, Redis). | 12 | | -k, --secret
SECRET | `""` | API key / token for DB auth (Chroma, Qdrant, Pinecone). | 13 | | --use-auth
AUTH | `false` | Enable authentication for the vector database. | 14 | | --debug
DEBUG | `false` | Print parsed JSON records before embedding. | 15 | | --vector-host
VECTOR_HOST | `redis://127.0.0.1:6379` | Vector-database URL or host endpoint. | 16 | | --database
DATABASE | `default_database` | Target database/collection name (Chroma, Milvus, Qdrant, Surreal). | 17 | | --indexes
INDEXES | `default_indexes` | Pinecone index name (only for `-t pinecone`). | 18 | | --cloud
CLOUD | `aws` | Pinecone cloud provider: `aws` \| `azure` \| `gcp`. | 19 | | --region
REGION | `us-east-1` | Pinecone cloud region (e.g. `us-east-1`). | 20 | | --tenant
TENANT | `default_tenant` | Tenant name for multi-tenant DBs (Chroma). | 21 | | --namespace
NAMESPACE | `default_namespace` | Namespace for SurrealDB or Pinecone. | 22 | | --dimension
DIMENSION | `768` | Vector dimension size (must match your embedding model). | 23 | | --metric
METRIC | `cosine` | Distance metric: `l2` \| `ip` \| `cosine` \| `euclidean` \| `dotproduct`. | 24 | | -m, --max-payload-size-mb
PAYLOAD_SIZE_MB | `12` | Max payload size **MB** per request (DB batch upload). | 25 | | -c, --chunk-size
CHUNK_SIZE | `10` | Number of records per batch insert. | 26 | | --embedding-provider
EMBEDDING_PROVIDER | `ollama` | Embedding provider: `ollama` (fast CPU/GPU) \| `tei` (CPU-only TEI v1.7.0) \| `google` (cloud).| 27 | | --embedding-api-key
EMBEDDING_API_KEY | _none_ | API Key for Google Gemini (required if provider=`google`). | 28 | | --embedding-model
EMBEDDING_MODEL | `nomic-embed-text` | Model name/ID for your provider (e.g. `nomic-embed-text`, `text-embedding-004`, `...-moe`). | 29 | | --embedding-url
EMBEDDING_URL | _none_ | URL endpoint for Ollama or Google embeddings (e.g. `http://localhost:11434`). | 30 | | --embedding-max-concurrency
EMBEDDING_MAX_CONCURRENCY | `4` | Parallel embedding requests. | 31 | | --embedding-batch-size
EMBEDDING_BATCH_SIZE | `16` | Number of texts per embedding batch. | 32 | | --embedding-max-tokens
EMBEDDING_MAX_TOKENS | `8192` | Max tokens per embedding request (provider-specific). | 33 | | --embedding-timeout
OLLAMA_TIMEOUT | `60` | Timeout (seconds) for embedding calls. | 34 | | --embedding-task-type
EMBEDDING_TASK_TYPE | `SEMANTIC_SIMILARITY` | Optional task type for Google Gemini API. | 35 | | --num-threads
NUM_THREADS | `0` | CPU threads for parallel tasks (0 = auto-detect). | 36 | | --group-redis
GROUP_REDIS | `false` | Group Redis records by table name (vs individual FT.CREATE/SEARCH). | 37 | | --tei-binary-path
TEI_BINARY_PATH | `tei/tei-metal` | Path to TEI binary (`tei-metal` or `tei-onnx`). If omitted, the embedded TEI is auto-extracted.| 38 | 39 | 40 | This document now reflects the removal of `--tei-local-port` and clearly lists the remaining CLI options, including how to invoke and configure the TEI binary. -------------------------------------------------------------------------------- /docs/TEI.md: -------------------------------------------------------------------------------- 1 | # TEI Provider (Text Embeddings Inference) 2 | 3 | This project ships two TEI binaries under the `tei/` folder, built from **v1.7.0**: 4 | 5 | - `tei/tei-metal` – for Apple Silicon (M1/M2) using the Metal backend 6 | - `tei/tei-onnx` – for x86_64 using the ONNX Runtime backend 7 | 8 | Feel free to build your own from source: 9 | 10 | ```bash 11 | git clone https://github.com/huggingface/text-embeddings-inference.git 12 | cd text-embeddings-inference 13 | 14 | # On x86_64 with ONNX backend (recommended) 15 | cargo install --path router -F ort 16 | 17 | # On x86_64 with Intel MKL 18 | cargo install --path router -F mkl 19 | 20 | # On Apple Silicon (M1/M2) with Metal 21 | cargo install --path router -F metal 22 | ``` 23 | 24 | You can also run the TEI router standalone: 25 | 26 | ```bash 27 | # e.g. on CPU: 28 | text-embeddings-router --model-id YOUR_MODEL_ID --port 8080 29 | ``` 30 | 31 | > Note: on Linux you may need OpenSSL & gcc: 32 | > `sudo apt-get install libssl-dev gcc -y` 33 | 34 | --- 35 | 36 | ## Using local TEI with db2vec 37 | 38 | 39 | ```bash 40 | cargo run --release -- \ 41 | -f your_dataset.surql \ 42 | -t pinecone \ 43 | --embedding-provider tei \ 44 | --tei-binary-path tei/tei-metal \ 45 | --embedding-model nomic-ai/nomic-embed-text-v2-moe \ 46 | --dimension 768 47 | ``` 48 | 49 | --tei-binary-path : path to tei-metal or tei-onnx 50 | Leave --embedding-url empty to start a local server 51 | 52 | -------------------------------------------------------------------------------- /samples/mssql_sample.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/samples/mssql_sample.sql -------------------------------------------------------------------------------- /samples/mysql_sample.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.13 Distrib 9.3.0, for Linux (aarch64) 2 | -- 3 | -- Host: localhost Database: mydb 4 | -- ------------------------------------------------------ 5 | -- Server version 9.3.0 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!50503 SET NAMES utf8mb4 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 17 | 18 | -- 19 | -- Current Database: `mydb` 20 | -- 21 | 22 | CREATE DATABASE /*!32312 IF NOT EXISTS*/ `mydb` /*!40100 DEFAULT
CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci */ /*!80016 DEFAULT ENCRYPTION='N' */; 23 | 24 | USE `mydb`; 25 | 26 | -- 27 | -- Table structure for table `products` 28 | -- 29 | 30 | DROP TABLE IF EXISTS `products`; 31 | /*!40101 SET @saved_cs_client = @@character_set_client */; 32 | /*!50503 SET character_set_client = utf8mb4 */; 33 | CREATE TABLE `products` ( 34 | `id` int NOT NULL AUTO_INCREMENT, 35 | `name` varchar(100) DEFAULT NULL, 36 | `tags` json DEFAULT NULL, 37 | `price` decimal(10,2) DEFAULT NULL, 38 | `meta` json DEFAULT NULL, 39 | PRIMARY KEY (`id`) 40 | ) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; 41 | /*!40101 SET character_set_client = @saved_cs_client */; 42 | 43 | -- 44 | -- Dumping data for table `products` 45 | -- 46 | 47 | LOCK TABLES `products` WRITE; 48 | /*!40000 ALTER TABLE `products` DISABLE KEYS */; 49 | INSERT INTO `products` VALUES (1,'products','[\"electronics\", \"computer\", \"16GB RAM\"]',1299.99,'{\"brand\": \"BrandX\", \"features\": {\"color\": \"silver\", \"weight\": 2.5}}'),(2,'Phone','[\"electronics\", \"mobile\", \"Android\"]',799.50,'{\"brand\": \"BrandY\", \"features\": {\"color\": \"black\", \"weight\": 0.2}}'),(3,'Desk','[\"furniture\", \"wood\", \"large\"]',250.00,'{\"brand\": \"BrandZ\", \"features\": {\"material\": \"oak\", \"adjustable\": true}}'); 50 | /*!40000 ALTER TABLE `products` ENABLE KEYS */; 51 | 52 | LOCK TABLES `cars` WRITE; 53 | /*!40000 ALTER TABLE `products` DISABLE KEYS */; 54 | INSERT INTO `cars` VALUES (1,'cars','[\"electronics\", \"computer\", \"16GB RAM\"]',1299.99,'{\"brand\": \"BrandX\", \"features\": {\"color\": \"silver\", \"weight\": 2.5}}'),(2,'Phone','[\"electronics\", \"mobile\", \"Android\"]',799.50,'{\"brand\": \"BrandY\", \"features\": {\"color\": \"black\", \"weight\": 0.2}}'),(3,'Desk','[\"furniture\", \"wood\", \"large\"]',250.00,'{\"brand\": \"BrandZ\", \"features\": {\"material\": \"oak\", \"adjustable\": true}}'); 55 | /*!40000 ALTER TABLE `products` ENABLE KEYS */; 56 | UNLOCK TABLES; 57 | LOCK TABLES `home` WRITE; 58 | /*!40000 ALTER TABLE `products` DISABLE KEYS */; 59 | INSERT INTO `home` VALUES (1,'home','[\"electronics\", \"computer\", \"16GB RAM\"]',1299.99,'{\"brand\": \"BrandX\", \"features\": {\"color\": \"silver\", \"weight\": 2.5}}'),(2,'Phone','[\"electronics\", \"mobile\", \"Android\"]',799.50,'{\"brand\": \"BrandY\", \"features\": {\"color\": \"black\", \"weight\": 0.2}}'),(3,'Desk','[\"furniture\", \"wood\", \"large\"]',250.00,'{\"brand\": \"BrandZ\", \"features\": {\"material\": \"oak\", \"adjustable\": true}}'); 60 | /*!40000 ALTER TABLE `products` ENABLE KEYS */; 61 | UNLOCK TABLES; 62 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 63 | 64 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 65 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 66 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; 67 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 68 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 69 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 70 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 71 | 72 | -- Dump completed on 2025-04-19 1:45:18 73 | -------------------------------------------------------------------------------- /samples/oracle_sample.sql: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------- 2 | -- File created - Tuesday-April-22-2025 3 | 
-------------------------------------------------------- 4 | REM INSERTING into SYSTEM.CUSTOMER_PROFILES 5 | SET DEFINE OFF; 6 | Insert into SYSTEM.CUSTOMER_PROFILES (ID,NAME,EMAIL) values (1,'John Smith','john.smith@example.com'); 7 | Insert into SYSTEM.CUSTOMER_PROFILES (ID,NAME,EMAIL) values (2,'Maria Garcia','maria.garcia@example.com'); 8 | Insert into SYSTEM.CUSTOMER_PROFILES (ID,NAME,EMAIL) values (3,'Ahmed Khan','ahmed.khan@example.com'); 9 | REM INSERTING into SYSTEM.PRODUCTS 10 | SET DEFINE OFF; 11 | Insert into SYSTEM.PRODUCTS (ID,NAME,PRICE,DESCRIPTION) values (1,'Premium Laptop',1299.99,'High-performance laptop for professionals'); 12 | Insert into SYSTEM.PRODUCTS (ID,NAME,PRICE,DESCRIPTION) values (2,'Ergonomic Office Chair',249.99,'Comfortable chair with lumbar support'); 13 | Insert into SYSTEM.PRODUCTS (ID,NAME,PRICE,DESCRIPTION) values (3,'Smart Home Hub',179.5,'Voice-controlled smart home central hub with wireless connectivity'); 14 | REM INSERTING into SYSTEM.ANALYTICS_DATA 15 | SET DEFINE OFF; 16 | Insert into SYSTEM.ANALYTICS_DATA (ID,EVENT_TYPE,TIMESTAMP,USER_ID) values (1,'page_view',to_timestamp('20 APR 2025 08.15.32.452000000','DD MON RRRR HH24.MI.SSXFF'),1001); 17 | Insert into SYSTEM.ANALYTICS_DATA (ID,EVENT_TYPE,TIMESTAMP,USER_ID) values (2,'purchase',to_timestamp('20 APR 2025 10.38.14.129000000','DD MON RRRR HH24.MI.SSXFF'),1042); 18 | Insert into SYSTEM.ANALYTICS_DATA (ID,EVENT_TYPE,TIMESTAMP,USER_ID) values (3,'search',to_timestamp('21 APR 2025 15.22.40.781000000','DD MON RRRR HH24.MI.SSXFF'),1098); 19 | -------------------------------------------------------- 20 | -- DDL for Index SYS_C008435 21 | -------------------------------------------------------- 22 | 23 | CREATE UNIQUE INDEX "SYSTEM"."SYS_C008435" ON "SYSTEM"."CUSTOMER_PROFILES" ("ID") 24 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 25 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 26 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 27 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 28 | TABLESPACE "SYSTEM" ; 29 | -------------------------------------------------------- 30 | -- DDL for Index IDX_CUSTOMER_EMAIL 31 | -------------------------------------------------------- 32 | 33 | CREATE INDEX "SYSTEM"."IDX_CUSTOMER_EMAIL" ON "SYSTEM"."CUSTOMER_PROFILES" ("EMAIL") 34 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 35 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 36 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 37 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 38 | TABLESPACE "SYSTEM" ; 39 | -------------------------------------------------------- 40 | -- DDL for Index SYS_C008434 41 | -------------------------------------------------------- 42 | 43 | CREATE UNIQUE INDEX "SYSTEM"."SYS_C008434" ON "SYSTEM"."PRODUCTS" ("ID") 44 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 45 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 46 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 47 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 48 | TABLESPACE "SYSTEM" ; 49 | -------------------------------------------------------- 50 | -- DDL for Index IDX_PRODUCTS_NAME 51 | -------------------------------------------------------- 52 | 53 | CREATE INDEX "SYSTEM"."IDX_PRODUCTS_NAME" ON "SYSTEM"."PRODUCTS" ("NAME") 54 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 55 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 56 | PCTINCREASE 0 
FREELISTS 1 FREELIST GROUPS 1 57 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 58 | TABLESPACE "SYSTEM" ; 59 | -------------------------------------------------------- 60 | -- DDL for Index SYS_C008436 61 | -------------------------------------------------------- 62 | 63 | CREATE UNIQUE INDEX "SYSTEM"."SYS_C008436" ON "SYSTEM"."ANALYTICS_DATA" ("ID") 64 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 65 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 66 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 67 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 68 | TABLESPACE "SYSTEM" ; 69 | -------------------------------------------------------- 70 | -- DDL for Index IDX_ANALYTICS_EVENT_TYPE 71 | -------------------------------------------------------- 72 | 73 | CREATE INDEX "SYSTEM"."IDX_ANALYTICS_EVENT_TYPE" ON "SYSTEM"."ANALYTICS_DATA" ("EVENT_TYPE") 74 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 75 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 76 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 77 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 78 | TABLESPACE "SYSTEM" ; 79 | -------------------------------------------------------- 80 | -- Constraints for Table CUSTOMER_PROFILES 81 | -------------------------------------------------------- 82 | 83 | ALTER TABLE "SYSTEM"."CUSTOMER_PROFILES" MODIFY ("ID" NOT NULL ENABLE); 84 | ALTER TABLE "SYSTEM"."CUSTOMER_PROFILES" ADD PRIMARY KEY ("ID") 85 | USING INDEX PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 86 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 87 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 88 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 89 | TABLESPACE "SYSTEM" ENABLE; 90 | -------------------------------------------------------- 91 | -- Constraints for Table PRODUCTS 92 | -------------------------------------------------------- 93 | 94 | ALTER TABLE "SYSTEM"."PRODUCTS" MODIFY ("ID" NOT NULL ENABLE); 95 | ALTER TABLE "SYSTEM"."PRODUCTS" ADD PRIMARY KEY ("ID") 96 | USING INDEX PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 97 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 98 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 99 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 100 | TABLESPACE "SYSTEM" ENABLE; 101 | ALTER TABLE "SYSTEM"."PRODUCTS" ADD CONSTRAINT "PRODUCTS_METADATA_JSON" CHECK (METADATA IS JSON) ENABLE; 102 | -------------------------------------------------------- 103 | -- Constraints for Table ANALYTICS_DATA 104 | -------------------------------------------------------- 105 | 106 | ALTER TABLE "SYSTEM"."ANALYTICS_DATA" MODIFY ("ID" NOT NULL ENABLE); 107 | ALTER TABLE "SYSTEM"."ANALYTICS_DATA" ADD PRIMARY KEY ("ID") 108 | USING INDEX PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 109 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 110 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 111 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 112 | TABLESPACE "SYSTEM" ENABLE; 113 | -------------------------------------------------------------------------------- /samples/profile_sample.txt: -------------------------------------------------------------------------------- 1 | -- Define the database 2 | DEFINE DATABASE portfolio; 3 | 4 | -- defined database 5 | DEFINE NAMESPACE IF NOT EXISTS surreal; 6 | 7 | -- use namespace and database 8 | USE NS surreal; 9 | 
USE DB portfolio; 10 | 11 | -- create table 12 | DEFINE TABLE profile; 13 | DEFINE TABLE contact; 14 | DEFINE TABLE experience; 15 | DEFINE TABLE portfolio; 16 | DEFINE TABLE education; 17 | DEFINE TABLE language; 18 | DEFINE TABLE skill; 19 | 20 | -- Insert data into the profile table 21 | INSERT INTO profile { 22 | about: "I am newbie developer", 23 | avatar: "https://raw.githubusercontent.com/marwin1991/profile-technology-icons/refs/heads/main/icons/github.png", 24 | birth_date: "2000-01-01", 25 | first_name: "John", 26 | gender: "Male", 27 | pdf: { 28 | show_about: true, 29 | show_contact: true, 30 | show_education: true, 31 | show_experience: true, 32 | show_language: true, 33 | show_portfolio: true, 34 | show_profile: true, 35 | show_avatar: true, 36 | show_skill: true, 37 | use_about_pdf_version: false, 38 | use_avatar_pdf_version:false, 39 | use_generate: true, 40 | use_pdf: true 41 | }, 42 | last_name: "Doe", 43 | nationality: "US", 44 | nick_name: "Mr.Robot", 45 | address: "CA USA" 46 | role: "Junior Developer" 47 | }; 48 | 49 | -- Insert data into the contact table 50 | INSERT INTO contact { 51 | contact_icon: "Facebook", 52 | contact_value: "https://www.facebook.com/zuck/", 53 | use_link: true 54 | }; 55 | -- Insert data into the language table 56 | INSERT INTO language { 57 | level: "Native", 58 | name: "English" 59 | }; 60 | 61 | INSERT INTO contact { 62 | contact_title: "My Email", 63 | contact_icon: "Mail", 64 | contact_value: "my@email.com", 65 | use_link: false 66 | }; 67 | 68 | -- Insert data into the experience table 69 | INSERT INTO experience { 70 | company_logo_url: "https://seeklogo.com/images/A/avengers-logo-5B0A68AFB3-seeklogo.com.png", 71 | company_name: "Avengers Team", 72 | company_url: "https://en.wikipedia.org/wiki/List_of_Avengers_members", 73 | describe: "Assisted in retrieving and securing dangerous alien technology (Captain America: Civil War).\nEngaged in high-stakes urban combat during Battle of New York (Infinity War).\nParticipated in intergalactic rescue missions; fought Thanos’ army on Titan.\nBlipped out of existence for five years, then returned to help in the final battle against Thanos (Endgame).\nProvided support in rebuilding efforts post-Blip, maintaining neighborhood security.", 74 | end_date: "2004-01-01", 75 | position_name: "Spider Man", 76 | company_address:"Hollywood USA", 77 | 78 | use_describe_pdf_version: false, 79 | start_date: "2000-01-01" 80 | }; 81 | 82 | -- Insert data into the experience table 83 | INSERT INTO education { 84 | degree: "bachelor's degree", 85 | gpa: "4.00", 86 | graduated_year: "2010", 87 | institute_address: "CA USA", 88 | institute_logo_url: "https://identity.stanford.edu/wp-content/uploads/sites/3/2020/07/SU_SealColor_web3.png", 89 | institute_name: "Stanford University", 90 | major: "computer science" 91 | }; 92 | 93 | -- Insert data into the portfolio table 94 | INSERT INTO portfolio { 95 | uuid: "0a6fb385-39ca-4a4f-8e8b-4ed1643462d7", 96 | index:0, 97 | is_opensource: false, 98 | portfolio_detail: "Fullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin system", 99 | portfolio_icon_url: 
"https://cdn-icons-png.flaticon.com/512/25/25231.png", 100 | portfolio_link: "https://github.com/zelda2003/leptos_portfolio_admin", 101 | portfolio_name: "Leptos Portfolio Admin", 102 | screenshots_url: [ 103 | "https://149842033.v2.pressablecdn.com/wp-content/uploads/2019/03/breed2-free-portfolio-website-templates.jpg", 104 | "https://themewagon.com/wp-content/uploads/2021/11/html.design.jpg" 105 | ], 106 | stacks: [ 107 | "Rust", 108 | "Leptos", 109 | "ActixWeb", 110 | "Tailwind", 111 | "SurrealDB" 112 | ], 113 | use_portfolio_detail_pdf_version: false 114 | }; 115 | 116 | -- Insert data into the skill table 117 | INSERT INTO skill { 118 | level: "Middle", 119 | name: "Postgresql" 120 | }; 121 | 122 | INSERT INTO skill { 123 | level: "Middle", 124 | name: "MongoDB" 125 | }; 126 | -------------------------------------------------------------------------------- /samples/sqlite_sample.sql: -------------------------------------------------------------------------------- 1 | PRAGMA foreign_keys=OFF; 2 | BEGIN TRANSACTION; 3 | CREATE TABLE items ( 4 | id INTEGER PRIMARY KEY AUTOINCREMENT, 5 | name TEXT NOT NULL, 6 | description TEXT, 7 | tags TEXT, -- To store comma-separated tags or array-like strings 8 | attributes TEXT, -- To store JSON-like strings 9 | price REAL, 10 | is_active BOOLEAN DEFAULT 1, 11 | created_at DATETIME DEFAULT CURRENT_TIMESTAMP 12 | ); 13 | INSERT INTO items VALUES(1,'Laptop','A standard laptop',NULL,NULL,1200.5,1,'2025-04-20 02:04:22'); 14 | INSERT INTO items VALUES(2,'Keyboard','Mechanical keyboard','["gaming", "rgb", "mechanical"]','{"brand": "Keychron", "switches": "brown", "layout": "TKL"}',99.9899999999999949,1,'2025-04-20 02:04:22'); 15 | INSERT INTO items VALUES(3,'Mouse',NULL,'','',25.0,0,'2025-04-20 02:04:22'); 16 | INSERT INTO items VALUES(4,'Monitor','4K Monitor','["large", "4k", "ips", "monitor, curved"]','{"resolution": "3840x2160", "size_inches": 27, "ports": ["HDMI", "DP"]}',350.75,1,'2025-04-20 02:04:22'); 17 | INSERT INTO items VALUES(5,'Webcam','1080p Webcam','video, conference, usb',NULL,45.0,1,'2025-04-20 02:04:22'); 18 | INSERT INTO items VALUES(6,'Desk Chair','Ergonomic office chair','["furniture", "office", "ergonomic"]','{"material": "mesh", "color": "black", "adjustments": {"height": true, "lumbar": "fixed"}}',180.0,1,'2025-04-20 02:04:23'); 19 | DELETE FROM sqlite_sequence; 20 | INSERT INTO sqlite_sequence VALUES('items',6); 21 | COMMIT; 22 | CREATE TABLE drug ( 23 | id INTEGER PRIMARY KEY AUTOINCREMENT, 24 | name TEXT NOT NULL, 25 | description TEXT, 26 | tags TEXT, -- To store comma-separated tags or array-like strings 27 | attributes TEXT, -- To store JSON-like strings 28 | price REAL, 29 | is_active BOOLEAN DEFAULT 1, 30 | created_at DATETIME DEFAULT CURRENT_TIMESTAMP 31 | ); 32 | INSERT INTO drug VALUES(1,'drug','A standard laptop',NULL,NULL,1200.5,1,'2025-04-20 02:04:22'); 33 | INSERT INTO drug VALUES(2,'Keyboard','Mechanical keyboard','["gaming", "rgb", "mechanical"]','{"brand": "Keychron", "switches": "brown", "layout": "TKL"}',99.9899999999999949,1,'2025-04-20 02:04:22'); 34 | INSERT INTO drug VALUES(3,'Mouse',NULL,'','',25.0,0,'2025-04-20 02:04:22'); 35 | INSERT INTO drug VALUES(4,'Monitor','4K Monitor','["large", "4k", "ips", "monitor, curved"]','{"resolution": "3840x2160", "size_inches": 27, "ports": ["HDMI", "DP"]}',350.75,1,'2025-04-20 02:04:22'); 36 | INSERT INTO drug VALUES(5,'Webcam','1080p Webcam','video, conference, usb',NULL,45.0,1,'2025-04-20 02:04:22'); 37 | INSERT INTO drug VALUES(6,'Desk Chair','Ergonomic office 
chair','["furniture", "office", "ergonomic"]','{"material": "mesh", "color": "black", "adjustments": {"height": true, "lumbar": "fixed"}}',180.0,1,'2025-04-20 02:04:23'); 38 | DELETE FROM sqlite_sequence; 39 | INSERT INTO sqlite_sequence VALUES('items',6); 40 | COMMIT; 41 | -------------------------------------------------------------------------------- /samples/surreal_sample.surql: -------------------------------------------------------------------------------- 1 | -- ------------------------------ 2 | -- OPTION 3 | -- ------------------------------ 4 | 5 | OPTION IMPORT; 6 | 7 | -- ------------------------------ 8 | -- TABLE: products 9 | -- ------------------------------ 10 | 11 | DEFINE TABLE products TYPE ANY SCHEMALESS PERMISSIONS NONE; 12 | 13 | 14 | 15 | 16 | -- ------------------------------ 17 | -- TABLE DATA: products 18 | -- ------------------------------ 19 | 20 | INSERT [ { id: products:dn5pqchc33kxacvcf7x6, name: 'Phone', price: 799.5f, tags: ['electronics', 'mobile', 'Android'], test: { a: 'b', c: 'd', e: 1 } }, { id: products:jqkzvhosxcme1dzert7y, name: 'Desk', price: 250, tags: ['furniture', 'wood', 'large'], test: { a: 'b', c: 'd', e: 1 } }, { id: products:zzbpfmkkmcj4lxqn5knf, name: 'Laptop', price: 1299.99f, tags: ['electronics', 'computer', '16GB RAM'] } ]; 21 | 22 | -------------------------------------------------------------------------------- /src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | #[derive(Parser, Debug, Clone)] 4 | #[command(author, version, about, long_about = None)] 5 | pub struct Args { 6 | /// Path to the .sql/.surql database dump file to process 7 | #[arg(short = 'f', env = "DUMP_FILE", long, default_value = "./surreal.surql")] 8 | pub dump_file: String, 9 | 10 | /// Target vector database: redis|chroma|milvus|qdrant|surreal|pinecone 11 | #[arg(short = 't', env = "EXPORT_TYPE", long, default_value = "redis")] 12 | pub vector_export_type: String, 13 | 14 | /// Username for database authentication (Milvus, SurrealDB) 15 | #[arg(short = 'u', env = "USER", long, default_value = "root")] 16 | pub user: String, 17 | 18 | /// Password for database authentication (Milvus, SurrealDB, Redis) 19 | #[arg(short = 'p', env = "PASS", long, default_value = "")] 20 | pub pass: String, 21 | 22 | /// API key/token for database authentication (Chroma, Qdrant, Pinecone) 23 | #[arg(short = 'k', env = "SECRET", long, default_value = "")] 24 | pub secret: String, 25 | 26 | /// Enable authentication for the vector database 27 | #[arg(long, env = "AUTH", default_value = "false")] 28 | pub use_auth: bool, 29 | 30 | /// Print parsed JSON records before embedding (debug mode) 31 | #[arg(long, env = "DEBUG", default_value = "false")] 32 | pub debug: bool, 33 | 34 | /// Vector database URL/host endpoint (e.g. redis://127.0.0.1:6379) 35 | #[arg(long, env = "VECTOR_HOST", default_value = "redis://127.0.0.1:6379")] 36 | pub vector_host: String, 37 | 38 | /// Target database name (Chroma, Milvus, Qdrant, Surreal) 39 | #[arg(long, env = "DATABASE", default_value = "default_database")] 40 | pub database: String, 41 | 42 | /// Pinecone index name (only for -t pinecone) 43 | #[arg(long, env = "INDEXES", default_value = "default_indexes")] 44 | pub indexes: String, 45 | 46 | /// Pinecone cloud provider: aws|azure|gcp 47 | #[arg(long, env = "CLOUD", default_value = "aws")] 48 | pub cloud: String, 49 | 50 | /// Pinecone cloud region, e.g. 
us-east-1 51 | #[arg(long, env = "REGION", default_value = "us-east-1")] 52 | pub region: String, 53 | 54 | /// Tenant name for multi-tenant DBs (Chroma) 55 | #[arg(long, env = "TENANT", default_value = "default_tenant")] 56 | pub tenant: String, 57 | 58 | /// Namespace for databases that support it (SurrealDB, Pinecone) 59 | #[arg(long, env = "NAMESPACE", default_value = "default_namespace")] 60 | pub namespace: String, 61 | 62 | /// Vector dimension size (must match your embedding model) 63 | #[arg(long, env = "DIMENSION", default_value = "768")] 64 | pub dimension: usize, 65 | 66 | /// Distance metric: l2|ip|cosine|euclidean|dotproduct 67 | #[arg(long, env = "METRIC", default_value = "cosine")] 68 | pub metric: String, 69 | 70 | /// Max payload size (MB) per request 71 | #[arg(short = 'm', env = "PAYLOAD_SIZE_MB", long, default_value = "12")] 72 | pub max_payload_size_mb: usize, 73 | 74 | /// Batch size for DB inserts 75 | #[arg(short = 'c', env = "CHUNK_SIZE", long, default_value = "10")] 76 | pub chunk_size: usize, 77 | 78 | /// Which embedding provider to use: ollama, tei, or google 79 | #[arg(long, env = "EMBEDDING_PROVIDER", default_value = "ollama")] 80 | pub embedding_provider: String, 81 | 82 | /// API Key for Google Gemini (required if --embedding-provider=google) 83 | #[arg(long, env = "EMBEDDING_API_KEY")] 84 | pub embedding_api_key: Option, 85 | 86 | /// Embedding model name/id (e.g. nomic-embed-text, text-embedding-004, nomic-embed-text-v2-moe) 87 | #[arg(long, env = "EMBEDDING_MODEL", default_value = "nomic-embed-text")] 88 | pub embedding_model: String, 89 | 90 | /// URL endpoint for Ollama or Google embeddings 91 | #[arg(long, env = "EMBEDDING_URL")] 92 | pub embedding_url: Option, 93 | 94 | /// Parallel embedding requests 95 | #[arg(long, env = "EMBEDDING_MAX_CONCURRENCY", default_value = "4")] 96 | pub embedding_concurrency: usize, 97 | 98 | /// Number of texts per embedding batch 99 | #[arg(long, env = "EMBEDDING_BATCH_SIZE", default_value = "16")] 100 | pub embedding_batch_size: usize, 101 | 102 | /// Max tokens per embedding request (provider-specific) 103 | #[arg(long, env = "EMBEDDING_MAX_TOKENS", default_value = "8192")] 104 | pub embedding_max_tokens: usize, 105 | 106 | /// Timeout (seconds) for embedding calls 107 | #[arg(long, env = "OLLAMA_TIMEOUT", default_value = "60")] 108 | pub embedding_timeout: u64, 109 | 110 | /// Task type for Google Gemini (default: SEMANTIC_SIMILARITY) 111 | #[arg(long, env = "EMBEDDING_TASK_TYPE", default_value = "SEMANTIC_SIMILARITY")] 112 | pub embedding_task_type: String, 113 | 114 | /// CPU threads for parallel tasks (0 = auto detect) 115 | #[arg(long, env = "NUM_THREADS", default_value = "0")] 116 | pub num_threads: usize, 117 | 118 | /// Group Redis records by table name if true (else use FT.CREATE/SEARCH) 119 | #[arg(long, env = "GROUP_REDIS", default_value = "false")] 120 | pub group_redis: bool, 121 | 122 | /// Path to TEI binary (tei-metal or tei-onnx). 123 | /// If you omit this, the embedded TEI will be extracted & launched. 
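    /// See docs/TEI.md for the bundled binaries and instructions on building your own.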
124 | #[arg(long, env = "TEI_BINARY_PATH", default_value = "tei/tei-metal")] 125 | pub tei_binary_path: String, 126 | 127 | /// Port for the managed TEI server (only used if starting TEI locally) 128 | #[arg(long, env = "TEI_LOCAL_PORT", default_value_t = 8080)] 129 | pub tei_local_port: u16, 130 | 131 | /// Apply exclusion rules from config/exclude.json to remove sensitive fields 132 | #[arg(long, env = "USE_EXCLUDE", default_value = "false")] 133 | pub use_exclude: bool, 134 | } 135 | -------------------------------------------------------------------------------- /src/db/chroma.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use reqwest::blocking::Client; 3 | use serde_json::Value; 4 | use super::{ Database, DbError }; 5 | 6 | pub struct ChromaDatabase { 7 | client: Client, 8 | url: String, 9 | tenant: String, 10 | database: String, 11 | dimension: usize, 12 | auth_token: Option, 13 | metric: String, 14 | } 15 | 16 | impl ChromaDatabase { 17 | pub fn new(args: &crate::cli::Args) -> Result { 18 | let url = format!("{}/api/v2", args.vector_host.trim_end_matches('/')); 19 | let tenant = args.tenant.clone(); 20 | let database = args.database.clone(); 21 | let dimension = args.dimension; 22 | let client = Client::new(); 23 | let auth_token = if args.use_auth && !args.secret.is_empty() { 24 | Some(args.secret.clone()) 25 | } else { 26 | None 27 | }; 28 | 29 | let metric = args.metric.clone(); 30 | Ok(ChromaDatabase { 31 | client, 32 | url, 33 | tenant, 34 | database, 35 | dimension, 36 | auth_token, 37 | metric, 38 | }) 39 | } 40 | } 41 | 42 | impl Database for ChromaDatabase { 43 | 44 | 45 | fn store_vector( 46 | &self, 47 | table: &str, 48 | items: &[(String, Vec, Value)] 49 | ) -> Result<(), DbError> { 50 | if items.is_empty() { 51 | return Ok(()); 52 | } 53 | 54 | let normalized_table = table.to_lowercase(); 55 | if normalized_table != table { 56 | info!("Normalizing Chroma collection name '{}' to '{}'", table, normalized_table); 57 | } 58 | 59 | let dbs_url = format!("{}/tenants/{}/databases", self.url, self.tenant); 60 | let mut list_dbs_req = self.client.get(&dbs_url); 61 | if let Some(ref token) = self.auth_token { 62 | list_dbs_req = list_dbs_req.header("X-Chroma-Token", token); 63 | } 64 | let dbs_json: Value = list_dbs_req.send()?.json()?; 65 | let db_exists = dbs_json 66 | .as_array() 67 | .map(|arr| arr.iter().any(|db| db["name"].as_str() == Some(&self.database))) 68 | .unwrap_or(false); 69 | if !db_exists { 70 | info!("Creating Chroma database '{}'", self.database); 71 | let mut create_db_req = self.client 72 | .post(&dbs_url) 73 | .json(&serde_json::json!({ "name": self.database })); 74 | if let Some(ref token) = self.auth_token { 75 | create_db_req = create_db_req.header("X-Chroma-Token", token); 76 | } 77 | let create_db_res = create_db_req.send()?; 78 | if !create_db_res.status().is_success() { 79 | let err = create_db_res.text()?; 80 | return Err( 81 | format!("Failed to create Chroma database '{}': {}", self.database, err).into() 82 | ); 83 | } 84 | info!("Chroma database '{}' created", self.database); 85 | } 86 | 87 | let collections_url = format!( 88 | "{}/tenants/{}/databases/{}/collections", 89 | self.url, 90 | self.tenant, 91 | self.database 92 | ); 93 | let mut list_req = self.client.get(&collections_url); 94 | if let Some(ref token) = self.auth_token { 95 | list_req = list_req.header("X-Chroma-Token", token); 96 | } 97 | let cols_res = list_req.send()?; 98 | let cols_json: Value = 
cols_res.json()?; 99 | let mut collection_id: Option = None; 100 | if let Some(arr) = cols_json.as_array() { 101 | for col in arr { 102 | if col["name"].as_str() == Some(&normalized_table) { 103 | collection_id = col["id"].as_str().map(|s| s.to_string()); 104 | break; 105 | } 106 | } 107 | } 108 | let collection_id = match collection_id { 109 | Some(id) => id, 110 | None => { 111 | let col_body = 112 | serde_json::json!({ 113 | "name": normalized_table, 114 | "dimension": self.dimension, 115 | "configuration_json": { 116 | "embedding_function": null, 117 | "hnsw": { 118 | "space": self.metric, 119 | "ef_construction": 100, 120 | "ef_search": 100, 121 | "max_neighbors": 16, 122 | "resize_factor": 1.2, 123 | "sync_threshold": 1000 124 | }, 125 | "spann": null 126 | } 127 | }); 128 | let mut col_req = self.client.post(&collections_url).json(&col_body); 129 | if let Some(ref token) = self.auth_token { 130 | col_req = col_req.header("X-Chroma-Token", token); 131 | } 132 | let col_res = col_req.send()?; 133 | let col_json: Value = col_res.json()?; 134 | debug!("Chroma create collection response: {}", col_json); 135 | 136 | col_json 137 | .get("id") 138 | .or_else(|| col_json.get("collection_id")) 139 | .and_then(|v| v.as_str()) 140 | .ok_or_else(|| { 141 | format!("Failed to get new collection id, response: {}", col_json) 142 | })? 143 | .to_string() 144 | } 145 | }; 146 | 147 | let ids: Vec = items 148 | .iter() 149 | .map(|(id, _, _)| format!("{}:{}", normalized_table, id)) 150 | .collect(); 151 | let embeddings: Vec> = items 152 | .iter() 153 | .map(|(id, vec, _)| { 154 | if vec.is_empty() { 155 | warn!("ID='{}': Empty vector received, inserting dummy value", id); 156 | vec![0.1] 157 | } else if vec.len() != self.dimension { 158 | warn!( 159 | "ID='{}': Vector length {} != collection dimension {}, fixing", 160 | id, 161 | vec.len(), 162 | self.dimension 163 | ); 164 | let mut fixed_vec = vec![0.0; self.dimension]; 165 | for (i, val) in vec.iter().enumerate().take(self.dimension) { 166 | fixed_vec[i] = *val; 167 | } 168 | fixed_vec 169 | } else { 170 | vec.clone() 171 | } 172 | }) 173 | .collect(); 174 | let documents: Vec = items 175 | .iter() 176 | .map(|_| String::new()) 177 | .collect(); 178 | let metadatas: Vec = items 179 | .iter() 180 | .map(|(_, _, m)| { 181 | if let Some(map) = m.as_object() { 182 | let mut simple = serde_json::Map::new(); 183 | for (k, v) in map { 184 | if v.is_string() || v.is_number() || v.is_boolean() { 185 | simple.insert(k.clone(), v.clone()); 186 | } 187 | } 188 | if simple.is_empty() { 189 | Value::Null 190 | } else { 191 | Value::Object(simple) 192 | } 193 | } else { 194 | Value::Null 195 | } 196 | }) 197 | .collect(); 198 | 199 | let body = 200 | serde_json::json!({ 201 | "ids": ids, 202 | "embeddings": embeddings, 203 | "documents": documents, 204 | "metadatas": metadatas 205 | }); 206 | 207 | let add_url = format!( 208 | "{}/tenants/{}/databases/{}/collections/{}/add", 209 | self.url, 210 | self.tenant, 211 | self.database, 212 | collection_id 213 | ); 214 | let mut req = self.client.post(&add_url).json(&body); 215 | if let Some(ref token) = self.auth_token { 216 | req = req.header("X-Chroma-Token", token); 217 | } 218 | let resp = req.send()?; 219 | 220 | let status = resp.status(); 221 | let body_text = resp.text()?; 222 | debug!("Chroma insert response ({}): {}", status, body_text); 223 | 224 | if status.is_success() { 225 | info!("Chroma: inserted {} vectors into '{}' (original: '{}')", 226 | items.len(), normalized_table, table); 227 | Ok(()) 228 | } 
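// Note on the branch below: Chroma can report "Error in compaction" on an /add
// request even when the write itself may have landed, so that specific error is
// treated as a soft failure (warned and counted as a potential insert) instead of
// aborting the whole import.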
else if body_text.contains("Error in compaction") { 229 | warn!("Chroma compaction error during insert (ignored): {}", body_text); 230 | info!( 231 | "Chroma: potentially inserted {} vectors into '{}' despite compaction error", 232 | items.len(), 233 | normalized_table 234 | ); 235 | Ok(()) 236 | } else { 237 | Err(format!("Chroma bulk insert failed: {}", body_text).into()) 238 | } 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /src/db/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod redis; 2 | pub mod qdrant; 3 | pub mod chroma; 4 | pub mod milvus; 5 | pub mod surreal; 6 | pub mod pinecone; 7 | pub use redis::RedisDatabase; 8 | pub use milvus::MilvusDatabase; 9 | pub use qdrant::QdrantDatabase; 10 | pub use chroma::ChromaDatabase; 11 | pub use surreal::SurrealDatabase; 12 | pub use pinecone::PineconeDatabase; 13 | use serde_json::Value; 14 | use std::error::Error; 15 | use crate::cli::Args; 16 | 17 | pub type DbError = Box; 18 | 19 | pub trait Database: Send + Sync { 20 | 21 | fn store_vector(&self, table: &str, items: &[(String, Vec, Value)]) -> Result<(), DbError>; 22 | } 23 | 24 | pub fn select_database(args: &Args) -> Result, DbError> { 25 | let database: Box = match args.vector_export_type.as_str() { 26 | "redis" => Box::new(RedisDatabase::new(args)?), 27 | "qdrant" => Box::new(QdrantDatabase::new(args)?), 28 | "chroma" => Box::new(ChromaDatabase::new(args)?), 29 | "milvus" => Box::new(MilvusDatabase::new(args)?), 30 | "surreal" => Box::new(SurrealDatabase::new(args)?), 31 | "pinecone" => Box::new(PineconeDatabase::new(args)?), 32 | _ => { 33 | return Err("Unsupported database type".into()); 34 | } 35 | }; 36 | 37 | Ok(database) 38 | } 39 | 40 | pub fn store_in_batches( 41 | db: &dyn Database, 42 | table: &str, 43 | items: &[(String, Vec, Value)], 44 | max_bytes: usize 45 | ) -> Result<(), DbError> { 46 | let mut start = 0; 47 | let mut cur_size = 0; 48 | for (i, (id, vec, meta)) in items.iter().enumerate() { 49 | let meta_json = serde_json::to_string(meta)?; 50 | let rec_size = id.len() + vec.len() * 4 + meta_json.len(); 51 | if cur_size + rec_size > max_bytes && start < i { 52 | db.store_vector(table, &items[start..i])?; 53 | start = i; 54 | cur_size = rec_size; 55 | } else { 56 | cur_size += rec_size; 57 | } 58 | } 59 | if start < items.len() { 60 | db.store_vector(table, &items[start..])?; 61 | } 62 | Ok(()) 63 | } 64 | -------------------------------------------------------------------------------- /src/db/pinecone.rs: -------------------------------------------------------------------------------- 1 | use reqwest::blocking::Client; 2 | use serde_json::{ Value, json }; 3 | use log::{ info, warn, error }; 4 | use super::{ Database, DbError }; 5 | 6 | pub struct PineconeDatabase { 7 | control_plane_url: String, 8 | data_plane_url: String, 9 | client: Client, 10 | api_version: String, 11 | api_key: Option, 12 | use_auth: bool, 13 | dimension: usize, 14 | } 15 | 16 | impl PineconeDatabase { 17 | pub fn new(args: &crate::cli::Args) -> Result { 18 | let client = Client::new(); 19 | let api_version = "2025-01".to_string(); 20 | let is_local = 21 | args.vector_host.contains("localhost") || 22 | args.vector_host.contains("127.0.0.1") || 23 | args.vector_host.contains("::1"); 24 | 25 | let control_plane_url = if is_local { 26 | args.vector_host.clone() 27 | } else { 28 | "https://api.pinecone.io".to_string() 29 | }; 30 | 31 | let mut parsed_host_from_create: Option = None; 32 | 33 | 
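// The block below talks to the Pinecone control plane: it attempts to create a
// serverless index from the CLI settings and, if the index already exists
// (HTTP 409), falls back to describing it so the returned `host` can be used as
// the data-plane URL for upserts. Illustrative request body (the values shown are
// the documented CLI defaults, not hard-coded):
//
//   { "name": "default_indexes", "dimension": 768, "metric": "cosine",
//     "spec": { "serverless": { "cloud": "aws", "region": "us-east-1" } } }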
if !args.indexes.is_empty() && !is_local { 34 | let index_name = args.indexes.as_str(); 35 | let endpoint = "indexes"; 36 | let url = format!("{}/{}", control_plane_url, endpoint); 37 | 38 | let spec = json!({ "serverless": { "cloud": args.cloud, "region": args.region } }); 39 | let body = 40 | json!({ "name": index_name, "dimension": args.dimension, "metric": args.metric, "spec": spec }); 41 | 42 | let mut req = client 43 | .post(&url) 44 | .header("Content-Type", "application/json") 45 | .header("X-Pinecone-API-Version", &api_version) 46 | .json(&body); 47 | 48 | if args.secret.is_empty() { 49 | return Err("Pinecone cloud requires an API key (-k/--secret).".into()); 50 | } 51 | req = req.header("Api-Key", &args.secret); 52 | 53 | let resp = req.send()?; 54 | match resp.status().as_u16() { 55 | 201 | 200 => { 56 | let j: Value = resp.json()?; 57 | let host = j 58 | .get("host") 59 | .and_then(|h| h.as_str()) 60 | .ok_or_else(|| DbError::from("Missing host in create index response"))?; 61 | info!("Index '{}' available at host: {}", index_name, host); 62 | parsed_host_from_create = Some(format!("https://{}", host)); 63 | } 64 | 409 => { 65 | warn!("Index '{}' already exists, attempting to describe it to get host.", index_name); 66 | let describe_url = format!("{}/{}", url, index_name); 67 | let describe_req = client 68 | .get(&describe_url) 69 | .header("Accept", "application/json") 70 | .header("X-Pinecone-API-Version", &api_version) 71 | .header("Api-Key", &args.secret); 72 | 73 | let describe_resp = describe_req.send()?; 74 | if describe_resp.status().is_success() { 75 | let j: Value = describe_resp.json()?; 76 | let host = j 77 | .get("host") 78 | .and_then(|h| h.as_str()) 79 | .ok_or_else(|| 80 | DbError::from("Missing host in describe index response") 81 | )?; 82 | info!("Existing index '{}' found at host: {}", index_name, host); 83 | parsed_host_from_create = Some(format!("https://{}", host)); 84 | } else { 85 | let txt = describe_resp.text().unwrap_or_default(); 86 | return Err( 87 | format!( 88 | "Failed to describe existing index '{}': {}", 89 | index_name, 90 | txt 91 | ).into() 92 | ); 93 | } 94 | } 95 | status => { 96 | let txt = resp.text().unwrap_or_default(); 97 | return Err( 98 | format!("Failed to create/ensure index ({}): {}", status, txt).into() 99 | ); 100 | } 101 | } 102 | } else if !args.indexes.is_empty() && is_local { 103 | warn!( 104 | "Running locally. Assuming database '{}' exists. Skipping creation/check.", 105 | args.indexes 106 | ); 107 | } 108 | 109 | let data_plane_url = if is_local { 110 | args.vector_host.clone() 111 | } else { 112 | if args.vector_host.contains(".svc.") && args.vector_host.contains(".pinecone.io") { 113 | info!("Using provided --host as data plane URL: {}", args.vector_host); 114 | if args.vector_host.starts_with("https://") { 115 | args.vector_host.clone() 116 | } else { 117 | format!("https://{}", args.vector_host) 118 | } 119 | } else if let Some(host) = parsed_host_from_create { 120 | info!("Using host from create/describe API response as data plane URL: {}", host); 121 | host 122 | } else { 123 | return Err( 124 | "Could not determine Pinecone data plane URL. 
Provide it via --host or ensure --indexes is set correctly.".into() 125 | ); 126 | } 127 | }; 128 | if !is_local && args.secret.is_empty() { 129 | return Err("Pinecone cloud requires an API key (-k/--secret).".into()); 130 | } 131 | 132 | let pd = PineconeDatabase { 133 | control_plane_url, 134 | data_plane_url, 135 | client, 136 | api_version, 137 | api_key: if args.secret.is_empty() { 138 | None 139 | } else { 140 | Some(args.secret.clone()) 141 | }, 142 | use_auth: !is_local, 143 | dimension: args.dimension, 144 | }; 145 | 146 | info!("Pinecone mode: {}", if is_local { "LOCAL" } else { "CLOUD" }); 147 | info!("Control plane URL: {}", pd.control_plane_url); 148 | info!("Data plane URL: {}", pd.data_plane_url); 149 | 150 | Ok(pd) 151 | } 152 | } 153 | impl Database for PineconeDatabase { 154 | 155 | fn store_vector( 156 | &self, 157 | table: &str, 158 | items: &[(String, Vec, Value)] 159 | ) -> Result<(), DbError> { 160 | if items.is_empty() { 161 | return Ok(()); 162 | } 163 | 164 | let normalized_namespace = table.to_lowercase(); 165 | if normalized_namespace != table { 166 | info!("Normalizing Pinecone namespace '{}' to '{}'", table, normalized_namespace); 167 | } 168 | 169 | let url = format!("{}/vectors/upsert", self.data_plane_url); 170 | let vectors: Vec = items 171 | .iter() 172 | .map(|(id, vector, data)| { 173 | let values = if vector.is_empty() { 174 | warn!("ID='{}': Empty vector received, inserting dummy values", id); 175 | vec![0.1; self.dimension] 176 | } else if vector.len() != self.dimension { 177 | warn!( 178 | "ID='{}': Vector length {} != expected dimension {}, fixing", 179 | id, 180 | vector.len(), 181 | self.dimension 182 | ); 183 | let mut fixed_vec = vec![0.0; self.dimension]; 184 | for (i, val) in vector.iter().enumerate().take(self.dimension) { 185 | fixed_vec[i] = *val; 186 | } 187 | fixed_vec 188 | } else { 189 | vector.clone() 190 | }; 191 | 192 | let mut record = 193 | json!({ 194 | "id": id, 195 | "values": values 196 | }); 197 | 198 | let mut processed_metadata = serde_json::Map::new(); 199 | processed_metadata.insert("table".to_string(), Value::String(table.to_string())); 200 | 201 | if let Some(map) = data.as_object() { 202 | for (k, v) in map { 203 | if v.is_null() { 204 | continue; 205 | } 206 | if v.is_object() || v.is_array() { 207 | processed_metadata.insert( 208 | k.clone(), 209 | Value::String(serde_json::to_string(v).unwrap_or_default()) 210 | ); 211 | } else { 212 | processed_metadata.insert(k.clone(), v.clone()); 213 | } 214 | } 215 | } 216 | record["metadata"] = Value::Object(processed_metadata); 217 | record 218 | }) 219 | .collect(); 220 | 221 | let payload = json!({ 222 | "vectors": vectors, 223 | "namespace": normalized_namespace 224 | }); 225 | 226 | let mut req = self.client 227 | .post(&url) 228 | .header("Content-Type", "application/json") 229 | .header("Accept", "application/json") 230 | .header("X-Pinecone-API-Version", &self.api_version); 231 | 232 | if self.use_auth { 233 | if let Some(key) = self.api_key.as_ref() { 234 | req = req.header("Api-Key", key); 235 | } else { 236 | error!("Pinecone auth enabled but no API key available."); 237 | return Err("Pinecone auth enabled but no API key available.".into()); 238 | } 239 | } 240 | 241 | let resp = req.json(&payload).send()?; 242 | if resp.status().is_success() { 243 | let j: Value = resp.json()?; 244 | let count = j 245 | .get("upsertedCount") 246 | .and_then(|c| c.as_u64()) 247 | .unwrap_or(0); 248 | info!("Pinecone: upserted {} vectors into namespace `{}` (original: '{}')", 249 | 
count, normalized_namespace, table); 250 | Ok(()) 251 | } else { 252 | let status = resp.status(); 253 | let txt = resp.text()?; 254 | error!("Pinecone bulk upsert failed for namespace '{}' ({}): {}", 255 | normalized_namespace, status, txt); 256 | Err(format!("Pinecone bulk upsert error for namespace '{}': {}", table, txt).into()) 257 | } 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /src/db/qdrant.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn }; 2 | use reqwest::blocking::Client; 3 | use serde_json::{ json, Value }; 4 | use super::{ Database, DbError }; 5 | 6 | pub struct QdrantDatabase { 7 | client: Client, 8 | url: String, 9 | api_key: Option, 10 | dimension: usize, 11 | metric: String, 12 | } 13 | 14 | impl QdrantDatabase { 15 | pub fn new(args: &crate::cli::Args) -> Result { 16 | let qdrant_url = args.vector_host.clone(); 17 | let api_key = if args.use_auth && !args.secret.is_empty() { 18 | Some(args.secret.clone()) 19 | } else { 20 | None 21 | }; 22 | let client = Client::new(); 23 | 24 | Ok(QdrantDatabase { 25 | client, 26 | url: qdrant_url, 27 | api_key, 28 | dimension: args.dimension, 29 | metric: args.metric.clone(), 30 | }) 31 | } 32 | } 33 | 34 | impl Database for QdrantDatabase { 35 | 36 | fn store_vector( 37 | &self, 38 | table: &str, 39 | items: &[(String, Vec, Value)] 40 | ) -> Result<(), DbError> { 41 | if items.is_empty() { 42 | return Ok(()); 43 | } 44 | 45 | let normalized_table = table.to_lowercase(); 46 | let coll_url = format!("{}/collections/{}", self.url, normalized_table); 47 | let mut chk = self.client.get(&coll_url); 48 | if let Some(k) = &self.api_key { 49 | chk = chk.header("api-key", k); 50 | } 51 | let resp = chk.send()?; 52 | if resp.status().as_u16() == 404 { 53 | let distance = match self.metric.to_lowercase().as_str() { 54 | "cosine" => "Cosine", 55 | "euclidean" => "Euclidean", 56 | "dotproduct" | "dot" => "Dot", 57 | other => { 58 | warn!("Unknown metric '{}', falling back to Cosine", other); 59 | "Cosine" 60 | } 61 | }; 62 | 63 | info!( 64 | "Creating Qdrant collection '{}' (from table '{}') with dimension {} and distance {}", 65 | normalized_table, table, self.dimension, distance 66 | ); 67 | let body = 68 | json!({ 69 | "vectors": { 70 | "size": self.dimension, 71 | "distance": distance 72 | } 73 | }); 74 | let mut crt = self.client.put(&coll_url).json(&body); 75 | if let Some(k) = &self.api_key { 76 | crt = crt.header("api-key", k); 77 | } 78 | let cr = crt.send()?; 79 | if !cr.status().is_success() { 80 | let err = cr.text()?; 81 | warn!("Failed to create collection '{}': {}. 
Attempting to insert anyway.", normalized_table, err); 82 | } 83 | } 84 | 85 | let points: Vec = items 86 | .iter() 87 | .map(|(id, vec, payload)| { 88 | let v = if vec.len() == self.dimension { 89 | vec.clone() 90 | } else { 91 | warn!( 92 | "ID={}: vector length {} ≠ {}, filling zeros", 93 | id, 94 | vec.len(), 95 | self.dimension 96 | ); 97 | vec![0.0; self.dimension] 98 | }; 99 | json!({ "id": id, "vector": v, "payload": payload }) 100 | }) 101 | .collect(); 102 | 103 | let up_url = format!("{}/collections/{}/points?wait=true", self.url, normalized_table); 104 | let mut up = self.client.put(&up_url).json(&json!({ "points": points })); 105 | if let Some(k) = &self.api_key { 106 | up = up.header("api-key", k); 107 | } 108 | let up_res = up.send()?; 109 | if up_res.status().is_success() { 110 | info!("Qdrant: upserted {} points into `{}`", items.len(), normalized_table); 111 | Ok(()) 112 | } else { 113 | let txt = up_res.text()?; 114 | Err(format!("Qdrant upsert failed: {}", txt).into()) 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/db/redis.rs: -------------------------------------------------------------------------------- 1 | use redis::Client; 2 | use serde_json::Value; 3 | use log::{ info, warn, debug }; 4 | use std::io::{ Error as IoError, ErrorKind }; 5 | use super::{ Database, DbError }; 6 | 7 | pub struct RedisDatabase { 8 | client: Client, 9 | password: Option, 10 | dimension: usize, 11 | metric: String, 12 | group_redis: bool, 13 | } 14 | 15 | impl RedisDatabase { 16 | pub fn new(args: &crate::cli::Args) -> Result { 17 | info!("Connecting to Redis at {}", args.vector_host); 18 | let client = Client::open(args.vector_host.as_str()).map_err( 19 | |e| 20 | Box::new( 21 | IoError::new(ErrorKind::Other, format!("Failed to open Redis client: {}", e)) 22 | ) as DbError 23 | )?; 24 | let password = if args.use_auth && !args.pass.is_empty() { 25 | Some(args.pass.clone()) 26 | } else { 27 | None 28 | }; 29 | 30 | let mut conn = client 31 | .get_connection() 32 | .map_err( 33 | |e| 34 | Box::new( 35 | IoError::new( 36 | ErrorKind::Other, 37 | format!("Failed to get Redis connection: {}", e) 38 | ) 39 | ) as DbError 40 | )?; 41 | if let Some(ref pass) = password { 42 | redis 43 | ::cmd("AUTH") 44 | .arg(pass) 45 | .query::<()>(&mut conn) 46 | .map_err( 47 | |e| 48 | Box::new( 49 | IoError::new(ErrorKind::Other, format!("Redis AUTH failed: {}", e)) 50 | ) as DbError 51 | )?; 52 | info!("Redis AUTH successful"); 53 | } 54 | 55 | let pong: String = redis 56 | ::cmd("PING") 57 | .query(&mut conn) 58 | .map_err( 59 | |e| 60 | Box::new( 61 | IoError::new(ErrorKind::Other, format!("Redis PING failed: {}", e)) 62 | ) as DbError 63 | )?; 64 | if pong != "PONG" { 65 | warn!("Redis PING received unexpected response: {}", pong); 66 | } else { 67 | info!("Redis PING successful"); 68 | } 69 | 70 | Ok(RedisDatabase { 71 | client, 72 | password, 73 | dimension: args.dimension, 74 | metric: args.metric.clone(), 75 | group_redis: args.group_redis, 76 | }) 77 | } 78 | 79 | fn get_connection(&self) -> Result { 80 | let mut con = self.client 81 | .get_connection() 82 | .map_err( 83 | |e| 84 | Box::new( 85 | IoError::new( 86 | ErrorKind::Other, 87 | format!("Failed to get Redis connection: {}", e) 88 | ) 89 | ) as DbError 90 | )?; 91 | if let Some(ref pass) = self.password { 92 | redis 93 | ::cmd("AUTH") 94 | .arg(pass) 95 | .query::<()>(&mut con) 96 | .map_err( 97 | |e| 98 | Box::new( 99 | IoError::new(ErrorKind::Other, format!("Redis AUTH 
failed: {}", e)) 100 | ) as DbError 101 | )?; 102 | } 103 | Ok(con) 104 | } 105 | 106 | fn map_metric_to_redis(&self) -> &str { 107 | match self.metric.to_lowercase().as_str() { 108 | "cosine" => "COSINE", 109 | "l2" | "euclidean" => "L2", 110 | "ip" | "dotproduct" | "innerproduct" => "IP", 111 | _ => { 112 | warn!("Unsupported metric '{}', defaulting to COSINE", self.metric); 113 | "COSINE" 114 | } 115 | } 116 | } 117 | 118 | fn ensure_index_exists( 119 | &self, 120 | con: &mut redis::Connection, 121 | table: &str, 122 | sample_data: Option<&Value> 123 | ) -> Result<(), DbError> { 124 | let index_name = format!("idx:{}", table); 125 | 126 | match redis::cmd("FT.INFO").arg(&index_name).query::>(con) { 127 | Ok(_) => { 128 | return Ok(()); 129 | } 130 | Err(_) => { 131 | info!("Index '{}' not found, creating it now", index_name); 132 | } 133 | } 134 | 135 | let mut ft = redis::cmd("FT.CREATE"); 136 | ft.arg(&index_name) 137 | .arg("ON") 138 | .arg("JSON") 139 | .arg("PREFIX") 140 | .arg("1") 141 | .arg(format!("item:{}:", table)) 142 | .arg("SCHEMA") 143 | .arg("$.vector") 144 | .arg("AS") 145 | .arg("vector") 146 | .arg("VECTOR") 147 | .arg("FLAT") 148 | .arg("6") 149 | .arg("TYPE") 150 | .arg("FLOAT32") 151 | .arg("DIM") 152 | .arg(self.dimension.to_string()) 153 | .arg("DISTANCE_METRIC") 154 | .arg(self.map_metric_to_redis()); 155 | 156 | if let Some(Value::Object(data_map)) = sample_data { 157 | info!("Attempting to discover schema from first item data for index '{}'", index_name); 158 | let standard_fields = vec![ 159 | ("source_table".to_string(), "TEXT".to_string()), 160 | ("original_id".to_string(), "TEXT".to_string()) 161 | ]; 162 | 163 | for (field, idx_ty_str) in standard_fields { 164 | debug!("Adding standard field to schema: $.{} AS {} {}", field, field, idx_ty_str); 165 | ft.arg(format!("$.{}", field)).arg("AS").arg(&field).arg(&idx_ty_str); 166 | if idx_ty_str == "TEXT" { 167 | ft.arg("SORTABLE"); 168 | } 169 | } 170 | 171 | for (field, value) in data_map { 172 | if field == "vector" || field == "source_table" || field == "original_id" { 173 | continue; 174 | } 175 | 176 | let idx_ty = match value { 177 | Value::String(_) => "TEXT", 178 | Value::Number(_) => "NUMERIC", 179 | Value::Bool(_) => "NUMERIC", 180 | _ => { 181 | continue; 182 | } 183 | }; 184 | 185 | debug!("Adding discovered field to schema: $.{} AS {} {}", field, field, idx_ty); 186 | ft.arg(format!("$.{}", field)).arg("AS").arg(field).arg(idx_ty); 187 | if idx_ty == "TEXT" { 188 | ft.arg("SORTABLE"); 189 | } 190 | } 191 | } else { 192 | warn!("No sample data provided or sample data is not a JSON object for index '{}'. 
Only indexing vector field.", index_name); 193 | ft.arg("$.source_table").arg("AS").arg("source_table").arg("TEXT").arg("SORTABLE"); 194 | ft.arg("$.original_id").arg("AS").arg("original_id").arg("TEXT").arg("SORTABLE"); 195 | } 196 | 197 | match ft.query::<()>(con) { 198 | Ok(_) => { 199 | info!("Created Redis index '{}'", index_name); 200 | Ok(()) 201 | } 202 | Err(e) => { 203 | let msg = e.to_string(); 204 | if msg.contains("Index already exists") { 205 | info!("Index '{}' already exists (concurrent creation?), skipping", index_name); 206 | Ok(()) 207 | } else { 208 | Err( 209 | Box::new( 210 | IoError::new( 211 | ErrorKind::Other, 212 | format!("FT.CREATE failed for index '{}': {}", index_name, msg) 213 | ) 214 | ) as DbError 215 | ) 216 | } 217 | } 218 | } 219 | } 220 | } 221 | 222 | impl Database for RedisDatabase { 223 | 224 | fn store_vector( 225 | &self, 226 | table: &str, 227 | items: &[(String, Vec, Value)] 228 | ) -> Result<(), DbError> { 229 | if items.is_empty() { 230 | return Ok(()); 231 | } 232 | 233 | let normalized_table = table.to_lowercase(); 234 | if normalized_table != table { 235 | info!("Normalizing Redis table/index name '{}' to '{}'", table, normalized_table); 236 | } 237 | 238 | let mut con = self.get_connection()?; 239 | 240 | if self.group_redis { 241 | let key = format!("table:{}", normalized_table); 242 | 243 | let docs: Vec = items 244 | .iter() 245 | .map(|(id, vec, data)| { 246 | let mut obj = serde_json::Map::new(); 247 | obj.insert("id".to_string(), Value::String(id.clone())); 248 | obj.insert("vector".to_string(), serde_json::to_value(vec).unwrap()); 249 | if let Value::Object(map) = data { 250 | for (k, v) in map { 251 | if k != "vector" { 252 | obj.insert(k.clone(), v.clone()); 253 | } 254 | } 255 | } 256 | Value::Object(obj) 257 | }) 258 | .collect(); 259 | 260 | let payload = Value::Array(docs); 261 | 262 | redis 263 | ::cmd("JSON.SET") 264 | .arg(&key) 265 | .arg("$") 266 | .arg(serde_json::to_string(&payload)?) 267 | .query::<()>(&mut con) 268 | .map_err(|e| { 269 | Box::new( 270 | IoError::new( 271 | ErrorKind::Other, 272 | format!("Redis JSON.SET failed for '{}': {}", key, e) 273 | ) 274 | ) as DbError 275 | })?; 276 | 277 | info!("Stored {} items grouped for table '{}' (original: '{}')", 278 | items.len(), normalized_table, table); 279 | return Ok(()); 280 | } 281 | 282 | let first_item_data = items.first().map(|(_, _, data)| data); 283 | self.ensure_index_exists(&mut con, &normalized_table, first_item_data)?; 284 | 285 | let mut pipe = redis::pipe(); 286 | pipe.atomic(); 287 | 288 | for (id, vec, data) in items { 289 | let key = format!("item:{}:{}", normalized_table, id); 290 | let mut record_obj = serde_json::Map::new(); 291 | record_obj.insert("vector".to_string(), serde_json::to_value(vec)?); 292 | record_obj.insert("source_table".to_string(), Value::String(table.to_string())); 293 | record_obj.insert("original_id".to_string(), Value::String(id.clone())); 294 | 295 | if let Value::Object(obj) = data { 296 | for (k, v) in obj { 297 | if k != "vector" && k != "source_table" { 298 | record_obj.insert(k.clone(), v.clone()); 299 | } 300 | } 301 | } 302 | 303 | debug!( 304 | "Redis JSON document for {}: {}", 305 | key, 306 | serde_json::to_string(&Value::Object(record_obj.clone()))? 
307 | ); 308 | 309 | pipe.cmd("JSON.SET") 310 | .arg(&key) 311 | .arg("$") 312 | .arg(serde_json::to_string(&Value::Object(record_obj))?); 313 | } 314 | 315 | pipe 316 | .query::<()>(&mut con) 317 | .map_err(|e| { 318 | Box::new( 319 | IoError::new( 320 | ErrorKind::Other, 321 | format!("Redis pipeline failed for table '{}': {}", table, e) 322 | ) 323 | ) as DbError 324 | })?; 325 | 326 | info!("Stored {} items for table '{}' (original: '{}') in Redis", 327 | items.len(), normalized_table, table); 328 | Ok(()) 329 | } 330 | } 331 | -------------------------------------------------------------------------------- /src/db/surreal.rs: -------------------------------------------------------------------------------- 1 | use base64::{ engine::general_purpose::STANDARD, Engine as _ }; 2 | use log::{ info, error, warn }; 3 | use reqwest::blocking::Client; 4 | use serde_json::Value; 5 | use super::{ Database, DbError }; 6 | 7 | pub struct SurrealDatabase { 8 | url: String, 9 | ns: String, 10 | db: String, 11 | auth_header: Option, 12 | client: Client, 13 | } 14 | 15 | impl SurrealDatabase { 16 | pub fn new(args: &crate::cli::Args) -> Result { 17 | let base_url = args.vector_host.clone(); 18 | let sql_url = format!("{}/sql", base_url.trim_end_matches('/')); 19 | let ns = args.namespace.clone(); 20 | let db = args.database.clone(); 21 | let client = Client::new(); 22 | let auth_header = if args.use_auth { 23 | Some(format!("Basic {}", STANDARD.encode(format!("{}:{}", args.user, args.pass)))) 24 | } else { 25 | None 26 | }; 27 | 28 | let define_ns_sql = format!("DEFINE NAMESPACE IF NOT EXISTS {};", ns); 29 | info!("Sending DEFINE NAMESPACE: {}", define_ns_sql); 30 | let mut req_ns = client 31 | .post(&sql_url) 32 | .header("Content-Type", "text/plain") 33 | .header("Accept", "application/json") 34 | .body(define_ns_sql); 35 | 36 | if let Some(ref auth) = auth_header { 37 | req_ns = req_ns.header("Authorization", auth); 38 | } 39 | 40 | let resp_ns = req_ns.send().map_err(|e| Box::new(e) as DbError)?; 41 | let status_ns = resp_ns.status(); 42 | let text_ns = resp_ns.text().map_err(|e| Box::new(e) as DbError)?; 43 | 44 | info!("SurrealDB DEFINE NAMESPACE response: {}", text_ns); 45 | if !status_ns.is_success() && !text_ns.contains("already exists") { 46 | error!("Failed to execute DEFINE NAMESPACE (Status: {}): {}", status_ns, text_ns); 47 | } 48 | 49 | let define_db_sql = format!("DEFINE DATABASE IF NOT EXISTS {};", db); 50 | info!("Sending DEFINE DATABASE: {}", define_db_sql); 51 | let mut req_db = client 52 | .post(&sql_url) 53 | .header("Content-Type", "text/plain") 54 | .header("Accept", "application/json") 55 | .header("Surreal-NS", &ns) 56 | .body(define_db_sql); 57 | 58 | if let Some(ref auth) = auth_header { 59 | req_db = req_db.header("Authorization", auth); 60 | } 61 | 62 | let resp_db = req_db.send().map_err(|e| Box::new(e) as DbError)?; 63 | let status_db = resp_db.status(); 64 | let text_db = resp_db.text().map_err(|e| Box::new(e) as DbError)?; 65 | 66 | info!("SurrealDB DEFINE DATABASE response: {}", text_db); 67 | if !status_db.is_success() && !text_db.contains("already exists") { 68 | error!("Failed to execute DEFINE DATABASE (Status: {}): {}", status_db, text_db); 69 | } 70 | 71 | Ok(SurrealDatabase { url: base_url, ns, db, auth_header, client }) 72 | } 73 | 74 | fn ensure_table_exists(&self, table: &str) -> Result<(), DbError> { 75 | let sql_url = format!("{}/sql", self.url.trim_end_matches('/')); 76 | let define_table_sql = 77 | format!("DEFINE TABLE IF NOT EXISTS `{}` TYPE ANY 
SCHEMALESS PERMISSIONS NONE;", table); 78 | 79 | info!("Ensuring table exists: {}", define_table_sql); 80 | 81 | let mut req = self.client 82 | .post(&sql_url) 83 | .header("Content-Type", "text/plain") 84 | .header("Accept", "application/json") 85 | .header("Surreal-NS", &self.ns) 86 | .header("Surreal-DB", &self.db) 87 | .body(define_table_sql); 88 | 89 | if let Some(ref auth) = self.auth_header { 90 | req = req.header("Authorization", auth); 91 | } 92 | 93 | let resp = req.send().map_err(|e| Box::new(e) as DbError)?; 94 | let status = resp.status(); 95 | let text = resp.text().map_err(|e| Box::new(e) as DbError)?; 96 | 97 | info!("SurrealDB DEFINE TABLE response ({}): {}", status, text); 98 | 99 | if !status.is_success() { 100 | warn!("Potential issue defining table '{}' (Status: {}): {}", table, status, text); 101 | } 102 | 103 | Ok(()) 104 | } 105 | 106 | } 107 | 108 | impl Database for SurrealDatabase { 109 | 110 | fn store_vector( 111 | &self, 112 | table: &str, 113 | items: &[(String, Vec, Value)] 114 | ) -> Result<(), DbError> { 115 | if items.is_empty() { 116 | return Ok(()); 117 | } 118 | 119 | let normalized_table = table.to_lowercase(); 120 | if normalized_table != table { 121 | info!("Normalizing SurrealDB table name '{}' to '{}'", table, normalized_table); 122 | } 123 | 124 | self.ensure_table_exists(&normalized_table)?; 125 | 126 | let records: Vec<(String, Value)> = items 127 | .iter() 128 | .map(|(id, vector, meta)| { 129 | let mut record = meta.clone(); 130 | if let Some(obj) = record.as_object_mut() { 131 | obj.insert( 132 | "vector".to_string(), 133 | serde_json::to_value(vector).unwrap_or_default() 134 | ); 135 | obj.insert("original_table".to_string(), Value::String(table.to_string())); 136 | } 137 | (id.clone(), record) 138 | }) 139 | .collect(); 140 | 141 | let import_url = format!("{}/import", self.url.trim_end_matches('/')); 142 | let mut import_data = String::new(); 143 | 144 | for (id, data) in &records { 145 | let record_id = format!("{}:`{}`", normalized_table, id); 146 | let content_json = serde_json::to_string(&data)?; 147 | import_data.push_str(&format!("CREATE {} CONTENT {};\n", record_id, content_json)); 148 | } 149 | 150 | info!("SurrealDB Import URL: {}", import_url); 151 | let preview_len = import_data.chars().count().min(300); 152 | info!( 153 | "SurrealDB Import Payload Preview: {}...", 154 | import_data.chars().take(preview_len).collect::() 155 | ); 156 | 157 | let mut req = self.client 158 | .post(&import_url) 159 | .header("Surreal-NS", &self.ns) 160 | .header("Surreal-DB", &self.db) 161 | .header("Content-Type", "text/plain") 162 | .header("Accept", "application/json") 163 | .body(import_data); 164 | 165 | if let Some(ref auth) = self.auth_header { 166 | req = req.header("Authorization", auth); 167 | } 168 | 169 | let resp = req.send()?; 170 | let status = resp.status(); 171 | let text = resp.text().unwrap_or_else(|e| format!("Failed to read response body: {}", e)); 172 | info!("SurrealDB Import Response Status: {}", status); 173 | info!("SurrealDB Import Response Body: {}", text); 174 | 175 | if !status.is_success() { 176 | return Err(format!("SurrealDB batch import failed ({}): {}", status, text).into()); 177 | } 178 | 179 | info!("SurrealDB: successfully imported {} records to {} (original: {})", 180 | records.len(), normalized_table, table); 181 | Ok(()) 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/embedding/embeding.rs: 
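The module below wires provider selection (`initialize_embedding_generator`) to batched embedding generation (`process_records_with_embeddings`). The following is a minimal sketch of driving those two entry points directly, assuming a parsed `Args` with the Ollama defaults; the actual orchestration (parsing the dump, grouping by table, writing to the vector database) lives in `src/workflow.rs`, so this is only an illustration, not the shipped flow:

```rust
use std::sync::{ atomic::AtomicUsize, Arc };

use clap::Parser;
use serde_json::json;

use db2vec::cli::Args;
use db2vec::embedding::embeding::{
    initialize_embedding_generator,
    process_records_with_embeddings,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical invocation: flags and env vars resolve via the clap definition in src/cli/mod.rs.
    let args = Args::parse();

    // Select the provider (ollama | tei | google) from --embedding-provider.
    let generator = initialize_embedding_generator(&args, None)?;

    // Records shaped like the parser output: each carries a "table" key plus its fields.
    let records = vec![
        json!({ "table": "products", "name": "Laptop", "price": 1299.99 }),
        json!({ "table": "products", "name": "Desk", "price": 250 }),
    ];

    // Shared counter used for progress reporting.
    let counter = Arc::new(AtomicUsize::new(0));

    // Returns (table, id, vector, metadata) tuples ready to be batched into the target DB.
    let prepared = process_records_with_embeddings(records, &args, counter, generator)?;
    println!("embedded {} records", prepared.len());
    Ok(())
}
```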
-------------------------------------------------------------------------------- 1 | use log::{ error, info, warn }; 2 | use serde_json::Value; 3 | use std::error::Error as StdError; 4 | use std::sync::atomic::{ AtomicUsize, Ordering }; 5 | use std::sync::Arc; 6 | use tokio::runtime::Runtime; 7 | use uuid::Uuid; 8 | use crate::cli::Args; 9 | use crate::embedding::{ 10 | models::google::GoogleEmbeddingClient, 11 | models::ollama::OllamaEmbeddingClient, 12 | models::tei::TeiEmbeddingClient, 13 | AsyncEmbeddingGenerator, 14 | }; 15 | 16 | pub fn initialize_embedding_generator( 17 | args: &Args, 18 | override_url: Option<&str>, 19 | ) -> Result, Box> { 20 | let provider = args.embedding_provider.to_lowercase(); 21 | info!("Selected embedding provider: {}", provider); 22 | 23 | let url = override_url 24 | .or_else(|| args.embedding_url.as_deref()) 25 | .map(|s| s.to_string()); 26 | 27 | match provider.as_str() { 28 | "tei" => { 29 | let port = args.tei_local_port; 30 | let url_to_use = match override_url { 31 | Some(url) => url.to_string(), 32 | None => args.embedding_url.as_deref() 33 | .unwrap_or(&format!("http://localhost:{}", port)) 34 | .to_string() 35 | }; 36 | 37 | info!("🟢 TEI client connecting to {}", url_to_use); 38 | 39 | let client = TeiEmbeddingClient::new( 40 | url_to_use, 41 | args.dimension, 42 | args.embedding_timeout 43 | )?; 44 | Ok(Box::new(client)) 45 | } 46 | 47 | "ollama" => { 48 | let ollama_url = url.unwrap_or_else(|| "http://localhost:11434".into()); 49 | info!("🟢 Ollama client -> {}", ollama_url); 50 | let client = OllamaEmbeddingClient::new( 51 | &ollama_url, 52 | &args.embedding_model, 53 | args.dimension, 54 | )?; 55 | Ok(Box::new(client)) 56 | } 57 | 58 | "google" => { 59 | let api_key = args.embedding_api_key 60 | .clone() 61 | .ok_or_else(|| "Missing EMBEDDING_API_KEY for Google".to_string())?; 62 | info!("🟢 Google client"); 63 | let client = GoogleEmbeddingClient::new( 64 | api_key, 65 | Some(args.embedding_model.clone()), 66 | args.dimension, 67 | )?; 68 | Ok(Box::new(client)) 69 | } 70 | 71 | other => Err(format!("Unsupported embedding provider: {}", other).into()), 72 | } 73 | } 74 | 75 | pub fn process_records_with_embeddings( 76 | records: Vec, 77 | args: &Args, 78 | embedding_counter: Arc, 79 | generator: Box 80 | ) -> Result, Value)>, Box> { 81 | let chunk_size = args.embedding_batch_size; 82 | let total_records = records.len(); 83 | let mut prepared_records = Vec::with_capacity(total_records); 84 | let rt = Runtime::new()?; 85 | let approx_char_limit_from_tokens = args.embedding_max_tokens * 3; 86 | 87 | for (chunk_idx, chunk) in records.chunks(chunk_size).enumerate() { 88 | info!( 89 | "Processing embedding chunk {}/{}", 90 | chunk_idx + 1, 91 | (total_records + chunk_size - 1) / chunk_size 92 | ); 93 | 94 | let texts: Vec = chunk 95 | .iter() 96 | .map(|record| { 97 | let mut full_text = record 98 | .as_object() 99 | .map(|obj| { 100 | obj.iter() 101 | .filter(|(k, _)| *k != "table" && *k != "id") 102 | .map(|(k, v)| format!("{}: {}", k, v)) 103 | .collect::>() 104 | .join(", ") 105 | }) 106 | .unwrap_or_else(|| record.to_string()); 107 | 108 | if full_text.chars().count() > approx_char_limit_from_tokens { 109 | warn!( 110 | "Client-side truncation: Input text for a record ({} chars) exceeds approximate limit derived from embedding_max_tokens ({} tokens -> ~{} chars). Truncating. 
Provider might also truncate based on its own limits.", 111 | full_text.chars().count(), 112 | args.embedding_max_tokens, 113 | approx_char_limit_from_tokens 114 | ); 115 | full_text = full_text.chars().take(approx_char_limit_from_tokens).collect::(); 116 | } 117 | full_text 118 | }) 119 | .collect(); 120 | 121 | let embeddings_result = rt.block_on(generator.generate_embeddings_batch(&texts)); 122 | 123 | match embeddings_result { 124 | Ok(embeddings) => { 125 | if embeddings.len() != chunk.len() { 126 | error!( 127 | "CRITICAL: Embedding generator returned {} results for {} inputs in chunk {}", 128 | embeddings.len(), 129 | chunk.len(), 130 | chunk_idx + 1 131 | ); 132 | return Err( 133 | format!( 134 | "Embedding generator returned incomplete results: got {}/{}", 135 | embeddings.len(), 136 | chunk.len() 137 | ).into() 138 | ); 139 | } 140 | 141 | let chunk_results: Vec<_> = chunk 142 | .iter() 143 | .zip(embeddings.into_iter()) 144 | .map(|(record, vec)| { 145 | let id = Uuid::new_v4().to_string(); 146 | let mut meta = record.clone(); 147 | let table = meta 148 | .get("table") 149 | .and_then(|t| t.as_str()) 150 | .unwrap_or("unknown_table") 151 | .to_string(); 152 | if let Some(_obj) = meta.as_object_mut() { 153 | } 154 | (table, id, vec, meta) 155 | }) 156 | .collect(); 157 | 158 | prepared_records.extend(chunk_results); 159 | embedding_counter.fetch_add(chunk.len(), Ordering::Relaxed); 160 | } 161 | Err(e) => { 162 | error!("CRITICAL: Embedding generation failed for chunk {}: {}", chunk_idx + 1, e); 163 | return Err(format!("Embedding generation failed: {}", e).into()); 164 | } 165 | } 166 | } 167 | 168 | Ok(prepared_records) 169 | } -------------------------------------------------------------------------------- /src/embedding/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod embeding; 2 | pub mod models; 3 | 4 | use async_trait::async_trait; 5 | use std::error::Error as StdError; 6 | 7 | #[async_trait] 8 | pub trait AsyncEmbeddingGenerator: Send + Sync { 9 | async fn generate_embeddings_batch( 10 | &self, 11 | texts: &[String] 12 | ) -> Result>, Box>; 13 | 14 | fn get_dimension(&self) -> usize; 15 | } 16 | 17 | pub trait EmbeddingModel { 18 | fn generate_embedding(&self, text: &str) -> Result, Box>; 19 | } 20 | 21 | pub struct EmbeddingService { 22 | model: T, 23 | } 24 | 25 | impl EmbeddingService { 26 | pub fn new(model: T) -> Self { 27 | Self { model } 28 | } 29 | 30 | pub fn generate(&self, text: &str) -> Result, Box> { 31 | self.model.generate_embedding(text) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/embedding/models/google.rs: -------------------------------------------------------------------------------- 1 | use crate::embedding::AsyncEmbeddingGenerator; 2 | use async_trait::async_trait; 3 | use log::{ info, error, warn, debug }; 4 | use reqwest::Client; 5 | use serde_json::{ json, Value }; 6 | use std::error::Error as StdError; 7 | use std::time::Duration; 8 | use tokio::time::sleep; 9 | 10 | pub struct GoogleEmbeddingClient { 11 | client: Client, 12 | api_key: String, 13 | model_name: String, 14 | dimension: usize, 15 | task_type: String, 16 | request_delay_ms: u64, 17 | } 18 | 19 | impl GoogleEmbeddingClient { 20 | pub fn new( 21 | api_key: String, 22 | model: Option, 23 | dimension: usize 24 | ) -> Result> { 25 | let embed_model = model.unwrap_or_else(|| "text-embedding-004".to_string()); 26 | let clean_model = 
embed_model.trim_start_matches("models/").to_string(); 27 | let default_delay_ms = 1100; 28 | 29 | info!( 30 | "Initializing Google Embedding Client with model: {}, dimension: {}, request delay: {}ms", 31 | clean_model, 32 | dimension, 33 | default_delay_ms 34 | ); 35 | 36 | Ok(Self { 37 | client: Client::new(), 38 | api_key, 39 | model_name: clean_model, 40 | dimension, 41 | task_type: "SEMANTIC_SIMILARITY".to_string(), 42 | request_delay_ms: default_delay_ms, 43 | }) 44 | } 45 | 46 | pub fn with_task_type(mut self, task_type: &str) -> Self { 47 | self.task_type = task_type.to_string(); 48 | self 49 | } 50 | 51 | pub fn with_request_delay(mut self, delay_ms: u64) -> Self { 52 | self.request_delay_ms = delay_ms; 53 | info!("Set Google request delay to {}ms", delay_ms); 54 | self 55 | } 56 | } 57 | 58 | #[async_trait] 59 | impl AsyncEmbeddingGenerator for GoogleEmbeddingClient { 60 | async fn generate_embeddings_batch( 61 | &self, 62 | texts: &[String] 63 | ) -> Result>, Box> { 64 | if texts.is_empty() { 65 | return Ok(vec![]); 66 | } 67 | 68 | info!( 69 | "Google: Generating embeddings for {} texts using model {} with task type {} (delay: {}ms)", 70 | texts.len(), 71 | self.model_name, 72 | self.task_type, 73 | self.request_delay_ms 74 | ); 75 | 76 | let mut results = Vec::with_capacity(texts.len()); 77 | 78 | for (i, text) in texts.iter().enumerate() { 79 | if i > 0 { 80 | sleep(Duration::from_millis(self.request_delay_ms)).await; 81 | } 82 | 83 | let url = format!( 84 | "https://generativelanguage.googleapis.com/v1beta/models/{}:embedContent?key={}", 85 | self.model_name, 86 | self.api_key 87 | ); 88 | 89 | let request_body = 90 | json!({ 91 | "model": format!("models/{}", self.model_name), 92 | "content": { 93 | "parts": [ 94 | { 95 | "text": text 96 | } 97 | ] 98 | }, 99 | "taskType": self.task_type 100 | }); 101 | 102 | debug!("Request URL: {}", url); 103 | debug!("Request body: {}", request_body.to_string()); 104 | 105 | let response = self.client 106 | .post(&url) 107 | .header("Content-Type", "application/json") 108 | .json(&request_body) 109 | .send().await; 110 | 111 | match response { 112 | Ok(res) => { 113 | let status = res.status(); 114 | 115 | if status.is_success() { 116 | match res.json::().await { 117 | Ok(json_response) => { 118 | debug!("Success Response: {:?}", json_response); 119 | 120 | if 121 | let Some(values) = json_response 122 | .get("embedding") 123 | .and_then(|e| e.get("values")) 124 | .and_then(|v| v.as_array()) 125 | { 126 | let embedding: Vec = values 127 | .iter() 128 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 129 | .collect(); 130 | 131 | results.push(embedding); 132 | } else { 133 | error!( 134 | "CRITICAL: Invalid response format: {:?}", 135 | json_response 136 | ); 137 | return Err("Invalid embedding response format".into()); 138 | } 139 | } 140 | Err(e) => { 141 | error!("CRITICAL: Failed to parse response JSON: {}", e); 142 | return Err(format!("JSON parsing error: {}", e).into()); 143 | } 144 | } 145 | } else { 146 | if status == reqwest::StatusCode::TOO_MANY_REQUESTS { 147 | warn!( 148 | "Rate limit hit (429). Consider increasing delay or checking quota." 
149 | ); 150 | } 151 | 152 | let error_text = match res.text().await { 153 | Ok(text) => text, 154 | Err(_) => "Failed to read error response".to_string(), 155 | }; 156 | error!("CRITICAL: Google API error ({}): {}", status, error_text); 157 | 158 | if let Ok(error_json) = serde_json::from_str::(&error_text) { 159 | error!("Error details: {:?}", error_json); 160 | if 161 | let Some(message) = error_json 162 | .get("error") 163 | .and_then(|e| e.get("message")) 164 | .and_then(|m| m.as_str()) 165 | { 166 | error!("Error message: {}", message); 167 | return Err(format!("Google API error: {}", message).into()); 168 | } 169 | } 170 | 171 | return Err(format!("Google API error ({}): {}", status, error_text).into()); 172 | } 173 | } 174 | Err(e) => { 175 | error!("CRITICAL: Request failed: {}", e); 176 | return Err(format!("Network error: {}", e).into()); 177 | } 178 | } 179 | } 180 | 181 | info!("Google: Successfully generated {} embeddings", results.len()); 182 | Ok(results) 183 | } 184 | 185 | fn get_dimension(&self) -> usize { 186 | self.dimension 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/embedding/models/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod google; 2 | pub mod ollama; 3 | pub mod tei; 4 | -------------------------------------------------------------------------------- /src/embedding/models/ollama.rs: -------------------------------------------------------------------------------- 1 | use crate::embedding::AsyncEmbeddingGenerator; 2 | use async_trait::async_trait; 3 | use log::{ error, info, warn }; 4 | use reqwest::Client as AsyncHttpClient; 5 | use serde_json::{ json, Value }; 6 | use std::{ error::Error as StdError, time::Duration }; 7 | use futures::future::join_all; 8 | 9 | pub struct OllamaEmbeddingClient { 10 | client: AsyncHttpClient, 11 | api_url: String, 12 | model: String, 13 | dimension: usize, 14 | } 15 | 16 | impl OllamaEmbeddingClient { 17 | pub fn new( 18 | base_url: &str, 19 | model: &str, 20 | dimension: usize 21 | ) -> Result> { 22 | let api_url = if base_url.ends_with("/api/embeddings") { 23 | base_url.to_string() 24 | } else { 25 | format!("{}/api/embeddings", base_url.trim_end_matches('/')) 26 | }; 27 | 28 | let client = AsyncHttpClient::builder().timeout(Duration::from_secs(20)).build()?; 29 | 30 | Ok(Self { 31 | client, 32 | api_url, 33 | model: model.to_string(), 34 | dimension, 35 | }) 36 | } 37 | 38 | async fn generate_single_embedding( 39 | &self, 40 | text: &str 41 | ) -> Result, Box> { 42 | let response = self.client 43 | .post(&self.api_url) 44 | .header("Content-Type", "application/json") 45 | .json(&json!({ "model": &self.model, "prompt": text })) 46 | .send().await?; 47 | 48 | if !response.status().is_success() { 49 | let status = response.status(); 50 | let error_body = response 51 | .text().await 52 | .unwrap_or_else(|_| "Failed to read error body".to_string()); 53 | warn!("Ollama API (single) returned status: {}, body: {}", status, error_body); 54 | return Err(format!("Ollama API error: {}", status).into()); 55 | } 56 | 57 | let json_body = response.json::().await?; 58 | if let Some(embedding_array) = json_body["embedding"].as_array() { 59 | let embedding: Vec = embedding_array 60 | .iter() 61 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 62 | .collect(); 63 | 64 | if embedding.is_empty() && self.dimension > 0 { 65 | warn!( 66 | "Ollama returned empty embedding for input '{}', expected dimension {}. 
Returning zero vector.", 67 | text, self.dimension 68 | ); 69 | Ok(vec![0.0; self.dimension]) 70 | } else if embedding.len() != self.dimension { 71 | warn!( 72 | "Ollama returned embedding with dimension {}, expected {}", 73 | embedding.len(), 74 | self.dimension 75 | ); 76 | Err( 77 | format!( 78 | "Dimension mismatch: expected {}, got {}", 79 | self.dimension, 80 | embedding.len() 81 | ).into() 82 | ) 83 | } else { 84 | Ok(embedding) 85 | } 86 | } else { 87 | Err("Unexpected response structure from Ollama API".into()) 88 | } 89 | } 90 | } 91 | 92 | #[async_trait] 93 | impl AsyncEmbeddingGenerator for OllamaEmbeddingClient { 94 | async fn generate_embeddings_batch( 95 | &self, 96 | texts: &[String] 97 | ) -> Result>, Box> { 98 | if texts.is_empty() { 99 | return Ok(vec![]); 100 | } 101 | info!("Ollama: Generating embeddings for {} texts", texts.len()); 102 | 103 | let response = self.client 104 | .post(&self.api_url) 105 | .json(&json!({ "model": &self.model, "prompts": texts })) 106 | .send().await; 107 | 108 | match response { 109 | Ok(resp) if resp.status().is_success() => { 110 | match resp.json::().await { 111 | Ok(parsed) => { 112 | info!("Ollama batch response structure: {:?}", parsed); 113 | 114 | if let Some(embeddings) = parsed.get("embeddings").and_then(|e| e.as_array()) { 115 | let mut result = Vec::with_capacity(embeddings.len()); 116 | let mut success_count = 0; 117 | for (i, emb_val) in embeddings.iter().enumerate() { 118 | if let Some(vector) = emb_val.get("embedding").and_then(|v| v.as_array()) { 119 | let embedding: Vec = vector 120 | .iter() 121 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 122 | .collect(); 123 | 124 | if embedding.len() == self.dimension { 125 | result.push(embedding); 126 | success_count += 1; 127 | } else { 128 | warn!( 129 | "Ollama batch item {} dimension mismatch: expected {}, got {}", 130 | i, 131 | self.dimension, 132 | embedding.len() 133 | ); 134 | result.push(vec![0.0; self.dimension]); 135 | } 136 | } else { 137 | warn!("Ollama batch item {} missing 'embedding' array", i); 138 | result.push(vec![0.0; self.dimension]); 139 | } 140 | } 141 | 142 | if result.len() == texts.len() { 143 | info!( 144 | "Ollama: Successfully processed batch of {} embeddings ({} succeeded)", 145 | result.len(), 146 | success_count 147 | ); 148 | return Ok(result); 149 | } else { 150 | warn!( 151 | "Ollama batch result count mismatch: expected {}, got {}", 152 | texts.len(), 153 | result.len() 154 | ); 155 | } 156 | } 157 | 158 | else if parsed.is_array() { 159 | let array = parsed.as_array().unwrap(); 160 | let mut result = Vec::with_capacity(array.len()); 161 | let mut success_count = 0; 162 | for (i, emb_val) in array.iter().enumerate() { 163 | if let Some(vector) = emb_val.as_array() { 164 | let embedding: Vec = vector 165 | .iter() 166 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 167 | .collect(); 168 | 169 | if embedding.len() == self.dimension { 170 | result.push(embedding); 171 | success_count += 1; 172 | } else { 173 | warn!( 174 | "Ollama batch item {} dimension mismatch: expected {}, got {}", 175 | i, 176 | self.dimension, 177 | embedding.len() 178 | ); 179 | result.push(vec![0.0; self.dimension]); 180 | } 181 | } else { 182 | warn!("Ollama batch item {} missing 'embedding' array", i); 183 | result.push(vec![0.0; self.dimension]); 184 | } 185 | } 186 | 187 | if result.len() == texts.len() { 188 | info!( 189 | "Ollama: Successfully processed batch of {} embeddings ({} succeeded)", 190 | result.len(), 191 | success_count 192 | ); 193 | return 
Ok(result); 194 | } else { 195 | warn!( 196 | "Ollama batch result count mismatch: expected {}, got {}", 197 | texts.len(), 198 | result.len() 199 | ); 200 | } 201 | } 202 | else if let Some(embedding) = parsed.get("embedding") { 203 | if let Some(vector) = embedding.as_array() { 204 | let embedding: Vec = vector 205 | .iter() 206 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 207 | .collect(); 208 | 209 | if embedding.len() == self.dimension { 210 | return Ok(vec![embedding]); 211 | } else { 212 | warn!( 213 | "Ollama single embedding dimension mismatch: expected {}, got {}", 214 | self.dimension, 215 | embedding.len() 216 | ); 217 | } 218 | } else { 219 | warn!("Ollama single embedding missing 'embedding' array"); 220 | } 221 | } else { 222 | warn!("Ollama batch response missing 'embeddings' array"); 223 | } 224 | } 225 | Err(e) => { 226 | warn!("Failed to parse Ollama batch response: {}. Falling back.", e); 227 | } 228 | } 229 | } 230 | Ok(resp) => { 231 | let status = resp.status(); 232 | let error_body = resp 233 | .text().await 234 | .unwrap_or_else(|_| "Failed to read error body".to_string()); 235 | warn!( 236 | "Ollama batch API returned status: {}. Body: {}. Falling back.", 237 | status, 238 | error_body 239 | ); 240 | } 241 | Err(e) => { 242 | warn!("Ollama batch request failed: {}. Falling back.", e); 243 | } 244 | } 245 | 246 | info!( 247 | "Ollama: Using fallback: parallel individual embedding requests for {} texts", 248 | texts.len() 249 | ); 250 | let futures: Vec<_> = texts 251 | .iter() 252 | .map(|text| self.generate_single_embedding(text)) 253 | .collect(); 254 | 255 | let results: Vec, _>> = join_all(futures).await; 256 | 257 | let final_embeddings: Vec> = results 258 | .into_iter() 259 | .map(|res| { 260 | match res { 261 | Ok(embedding) => embedding, 262 | Err(e) => { 263 | error!("Ollama single embedding failed during fallback: {}", e); 264 | vec![0.0; self.dimension] 265 | } 266 | } 267 | }) 268 | .collect(); 269 | 270 | Ok(final_embeddings) 271 | } 272 | 273 | fn get_dimension(&self) -> usize { 274 | self.dimension 275 | } 276 | } 277 | -------------------------------------------------------------------------------- /src/embedding/models/tei.rs: -------------------------------------------------------------------------------- 1 | use crate::embedding::AsyncEmbeddingGenerator; 2 | use async_trait::async_trait; 3 | use log::{ info, error, warn }; 4 | use reqwest::Client; 5 | use serde::Serialize; 6 | use std::error::Error as StdError; 7 | use std::time::Duration; 8 | 9 | #[derive(Serialize)] 10 | struct TeiRequest { 11 | inputs: Vec, 12 | #[serde(skip_serializing_if = "Option::is_none")] 13 | truncate: Option, 14 | } 15 | 16 | type TeiResponse = Vec>; 17 | 18 | pub struct TeiEmbeddingClient { 19 | client: Client, 20 | api_url: String, 21 | dimension: usize, 22 | } 23 | 24 | impl TeiEmbeddingClient { 25 | pub fn new( 26 | api_url: String, 27 | dimension: usize, 28 | timeout_secs: u64 29 | ) -> Result> { 30 | let api_endpoint = if !api_url.ends_with("/embed") { 31 | format!("{}/embed", api_url.trim_end_matches('/')) 32 | } else { 33 | api_url 34 | }; 35 | 36 | warn!("TEI server URL: {}", api_endpoint); 37 | Ok(Self { 38 | client: Client::builder().timeout(Duration::from_secs(timeout_secs)).build()?, 39 | api_url: api_endpoint, 40 | dimension, 41 | }) 42 | } 43 | } 44 | 45 | #[async_trait] 46 | impl AsyncEmbeddingGenerator for TeiEmbeddingClient { 47 | async fn generate_embeddings_batch( 48 | &self, 49 | texts: &[String] 50 | ) -> Result>, Box> { 51 | if 
texts.is_empty() { 52 | return Ok(vec![]); 53 | } 54 | 55 | info!( 56 | "TEI Client: Generating embeddings for {} texts via {}", 57 | texts.len(), 58 | self.api_url 59 | ); 60 | 61 | let request_payload = TeiRequest { 62 | inputs: texts.to_vec(), 63 | truncate: None, 64 | }; 65 | 66 | 67 | let mut retries = 3; 68 | let mut last_error = None; 69 | 70 | while retries > 0 { 71 | match self.client.post(&self.api_url).json(&request_payload).send().await { 72 | Ok(response) => { 73 | if response.status().is_success() { 74 | let embeddings = response.json::().await?; 75 | if embeddings.len() != texts.len() { 76 | error!( 77 | "TEI Client: Mismatch in response length. Expected {}, got {}.", 78 | texts.len(), 79 | embeddings.len() 80 | ); 81 | return Err( 82 | format!( 83 | "TEI response length mismatch: expected {}, got {}", 84 | texts.len(), 85 | embeddings.len() 86 | ).into() 87 | ); 88 | } 89 | for emb in &embeddings { 90 | if emb.len() != self.dimension { 91 | error!( 92 | "TEI Client: Mismatch in embedding dimension. Expected {}, got {}.", 93 | self.dimension, 94 | emb.len() 95 | ); 96 | return Err( 97 | format!( 98 | "TEI dimension mismatch: expected {}, got {}", 99 | self.dimension, 100 | emb.len() 101 | ).into() 102 | ); 103 | } 104 | } 105 | info!("TEI Client: Successfully generated {} embeddings", embeddings.len()); 106 | return Ok(embeddings); 107 | } else { 108 | let status = response.status(); 109 | let error_text = response 110 | .text().await 111 | .unwrap_or_else(|_| "Failed to read error body".to_string()); 112 | error!("TEI server returned error {}: {}", status, error_text); 113 | return Err(format!("TEI server error {}: {}", status, error_text).into()); 114 | } 115 | }, 116 | Err(e) => { 117 | warn!("TEI request failed (retries left: {}): {}", retries - 1, e); 118 | retries -= 1; 119 | last_error = Some(e); 120 | tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; 121 | continue; 122 | } 123 | } 124 | } 125 | 126 | Err(Box::new(std::io::Error::new( 127 | std::io::ErrorKind::Other, 128 | format!("Failed after multiple retries: {}", last_error.unwrap()) 129 | ))) 130 | } 131 | 132 | fn get_dimension(&self) -> usize { 133 | self.dimension 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod db; 2 | pub mod parser; 3 | pub mod embedding; 4 | pub mod cli; 5 | pub mod util; 6 | pub mod workflow; -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | 2 | use db2vec::util; 3 | 4 | use clap::Parser; 5 | use db2vec::cli::Args; 6 | use db2vec::db::select_database; 7 | use dotenvy::dotenv; 8 | 9 | use log::{ info, error }; 10 | use db2vec::util::{ read_file_and_detect_format, logo }; 11 | use db2vec::parser::parse_database_export; 12 | use db2vec::workflow::execute_migration_workflow; 13 | 14 | fn main() -> Result<(), db2vec::db::DbError> { 15 | logo(); 16 | dotenv().ok(); 17 | env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("off")).init(); 18 | let args = Args::parse(); 19 | let file_path = args.dump_file.clone(); 20 | util::init_thread_pool(args.num_threads); 21 | 22 | let (content, format) = match read_file_and_detect_format(&file_path) { 23 | Ok(result) => result, 24 | Err(e) => { 25 | let err_msg = format!("Error reading file '{}': {}", file_path, e); 26 | error!("{}", 
err_msg); 27 | return Err(err_msg.into()); 28 | } 29 | }; 30 | 31 | let records = match parse_database_export(&content, &format, &args) { 32 | Ok(recs) => recs, 33 | Err(e) => { 34 | let err_msg = format!("Error parsing database export: {}", e); 35 | error!("{}", err_msg); 36 | return Err(err_msg.into()); 37 | } 38 | }; 39 | 40 | let database = select_database(&args)?; 41 | match execute_migration_workflow(records, &*database, &args) { 42 | Ok(stats) => { 43 | info!( 44 | "Migration successful: {} records processed in {:.2} seconds", 45 | stats.processed_records, 46 | stats.elapsed_seconds 47 | ); 48 | Ok(()) 49 | } 50 | Err(e) => { 51 | error!("Migration failed: {}", e); 52 | Err(e) 53 | } 54 | } 55 | } -------------------------------------------------------------------------------- /src/parser/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::cli::Args; 2 | 3 | use log::{ info, warn, debug }; 4 | use parse_regex::mssql::parse_mssql; 5 | use parse_regex::mysql::parse_mysql; 6 | use parse_regex::oracle::parse_oracle; 7 | use parse_regex::postgres::parse_postgres; 8 | use parse_regex::sqlite::parse_sqlite; 9 | use parse_regex::surreal::parse_surreal; 10 | use serde_json::Value; 11 | use std::error::Error; 12 | 13 | pub mod parse_regex; 14 | pub trait ExportParser { 15 | fn parse(&self, content: &str) -> Result<Vec<Value>, Box<dyn Error>>; 16 | } 17 | 18 | pub fn parse_database_export( 19 | content: &str, 20 | format: &str, 21 | args: &Args 22 | ) -> Result<Vec<Value>, Box<dyn Error>> { 23 | let mut all_records = Vec::new(); 24 | 25 | let excluder = if args.use_exclude { 26 | Some(crate::util::exclude::Excluder::load("config/exclude.json")) 27 | } else { 28 | None 29 | }; 30 | 31 | let chunks: Vec<String> = match format { 32 | "mssql" | "postgres" | "mysql" | "surreal" | "sqlite" => { 33 | info!("Processing {} file without chunking", format); 34 | vec![content.to_string()] 35 | } 36 | "oracle" => { 37 | content 38 | .split("Insert into") 39 | .filter(|s| !s.trim().is_empty()) 40 | .enumerate() 41 | .map(|(i, s)| { 42 | if i > 0 { format!("Insert into{}", s) } else { s.to_string() } 43 | }) 44 | .collect() 45 | } 46 | _ => { 47 | warn!("Using default (single chunk) processing for unknown format: {}", format); 48 | vec![content.to_string()] 49 | } 50 | }; 51 | 52 | info!("Found {} chunks to process for format '{}'", chunks.len(), format); 53 | 54 | for (i, chunk) in chunks.iter().enumerate() { 55 | if chunk.trim().is_empty() { 56 | debug!("Skipping empty chunk {}", i); 57 | continue; 58 | } 59 | 60 | match parse_with_regex(&chunk, format, args) { 61 | Some(mut records) => { 62 | if !records.is_empty() { 63 | if let Some(ref excl) = excluder { 64 | debug!("Filtering fields for {} records from chunk {}", records.len(), i); 65 | for record in &mut records { 66 | excl.filter_record(record); 67 | } 68 | } 69 | 70 | info!("Parsed {} records from chunk {}", records.len(), i); 71 | if args.debug { 72 | for (j, rec) in records.iter().enumerate() { 73 | debug!("Debug: Record {} in chunk {}: {}", j, i, rec); 74 | } 75 | } 76 | all_records.extend(records); 77 | } else { 78 | debug!("Regex parsing yielded 0 records for chunk {}", i); 79 | } 80 | } 81 | None => { 82 | 83 | if args.debug && chunk.len() < 1000 { 84 | debug!("Content of failed chunk {}:\n{}", i, chunk); 85 | } else if args.debug { 86 | debug!( 87 | "Content of failed chunk {} (truncated):\n{}...", 88 | i, 89 | &chunk[..std::cmp::min(chunk.len(), 1000)] 90 | ); 91 | } 92 | } 93 | } 94 | } 95 | 96 | info!("Total records parsed: {}",
all_records.len()); 97 | Ok(all_records) 98 | } 99 | 100 | pub fn detect_format(file_path: &str, content: &str) -> String { 101 | let _content_lower = content.to_lowercase(); 102 | 103 | if file_path.ends_with(".surql") { 104 | return "surreal".to_string(); 105 | } 106 | 107 | // Oracle distinctive patterns 108 | if 109 | content.contains("REM INSERTING into") || 110 | content.contains("SET DEFINE OFF;") || 111 | content.contains("Insert into ") || 112 | (content.contains("CREATE TABLE \"") && 113 | content.contains("PCTFREE") && 114 | content.contains("TABLESPACE")) || 115 | content.contains("BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT") || 116 | content.contains("USING INDEX PCTFREE") || 117 | content.contains("ALTER SESSION SET EVENTS") || 118 | content.contains("DBMS_LOGREP_IMP") 119 | { 120 | return "oracle".to_string(); 121 | } 122 | 123 | // PostgreSQL distinctive patterns 124 | if 125 | (content.contains("COPY ") && content.contains(" FROM stdin;")) || 126 | content.contains("PostgreSQL database dump") || 127 | (content.contains("SET ") && content.contains("standard_conforming_strings")) || 128 | content.contains("ALTER TABLE ONLY") || 129 | (content.contains("CREATE TYPE") && content.contains("AS ENUM")) || 130 | (content.contains("CREATE SEQUENCE") && content.contains("OWNED BY")) 131 | { 132 | return "postgres".to_string(); 133 | } 134 | 135 | // SQLite distinctive patterns 136 | if 137 | content.starts_with("PRAGMA foreign_keys=OFF;") || 138 | (content.contains("BEGIN TRANSACTION;") && 139 | content.contains("COMMIT;") && 140 | content.contains("CREATE TABLE") && 141 | !content.contains("ENGINE=InnoDB") && 142 | !content.contains("TABLESPACE") && 143 | content.contains("INSERT INTO ")) || 144 | content.contains("sqlite_sequence") 145 | { 146 | return "sqlite".to_string(); 147 | } 148 | 149 | // MSSQL distinctive patterns 150 | if 151 | content.contains("SET ANSI_NULLS ON") || 152 | content.contains("SET QUOTED_IDENTIFIER ON") || 153 | content.contains("CREATE TABLE [dbo].") || 154 | content.contains("INSERT [dbo].") || 155 | content.contains("WITH (PAD_INDEX = OFF") || 156 | content.contains("GO") 157 | { 158 | return "mssql".to_string(); 159 | } 160 | 161 | // MySQL distinctive patterns 162 | if 163 | content.contains("ENGINE=InnoDB") || 164 | content.contains("LOCK TABLES") || 165 | content.contains("/*!40") || 166 | content.contains("AUTO_INCREMENT") || 167 | content.contains("COLLATE=utf8mb4") 168 | { 169 | return "mysql".to_string(); 170 | } 171 | 172 | "json".to_string() 173 | } 174 | 175 | pub fn parse_with_regex(chunk: &str, format: &str, args: &Args) -> Option> { 176 | match format { 177 | "surreal" => parse_surreal(chunk, args), 178 | "mysql" => parse_mysql(chunk, args), 179 | "postgres" => parse_postgres(chunk, args), 180 | "oracle" => parse_oracle(chunk, args), 181 | "sqlite" => parse_sqlite(chunk, args), 182 | "mssql" => parse_mssql(chunk, args), 183 | _ => None, 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/parser/parse_regex/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod mysql; 2 | pub mod postgres; 3 | pub mod oracle; 4 | pub mod surreal; 5 | pub mod sqlite; 6 | pub mod mssql; 7 | use serde_json::Value; 8 | 9 | pub fn clean_html_in_value(val: &mut Value) { 10 | match val { 11 | Value::String(s) => { 12 | if s.contains('<') && s.contains('>') { 13 | *s = html2text 14 | ::from_read(s.as_bytes(), usize::MAX) 15 | 
.unwrap_or_else(|_| s.clone()) 16 | .replace('\n', " ") 17 | .split_whitespace() 18 | .collect::>() 19 | .join(" ") 20 | .trim() 21 | .to_string(); 22 | } 23 | } 24 | Value::Array(arr) => { 25 | for v in arr { 26 | clean_html_in_value(v); 27 | } 28 | } 29 | Value::Object(obj) => { 30 | for v in obj.values_mut() { 31 | clean_html_in_value(v); 32 | } 33 | } 34 | _ => {} 35 | } 36 | } 37 | 38 | pub fn extract_json_array(text: &str) -> Option<&str> { 39 | let open_bracket = text.find('[')?; 40 | let mut depth = 1; 41 | let mut in_string = false; 42 | let mut escape_next = false; 43 | 44 | for (i, c) in text[open_bracket + 1..].chars().enumerate() { 45 | match c { 46 | '[' if !in_string => { 47 | depth += 1; 48 | } 49 | ']' if !in_string => { 50 | depth -= 1; 51 | if depth == 0 { 52 | return Some(&text[open_bracket..=open_bracket + i + 1]); 53 | } 54 | } 55 | '"' if !escape_next => { 56 | in_string = !in_string; 57 | } 58 | '\\' if in_string && !escape_next => { 59 | escape_next = true; 60 | } 61 | _ => { 62 | escape_next = false; 63 | } 64 | } 65 | } 66 | 67 | None 68 | } 69 | pub fn parse_array(array_str: &str) -> Option { 70 | let content = array_str.get(1..array_str.len() - 1)?; 71 | if content.is_empty() { 72 | return Some(Value::Array(vec![])); 73 | } 74 | let mut elements = Vec::new(); 75 | let mut current_element = String::new(); 76 | let mut chars = content.chars().peekable(); 77 | let mut in_quotes = false; 78 | let mut escape_next = false; 79 | 80 | while let Some(c) = chars.next() { 81 | if escape_next { 82 | current_element.push(c); 83 | escape_next = false; 84 | } else if c == '\\' { 85 | escape_next = true; 86 | } else if c == '"' { 87 | in_quotes = !in_quotes; 88 | } else if c == ',' && !in_quotes { 89 | elements.push(Value::String(current_element.trim().to_string())); 90 | current_element.clear(); 91 | } else { 92 | current_element.push(c); 93 | } 94 | } 95 | 96 | elements.push(Value::String(current_element.trim().to_string())); 97 | 98 | Some(Value::Array(elements)) 99 | } 100 | -------------------------------------------------------------------------------- /src/parser/parse_regex/mssql.rs: -------------------------------------------------------------------------------- 1 | use log::{ debug, info }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::clean_html_in_value; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_mssql(chunk: &str, args: &Args) -> Option> { 9 | info!("Using parse method: MSSQL"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let insert_re = Regex::new( 19 | r"(?is)INSERT\s+\[(?:dbo|DB_OWNER)\]\.\[(\w+)\]\s*(?:\((.*?)\))?\s*VALUES\s*" 20 | ).ok()?; 21 | let values_re = Regex::new(r"\(((?:[^()]*|\((?:[^()]*|\([^()]*\))*\))*)\)").ok()?; 22 | 23 | for cap in insert_re.captures_iter(chunk) { 24 | let table = cap.get(1)?.as_str(); 25 | 26 | if let Some(ref excl) = excluder { 27 | if excl.ignore_table(table) { 28 | info!("Skipping excluded MSSQL table: {}", table); 29 | continue; 30 | } 31 | } 32 | 33 | info!("Processing INSERT for MSSQL table: {}", table); 34 | 35 | let column_names: Vec = if let Some(cols_match) = cap.get(2) { 36 | cols_match 37 | .as_str() 38 | .split(',') 39 | .map(|c| 40 | c 41 | .trim() 42 | .trim_matches(&['[', ']'][..]) 43 | .to_string() 44 | ) 45 | .collect() 46 | } else { 47 | Vec::new() 48 | }; 49 | 50 | let insert_statement_end = 
cap.get(0)?.end(); 51 | let search_area = &chunk[insert_statement_end..]; 52 | 53 | for values_cap in values_re.captures_iter(search_area) { 54 | let values_str = values_cap.get(1)?.as_str(); 55 | let mut fields = Vec::new(); 56 | let mut current = String::new(); 57 | let mut in_string = false; 58 | let mut in_cast = 0; 59 | let mut escape_next = false; 60 | 61 | for c in values_str.chars() { 62 | if escape_next { 63 | current.push(c); 64 | escape_next = false; 65 | } else if c == '\\' { 66 | current.push(c); 67 | escape_next = true; 68 | } else if c == '\'' { 69 | current.push(c); 70 | if !in_string && current.ends_with("N'") { 71 | in_string = true; 72 | } else if in_string { 73 | if let Some(next_char) = search_area.chars().nth(current.len()) { 74 | if next_char == '\'' { 75 | continue; 76 | } 77 | } 78 | in_string = false; 79 | } 80 | } else if c == '(' { 81 | current.push(c); 82 | if current.contains("CAST") { 83 | in_cast += 1; 84 | } 85 | } else if c == ')' { 86 | current.push(c); 87 | if in_cast > 0 { 88 | in_cast -= 1; 89 | } 90 | } else if c == ',' && !in_string && in_cast == 0 { 91 | fields.push(current.trim().to_string()); 92 | current.clear(); 93 | } else { 94 | current.push(c); 95 | } 96 | } 97 | 98 | if !current.is_empty() { 99 | fields.push(current.trim().to_string()); 100 | } 101 | 102 | let col_names = if !column_names.is_empty() { 103 | column_names.clone() 104 | } else { 105 | (0..fields.len()) 106 | .map(|i| { 107 | match i { 108 | 0 => "id".to_string(), 109 | 1 => "name".to_string(), 110 | 2 => "description".to_string(), 111 | _ => format!("column{}", i), 112 | } 113 | }) 114 | .collect() 115 | }; 116 | 117 | if fields.len() != col_names.len() { 118 | continue; 119 | } 120 | 121 | let mut obj = serde_json::Map::new(); 122 | obj.insert("table".to_string(), Value::String(table.to_string())); 123 | 124 | for (i, val_str) in fields.iter().enumerate() { 125 | if i >= col_names.len() { 126 | continue; 127 | } 128 | 129 | let value = parse_mssql_value(val_str); 130 | obj.insert(col_names[i].clone(), value); 131 | } 132 | 133 | let id_key = obj 134 | .keys() 135 | .find(|k| k.eq_ignore_ascii_case("id")) 136 | .cloned(); 137 | if let Some(key) = id_key { 138 | obj.remove(&key); 139 | debug!("Removed 'id' field (key: {}) from MSSQL record", key); 140 | } 141 | 142 | if obj.len() > 1 { 143 | let mut final_value = Value::Object(obj); 144 | clean_html_in_value(&mut final_value); 145 | records.push(final_value); 146 | } 147 | } 148 | } 149 | 150 | if records.is_empty() { 151 | None 152 | } else { 153 | Some(records) 154 | } 155 | } 156 | 157 | fn parse_mssql_value(val_str: &str) -> Value { 158 | if val_str == "NULL" { 159 | return Value::Null; 160 | } 161 | if val_str.starts_with("N'") && val_str.ends_with("'") { 162 | let inner_str = &val_str[2..val_str.len() - 1].replace("''", "'"); 163 | 164 | if 165 | (inner_str.starts_with("[") && inner_str.ends_with("]")) || 166 | (inner_str.starts_with("{") && inner_str.ends_with("}")) 167 | { 168 | if let Ok(json_val) = serde_json::from_str(inner_str) { 169 | return json_val; 170 | } 171 | } 172 | 173 | return Value::String(inner_str.to_string()); 174 | } 175 | 176 | if val_str.starts_with("CAST(") { 177 | let re = Regex::new(r"CAST\(\s*N?'?(.*?)'?\s+AS").ok(); 178 | if let Some(re) = re { 179 | if let Some(cap) = re.captures(val_str) { 180 | if let Some(m) = cap.get(1) { 181 | return parse_mssql_value(m.as_str()); 182 | } 183 | } 184 | } 185 | 186 | return Value::String(val_str.to_string()); 187 | } 188 | 189 | if val_str == "0" || val_str 
== "1" { 190 | if let Ok(b) = val_str.parse::() { 191 | return Value::Bool(b != 0); 192 | } 193 | } 194 | 195 | if let Ok(i) = val_str.parse::() { 196 | return Value::Number(i.into()); 197 | } 198 | 199 | if let Ok(f) = val_str.parse::() { 200 | if let Some(n) = serde_json::Number::from_f64(f) { 201 | return Value::Number(n); 202 | } 203 | } 204 | 205 | Value::String(val_str.to_string()) 206 | } 207 | -------------------------------------------------------------------------------- /src/parser/parse_regex/mysql.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::{ clean_html_in_value, parse_array }; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_mysql(chunk: &str, args: &Args) -> Option> { 9 | info!("Using parse method: MySQL"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let insert_re = Regex::new( 19 | r#"(?is)INSERT INTO\s+[`'\"]?(\w+)[`'\"]?\s*(?:\(([^)]+)\))?\s*VALUES\s*(.*?);"# 20 | ).ok()?; 21 | 22 | let row_re = Regex::new(r"\((.*?)\)").ok()?; 23 | 24 | for cap in insert_re.captures_iter(chunk) { 25 | let table = cap.get(1)?.as_str(); 26 | 27 | if let Some(ref excl) = excluder { 28 | if excl.ignore_table(table) { 29 | info!("Skipping excluded MySQL table: {}", table); 30 | continue; 31 | } 32 | } 33 | 34 | let column_names: Vec = if let Some(cols_match) = cap.get(2) { 35 | cols_match 36 | .as_str() 37 | .split(',') 38 | .map(|c| 39 | c 40 | .trim() 41 | .trim_matches(&['`', '\'', '"'][..]) 42 | .to_string() 43 | ) 44 | .collect() 45 | } else { 46 | Vec::new() 47 | }; 48 | 49 | let values_str = cap.get(3)?.as_str(); 50 | let mut inferred_column_count = 0; 51 | let mut first_row_processed = false; 52 | 53 | for row_cap in row_re.captures_iter(values_str) { 54 | let row = row_cap.get(1)?.as_str(); 55 | let mut fields = Vec::new(); 56 | let mut current = String::new(); 57 | let mut in_string = false; 58 | let mut escape_next = false; 59 | 60 | for c in row.chars() { 61 | if escape_next { 62 | current.push(c); 63 | escape_next = false; 64 | } else if c == '\\' { 65 | current.push(c); 66 | escape_next = true; 67 | } else if c == '\'' { 68 | current.push(c); 69 | in_string = !in_string; 70 | } else if c == ',' && !in_string { 71 | fields.push(current.trim().to_string()); 72 | current.clear(); 73 | } else { 74 | current.push(c); 75 | } 76 | } 77 | if !current.is_empty() { 78 | fields.push(current.trim().to_string()); 79 | } 80 | 81 | let col_names = if !column_names.is_empty() { 82 | column_names.clone() 83 | } else if !first_row_processed { 84 | first_row_processed = true; 85 | inferred_column_count = fields.len(); 86 | 87 | let mut default_cols = Vec::with_capacity(fields.len()); 88 | for i in 0..fields.len() { 89 | let col_name = match i { 90 | 0 => "id".to_string(), 91 | 1 => "name".to_string(), 92 | 2 => "description".to_string(), 93 | _ => format!("column{}", i), 94 | }; 95 | default_cols.push(col_name); 96 | } 97 | default_cols 98 | } else { 99 | (0..inferred_column_count) 100 | .map(|i| { 101 | match i { 102 | 0 => "id".to_string(), 103 | 1 => "name".to_string(), 104 | 2 => "description".to_string(), 105 | _ => format!("column{}", i), 106 | } 107 | }) 108 | .collect() 109 | }; 110 | 111 | let mut obj = serde_json::Map::new(); 112 | obj.insert("table".to_string(), 
Value::String(table.to_string())); 113 | 114 | for (i, val_str) in fields.iter().enumerate() { 115 | if i >= col_names.len() { 116 | warn!("More values than columns for table '{}', value: '{}'", table, val_str); 117 | continue; 118 | } 119 | 120 | let mut value = Value::Null; 121 | 122 | if val_str == "NULL" { 123 | } else if val_str.starts_with('\'') && val_str.ends_with('\'') { 124 | let inner_str = val_str.trim_matches('\''); 125 | let unescaped_mysql_str = inner_str 126 | .replace("''", "'") 127 | .replace("\\\\", "\\") 128 | .replace("\\'", "'"); 129 | 130 | if 131 | (unescaped_mysql_str.starts_with('[') && 132 | unescaped_mysql_str.ends_with(']')) || 133 | (unescaped_mysql_str.starts_with('{') && unescaped_mysql_str.ends_with('}')) 134 | { 135 | let potential_json_str = unescaped_mysql_str.replace("\\\"", "\""); 136 | match serde_json::from_str::(&potential_json_str) { 137 | Ok(json_value) => { 138 | value = json_value; 139 | } 140 | Err(_) => { 141 | value = Value::String(unescaped_mysql_str); 142 | } 143 | } 144 | } else { 145 | value = Value::String(unescaped_mysql_str); 146 | } 147 | } else if val_str.starts_with('{') && val_str.ends_with('}') { 148 | match serde_json::from_str::(val_str) { 149 | Ok(json_val) => { 150 | value = json_val; 151 | } 152 | Err(_) => { 153 | value = parse_array(val_str).unwrap_or( 154 | Value::String(val_str.to_string()) 155 | ); 156 | } 157 | } 158 | } else if val_str.starts_with('[') && val_str.ends_with(']') { 159 | match serde_json::from_str::(val_str) { 160 | Ok(json_val) => { 161 | value = json_val; 162 | } 163 | Err(_) => { 164 | value = Value::String(val_str.to_string()); 165 | } 166 | } 167 | } else if let Ok(n) = val_str.parse::() { 168 | value = Value::Number(n.into()); 169 | } else if let Ok(f) = val_str.parse::() { 170 | value = Value::Number( 171 | serde_json::Number::from_f64(f).unwrap_or_else(|| (0).into()) 172 | ); 173 | } else { 174 | value = Value::String(val_str.to_string()); 175 | } 176 | 177 | obj.insert(col_names[i].clone(), value); 178 | } 179 | 180 | let id_key = obj 181 | .keys() 182 | .find(|k| k.eq_ignore_ascii_case("id")) 183 | .cloned(); 184 | if let Some(key) = id_key { 185 | obj.remove(&key); 186 | debug!("Removed 'id' field (key: {}) from MySQL record", key); 187 | } 188 | 189 | if obj.len() > 1 { 190 | let mut final_value = Value::Object(obj); 191 | clean_html_in_value(&mut final_value); 192 | records.push(final_value); 193 | } else { 194 | warn!("Skipping MySQL record for table '{}', too few fields after processing", table); 195 | } 196 | } 197 | } 198 | 199 | if records.is_empty() { 200 | None 201 | } else { 202 | Some(records) 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /src/parser/parse_regex/oracle.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::clean_html_in_value; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_oracle(content: &str, args: &Args) -> Option> { 9 | info!("Using parse method: Oracle"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let insert_re = Regex::new( 19 | r#"(?is)Insert\s+into\s+([\w\.\"]+)\s+\(([^)]+)\)\s+values\s+\(([^;]+)\);"# 20 | ).ok()?; 21 | 22 | for cap in insert_re.captures_iter(content) { 23 | let 
full_table = cap.get(1)?.as_str(); 24 | 25 | let table = ( 26 | if full_table.contains('.') { 27 | full_table.split('.').last().unwrap_or(full_table) 28 | } else { 29 | full_table 30 | } 31 | ).trim_matches('"'); 32 | 33 | if let Some(ref excl) = excluder { 34 | if excl.ignore_table(table) { 35 | info!("Skipping excluded Oracle table: {}", table); 36 | continue; 37 | } 38 | } 39 | 40 | debug!("Processing Oracle INSERT for table: {}", table); 41 | 42 | let columns: Vec<&str> = cap 43 | .get(2)? 44 | .as_str() 45 | .split(',') 46 | .map(|s| s.trim().trim_matches('"')) 47 | .collect(); 48 | 49 | let values_str = cap.get(3)?.as_str(); 50 | let mut fields = Vec::new(); 51 | let mut current = String::new(); 52 | let mut in_string = false; 53 | let mut function_depth = 0; 54 | let mut chars = values_str.chars().peekable(); 55 | 56 | while let Some(c) = chars.next() { 57 | match c { 58 | '\'' if !in_string => { 59 | current.push('\''); 60 | in_string = true; 61 | } 62 | '\'' if in_string => { 63 | if chars.peek() == Some(&'\'') { 64 | current.push('\''); 65 | current.push('\''); 66 | chars.next(); 67 | } else { 68 | current.push('\''); 69 | in_string = false; 70 | } 71 | } 72 | '(' => { 73 | current.push('('); 74 | if !in_string { 75 | function_depth += 1; 76 | } 77 | } 78 | ')' => { 79 | current.push(')'); 80 | if !in_string && function_depth > 0 { 81 | function_depth -= 1; 82 | } 83 | } 84 | ',' if !in_string && function_depth == 0 => { 85 | fields.push(current.trim().to_string()); 86 | current.clear(); 87 | } 88 | _ => current.push(c), 89 | } 90 | } 91 | 92 | if !current.is_empty() || fields.len() < columns.len() { 93 | fields.push(current.trim().to_string()); 94 | } 95 | 96 | if fields.len() != columns.len() { 97 | warn!( 98 | "Mismatched number of columns ({}) and values ({}) for table '{}'. 
Row values: '{}'", 99 | columns.len(), 100 | fields.len(), 101 | table, 102 | values_str 103 | ); 104 | continue; 105 | } 106 | 107 | let mut obj = serde_json::Map::new(); 108 | obj.insert("table".to_string(), Value::String(table.to_string())); 109 | 110 | for (col, val) in columns.iter().zip(fields.iter()) { 111 | let parsed_value = parse_oracle_value(val); 112 | obj.insert(col.to_string(), parsed_value); 113 | } 114 | 115 | let id_key = obj 116 | .keys() 117 | .find(|k| k.eq_ignore_ascii_case("id")) 118 | .cloned(); 119 | if let Some(key) = id_key { 120 | obj.remove(&key); 121 | debug!("Removed 'id' field (key: {}) from Oracle record", key); 122 | } 123 | 124 | if obj.len() > 1 { 125 | let mut final_value = Value::Object(obj); 126 | clean_html_in_value(&mut final_value); 127 | records.push(final_value); 128 | } else { 129 | warn!("Skipping Oracle record for table '{}', too few fields after processing", table); 130 | } 131 | } 132 | 133 | if records.is_empty() { 134 | None 135 | } else { 136 | Some(records) 137 | } 138 | } 139 | 140 | fn parse_oracle_value(val_str: &str) -> Value { 141 | if val_str.eq_ignore_ascii_case("NULL") { 142 | return Value::Null; 143 | } 144 | if val_str.starts_with('\'') && val_str.ends_with('\'') { 145 | let inner_str = &val_str[1..val_str.len() - 1].replace("''", "'"); 146 | 147 | if 148 | (inner_str.starts_with('{') && inner_str.ends_with('}')) || 149 | (inner_str.starts_with('[') && inner_str.ends_with(']')) 150 | { 151 | if let Ok(json_val) = serde_json::from_str(inner_str) { 152 | return json_val; 153 | } 154 | } 155 | 156 | return Value::String(inner_str.to_string()); 157 | } 158 | 159 | if val_str.starts_with("to_timestamp(") { 160 | let timestamp_re = Regex::new(r"to_timestamp\('([^']+)'").ok(); 161 | if let Some(re) = timestamp_re { 162 | if let Some(cap) = re.captures(val_str) { 163 | if let Some(date_match) = cap.get(1) { 164 | return Value::String(date_match.as_str().to_string()); 165 | } 166 | } 167 | } 168 | return Value::String("timestamp_parse_error".to_string()); 169 | } 170 | 171 | if let Ok(i) = val_str.parse::() { 172 | return Value::Number(i.into()); 173 | } 174 | 175 | if let Ok(f) = val_str.parse::() { 176 | if let Some(n) = serde_json::Number::from_f64(f) { 177 | return Value::Number(n); 178 | } 179 | } 180 | 181 | Value::String(val_str.to_string()) 182 | } 183 | -------------------------------------------------------------------------------- /src/parser/parse_regex/postgres.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::{ clean_html_in_value, parse_array }; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_postgres(content: &str, args: &Args) -> Option> { 9 | info!("Using parse method: Postgres"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let copy_re = Regex::new( 19 | r"COPY\s+public\.([a-zA-Z0-9_]+)\s*\(([^)]+)\)\s+FROM stdin;\n((?s:.*?))\n\\\." 20 | ).ok()?; 21 | 22 | for cap in copy_re.captures_iter(content) { 23 | let table = cap.get(1)?.as_str(); 24 | 25 | if let Some(ref excl) = excluder { 26 | if excl.ignore_table(table) { 27 | info!("Skipping excluded Postgres table: {}", table); 28 | continue; 29 | } 30 | } 31 | 32 | let columns: Vec<&str> = cap 33 | .get(2)? 
34 | .as_str() 35 | .split(',') 36 | .map(|s| s.trim()) 37 | .collect(); 38 | let rows = cap.get(3)?.as_str(); 39 | 40 | for line in rows.lines() { 41 | if line.trim().is_empty() { 42 | continue; 43 | } 44 | 45 | let fields: Vec<&str> = line.split('\t').collect(); 46 | if fields.len() != columns.len() { 47 | warn!( 48 | "Warning: Mismatched number of columns ({}) and values ({}) for table '{}' in COPY data. Line: '{}'", 49 | columns.len(), 50 | fields.len(), 51 | table, 52 | line 53 | ); 54 | continue; 55 | } 56 | let mut obj = serde_json::Map::new(); 57 | obj.insert("table".to_string(), Value::String(table.to_string())); 58 | 59 | for (col, val_str) in columns.iter().zip(fields.iter()) { 60 | let value = if *val_str == r"\N" { 61 | Value::Null 62 | } else if 63 | (val_str.starts_with('{') && val_str.ends_with('}')) || 64 | (val_str.starts_with('[') && val_str.ends_with(']')) 65 | { 66 | match serde_json::from_str::(val_str) { 67 | Ok(json_val) => json_val, 68 | Err(_) => { 69 | if val_str.starts_with('{') && val_str.ends_with('}') { 70 | parse_array(val_str).unwrap_or(Value::String(val_str.to_string())) 71 | } else { 72 | Value::String(val_str.to_string()) 73 | } 74 | } 75 | } 76 | } else { 77 | let unescaped_val = val_str 78 | .replace("\\\\", "\\") 79 | .replace("\\t", "\t") 80 | .replace("\\n", "\n"); 81 | Value::String(unescaped_val) 82 | }; 83 | obj.insert(col.trim().to_string(), value); 84 | } 85 | 86 | let id_key = obj 87 | .keys() 88 | .find(|k| k.eq_ignore_ascii_case("id")) 89 | .cloned(); 90 | if let Some(key) = id_key { 91 | obj.remove(&key); 92 | debug!("Removed 'id' field (key: {}) from Postgres record", key); 93 | } 94 | 95 | if obj.len() > 1 { 96 | let mut final_value = Value::Object(obj); 97 | clean_html_in_value(&mut final_value); 98 | records.push(final_value); 99 | } else { 100 | warn!( 101 | "Skipping Postgres record for table '{}', became empty after removing ID. 
Original line: '{}'", 102 | table, 103 | line 104 | ); 105 | } 106 | } 107 | } 108 | 109 | if records.is_empty() { 110 | None 111 | } else { 112 | Some(records) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/parser/parse_regex/sqlite.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::clean_html_in_value; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_sqlite(chunk: &str, args: &Args) -> Option> { 9 | info!("Using parse method: SQLite"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let create_re = Regex::new( 19 | r"(?is)CREATE TABLE\s+(?:IF NOT EXISTS\s+)?(?:`?(\w+)`?|(\w+))\s*\((.*?)\);" 20 | ).ok()?; 21 | 22 | let column_def_re = Regex::new(r"^\s*(?:`?(\w+)`?|(\w+))\s+").ok()?; 23 | let mut table_columns = std::collections::HashMap::new(); 24 | 25 | for cap in create_re.captures_iter(chunk) { 26 | let table_name = cap 27 | .get(1) 28 | .or_else(|| cap.get(2)) 29 | .map(|m| m.as_str()); 30 | let cols_def_match = cap.get(3); 31 | 32 | if let (Some(table_name), Some(cols_def_match)) = (table_name, cols_def_match) { 33 | let cols_def = cols_def_match.as_str(); 34 | let mut cols = Vec::new(); 35 | for line in cols_def.lines() { 36 | let trimmed_line = line.trim(); 37 | if 38 | trimmed_line.starts_with("--") || 39 | trimmed_line.starts_with("PRIMARY") || 40 | trimmed_line.starts_with("UNIQUE") || 41 | trimmed_line.starts_with("CHECK") || 42 | trimmed_line.starts_with("FOREIGN") || 43 | trimmed_line.is_empty() 44 | { 45 | continue; 46 | } 47 | if let Some(col_cap) = column_def_re.captures(trimmed_line) { 48 | if let Some(col_name) = col_cap.get(1).or_else(|| col_cap.get(2)) { 49 | cols.push(col_name.as_str().to_string()); 50 | } 51 | } 52 | } 53 | if !cols.is_empty() { 54 | debug!("Found columns for table '{}': {:?}", table_name, cols); 55 | table_columns.insert(table_name.to_string(), cols); 56 | } 57 | } 58 | } 59 | 60 | if table_columns.is_empty() { 61 | warn!("Could not parse any CREATE TABLE statements to find column names in SQLite chunk."); 62 | return None; 63 | } 64 | 65 | let insert_re = Regex::new( 66 | r"(?is)INSERT INTO\s+(?:`?(\w+)`?|(\w+))\s+VALUES\s*\((.*?)\);" 67 | ).ok()?; 68 | 69 | for cap in insert_re.captures_iter(chunk) { 70 | let table = match cap.get(1).or_else(|| cap.get(2)) { 71 | Some(t) => t.as_str(), 72 | None => { 73 | continue; 74 | } 75 | }; 76 | 77 | if table == "sqlite_sequence" { 78 | continue; 79 | } 80 | 81 | if let Some(ref excl) = excluder { 82 | if excl.ignore_table(table) { 83 | info!("Skipping excluded SQLite table: {}", table); 84 | continue; 85 | } 86 | } 87 | 88 | let columns = match table_columns.get(table) { 89 | Some(cols) => cols, 90 | None => { 91 | warn!("Skipping INSERT for table '{}' because columns were not found (CREATE TABLE missing or unparsed).", table); 92 | continue; 93 | } 94 | }; 95 | let values_str = cap.get(3).map_or("", |m| m.as_str()); 96 | let mut fields = Vec::new(); 97 | let mut current_field = String::new(); 98 | let mut in_string = false; 99 | let mut chars = values_str.chars().peekable(); 100 | 101 | while let Some(c) = chars.next() { 102 | if c == '\'' { 103 | if in_string && chars.peek() == Some(&'\'') { 104 | current_field.push(c); 105 | 
chars.next(); 106 | } else { 107 | in_string = !in_string; 108 | } 109 | current_field.push(c); 110 | } else if c == ',' && !in_string { 111 | fields.push(current_field.trim().to_string()); 112 | current_field.clear(); 113 | } else { 114 | current_field.push(c); 115 | } 116 | } 117 | 118 | fields.push(current_field.trim().to_string()); 119 | 120 | if fields.len() != columns.len() { 121 | warn!( 122 | "Mismatched number of columns ({}) and values ({}) for table '{}'. Row: '{}'", 123 | columns.len(), 124 | fields.len(), 125 | table, 126 | values_str 127 | ); 128 | continue; 129 | } 130 | 131 | let mut obj = serde_json::Map::new(); 132 | obj.insert("table".to_string(), Value::String(table.to_string())); 133 | 134 | for (i, col) in columns.iter().enumerate() { 135 | let val_str = &fields[i]; 136 | let mut value = Value::Null; 137 | if val_str == "NULL" { 138 | } else if val_str.starts_with('\'') && val_str.ends_with('\'') && val_str.len() >= 2 { 139 | let inner_str = &val_str[1..val_str.len() - 1]; 140 | let unescaped_str = inner_str.replace("''", "'"); 141 | 142 | if 143 | (unescaped_str.starts_with('[') && unescaped_str.ends_with(']')) || 144 | (unescaped_str.starts_with('{') && unescaped_str.ends_with('}')) 145 | { 146 | match serde_json::from_str::(&unescaped_str) { 147 | Ok(json_value) => { 148 | value = json_value; 149 | } 150 | Err(_) => { 151 | value = Value::String(unescaped_str); 152 | } 153 | } 154 | } else { 155 | value = Value::String(unescaped_str); 156 | } 157 | } else if let Ok(n) = val_str.parse::() { 158 | value = Value::Number(n.into()); 159 | } else if let Ok(f) = val_str.parse::() { 160 | value = Value::Number( 161 | serde_json::Number::from_f64(f).unwrap_or_else(|| (0).into()) 162 | ); 163 | } else { 164 | warn!( 165 | "Unrecognized value format for column '{}' in table '{}': {}", 166 | col, 167 | table, 168 | val_str 169 | ); 170 | value = Value::String(val_str.to_string()); 171 | } 172 | obj.insert(col.clone(), value); 173 | } 174 | 175 | let id_key = obj 176 | .keys() 177 | .find(|k| k.eq_ignore_ascii_case("id")) 178 | .cloned(); 179 | if let Some(key) = id_key { 180 | obj.remove(&key); 181 | debug!("Removed 'id' field (key: {}) from SQLite record", key); 182 | } 183 | 184 | if obj.len() > 1 { 185 | let mut final_value = Value::Object(obj); 186 | clean_html_in_value(&mut final_value); 187 | records.push(final_value); 188 | } else { 189 | warn!( 190 | "Skipping SQLite record for table '{}', became empty after removing ID. 
Original values: '{}'", 191 | table, 192 | values_str 193 | ); 194 | } 195 | } 196 | 197 | if records.is_empty() { 198 | None 199 | } else { 200 | Some(records) 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/parser/parse_regex/surreal.rs: -------------------------------------------------------------------------------- 1 | use regex::Regex; 2 | use log::{ info, warn, debug }; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::clean_html_in_value; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_surreal(chunk: &str, args: &Args) -> Option> { 9 | info!("Using parse method: Surreal"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let table_header_re = Regex::new(r"--\s*TABLE DATA:\s*([a-zA-Z0-9_]+)").ok()?; 19 | let insert_re = Regex::new(r"INSERT\s*\[(?s)(.*?)\]\s*;").ok()?; 20 | 21 | let mut inserts = Vec::new(); 22 | for insert_cap in insert_re.captures_iter(chunk) { 23 | if let Some(array_content) = insert_cap.get(1) { 24 | let array_text = array_content.as_str(); 25 | let full_match = insert_cap.get(0).unwrap().as_str(); 26 | inserts.push((full_match, array_text)); 27 | } 28 | } 29 | 30 | if inserts.is_empty() { 31 | warn!("No INSERT statements found in chunk"); 32 | return None; 33 | } 34 | 35 | let mut table_sections = Vec::new(); 36 | for table_cap in table_header_re.captures_iter(chunk) { 37 | if let Some(table_name) = table_cap.get(1) { 38 | let pos = table_cap.get(0).unwrap().start(); 39 | table_sections.push((table_name.as_str().to_string(), pos)); 40 | } 41 | } 42 | 43 | table_sections.sort_by_key(|&(_, pos)| pos); 44 | 45 | for (i, (insert_stmt, array_content)) in inserts.iter().enumerate() { 46 | let insert_pos = chunk.find(insert_stmt).unwrap_or(0); 47 | let mut table_name = "unknown_table".to_string(); 48 | for (t_name, t_pos) in &table_sections { 49 | if *t_pos < insert_pos { 50 | table_name = t_name.clone(); 51 | } else { 52 | break; 53 | } 54 | } 55 | 56 | if let Some(ref excl) = excluder { 57 | if excl.ignore_table(&table_name) { 58 | info!("Skipping excluded table: {}", table_name); 59 | continue; 60 | } 61 | } 62 | 63 | info!("Processing INSERT #{} for table: {}", i, table_name); 64 | debug!("Parsing data from table {}: {:.100}...", table_name, array_content); 65 | 66 | let object_re = Regex::new(r"\}\s*,\s*\{").unwrap(); 67 | let items: Vec = object_re 68 | .split(array_content) 69 | .map(|s| { 70 | let trimmed = s.trim(); 71 | let mut obj = trimmed.to_string(); 72 | if !obj.starts_with('{') { 73 | obj.insert(0, '{'); 74 | } 75 | if !obj.ends_with('}') { 76 | obj.push('}'); 77 | } 78 | obj 79 | }) 80 | .collect(); 81 | 82 | for item_str in items { 83 | if let Ok(mut obj) = serde_json::from_str::>(&item_str) { 84 | obj.insert("table".to_string(), Value::String(table_name.clone())); 85 | let mut value = Value::Object(obj); 86 | clean_html_in_value(&mut value); 87 | records.push(value); 88 | continue; 89 | } 90 | 91 | let mut record = serde_json::Map::new(); 92 | let kv_regex = Regex::new( 93 | r#"([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*("(?:\\.|[^"\\])*"|'[^']*'|\[.*?\]|\{.*?\}|[0-9.]+(?:f)?|true|false|null)"# 94 | ).unwrap(); 95 | 96 | for caps in kv_regex.captures_iter(&item_str) { 97 | let key = caps.get(1).unwrap().as_str(); 98 | let raw_val = caps.get(2).unwrap().as_str().trim(); 99 | 100 | let value = if raw_val.starts_with('[') && 
raw_val.ends_with(']') { 101 | serde_json 102 | ::from_str::<Value>(raw_val) 103 | .unwrap_or(Value::String(raw_val.to_string())) 104 | } else if raw_val.starts_with('{') && raw_val.ends_with('}') { 105 | serde_json 106 | ::from_str::<Value>(raw_val) 107 | .unwrap_or(Value::String(raw_val.to_string())) 108 | } else if raw_val.starts_with('\'') && raw_val.ends_with('\'') { 109 | Value::String(raw_val.trim_matches('\'').to_string()) 110 | } else if raw_val.starts_with('"') && raw_val.ends_with('"') { 111 | match serde_json::from_str::<String>(raw_val) { 112 | Ok(s) => Value::String(s), 113 | Err(_) => { 114 | let s = raw_val[1..raw_val.len()-1] 115 | .replace("\\\"", "\"") 116 | .replace("\\\\", "\\") 117 | .replace("\\n", "\n") 118 | .replace("\\r", "\r") 119 | .replace("\\t", "\t"); 120 | Value::String(s) 121 | } 122 | } 123 | } else if let Ok(n) = raw_val.trim_end_matches('f').parse::<f64>() { 124 | if n.fract() == 0.0 { 125 | Value::Number((n as i64).into()) 126 | } else { 127 | serde_json::Number 128 | ::from_f64(n) 129 | .map(Value::Number) 130 | .unwrap_or(Value::String(raw_val.to_string())) 131 | } 132 | } else if raw_val == "true" { 133 | Value::Bool(true) 134 | } else if raw_val == "false" { 135 | Value::Bool(false) 136 | } else if raw_val == "null" { 137 | Value::Null 138 | } else { 139 | Value::String(raw_val.to_string()) 140 | }; 141 | 142 | record.insert(key.to_string(), value); 143 | } 144 | 145 | record.insert("table".to_string(), Value::String(table_name.clone())); 146 | record.remove("id"); 147 | 148 | if record.len() > 1 { 149 | let mut value = Value::Object(record); 150 | clean_html_in_value(&mut value); 151 | records.push(value); 152 | } else { 153 | warn!("Regex fallback produced empty record for: {}", item_str); 154 | } 155 | } 156 | } 157 | 158 | if records.is_empty() { 159 | warn!("No records parsed from section"); 160 | None 161 | } else { 162 | 163 | info!("Successfully parsed {} records", records.len()); 164 | Some(records) 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/util/exclude.rs: -------------------------------------------------------------------------------- 1 | use serde::Deserialize; 2 | use serde_json::Value; 3 | use std::{collections::HashMap, fs, path::Path}; 4 | 5 | #[derive(Debug, Deserialize)] 6 | #[serde(default)] 7 | pub struct ExcludeEntry { 8 | pub table: String, 9 | pub ignore_table: bool, 10 | #[serde(default)] 11 | pub exclude_fields: HashMap<String, FieldExclude>, 12 | } 13 | 14 | impl Default for ExcludeEntry { 15 | fn default() -> Self { 16 | ExcludeEntry { 17 | table: String::new(), 18 | ignore_table: false, 19 | exclude_fields: HashMap::new(), 20 | } 21 | } 22 | } 23 | 24 | #[derive(Debug, Deserialize)] 25 | #[serde(untagged)] 26 | pub enum FieldExclude { 27 | All(bool), 28 | Sub(Vec<String>), 29 | } 30 | 31 | pub struct Excluder { 32 | entries: HashMap<String, ExcludeEntry>, 33 | } 34 | 35 | impl Excluder { 36 | pub fn load<P: AsRef<Path>>(path: P) -> Self { 37 | let data = fs::read_to_string(path).unwrap_or_else(|_| "[]".into()); 38 | let list: Vec<ExcludeEntry> = 39 | serde_json::from_str(&data).unwrap_or_else(|_| Vec::new()); 40 | let entries = list.into_iter() 41 | .map(|e| (e.table.clone(), e)) 42 | .collect(); 43 | Excluder { entries } 44 | } 45 | 46 | pub fn ignore_table(&self, table: &str) -> bool { 47 | self.entries 48 | .get(table) 49 | .map(|e| e.ignore_table) 50 | .unwrap_or(false) 51 | } 52 | 53 | 54 | pub fn filter_record(&self, record: &mut Value) { 55 | let table = match record.get("table").and_then(Value::as_str) { 56 | Some(t) => t, 57 | None => return, 58 | }; 59
| 60 | if let Some(entry) = self.entries.get(table) { 61 | if let Value::Object(map) = record { 62 | for (field, rule) in &entry.exclude_fields { 63 | match rule { 64 | FieldExclude::All(true) => { 65 | map.remove(field); 66 | } 67 | FieldExclude::Sub(keys) => { 68 | if let Some(Value::Object(sub_map)) = map.get_mut(field) { 69 | for k in keys { 70 | sub_map.remove(k); 71 | } 72 | } 73 | 74 | else if let Some(Value::String(obj_str)) = map.get_mut(field) { 75 | if obj_str.trim().starts_with('{') && obj_str.trim().ends_with('}') { 76 | for key in keys { 77 | let patterns = [ 78 | format!("{}:\\s*[^,}}]+,", regex::escape(key)), 79 | format!("{}:\\s*[^,}}]+}}", regex::escape(key)), 80 | format!("\"{}\":\\s*[^,}}]+,", regex::escape(key)), 81 | format!("'{}\':\\s*[^,}}]+,", regex::escape(key)), 82 | ]; 83 | 84 | for pattern in patterns { 85 | if let Ok(re) = regex::Regex::new(&pattern) { 86 | *obj_str = re.replace(obj_str, "").to_string(); 87 | } 88 | } 89 | 90 | if let Ok(re) = regex::Regex::new(r",\s*}") { 91 | *obj_str = re.replace(obj_str, "}").to_string(); 92 | } 93 | if let Ok(re) = regex::Regex::new(r",\s*,") { 94 | *obj_str = re.replace(obj_str, ",").to_string(); 95 | } 96 | } 97 | } 98 | } 99 | }, 100 | _ => {} 101 | } 102 | } 103 | } 104 | } 105 | } 106 | 107 | } -------------------------------------------------------------------------------- /src/util/handle_tei.rs: -------------------------------------------------------------------------------- 1 | use std::process::{ Child, Command, Stdio }; 2 | use log::{ info, error }; 3 | use std::{ error::Error as StdError, io::{ BufRead, BufReader, Write }, time::{ Duration, Instant }, thread }; 4 | use crate::{cli::Args, util::spinner::start_operation_animation}; 5 | use std::sync::atomic::Ordering; 6 | use std::sync::mpsc; 7 | 8 | pub struct ManagedProcess { 9 | child: Child, 10 | name: String, 11 | } 12 | 13 | impl ManagedProcess { 14 | pub fn new(child: Child, name: String) -> Self { 15 | info!("Started managed process '{}' (PID: {})", name, child.id()); 16 | Self { child, name } 17 | } 18 | 19 | 20 | pub fn id(&self) -> u32 { 21 | self.child.id() 22 | } 23 | 24 | pub fn kill(&mut self) -> Result<(), Box> { 25 | info!("Manually terminating process '{}' (PID: {})", self.name, self.child.id()); 26 | match self.child.kill() { 27 | Ok(_) => { 28 | info!("Successfully sent kill signal to process '{}'", self.name); 29 | Ok(()) 30 | } 31 | Err(e) => { 32 | let err = format!("Failed to kill process '{}': {}", self.name, e); 33 | error!("{}", err); 34 | Err(err.into()) 35 | } 36 | } 37 | } 38 | } 39 | 40 | impl Drop for ManagedProcess { 41 | fn drop(&mut self) { 42 | info!("Attempting to terminate managed process '{}' (PID: {})", self.name, self.child.id()); 43 | match self.child.kill() { 44 | Ok(_) => { 45 | info!("Successfully sent kill signal to process '{}'", self.name); 46 | } 47 | Err(e) => 48 | error!("Failed to kill process '{}' (PID: {}): {}", self.name, self.child.id(), e), 49 | } 50 | } 51 | } 52 | 53 | pub fn start_and_wait_for_tei( 54 | args: &Args 55 | ) -> Result<(ManagedProcess, String), Box> { 56 | 57 | println!("\n══════════════════════════════════════════════════════════════"); 58 | println!("🚀 Starting local TEI embedding server with model: {}", args.embedding_model); 59 | println!(" This process can take 3-20 minutes on first run for model download"); 60 | println!("══════════════════════════════════════════════════════════════\n"); 61 | 62 | let (animation, counter) = start_operation_animation("Initializing TEI server"); 63 | 
64 | let model_id = if args.embedding_model.is_empty() { 65 | animation.stop(); 66 | return Err("embedding_model must be specified when managing local TEI".into()); 67 | } else { 68 | &args.embedding_model 69 | }; 70 | 71 | let tei_binary = &args.tei_binary_path; 72 | 73 | 74 | info!("Starting TEI binary: '{}' with model '{}'", tei_binary, model_id); 75 | 76 | let mut command = Command::new(tei_binary); 77 | command 78 | .args(["--model-id", model_id, "--port", &args.tei_local_port.to_string(), "--auto-truncate"]) 79 | .env("RUST_LOG", "info") 80 | .stdout(Stdio::piped()) 81 | .stderr(Stdio::piped()); 82 | 83 | let mut child = match command.spawn() { 84 | Ok(child) => child, 85 | Err(e) => { 86 | animation.stop(); 87 | return Err(format!("Failed to spawn TEI binary '{}': {}", tei_binary, e).into()); 88 | } 89 | }; 90 | 91 | let stdout = match child.stdout.take() { 92 | Some(stdout) => stdout, 93 | None => { 94 | animation.stop(); 95 | return Err("Failed to capture TEI stdout".into()); 96 | } 97 | }; 98 | 99 | let stderr = match child.stderr.take() { 100 | Some(stderr) => stderr, 101 | None => { 102 | animation.stop(); 103 | return Err("Failed to capture TEI stderr".into()); 104 | } 105 | }; 106 | 107 | let process_name = format!("tei-server-{}", child.id()); 108 | let managed_process = ManagedProcess::new(child, process_name); 109 | 110 | println!("\nTEI Server Logs:"); 111 | println!("----------------"); 112 | 113 | let (tx, rx) = mpsc::channel(); 114 | let tx_stderr = tx.clone(); 115 | 116 | thread::spawn(move || { 117 | let reader = BufReader::new(stdout); 118 | for line in reader.lines() { 119 | if let Ok(line) = line { 120 | if let Err(_) = tx.send(line) { 121 | break; 122 | } 123 | } 124 | } 125 | }); 126 | 127 | thread::spawn(move || { 128 | let reader = BufReader::new(stderr); 129 | for line in reader.lines() { 130 | if let Ok(line) = line { 131 | if let Err(_) = tx_stderr.send(line) { 132 | break; 133 | } 134 | } 135 | } 136 | }); 137 | 138 | let start_time = Instant::now(); 139 | let mut ready = false; 140 | let mut log_buffer = Vec::new(); 141 | let timeout = Duration::from_secs(300); 142 | let deadline = start_time + timeout; 143 | 144 | while Instant::now() < deadline { 145 | match rx.recv_timeout(Duration::from_secs(1)) { 146 | Ok(line) => { 147 | log_buffer.push(line.clone()); 148 | println!(" TEI: {}", line); 149 | 150 | if line.contains("Starting download") { 151 | counter.store(20, Ordering::Relaxed); 152 | } else if line.contains("Model weights downloaded") { 153 | counter.store(40, Ordering::Relaxed); 154 | } else if line.contains("Starting model backend") { 155 | counter.store(60, Ordering::Relaxed); 156 | } else if line.contains("Warming up model") { 157 | counter.store(80, Ordering::Relaxed); 158 | } else if line.contains("Starting HTTP server") { 159 | counter.store(90, Ordering::Relaxed); 160 | } else if line.contains("Ready") { 161 | counter.store(100, Ordering::Relaxed); 162 | ready = true; 163 | break; 164 | } 165 | 166 | let _ = std::io::stdout().flush(); 167 | }, 168 | Err(mpsc::RecvTimeoutError::Timeout) => { 169 | continue; 170 | }, 171 | Err(mpsc::RecvTimeoutError::Disconnected) => { 172 | println!(" ⚠️ TEI process may have terminated unexpectedly"); 173 | break; 174 | } 175 | } 176 | } 177 | 178 | animation.stop(); 179 | 180 | if ready { 181 | println!("\n✅ TEI server ready in {:?}! 
Continuing with processing...\n", start_time.elapsed());
182 |         let tei_url = format!("http://localhost:{}", args.tei_local_port);
183 |         Ok((managed_process, tei_url))
184 |     } else if Instant::now() >= deadline {
185 |         println!("\n❌ Timeout waiting for TEI server to become ready");
186 | 
187 |         if !log_buffer.is_empty() {
188 |             let _ = std::fs::write("tei_timeout.log", log_buffer.join("\n"));
189 |             println!("TEI logs saved to 'tei_timeout.log'");
190 |         }
191 | 
192 |         Err("Timeout waiting for TEI server to become ready".into())
193 |     } else {
194 |         println!("\n❌ TEI server failed to start properly");
195 | 
196 |         if !log_buffer.is_empty() {
197 |             let _ = std::fs::write("tei_failure.log", log_buffer.join("\n"));
198 |             println!("TEI logs saved to 'tei_failure.log'");
199 |         }
200 | 
201 |         Err("TEI server failed to report ready".into())
202 |     }
203 | }
204 | 
--------------------------------------------------------------------------------
/src/util/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod utils;
2 | pub mod spinner;
3 | pub use utils::*;
4 | pub mod handle_tei;
5 | pub mod exclude;
6 | pub use handle_tei::ManagedProcess;
7 | pub use handle_tei::start_and_wait_for_tei;
8 | 
--------------------------------------------------------------------------------
/src/util/spinner.rs:
--------------------------------------------------------------------------------
1 | use std::io::{ stdout, Write };
2 | use std::sync::{ Arc, Mutex };
3 | use std::sync::atomic::{ AtomicUsize, Ordering };
4 | use std::thread::{ self, JoinHandle };
5 | use std::time::Duration;
6 | 
7 | pub struct AnimationHandle {
8 |     pub thread: JoinHandle<()>,
9 |     pub stop_flag: Arc<Mutex<bool>>,
10 | }
11 | 
12 | impl AnimationHandle {
13 |     pub fn stop(self) {
14 |         *self.stop_flag.lock().unwrap() = true;
15 |         if let Err(e) = self.thread.join() {
16 |             eprintln!("Failed to join animation thread: {:?}", e);
17 |         }
18 |     }
19 | }
20 | 
21 | pub fn start_spinner_animation(
22 |     counter: Arc<AtomicUsize>,
23 |     total: usize,
24 |     message: &str
25 | ) -> AnimationHandle {
26 |     let stop_flag = Arc::new(Mutex::new(false));
27 |     let stop_clone = stop_flag.clone();
28 |     let message = message.to_string();
29 | 
30 |     let thread = thread::spawn(move || {
31 |         let spinner_chars = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
32 |         let mut spinner_idx = 0;
33 | 
34 |         while !*stop_clone.lock().unwrap() {
35 |             let count = counter.load(Ordering::Relaxed);
36 |             spinner_idx = (spinner_idx + 1) % spinner_chars.len();
37 | 
38 |             print!(
39 |                 "\r{} {}... [{}/{}] ({}%)",
40 |                 spinner_chars[spinner_idx],
41 |                 message,
42 |                 count,
43 |                 total,
44 |                 (count * 100) / total.max(1)
45 |             );
46 | 
47 |             let _ = stdout().flush();
48 |             thread::sleep(Duration::from_millis(80));
49 |         }
50 | 
51 |         print!("\r{}\r", " ".repeat(80));
52 |         let _ = stdout().flush();
53 |     });
54 | 
55 |     AnimationHandle { thread, stop_flag }
56 | }
57 | 
58 | pub fn start_operation_animation(message: &str) -> (AnimationHandle, Arc<AtomicUsize>) {
59 |     let counter = Arc::new(AtomicUsize::new(0));
60 |     let total = 100;
61 |     let handle = start_spinner_animation(counter.clone(), total, message);
62 |     (handle, counter)
63 | }
64 | 
--------------------------------------------------------------------------------
/src/util/utils.rs:
--------------------------------------------------------------------------------
1 | use std::fs;
2 | use std::io::{ Cursor, Read, Result as IoResult };
3 | use std::path::Path;
4 | use encoding_rs::UTF_16LE;
5 | use encoding_rs_io::DecodeReaderBytesBuilder;
6 | use log::info;
7 | use crate::parser::detect_format;
8 | 
9 | 
10 | pub fn read_file_content<P: AsRef<Path>>(file_path: P) -> IoResult<String> {
11 |     info!("Reading file: {}", file_path.as_ref().display());
12 |     let raw = fs::read(&file_path)?;
13 |     if raw.starts_with(&[0xff, 0xfe]) {
14 |         let mut decoder = DecodeReaderBytesBuilder::new()
15 |             .encoding(Some(UTF_16LE))
16 |             .bom_override(true)
17 |             .build(Cursor::new(raw));
18 | 
19 |         let mut content = String::new();
20 |         decoder.read_to_string(&mut content)?;
21 |         Ok(content)
22 |     } else {
23 |         String::from_utf8(raw).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
24 |     }
25 | }
26 | 
27 | pub fn read_file_and_detect_format<P: AsRef<Path>>(file_path: P) -> IoResult<(String, String)> {
28 |     let content = read_file_content(&file_path)?;
29 | 
30 |     info!("Detecting format...");
31 | 
32 |     let file_path_str = file_path.as_ref().to_str().unwrap_or("unknown_path");
33 |     let format = detect_format(file_path_str, &content);
34 | 
35 |     info!("Detected format: {}", format);
36 |     info!("Processing {} format file: {}", format, file_path.as_ref().display());
37 | 
38 |     Ok((content, format))
39 | }
40 | 
41 | pub fn logo() {
42 |     println!(
43 |         r#"
44 |  ____  ____  ____ _    _ ____   ___ 
45 | (    \(  _ \(___ \/ )( \(  __) / __)
46 |  ) D ( ) _ ( / __/\ \/ / ) _) ( (__ 
47 | (____/(____/(____) \__/ (____) \___)
48 | "#
49 |     );
50 |     println!("Database to Vector Migration Tool\n");
51 | }
52 | 
53 | 
54 | pub fn init_thread_pool(num_threads: usize) {
55 |     let thread_count = if num_threads == 0 { num_cpus::get() } else { num_threads };
56 |     rayon::ThreadPoolBuilder::new().num_threads(thread_count).build_global().unwrap();
57 |     info!("Using {} threads for parallel processing", thread_count);
58 | }
59 | 
--------------------------------------------------------------------------------
/src/workflow.rs:
--------------------------------------------------------------------------------
1 | use crate::cli::Args;
2 | use crate::db::{ Database, DbError, store_in_batches };
3 | use crate::embedding::embeding::{ initialize_embedding_generator, process_records_with_embeddings };
4 | use crate::util::spinner::start_spinner_animation;
5 | use crate::util::handle_tei::{start_and_wait_for_tei, ManagedProcess};
6 | use log::{ info, warn, error };
7 | use serde_json::Value;
8 | use std::collections::HashMap;
9 | use std::sync::Arc;
10 | use std::sync::atomic::{ AtomicUsize, Ordering };
11 | use std::time::Instant;
12 | 
13 | pub struct MigrationStats {
14 |     pub total_records: usize,
15 |     pub processed_records: usize,
16 |     pub elapsed_seconds: f64,
17 | }
18 | 
19 | pub fn execute_migration_workflow(
20 |     records: Vec<Value>,
21 |     database: &dyn Database,
22 |     args: &Args,
23 | ) -> Result<MigrationStats, DbError> {
24 |     let total_records = records.len();
25 |     if total_records == 0 {
26 |         warn!("No records to process");
27 |         return Ok(MigrationStats {
28 |             total_records: 0,
29 |             processed_records: 0,
30 |             elapsed_seconds: 0.0,
31 |         });
32 |     }
33 | 
34 |     let mut tei_process: Option<ManagedProcess> = None;
35 |     let mut override_url: Option<String> = None;
36 | 
37 |     if args.embedding_provider == "tei" && args.embedding_url.is_none() {
38 |         let args = args.clone();
39 |         let (proc, url) = std::thread::spawn(move || start_and_wait_for_tei(&args))
40 |             .join()
41 |             .map_err(|e| format!("TEI thread panicked: {:?}", e))??;
42 |         tei_process = Some(proc);
43 |         override_url = Some(url);
44 |     }
45 | 
46 |     let generator = initialize_embedding_generator(args, override_url.as_deref())
47 |         .map_err(|e| DbError::from(format!("Init embed gen failed: {}", e)))?;
48 | 
49 |     let start_time = Instant::now();
50 |     let embedding_count = Arc::new(AtomicUsize::new(0));
51 |     let embedding_animation = start_spinner_animation(
52 |         embedding_count.clone(),
53 |         total_records,
54 |         "Generating embeddings"
55 |     );
56 | 
57 |     info!("Starting embedding generation for {} records", total_records);
58 | 
59 |     let prepared_records = match
60 |         process_records_with_embeddings(records, args, embedding_count.clone(), generator)
61 |     {
62 |         Ok(records) => records,
63 |         Err(e) => {
64 |             embedding_animation.stop();
65 |             error!("CRITICAL: Embedding generation failed: {}", e);
66 |             return Err(format!("Embedding generation critical error: {}", e).into());
67 |         }
68 |     };
69 | 
70 |     embedding_animation.stop();
71 | 
72 |     if prepared_records.is_empty() {
73 |         warn!("No records were prepared for storage after embedding process.");
74 |     } else {
75 |         println!("\nEmbedding generation complete! Storing data...");
76 | 
77 |         let mut grouped_records: HashMap<String, Vec<(String, Vec<f32>, Value)>> = HashMap::new();
78 |         for (table, id, vec, meta) in prepared_records {
79 |             grouped_records.entry(table).or_insert_with(Vec::new).push((id, vec, meta));
80 |         }
81 | 
82 |         let processed_count = Arc::new(AtomicUsize::new(0));
83 |         let storage_animation = start_spinner_animation(
84 |             processed_count.clone(),
85 |             total_records,
86 |             "Storing in database"
87 |         );
88 | 
89 |         let max_payload_bytes = args.max_payload_size_mb * 1024 * 1024;
90 |         let chunk_size = args.chunk_size;
91 | 
92 |         for (table, items) in grouped_records {
93 |             info!("Storing {} items for table '{}'", items.len(), table);
94 |             for batch in items.chunks(chunk_size) {
95 |                 match store_in_batches(database, &table, batch, max_payload_bytes) {
96 |                     Ok(_) => {
97 |                         let _ = processed_count.fetch_add(batch.len(), Ordering::Relaxed);
98 |                     }
99 |                     Err(e) => {
100 |                         storage_animation.stop();
101 |                         error!("CRITICAL: Database storage error for table '{}': {}", table, e);
102 |                         return Err(format!("Database storage error: {}", e).into());
103 |                     }
104 |                 }
105 |             }
106 |         }
107 |         storage_animation.stop();
108 |     }
109 | 
110 |     let elapsed_time = start_time.elapsed();
111 |     let final_count = embedding_count.load(Ordering::Relaxed);
112 | 
113 |     println!(
114 |         "\nFinished processing {} records in {:.2} seconds ({:.1} records/sec)",
115 |         final_count,
116 |         elapsed_time.as_secs_f64(),
117 |         if elapsed_time.as_secs_f64() > 0.0 {
118 |             (final_count as f64) / elapsed_time.as_secs_f64()
119 |         } else {
120 |             0.0
121 |         }
122 |     );
123 |     println!("Migration Complete.");
124 | 
125 |     if let Some(mut p) = tei_process {
126 |         let _ = p.kill();
127 |     }
128 | 
129 |     Ok(MigrationStats {
130 |         total_records,
131 |         processed_records: final_count,
132 |         elapsed_seconds: elapsed_time.as_secs_f64(),
133 |     })
134 | }
--------------------------------------------------------------------------------
/tei/tei-linux-x86:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/tei/tei-linux-x86
--------------------------------------------------------------------------------
/tei/tei-metal-mac-arm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/tei/tei-metal-mac-arm
--------------------------------------------------------------------------------
/tei/tei-onnx-mac-arm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/tei/tei-onnx-mac-arm
--------------------------------------------------------------------------------
/tei_timeout.log:
--------------------------------------------------------------------------------
1 | 2025-05-07T19:34:32.481255Z  INFO text_embeddings_router: router/src/main.rs:189: Args { model_id: "Ali****-***/***-*****-*.**-*****uct", revision: None, tokenization_workers: None, dtype: None, pooling: None, max_concurrent_requests: 512, max_batch_tokens: 16384, max_batch_requests: None, max_client_batch_size: 32, auto_truncate: true, default_prompt_name: None, default_prompt: None, hf_api_token: None, hf_token: None, hostname: "0.0.0.0", port: 19998, uds_path: "/tmp/text-embeddings-inference-server", huggingface_hub_cache: None, payload_limit: 2000000, api_key: None, json_output: false, disable_spans: false, otlp_endpoint: None, otlp_service_name: "text-embeddings-inference.server", prometheus_port: 9000, cors_allow_origin: None }
2 | 2025-05-07T19:34:32.486659Z  INFO download_artifacts: text_embeddings_core::download: core/src/download.rs:20: Starting download
3 | 2025-05-07T19:34:32.486675Z  INFO download_artifacts:download_pool_config: text_embeddings_core::download: core/src/download.rs:53: Downloading `1_Pooling/config.json`
4 | 2025-05-07T19:34:32.487341Z  INFO download_artifacts:download_new_st_config: text_embeddings_core::download: core/src/download.rs:77: Downloading `config_sentence_transformers.json`
5 | 2025-05-07T19:34:32.487367Z  INFO download_artifacts: text_embeddings_core::download: core/src/download.rs:40: Downloading `config.json`
6 | 2025-05-07T19:34:32.487388Z  INFO download_artifacts: text_embeddings_core::download: core/src/download.rs:43: Downloading `tokenizer.json`
7 | 2025-05-07T19:34:32.487520Z  INFO download_artifacts: text_embeddings_core::download: core/src/download.rs:47: Model artifacts downloaded in 861.667µs
8 | 2025-05-07T19:34:32.585305Z  INFO text_embeddings_router: router/src/lib.rs:193: Maximum number of tokens per request: 32768
9 | 2025-05-07T19:34:32.585400Z  INFO text_embeddings_core::tokenization: core/src/tokenization.rs:38: Starting 14 tokenization workers
10 | 2025-05-07T19:34:32.671605Z  INFO text_embeddings_router: router/src/lib.rs:235: Starting model backend
11 | 2025-05-07T19:34:32.671903Z  INFO text_embeddings_backend: backends/src/lib.rs:534: Downloading `model.onnx`
12 | 2025-05-07T19:34:33.079916Z  WARN text_embeddings_backend: backends/src/lib.rs:538: Could not download `model.onnx`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/model.onnx)
13 | 2025-05-07T19:34:33.079938Z  INFO text_embeddings_backend: backends/src/lib.rs:539: Downloading `onnx/model.onnx`
14 | 2025-05-07T19:34:33.489241Z  WARN text_embeddings_backend: backends/src/lib.rs:543: Could not download `onnx/model.onnx`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/onnx/model.onnx)
15 | 2025-05-07T19:34:33.489254Z  INFO text_embeddings_backend: backends/src/lib.rs:548: Downloading `model.onnx_data`
16 | 2025-05-07T19:34:33.753610Z  WARN text_embeddings_backend: backends/src/lib.rs:552: Could not download `model.onnx_data`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/model.onnx_data)
17 | 2025-05-07T19:34:33.753637Z  INFO text_embeddings_backend: backends/src/lib.rs:553: Downloading `onnx/model.onnx_data`
18 | 2025-05-07T19:34:34.513003Z  WARN text_embeddings_backend: backends/src/lib.rs:557: Could not download `onnx/model.onnx_data`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/onnx/model.onnx_data)
19 | 2025-05-07T19:34:34.513013Z ERROR text_embeddings_backend: backends/src/lib.rs:346: Model ONNX files not found in the repository
20 | 2025-05-07T19:34:34.513294Z ERROR text_embeddings_backend: backends/src/lib.rs:358: Could not start ORT backend: Could not start backend: Pooling last_token is not supported for this backend. Use `candle` backend instead.
21 | 2025-05-07T19:34:34.513515Z  INFO text_embeddings_backend: backends/src/lib.rs:493: Downloading `model.safetensors`
22 | 2025-05-07T19:34:35.332343Z  WARN text_embeddings_backend: backends/src/lib.rs:496: Could not download `model.safetensors`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/model.safetensors)
23 | 2025-05-07T19:34:35.332358Z  INFO text_embeddings_backend: backends/src/lib.rs:501: Downloading `model.safetensors.index.json`
24 | 2025-05-07T19:34:35.333710Z  INFO text_embeddings_backend: backends/src/lib.rs:523: Downloading `model-00002-of-00002.safetensors`
25 | 2025-05-07T19:36:48.542283Z  INFO text_embeddings_backend: backends/src/lib.rs:523: Downloading `model-00001-of-00002.safetensors`
--------------------------------------------------------------------------------
/vector-export-scripts/qdrant.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eo pipefail
3 | 
4 | QDRANT_URL="http://localhost:6333"
5 | 
6 | # 1. Discover all collections
7 | collections=$(curl -s "${QDRANT_URL}/collections" \
8 |   -H "Content-Type: application/json" \
9 |   | jq -r '.result.collections[].name')
10 | 
11 | for col in $collections; do
12 |   echo "Exporting collection: $col"
13 | 
14 |   # 2. Create snapshot (synchronous)
15 |   resp=$(curl -s -X POST \
16 |     "${QDRANT_URL}/collections/${col}/snapshots" \
17 |     -H "Content-Type: application/json")
18 |   snap=$(jq -r '.result.name' <<<"$resp")
19 |   echo " Snapshot created: $snap"
20 | 
21 |   # 3. Download
22 |   curl -s "${QDRANT_URL}/collections/${col}/snapshots/${snap}" \
23 |     --output "${col}.snapshot"
24 |   echo " Saved ${col}.snapshot"
25 | done
26 | 
27 | echo "✅ All snapshots exported."
28 | 
--------------------------------------------------------------------------------
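
A minimal restore sketch (not a file in this repository): the `.snapshot` files written by vector-export-scripts/qdrant.sh can be loaded into another Qdrant instance through the snapshot upload endpoint. The target URL, the multipart `snapshot` field, and the `priority=snapshot` query parameter follow Qdrant's documented snapshot-recovery API; verify them against the Qdrant version you run, and adjust names if your collections differ from the exported file stems.

    # Assumed target instance; change to the Qdrant you are restoring into.
    TARGET_URL="http://localhost:6333"

    for snap in *.snapshot; do
      # Collection name = file stem, matching the naming used by the export script above.
      col="${snap%.snapshot}"
      curl -s -X POST \
        "${TARGET_URL}/collections/${col}/snapshots/upload?priority=snapshot" \
        -H "Content-Type: multipart/form-data" \
        -F "snapshot=@${snap}"
      echo "Restored ${col} from ${snap}"
    done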