├── .cargo └── config.toml ├── .env-example ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── README.md ├── assets └── db2vec_screenshot.png ├── build-cross-release.sh ├── config └── exclude.json ├── dockerfile ├── docs ├── COMPATIBLE.md ├── DOCKER_SETUP.md ├── OPTION.md └── TEI.md ├── samples ├── mssql_sample.sql ├── mysql_sample.sql ├── oracle_sample.sql ├── postgres_sample.sql ├── profile_sample.txt ├── sqlite_sample.sql └── surreal_sample.surql ├── src ├── cli │ └── mod.rs ├── db │ ├── chroma.rs │ ├── milvus.rs │ ├── mod.rs │ ├── pinecone.rs │ ├── qdrant.rs │ ├── redis.rs │ └── surreal.rs ├── embedding │ ├── embeding.rs │ ├── mod.rs │ └── models │ │ ├── google.rs │ │ ├── mod.rs │ │ ├── ollama.rs │ │ └── tei.rs ├── lib.rs ├── main.rs ├── parser │ ├── mod.rs │ └── parse_regex │ │ ├── mod.rs │ │ ├── mssql.rs │ │ ├── mysql.rs │ │ ├── oracle.rs │ │ ├── postgres.rs │ │ ├── sqlite.rs │ │ └── surreal.rs ├── util │ ├── exclude.rs │ ├── handle_tei.rs │ ├── mod.rs │ ├── spinner.rs │ └── utils.rs └── workflow.rs ├── tei ├── tei-linux-x86 ├── tei-metal-mac-arm └── tei-onnx-mac-arm ├── tei_timeout.log ├── tests └── integration_test.rs └── vector-export-scripts └── qdrant.sh /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.x86_64-pc-windows-gnu] 2 | linker = "x86_64-w64-mingw32-gcc" 3 | [target.x86_64-unknown-linux-gnu] 4 | linker = "x86_64-linux-gnu-gcc" -------------------------------------------------------------------------------- /.env-example: -------------------------------------------------------------------------------- 1 | # DB2VEC ENVIRONMENT CONFIGURATION 2 | # =============================== 3 | # This file contains environment variables used by db2vec 4 | # Copy this file to ".env" and customize as needed 5 | 6 | # INPUT/OUTPUT CONFIGURATION 7 | # -------------------------- 8 | # Path to the database dump file to process (.sql/.surql) 9 | DUMP_FILE=./surreal.surql 10 | 11 | # Target vector database type 12 | # Options: redis|chroma|milvus|qdrant|surrealdb|pinecone 13 | EXPORT_TYPE=redis 14 | 15 | # DEBUG MODE 16 | # ---------- 17 | # Print parsed JSON records before embedding 18 | DEBUG=false 19 | 20 | # VECTOR DATABASE CONNECTION 21 | # ------------------------- 22 | # Vector database URL/host endpoint 23 | VECTOR_HOST=redis://127.0.0.1:6379 24 | 25 | # Database authentication (user/password or API key) 26 | USER=root 27 | PASS= 28 | SECRET= 29 | AUTH=false 30 | 31 | # Database organization 32 | DATABASE=default_database 33 | TENANT=default_tenant 34 | NAMESPACE=default_namespace 35 | 36 | # Pinecone-specific settings 37 | INDEXES=default_indexes 38 | CLOUD=aws 39 | REGION=us-east-1 40 | 41 | # VECTOR CONFIGURATION 42 | # ------------------- 43 | # Vector dimension size (must match your embedding model) 44 | DIMENSION=768 45 | 46 | # Distance metric: l2|ip|cosine|euclidean|dotproduct 47 | METRIC=cosine 48 | 49 | # DATA HANDLING 50 | # ------------ 51 | # Max payload size (MB) per request 52 | PAYLOAD_SIZE_MB=12 53 | 54 | # Batch size for DB inserts 55 | CHUNK_SIZE=10 56 | 57 | # Group Redis records by table name (else use FT.CREATE/SEARCH) 58 | GROUP_REDIS=false 59 | 60 | # Use exclusion rules from config/exclude.json 61 | USE_EXCLUDE=false 62 | 63 | # EMBEDDING CONFIGURATION 64 | # --------------------- 65 | # Which embedding provider to use: ollama, tei, or google 66 | EMBEDDING_PROVIDER=ollama 67 | 68 | # Embedding model name/id 69 | # Examples: nomic-embed-text, text-embedding-004, nomic-embed-text-v2-moe 70 | 
EMBEDDING_MODEL=nomic-embed-text 71 | 72 | # API Key for Google Gemini (required if EMBEDDING_PROVIDER=google) 73 | # EMBEDDING_API_KEY= 74 | 75 | # URL endpoint for Ollama or Google embeddings (optional) 76 | # EMBEDDING_URL= 77 | 78 | # Embedding performance tuning 79 | EMBEDDING_MAX_CONCURRENCY=4 80 | EMBEDDING_BATCH_SIZE=16 81 | EMBEDDING_MAX_TOKENS=8000 82 | OLLAMA_TIMEOUT=60 83 | 84 | # Task type for Google Gemini 85 | EMBEDDING_TASK_TYPE=SEMANTIC_SIMILARITY 86 | 87 | # TEI (Text Embedding Inference) specific settings 88 | TEI_BINARY_PATH=tei/tei-metal 89 | TEI_LOCAL_PORT=8080 90 | 91 | # PERFORMANCE 92 | # ---------- 93 | # CPU threads for parallel tasks (0 = auto detect) 94 | NUM_THREADS=0 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .env 3 | /volumes 4 | /root 5 | /chroma-data 6 | history.txt 7 | milvus.yaml 8 | qdrant_storage 9 | hero.surql 10 | docker-compose.yml 11 | /dist 12 | .DS_Store 13 | tests/volumes 14 | tei_failure.log -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "db2vec" 3 | version = "0.5.7" 4 | edition = "2024" 5 | authors = ["Thanon Aphithanawat"] 6 | description = "High-performance tool to parse database dumps, generate vector embeddings, and load them into vector databases" 7 | readme = "README.md" 8 | repository = "https://github.com/DevsHero/db2vec" 9 | license = "MIT" 10 | keywords = ["vector-database", "embedding", "ollama", "database-export", "vector-search"] 11 | categories = ["database", "command-line-utilities", "text-processing"] 12 | 13 | [dependencies] 14 | redis = "0.29" 15 | serde = { version = "1", features = ["derive"] } 16 | serde_json = "1" 17 | reqwest = { version = "0.11" ,default-features = false, features = ["rustls-tls", "blocking", "json"] } 18 | tokio = { version = "1", features = ["full"] } 19 | uuid = { version = "1", features = ["v4", "rng-getrandom"] } 20 | regex = "1.11" 21 | byteorder = "1.5.0" 22 | base64 = "0.22" 23 | html2text = "0.14" 24 | clap = { version = "4", features = ["derive", "env"] } 25 | dotenvy = "0.15" 26 | log = "0.4" 27 | env_logger = "0.11" 28 | encoding_rs = "0.8" 29 | encoding_rs_io = "0.1" 30 | once_cell = "1" 31 | rayon = "1" 32 | num_cpus = "1" 33 | lazy_static = "1.5" 34 | async-trait = "0.1" 35 | futures = "0.3" 36 | portpicker = "0.1.1" 37 | 38 | [dev-dependencies] 39 | db2vec = { path = "." } 40 | 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # db2vec: From Database Dumps to Vector Search at Speed 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 4 | 5 | Tired of waiting hours for Python scripts to embed large database exports, especially on machines without powerful GPUs? So was I. Processing millions of records demands performance, even on standard hardware. `db2vec` is a high‑performance Rust tool designed for efficient **CPU-based embedding generation**. 
It parses your database dumps, generates vector embeddings using local models (Ollama, text-embeddings-inference(TEI) ) or cloud APIs (Google Gemini), and loads them into your vector database of choice – all optimized for speed without requiring a dedicated GPU. 6 | 7 | ![db2vec CLI running](assets/db2vec_screenshot.png) 8 | 9 | --- 10 | 11 | ## Core Features 12 | 13 | * 🚀 **Blazing Fast:** Built in Rust for maximum throughput on large datasets, optimized for CPU. 14 | * 🔄 **Parallel Processing:** Adjustable concurrency and batch‑size for embedding generation (`--num‑threads`, `--embedding‑concurrency`, `--embedding‑batch-size`). 15 | * 📦 **Batch Inserts:** Configurable batch size (`-c, --chunk-size`) and payload limits (`-m, --max-payload-size-mb`) for efficient bulk loading into the target vector database. 16 | * 🛡️ **Data Filtering:** Exclude sensitive tables or fields via configuration for data privacy and reduced processing time. 17 | * 🔧 **Highly Configurable:** Fine-tune performance and behavior with extensive CLI arguments for embedding, database connections, batching, and more. 18 | * 📄 **Supported Dump Formats:** 19 | * `.sql` (MySQL, PostgreSQL, MSSQL, SQLite, Oracle) 20 | * **MSSQL:** 21 | ```bash 22 | sqlcmd -S server -U user -P pass -Q "SET NOCOUNT ON; SELECT * FROM dbo.TableName;" -o dump.sql 23 | ``` 24 | * *Oracle requires exporting via SQL Developer or similar into standard SQL.* 25 | * `.surql` (SurrealDB) 26 | * 🧠 **Flexible Embeddings:** Supports multiple providers: 27 | * **Ollama** – best for local CPU/GPU, extremely fast. 28 | * **TEI** – CPU-only Text Embeddings Inference (v1.7.0), slower than Ollama but faster than cloud. See [docs/TEI.md](docs/TEI.md) for details. 29 | * **Google Gemini** – cloud API, ideal if you have very limited local resources. Beware of rate limits; use small batch sizes to avoid throttling. 30 | * 💾 **Vector DB Targets:** Inserts vectors + metadata into: 31 | * Chroma 32 | * Milvus 33 | * Pinecone (Cloud & Local Dev Image) 34 | * Qdrant 35 | * Redis Stack 36 | * SurrealDB 37 | * ⚙️ **Pure Regex Parsing:** Fast, reliable record extraction (no AI). 38 | * 🔒 **Authentication:** Supports user/password, API key, tenants/namespaces per DB. 39 | * ☁️ **Pinecone Cloud Support:** Automatically creates/describes indexes, uses namespaces. 40 | * 🐞 **Debug Mode:** `--debug` prints parsed JSON records before embedding. 41 | 42 | --- 43 | 44 | ## Requirements 45 | 46 | * **Rust:** Latest stable (Edition 2021+). 47 | * **Embedding Provider:** One of the following configured: 48 | * **Ollama:** Running locally with your desired model(s) pulled (e.g., `ollama pull nomic-embed-text`). 49 | * **TEI:** Requires TEI binary (`tei-metal`) and compatible model (e.g., `nomic-embed-text-v2-moe`). See [docs/TEI.md](docs/TEI.md) for setup. 50 | * **Google Gemini:** A valid Google Cloud API key (`--secret` or `EMBEDDING_API_KEY`) with the Generative Language API enabled for your project. 51 | * **Target DB:** One of Chroma, Milvus, Pinecone, Qdrant, Redis Stack, SurrealDB (Docker recommended for local). 52 | * **(Optional) `.env`:** For setting default configuration values. 53 | 54 | --- 55 | 56 | ## Configuration 57 | 58 | Configuration can be set using CLI flags or by creating a `.env` file in the project root. CLI flags always override values set in the `.env` file. 59 | 60 | Refer to the `.env-example` file for a comprehensive list of available environment variables, their descriptions, and default values. 61 | 62 | --- 63 | 64 | ## How It Works 65 | 66 | 1. 
**Read & Detect:** Load dump (`.sql`/`.surql`), detect SQL dialect or SurrealDB. 67 | 2. **Parse (Regex):** Extract records and types. 68 | 3. **Apply Exclusions:** Skip tables or fields based on your exclusion rules (if enabled). 69 | 4. **Embed:** Call the selected embedding provider (`ollama`, `tei` on CPU, `google`) to get vectors. 70 | 5. **Auto-Schema:** Automatically create: 71 | * Target database if it doesn't exist 72 | * Collections/indices from table names in the dump 73 | * Proper dimension settings based on your `--dimension` parameter 74 | * Distance metrics using your specified `--metric` value 75 | 6. **Store:** Insert into your vector DB with metadata. 76 | 77 | --- 78 | 79 | ## Data Exclusion 80 | 81 | The exclusion feature allows you to skip entire tables or specific fields within records, which is useful for: 82 | 83 | * Protecting sensitive data (passwords, PII) 84 | * Improving performance by excluding large tables or fields not needed for search 85 | * Reducing storage costs in your vector database 86 | 87 | ### How to Use Exclusions 88 | 89 | 1. Create a `config/exclude.json` file with your exclusion rules 90 | 2. Enable exclusions with the `--use-exclude` flag 91 | 92 | ### Sample exclude.json 93 | 94 | ```json 95 | [ 96 | { 97 | "table": "users", 98 | "ignore_table": false, 99 | "exclude_fields": { 100 | "password": true, 101 | "email": true, 102 | "profile": ["ssn", "tax_id"] 103 | } 104 | }, 105 | { 106 | "table": "audit_logs", 107 | "ignore_table": true 108 | } 109 | ] 110 | ``` 111 | This configuration: 112 | 113 | Keeps the "users" table but removes password and email fields 114 | For the "profile" object field, only removes the "ssn" and "tax_id" subfields 115 | Completely skips the "audit_logs" table 116 | --- 117 | 118 | ## Automatic Collection Creation 119 | 120 | For each table in your source data dump, `db2vec` automatically: 121 | 122 | * Creates a corresponding collection/index in the target vector database 123 | * Names the collection after the source table name 124 | * Configures proper dimensions and metric type based on your CLI arguments 125 | * Creates the database first if it doesn't exist 126 | 127 | This zero-config schema creation means you don't need to manually set up your vector database structure before import. 128 | 129 | > **Note:** When using Redis with `--group-redis`, collections aren't created in the traditional sense. Instead, records are grouped by table name into Redis data structures (e.g., `table:profile` → [records]). Without this flag, Redis stores each record as an individual entry with a table label in the metadata. 130 | > 131 | > **Warning:** If collections already exist, their dimension must match the `--dimension` parameter you provide. Some databases like Pinecone will reject vectors with mismatched dimensions, causing the import to fail. 132 | 133 | --- 134 | 135 | ## Quick Start 136 | 137 | 1. **Clone & build** 138 | ```bash 139 | git clone https://github.com/DevsHero/db2vec.git 140 | cd db2vec 141 | cargo build --release 142 | ``` 143 | 2. **Prepare your dump** 144 | * MySQL/Postgres/Oracle: export `.sql` 145 | * MSSQL: `sqlcmd … > mssql_dump.sql` 146 | * SQLite: `sqlite3 mydb.db .dump > sqlite_dump.sql` 147 | * SurrealDB: `.surql` file 148 | 3. **(Optional) Create `.env`:** Copy `.env-example` to `.env` and customize defaults. 149 | 4. 
**Run** 150 | ```bash 151 | # MySQL → Milvus (using Ollama) 152 | ./target/release/db2vec \ 153 | -f samples/mysql_sample.sql \ 154 | -t milvus \ 155 | --vector-host http://127.0.0.1:19530 \ 156 | --database mydb \ 157 | --embedding-provider ollama \ 158 | --embedding-model nomic-embed-text \ 159 | --dimension 768 \ 160 | -u root -p secret --use-auth \ 161 | --debug 162 | 163 | # SurrealDB → Pinecone (using TEI) 164 | ./target/release/db2vec \ 165 | -f samples/surreal_sample.surql \ 166 | -t pinecone \ 167 | --vector-host https://index-123.svc.us-east-1.pinecone.io \ 168 | --namespace myns \ 169 | --embedding-provider tei \ 170 | --tei-binary-path tei/tei-metal \ 171 | --embedding-model nomic-embed-text-v2-moe \ 172 | --dimension 768 173 | 174 | # Oracle → Qdrant (using Google Gemini) 175 | ./target/release/db2vec \ 176 | -f samples/oracle_sample.sql \ 177 | -t qdrant \ 178 | --vector-host http://localhost:6333 \ 179 | --embedding-provider google \ 180 | --embedding-model text-embedding-004 \ 181 | --dimension 768 \ 182 | --embedding-api-key <YOUR_GEMINI_API_KEY> \ 183 | --embedding-task-type SEMANTIC_SIMILARITY \ 184 | --debug 185 | ``` 186 | 187 | --- 188 | 189 | ## Usage 190 | 191 | ```bash 192 | # Cargo 193 | cargo run -- [OPTIONS] 194 | 195 | # Binary 196 | ./target/release/db2vec [OPTIONS] 197 | 198 | # Logging 199 | RUST_LOG=info ./target/release/db2vec [OPTIONS] 200 | RUST_LOG=debug ./target/release/db2vec --debug [OPTIONS] 201 | ``` 202 | 203 | ## Compatibility 204 | 205 | See [docs/COMPATIBLE.md](docs/COMPATIBLE.md) for the full compatibility matrix of supported vector database versions and import file formats. 206 | 207 | 208 | --- 209 | 210 | ## Docker Setup 211 | 212 | Run supported vector DBs locally via Docker – see [DOCKER_SETUP.md](docs/DOCKER_SETUP.md) for commands. 213 | 214 | 215 | --- 216 | 217 | ## Target Environment 218 | 219 | Primarily developed and tested against Docker-hosted or cloud vector databases via RESTful APIs. Ensure your target is reachable from where you run `db2vec`. **Designed to run efficiently even on standard CPU hardware.** 220 | 221 | --- 222 | 223 | ## Testing 224 | 225 | ### Integration Tests 226 | 227 | db2vec includes comprehensive integration tests that verify functionality across all supported database types and embedding providers. 228 | 229 | #### Prerequisites 230 | 231 | - **Docker**: Required to run containerized instances of all supported vector databases 232 | - **Embedding Provider**: At least one of the supported embedding providers (Ollama/TEI/Google) 233 | 234 | #### Running Integration Tests 235 | 236 | The integration test suite will: 237 | 238 | 1. Spin up Docker containers for each supported vector database 239 | 2. Test all database import formats (MySQL, PostgreSQL, MSSQL, SQLite, Oracle, SurrealDB) 240 | 3. Generate embeddings using the specified provider 241 | 4. Verify proper storage and retrieval from each vector database 242 | 243 | ```bash 244 | # Test with Ollama (fastest, requires Ollama running locally) 245 | EMBEDDING_PROVIDER=ollama cargo test --test integration_test -- --nocapture 246 | 247 | # Test with TEI (CPU-based, no external dependencies) 248 | EMBEDDING_PROVIDER=tei cargo test --test integration_test -- --nocapture 249 | 250 | # Test with mock embeddings (no external provider required) 251 | EMBEDDING_PROVIDER=mock cargo test --test integration_test -- --nocapture 252 | ``` 253 | 254 | --- 255 | 256 | ## Contributing 257 | 258 | Issues, PRs, and feedback welcome! 259 | 260 | --- 261 | 262 | ## License 263 | 264 | MIT – see [LICENSE](LICENSE).
-------------------------------------------------------------------------------- /assets/db2vec_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/assets/db2vec_screenshot.png -------------------------------------------------------------------------------- /build-cross-release.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # 1) Name of your Rust binary (as in Cargo.toml) 5 | BIN_NAME="db2vec" 6 | 7 | # 2) Ensure cross is installed 8 | if ! command -v cross &>/dev/null; then 9 | echo "❌ 'cross' not found – installing..." 10 | cargo install cross --git https://github.com/cross-rs/cross 11 | fi 12 | 13 | # 3) Output directory 14 | DIST_DIR="$(pwd)/dist" 15 | mkdir -p "$DIST_DIR" 16 | 17 | # 4) List of targets 18 | TARGETS=( 19 | "x86_64-unknown-linux-gnu" # Linux x86_64 20 | "aarch64-unknown-linux-gnu" # Linux ARM64 21 | "x86_64-pc-windows-gnu" # Windows x64 22 | ) 23 | 24 | # 5) Build loop 25 | for TARGET in "${TARGETS[@]}"; do 26 | echo "⏳ Building for $TARGET..." 27 | cross rustc --target "$TARGET" --release 28 | done 29 | 30 | # 6) Copy binaries into dist/ 31 | echo "📂 Collecting binaries into $DIST_DIR..." 32 | for TARGET in "${TARGETS[@]}"; do 33 | BIN_PATH="target/${TARGET}/release/${BIN_NAME}" 34 | # On Windows targets, add .exe 35 | if [[ "$TARGET" == *"windows"* ]]; then 36 | BIN_PATH+=".exe" 37 | fi 38 | 39 | if [[ -f "$BIN_PATH" ]]; then 40 | OUT_NAME="${BIN_NAME}-${TARGET}" 41 | # Preserve extension on Windows 42 | if [[ "$TARGET" == *"windows"* ]]; then 43 | OUT_NAME+=".exe" 44 | fi 45 | 46 | cp "$BIN_PATH" "$DIST_DIR/$OUT_NAME" 47 | echo "✅ $OUT_NAME" 48 | else 49 | echo "⚠️ Missing: $BIN_PATH" 50 | fi 51 | done 52 | 53 | echo "🎉 All done! Binaries are in $DIST_DIR." 54 | -------------------------------------------------------------------------------- /config/exclude.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "table": "users", 4 | "ignore_table": false, 5 | "exclude_fields": { 6 | "password": true, 7 | "email": true, 8 | "profile": ["ssn", "tax_id"] 9 | } 10 | }, 11 | { 12 | "table": "audit_logs", 13 | "ignore_table": true 14 | } 15 | ] -------------------------------------------------------------------------------- /dockerfile: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # 1.
Planner stage: generate the dependency recipe (with C++ compiler) 3 | ############################################################################### 4 | FROM rustlang/rust:nightly-bullseye-slim AS chef 5 | 6 | # Install system deps including C++ compiler 7 | RUN apt-get update && \ 8 | apt-get install -y --no-install-recommends \ 9 | build-essential \ 10 | pkg-config \ 11 | libssl-dev \ 12 | git \ 13 | ca-certificates && \ 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | # Install cargo-chef and sccache 17 | RUN cargo install --locked cargo-chef sccache 18 | ENV RUSTC_WRAPPER="sccache" \ 19 | SCCACHE_DIR="/sccache" 20 | 21 | WORKDIR /app 22 | 23 | # Copy manifests and dummy main for cargo-chef 24 | COPY Cargo.toml Cargo.lock ./ 25 | RUN mkdir src && echo 'fn main() {}' > src/main.rs && \ 26 | cargo chef prepare --recipe-path recipe.json 27 | 28 | ############################################################################### 29 | # 2. Builder stage: compile dependencies & your code 30 | ############################################################################### 31 | FROM chef AS builder 32 | WORKDIR /app 33 | 34 | # Rehydrate dependencies 35 | COPY --from=chef /app/recipe.json recipe.json 36 | RUN --mount=type=cache,target=/usr/local/cargo/registry \ 37 | --mount=type=cache,target=/usr/local/cargo/git \ 38 | --mount=type=cache,target=$SCCACHE_DIR,sharing=locked \ 39 | cargo chef cook --release --recipe-path recipe.json 40 | 41 | # Build the full application 42 | COPY . . 43 | RUN --mount=type=cache,target=/usr/local/cargo/registry \ 44 | --mount=type=cache,target=/usr/local/cargo/git \ 45 | --mount=type=cache,target=$SCCACHE_DIR,sharing=locked \ 46 | cargo build --release 47 | 48 | ############################################################################### 49 | # 3. Runtime stage: minimal Debian image 50 | ############################################################################### 51 | FROM debian:bullseye-slim AS runtime 52 | 53 | RUN apt-get update && \ 54 | apt-get install -y --no-install-recommends ca-certificates && \ 55 | rm -rf /var/lib/apt/lists/* 56 | 57 | COPY --from=builder /app/target/release/db2vec /usr/local/bin/db2vec 58 | 59 | # Drop privileges: non-root user 60 | RUN useradd --system --uid 10001 --shell /usr/sbin/nologin appuser 61 | USER appuser 62 | 63 | ENTRYPOINT ["/usr/local/bin/db2vec"] 64 | -------------------------------------------------------------------------------- /docs/COMPATIBLE.md: -------------------------------------------------------------------------------- 1 | # Compatibility Matrix 2 | 3 | ## Supported Vector Database Versions 4 | 5 | | Vector DB | API Version | Notes | 6 | |--------------|-------------------------------------|-------------------------------| 7 | | Pinecone | 2025-01 | Pinecone Cloud Control Plane | 8 | | Milvus | v2 | Milvus Server API v2 | 9 | | Chroma | v2 | Chroma HTTP API v2 | 10 | | Qdrant | v1.14.0 | Qdrant Server v1.14.0 | 11 | | Redis Stack | redis-stack:7.4.0-v3 (as of 30/4/2025) | Includes RedisJSON, RediSearch | 12 | | SurrealDB | v2.3.0 (as of 30/4/2025) | SurrealDB HTTP API v2.3.0 | 13 | 14 | --- 15 | 16 | ## Supported Import File Formats 17 | 18 | All sample dumps use the latest database‐specific dump format as of 30/4/2025. 
19 | 20 | | Format | Sample File | Notes | 21 | |--------------|-------------------------|---------------------------------| 22 | | MSSQL | `mssql_sample.sql` | SQLCMD export with `SET NOCOUNT ON` | 23 | | MySQL | `mysql_sample.sql` | mysqldump / standard SQL dump | 24 | | Oracle | `oracle_sample.sql` | SQL Developer / expdp format | 25 | | PostgreSQL | `postgres_sample.sql` | `pg_dump --format=plain` | 26 | | SQLite | `sqlite_sample.sql` | `sqlite3 .dump` | 27 | | SurrealDB | `surreal_sample.surql` | SurrealDB `.surql` export | 28 | 29 | --- 30 | 31 | ## Pinecone Cloud Support 32 | 33 | When `-t pinecone` is selected and `--host` is not a local URL: 34 | 35 | 1. **Create / Describe Index** 36 | * Uses the control plane `https://api.pinecone.io/indexes` 37 | * Requires `--indexes`, `--secret` (API key), `--cloud`, and `--region` 38 | * If the index does not exist, it is created with your `--dimension` and `--metric` 39 | * On `409 Conflict`, the existing index is described to retrieve its data‑plane host 40 | 41 | 2. **Data‑Plane Upserts** 42 | * Vectors are upserted to `https://` 43 | * Namespace = source table name (each table is a separate namespace) 44 | * Metadata includes a `"table": ""` field 45 | 46 | > **Note:** For local Pinecone dev images, index creation via API may not be supported. 47 | > Ensure your index exists or provide the full data‑plane URL with `--host`. 48 | 49 | ## Other Cloud-Hosted Vector Services (Untested) 50 | 51 | While we haven’t explicitly tested against managed cloud offerings beyond Pinecone, the same HTTP/API-key patterns should apply: 52 | 53 | - **Milvus Cloud** / Zilliz Cloud 54 | - **Qdrant Cloud** 55 | - **Redis Enterprise Cloud** 56 | - **Surreal Cloud** 57 | 58 | To try one of these services: 59 | 60 | 1. Set `--host` to your service’s HTTP endpoint. 61 | 2. Pass your API key or token via `--secret` and enable `--use-auth`. 62 | 3. Configure any provider-specific flags (e.g. `--indexes`, `--namespace`, etc.). 63 | 64 | db2vec uses standard REST calls and bearer-token auth under the hood, so you may find these services work out-of-the-box. Actual support may vary based on each provider’s API quirks. 65 | 66 | -------------------------------------------------------------------------------- /docs/DOCKER_SETUP.md: -------------------------------------------------------------------------------- 1 | # Local Vector Database Setup with Docker 2 | 3 | This guide provides quick‑start Docker commands for running supported vector databases locally with `db2vec`. For full details and advanced options, please refer to the official documentation links provided for each database. 
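As a quick sanity check before importing, you can confirm that the container you start below is reachable and then point `db2vec` at it. The snippet below is a minimal sketch using the Redis Stack container from this guide and one of the repository's bundled sample dumps; adjust the container name, port, and flags (`-f`, `-t`, `--vector-host`, as documented in docs/OPTION.md) to your own environment.

```bash
# Verify the local Redis Stack container accepts connections (expects PONG)
docker exec redis-stack redis-cli PING

# Then run db2vec against it with a bundled sample dump
./target/release/db2vec \
  -f samples/mysql_sample.sql \
  -t redis \
  --vector-host redis://127.0.0.1:6379
```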
4 | 5 | --- 6 | 7 | ## Pinecone (Local Development) 8 | 9 | Official docs: https://docs.pinecone.io/guides/operations/local-development#docker-cli 10 | 11 | ```bash 12 | docker run -d \ 13 | --name dense-index \ 14 | -e PORT=5081 \ 15 | -e INDEX_TYPE=serverless \ 16 | -e VECTOR_TYPE=dense \ 17 | -e DIMENSION=768 \ 18 | -e METRIC=cosine \ 19 | -p 5081:5081 \ 20 | --platform linux/amd64 \ 21 | ghcr.io/pinecone-io/pinecone-index:latest 22 | ``` 23 | 24 | --- 25 | 26 | ## SurrealDB 27 | 28 | Official docs: https://surrealdb.com/docs/surrealdb/installation/running/docker 29 | 30 | ```bash 31 | docker run -d --rm --pull always \ 32 | --name surreal \ 33 | -p 8000:8000 \ 34 | -v /mydata:/mydata \ 35 | surrealdb/surrealdb:latest \ 36 | start --user root --pass root 37 | ``` 38 | 39 | --- 40 | 41 | ## Milvus (Standalone) 42 | 43 | Official docs: https://milvus.io/docs/configure-docker.md?tab=component 44 | 45 | ```bash 46 | wget https://github.com/milvus-io/milvus/releases/download/v2.5.9/milvus-standalone-docker-compose.yml \ 47 | -O docker-compose.yml 48 | docker compose up -d 49 | ``` 50 | 51 | --- 52 | 53 | ## Redis Stack 54 | 55 | Official docs: https://hub.docker.com/r/redis/redis-stack 56 | 57 | ```bash 58 | docker run -d \ 59 | --name redis-stack \ 60 | -p 6379:6379 \ 61 | -p 8001:8001 \ 62 | redis/redis-stack:latest 63 | ``` 64 | 65 | --- 66 | 67 | ## Chroma 68 | 69 | Official docs: https://docs.trychroma.com/production/containers/docker 70 | 71 | ```bash 72 | docker run -d \ 73 | -v ./chroma-data:/data \ 74 | -p 8000:8000 \ 75 | chromadb/chroma 76 | ``` 77 | 78 | --- 79 | 80 | ## Qdrant 81 | 82 | Official docs: https://qdrant.tech/documentation/quickstart/ 83 | 84 | ```bash 85 | docker run -d \ 86 | --name qdrant \ 87 | -p 6333:6333 \ 88 | -p 6334:6334 \ 89 | -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \ 90 | qdrant/qdrant 91 | ``` 92 | 93 | --- 94 | 95 | > **Note:** Always consult the official documentation for each database for the latest setup instructions, environment variables, and recommended production configurations. 96 | > 97 | > Save this file as `DOCKER_SETUP.md` in your project root and copy the commands as needed. -------------------------------------------------------------------------------- /docs/OPTION.md: -------------------------------------------------------------------------------- 1 | # db2vec Command-Line Options 2 | 3 | Below is the full list of CLI flags, their environment-variable equivalents, defaults, and descriptions. 4 | (Note: `--tei-local-port` has been removed; only `--tei-binary-path` remains.) 5 | 6 | | Flag / Env Var | Default | Description | 7 | |-----------------------------------------------------|--------------------------|-----------------------------------------------------------------------------------------------| 8 | | -f, --data-file
DUMP_FILE | `./surreal.surql` | Path to the `.sql` / `.surql` database-dump file. | 9 | | -t, --vector-export-type
EXPORT_TYPE | `redis` | Target vector database: `redis` \| `chroma` \| `milvus` \| `qdrant` \| `surreal` \| `pinecone`.| 10 | | -u, --user
USER | `root` | Username for DB authentication (Milvus, SurrealDB). | 11 | | -p, --pass
PASS | `""` | Password for DB authentication (Milvus, SurrealDB, Redis). | 12 | | -k, --secret
SECRET | `""` | API key / token for DB auth (Chroma, Qdrant, Pinecone). | 13 | | --use-auth
AUTH | `false` | Enable authentication for the vector database. | 14 | | --debug
DEBUG | `false` | Print parsed JSON records before embedding. | 15 | | --vector-host
VECTOR_HOST | `redis://127.0.0.1:6379` | Vector-database URL or host endpoint. | 16 | | --database
DATABASE | `default_database` | Target database/collection name (Chroma, Milvus, Qdrant, Surreal). | 17 | | --indexes
INDEXES | `default_indexes` | Pinecone index name (only for `-t pinecone`). | 18 | | --cloud
CLOUD | `aws` | Pinecone cloud provider: `aws` \| `azure` \| `gcp`. | 19 | | --region
REGION | `us-east-1` | Pinecone cloud region (e.g. `us-east-1`). | 20 | | --tenant
TENANT | `default_tenant` | Tenant name for multi-tenant DBs (Chroma). | 21 | | --namespace
NAMESPACE | `default_namespace` | Namespace for SurrealDB or Pinecone. | 22 | | --dimension
DIMENSION | `768` | Vector dimension size (must match your embedding model). | 23 | | --metric
METRIC | `cosine` | Distance metric: `l2` \| `ip` \| `cosine` \| `euclidean` \| `dotproduct`. | 24 | | -m, --max-payload-size-mb
PAYLOAD_SIZE_MB | `12` | Max payload size **MB** per request (DB batch upload). | 25 | | -c, --chunk-size
CHUNK_SIZE | `10` | Number of records per batch insert. | 26 | | --embedding-provider
EMBEDDING_PROVIDER | `ollama` | Embedding provider: `ollama` (fast CPU/GPU) \| `tei` (CPU-only TEI v1.7.0) \| `google` (cloud).| 27 | | --embedding-api-key
EMBEDDING_API_KEY | _none_ | API Key for Google Gemini (required if provider=`google`). | 28 | | --embedding-model
EMBEDDING_MODEL | `nomic-embed-text` | Model name/ID for your provider (e.g. `nomic-embed-text`, `text-embedding-004`, `...-moe`). | 29 | | --embedding-url
EMBEDDING_URL | _none_ | URL endpoint for Ollama or Google embeddings (e.g. `http://localhost:11434`). | 30 | | --embedding-max-concurrency
EMBEDDING_MAX_CONCURRENCY | `4` | Parallel embedding requests. | 31 | | --embedding-batch-size
EMBEDDING_BATCH_SIZE | `16` | Number of texts per embedding batch. | 32 | | --embedding-max-tokens
EMBEDDING_MAX_TOKENS | `8192` | Max tokens per embedding request (provider-specific). | 33 | | --embedding-timeout
OLLAMA_TIMEOUT | `60` | Timeout (seconds) for embedding calls. | 34 | | --embedding-task-type
EMBEDDING_TASK_TYPE | `SEMANTIC_SIMILARITY` | Optional task type for Google Gemini API. | 35 | | --num-threads
NUM_THREADS | `0` | CPU threads for parallel tasks (0 = auto-detect). | 36 | | --group-redis
GROUP_REDIS | `false` | Group Redis records by table name (vs individual FT.CREATE/SEARCH). | 37 | | --tei-binary-path
TEI_BINARY_PATH | `tei/tei-metal` | Path to TEI binary (`tei-metal` or `tei-onnx`). If omitted, the embedded TEI is auto-extracted.| 38 | 39 | 40 | This document now reflects the removal of `--tei-local-port` and clearly lists the remaining CLI options, including how to invoke and configure the TEI binary. -------------------------------------------------------------------------------- /docs/TEI.md: -------------------------------------------------------------------------------- 1 | # TEI Provider (Text Embeddings Inference) 2 | 3 | This project ships two TEI binaries under the `tei/` folder, built from **v1.7.0**: 4 | 5 | - `tei/tei-metal` – for Apple Silicon (M1/M2) using the Metal backend 6 | - `tei/tei-onnx` – for x86_64 using the ONNX Runtime backend 7 | 8 | Feel free to build your own from source: 9 | 10 | ```bash 11 | git clone https://github.com/huggingface/text-embeddings-inference.git 12 | cd text-embeddings-inference 13 | 14 | # On x86_64 with ONNX backend (recommended) 15 | cargo install --path router -F ort 16 | 17 | # On x86_64 with Intel MKL 18 | cargo install --path router -F mkl 19 | 20 | # On Apple Silicon (M1/M2) with Metal 21 | cargo install --path router -F metal 22 | ``` 23 | 24 | You can also run the TEI router standalone: 25 | 26 | ```bash 27 | # e.g. on CPU: 28 | text-embeddings-router --model-id YOUR_MODEL_ID --port 8080 29 | ``` 30 | 31 | > Note: on Linux you may need OpenSSL & gcc: 32 | > `sudo apt-get install libssl-dev gcc -y` 33 | 34 | --- 35 | 36 | ## Using local TEI with db2vec 37 | 38 | 39 | ```bash 40 | cargo run --release -- \ 41 | -f your_dataset.surql \ 42 | -t pinecone \ 43 | --embedding-provider tei \ 44 | --tei-binary-path tei/tei-metal \ 45 | --embedding-model nomic-ai/nomic-embed-text-v2-moe \ 46 | --dimension 768 47 | ``` 48 | 49 | --tei-binary-path : path to tei-metal or tei-onnx 50 | Leave --embedding-url empty to start a local server 51 | 52 | -------------------------------------------------------------------------------- /samples/mssql_sample.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/samples/mssql_sample.sql -------------------------------------------------------------------------------- /samples/mysql_sample.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.13 Distrib 9.3.0, for Linux (aarch64) 2 | -- 3 | -- Host: localhost Database: mydb 4 | -- ------------------------------------------------------ 5 | -- Server version 9.3.0 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!50503 SET NAMES utf8mb4 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 17 | 18 | -- 19 | -- Current Database: `mydb` 20 | -- 21 | 22 | CREATE DATABASE /*!32312 IF NOT EXISTS*/ `mydb` /*!40100 DEFAULT
CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci */ /*!80016 DEFAULT ENCRYPTION='N' */; 23 | 24 | USE `mydb`; 25 | 26 | -- 27 | -- Table structure for table `products` 28 | -- 29 | 30 | DROP TABLE IF EXISTS `products`; 31 | /*!40101 SET @saved_cs_client = @@character_set_client */; 32 | /*!50503 SET character_set_client = utf8mb4 */; 33 | CREATE TABLE `products` ( 34 | `id` int NOT NULL AUTO_INCREMENT, 35 | `name` varchar(100) DEFAULT NULL, 36 | `tags` json DEFAULT NULL, 37 | `price` decimal(10,2) DEFAULT NULL, 38 | `meta` json DEFAULT NULL, 39 | PRIMARY KEY (`id`) 40 | ) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; 41 | /*!40101 SET character_set_client = @saved_cs_client */; 42 | 43 | -- 44 | -- Dumping data for table `products` 45 | -- 46 | 47 | LOCK TABLES `products` WRITE; 48 | /*!40000 ALTER TABLE `products` DISABLE KEYS */; 49 | INSERT INTO `products` VALUES (1,'products','[\"electronics\", \"computer\", \"16GB RAM\"]',1299.99,'{\"brand\": \"BrandX\", \"features\": {\"color\": \"silver\", \"weight\": 2.5}}'),(2,'Phone','[\"electronics\", \"mobile\", \"Android\"]',799.50,'{\"brand\": \"BrandY\", \"features\": {\"color\": \"black\", \"weight\": 0.2}}'),(3,'Desk','[\"furniture\", \"wood\", \"large\"]',250.00,'{\"brand\": \"BrandZ\", \"features\": {\"material\": \"oak\", \"adjustable\": true}}'); 50 | /*!40000 ALTER TABLE `products` ENABLE KEYS */; 51 | 52 | LOCK TABLES `cars` WRITE; 53 | /*!40000 ALTER TABLE `products` DISABLE KEYS */; 54 | INSERT INTO `cars` VALUES (1,'cars','[\"electronics\", \"computer\", \"16GB RAM\"]',1299.99,'{\"brand\": \"BrandX\", \"features\": {\"color\": \"silver\", \"weight\": 2.5}}'),(2,'Phone','[\"electronics\", \"mobile\", \"Android\"]',799.50,'{\"brand\": \"BrandY\", \"features\": {\"color\": \"black\", \"weight\": 0.2}}'),(3,'Desk','[\"furniture\", \"wood\", \"large\"]',250.00,'{\"brand\": \"BrandZ\", \"features\": {\"material\": \"oak\", \"adjustable\": true}}'); 55 | /*!40000 ALTER TABLE `products` ENABLE KEYS */; 56 | UNLOCK TABLES; 57 | LOCK TABLES `home` WRITE; 58 | /*!40000 ALTER TABLE `products` DISABLE KEYS */; 59 | INSERT INTO `home` VALUES (1,'home','[\"electronics\", \"computer\", \"16GB RAM\"]',1299.99,'{\"brand\": \"BrandX\", \"features\": {\"color\": \"silver\", \"weight\": 2.5}}'),(2,'Phone','[\"electronics\", \"mobile\", \"Android\"]',799.50,'{\"brand\": \"BrandY\", \"features\": {\"color\": \"black\", \"weight\": 0.2}}'),(3,'Desk','[\"furniture\", \"wood\", \"large\"]',250.00,'{\"brand\": \"BrandZ\", \"features\": {\"material\": \"oak\", \"adjustable\": true}}'); 60 | /*!40000 ALTER TABLE `products` ENABLE KEYS */; 61 | UNLOCK TABLES; 62 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 63 | 64 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 65 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 66 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; 67 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 68 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 69 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 70 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 71 | 72 | -- Dump completed on 2025-04-19 1:45:18 73 | -------------------------------------------------------------------------------- /samples/oracle_sample.sql: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------- 2 | -- File created - Tuesday-April-22-2025 3 | 
-------------------------------------------------------- 4 | REM INSERTING into SYSTEM.CUSTOMER_PROFILES 5 | SET DEFINE OFF; 6 | Insert into SYSTEM.CUSTOMER_PROFILES (ID,NAME,EMAIL) values (1,'John Smith','john.smith@example.com'); 7 | Insert into SYSTEM.CUSTOMER_PROFILES (ID,NAME,EMAIL) values (2,'Maria Garcia','maria.garcia@example.com'); 8 | Insert into SYSTEM.CUSTOMER_PROFILES (ID,NAME,EMAIL) values (3,'Ahmed Khan','ahmed.khan@example.com'); 9 | REM INSERTING into SYSTEM.PRODUCTS 10 | SET DEFINE OFF; 11 | Insert into SYSTEM.PRODUCTS (ID,NAME,PRICE,DESCRIPTION) values (1,'Premium Laptop',1299.99,'High-performance laptop for professionals'); 12 | Insert into SYSTEM.PRODUCTS (ID,NAME,PRICE,DESCRIPTION) values (2,'Ergonomic Office Chair',249.99,'Comfortable chair with lumbar support'); 13 | Insert into SYSTEM.PRODUCTS (ID,NAME,PRICE,DESCRIPTION) values (3,'Smart Home Hub',179.5,'Voice-controlled smart home central hub with wireless connectivity'); 14 | REM INSERTING into SYSTEM.ANALYTICS_DATA 15 | SET DEFINE OFF; 16 | Insert into SYSTEM.ANALYTICS_DATA (ID,EVENT_TYPE,TIMESTAMP,USER_ID) values (1,'page_view',to_timestamp('20 APR 2025 08.15.32.452000000','DD MON RRRR HH24.MI.SSXFF'),1001); 17 | Insert into SYSTEM.ANALYTICS_DATA (ID,EVENT_TYPE,TIMESTAMP,USER_ID) values (2,'purchase',to_timestamp('20 APR 2025 10.38.14.129000000','DD MON RRRR HH24.MI.SSXFF'),1042); 18 | Insert into SYSTEM.ANALYTICS_DATA (ID,EVENT_TYPE,TIMESTAMP,USER_ID) values (3,'search',to_timestamp('21 APR 2025 15.22.40.781000000','DD MON RRRR HH24.MI.SSXFF'),1098); 19 | -------------------------------------------------------- 20 | -- DDL for Index SYS_C008435 21 | -------------------------------------------------------- 22 | 23 | CREATE UNIQUE INDEX "SYSTEM"."SYS_C008435" ON "SYSTEM"."CUSTOMER_PROFILES" ("ID") 24 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 25 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 26 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 27 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 28 | TABLESPACE "SYSTEM" ; 29 | -------------------------------------------------------- 30 | -- DDL for Index IDX_CUSTOMER_EMAIL 31 | -------------------------------------------------------- 32 | 33 | CREATE INDEX "SYSTEM"."IDX_CUSTOMER_EMAIL" ON "SYSTEM"."CUSTOMER_PROFILES" ("EMAIL") 34 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 35 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 36 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 37 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 38 | TABLESPACE "SYSTEM" ; 39 | -------------------------------------------------------- 40 | -- DDL for Index SYS_C008434 41 | -------------------------------------------------------- 42 | 43 | CREATE UNIQUE INDEX "SYSTEM"."SYS_C008434" ON "SYSTEM"."PRODUCTS" ("ID") 44 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 45 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 46 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 47 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 48 | TABLESPACE "SYSTEM" ; 49 | -------------------------------------------------------- 50 | -- DDL for Index IDX_PRODUCTS_NAME 51 | -------------------------------------------------------- 52 | 53 | CREATE INDEX "SYSTEM"."IDX_PRODUCTS_NAME" ON "SYSTEM"."PRODUCTS" ("NAME") 54 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 55 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 56 | PCTINCREASE 0 
FREELISTS 1 FREELIST GROUPS 1 57 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 58 | TABLESPACE "SYSTEM" ; 59 | -------------------------------------------------------- 60 | -- DDL for Index SYS_C008436 61 | -------------------------------------------------------- 62 | 63 | CREATE UNIQUE INDEX "SYSTEM"."SYS_C008436" ON "SYSTEM"."ANALYTICS_DATA" ("ID") 64 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 65 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 66 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 67 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 68 | TABLESPACE "SYSTEM" ; 69 | -------------------------------------------------------- 70 | -- DDL for Index IDX_ANALYTICS_EVENT_TYPE 71 | -------------------------------------------------------- 72 | 73 | CREATE INDEX "SYSTEM"."IDX_ANALYTICS_EVENT_TYPE" ON "SYSTEM"."ANALYTICS_DATA" ("EVENT_TYPE") 74 | PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 75 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 76 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 77 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 78 | TABLESPACE "SYSTEM" ; 79 | -------------------------------------------------------- 80 | -- Constraints for Table CUSTOMER_PROFILES 81 | -------------------------------------------------------- 82 | 83 | ALTER TABLE "SYSTEM"."CUSTOMER_PROFILES" MODIFY ("ID" NOT NULL ENABLE); 84 | ALTER TABLE "SYSTEM"."CUSTOMER_PROFILES" ADD PRIMARY KEY ("ID") 85 | USING INDEX PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 86 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 87 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 88 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 89 | TABLESPACE "SYSTEM" ENABLE; 90 | -------------------------------------------------------- 91 | -- Constraints for Table PRODUCTS 92 | -------------------------------------------------------- 93 | 94 | ALTER TABLE "SYSTEM"."PRODUCTS" MODIFY ("ID" NOT NULL ENABLE); 95 | ALTER TABLE "SYSTEM"."PRODUCTS" ADD PRIMARY KEY ("ID") 96 | USING INDEX PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 97 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 98 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 99 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 100 | TABLESPACE "SYSTEM" ENABLE; 101 | ALTER TABLE "SYSTEM"."PRODUCTS" ADD CONSTRAINT "PRODUCTS_METADATA_JSON" CHECK (METADATA IS JSON) ENABLE; 102 | -------------------------------------------------------- 103 | -- Constraints for Table ANALYTICS_DATA 104 | -------------------------------------------------------- 105 | 106 | ALTER TABLE "SYSTEM"."ANALYTICS_DATA" MODIFY ("ID" NOT NULL ENABLE); 107 | ALTER TABLE "SYSTEM"."ANALYTICS_DATA" ADD PRIMARY KEY ("ID") 108 | USING INDEX PCTFREE 10 INITRANS 2 MAXTRANS 255 COMPUTE STATISTICS 109 | STORAGE(INITIAL 65536 NEXT 1048576 MINEXTENTS 1 MAXEXTENTS 2147483645 110 | PCTINCREASE 0 FREELISTS 1 FREELIST GROUPS 1 111 | BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT) 112 | TABLESPACE "SYSTEM" ENABLE; 113 | -------------------------------------------------------------------------------- /samples/profile_sample.txt: -------------------------------------------------------------------------------- 1 | -- Define the database 2 | DEFINE DATABASE portfolio; 3 | 4 | -- defined database 5 | DEFINE NAMESPACE IF NOT EXISTS surreal; 6 | 7 | -- use namespace and database 8 | USE NS surreal; 9 | 
USE DB portfolio; 10 | 11 | -- create table 12 | DEFINE TABLE profile; 13 | DEFINE TABLE contact; 14 | DEFINE TABLE experience; 15 | DEFINE TABLE portfolio; 16 | DEFINE TABLE education; 17 | DEFINE TABLE language; 18 | DEFINE TABLE skill; 19 | 20 | -- Insert data into the profile table 21 | INSERT INTO profile { 22 | about: "I am newbie developer", 23 | avatar: "https://raw.githubusercontent.com/marwin1991/profile-technology-icons/refs/heads/main/icons/github.png", 24 | birth_date: "2000-01-01", 25 | first_name: "John", 26 | gender: "Male", 27 | pdf: { 28 | show_about: true, 29 | show_contact: true, 30 | show_education: true, 31 | show_experience: true, 32 | show_language: true, 33 | show_portfolio: true, 34 | show_profile: true, 35 | show_avatar: true, 36 | show_skill: true, 37 | use_about_pdf_version: false, 38 | use_avatar_pdf_version:false, 39 | use_generate: true, 40 | use_pdf: true 41 | }, 42 | last_name: "Doe", 43 | nationality: "US", 44 | nick_name: "Mr.Robot", 45 | address: "CA USA" 46 | role: "Junior Developer" 47 | }; 48 | 49 | -- Insert data into the contact table 50 | INSERT INTO contact { 51 | contact_icon: "Facebook", 52 | contact_value: "https://www.facebook.com/zuck/", 53 | use_link: true 54 | }; 55 | -- Insert data into the language table 56 | INSERT INTO language { 57 | level: "Native", 58 | name: "English" 59 | }; 60 | 61 | INSERT INTO contact { 62 | contact_title: "My Email", 63 | contact_icon: "Mail", 64 | contact_value: "my@email.com", 65 | use_link: false 66 | }; 67 | 68 | -- Insert data into the experience table 69 | INSERT INTO experience { 70 | company_logo_url: "https://seeklogo.com/images/A/avengers-logo-5B0A68AFB3-seeklogo.com.png", 71 | company_name: "Avengers Team", 72 | company_url: "https://en.wikipedia.org/wiki/List_of_Avengers_members", 73 | describe: "Assisted in retrieving and securing dangerous alien technology (Captain America: Civil War).\nEngaged in high-stakes urban combat during Battle of New York (Infinity War).\nParticipated in intergalactic rescue missions; fought Thanos’ army on Titan.\nBlipped out of existence for five years, then returned to help in the final battle against Thanos (Endgame).\nProvided support in rebuilding efforts post-Blip, maintaining neighborhood security.", 74 | end_date: "2004-01-01", 75 | position_name: "Spider Man", 76 | company_address:"Hollywood USA", 77 | 78 | use_describe_pdf_version: false, 79 | start_date: "2000-01-01" 80 | }; 81 | 82 | -- Insert data into the experience table 83 | INSERT INTO education { 84 | degree: "bachelor's degree", 85 | gpa: "4.00", 86 | graduated_year: "2010", 87 | institute_address: "CA USA", 88 | institute_logo_url: "https://identity.stanford.edu/wp-content/uploads/sites/3/2020/07/SU_SealColor_web3.png", 89 | institute_name: "Stanford University", 90 | major: "computer science" 91 | }; 92 | 93 | -- Insert data into the portfolio table 94 | INSERT INTO portfolio { 95 | uuid: "0a6fb385-39ca-4a4f-8e8b-4ed1643462d7", 96 | index:0, 97 | is_opensource: false, 98 | portfolio_detail: "Fullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin systemFullstack rust portfolio project with admin system", 99 | portfolio_icon_url: 
"https://cdn-icons-png.flaticon.com/512/25/25231.png", 100 | portfolio_link: "https://github.com/zelda2003/leptos_portfolio_admin", 101 | portfolio_name: "Leptos Portfolio Admin", 102 | screenshots_url: [ 103 | "https://149842033.v2.pressablecdn.com/wp-content/uploads/2019/03/breed2-free-portfolio-website-templates.jpg", 104 | "https://themewagon.com/wp-content/uploads/2021/11/html.design.jpg" 105 | ], 106 | stacks: [ 107 | "Rust", 108 | "Leptos", 109 | "ActixWeb", 110 | "Tailwind", 111 | "SurrealDB" 112 | ], 113 | use_portfolio_detail_pdf_version: false 114 | }; 115 | 116 | -- Insert data into the skill table 117 | INSERT INTO skill { 118 | level: "Middle", 119 | name: "Postgresql" 120 | }; 121 | 122 | INSERT INTO skill { 123 | level: "Middle", 124 | name: "MongoDB" 125 | }; 126 | -------------------------------------------------------------------------------- /samples/sqlite_sample.sql: -------------------------------------------------------------------------------- 1 | PRAGMA foreign_keys=OFF; 2 | BEGIN TRANSACTION; 3 | CREATE TABLE items ( 4 | id INTEGER PRIMARY KEY AUTOINCREMENT, 5 | name TEXT NOT NULL, 6 | description TEXT, 7 | tags TEXT, -- To store comma-separated tags or array-like strings 8 | attributes TEXT, -- To store JSON-like strings 9 | price REAL, 10 | is_active BOOLEAN DEFAULT 1, 11 | created_at DATETIME DEFAULT CURRENT_TIMESTAMP 12 | ); 13 | INSERT INTO items VALUES(1,'Laptop','A standard laptop',NULL,NULL,1200.5,1,'2025-04-20 02:04:22'); 14 | INSERT INTO items VALUES(2,'Keyboard','Mechanical keyboard','["gaming", "rgb", "mechanical"]','{"brand": "Keychron", "switches": "brown", "layout": "TKL"}',99.9899999999999949,1,'2025-04-20 02:04:22'); 15 | INSERT INTO items VALUES(3,'Mouse',NULL,'','',25.0,0,'2025-04-20 02:04:22'); 16 | INSERT INTO items VALUES(4,'Monitor','4K Monitor','["large", "4k", "ips", "monitor, curved"]','{"resolution": "3840x2160", "size_inches": 27, "ports": ["HDMI", "DP"]}',350.75,1,'2025-04-20 02:04:22'); 17 | INSERT INTO items VALUES(5,'Webcam','1080p Webcam','video, conference, usb',NULL,45.0,1,'2025-04-20 02:04:22'); 18 | INSERT INTO items VALUES(6,'Desk Chair','Ergonomic office chair','["furniture", "office", "ergonomic"]','{"material": "mesh", "color": "black", "adjustments": {"height": true, "lumbar": "fixed"}}',180.0,1,'2025-04-20 02:04:23'); 19 | DELETE FROM sqlite_sequence; 20 | INSERT INTO sqlite_sequence VALUES('items',6); 21 | COMMIT; 22 | CREATE TABLE drug ( 23 | id INTEGER PRIMARY KEY AUTOINCREMENT, 24 | name TEXT NOT NULL, 25 | description TEXT, 26 | tags TEXT, -- To store comma-separated tags or array-like strings 27 | attributes TEXT, -- To store JSON-like strings 28 | price REAL, 29 | is_active BOOLEAN DEFAULT 1, 30 | created_at DATETIME DEFAULT CURRENT_TIMESTAMP 31 | ); 32 | INSERT INTO drug VALUES(1,'drug','A standard laptop',NULL,NULL,1200.5,1,'2025-04-20 02:04:22'); 33 | INSERT INTO drug VALUES(2,'Keyboard','Mechanical keyboard','["gaming", "rgb", "mechanical"]','{"brand": "Keychron", "switches": "brown", "layout": "TKL"}',99.9899999999999949,1,'2025-04-20 02:04:22'); 34 | INSERT INTO drug VALUES(3,'Mouse',NULL,'','',25.0,0,'2025-04-20 02:04:22'); 35 | INSERT INTO drug VALUES(4,'Monitor','4K Monitor','["large", "4k", "ips", "monitor, curved"]','{"resolution": "3840x2160", "size_inches": 27, "ports": ["HDMI", "DP"]}',350.75,1,'2025-04-20 02:04:22'); 36 | INSERT INTO drug VALUES(5,'Webcam','1080p Webcam','video, conference, usb',NULL,45.0,1,'2025-04-20 02:04:22'); 37 | INSERT INTO drug VALUES(6,'Desk Chair','Ergonomic office 
chair','["furniture", "office", "ergonomic"]','{"material": "mesh", "color": "black", "adjustments": {"height": true, "lumbar": "fixed"}}',180.0,1,'2025-04-20 02:04:23'); 38 | DELETE FROM sqlite_sequence; 39 | INSERT INTO sqlite_sequence VALUES('items',6); 40 | COMMIT; 41 | -------------------------------------------------------------------------------- /samples/surreal_sample.surql: -------------------------------------------------------------------------------- 1 | -- ------------------------------ 2 | -- OPTION 3 | -- ------------------------------ 4 | 5 | OPTION IMPORT; 6 | 7 | -- ------------------------------ 8 | -- TABLE: products 9 | -- ------------------------------ 10 | 11 | DEFINE TABLE products TYPE ANY SCHEMALESS PERMISSIONS NONE; 12 | 13 | 14 | 15 | 16 | -- ------------------------------ 17 | -- TABLE DATA: products 18 | -- ------------------------------ 19 | 20 | INSERT [ { id: products:dn5pqchc33kxacvcf7x6, name: 'Phone', price: 799.5f, tags: ['electronics', 'mobile', 'Android'], test: { a: 'b', c: 'd', e: 1 } }, { id: products:jqkzvhosxcme1dzert7y, name: 'Desk', price: 250, tags: ['furniture', 'wood', 'large'], test: { a: 'b', c: 'd', e: 1 } }, { id: products:zzbpfmkkmcj4lxqn5knf, name: 'Laptop', price: 1299.99f, tags: ['electronics', 'computer', '16GB RAM'] } ]; 21 | 22 | -------------------------------------------------------------------------------- /src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | #[derive(Parser, Debug, Clone)] 4 | #[command(author, version, about, long_about = None)] 5 | pub struct Args { 6 | /// Path to the .sql/.surql database dump file to process 7 | #[arg(short = 'f', env = "DUMP_FILE", long, default_value = "./surreal.surql")] 8 | pub dump_file: String, 9 | 10 | /// Target vector database: redis|chroma|milvus|qdrant|surreal|pinecone 11 | #[arg(short = 't', env = "EXPORT_TYPE", long, default_value = "redis")] 12 | pub vector_export_type: String, 13 | 14 | /// Username for database authentication (Milvus, SurrealDB) 15 | #[arg(short = 'u', env = "USER", long, default_value = "root")] 16 | pub user: String, 17 | 18 | /// Password for database authentication (Milvus, SurrealDB, Redis) 19 | #[arg(short = 'p', env = "PASS", long, default_value = "")] 20 | pub pass: String, 21 | 22 | /// API key/token for database authentication (Chroma, Qdrant, Pinecone) 23 | #[arg(short = 'k', env = "SECRET", long, default_value = "")] 24 | pub secret: String, 25 | 26 | /// Enable authentication for the vector database 27 | #[arg(long, env = "AUTH", default_value = "false")] 28 | pub use_auth: bool, 29 | 30 | /// Print parsed JSON records before embedding (debug mode) 31 | #[arg(long, env = "DEBUG", default_value = "false")] 32 | pub debug: bool, 33 | 34 | /// Vector database URL/host endpoint (e.g. redis://127.0.0.1:6379) 35 | #[arg(long, env = "VECTOR_HOST", default_value = "redis://127.0.0.1:6379")] 36 | pub vector_host: String, 37 | 38 | /// Target database name (Chroma, Milvus, Qdrant, Surreal) 39 | #[arg(long, env = "DATABASE", default_value = "default_database")] 40 | pub database: String, 41 | 42 | /// Pinecone index name (only for -t pinecone) 43 | #[arg(long, env = "INDEXES", default_value = "default_indexes")] 44 | pub indexes: String, 45 | 46 | /// Pinecone cloud provider: aws|azure|gcp 47 | #[arg(long, env = "CLOUD", default_value = "aws")] 48 | pub cloud: String, 49 | 50 | /// Pinecone cloud region, e.g. 
us-east-1 51 | #[arg(long, env = "REGION", default_value = "us-east-1")] 52 | pub region: String, 53 | 54 | /// Tenant name for multi-tenant DBs (Chroma) 55 | #[arg(long, env = "TENANT", default_value = "default_tenant")] 56 | pub tenant: String, 57 | 58 | /// Namespace for databases that support it (SurrealDB, Pinecone) 59 | #[arg(long, env = "NAMESPACE", default_value = "default_namespace")] 60 | pub namespace: String, 61 | 62 | /// Vector dimension size (must match your embedding model) 63 | #[arg(long, env = "DIMENSION", default_value = "768")] 64 | pub dimension: usize, 65 | 66 | /// Distance metric: l2|ip|cosine|euclidean|dotproduct 67 | #[arg(long, env = "METRIC", default_value = "cosine")] 68 | pub metric: String, 69 | 70 | /// Max payload size (MB) per request 71 | #[arg(short = 'm', env = "PAYLOAD_SIZE_MB", long, default_value = "12")] 72 | pub max_payload_size_mb: usize, 73 | 74 | /// Batch size for DB inserts 75 | #[arg(short = 'c', env = "CHUNK_SIZE", long, default_value = "10")] 76 | pub chunk_size: usize, 77 | 78 | /// Which embedding provider to use: ollama, tei, or google 79 | #[arg(long, env = "EMBEDDING_PROVIDER", default_value = "ollama")] 80 | pub embedding_provider: String, 81 | 82 | /// API Key for Google Gemini (required if --embedding-provider=google) 83 | #[arg(long, env = "EMBEDDING_API_KEY")] 84 | pub embedding_api_key: Option, 85 | 86 | /// Embedding model name/id (e.g. nomic-embed-text, text-embedding-004, nomic-embed-text-v2-moe) 87 | #[arg(long, env = "EMBEDDING_MODEL", default_value = "nomic-embed-text")] 88 | pub embedding_model: String, 89 | 90 | /// URL endpoint for Ollama or Google embeddings 91 | #[arg(long, env = "EMBEDDING_URL")] 92 | pub embedding_url: Option, 93 | 94 | /// Parallel embedding requests 95 | #[arg(long, env = "EMBEDDING_MAX_CONCURRENCY", default_value = "4")] 96 | pub embedding_concurrency: usize, 97 | 98 | /// Number of texts per embedding batch 99 | #[arg(long, env = "EMBEDDING_BATCH_SIZE", default_value = "16")] 100 | pub embedding_batch_size: usize, 101 | 102 | /// Max tokens per embedding request (provider-specific) 103 | #[arg(long, env = "EMBEDDING_MAX_TOKENS", default_value = "8192")] 104 | pub embedding_max_tokens: usize, 105 | 106 | /// Timeout (seconds) for embedding calls 107 | #[arg(long, env = "OLLAMA_TIMEOUT", default_value = "60")] 108 | pub embedding_timeout: u64, 109 | 110 | /// Task type for Google Gemini (default: SEMANTIC_SIMILARITY) 111 | #[arg(long, env = "EMBEDDING_TASK_TYPE", default_value = "SEMANTIC_SIMILARITY")] 112 | pub embedding_task_type: String, 113 | 114 | /// CPU threads for parallel tasks (0 = auto detect) 115 | #[arg(long, env = "NUM_THREADS", default_value = "0")] 116 | pub num_threads: usize, 117 | 118 | /// Group Redis records by table name if true (else use FT.CREATE/SEARCH) 119 | #[arg(long, env = "GROUP_REDIS", default_value = "false")] 120 | pub group_redis: bool, 121 | 122 | /// Path to TEI binary (tei-metal or tei-onnx). 123 | /// If you omit this, the embedded TEI will be extracted & launched. 
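    /// See docs/TEI.md for the bundled binaries and instructions on building your own.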
124 | #[arg(long, env = "TEI_BINARY_PATH", default_value = "tei/tei-metal")] 125 | pub tei_binary_path: String, 126 | 127 | /// Port for the managed TEI server (only used if starting TEI locally) 128 | #[arg(long, env = "TEI_LOCAL_PORT", default_value_t = 8080)] 129 | pub tei_local_port: u16, 130 | 131 | /// Apply exclusion rules from config/exclude.json to remove sensitive fields 132 | #[arg(long, env = "USE_EXCLUDE", default_value = "false")] 133 | pub use_exclude: bool, 134 | } 135 | -------------------------------------------------------------------------------- /src/db/chroma.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use reqwest::blocking::Client; 3 | use serde_json::Value; 4 | use super::{ Database, DbError }; 5 | 6 | pub struct ChromaDatabase { 7 | client: Client, 8 | url: String, 9 | tenant: String, 10 | database: String, 11 | dimension: usize, 12 | auth_token: Option, 13 | metric: String, 14 | } 15 | 16 | impl ChromaDatabase { 17 | pub fn new(args: &crate::cli::Args) -> Result { 18 | let url = format!("{}/api/v2", args.vector_host.trim_end_matches('/')); 19 | let tenant = args.tenant.clone(); 20 | let database = args.database.clone(); 21 | let dimension = args.dimension; 22 | let client = Client::new(); 23 | let auth_token = if args.use_auth && !args.secret.is_empty() { 24 | Some(args.secret.clone()) 25 | } else { 26 | None 27 | }; 28 | 29 | let metric = args.metric.clone(); 30 | Ok(ChromaDatabase { 31 | client, 32 | url, 33 | tenant, 34 | database, 35 | dimension, 36 | auth_token, 37 | metric, 38 | }) 39 | } 40 | } 41 | 42 | impl Database for ChromaDatabase { 43 | 44 | 45 | fn store_vector( 46 | &self, 47 | table: &str, 48 | items: &[(String, Vec, Value)] 49 | ) -> Result<(), DbError> { 50 | if items.is_empty() { 51 | return Ok(()); 52 | } 53 | 54 | let normalized_table = table.to_lowercase(); 55 | if normalized_table != table { 56 | info!("Normalizing Chroma collection name '{}' to '{}'", table, normalized_table); 57 | } 58 | 59 | let dbs_url = format!("{}/tenants/{}/databases", self.url, self.tenant); 60 | let mut list_dbs_req = self.client.get(&dbs_url); 61 | if let Some(ref token) = self.auth_token { 62 | list_dbs_req = list_dbs_req.header("X-Chroma-Token", token); 63 | } 64 | let dbs_json: Value = list_dbs_req.send()?.json()?; 65 | let db_exists = dbs_json 66 | .as_array() 67 | .map(|arr| arr.iter().any(|db| db["name"].as_str() == Some(&self.database))) 68 | .unwrap_or(false); 69 | if !db_exists { 70 | info!("Creating Chroma database '{}'", self.database); 71 | let mut create_db_req = self.client 72 | .post(&dbs_url) 73 | .json(&serde_json::json!({ "name": self.database })); 74 | if let Some(ref token) = self.auth_token { 75 | create_db_req = create_db_req.header("X-Chroma-Token", token); 76 | } 77 | let create_db_res = create_db_req.send()?; 78 | if !create_db_res.status().is_success() { 79 | let err = create_db_res.text()?; 80 | return Err( 81 | format!("Failed to create Chroma database '{}': {}", self.database, err).into() 82 | ); 83 | } 84 | info!("Chroma database '{}' created", self.database); 85 | } 86 | 87 | let collections_url = format!( 88 | "{}/tenants/{}/databases/{}/collections", 89 | self.url, 90 | self.tenant, 91 | self.database 92 | ); 93 | let mut list_req = self.client.get(&collections_url); 94 | if let Some(ref token) = self.auth_token { 95 | list_req = list_req.header("X-Chroma-Token", token); 96 | } 97 | let cols_res = list_req.send()?; 98 | let cols_json: Value = 
cols_res.json()?; 99 | let mut collection_id: Option = None; 100 | if let Some(arr) = cols_json.as_array() { 101 | for col in arr { 102 | if col["name"].as_str() == Some(&normalized_table) { 103 | collection_id = col["id"].as_str().map(|s| s.to_string()); 104 | break; 105 | } 106 | } 107 | } 108 | let collection_id = match collection_id { 109 | Some(id) => id, 110 | None => { 111 | let col_body = 112 | serde_json::json!({ 113 | "name": normalized_table, 114 | "dimension": self.dimension, 115 | "configuration_json": { 116 | "embedding_function": null, 117 | "hnsw": { 118 | "space": self.metric, 119 | "ef_construction": 100, 120 | "ef_search": 100, 121 | "max_neighbors": 16, 122 | "resize_factor": 1.2, 123 | "sync_threshold": 1000 124 | }, 125 | "spann": null 126 | } 127 | }); 128 | let mut col_req = self.client.post(&collections_url).json(&col_body); 129 | if let Some(ref token) = self.auth_token { 130 | col_req = col_req.header("X-Chroma-Token", token); 131 | } 132 | let col_res = col_req.send()?; 133 | let col_json: Value = col_res.json()?; 134 | debug!("Chroma create collection response: {}", col_json); 135 | 136 | col_json 137 | .get("id") 138 | .or_else(|| col_json.get("collection_id")) 139 | .and_then(|v| v.as_str()) 140 | .ok_or_else(|| { 141 | format!("Failed to get new collection id, response: {}", col_json) 142 | })? 143 | .to_string() 144 | } 145 | }; 146 | 147 | let ids: Vec = items 148 | .iter() 149 | .map(|(id, _, _)| format!("{}:{}", normalized_table, id)) 150 | .collect(); 151 | let embeddings: Vec> = items 152 | .iter() 153 | .map(|(id, vec, _)| { 154 | if vec.is_empty() { 155 | warn!("ID='{}': Empty vector received, inserting dummy value", id); 156 | vec![0.1] 157 | } else if vec.len() != self.dimension { 158 | warn!( 159 | "ID='{}': Vector length {} != collection dimension {}, fixing", 160 | id, 161 | vec.len(), 162 | self.dimension 163 | ); 164 | let mut fixed_vec = vec![0.0; self.dimension]; 165 | for (i, val) in vec.iter().enumerate().take(self.dimension) { 166 | fixed_vec[i] = *val; 167 | } 168 | fixed_vec 169 | } else { 170 | vec.clone() 171 | } 172 | }) 173 | .collect(); 174 | let documents: Vec = items 175 | .iter() 176 | .map(|_| String::new()) 177 | .collect(); 178 | let metadatas: Vec = items 179 | .iter() 180 | .map(|(_, _, m)| { 181 | if let Some(map) = m.as_object() { 182 | let mut simple = serde_json::Map::new(); 183 | for (k, v) in map { 184 | if v.is_string() || v.is_number() || v.is_boolean() { 185 | simple.insert(k.clone(), v.clone()); 186 | } 187 | } 188 | if simple.is_empty() { 189 | Value::Null 190 | } else { 191 | Value::Object(simple) 192 | } 193 | } else { 194 | Value::Null 195 | } 196 | }) 197 | .collect(); 198 | 199 | let body = 200 | serde_json::json!({ 201 | "ids": ids, 202 | "embeddings": embeddings, 203 | "documents": documents, 204 | "metadatas": metadatas 205 | }); 206 | 207 | let add_url = format!( 208 | "{}/tenants/{}/databases/{}/collections/{}/add", 209 | self.url, 210 | self.tenant, 211 | self.database, 212 | collection_id 213 | ); 214 | let mut req = self.client.post(&add_url).json(&body); 215 | if let Some(ref token) = self.auth_token { 216 | req = req.header("X-Chroma-Token", token); 217 | } 218 | let resp = req.send()?; 219 | 220 | let status = resp.status(); 221 | let body_text = resp.text()?; 222 | debug!("Chroma insert response ({}): {}", status, body_text); 223 | 224 | if status.is_success() { 225 | info!("Chroma: inserted {} vectors into '{}' (original: '{}')", 226 | items.len(), normalized_table, table); 227 | Ok(()) 228 | } 
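// Note on the branch below: Chroma can report "Error in compaction" on an /add
// request even when the write itself may have landed, so that specific error is
// treated as a soft failure (warned and counted as a potential insert) instead of
// aborting the whole import.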
else if body_text.contains("Error in compaction") { 229 | warn!("Chroma compaction error during insert (ignored): {}", body_text); 230 | info!( 231 | "Chroma: potentially inserted {} vectors into '{}' despite compaction error", 232 | items.len(), 233 | normalized_table 234 | ); 235 | Ok(()) 236 | } else { 237 | Err(format!("Chroma bulk insert failed: {}", body_text).into()) 238 | } 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /src/db/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod redis; 2 | pub mod qdrant; 3 | pub mod chroma; 4 | pub mod milvus; 5 | pub mod surreal; 6 | pub mod pinecone; 7 | pub use redis::RedisDatabase; 8 | pub use milvus::MilvusDatabase; 9 | pub use qdrant::QdrantDatabase; 10 | pub use chroma::ChromaDatabase; 11 | pub use surreal::SurrealDatabase; 12 | pub use pinecone::PineconeDatabase; 13 | use serde_json::Value; 14 | use std::error::Error; 15 | use crate::cli::Args; 16 | 17 | pub type DbError = Box; 18 | 19 | pub trait Database: Send + Sync { 20 | 21 | fn store_vector(&self, table: &str, items: &[(String, Vec, Value)]) -> Result<(), DbError>; 22 | } 23 | 24 | pub fn select_database(args: &Args) -> Result, DbError> { 25 | let database: Box = match args.vector_export_type.as_str() { 26 | "redis" => Box::new(RedisDatabase::new(args)?), 27 | "qdrant" => Box::new(QdrantDatabase::new(args)?), 28 | "chroma" => Box::new(ChromaDatabase::new(args)?), 29 | "milvus" => Box::new(MilvusDatabase::new(args)?), 30 | "surreal" => Box::new(SurrealDatabase::new(args)?), 31 | "pinecone" => Box::new(PineconeDatabase::new(args)?), 32 | _ => { 33 | return Err("Unsupported database type".into()); 34 | } 35 | }; 36 | 37 | Ok(database) 38 | } 39 | 40 | pub fn store_in_batches( 41 | db: &dyn Database, 42 | table: &str, 43 | items: &[(String, Vec, Value)], 44 | max_bytes: usize 45 | ) -> Result<(), DbError> { 46 | let mut start = 0; 47 | let mut cur_size = 0; 48 | for (i, (id, vec, meta)) in items.iter().enumerate() { 49 | let meta_json = serde_json::to_string(meta)?; 50 | let rec_size = id.len() + vec.len() * 4 + meta_json.len(); 51 | if cur_size + rec_size > max_bytes && start < i { 52 | db.store_vector(table, &items[start..i])?; 53 | start = i; 54 | cur_size = rec_size; 55 | } else { 56 | cur_size += rec_size; 57 | } 58 | } 59 | if start < items.len() { 60 | db.store_vector(table, &items[start..])?; 61 | } 62 | Ok(()) 63 | } 64 | -------------------------------------------------------------------------------- /src/db/pinecone.rs: -------------------------------------------------------------------------------- 1 | use reqwest::blocking::Client; 2 | use serde_json::{ Value, json }; 3 | use log::{ info, warn, error }; 4 | use super::{ Database, DbError }; 5 | 6 | pub struct PineconeDatabase { 7 | control_plane_url: String, 8 | data_plane_url: String, 9 | client: Client, 10 | api_version: String, 11 | api_key: Option, 12 | use_auth: bool, 13 | dimension: usize, 14 | } 15 | 16 | impl PineconeDatabase { 17 | pub fn new(args: &crate::cli::Args) -> Result { 18 | let client = Client::new(); 19 | let api_version = "2025-01".to_string(); 20 | let is_local = 21 | args.vector_host.contains("localhost") || 22 | args.vector_host.contains("127.0.0.1") || 23 | args.vector_host.contains("::1"); 24 | 25 | let control_plane_url = if is_local { 26 | args.vector_host.clone() 27 | } else { 28 | "https://api.pinecone.io".to_string() 29 | }; 30 | 31 | let mut parsed_host_from_create: Option = None; 32 | 33 | 
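// The block below talks to the Pinecone control plane: it attempts to create a
// serverless index from the CLI settings and, if the index already exists
// (HTTP 409), falls back to describing it so the returned `host` can be used as
// the data-plane URL for upserts. Illustrative request body (the values shown are
// the documented CLI defaults, not hard-coded):
//
//   { "name": "default_indexes", "dimension": 768, "metric": "cosine",
//     "spec": { "serverless": { "cloud": "aws", "region": "us-east-1" } } }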
if !args.indexes.is_empty() && !is_local { 34 | let index_name = args.indexes.as_str(); 35 | let endpoint = "indexes"; 36 | let url = format!("{}/{}", control_plane_url, endpoint); 37 | 38 | let spec = json!({ "serverless": { "cloud": args.cloud, "region": args.region } }); 39 | let body = 40 | json!({ "name": index_name, "dimension": args.dimension, "metric": args.metric, "spec": spec }); 41 | 42 | let mut req = client 43 | .post(&url) 44 | .header("Content-Type", "application/json") 45 | .header("X-Pinecone-API-Version", &api_version) 46 | .json(&body); 47 | 48 | if args.secret.is_empty() { 49 | return Err("Pinecone cloud requires an API key (-k/--secret).".into()); 50 | } 51 | req = req.header("Api-Key", &args.secret); 52 | 53 | let resp = req.send()?; 54 | match resp.status().as_u16() { 55 | 201 | 200 => { 56 | let j: Value = resp.json()?; 57 | let host = j 58 | .get("host") 59 | .and_then(|h| h.as_str()) 60 | .ok_or_else(|| DbError::from("Missing host in create index response"))?; 61 | info!("Index '{}' available at host: {}", index_name, host); 62 | parsed_host_from_create = Some(format!("https://{}", host)); 63 | } 64 | 409 => { 65 | warn!("Index '{}' already exists, attempting to describe it to get host.", index_name); 66 | let describe_url = format!("{}/{}", url, index_name); 67 | let describe_req = client 68 | .get(&describe_url) 69 | .header("Accept", "application/json") 70 | .header("X-Pinecone-API-Version", &api_version) 71 | .header("Api-Key", &args.secret); 72 | 73 | let describe_resp = describe_req.send()?; 74 | if describe_resp.status().is_success() { 75 | let j: Value = describe_resp.json()?; 76 | let host = j 77 | .get("host") 78 | .and_then(|h| h.as_str()) 79 | .ok_or_else(|| 80 | DbError::from("Missing host in describe index response") 81 | )?; 82 | info!("Existing index '{}' found at host: {}", index_name, host); 83 | parsed_host_from_create = Some(format!("https://{}", host)); 84 | } else { 85 | let txt = describe_resp.text().unwrap_or_default(); 86 | return Err( 87 | format!( 88 | "Failed to describe existing index '{}': {}", 89 | index_name, 90 | txt 91 | ).into() 92 | ); 93 | } 94 | } 95 | status => { 96 | let txt = resp.text().unwrap_or_default(); 97 | return Err( 98 | format!("Failed to create/ensure index ({}): {}", status, txt).into() 99 | ); 100 | } 101 | } 102 | } else if !args.indexes.is_empty() && is_local { 103 | warn!( 104 | "Running locally. Assuming database '{}' exists. Skipping creation/check.", 105 | args.indexes 106 | ); 107 | } 108 | 109 | let data_plane_url = if is_local { 110 | args.vector_host.clone() 111 | } else { 112 | if args.vector_host.contains(".svc.") && args.vector_host.contains(".pinecone.io") { 113 | info!("Using provided --host as data plane URL: {}", args.vector_host); 114 | if args.vector_host.starts_with("https://") { 115 | args.vector_host.clone() 116 | } else { 117 | format!("https://{}", args.vector_host) 118 | } 119 | } else if let Some(host) = parsed_host_from_create { 120 | info!("Using host from create/describe API response as data plane URL: {}", host); 121 | host 122 | } else { 123 | return Err( 124 | "Could not determine Pinecone data plane URL. 
Provide it via --host or ensure --indexes is set correctly.".into() 125 | ); 126 | } 127 | }; 128 | if !is_local && args.secret.is_empty() { 129 | return Err("Pinecone cloud requires an API key (-k/--secret).".into()); 130 | } 131 | 132 | let pd = PineconeDatabase { 133 | control_plane_url, 134 | data_plane_url, 135 | client, 136 | api_version, 137 | api_key: if args.secret.is_empty() { 138 | None 139 | } else { 140 | Some(args.secret.clone()) 141 | }, 142 | use_auth: !is_local, 143 | dimension: args.dimension, 144 | }; 145 | 146 | info!("Pinecone mode: {}", if is_local { "LOCAL" } else { "CLOUD" }); 147 | info!("Control plane URL: {}", pd.control_plane_url); 148 | info!("Data plane URL: {}", pd.data_plane_url); 149 | 150 | Ok(pd) 151 | } 152 | } 153 | impl Database for PineconeDatabase { 154 | 155 | fn store_vector( 156 | &self, 157 | table: &str, 158 | items: &[(String, Vec, Value)] 159 | ) -> Result<(), DbError> { 160 | if items.is_empty() { 161 | return Ok(()); 162 | } 163 | 164 | let normalized_namespace = table.to_lowercase(); 165 | if normalized_namespace != table { 166 | info!("Normalizing Pinecone namespace '{}' to '{}'", table, normalized_namespace); 167 | } 168 | 169 | let url = format!("{}/vectors/upsert", self.data_plane_url); 170 | let vectors: Vec = items 171 | .iter() 172 | .map(|(id, vector, data)| { 173 | let values = if vector.is_empty() { 174 | warn!("ID='{}': Empty vector received, inserting dummy values", id); 175 | vec![0.1; self.dimension] 176 | } else if vector.len() != self.dimension { 177 | warn!( 178 | "ID='{}': Vector length {} != expected dimension {}, fixing", 179 | id, 180 | vector.len(), 181 | self.dimension 182 | ); 183 | let mut fixed_vec = vec![0.0; self.dimension]; 184 | for (i, val) in vector.iter().enumerate().take(self.dimension) { 185 | fixed_vec[i] = *val; 186 | } 187 | fixed_vec 188 | } else { 189 | vector.clone() 190 | }; 191 | 192 | let mut record = 193 | json!({ 194 | "id": id, 195 | "values": values 196 | }); 197 | 198 | let mut processed_metadata = serde_json::Map::new(); 199 | processed_metadata.insert("table".to_string(), Value::String(table.to_string())); 200 | 201 | if let Some(map) = data.as_object() { 202 | for (k, v) in map { 203 | if v.is_null() { 204 | continue; 205 | } 206 | if v.is_object() || v.is_array() { 207 | processed_metadata.insert( 208 | k.clone(), 209 | Value::String(serde_json::to_string(v).unwrap_or_default()) 210 | ); 211 | } else { 212 | processed_metadata.insert(k.clone(), v.clone()); 213 | } 214 | } 215 | } 216 | record["metadata"] = Value::Object(processed_metadata); 217 | record 218 | }) 219 | .collect(); 220 | 221 | let payload = json!({ 222 | "vectors": vectors, 223 | "namespace": normalized_namespace 224 | }); 225 | 226 | let mut req = self.client 227 | .post(&url) 228 | .header("Content-Type", "application/json") 229 | .header("Accept", "application/json") 230 | .header("X-Pinecone-API-Version", &self.api_version); 231 | 232 | if self.use_auth { 233 | if let Some(key) = self.api_key.as_ref() { 234 | req = req.header("Api-Key", key); 235 | } else { 236 | error!("Pinecone auth enabled but no API key available."); 237 | return Err("Pinecone auth enabled but no API key available.".into()); 238 | } 239 | } 240 | 241 | let resp = req.json(&payload).send()?; 242 | if resp.status().is_success() { 243 | let j: Value = resp.json()?; 244 | let count = j 245 | .get("upsertedCount") 246 | .and_then(|c| c.as_u64()) 247 | .unwrap_or(0); 248 | info!("Pinecone: upserted {} vectors into namespace `{}` (original: '{}')", 249 | 
count, normalized_namespace, table); 250 | Ok(()) 251 | } else { 252 | let status = resp.status(); 253 | let txt = resp.text()?; 254 | error!("Pinecone bulk upsert failed for namespace '{}' ({}): {}", 255 | normalized_namespace, status, txt); 256 | Err(format!("Pinecone bulk upsert error for namespace '{}': {}", table, txt).into()) 257 | } 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /src/db/qdrant.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn }; 2 | use reqwest::blocking::Client; 3 | use serde_json::{ json, Value }; 4 | use super::{ Database, DbError }; 5 | 6 | pub struct QdrantDatabase { 7 | client: Client, 8 | url: String, 9 | api_key: Option, 10 | dimension: usize, 11 | metric: String, 12 | } 13 | 14 | impl QdrantDatabase { 15 | pub fn new(args: &crate::cli::Args) -> Result { 16 | let qdrant_url = args.vector_host.clone(); 17 | let api_key = if args.use_auth && !args.secret.is_empty() { 18 | Some(args.secret.clone()) 19 | } else { 20 | None 21 | }; 22 | let client = Client::new(); 23 | 24 | Ok(QdrantDatabase { 25 | client, 26 | url: qdrant_url, 27 | api_key, 28 | dimension: args.dimension, 29 | metric: args.metric.clone(), 30 | }) 31 | } 32 | } 33 | 34 | impl Database for QdrantDatabase { 35 | 36 | fn store_vector( 37 | &self, 38 | table: &str, 39 | items: &[(String, Vec, Value)] 40 | ) -> Result<(), DbError> { 41 | if items.is_empty() { 42 | return Ok(()); 43 | } 44 | 45 | let normalized_table = table.to_lowercase(); 46 | let coll_url = format!("{}/collections/{}", self.url, normalized_table); 47 | let mut chk = self.client.get(&coll_url); 48 | if let Some(k) = &self.api_key { 49 | chk = chk.header("api-key", k); 50 | } 51 | let resp = chk.send()?; 52 | if resp.status().as_u16() == 404 { 53 | let distance = match self.metric.to_lowercase().as_str() { 54 | "cosine" => "Cosine", 55 | "euclidean" => "Euclidean", 56 | "dotproduct" | "dot" => "Dot", 57 | other => { 58 | warn!("Unknown metric '{}', falling back to Cosine", other); 59 | "Cosine" 60 | } 61 | }; 62 | 63 | info!( 64 | "Creating Qdrant collection '{}' (from table '{}') with dimension {} and distance {}", 65 | normalized_table, table, self.dimension, distance 66 | ); 67 | let body = 68 | json!({ 69 | "vectors": { 70 | "size": self.dimension, 71 | "distance": distance 72 | } 73 | }); 74 | let mut crt = self.client.put(&coll_url).json(&body); 75 | if let Some(k) = &self.api_key { 76 | crt = crt.header("api-key", k); 77 | } 78 | let cr = crt.send()?; 79 | if !cr.status().is_success() { 80 | let err = cr.text()?; 81 | warn!("Failed to create collection '{}': {}. 
Attempting to insert anyway.", normalized_table, err); 82 | } 83 | } 84 | 85 | let points: Vec = items 86 | .iter() 87 | .map(|(id, vec, payload)| { 88 | let v = if vec.len() == self.dimension { 89 | vec.clone() 90 | } else { 91 | warn!( 92 | "ID={}: vector length {} ≠ {}, filling zeros", 93 | id, 94 | vec.len(), 95 | self.dimension 96 | ); 97 | vec![0.0; self.dimension] 98 | }; 99 | json!({ "id": id, "vector": v, "payload": payload }) 100 | }) 101 | .collect(); 102 | 103 | let up_url = format!("{}/collections/{}/points?wait=true", self.url, normalized_table); 104 | let mut up = self.client.put(&up_url).json(&json!({ "points": points })); 105 | if let Some(k) = &self.api_key { 106 | up = up.header("api-key", k); 107 | } 108 | let up_res = up.send()?; 109 | if up_res.status().is_success() { 110 | info!("Qdrant: upserted {} points into `{}`", items.len(), normalized_table); 111 | Ok(()) 112 | } else { 113 | let txt = up_res.text()?; 114 | Err(format!("Qdrant upsert failed: {}", txt).into()) 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/db/redis.rs: -------------------------------------------------------------------------------- 1 | use redis::Client; 2 | use serde_json::Value; 3 | use log::{ info, warn, debug }; 4 | use std::io::{ Error as IoError, ErrorKind }; 5 | use super::{ Database, DbError }; 6 | 7 | pub struct RedisDatabase { 8 | client: Client, 9 | password: Option, 10 | dimension: usize, 11 | metric: String, 12 | group_redis: bool, 13 | } 14 | 15 | impl RedisDatabase { 16 | pub fn new(args: &crate::cli::Args) -> Result { 17 | info!("Connecting to Redis at {}", args.vector_host); 18 | let client = Client::open(args.vector_host.as_str()).map_err( 19 | |e| 20 | Box::new( 21 | IoError::new(ErrorKind::Other, format!("Failed to open Redis client: {}", e)) 22 | ) as DbError 23 | )?; 24 | let password = if args.use_auth && !args.pass.is_empty() { 25 | Some(args.pass.clone()) 26 | } else { 27 | None 28 | }; 29 | 30 | let mut conn = client 31 | .get_connection() 32 | .map_err( 33 | |e| 34 | Box::new( 35 | IoError::new( 36 | ErrorKind::Other, 37 | format!("Failed to get Redis connection: {}", e) 38 | ) 39 | ) as DbError 40 | )?; 41 | if let Some(ref pass) = password { 42 | redis 43 | ::cmd("AUTH") 44 | .arg(pass) 45 | .query::<()>(&mut conn) 46 | .map_err( 47 | |e| 48 | Box::new( 49 | IoError::new(ErrorKind::Other, format!("Redis AUTH failed: {}", e)) 50 | ) as DbError 51 | )?; 52 | info!("Redis AUTH successful"); 53 | } 54 | 55 | let pong: String = redis 56 | ::cmd("PING") 57 | .query(&mut conn) 58 | .map_err( 59 | |e| 60 | Box::new( 61 | IoError::new(ErrorKind::Other, format!("Redis PING failed: {}", e)) 62 | ) as DbError 63 | )?; 64 | if pong != "PONG" { 65 | warn!("Redis PING received unexpected response: {}", pong); 66 | } else { 67 | info!("Redis PING successful"); 68 | } 69 | 70 | Ok(RedisDatabase { 71 | client, 72 | password, 73 | dimension: args.dimension, 74 | metric: args.metric.clone(), 75 | group_redis: args.group_redis, 76 | }) 77 | } 78 | 79 | fn get_connection(&self) -> Result { 80 | let mut con = self.client 81 | .get_connection() 82 | .map_err( 83 | |e| 84 | Box::new( 85 | IoError::new( 86 | ErrorKind::Other, 87 | format!("Failed to get Redis connection: {}", e) 88 | ) 89 | ) as DbError 90 | )?; 91 | if let Some(ref pass) = self.password { 92 | redis 93 | ::cmd("AUTH") 94 | .arg(pass) 95 | .query::<()>(&mut con) 96 | .map_err( 97 | |e| 98 | Box::new( 99 | IoError::new(ErrorKind::Other, format!("Redis AUTH 
failed: {}", e)) 100 | ) as DbError 101 | )?; 102 | } 103 | Ok(con) 104 | } 105 | 106 | fn map_metric_to_redis(&self) -> &str { 107 | match self.metric.to_lowercase().as_str() { 108 | "cosine" => "COSINE", 109 | "l2" | "euclidean" => "L2", 110 | "ip" | "dotproduct" | "innerproduct" => "IP", 111 | _ => { 112 | warn!("Unsupported metric '{}', defaulting to COSINE", self.metric); 113 | "COSINE" 114 | } 115 | } 116 | } 117 | 118 | fn ensure_index_exists( 119 | &self, 120 | con: &mut redis::Connection, 121 | table: &str, 122 | sample_data: Option<&Value> 123 | ) -> Result<(), DbError> { 124 | let index_name = format!("idx:{}", table); 125 | 126 | match redis::cmd("FT.INFO").arg(&index_name).query::>(con) { 127 | Ok(_) => { 128 | return Ok(()); 129 | } 130 | Err(_) => { 131 | info!("Index '{}' not found, creating it now", index_name); 132 | } 133 | } 134 | 135 | let mut ft = redis::cmd("FT.CREATE"); 136 | ft.arg(&index_name) 137 | .arg("ON") 138 | .arg("JSON") 139 | .arg("PREFIX") 140 | .arg("1") 141 | .arg(format!("item:{}:", table)) 142 | .arg("SCHEMA") 143 | .arg("$.vector") 144 | .arg("AS") 145 | .arg("vector") 146 | .arg("VECTOR") 147 | .arg("FLAT") 148 | .arg("6") 149 | .arg("TYPE") 150 | .arg("FLOAT32") 151 | .arg("DIM") 152 | .arg(self.dimension.to_string()) 153 | .arg("DISTANCE_METRIC") 154 | .arg(self.map_metric_to_redis()); 155 | 156 | if let Some(Value::Object(data_map)) = sample_data { 157 | info!("Attempting to discover schema from first item data for index '{}'", index_name); 158 | let standard_fields = vec![ 159 | ("source_table".to_string(), "TEXT".to_string()), 160 | ("original_id".to_string(), "TEXT".to_string()) 161 | ]; 162 | 163 | for (field, idx_ty_str) in standard_fields { 164 | debug!("Adding standard field to schema: $.{} AS {} {}", field, field, idx_ty_str); 165 | ft.arg(format!("$.{}", field)).arg("AS").arg(&field).arg(&idx_ty_str); 166 | if idx_ty_str == "TEXT" { 167 | ft.arg("SORTABLE"); 168 | } 169 | } 170 | 171 | for (field, value) in data_map { 172 | if field == "vector" || field == "source_table" || field == "original_id" { 173 | continue; 174 | } 175 | 176 | let idx_ty = match value { 177 | Value::String(_) => "TEXT", 178 | Value::Number(_) => "NUMERIC", 179 | Value::Bool(_) => "NUMERIC", 180 | _ => { 181 | continue; 182 | } 183 | }; 184 | 185 | debug!("Adding discovered field to schema: $.{} AS {} {}", field, field, idx_ty); 186 | ft.arg(format!("$.{}", field)).arg("AS").arg(field).arg(idx_ty); 187 | if idx_ty == "TEXT" { 188 | ft.arg("SORTABLE"); 189 | } 190 | } 191 | } else { 192 | warn!("No sample data provided or sample data is not a JSON object for index '{}'. 
Only indexing vector field.", index_name); 193 | ft.arg("$.source_table").arg("AS").arg("source_table").arg("TEXT").arg("SORTABLE"); 194 | ft.arg("$.original_id").arg("AS").arg("original_id").arg("TEXT").arg("SORTABLE"); 195 | } 196 | 197 | match ft.query::<()>(con) { 198 | Ok(_) => { 199 | info!("Created Redis index '{}'", index_name); 200 | Ok(()) 201 | } 202 | Err(e) => { 203 | let msg = e.to_string(); 204 | if msg.contains("Index already exists") { 205 | info!("Index '{}' already exists (concurrent creation?), skipping", index_name); 206 | Ok(()) 207 | } else { 208 | Err( 209 | Box::new( 210 | IoError::new( 211 | ErrorKind::Other, 212 | format!("FT.CREATE failed for index '{}': {}", index_name, msg) 213 | ) 214 | ) as DbError 215 | ) 216 | } 217 | } 218 | } 219 | } 220 | } 221 | 222 | impl Database for RedisDatabase { 223 | 224 | fn store_vector( 225 | &self, 226 | table: &str, 227 | items: &[(String, Vec, Value)] 228 | ) -> Result<(), DbError> { 229 | if items.is_empty() { 230 | return Ok(()); 231 | } 232 | 233 | let normalized_table = table.to_lowercase(); 234 | if normalized_table != table { 235 | info!("Normalizing Redis table/index name '{}' to '{}'", table, normalized_table); 236 | } 237 | 238 | let mut con = self.get_connection()?; 239 | 240 | if self.group_redis { 241 | let key = format!("table:{}", normalized_table); 242 | 243 | let docs: Vec = items 244 | .iter() 245 | .map(|(id, vec, data)| { 246 | let mut obj = serde_json::Map::new(); 247 | obj.insert("id".to_string(), Value::String(id.clone())); 248 | obj.insert("vector".to_string(), serde_json::to_value(vec).unwrap()); 249 | if let Value::Object(map) = data { 250 | for (k, v) in map { 251 | if k != "vector" { 252 | obj.insert(k.clone(), v.clone()); 253 | } 254 | } 255 | } 256 | Value::Object(obj) 257 | }) 258 | .collect(); 259 | 260 | let payload = Value::Array(docs); 261 | 262 | redis 263 | ::cmd("JSON.SET") 264 | .arg(&key) 265 | .arg("$") 266 | .arg(serde_json::to_string(&payload)?) 267 | .query::<()>(&mut con) 268 | .map_err(|e| { 269 | Box::new( 270 | IoError::new( 271 | ErrorKind::Other, 272 | format!("Redis JSON.SET failed for '{}': {}", key, e) 273 | ) 274 | ) as DbError 275 | })?; 276 | 277 | info!("Stored {} items grouped for table '{}' (original: '{}')", 278 | items.len(), normalized_table, table); 279 | return Ok(()); 280 | } 281 | 282 | let first_item_data = items.first().map(|(_, _, data)| data); 283 | self.ensure_index_exists(&mut con, &normalized_table, first_item_data)?; 284 | 285 | let mut pipe = redis::pipe(); 286 | pipe.atomic(); 287 | 288 | for (id, vec, data) in items { 289 | let key = format!("item:{}:{}", normalized_table, id); 290 | let mut record_obj = serde_json::Map::new(); 291 | record_obj.insert("vector".to_string(), serde_json::to_value(vec)?); 292 | record_obj.insert("source_table".to_string(), Value::String(table.to_string())); 293 | record_obj.insert("original_id".to_string(), Value::String(id.clone())); 294 | 295 | if let Value::Object(obj) = data { 296 | for (k, v) in obj { 297 | if k != "vector" && k != "source_table" { 298 | record_obj.insert(k.clone(), v.clone()); 299 | } 300 | } 301 | } 302 | 303 | debug!( 304 | "Redis JSON document for {}: {}", 305 | key, 306 | serde_json::to_string(&Value::Object(record_obj.clone()))? 
307 | ); 308 | 309 | pipe.cmd("JSON.SET") 310 | .arg(&key) 311 | .arg("$") 312 | .arg(serde_json::to_string(&Value::Object(record_obj))?); 313 | } 314 | 315 | pipe 316 | .query::<()>(&mut con) 317 | .map_err(|e| { 318 | Box::new( 319 | IoError::new( 320 | ErrorKind::Other, 321 | format!("Redis pipeline failed for table '{}': {}", table, e) 322 | ) 323 | ) as DbError 324 | })?; 325 | 326 | info!("Stored {} items for table '{}' (original: '{}') in Redis", 327 | items.len(), normalized_table, table); 328 | Ok(()) 329 | } 330 | } 331 | -------------------------------------------------------------------------------- /src/db/surreal.rs: -------------------------------------------------------------------------------- 1 | use base64::{ engine::general_purpose::STANDARD, Engine as _ }; 2 | use log::{ info, error, warn }; 3 | use reqwest::blocking::Client; 4 | use serde_json::Value; 5 | use super::{ Database, DbError }; 6 | 7 | pub struct SurrealDatabase { 8 | url: String, 9 | ns: String, 10 | db: String, 11 | auth_header: Option, 12 | client: Client, 13 | } 14 | 15 | impl SurrealDatabase { 16 | pub fn new(args: &crate::cli::Args) -> Result { 17 | let base_url = args.vector_host.clone(); 18 | let sql_url = format!("{}/sql", base_url.trim_end_matches('/')); 19 | let ns = args.namespace.clone(); 20 | let db = args.database.clone(); 21 | let client = Client::new(); 22 | let auth_header = if args.use_auth { 23 | Some(format!("Basic {}", STANDARD.encode(format!("{}:{}", args.user, args.pass)))) 24 | } else { 25 | None 26 | }; 27 | 28 | let define_ns_sql = format!("DEFINE NAMESPACE IF NOT EXISTS {};", ns); 29 | info!("Sending DEFINE NAMESPACE: {}", define_ns_sql); 30 | let mut req_ns = client 31 | .post(&sql_url) 32 | .header("Content-Type", "text/plain") 33 | .header("Accept", "application/json") 34 | .body(define_ns_sql); 35 | 36 | if let Some(ref auth) = auth_header { 37 | req_ns = req_ns.header("Authorization", auth); 38 | } 39 | 40 | let resp_ns = req_ns.send().map_err(|e| Box::new(e) as DbError)?; 41 | let status_ns = resp_ns.status(); 42 | let text_ns = resp_ns.text().map_err(|e| Box::new(e) as DbError)?; 43 | 44 | info!("SurrealDB DEFINE NAMESPACE response: {}", text_ns); 45 | if !status_ns.is_success() && !text_ns.contains("already exists") { 46 | error!("Failed to execute DEFINE NAMESPACE (Status: {}): {}", status_ns, text_ns); 47 | } 48 | 49 | let define_db_sql = format!("DEFINE DATABASE IF NOT EXISTS {};", db); 50 | info!("Sending DEFINE DATABASE: {}", define_db_sql); 51 | let mut req_db = client 52 | .post(&sql_url) 53 | .header("Content-Type", "text/plain") 54 | .header("Accept", "application/json") 55 | .header("Surreal-NS", &ns) 56 | .body(define_db_sql); 57 | 58 | if let Some(ref auth) = auth_header { 59 | req_db = req_db.header("Authorization", auth); 60 | } 61 | 62 | let resp_db = req_db.send().map_err(|e| Box::new(e) as DbError)?; 63 | let status_db = resp_db.status(); 64 | let text_db = resp_db.text().map_err(|e| Box::new(e) as DbError)?; 65 | 66 | info!("SurrealDB DEFINE DATABASE response: {}", text_db); 67 | if !status_db.is_success() && !text_db.contains("already exists") { 68 | error!("Failed to execute DEFINE DATABASE (Status: {}): {}", status_db, text_db); 69 | } 70 | 71 | Ok(SurrealDatabase { url: base_url, ns, db, auth_header, client }) 72 | } 73 | 74 | fn ensure_table_exists(&self, table: &str) -> Result<(), DbError> { 75 | let sql_url = format!("{}/sql", self.url.trim_end_matches('/')); 76 | let define_table_sql = 77 | format!("DEFINE TABLE IF NOT EXISTS `{}` TYPE ANY 
SCHEMALESS PERMISSIONS NONE;", table); 78 | 79 | info!("Ensuring table exists: {}", define_table_sql); 80 | 81 | let mut req = self.client 82 | .post(&sql_url) 83 | .header("Content-Type", "text/plain") 84 | .header("Accept", "application/json") 85 | .header("Surreal-NS", &self.ns) 86 | .header("Surreal-DB", &self.db) 87 | .body(define_table_sql); 88 | 89 | if let Some(ref auth) = self.auth_header { 90 | req = req.header("Authorization", auth); 91 | } 92 | 93 | let resp = req.send().map_err(|e| Box::new(e) as DbError)?; 94 | let status = resp.status(); 95 | let text = resp.text().map_err(|e| Box::new(e) as DbError)?; 96 | 97 | info!("SurrealDB DEFINE TABLE response ({}): {}", status, text); 98 | 99 | if !status.is_success() { 100 | warn!("Potential issue defining table '{}' (Status: {}): {}", table, status, text); 101 | } 102 | 103 | Ok(()) 104 | } 105 | 106 | } 107 | 108 | impl Database for SurrealDatabase { 109 | 110 | fn store_vector( 111 | &self, 112 | table: &str, 113 | items: &[(String, Vec, Value)] 114 | ) -> Result<(), DbError> { 115 | if items.is_empty() { 116 | return Ok(()); 117 | } 118 | 119 | let normalized_table = table.to_lowercase(); 120 | if normalized_table != table { 121 | info!("Normalizing SurrealDB table name '{}' to '{}'", table, normalized_table); 122 | } 123 | 124 | self.ensure_table_exists(&normalized_table)?; 125 | 126 | let records: Vec<(String, Value)> = items 127 | .iter() 128 | .map(|(id, vector, meta)| { 129 | let mut record = meta.clone(); 130 | if let Some(obj) = record.as_object_mut() { 131 | obj.insert( 132 | "vector".to_string(), 133 | serde_json::to_value(vector).unwrap_or_default() 134 | ); 135 | obj.insert("original_table".to_string(), Value::String(table.to_string())); 136 | } 137 | (id.clone(), record) 138 | }) 139 | .collect(); 140 | 141 | let import_url = format!("{}/import", self.url.trim_end_matches('/')); 142 | let mut import_data = String::new(); 143 | 144 | for (id, data) in &records { 145 | let record_id = format!("{}:`{}`", normalized_table, id); 146 | let content_json = serde_json::to_string(&data)?; 147 | import_data.push_str(&format!("CREATE {} CONTENT {};\n", record_id, content_json)); 148 | } 149 | 150 | info!("SurrealDB Import URL: {}", import_url); 151 | let preview_len = import_data.chars().count().min(300); 152 | info!( 153 | "SurrealDB Import Payload Preview: {}...", 154 | import_data.chars().take(preview_len).collect::() 155 | ); 156 | 157 | let mut req = self.client 158 | .post(&import_url) 159 | .header("Surreal-NS", &self.ns) 160 | .header("Surreal-DB", &self.db) 161 | .header("Content-Type", "text/plain") 162 | .header("Accept", "application/json") 163 | .body(import_data); 164 | 165 | if let Some(ref auth) = self.auth_header { 166 | req = req.header("Authorization", auth); 167 | } 168 | 169 | let resp = req.send()?; 170 | let status = resp.status(); 171 | let text = resp.text().unwrap_or_else(|e| format!("Failed to read response body: {}", e)); 172 | info!("SurrealDB Import Response Status: {}", status); 173 | info!("SurrealDB Import Response Body: {}", text); 174 | 175 | if !status.is_success() { 176 | return Err(format!("SurrealDB batch import failed ({}): {}", status, text).into()); 177 | } 178 | 179 | info!("SurrealDB: successfully imported {} records to {} (original: {})", 180 | records.len(), normalized_table, table); 181 | Ok(()) 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/embedding/embeding.rs: 
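The module below wires provider selection (`initialize_embedding_generator`) to batched embedding generation (`process_records_with_embeddings`). The following is a minimal sketch of driving those two entry points directly, assuming a parsed `Args` with the Ollama defaults; the actual orchestration (parsing the dump, grouping by table, writing to the vector database) lives in `src/workflow.rs`, so this is only an illustration, not the shipped flow:

```rust
use std::sync::{ atomic::AtomicUsize, Arc };

use clap::Parser;
use serde_json::json;

use db2vec::cli::Args;
use db2vec::embedding::embeding::{
    initialize_embedding_generator,
    process_records_with_embeddings,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical invocation: flags and env vars resolve via the clap definition in src/cli/mod.rs.
    let args = Args::parse();

    // Select the provider (ollama | tei | google) from --embedding-provider.
    let generator = initialize_embedding_generator(&args, None)?;

    // Records shaped like the parser output: each carries a "table" key plus its fields.
    let records = vec![
        json!({ "table": "products", "name": "Laptop", "price": 1299.99 }),
        json!({ "table": "products", "name": "Desk", "price": 250 }),
    ];

    // Shared counter used for progress reporting.
    let counter = Arc::new(AtomicUsize::new(0));

    // Returns (table, id, vector, metadata) tuples ready to be batched into the target DB.
    let prepared = process_records_with_embeddings(records, &args, counter, generator)?;
    println!("embedded {} records", prepared.len());
    Ok(())
}
```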
-------------------------------------------------------------------------------- 1 | use log::{ error, info, warn }; 2 | use serde_json::Value; 3 | use std::error::Error as StdError; 4 | use std::sync::atomic::{ AtomicUsize, Ordering }; 5 | use std::sync::Arc; 6 | use tokio::runtime::Runtime; 7 | use uuid::Uuid; 8 | use crate::cli::Args; 9 | use crate::embedding::{ 10 | models::google::GoogleEmbeddingClient, 11 | models::ollama::OllamaEmbeddingClient, 12 | models::tei::TeiEmbeddingClient, 13 | AsyncEmbeddingGenerator, 14 | }; 15 | 16 | pub fn initialize_embedding_generator( 17 | args: &Args, 18 | override_url: Option<&str>, 19 | ) -> Result, Box> { 20 | let provider = args.embedding_provider.to_lowercase(); 21 | info!("Selected embedding provider: {}", provider); 22 | 23 | let url = override_url 24 | .or_else(|| args.embedding_url.as_deref()) 25 | .map(|s| s.to_string()); 26 | 27 | match provider.as_str() { 28 | "tei" => { 29 | let port = args.tei_local_port; 30 | let url_to_use = match override_url { 31 | Some(url) => url.to_string(), 32 | None => args.embedding_url.as_deref() 33 | .unwrap_or(&format!("http://localhost:{}", port)) 34 | .to_string() 35 | }; 36 | 37 | info!("🟢 TEI client connecting to {}", url_to_use); 38 | 39 | let client = TeiEmbeddingClient::new( 40 | url_to_use, 41 | args.dimension, 42 | args.embedding_timeout 43 | )?; 44 | Ok(Box::new(client)) 45 | } 46 | 47 | "ollama" => { 48 | let ollama_url = url.unwrap_or_else(|| "http://localhost:11434".into()); 49 | info!("🟢 Ollama client -> {}", ollama_url); 50 | let client = OllamaEmbeddingClient::new( 51 | &ollama_url, 52 | &args.embedding_model, 53 | args.dimension, 54 | )?; 55 | Ok(Box::new(client)) 56 | } 57 | 58 | "google" => { 59 | let api_key = args.embedding_api_key 60 | .clone() 61 | .ok_or_else(|| "Missing EMBEDDING_API_KEY for Google".to_string())?; 62 | info!("🟢 Google client"); 63 | let client = GoogleEmbeddingClient::new( 64 | api_key, 65 | Some(args.embedding_model.clone()), 66 | args.dimension, 67 | )?; 68 | Ok(Box::new(client)) 69 | } 70 | 71 | other => Err(format!("Unsupported embedding provider: {}", other).into()), 72 | } 73 | } 74 | 75 | pub fn process_records_with_embeddings( 76 | records: Vec, 77 | args: &Args, 78 | embedding_counter: Arc, 79 | generator: Box 80 | ) -> Result, Value)>, Box> { 81 | let chunk_size = args.embedding_batch_size; 82 | let total_records = records.len(); 83 | let mut prepared_records = Vec::with_capacity(total_records); 84 | let rt = Runtime::new()?; 85 | let approx_char_limit_from_tokens = args.embedding_max_tokens * 3; 86 | 87 | for (chunk_idx, chunk) in records.chunks(chunk_size).enumerate() { 88 | info!( 89 | "Processing embedding chunk {}/{}", 90 | chunk_idx + 1, 91 | (total_records + chunk_size - 1) / chunk_size 92 | ); 93 | 94 | let texts: Vec = chunk 95 | .iter() 96 | .map(|record| { 97 | let mut full_text = record 98 | .as_object() 99 | .map(|obj| { 100 | obj.iter() 101 | .filter(|(k, _)| *k != "table" && *k != "id") 102 | .map(|(k, v)| format!("{}: {}", k, v)) 103 | .collect::>() 104 | .join(", ") 105 | }) 106 | .unwrap_or_else(|| record.to_string()); 107 | 108 | if full_text.chars().count() > approx_char_limit_from_tokens { 109 | warn!( 110 | "Client-side truncation: Input text for a record ({} chars) exceeds approximate limit derived from embedding_max_tokens ({} tokens -> ~{} chars). Truncating. 
Provider might also truncate based on its own limits.", 111 | full_text.chars().count(), 112 | args.embedding_max_tokens, 113 | approx_char_limit_from_tokens 114 | ); 115 | full_text = full_text.chars().take(approx_char_limit_from_tokens).collect::(); 116 | } 117 | full_text 118 | }) 119 | .collect(); 120 | 121 | let embeddings_result = rt.block_on(generator.generate_embeddings_batch(&texts)); 122 | 123 | match embeddings_result { 124 | Ok(embeddings) => { 125 | if embeddings.len() != chunk.len() { 126 | error!( 127 | "CRITICAL: Embedding generator returned {} results for {} inputs in chunk {}", 128 | embeddings.len(), 129 | chunk.len(), 130 | chunk_idx + 1 131 | ); 132 | return Err( 133 | format!( 134 | "Embedding generator returned incomplete results: got {}/{}", 135 | embeddings.len(), 136 | chunk.len() 137 | ).into() 138 | ); 139 | } 140 | 141 | let chunk_results: Vec<_> = chunk 142 | .iter() 143 | .zip(embeddings.into_iter()) 144 | .map(|(record, vec)| { 145 | let id = Uuid::new_v4().to_string(); 146 | let mut meta = record.clone(); 147 | let table = meta 148 | .get("table") 149 | .and_then(|t| t.as_str()) 150 | .unwrap_or("unknown_table") 151 | .to_string(); 152 | if let Some(_obj) = meta.as_object_mut() { 153 | } 154 | (table, id, vec, meta) 155 | }) 156 | .collect(); 157 | 158 | prepared_records.extend(chunk_results); 159 | embedding_counter.fetch_add(chunk.len(), Ordering::Relaxed); 160 | } 161 | Err(e) => { 162 | error!("CRITICAL: Embedding generation failed for chunk {}: {}", chunk_idx + 1, e); 163 | return Err(format!("Embedding generation failed: {}", e).into()); 164 | } 165 | } 166 | } 167 | 168 | Ok(prepared_records) 169 | } -------------------------------------------------------------------------------- /src/embedding/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod embeding; 2 | pub mod models; 3 | 4 | use async_trait::async_trait; 5 | use std::error::Error as StdError; 6 | 7 | #[async_trait] 8 | pub trait AsyncEmbeddingGenerator: Send + Sync { 9 | async fn generate_embeddings_batch( 10 | &self, 11 | texts: &[String] 12 | ) -> Result>, Box>; 13 | 14 | fn get_dimension(&self) -> usize; 15 | } 16 | 17 | pub trait EmbeddingModel { 18 | fn generate_embedding(&self, text: &str) -> Result, Box>; 19 | } 20 | 21 | pub struct EmbeddingService { 22 | model: T, 23 | } 24 | 25 | impl EmbeddingService { 26 | pub fn new(model: T) -> Self { 27 | Self { model } 28 | } 29 | 30 | pub fn generate(&self, text: &str) -> Result, Box> { 31 | self.model.generate_embedding(text) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/embedding/models/google.rs: -------------------------------------------------------------------------------- 1 | use crate::embedding::AsyncEmbeddingGenerator; 2 | use async_trait::async_trait; 3 | use log::{ info, error, warn, debug }; 4 | use reqwest::Client; 5 | use serde_json::{ json, Value }; 6 | use std::error::Error as StdError; 7 | use std::time::Duration; 8 | use tokio::time::sleep; 9 | 10 | pub struct GoogleEmbeddingClient { 11 | client: Client, 12 | api_key: String, 13 | model_name: String, 14 | dimension: usize, 15 | task_type: String, 16 | request_delay_ms: u64, 17 | } 18 | 19 | impl GoogleEmbeddingClient { 20 | pub fn new( 21 | api_key: String, 22 | model: Option, 23 | dimension: usize 24 | ) -> Result> { 25 | let embed_model = model.unwrap_or_else(|| "text-embedding-004".to_string()); 26 | let clean_model = 
embed_model.trim_start_matches("models/").to_string(); 27 | let default_delay_ms = 1100; 28 | 29 | info!( 30 | "Initializing Google Embedding Client with model: {}, dimension: {}, request delay: {}ms", 31 | clean_model, 32 | dimension, 33 | default_delay_ms 34 | ); 35 | 36 | Ok(Self { 37 | client: Client::new(), 38 | api_key, 39 | model_name: clean_model, 40 | dimension, 41 | task_type: "SEMANTIC_SIMILARITY".to_string(), 42 | request_delay_ms: default_delay_ms, 43 | }) 44 | } 45 | 46 | pub fn with_task_type(mut self, task_type: &str) -> Self { 47 | self.task_type = task_type.to_string(); 48 | self 49 | } 50 | 51 | pub fn with_request_delay(mut self, delay_ms: u64) -> Self { 52 | self.request_delay_ms = delay_ms; 53 | info!("Set Google request delay to {}ms", delay_ms); 54 | self 55 | } 56 | } 57 | 58 | #[async_trait] 59 | impl AsyncEmbeddingGenerator for GoogleEmbeddingClient { 60 | async fn generate_embeddings_batch( 61 | &self, 62 | texts: &[String] 63 | ) -> Result>, Box> { 64 | if texts.is_empty() { 65 | return Ok(vec![]); 66 | } 67 | 68 | info!( 69 | "Google: Generating embeddings for {} texts using model {} with task type {} (delay: {}ms)", 70 | texts.len(), 71 | self.model_name, 72 | self.task_type, 73 | self.request_delay_ms 74 | ); 75 | 76 | let mut results = Vec::with_capacity(texts.len()); 77 | 78 | for (i, text) in texts.iter().enumerate() { 79 | if i > 0 { 80 | sleep(Duration::from_millis(self.request_delay_ms)).await; 81 | } 82 | 83 | let url = format!( 84 | "https://generativelanguage.googleapis.com/v1beta/models/{}:embedContent?key={}", 85 | self.model_name, 86 | self.api_key 87 | ); 88 | 89 | let request_body = 90 | json!({ 91 | "model": format!("models/{}", self.model_name), 92 | "content": { 93 | "parts": [ 94 | { 95 | "text": text 96 | } 97 | ] 98 | }, 99 | "taskType": self.task_type 100 | }); 101 | 102 | debug!("Request URL: {}", url); 103 | debug!("Request body: {}", request_body.to_string()); 104 | 105 | let response = self.client 106 | .post(&url) 107 | .header("Content-Type", "application/json") 108 | .json(&request_body) 109 | .send().await; 110 | 111 | match response { 112 | Ok(res) => { 113 | let status = res.status(); 114 | 115 | if status.is_success() { 116 | match res.json::().await { 117 | Ok(json_response) => { 118 | debug!("Success Response: {:?}", json_response); 119 | 120 | if 121 | let Some(values) = json_response 122 | .get("embedding") 123 | .and_then(|e| e.get("values")) 124 | .and_then(|v| v.as_array()) 125 | { 126 | let embedding: Vec = values 127 | .iter() 128 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 129 | .collect(); 130 | 131 | results.push(embedding); 132 | } else { 133 | error!( 134 | "CRITICAL: Invalid response format: {:?}", 135 | json_response 136 | ); 137 | return Err("Invalid embedding response format".into()); 138 | } 139 | } 140 | Err(e) => { 141 | error!("CRITICAL: Failed to parse response JSON: {}", e); 142 | return Err(format!("JSON parsing error: {}", e).into()); 143 | } 144 | } 145 | } else { 146 | if status == reqwest::StatusCode::TOO_MANY_REQUESTS { 147 | warn!( 148 | "Rate limit hit (429). Consider increasing delay or checking quota." 
149 | ); 150 | } 151 | 152 | let error_text = match res.text().await { 153 | Ok(text) => text, 154 | Err(_) => "Failed to read error response".to_string(), 155 | }; 156 | error!("CRITICAL: Google API error ({}): {}", status, error_text); 157 | 158 | if let Ok(error_json) = serde_json::from_str::(&error_text) { 159 | error!("Error details: {:?}", error_json); 160 | if 161 | let Some(message) = error_json 162 | .get("error") 163 | .and_then(|e| e.get("message")) 164 | .and_then(|m| m.as_str()) 165 | { 166 | error!("Error message: {}", message); 167 | return Err(format!("Google API error: {}", message).into()); 168 | } 169 | } 170 | 171 | return Err(format!("Google API error ({}): {}", status, error_text).into()); 172 | } 173 | } 174 | Err(e) => { 175 | error!("CRITICAL: Request failed: {}", e); 176 | return Err(format!("Network error: {}", e).into()); 177 | } 178 | } 179 | } 180 | 181 | info!("Google: Successfully generated {} embeddings", results.len()); 182 | Ok(results) 183 | } 184 | 185 | fn get_dimension(&self) -> usize { 186 | self.dimension 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/embedding/models/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod google; 2 | pub mod ollama; 3 | pub mod tei; 4 | -------------------------------------------------------------------------------- /src/embedding/models/ollama.rs: -------------------------------------------------------------------------------- 1 | use crate::embedding::AsyncEmbeddingGenerator; 2 | use async_trait::async_trait; 3 | use log::{ error, info, warn }; 4 | use reqwest::Client as AsyncHttpClient; 5 | use serde_json::{ json, Value }; 6 | use std::{ error::Error as StdError, time::Duration }; 7 | use futures::future::join_all; 8 | 9 | pub struct OllamaEmbeddingClient { 10 | client: AsyncHttpClient, 11 | api_url: String, 12 | model: String, 13 | dimension: usize, 14 | } 15 | 16 | impl OllamaEmbeddingClient { 17 | pub fn new( 18 | base_url: &str, 19 | model: &str, 20 | dimension: usize 21 | ) -> Result> { 22 | let api_url = if base_url.ends_with("/api/embeddings") { 23 | base_url.to_string() 24 | } else { 25 | format!("{}/api/embeddings", base_url.trim_end_matches('/')) 26 | }; 27 | 28 | let client = AsyncHttpClient::builder().timeout(Duration::from_secs(20)).build()?; 29 | 30 | Ok(Self { 31 | client, 32 | api_url, 33 | model: model.to_string(), 34 | dimension, 35 | }) 36 | } 37 | 38 | async fn generate_single_embedding( 39 | &self, 40 | text: &str 41 | ) -> Result, Box> { 42 | let response = self.client 43 | .post(&self.api_url) 44 | .header("Content-Type", "application/json") 45 | .json(&json!({ "model": &self.model, "prompt": text })) 46 | .send().await?; 47 | 48 | if !response.status().is_success() { 49 | let status = response.status(); 50 | let error_body = response 51 | .text().await 52 | .unwrap_or_else(|_| "Failed to read error body".to_string()); 53 | warn!("Ollama API (single) returned status: {}, body: {}", status, error_body); 54 | return Err(format!("Ollama API error: {}", status).into()); 55 | } 56 | 57 | let json_body = response.json::().await?; 58 | if let Some(embedding_array) = json_body["embedding"].as_array() { 59 | let embedding: Vec = embedding_array 60 | .iter() 61 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 62 | .collect(); 63 | 64 | if embedding.is_empty() && self.dimension > 0 { 65 | warn!( 66 | "Ollama returned empty embedding for input '{}', expected dimension {}. 
Returning zero vector.", 67 | text, self.dimension 68 | ); 69 | Ok(vec![0.0; self.dimension]) 70 | } else if embedding.len() != self.dimension { 71 | warn!( 72 | "Ollama returned embedding with dimension {}, expected {}", 73 | embedding.len(), 74 | self.dimension 75 | ); 76 | Err( 77 | format!( 78 | "Dimension mismatch: expected {}, got {}", 79 | self.dimension, 80 | embedding.len() 81 | ).into() 82 | ) 83 | } else { 84 | Ok(embedding) 85 | } 86 | } else { 87 | Err("Unexpected response structure from Ollama API".into()) 88 | } 89 | } 90 | } 91 | 92 | #[async_trait] 93 | impl AsyncEmbeddingGenerator for OllamaEmbeddingClient { 94 | async fn generate_embeddings_batch( 95 | &self, 96 | texts: &[String] 97 | ) -> Result>, Box> { 98 | if texts.is_empty() { 99 | return Ok(vec![]); 100 | } 101 | info!("Ollama: Generating embeddings for {} texts", texts.len()); 102 | 103 | let response = self.client 104 | .post(&self.api_url) 105 | .json(&json!({ "model": &self.model, "prompts": texts })) 106 | .send().await; 107 | 108 | match response { 109 | Ok(resp) if resp.status().is_success() => { 110 | match resp.json::().await { 111 | Ok(parsed) => { 112 | info!("Ollama batch response structure: {:?}", parsed); 113 | 114 | if let Some(embeddings) = parsed.get("embeddings").and_then(|e| e.as_array()) { 115 | let mut result = Vec::with_capacity(embeddings.len()); 116 | let mut success_count = 0; 117 | for (i, emb_val) in embeddings.iter().enumerate() { 118 | if let Some(vector) = emb_val.get("embedding").and_then(|v| v.as_array()) { 119 | let embedding: Vec = vector 120 | .iter() 121 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 122 | .collect(); 123 | 124 | if embedding.len() == self.dimension { 125 | result.push(embedding); 126 | success_count += 1; 127 | } else { 128 | warn!( 129 | "Ollama batch item {} dimension mismatch: expected {}, got {}", 130 | i, 131 | self.dimension, 132 | embedding.len() 133 | ); 134 | result.push(vec![0.0; self.dimension]); 135 | } 136 | } else { 137 | warn!("Ollama batch item {} missing 'embedding' array", i); 138 | result.push(vec![0.0; self.dimension]); 139 | } 140 | } 141 | 142 | if result.len() == texts.len() { 143 | info!( 144 | "Ollama: Successfully processed batch of {} embeddings ({} succeeded)", 145 | result.len(), 146 | success_count 147 | ); 148 | return Ok(result); 149 | } else { 150 | warn!( 151 | "Ollama batch result count mismatch: expected {}, got {}", 152 | texts.len(), 153 | result.len() 154 | ); 155 | } 156 | } 157 | 158 | else if parsed.is_array() { 159 | let array = parsed.as_array().unwrap(); 160 | let mut result = Vec::with_capacity(array.len()); 161 | let mut success_count = 0; 162 | for (i, emb_val) in array.iter().enumerate() { 163 | if let Some(vector) = emb_val.as_array() { 164 | let embedding: Vec = vector 165 | .iter() 166 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 167 | .collect(); 168 | 169 | if embedding.len() == self.dimension { 170 | result.push(embedding); 171 | success_count += 1; 172 | } else { 173 | warn!( 174 | "Ollama batch item {} dimension mismatch: expected {}, got {}", 175 | i, 176 | self.dimension, 177 | embedding.len() 178 | ); 179 | result.push(vec![0.0; self.dimension]); 180 | } 181 | } else { 182 | warn!("Ollama batch item {} missing 'embedding' array", i); 183 | result.push(vec![0.0; self.dimension]); 184 | } 185 | } 186 | 187 | if result.len() == texts.len() { 188 | info!( 189 | "Ollama: Successfully processed batch of {} embeddings ({} succeeded)", 190 | result.len(), 191 | success_count 192 | ); 193 | return 
Ok(result); 194 | } else { 195 | warn!( 196 | "Ollama batch result count mismatch: expected {}, got {}", 197 | texts.len(), 198 | result.len() 199 | ); 200 | } 201 | } 202 | else if let Some(embedding) = parsed.get("embedding") { 203 | if let Some(vector) = embedding.as_array() { 204 | let embedding: Vec = vector 205 | .iter() 206 | .filter_map(|v| v.as_f64().map(|f| f as f32)) 207 | .collect(); 208 | 209 | if embedding.len() == self.dimension { 210 | return Ok(vec![embedding]); 211 | } else { 212 | warn!( 213 | "Ollama single embedding dimension mismatch: expected {}, got {}", 214 | self.dimension, 215 | embedding.len() 216 | ); 217 | } 218 | } else { 219 | warn!("Ollama single embedding missing 'embedding' array"); 220 | } 221 | } else { 222 | warn!("Ollama batch response missing 'embeddings' array"); 223 | } 224 | } 225 | Err(e) => { 226 | warn!("Failed to parse Ollama batch response: {}. Falling back.", e); 227 | } 228 | } 229 | } 230 | Ok(resp) => { 231 | let status = resp.status(); 232 | let error_body = resp 233 | .text().await 234 | .unwrap_or_else(|_| "Failed to read error body".to_string()); 235 | warn!( 236 | "Ollama batch API returned status: {}. Body: {}. Falling back.", 237 | status, 238 | error_body 239 | ); 240 | } 241 | Err(e) => { 242 | warn!("Ollama batch request failed: {}. Falling back.", e); 243 | } 244 | } 245 | 246 | info!( 247 | "Ollama: Using fallback: parallel individual embedding requests for {} texts", 248 | texts.len() 249 | ); 250 | let futures: Vec<_> = texts 251 | .iter() 252 | .map(|text| self.generate_single_embedding(text)) 253 | .collect(); 254 | 255 | let results: Vec, _>> = join_all(futures).await; 256 | 257 | let final_embeddings: Vec> = results 258 | .into_iter() 259 | .map(|res| { 260 | match res { 261 | Ok(embedding) => embedding, 262 | Err(e) => { 263 | error!("Ollama single embedding failed during fallback: {}", e); 264 | vec![0.0; self.dimension] 265 | } 266 | } 267 | }) 268 | .collect(); 269 | 270 | Ok(final_embeddings) 271 | } 272 | 273 | fn get_dimension(&self) -> usize { 274 | self.dimension 275 | } 276 | } 277 | -------------------------------------------------------------------------------- /src/embedding/models/tei.rs: -------------------------------------------------------------------------------- 1 | use crate::embedding::AsyncEmbeddingGenerator; 2 | use async_trait::async_trait; 3 | use log::{ info, error, warn }; 4 | use reqwest::Client; 5 | use serde::Serialize; 6 | use std::error::Error as StdError; 7 | use std::time::Duration; 8 | 9 | #[derive(Serialize)] 10 | struct TeiRequest { 11 | inputs: Vec, 12 | #[serde(skip_serializing_if = "Option::is_none")] 13 | truncate: Option, 14 | } 15 | 16 | type TeiResponse = Vec>; 17 | 18 | pub struct TeiEmbeddingClient { 19 | client: Client, 20 | api_url: String, 21 | dimension: usize, 22 | } 23 | 24 | impl TeiEmbeddingClient { 25 | pub fn new( 26 | api_url: String, 27 | dimension: usize, 28 | timeout_secs: u64 29 | ) -> Result> { 30 | let api_endpoint = if !api_url.ends_with("/embed") { 31 | format!("{}/embed", api_url.trim_end_matches('/')) 32 | } else { 33 | api_url 34 | }; 35 | 36 | warn!("TEI server URL: {}", api_endpoint); 37 | Ok(Self { 38 | client: Client::builder().timeout(Duration::from_secs(timeout_secs)).build()?, 39 | api_url: api_endpoint, 40 | dimension, 41 | }) 42 | } 43 | } 44 | 45 | #[async_trait] 46 | impl AsyncEmbeddingGenerator for TeiEmbeddingClient { 47 | async fn generate_embeddings_batch( 48 | &self, 49 | texts: &[String] 50 | ) -> Result>, Box> { 51 | if 
texts.is_empty() { 52 | return Ok(vec![]); 53 | } 54 | 55 | info!( 56 | "TEI Client: Generating embeddings for {} texts via {}", 57 | texts.len(), 58 | self.api_url 59 | ); 60 | 61 | let request_payload = TeiRequest { 62 | inputs: texts.to_vec(), 63 | truncate: None, 64 | }; 65 | 66 | 67 | let mut retries = 3; 68 | let mut last_error = None; 69 | 70 | while retries > 0 { 71 | match self.client.post(&self.api_url).json(&request_payload).send().await { 72 | Ok(response) => { 73 | if response.status().is_success() { 74 | let embeddings = response.json::().await?; 75 | if embeddings.len() != texts.len() { 76 | error!( 77 | "TEI Client: Mismatch in response length. Expected {}, got {}.", 78 | texts.len(), 79 | embeddings.len() 80 | ); 81 | return Err( 82 | format!( 83 | "TEI response length mismatch: expected {}, got {}", 84 | texts.len(), 85 | embeddings.len() 86 | ).into() 87 | ); 88 | } 89 | for emb in &embeddings { 90 | if emb.len() != self.dimension { 91 | error!( 92 | "TEI Client: Mismatch in embedding dimension. Expected {}, got {}.", 93 | self.dimension, 94 | emb.len() 95 | ); 96 | return Err( 97 | format!( 98 | "TEI dimension mismatch: expected {}, got {}", 99 | self.dimension, 100 | emb.len() 101 | ).into() 102 | ); 103 | } 104 | } 105 | info!("TEI Client: Successfully generated {} embeddings", embeddings.len()); 106 | return Ok(embeddings); 107 | } else { 108 | let status = response.status(); 109 | let error_text = response 110 | .text().await 111 | .unwrap_or_else(|_| "Failed to read error body".to_string()); 112 | error!("TEI server returned error {}: {}", status, error_text); 113 | return Err(format!("TEI server error {}: {}", status, error_text).into()); 114 | } 115 | }, 116 | Err(e) => { 117 | warn!("TEI request failed (retries left: {}): {}", retries - 1, e); 118 | retries -= 1; 119 | last_error = Some(e); 120 | tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; 121 | continue; 122 | } 123 | } 124 | } 125 | 126 | Err(Box::new(std::io::Error::new( 127 | std::io::ErrorKind::Other, 128 | format!("Failed after multiple retries: {}", last_error.unwrap()) 129 | ))) 130 | } 131 | 132 | fn get_dimension(&self) -> usize { 133 | self.dimension 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod db; 2 | pub mod parser; 3 | pub mod embedding; 4 | pub mod cli; 5 | pub mod util; 6 | pub mod workflow; -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | 2 | use db2vec::util; 3 | 4 | use clap::Parser; 5 | use db2vec::cli::Args; 6 | use db2vec::db::select_database; 7 | use dotenvy::dotenv; 8 | 9 | use log::{ info, error }; 10 | use db2vec::util::{ read_file_and_detect_format, logo }; 11 | use db2vec::parser::parse_database_export; 12 | use db2vec::workflow::execute_migration_workflow; 13 | 14 | fn main() -> Result<(), db2vec::db::DbError> { 15 | logo(); 16 | dotenv().ok(); 17 | env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("off")).init(); 18 | let args = Args::parse(); 19 | let file_path = args.dump_file.clone(); 20 | util::init_thread_pool(args.num_threads); 21 | 22 | let (content, format) = match read_file_and_detect_format(&file_path) { 23 | Ok(result) => result, 24 | Err(e) => { 25 | let err_msg = format!("Error reading file '{}': {}", file_path, e); 26 | error!("{}", 
err_msg); 27 | return Err(err_msg.into()); 28 | } 29 | }; 30 | 31 | let records = match parse_database_export(&content, &format, &args) { 32 | Ok(recs) => recs, 33 | Err(e) => { 34 | let err_msg = format!("Error parsing database export: {}", e); 35 | error!("{}", err_msg); 36 | return Err(err_msg.into()); 37 | } 38 | }; 39 | 40 | let database = select_database(&args)?; 41 | match execute_migration_workflow(records, &*database, &args) { 42 | Ok(stats) => { 43 | info!( 44 | "Migration successful: {} records processed in {:.2} seconds", 45 | stats.processed_records, 46 | stats.elapsed_seconds 47 | ); 48 | Ok(()) 49 | } 50 | Err(e) => { 51 | error!("Migration failed: {}", e); 52 | Err(e) 53 | } 54 | } 55 | } -------------------------------------------------------------------------------- /src/parser/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::cli::Args; 2 | 3 | use log::{ info, warn, debug }; 4 | use parse_regex::mssql::parse_mssql; 5 | use parse_regex::mysql::parse_mysql; 6 | use parse_regex::oracle::parse_oracle; 7 | use parse_regex::postgres::parse_postgres; 8 | use parse_regex::sqlite::parse_sqlite; 9 | use parse_regex::surreal::parse_surreal; 10 | use serde_json::Value; 11 | use std::error::Error; 12 | 13 | pub mod parse_regex; 14 | pub trait ExportParser { 15 | fn parse(&self, content: &str) -> Result<Vec<Value>, Box<dyn Error>>; 16 | } 17 | 18 | pub fn parse_database_export( 19 | content: &str, 20 | format: &str, 21 | args: &Args 22 | ) -> Result<Vec<Value>, Box<dyn Error>> { 23 | let mut all_records = Vec::new(); 24 | 25 | let excluder = if args.use_exclude { 26 | Some(crate::util::exclude::Excluder::load("config/exclude.json")) 27 | } else { 28 | None 29 | }; 30 | 31 | let chunks: Vec<String> = match format { 32 | "mssql" | "postgres" | "mysql" | "surreal" | "sqlite" => { 33 | info!("Processing {} file without chunking", format); 34 | vec![content.to_string()] 35 | } 36 | "oracle" => { 37 | content 38 | .split("Insert into") 39 | .filter(|s| !s.trim().is_empty()) 40 | .enumerate() 41 | .map(|(i, s)| { 42 | if i > 0 { format!("Insert into{}", s) } else { s.to_string() } 43 | }) 44 | .collect() 45 | } 46 | _ => { 47 | warn!("Using default (single chunk) processing for unknown format: {}", format); 48 | vec![content.to_string()] 49 | } 50 | }; 51 | 52 | info!("Found {} chunks to process for format '{}'", chunks.len(), format); 53 | 54 | for (i, chunk) in chunks.iter().enumerate() { 55 | if chunk.trim().is_empty() { 56 | debug!("Skipping empty chunk {}", i); 57 | continue; 58 | } 59 | 60 | match parse_with_regex(&chunk, format, args) { 61 | Some(mut records) => { 62 | if !records.is_empty() { 63 | if let Some(ref excl) = excluder { 64 | debug!("Filtering fields for {} records from chunk {}", records.len(), i); 65 | for record in &mut records { 66 | excl.filter_record(record); 67 | } 68 | } 69 | 70 | info!("Parsed {} records from chunk {}", records.len(), i); 71 | if args.debug { 72 | for (j, rec) in records.iter().enumerate() { 73 | debug!("Debug: Record {} in chunk {}: {}", j, i, rec); 74 | } 75 | } 76 | all_records.extend(records); 77 | } else { 78 | debug!("Regex parsing yielded 0 records for chunk {}", i); 79 | } 80 | } 81 | None => { 82 | 83 | if args.debug && chunk.len() < 1000 { 84 | debug!("Content of failed chunk {}:\n{}", i, chunk); 85 | } else if args.debug { 86 | debug!( 87 | "Content of failed chunk {} (truncated):\n{}...", 88 | i, 89 | &chunk[..std::cmp::min(chunk.len(), 1000)] 90 | ); 91 | } 92 | } 93 | } 94 | } 95 | 96 | info!("Total records parsed: {}",
all_records.len()); 97 | Ok(all_records) 98 | } 99 | 100 | pub fn detect_format(file_path: &str, content: &str) -> String { 101 | let _content_lower = content.to_lowercase(); 102 | 103 | if file_path.ends_with(".surql") { 104 | return "surreal".to_string(); 105 | } 106 | 107 | // Oracle distinctive patterns 108 | if 109 | content.contains("REM INSERTING into") || 110 | content.contains("SET DEFINE OFF;") || 111 | content.contains("Insert into ") || 112 | (content.contains("CREATE TABLE \"") && 113 | content.contains("PCTFREE") && 114 | content.contains("TABLESPACE")) || 115 | content.contains("BUFFER_POOL DEFAULT FLASH_CACHE DEFAULT CELL_FLASH_CACHE DEFAULT") || 116 | content.contains("USING INDEX PCTFREE") || 117 | content.contains("ALTER SESSION SET EVENTS") || 118 | content.contains("DBMS_LOGREP_IMP") 119 | { 120 | return "oracle".to_string(); 121 | } 122 | 123 | // PostgreSQL distinctive patterns 124 | if 125 | (content.contains("COPY ") && content.contains(" FROM stdin;")) || 126 | content.contains("PostgreSQL database dump") || 127 | (content.contains("SET ") && content.contains("standard_conforming_strings")) || 128 | content.contains("ALTER TABLE ONLY") || 129 | (content.contains("CREATE TYPE") && content.contains("AS ENUM")) || 130 | (content.contains("CREATE SEQUENCE") && content.contains("OWNED BY")) 131 | { 132 | return "postgres".to_string(); 133 | } 134 | 135 | // SQLite distinctive patterns 136 | if 137 | content.starts_with("PRAGMA foreign_keys=OFF;") || 138 | (content.contains("BEGIN TRANSACTION;") && 139 | content.contains("COMMIT;") && 140 | content.contains("CREATE TABLE") && 141 | !content.contains("ENGINE=InnoDB") && 142 | !content.contains("TABLESPACE") && 143 | content.contains("INSERT INTO ")) || 144 | content.contains("sqlite_sequence") 145 | { 146 | return "sqlite".to_string(); 147 | } 148 | 149 | // MSSQL distinctive patterns 150 | if 151 | content.contains("SET ANSI_NULLS ON") || 152 | content.contains("SET QUOTED_IDENTIFIER ON") || 153 | content.contains("CREATE TABLE [dbo].") || 154 | content.contains("INSERT [dbo].") || 155 | content.contains("WITH (PAD_INDEX = OFF") || 156 | content.contains("GO") 157 | { 158 | return "mssql".to_string(); 159 | } 160 | 161 | // MySQL distinctive patterns 162 | if 163 | content.contains("ENGINE=InnoDB") || 164 | content.contains("LOCK TABLES") || 165 | content.contains("/*!40") || 166 | content.contains("AUTO_INCREMENT") || 167 | content.contains("COLLATE=utf8mb4") 168 | { 169 | return "mysql".to_string(); 170 | } 171 | 172 | "json".to_string() 173 | } 174 | 175 | pub fn parse_with_regex(chunk: &str, format: &str, args: &Args) -> Option> { 176 | match format { 177 | "surreal" => parse_surreal(chunk, args), 178 | "mysql" => parse_mysql(chunk, args), 179 | "postgres" => parse_postgres(chunk, args), 180 | "oracle" => parse_oracle(chunk, args), 181 | "sqlite" => parse_sqlite(chunk, args), 182 | "mssql" => parse_mssql(chunk, args), 183 | _ => None, 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/parser/parse_regex/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod mysql; 2 | pub mod postgres; 3 | pub mod oracle; 4 | pub mod surreal; 5 | pub mod sqlite; 6 | pub mod mssql; 7 | use serde_json::Value; 8 | 9 | pub fn clean_html_in_value(val: &mut Value) { 10 | match val { 11 | Value::String(s) => { 12 | if s.contains('<') && s.contains('>') { 13 | *s = html2text 14 | ::from_read(s.as_bytes(), usize::MAX) 15 | 
.unwrap_or_else(|_| s.clone()) 16 | .replace('\n', " ") 17 | .split_whitespace() 18 | .collect::>() 19 | .join(" ") 20 | .trim() 21 | .to_string(); 22 | } 23 | } 24 | Value::Array(arr) => { 25 | for v in arr { 26 | clean_html_in_value(v); 27 | } 28 | } 29 | Value::Object(obj) => { 30 | for v in obj.values_mut() { 31 | clean_html_in_value(v); 32 | } 33 | } 34 | _ => {} 35 | } 36 | } 37 | 38 | pub fn extract_json_array(text: &str) -> Option<&str> { 39 | let open_bracket = text.find('[')?; 40 | let mut depth = 1; 41 | let mut in_string = false; 42 | let mut escape_next = false; 43 | 44 | for (i, c) in text[open_bracket + 1..].chars().enumerate() { 45 | match c { 46 | '[' if !in_string => { 47 | depth += 1; 48 | } 49 | ']' if !in_string => { 50 | depth -= 1; 51 | if depth == 0 { 52 | return Some(&text[open_bracket..=open_bracket + i + 1]); 53 | } 54 | } 55 | '"' if !escape_next => { 56 | in_string = !in_string; 57 | } 58 | '\\' if in_string && !escape_next => { 59 | escape_next = true; 60 | } 61 | _ => { 62 | escape_next = false; 63 | } 64 | } 65 | } 66 | 67 | None 68 | } 69 | pub fn parse_array(array_str: &str) -> Option { 70 | let content = array_str.get(1..array_str.len() - 1)?; 71 | if content.is_empty() { 72 | return Some(Value::Array(vec![])); 73 | } 74 | let mut elements = Vec::new(); 75 | let mut current_element = String::new(); 76 | let mut chars = content.chars().peekable(); 77 | let mut in_quotes = false; 78 | let mut escape_next = false; 79 | 80 | while let Some(c) = chars.next() { 81 | if escape_next { 82 | current_element.push(c); 83 | escape_next = false; 84 | } else if c == '\\' { 85 | escape_next = true; 86 | } else if c == '"' { 87 | in_quotes = !in_quotes; 88 | } else if c == ',' && !in_quotes { 89 | elements.push(Value::String(current_element.trim().to_string())); 90 | current_element.clear(); 91 | } else { 92 | current_element.push(c); 93 | } 94 | } 95 | 96 | elements.push(Value::String(current_element.trim().to_string())); 97 | 98 | Some(Value::Array(elements)) 99 | } 100 | -------------------------------------------------------------------------------- /src/parser/parse_regex/mssql.rs: -------------------------------------------------------------------------------- 1 | use log::{ debug, info }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::clean_html_in_value; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_mssql(chunk: &str, args: &Args) -> Option> { 9 | info!("Using parse method: MSSQL"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let insert_re = Regex::new( 19 | r"(?is)INSERT\s+\[(?:dbo|DB_OWNER)\]\.\[(\w+)\]\s*(?:\((.*?)\))?\s*VALUES\s*" 20 | ).ok()?; 21 | let values_re = Regex::new(r"\(((?:[^()]*|\((?:[^()]*|\([^()]*\))*\))*)\)").ok()?; 22 | 23 | for cap in insert_re.captures_iter(chunk) { 24 | let table = cap.get(1)?.as_str(); 25 | 26 | if let Some(ref excl) = excluder { 27 | if excl.ignore_table(table) { 28 | info!("Skipping excluded MSSQL table: {}", table); 29 | continue; 30 | } 31 | } 32 | 33 | info!("Processing INSERT for MSSQL table: {}", table); 34 | 35 | let column_names: Vec = if let Some(cols_match) = cap.get(2) { 36 | cols_match 37 | .as_str() 38 | .split(',') 39 | .map(|c| 40 | c 41 | .trim() 42 | .trim_matches(&['[', ']'][..]) 43 | .to_string() 44 | ) 45 | .collect() 46 | } else { 47 | Vec::new() 48 | }; 49 | 50 | let insert_statement_end = 
cap.get(0)?.end(); 51 | let search_area = &chunk[insert_statement_end..]; 52 | 53 | for values_cap in values_re.captures_iter(search_area) { 54 | let values_str = values_cap.get(1)?.as_str(); 55 | let mut fields = Vec::new(); 56 | let mut current = String::new(); 57 | let mut in_string = false; 58 | let mut in_cast = 0; 59 | let mut escape_next = false; 60 | 61 | for c in values_str.chars() { 62 | if escape_next { 63 | current.push(c); 64 | escape_next = false; 65 | } else if c == '\\' { 66 | current.push(c); 67 | escape_next = true; 68 | } else if c == '\'' { 69 | current.push(c); 70 | if !in_string && current.ends_with("N'") { 71 | in_string = true; 72 | } else if in_string { 73 | if let Some(next_char) = search_area.chars().nth(current.len()) { 74 | if next_char == '\'' { 75 | continue; 76 | } 77 | } 78 | in_string = false; 79 | } 80 | } else if c == '(' { 81 | current.push(c); 82 | if current.contains("CAST") { 83 | in_cast += 1; 84 | } 85 | } else if c == ')' { 86 | current.push(c); 87 | if in_cast > 0 { 88 | in_cast -= 1; 89 | } 90 | } else if c == ',' && !in_string && in_cast == 0 { 91 | fields.push(current.trim().to_string()); 92 | current.clear(); 93 | } else { 94 | current.push(c); 95 | } 96 | } 97 | 98 | if !current.is_empty() { 99 | fields.push(current.trim().to_string()); 100 | } 101 | 102 | let col_names = if !column_names.is_empty() { 103 | column_names.clone() 104 | } else { 105 | (0..fields.len()) 106 | .map(|i| { 107 | match i { 108 | 0 => "id".to_string(), 109 | 1 => "name".to_string(), 110 | 2 => "description".to_string(), 111 | _ => format!("column{}", i), 112 | } 113 | }) 114 | .collect() 115 | }; 116 | 117 | if fields.len() != col_names.len() { 118 | continue; 119 | } 120 | 121 | let mut obj = serde_json::Map::new(); 122 | obj.insert("table".to_string(), Value::String(table.to_string())); 123 | 124 | for (i, val_str) in fields.iter().enumerate() { 125 | if i >= col_names.len() { 126 | continue; 127 | } 128 | 129 | let value = parse_mssql_value(val_str); 130 | obj.insert(col_names[i].clone(), value); 131 | } 132 | 133 | let id_key = obj 134 | .keys() 135 | .find(|k| k.eq_ignore_ascii_case("id")) 136 | .cloned(); 137 | if let Some(key) = id_key { 138 | obj.remove(&key); 139 | debug!("Removed 'id' field (key: {}) from MSSQL record", key); 140 | } 141 | 142 | if obj.len() > 1 { 143 | let mut final_value = Value::Object(obj); 144 | clean_html_in_value(&mut final_value); 145 | records.push(final_value); 146 | } 147 | } 148 | } 149 | 150 | if records.is_empty() { 151 | None 152 | } else { 153 | Some(records) 154 | } 155 | } 156 | 157 | fn parse_mssql_value(val_str: &str) -> Value { 158 | if val_str == "NULL" { 159 | return Value::Null; 160 | } 161 | if val_str.starts_with("N'") && val_str.ends_with("'") { 162 | let inner_str = &val_str[2..val_str.len() - 1].replace("''", "'"); 163 | 164 | if 165 | (inner_str.starts_with("[") && inner_str.ends_with("]")) || 166 | (inner_str.starts_with("{") && inner_str.ends_with("}")) 167 | { 168 | if let Ok(json_val) = serde_json::from_str(inner_str) { 169 | return json_val; 170 | } 171 | } 172 | 173 | return Value::String(inner_str.to_string()); 174 | } 175 | 176 | if val_str.starts_with("CAST(") { 177 | let re = Regex::new(r"CAST\(\s*N?'?(.*?)'?\s+AS").ok(); 178 | if let Some(re) = re { 179 | if let Some(cap) = re.captures(val_str) { 180 | if let Some(m) = cap.get(1) { 181 | return parse_mssql_value(m.as_str()); 182 | } 183 | } 184 | } 185 | 186 | return Value::String(val_str.to_string()); 187 | } 188 | 189 | if val_str == "0" || val_str 
== "1" { 190 | if let Ok(b) = val_str.parse::() { 191 | return Value::Bool(b != 0); 192 | } 193 | } 194 | 195 | if let Ok(i) = val_str.parse::() { 196 | return Value::Number(i.into()); 197 | } 198 | 199 | if let Ok(f) = val_str.parse::() { 200 | if let Some(n) = serde_json::Number::from_f64(f) { 201 | return Value::Number(n); 202 | } 203 | } 204 | 205 | Value::String(val_str.to_string()) 206 | } 207 | -------------------------------------------------------------------------------- /src/parser/parse_regex/mysql.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::{ clean_html_in_value, parse_array }; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_mysql(chunk: &str, args: &Args) -> Option> { 9 | info!("Using parse method: MySQL"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let insert_re = Regex::new( 19 | r#"(?is)INSERT INTO\s+[`'\"]?(\w+)[`'\"]?\s*(?:\(([^)]+)\))?\s*VALUES\s*(.*?);"# 20 | ).ok()?; 21 | 22 | let row_re = Regex::new(r"\((.*?)\)").ok()?; 23 | 24 | for cap in insert_re.captures_iter(chunk) { 25 | let table = cap.get(1)?.as_str(); 26 | 27 | if let Some(ref excl) = excluder { 28 | if excl.ignore_table(table) { 29 | info!("Skipping excluded MySQL table: {}", table); 30 | continue; 31 | } 32 | } 33 | 34 | let column_names: Vec = if let Some(cols_match) = cap.get(2) { 35 | cols_match 36 | .as_str() 37 | .split(',') 38 | .map(|c| 39 | c 40 | .trim() 41 | .trim_matches(&['`', '\'', '"'][..]) 42 | .to_string() 43 | ) 44 | .collect() 45 | } else { 46 | Vec::new() 47 | }; 48 | 49 | let values_str = cap.get(3)?.as_str(); 50 | let mut inferred_column_count = 0; 51 | let mut first_row_processed = false; 52 | 53 | for row_cap in row_re.captures_iter(values_str) { 54 | let row = row_cap.get(1)?.as_str(); 55 | let mut fields = Vec::new(); 56 | let mut current = String::new(); 57 | let mut in_string = false; 58 | let mut escape_next = false; 59 | 60 | for c in row.chars() { 61 | if escape_next { 62 | current.push(c); 63 | escape_next = false; 64 | } else if c == '\\' { 65 | current.push(c); 66 | escape_next = true; 67 | } else if c == '\'' { 68 | current.push(c); 69 | in_string = !in_string; 70 | } else if c == ',' && !in_string { 71 | fields.push(current.trim().to_string()); 72 | current.clear(); 73 | } else { 74 | current.push(c); 75 | } 76 | } 77 | if !current.is_empty() { 78 | fields.push(current.trim().to_string()); 79 | } 80 | 81 | let col_names = if !column_names.is_empty() { 82 | column_names.clone() 83 | } else if !first_row_processed { 84 | first_row_processed = true; 85 | inferred_column_count = fields.len(); 86 | 87 | let mut default_cols = Vec::with_capacity(fields.len()); 88 | for i in 0..fields.len() { 89 | let col_name = match i { 90 | 0 => "id".to_string(), 91 | 1 => "name".to_string(), 92 | 2 => "description".to_string(), 93 | _ => format!("column{}", i), 94 | }; 95 | default_cols.push(col_name); 96 | } 97 | default_cols 98 | } else { 99 | (0..inferred_column_count) 100 | .map(|i| { 101 | match i { 102 | 0 => "id".to_string(), 103 | 1 => "name".to_string(), 104 | 2 => "description".to_string(), 105 | _ => format!("column{}", i), 106 | } 107 | }) 108 | .collect() 109 | }; 110 | 111 | let mut obj = serde_json::Map::new(); 112 | obj.insert("table".to_string(), 
Value::String(table.to_string())); 113 | 114 | for (i, val_str) in fields.iter().enumerate() { 115 | if i >= col_names.len() { 116 | warn!("More values than columns for table '{}', value: '{}'", table, val_str); 117 | continue; 118 | } 119 | 120 | let mut value = Value::Null; 121 | 122 | if val_str == "NULL" { 123 | } else if val_str.starts_with('\'') && val_str.ends_with('\'') { 124 | let inner_str = val_str.trim_matches('\''); 125 | let unescaped_mysql_str = inner_str 126 | .replace("''", "'") 127 | .replace("\\\\", "\\") 128 | .replace("\\'", "'"); 129 | 130 | if 131 | (unescaped_mysql_str.starts_with('[') && 132 | unescaped_mysql_str.ends_with(']')) || 133 | (unescaped_mysql_str.starts_with('{') && unescaped_mysql_str.ends_with('}')) 134 | { 135 | let potential_json_str = unescaped_mysql_str.replace("\\\"", "\""); 136 | match serde_json::from_str::(&potential_json_str) { 137 | Ok(json_value) => { 138 | value = json_value; 139 | } 140 | Err(_) => { 141 | value = Value::String(unescaped_mysql_str); 142 | } 143 | } 144 | } else { 145 | value = Value::String(unescaped_mysql_str); 146 | } 147 | } else if val_str.starts_with('{') && val_str.ends_with('}') { 148 | match serde_json::from_str::(val_str) { 149 | Ok(json_val) => { 150 | value = json_val; 151 | } 152 | Err(_) => { 153 | value = parse_array(val_str).unwrap_or( 154 | Value::String(val_str.to_string()) 155 | ); 156 | } 157 | } 158 | } else if val_str.starts_with('[') && val_str.ends_with(']') { 159 | match serde_json::from_str::(val_str) { 160 | Ok(json_val) => { 161 | value = json_val; 162 | } 163 | Err(_) => { 164 | value = Value::String(val_str.to_string()); 165 | } 166 | } 167 | } else if let Ok(n) = val_str.parse::() { 168 | value = Value::Number(n.into()); 169 | } else if let Ok(f) = val_str.parse::() { 170 | value = Value::Number( 171 | serde_json::Number::from_f64(f).unwrap_or_else(|| (0).into()) 172 | ); 173 | } else { 174 | value = Value::String(val_str.to_string()); 175 | } 176 | 177 | obj.insert(col_names[i].clone(), value); 178 | } 179 | 180 | let id_key = obj 181 | .keys() 182 | .find(|k| k.eq_ignore_ascii_case("id")) 183 | .cloned(); 184 | if let Some(key) = id_key { 185 | obj.remove(&key); 186 | debug!("Removed 'id' field (key: {}) from MySQL record", key); 187 | } 188 | 189 | if obj.len() > 1 { 190 | let mut final_value = Value::Object(obj); 191 | clean_html_in_value(&mut final_value); 192 | records.push(final_value); 193 | } else { 194 | warn!("Skipping MySQL record for table '{}', too few fields after processing", table); 195 | } 196 | } 197 | } 198 | 199 | if records.is_empty() { 200 | None 201 | } else { 202 | Some(records) 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /src/parser/parse_regex/oracle.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::clean_html_in_value; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_oracle(content: &str, args: &Args) -> Option> { 9 | info!("Using parse method: Oracle"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let insert_re = Regex::new( 19 | r#"(?is)Insert\s+into\s+([\w\.\"]+)\s+\(([^)]+)\)\s+values\s+\(([^;]+)\);"# 20 | ).ok()?; 21 | 22 | for cap in insert_re.captures_iter(content) { 23 | let 
full_table = cap.get(1)?.as_str(); 24 | 25 | let table = ( 26 | if full_table.contains('.') { 27 | full_table.split('.').last().unwrap_or(full_table) 28 | } else { 29 | full_table 30 | } 31 | ).trim_matches('"'); 32 | 33 | if let Some(ref excl) = excluder { 34 | if excl.ignore_table(table) { 35 | info!("Skipping excluded Oracle table: {}", table); 36 | continue; 37 | } 38 | } 39 | 40 | debug!("Processing Oracle INSERT for table: {}", table); 41 | 42 | let columns: Vec<&str> = cap 43 | .get(2)? 44 | .as_str() 45 | .split(',') 46 | .map(|s| s.trim().trim_matches('"')) 47 | .collect(); 48 | 49 | let values_str = cap.get(3)?.as_str(); 50 | let mut fields = Vec::new(); 51 | let mut current = String::new(); 52 | let mut in_string = false; 53 | let mut function_depth = 0; 54 | let mut chars = values_str.chars().peekable(); 55 | 56 | while let Some(c) = chars.next() { 57 | match c { 58 | '\'' if !in_string => { 59 | current.push('\''); 60 | in_string = true; 61 | } 62 | '\'' if in_string => { 63 | if chars.peek() == Some(&'\'') { 64 | current.push('\''); 65 | current.push('\''); 66 | chars.next(); 67 | } else { 68 | current.push('\''); 69 | in_string = false; 70 | } 71 | } 72 | '(' => { 73 | current.push('('); 74 | if !in_string { 75 | function_depth += 1; 76 | } 77 | } 78 | ')' => { 79 | current.push(')'); 80 | if !in_string && function_depth > 0 { 81 | function_depth -= 1; 82 | } 83 | } 84 | ',' if !in_string && function_depth == 0 => { 85 | fields.push(current.trim().to_string()); 86 | current.clear(); 87 | } 88 | _ => current.push(c), 89 | } 90 | } 91 | 92 | if !current.is_empty() || fields.len() < columns.len() { 93 | fields.push(current.trim().to_string()); 94 | } 95 | 96 | if fields.len() != columns.len() { 97 | warn!( 98 | "Mismatched number of columns ({}) and values ({}) for table '{}'. 
Row values: '{}'", 99 | columns.len(), 100 | fields.len(), 101 | table, 102 | values_str 103 | ); 104 | continue; 105 | } 106 | 107 | let mut obj = serde_json::Map::new(); 108 | obj.insert("table".to_string(), Value::String(table.to_string())); 109 | 110 | for (col, val) in columns.iter().zip(fields.iter()) { 111 | let parsed_value = parse_oracle_value(val); 112 | obj.insert(col.to_string(), parsed_value); 113 | } 114 | 115 | let id_key = obj 116 | .keys() 117 | .find(|k| k.eq_ignore_ascii_case("id")) 118 | .cloned(); 119 | if let Some(key) = id_key { 120 | obj.remove(&key); 121 | debug!("Removed 'id' field (key: {}) from Oracle record", key); 122 | } 123 | 124 | if obj.len() > 1 { 125 | let mut final_value = Value::Object(obj); 126 | clean_html_in_value(&mut final_value); 127 | records.push(final_value); 128 | } else { 129 | warn!("Skipping Oracle record for table '{}', too few fields after processing", table); 130 | } 131 | } 132 | 133 | if records.is_empty() { 134 | None 135 | } else { 136 | Some(records) 137 | } 138 | } 139 | 140 | fn parse_oracle_value(val_str: &str) -> Value { 141 | if val_str.eq_ignore_ascii_case("NULL") { 142 | return Value::Null; 143 | } 144 | if val_str.starts_with('\'') && val_str.ends_with('\'') { 145 | let inner_str = &val_str[1..val_str.len() - 1].replace("''", "'"); 146 | 147 | if 148 | (inner_str.starts_with('{') && inner_str.ends_with('}')) || 149 | (inner_str.starts_with('[') && inner_str.ends_with(']')) 150 | { 151 | if let Ok(json_val) = serde_json::from_str(inner_str) { 152 | return json_val; 153 | } 154 | } 155 | 156 | return Value::String(inner_str.to_string()); 157 | } 158 | 159 | if val_str.starts_with("to_timestamp(") { 160 | let timestamp_re = Regex::new(r"to_timestamp\('([^']+)'").ok(); 161 | if let Some(re) = timestamp_re { 162 | if let Some(cap) = re.captures(val_str) { 163 | if let Some(date_match) = cap.get(1) { 164 | return Value::String(date_match.as_str().to_string()); 165 | } 166 | } 167 | } 168 | return Value::String("timestamp_parse_error".to_string()); 169 | } 170 | 171 | if let Ok(i) = val_str.parse::() { 172 | return Value::Number(i.into()); 173 | } 174 | 175 | if let Ok(f) = val_str.parse::() { 176 | if let Some(n) = serde_json::Number::from_f64(f) { 177 | return Value::Number(n); 178 | } 179 | } 180 | 181 | Value::String(val_str.to_string()) 182 | } 183 | -------------------------------------------------------------------------------- /src/parser/parse_regex/postgres.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::{ clean_html_in_value, parse_array }; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_postgres(content: &str, args: &Args) -> Option> { 9 | info!("Using parse method: Postgres"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let copy_re = Regex::new( 19 | r"COPY\s+public\.([a-zA-Z0-9_]+)\s*\(([^)]+)\)\s+FROM stdin;\n((?s:.*?))\n\\\." 20 | ).ok()?; 21 | 22 | for cap in copy_re.captures_iter(content) { 23 | let table = cap.get(1)?.as_str(); 24 | 25 | if let Some(ref excl) = excluder { 26 | if excl.ignore_table(table) { 27 | info!("Skipping excluded Postgres table: {}", table); 28 | continue; 29 | } 30 | } 31 | 32 | let columns: Vec<&str> = cap 33 | .get(2)? 
34 | .as_str() 35 | .split(',') 36 | .map(|s| s.trim()) 37 | .collect(); 38 | let rows = cap.get(3)?.as_str(); 39 | 40 | for line in rows.lines() { 41 | if line.trim().is_empty() { 42 | continue; 43 | } 44 | 45 | let fields: Vec<&str> = line.split('\t').collect(); 46 | if fields.len() != columns.len() { 47 | warn!( 48 | "Warning: Mismatched number of columns ({}) and values ({}) for table '{}' in COPY data. Line: '{}'", 49 | columns.len(), 50 | fields.len(), 51 | table, 52 | line 53 | ); 54 | continue; 55 | } 56 | let mut obj = serde_json::Map::new(); 57 | obj.insert("table".to_string(), Value::String(table.to_string())); 58 | 59 | for (col, val_str) in columns.iter().zip(fields.iter()) { 60 | let value = if *val_str == r"\N" { 61 | Value::Null 62 | } else if 63 | (val_str.starts_with('{') && val_str.ends_with('}')) || 64 | (val_str.starts_with('[') && val_str.ends_with(']')) 65 | { 66 | match serde_json::from_str::(val_str) { 67 | Ok(json_val) => json_val, 68 | Err(_) => { 69 | if val_str.starts_with('{') && val_str.ends_with('}') { 70 | parse_array(val_str).unwrap_or(Value::String(val_str.to_string())) 71 | } else { 72 | Value::String(val_str.to_string()) 73 | } 74 | } 75 | } 76 | } else { 77 | let unescaped_val = val_str 78 | .replace("\\\\", "\\") 79 | .replace("\\t", "\t") 80 | .replace("\\n", "\n"); 81 | Value::String(unescaped_val) 82 | }; 83 | obj.insert(col.trim().to_string(), value); 84 | } 85 | 86 | let id_key = obj 87 | .keys() 88 | .find(|k| k.eq_ignore_ascii_case("id")) 89 | .cloned(); 90 | if let Some(key) = id_key { 91 | obj.remove(&key); 92 | debug!("Removed 'id' field (key: {}) from Postgres record", key); 93 | } 94 | 95 | if obj.len() > 1 { 96 | let mut final_value = Value::Object(obj); 97 | clean_html_in_value(&mut final_value); 98 | records.push(final_value); 99 | } else { 100 | warn!( 101 | "Skipping Postgres record for table '{}', became empty after removing ID. 
Original line: '{}'", 102 | table, 103 | line 104 | ); 105 | } 106 | } 107 | } 108 | 109 | if records.is_empty() { 110 | None 111 | } else { 112 | Some(records) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/parser/parse_regex/sqlite.rs: -------------------------------------------------------------------------------- 1 | use log::{ info, warn, debug }; 2 | use regex::Regex; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::clean_html_in_value; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_sqlite(chunk: &str, args: &Args) -> Option> { 9 | info!("Using parse method: SQLite"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let create_re = Regex::new( 19 | r"(?is)CREATE TABLE\s+(?:IF NOT EXISTS\s+)?(?:`?(\w+)`?|(\w+))\s*\((.*?)\);" 20 | ).ok()?; 21 | 22 | let column_def_re = Regex::new(r"^\s*(?:`?(\w+)`?|(\w+))\s+").ok()?; 23 | let mut table_columns = std::collections::HashMap::new(); 24 | 25 | for cap in create_re.captures_iter(chunk) { 26 | let table_name = cap 27 | .get(1) 28 | .or_else(|| cap.get(2)) 29 | .map(|m| m.as_str()); 30 | let cols_def_match = cap.get(3); 31 | 32 | if let (Some(table_name), Some(cols_def_match)) = (table_name, cols_def_match) { 33 | let cols_def = cols_def_match.as_str(); 34 | let mut cols = Vec::new(); 35 | for line in cols_def.lines() { 36 | let trimmed_line = line.trim(); 37 | if 38 | trimmed_line.starts_with("--") || 39 | trimmed_line.starts_with("PRIMARY") || 40 | trimmed_line.starts_with("UNIQUE") || 41 | trimmed_line.starts_with("CHECK") || 42 | trimmed_line.starts_with("FOREIGN") || 43 | trimmed_line.is_empty() 44 | { 45 | continue; 46 | } 47 | if let Some(col_cap) = column_def_re.captures(trimmed_line) { 48 | if let Some(col_name) = col_cap.get(1).or_else(|| col_cap.get(2)) { 49 | cols.push(col_name.as_str().to_string()); 50 | } 51 | } 52 | } 53 | if !cols.is_empty() { 54 | debug!("Found columns for table '{}': {:?}", table_name, cols); 55 | table_columns.insert(table_name.to_string(), cols); 56 | } 57 | } 58 | } 59 | 60 | if table_columns.is_empty() { 61 | warn!("Could not parse any CREATE TABLE statements to find column names in SQLite chunk."); 62 | return None; 63 | } 64 | 65 | let insert_re = Regex::new( 66 | r"(?is)INSERT INTO\s+(?:`?(\w+)`?|(\w+))\s+VALUES\s*\((.*?)\);" 67 | ).ok()?; 68 | 69 | for cap in insert_re.captures_iter(chunk) { 70 | let table = match cap.get(1).or_else(|| cap.get(2)) { 71 | Some(t) => t.as_str(), 72 | None => { 73 | continue; 74 | } 75 | }; 76 | 77 | if table == "sqlite_sequence" { 78 | continue; 79 | } 80 | 81 | if let Some(ref excl) = excluder { 82 | if excl.ignore_table(table) { 83 | info!("Skipping excluded SQLite table: {}", table); 84 | continue; 85 | } 86 | } 87 | 88 | let columns = match table_columns.get(table) { 89 | Some(cols) => cols, 90 | None => { 91 | warn!("Skipping INSERT for table '{}' because columns were not found (CREATE TABLE missing or unparsed).", table); 92 | continue; 93 | } 94 | }; 95 | let values_str = cap.get(3).map_or("", |m| m.as_str()); 96 | let mut fields = Vec::new(); 97 | let mut current_field = String::new(); 98 | let mut in_string = false; 99 | let mut chars = values_str.chars().peekable(); 100 | 101 | while let Some(c) = chars.next() { 102 | if c == '\'' { 103 | if in_string && chars.peek() == Some(&'\'') { 104 | current_field.push(c); 105 | 
chars.next(); 106 | } else { 107 | in_string = !in_string; 108 | } 109 | current_field.push(c); 110 | } else if c == ',' && !in_string { 111 | fields.push(current_field.trim().to_string()); 112 | current_field.clear(); 113 | } else { 114 | current_field.push(c); 115 | } 116 | } 117 | 118 | fields.push(current_field.trim().to_string()); 119 | 120 | if fields.len() != columns.len() { 121 | warn!( 122 | "Mismatched number of columns ({}) and values ({}) for table '{}'. Row: '{}'", 123 | columns.len(), 124 | fields.len(), 125 | table, 126 | values_str 127 | ); 128 | continue; 129 | } 130 | 131 | let mut obj = serde_json::Map::new(); 132 | obj.insert("table".to_string(), Value::String(table.to_string())); 133 | 134 | for (i, col) in columns.iter().enumerate() { 135 | let val_str = &fields[i]; 136 | let mut value = Value::Null; 137 | if val_str == "NULL" { 138 | } else if val_str.starts_with('\'') && val_str.ends_with('\'') && val_str.len() >= 2 { 139 | let inner_str = &val_str[1..val_str.len() - 1]; 140 | let unescaped_str = inner_str.replace("''", "'"); 141 | 142 | if 143 | (unescaped_str.starts_with('[') && unescaped_str.ends_with(']')) || 144 | (unescaped_str.starts_with('{') && unescaped_str.ends_with('}')) 145 | { 146 | match serde_json::from_str::(&unescaped_str) { 147 | Ok(json_value) => { 148 | value = json_value; 149 | } 150 | Err(_) => { 151 | value = Value::String(unescaped_str); 152 | } 153 | } 154 | } else { 155 | value = Value::String(unescaped_str); 156 | } 157 | } else if let Ok(n) = val_str.parse::() { 158 | value = Value::Number(n.into()); 159 | } else if let Ok(f) = val_str.parse::() { 160 | value = Value::Number( 161 | serde_json::Number::from_f64(f).unwrap_or_else(|| (0).into()) 162 | ); 163 | } else { 164 | warn!( 165 | "Unrecognized value format for column '{}' in table '{}': {}", 166 | col, 167 | table, 168 | val_str 169 | ); 170 | value = Value::String(val_str.to_string()); 171 | } 172 | obj.insert(col.clone(), value); 173 | } 174 | 175 | let id_key = obj 176 | .keys() 177 | .find(|k| k.eq_ignore_ascii_case("id")) 178 | .cloned(); 179 | if let Some(key) = id_key { 180 | obj.remove(&key); 181 | debug!("Removed 'id' field (key: {}) from SQLite record", key); 182 | } 183 | 184 | if obj.len() > 1 { 185 | let mut final_value = Value::Object(obj); 186 | clean_html_in_value(&mut final_value); 187 | records.push(final_value); 188 | } else { 189 | warn!( 190 | "Skipping SQLite record for table '{}', became empty after removing ID. 
Original values: '{}'", 191 | table, 192 | values_str 193 | ); 194 | } 195 | } 196 | 197 | if records.is_empty() { 198 | None 199 | } else { 200 | Some(records) 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/parser/parse_regex/surreal.rs: -------------------------------------------------------------------------------- 1 | use regex::Regex; 2 | use log::{ info, warn, debug }; 3 | use serde_json::Value; 4 | use crate::parser::parse_regex::clean_html_in_value; 5 | use crate::cli::Args; 6 | use crate::util::exclude::Excluder; 7 | 8 | pub fn parse_surreal(chunk: &str, args: &Args) -> Option> { 9 | info!("Using parse method: Surreal"); 10 | let mut records = Vec::new(); 11 | 12 | let excluder = if args.use_exclude { 13 | Some(Excluder::load("config/exclude.json")) 14 | } else { 15 | None 16 | }; 17 | 18 | let table_header_re = Regex::new(r"--\s*TABLE DATA:\s*([a-zA-Z0-9_]+)").ok()?; 19 | let insert_re = Regex::new(r"INSERT\s*\[(?s)(.*?)\]\s*;").ok()?; 20 | 21 | let mut inserts = Vec::new(); 22 | for insert_cap in insert_re.captures_iter(chunk) { 23 | if let Some(array_content) = insert_cap.get(1) { 24 | let array_text = array_content.as_str(); 25 | let full_match = insert_cap.get(0).unwrap().as_str(); 26 | inserts.push((full_match, array_text)); 27 | } 28 | } 29 | 30 | if inserts.is_empty() { 31 | warn!("No INSERT statements found in chunk"); 32 | return None; 33 | } 34 | 35 | let mut table_sections = Vec::new(); 36 | for table_cap in table_header_re.captures_iter(chunk) { 37 | if let Some(table_name) = table_cap.get(1) { 38 | let pos = table_cap.get(0).unwrap().start(); 39 | table_sections.push((table_name.as_str().to_string(), pos)); 40 | } 41 | } 42 | 43 | table_sections.sort_by_key(|&(_, pos)| pos); 44 | 45 | for (i, (insert_stmt, array_content)) in inserts.iter().enumerate() { 46 | let insert_pos = chunk.find(insert_stmt).unwrap_or(0); 47 | let mut table_name = "unknown_table".to_string(); 48 | for (t_name, t_pos) in &table_sections { 49 | if *t_pos < insert_pos { 50 | table_name = t_name.clone(); 51 | } else { 52 | break; 53 | } 54 | } 55 | 56 | if let Some(ref excl) = excluder { 57 | if excl.ignore_table(&table_name) { 58 | info!("Skipping excluded table: {}", table_name); 59 | continue; 60 | } 61 | } 62 | 63 | info!("Processing INSERT #{} for table: {}", i, table_name); 64 | debug!("Parsing data from table {}: {:.100}...", table_name, array_content); 65 | 66 | let object_re = Regex::new(r"\}\s*,\s*\{").unwrap(); 67 | let items: Vec = object_re 68 | .split(array_content) 69 | .map(|s| { 70 | let trimmed = s.trim(); 71 | let mut obj = trimmed.to_string(); 72 | if !obj.starts_with('{') { 73 | obj.insert(0, '{'); 74 | } 75 | if !obj.ends_with('}') { 76 | obj.push('}'); 77 | } 78 | obj 79 | }) 80 | .collect(); 81 | 82 | for item_str in items { 83 | if let Ok(mut obj) = serde_json::from_str::>(&item_str) { 84 | obj.insert("table".to_string(), Value::String(table_name.clone())); 85 | let mut value = Value::Object(obj); 86 | clean_html_in_value(&mut value); 87 | records.push(value); 88 | continue; 89 | } 90 | 91 | let mut record = serde_json::Map::new(); 92 | let kv_regex = Regex::new( 93 | r#"([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*("(?:\\.|[^"\\])*"|'[^']*'|\[.*?\]|\{.*?\}|[0-9.]+(?:f)?|true|false|null)"# 94 | ).unwrap(); 95 | 96 | for caps in kv_regex.captures_iter(&item_str) { 97 | let key = caps.get(1).unwrap().as_str(); 98 | let raw_val = caps.get(2).unwrap().as_str().trim(); 99 | 100 | let value = if raw_val.starts_with('[') && 
raw_val.ends_with(']') { 101 | serde_json 102 | ::from_str::<Value>(raw_val) 103 | .unwrap_or(Value::String(raw_val.to_string())) 104 | } else if raw_val.starts_with('{') && raw_val.ends_with('}') { 105 | serde_json 106 | ::from_str::<Value>(raw_val) 107 | .unwrap_or(Value::String(raw_val.to_string())) 108 | } else if raw_val.starts_with('\'') && raw_val.ends_with('\'') { 109 | Value::String(raw_val.trim_matches('\'').to_string()) 110 | } else if raw_val.starts_with('"') && raw_val.ends_with('"') { 111 | match serde_json::from_str::<String>(raw_val) { 112 | Ok(s) => Value::String(s), 113 | Err(_) => { 114 | let s = raw_val[1..raw_val.len()-1] 115 | .replace("\\\"", "\"") 116 | .replace("\\\\", "\\") 117 | .replace("\\n", "\n") 118 | .replace("\\r", "\r") 119 | .replace("\\t", "\t"); 120 | Value::String(s) 121 | } 122 | } 123 | } else if let Ok(n) = raw_val.trim_end_matches('f').parse::<f64>() { 124 | if n.fract() == 0.0 { 125 | Value::Number((n as i64).into()) 126 | } else { 127 | serde_json::Number 128 | ::from_f64(n) 129 | .map(Value::Number) 130 | .unwrap_or(Value::String(raw_val.to_string())) 131 | } 132 | } else if raw_val == "true" { 133 | Value::Bool(true) 134 | } else if raw_val == "false" { 135 | Value::Bool(false) 136 | } else if raw_val == "null" { 137 | Value::Null 138 | } else { 139 | Value::String(raw_val.to_string()) 140 | }; 141 | 142 | record.insert(key.to_string(), value); 143 | } 144 | 145 | record.insert("table".to_string(), Value::String(table_name.clone())); 146 | record.remove("id"); 147 | 148 | if record.len() > 1 { 149 | let mut value = Value::Object(record); 150 | clean_html_in_value(&mut value); 151 | records.push(value); 152 | } else { 153 | warn!("Regex fallback produced empty record for: {}", item_str); 154 | } 155 | } 156 | } 157 | 158 | if records.is_empty() { 159 | warn!("No records parsed from section"); 160 | None 161 | } else { 162 | 163 | info!("Successfully parsed {} records", records.len()); 164 | Some(records) 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/util/exclude.rs: -------------------------------------------------------------------------------- 1 | use serde::Deserialize; 2 | use serde_json::Value; 3 | use std::{collections::HashMap, fs, path::Path}; 4 | 5 | #[derive(Debug, Deserialize)] 6 | #[serde(default)] 7 | pub struct ExcludeEntry { 8 | pub table: String, 9 | pub ignore_table: bool, 10 | #[serde(default)] 11 | pub exclude_fields: HashMap<String, FieldExclude>, 12 | } 13 | 14 | impl Default for ExcludeEntry { 15 | fn default() -> Self { 16 | ExcludeEntry { 17 | table: String::new(), 18 | ignore_table: false, 19 | exclude_fields: HashMap::new(), 20 | } 21 | } 22 | } 23 | 24 | #[derive(Debug, Deserialize)] 25 | #[serde(untagged)] 26 | pub enum FieldExclude { 27 | All(bool), 28 | Sub(Vec<String>), 29 | } 30 | 31 | pub struct Excluder { 32 | entries: HashMap<String, ExcludeEntry>, 33 | } 34 | 35 | impl Excluder { 36 | pub fn load<P: AsRef<Path>>(path: P) -> Self { 37 | let data = fs::read_to_string(path).unwrap_or_else(|_| "[]".into()); 38 | let list: Vec<ExcludeEntry> = 39 | serde_json::from_str(&data).unwrap_or_else(|_| Vec::new()); 40 | let entries = list.into_iter() 41 | .map(|e| (e.table.clone(), e)) 42 | .collect(); 43 | Excluder { entries } 44 | } 45 | 46 | pub fn ignore_table(&self, table: &str) -> bool { 47 | self.entries 48 | .get(table) 49 | .map(|e| e.ignore_table) 50 | .unwrap_or(false) 51 | } 52 | 53 | 54 | pub fn filter_record(&self, record: &mut Value) { 55 | let table = match record.get("table").and_then(Value::as_str) { 56 | Some(t) => t, 57 | None => return, 58 | }; 59
| 60 | if let Some(entry) = self.entries.get(table) { 61 | if let Value::Object(map) = record { 62 | for (field, rule) in &entry.exclude_fields { 63 | match rule { 64 | FieldExclude::All(true) => { 65 | map.remove(field); 66 | } 67 | FieldExclude::Sub(keys) => { 68 | if let Some(Value::Object(sub_map)) = map.get_mut(field) { 69 | for k in keys { 70 | sub_map.remove(k); 71 | } 72 | } 73 | 74 | else if let Some(Value::String(obj_str)) = map.get_mut(field) { 75 | if obj_str.trim().starts_with('{') && obj_str.trim().ends_with('}') { 76 | for key in keys { 77 | let patterns = [ 78 | format!("{}:\\s*[^,}}]+,", regex::escape(key)), 79 | format!("{}:\\s*[^,}}]+}}", regex::escape(key)), 80 | format!("\"{}\":\\s*[^,}}]+,", regex::escape(key)), 81 | format!("'{}\':\\s*[^,}}]+,", regex::escape(key)), 82 | ]; 83 | 84 | for pattern in patterns { 85 | if let Ok(re) = regex::Regex::new(&pattern) { 86 | *obj_str = re.replace(obj_str, "").to_string(); 87 | } 88 | } 89 | 90 | if let Ok(re) = regex::Regex::new(r",\s*}") { 91 | *obj_str = re.replace(obj_str, "}").to_string(); 92 | } 93 | if let Ok(re) = regex::Regex::new(r",\s*,") { 94 | *obj_str = re.replace(obj_str, ",").to_string(); 95 | } 96 | } 97 | } 98 | } 99 | }, 100 | _ => {} 101 | } 102 | } 103 | } 104 | } 105 | } 106 | 107 | } -------------------------------------------------------------------------------- /src/util/handle_tei.rs: -------------------------------------------------------------------------------- 1 | use std::process::{ Child, Command, Stdio }; 2 | use log::{ info, error }; 3 | use std::{ error::Error as StdError, io::{ BufRead, BufReader, Write }, time::{ Duration, Instant }, thread }; 4 | use crate::{cli::Args, util::spinner::start_operation_animation}; 5 | use std::sync::atomic::Ordering; 6 | use std::sync::mpsc; 7 | 8 | pub struct ManagedProcess { 9 | child: Child, 10 | name: String, 11 | } 12 | 13 | impl ManagedProcess { 14 | pub fn new(child: Child, name: String) -> Self { 15 | info!("Started managed process '{}' (PID: {})", name, child.id()); 16 | Self { child, name } 17 | } 18 | 19 | 20 | pub fn id(&self) -> u32 { 21 | self.child.id() 22 | } 23 | 24 | pub fn kill(&mut self) -> Result<(), Box> { 25 | info!("Manually terminating process '{}' (PID: {})", self.name, self.child.id()); 26 | match self.child.kill() { 27 | Ok(_) => { 28 | info!("Successfully sent kill signal to process '{}'", self.name); 29 | Ok(()) 30 | } 31 | Err(e) => { 32 | let err = format!("Failed to kill process '{}': {}", self.name, e); 33 | error!("{}", err); 34 | Err(err.into()) 35 | } 36 | } 37 | } 38 | } 39 | 40 | impl Drop for ManagedProcess { 41 | fn drop(&mut self) { 42 | info!("Attempting to terminate managed process '{}' (PID: {})", self.name, self.child.id()); 43 | match self.child.kill() { 44 | Ok(_) => { 45 | info!("Successfully sent kill signal to process '{}'", self.name); 46 | } 47 | Err(e) => 48 | error!("Failed to kill process '{}' (PID: {}): {}", self.name, self.child.id(), e), 49 | } 50 | } 51 | } 52 | 53 | pub fn start_and_wait_for_tei( 54 | args: &Args 55 | ) -> Result<(ManagedProcess, String), Box> { 56 | 57 | println!("\n══════════════════════════════════════════════════════════════"); 58 | println!("🚀 Starting local TEI embedding server with model: {}", args.embedding_model); 59 | println!(" This process can take 3-20 minutes on first run for model download"); 60 | println!("══════════════════════════════════════════════════════════════\n"); 61 | 62 | let (animation, counter) = start_operation_animation("Initializing TEI server"); 63 | 
64 | let model_id = if args.embedding_model.is_empty() { 65 | animation.stop(); 66 | return Err("embedding_model must be specified when managing local TEI".into()); 67 | } else { 68 | &args.embedding_model 69 | }; 70 | 71 | let tei_binary = &args.tei_binary_path; 72 | 73 | 74 | info!("Starting TEI binary: '{}' with model '{}'", tei_binary, model_id); 75 | 76 | let mut command = Command::new(tei_binary); 77 | command 78 | .args(["--model-id", model_id, "--port", &args.tei_local_port.to_string(), "--auto-truncate"]) 79 | .env("RUST_LOG", "info") 80 | .stdout(Stdio::piped()) 81 | .stderr(Stdio::piped()); 82 | 83 | let mut child = match command.spawn() { 84 | Ok(child) => child, 85 | Err(e) => { 86 | animation.stop(); 87 | return Err(format!("Failed to spawn TEI binary '{}': {}", tei_binary, e).into()); 88 | } 89 | }; 90 | 91 | let stdout = match child.stdout.take() { 92 | Some(stdout) => stdout, 93 | None => { 94 | animation.stop(); 95 | return Err("Failed to capture TEI stdout".into()); 96 | } 97 | }; 98 | 99 | let stderr = match child.stderr.take() { 100 | Some(stderr) => stderr, 101 | None => { 102 | animation.stop(); 103 | return Err("Failed to capture TEI stderr".into()); 104 | } 105 | }; 106 | 107 | let process_name = format!("tei-server-{}", child.id()); 108 | let managed_process = ManagedProcess::new(child, process_name); 109 | 110 | println!("\nTEI Server Logs:"); 111 | println!("----------------"); 112 | 113 | let (tx, rx) = mpsc::channel(); 114 | let tx_stderr = tx.clone(); 115 | 116 | thread::spawn(move || { 117 | let reader = BufReader::new(stdout); 118 | for line in reader.lines() { 119 | if let Ok(line) = line { 120 | if let Err(_) = tx.send(line) { 121 | break; 122 | } 123 | } 124 | } 125 | }); 126 | 127 | thread::spawn(move || { 128 | let reader = BufReader::new(stderr); 129 | for line in reader.lines() { 130 | if let Ok(line) = line { 131 | if let Err(_) = tx_stderr.send(line) { 132 | break; 133 | } 134 | } 135 | } 136 | }); 137 | 138 | let start_time = Instant::now(); 139 | let mut ready = false; 140 | let mut log_buffer = Vec::new(); 141 | let timeout = Duration::from_secs(300); 142 | let deadline = start_time + timeout; 143 | 144 | while Instant::now() < deadline { 145 | match rx.recv_timeout(Duration::from_secs(1)) { 146 | Ok(line) => { 147 | log_buffer.push(line.clone()); 148 | println!(" TEI: {}", line); 149 | 150 | if line.contains("Starting download") { 151 | counter.store(20, Ordering::Relaxed); 152 | } else if line.contains("Model weights downloaded") { 153 | counter.store(40, Ordering::Relaxed); 154 | } else if line.contains("Starting model backend") { 155 | counter.store(60, Ordering::Relaxed); 156 | } else if line.contains("Warming up model") { 157 | counter.store(80, Ordering::Relaxed); 158 | } else if line.contains("Starting HTTP server") { 159 | counter.store(90, Ordering::Relaxed); 160 | } else if line.contains("Ready") { 161 | counter.store(100, Ordering::Relaxed); 162 | ready = true; 163 | break; 164 | } 165 | 166 | let _ = std::io::stdout().flush(); 167 | }, 168 | Err(mpsc::RecvTimeoutError::Timeout) => { 169 | continue; 170 | }, 171 | Err(mpsc::RecvTimeoutError::Disconnected) => { 172 | println!(" ⚠️ TEI process may have terminated unexpectedly"); 173 | break; 174 | } 175 | } 176 | } 177 | 178 | animation.stop(); 179 | 180 | if ready { 181 | println!("\n✅ TEI server ready in {:?}! 
Continuing with processing...\n", start_time.elapsed());
182 |         let tei_url = format!("http://localhost:{}", args.tei_local_port);
183 |         Ok((managed_process, tei_url))
184 |     } else if Instant::now() >= deadline {
185 |         println!("\n❌ Timeout waiting for TEI server to become ready");
186 | 
187 |         if !log_buffer.is_empty() {
188 |             let _ = std::fs::write("tei_timeout.log", log_buffer.join("\n"));
189 |             println!("TEI logs saved to 'tei_timeout.log'");
190 |         }
191 | 
192 |         Err("Timeout waiting for TEI server to become ready".into())
193 |     } else {
194 |         println!("\n❌ TEI server failed to start properly");
195 | 
196 |         if !log_buffer.is_empty() {
197 |             let _ = std::fs::write("tei_failure.log", log_buffer.join("\n"));
198 |             println!("TEI logs saved to 'tei_failure.log'");
199 |         }
200 | 
201 |         Err("TEI server failed to report ready".into())
202 |     }
203 | }
204 | 
--------------------------------------------------------------------------------
/src/util/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod utils;
2 | pub mod spinner;
3 | pub use utils::*;
4 | pub mod handle_tei;
5 | pub mod exclude;
6 | pub use handle_tei::ManagedProcess;
7 | pub use handle_tei::start_and_wait_for_tei;
8 | 
--------------------------------------------------------------------------------
/src/util/spinner.rs:
--------------------------------------------------------------------------------
1 | use std::io::{ stdout, Write };
2 | use std::sync::{ Arc, Mutex };
3 | use std::sync::atomic::{ AtomicUsize, Ordering };
4 | use std::thread::{ self, JoinHandle };
5 | use std::time::Duration;
6 | 
7 | pub struct AnimationHandle {
8 |     pub thread: JoinHandle<()>,
9 |     pub stop_flag: Arc<Mutex<bool>>,
10 | }
11 | 
12 | impl AnimationHandle {
13 |     pub fn stop(self) {
14 |         *self.stop_flag.lock().unwrap() = true;
15 |         if let Err(e) = self.thread.join() {
16 |             eprintln!("Failed to join animation thread: {:?}", e);
17 |         }
18 |     }
19 | }
20 | 
21 | pub fn start_spinner_animation(
22 |     counter: Arc<AtomicUsize>,
23 |     total: usize,
24 |     message: &str
25 | ) -> AnimationHandle {
26 |     let stop_flag = Arc::new(Mutex::new(false));
27 |     let stop_clone = stop_flag.clone();
28 |     let message = message.to_string();
29 | 
30 |     let thread = thread::spawn(move || {
31 |         let spinner_chars = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
32 |         let mut spinner_idx = 0;
33 | 
34 |         while !*stop_clone.lock().unwrap() {
35 |             let count = counter.load(Ordering::Relaxed);
36 |             spinner_idx = (spinner_idx + 1) % spinner_chars.len();
37 | 
38 |             print!(
39 |                 "\r{} {}... [{}/{}] ({}%)",
40 |                 spinner_chars[spinner_idx],
41 |                 message,
42 |                 count,
43 |                 total,
44 |                 (count * 100) / total.max(1)
45 |             );
46 | 
47 |             let _ = stdout().flush();
48 |             thread::sleep(Duration::from_millis(80));
49 |         }
50 | 
51 |         print!("\r{}\r", " ".repeat(80));
52 |         let _ = stdout().flush();
53 |     });
54 | 
55 |     AnimationHandle { thread, stop_flag }
56 | }
57 | 
58 | pub fn start_operation_animation(message: &str) -> (AnimationHandle, Arc<AtomicUsize>) {
59 |     let counter = Arc::new(AtomicUsize::new(0));
60 |     let total = 100;
61 |     let handle = start_spinner_animation(counter.clone(), total, message);
62 |     (handle, counter)
63 | }
64 | 
--------------------------------------------------------------------------------
/src/util/utils.rs:
--------------------------------------------------------------------------------
1 | use std::fs;
2 | use std::io::{ Cursor, Read, Result as IoResult };
3 | use std::path::Path;
4 | use encoding_rs::UTF_16LE;
5 | use encoding_rs_io::DecodeReaderBytesBuilder;
6 | use log::info;
7 | use crate::parser::detect_format;
8 | 
9 | 
10 | pub fn read_file_content<P: AsRef<Path>>(file_path: P) -> IoResult<String> {
11 |     info!("Reading file: {}", file_path.as_ref().display());
12 |     let raw = fs::read(&file_path)?;
13 |     if raw.starts_with(&[0xff, 0xfe]) {
14 |         let mut decoder = DecodeReaderBytesBuilder::new()
15 |             .encoding(Some(UTF_16LE))
16 |             .bom_override(true)
17 |             .build(Cursor::new(raw));
18 | 
19 |         let mut content = String::new();
20 |         decoder.read_to_string(&mut content)?;
21 |         Ok(content)
22 |     } else {
23 |         String::from_utf8(raw).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
24 |     }
25 | }
26 | 
27 | pub fn read_file_and_detect_format<P: AsRef<Path>>(file_path: P) -> IoResult<(String, String)> {
28 |     let content = read_file_content(&file_path)?;
29 | 
30 |     info!("Detecting format...");
31 | 
32 |     let file_path_str = file_path.as_ref().to_str().unwrap_or("unknown_path");
33 |     let format = detect_format(file_path_str, &content);
34 | 
35 |     info!("Detected format: {}", format);
36 |     info!("Processing {} format file: {}", format, file_path.as_ref().display());
37 | 
38 |     Ok((content, format))
39 | }
40 | 
41 | pub fn logo() {
42 |     println!(
43 |         r#"
44 |  ____  ____  ____ _    _ ____   ___ 
45 | (    \(  _ \(___ \/ )( \(  __) / __)
46 |  ) D ( ) _ ( / __/\ \/ / ) _) ( (__ 
47 | (____/(____/(____) \__/ (____) \___)
48 | "#
49 |     );
50 |     println!("Database to Vector Migration Tool\n");
51 | }
52 | 
53 | 
54 | pub fn init_thread_pool(num_threads: usize) {
55 |     let thread_count = if num_threads == 0 { num_cpus::get() } else { num_threads };
56 |     rayon::ThreadPoolBuilder::new().num_threads(thread_count).build_global().unwrap();
57 |     info!("Using {} threads for parallel processing", thread_count);
58 | }
59 | 
--------------------------------------------------------------------------------
/src/workflow.rs:
--------------------------------------------------------------------------------
1 | use crate::cli::Args;
2 | use crate::db::{ Database, DbError, store_in_batches };
3 | use crate::embedding::embeding::{ initialize_embedding_generator, process_records_with_embeddings };
4 | use crate::util::spinner::start_spinner_animation;
5 | use crate::util::handle_tei::{start_and_wait_for_tei, ManagedProcess};
6 | use log::{ info, warn, error };
7 | use serde_json::Value;
8 | use std::collections::HashMap;
9 | use std::sync::Arc;
10 | use std::sync::atomic::{ AtomicUsize, Ordering };
11 | use std::time::Instant;
12 | 
13 | pub struct MigrationStats {
14 |     pub total_records: usize,
15 |     pub processed_records: usize,
16 |     pub elapsed_seconds: f64,
17 | }
18 | 
19 | pub fn execute_migration_workflow(
20 |     records: Vec<Value>,
21 |     database: &dyn Database,
22 |     args: &Args,
23 | ) -> Result<MigrationStats, DbError> {
24 |     let total_records = records.len();
25 |     if total_records == 0 {
26 |         warn!("No records to process");
27 |         return Ok(MigrationStats {
28 |             total_records: 0,
29 |             processed_records: 0,
30 |             elapsed_seconds: 0.0,
31 |         });
32 |     }
33 | 
34 |     let mut tei_process: Option<ManagedProcess> = None;
35 |     let mut override_url: Option<String> = None;
36 | 
37 |     if args.embedding_provider == "tei" && args.embedding_url.is_none() {
38 |         let args = args.clone();
39 |         let (proc, url) = std::thread::spawn(move || start_and_wait_for_tei(&args))
40 |             .join()
41 |             .map_err(|e| format!("TEI thread panicked: {:?}", e))??;
42 |         tei_process = Some(proc);
43 |         override_url = Some(url);
44 |     }
45 | 
46 |     let generator = initialize_embedding_generator(args, override_url.as_deref())
47 |         .map_err(|e| DbError::from(format!("Init embed gen failed: {}", e)))?;
48 | 
49 |     let start_time = Instant::now();
50 |     let embedding_count = Arc::new(AtomicUsize::new(0));
51 |     let embedding_animation = start_spinner_animation(
52 |         embedding_count.clone(),
53 |         total_records,
54 |         "Generating embeddings"
55 |     );
56 | 
57 |     info!("Starting embedding generation for {} records", total_records);
58 | 
59 |     let prepared_records = match
60 |         process_records_with_embeddings(records, args, embedding_count.clone(), generator)
61 |     {
62 |         Ok(records) => records,
63 |         Err(e) => {
64 |             embedding_animation.stop();
65 |             error!("CRITICAL: Embedding generation failed: {}", e);
66 |             return Err(format!("Embedding generation critical error: {}", e).into());
67 |         }
68 |     };
69 | 
70 |     embedding_animation.stop();
71 | 
72 |     if prepared_records.is_empty() {
73 |         warn!("No records were prepared for storage after embedding process.");
74 |     } else {
75 |         println!("\nEmbedding generation complete! Storing data...");
76 | 
77 |         let mut grouped_records: HashMap<String, Vec<(String, Vec<f32>, Value)>> = HashMap::new();
78 |         for (table, id, vec, meta) in prepared_records {
79 |             grouped_records.entry(table).or_insert_with(Vec::new).push((id, vec, meta));
80 |         }
81 | 
82 |         let processed_count = Arc::new(AtomicUsize::new(0));
83 |         let storage_animation = start_spinner_animation(
84 |             processed_count.clone(),
85 |             total_records,
86 |             "Storing in database"
87 |         );
88 | 
89 |         let max_payload_bytes = args.max_payload_size_mb * 1024 * 1024;
90 |         let chunk_size = args.chunk_size;
91 | 
92 |         for (table, items) in grouped_records {
93 |             info!("Storing {} items for table '{}'", items.len(), table);
94 |             for batch in items.chunks(chunk_size) {
95 |                 match store_in_batches(database, &table, batch, max_payload_bytes) {
96 |                     Ok(_) => {
97 |                         let _ = processed_count.fetch_add(batch.len(), Ordering::Relaxed);
98 |                     }
99 |                     Err(e) => {
100 |                         storage_animation.stop();
101 |                         error!("CRITICAL: Database storage error for table '{}': {}", table, e);
102 |                         return Err(format!("Database storage error: {}", e).into());
103 |                     }
104 |                 }
105 |             }
106 |         }
107 |         storage_animation.stop();
108 |     }
109 | 
110 |     let elapsed_time = start_time.elapsed();
111 |     let final_count = embedding_count.load(Ordering::Relaxed);
112 | 
113 |     println!(
114 |         "\nFinished processing {} records in {:.2} seconds ({:.1} records/sec)",
115 |         final_count,
116 |         elapsed_time.as_secs_f64(),
117 |         if elapsed_time.as_secs_f64() > 0.0 {
118 |             (final_count as f64) / elapsed_time.as_secs_f64()
119 |         } else {
120 |             0.0
121 |         }
122 |     );
123 |     println!("Migration Complete.");
124 | 
125 |     if let Some(mut p) = tei_process {
126 |         let _ = p.kill();
127 |     }
128 | 
129 |     Ok(MigrationStats {
130 |         total_records,
131 |         processed_records: final_count,
132 |         elapsed_seconds: elapsed_time.as_secs_f64(),
133 |     })
134 | }
--------------------------------------------------------------------------------
/tei/tei-linux-x86:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/tei/tei-linux-x86
--------------------------------------------------------------------------------
/tei/tei-metal-mac-arm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/tei/tei-metal-mac-arm
--------------------------------------------------------------------------------
/tei/tei-onnx-mac-arm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DevsHero/db2vec/c2b2ce9818aa67acafe185895cb85939100bae27/tei/tei-onnx-mac-arm
--------------------------------------------------------------------------------
/tei_timeout.log:
--------------------------------------------------------------------------------
1 | 2025-05-07T19:34:32.481255Z  INFO text_embeddings_router: router/src/main.rs:189: Args { model_id: "Ali****-***/***-*****-*.**-*****uct", revision: None, tokenization_workers: None, dtype: None, pooling: None, max_concurrent_requests: 512, max_batch_tokens: 16384, max_batch_requests: None, max_client_batch_size: 32, auto_truncate: true, default_prompt_name: None, default_prompt: None, hf_api_token: None, hf_token: None, hostname: "0.0.0.0", port: 19998, uds_path: "/tmp/text-embeddings-inference-server", huggingface_hub_cache: None, payload_limit: 2000000, api_key: None, json_output: false, disable_spans: false, otlp_endpoint: None, otlp_service_name: "text-embeddings-inference.server", prometheus_port: 9000, cors_allow_origin: None }
2 | 2025-05-07T19:34:32.486659Z  INFO download_artifacts: text_embeddings_core::download: core/src/download.rs:20: Starting download
3 | 2025-05-07T19:34:32.486675Z  INFO download_artifacts:download_pool_config: text_embeddings_core::download: core/src/download.rs:53: Downloading `1_Pooling/config.json`
4 | 2025-05-07T19:34:32.487341Z  INFO download_artifacts:download_new_st_config: text_embeddings_core::download: core/src/download.rs:77: Downloading `config_sentence_transformers.json`
5 | 2025-05-07T19:34:32.487367Z  INFO download_artifacts: text_embeddings_core::download: core/src/download.rs:40: Downloading `config.json`
6 | 2025-05-07T19:34:32.487388Z  INFO download_artifacts: text_embeddings_core::download: core/src/download.rs:43: Downloading `tokenizer.json`
7 | 2025-05-07T19:34:32.487520Z  INFO download_artifacts: text_embeddings_core::download: core/src/download.rs:47: Model artifacts downloaded in 861.667µs
8 | 2025-05-07T19:34:32.585305Z  INFO text_embeddings_router: router/src/lib.rs:193: Maximum number of tokens per request: 32768
9 | 2025-05-07T19:34:32.585400Z  INFO text_embeddings_core::tokenization: core/src/tokenization.rs:38: Starting 14 tokenization workers
10 | 2025-05-07T19:34:32.671605Z  INFO text_embeddings_router: router/src/lib.rs:235: Starting model backend
11 | 2025-05-07T19:34:32.671903Z  INFO text_embeddings_backend: backends/src/lib.rs:534: Downloading `model.onnx`
12 | 2025-05-07T19:34:33.079916Z  WARN text_embeddings_backend: backends/src/lib.rs:538: Could not download `model.onnx`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/model.onnx)
13 | 2025-05-07T19:34:33.079938Z  INFO text_embeddings_backend: backends/src/lib.rs:539: Downloading `onnx/model.onnx`
14 | 2025-05-07T19:34:33.489241Z  WARN text_embeddings_backend: backends/src/lib.rs:543: Could not download `onnx/model.onnx`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/onnx/model.onnx)
15 | 2025-05-07T19:34:33.489254Z  INFO text_embeddings_backend: backends/src/lib.rs:548: Downloading `model.onnx_data`
16 | 2025-05-07T19:34:33.753610Z  WARN text_embeddings_backend: backends/src/lib.rs:552: Could not download `model.onnx_data`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/model.onnx_data)
17 | 2025-05-07T19:34:33.753637Z  INFO text_embeddings_backend: backends/src/lib.rs:553: Downloading `onnx/model.onnx_data`
18 | 2025-05-07T19:34:34.513003Z  WARN text_embeddings_backend: backends/src/lib.rs:557: Could not download `onnx/model.onnx_data`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/onnx/model.onnx_data)
19 | 2025-05-07T19:34:34.513013Z ERROR text_embeddings_backend: backends/src/lib.rs:346: Model ONNX files not found in the repository
20 | 2025-05-07T19:34:34.513294Z ERROR text_embeddings_backend: backends/src/lib.rs:358: Could not start ORT backend: Could not start backend: Pooling last_token is not supported for this backend. Use `candle` backend instead.
21 | 2025-05-07T19:34:34.513515Z  INFO text_embeddings_backend: backends/src/lib.rs:493: Downloading `model.safetensors`
22 | 2025-05-07T19:34:35.332343Z  WARN text_embeddings_backend: backends/src/lib.rs:496: Could not download `model.safetensors`: request error: HTTP status client error (404 Not Found) for url (https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/resolve/main/model.safetensors)
23 | 2025-05-07T19:34:35.332358Z  INFO text_embeddings_backend: backends/src/lib.rs:501: Downloading `model.safetensors.index.json`
24 | 2025-05-07T19:34:35.333710Z  INFO text_embeddings_backend: backends/src/lib.rs:523: Downloading `model-00002-of-00002.safetensors`
25 | 2025-05-07T19:36:48.542283Z  INFO text_embeddings_backend: backends/src/lib.rs:523: Downloading `model-00001-of-00002.safetensors`
--------------------------------------------------------------------------------
/vector-export-scripts/qdrant.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -eo pipefail
3 | 
4 | QDRANT_URL="http://localhost:6333"
5 | 
6 | # 1. Discover all collections
7 | collections=$(curl -s "${QDRANT_URL}/collections" \
8 |   -H "Content-Type: application/json" \
9 |   | jq -r '.result.collections[].name')
10 | 
11 | for col in $collections; do
12 |   echo "Exporting collection: $col"
13 | 
14 |   # 2. Create snapshot (synchronous)
15 |   resp=$(curl -s -X POST \
16 |     "${QDRANT_URL}/collections/${col}/snapshots" \
17 |     -H "Content-Type: application/json")
18 |   snap=$(jq -r '.result.name' <<<"$resp")
19 |   echo " Snapshot created: $snap"
20 | 
21 |   # 3. Download
22 |   curl -s "${QDRANT_URL}/collections/${col}/snapshots/${snap}" \
23 |     --output "${col}.snapshot"
24 |   echo " Saved ${col}.snapshot"
25 | done
26 | 
27 | echo "✅ All snapshots exported."
28 | 
--------------------------------------------------------------------------------
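
A minimal restore sketch (not a file in this repository): the `.snapshot` files written by vector-export-scripts/qdrant.sh can be loaded into another Qdrant instance through the snapshot upload endpoint. The target URL, the multipart `snapshot` field, and the `priority=snapshot` query parameter follow Qdrant's documented snapshot-recovery API; verify them against the Qdrant version you run, and adjust names if your collections differ from the exported file stems.

    # Assumed target instance; change to the Qdrant you are restoring into.
    TARGET_URL="http://localhost:6333"

    for snap in *.snapshot; do
      # Collection name = file stem, matching the naming used by the export script above.
      col="${snap%.snapshot}"
      curl -s -X POST \
        "${TARGET_URL}/collections/${col}/snapshots/upload?priority=snapshot" \
        -H "Content-Type: multipart/form-data" \
        -F "snapshot=@${snap}"
      echo "Restored ${col} from ${snap}"
    done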