├── .github └── workflows │ ├── claude.yml │ ├── publish.yml │ ├── pyright.yml │ ├── pytest.yml │ └── ruff.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CLAUDE.md ├── CONTRIBUTING.md ├── README.md ├── docs ├── api │ └── index.md ├── assets │ └── images │ │ ├── cluster-details.png │ │ ├── cluster-map.png │ │ ├── cluster-tree.png │ │ ├── conversation.png │ │ └── kura-architecture.png ├── blog │ ├── index.md │ └── posts │ │ ├── does-kura-work.md │ │ ├── kura-0-5-0-release.md │ │ └── new-documentation-release.md ├── core-concepts │ ├── clustering.md │ ├── conversations.md │ ├── dimensionality-reduction.md │ ├── embedding.md │ ├── meta-clustering.md │ ├── overview.md │ └── summarization.md ├── getting-started │ ├── configuration.md │ ├── installation.md │ └── quickstart.md └── index.md ├── kura.png ├── kura ├── __init__.py ├── base_classes │ ├── __init__.py │ ├── cluster.py │ ├── clustering_method.py │ ├── dimensionality.py │ ├── embedding.py │ ├── meta_cluster.py │ └── summarisation.py ├── cli │ ├── cli.py │ ├── server.py │ └── visualisation.py ├── clio_reference.md ├── cluster.py ├── dimensionality.py ├── embedding.py ├── k_means.py ├── kura.py ├── meta_cluster.py ├── static │ └── dist │ │ ├── assets │ │ ├── index-CvLvA1NY.css │ │ └── index-DztdrX1V.js │ │ ├── index.html │ │ └── vite.svg ├── summarisation.py ├── types │ ├── __init__.py │ ├── cluster.py │ ├── conversation.py │ ├── dimensionality.py │ └── summarisation.py ├── v1 │ ├── README.md │ ├── __init__.py │ ├── kura.py │ └── visualization.py └── visualization.py ├── mkdocs.yml ├── pyproject.toml ├── requirements.txt ├── scripts ├── README.md ├── build_docs.sh ├── test_sentence_transformer_real.py ├── tutorial_class_api.py └── tutorial_procedural_api.py ├── tests └── test_meta_cluster.py ├── ui ├── .gitignore ├── README.md ├── bun.lockb ├── components.json ├── eslint.config.js ├── index.html ├── package.json ├── public │ └── vite.svg ├── src │ ├── App.tsx │ ├── assets │ │ └── react.svg │ ├── components │ │ ├── cluster-details.tsx │ │ ├── cluster-map.tsx │ │ ├── cluster-tree.tsx │ │ ├── conversation-dialog.tsx │ │ ├── ui │ │ │ ├── button.tsx │ │ │ ├── card.tsx │ │ │ ├── dialog.tsx │ │ │ └── input.tsx │ │ └── upload-form.tsx │ ├── index.css │ ├── lib │ │ ├── parse.ts │ │ ├── tree.ts │ │ └── utils.ts │ ├── main.tsx │ ├── types │ │ ├── cluster.ts │ │ └── kura.ts │ └── vite-env.d.ts ├── tsconfig.app.json ├── tsconfig.json ├── tsconfig.node.json └── vite.config.ts └── uv.lock /.github/workflows/claude.yml: -------------------------------------------------------------------------------- 1 | name: Claude Code 2 | 3 | on: 4 | issue_comment: 5 | types: [created] 6 | pull_request_review_comment: 7 | types: [created] 8 | issues: 9 | types: [opened, assigned] 10 | pull_request_review: 11 | types: [submitted] 12 | 13 | jobs: 14 | claude: 15 | if: | 16 | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || 17 | (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || 18 | (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || 19 | (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) 20 | runs-on: ubuntu-latest 21 | permissions: 22 | contents: read 23 | pull-requests: read 24 | issues: read 25 | id-token: write 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v4 29 | with: 30 | fetch-depth: 1 31 | 32 | - name: Run Claude 
Code 33 | id: claude 34 | uses: anthropics/claude-code-action@beta 35 | with: 36 | anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} 37 | allowed_tools: "Edit,Replace,NotebookEditCell" 38 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPI when a Release is Created 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | pypi-publish: 9 | name: Publish release to PyPI 10 | runs-on: ubuntu-latest 11 | environment: 12 | name: pypi 13 | url: https://pypi.org/p/kura 14 | permissions: 15 | id-token: write 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Setup Bun 20 | uses: oven-sh/setup-bun@v1 21 | with: 22 | bun-version: latest 23 | 24 | - name: Build Web Assets 25 | working-directory: ui 26 | run: | 27 | bun install 28 | bun run build 29 | 30 | - name: Install uv 31 | uses: astral-sh/setup-uv@v5 32 | with: 33 | python-version: 3.9 34 | 35 | - name: Install the project 36 | run: uv sync --all-extras --dev 37 | 38 | - name: Build Package 39 | run: uv build 40 | 41 | - name: Build and publish Python package 42 | run: uv publish 43 | env: 44 | UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN }} 45 | -------------------------------------------------------------------------------- /.github/workflows/pyright.yml: -------------------------------------------------------------------------------- 1 | name: Pyright 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: [main] 8 | 9 | env: 10 | WORKING_DIRECTORY: "." 11 | PYRIGHT_OUTPUT_FILENAME: "pyright.log" 12 | 13 | jobs: 14 | Pyright: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest] 19 | python-version: ["3.9"] 20 | 21 | steps: 22 | - name: Checkout code 23 | uses: actions/checkout@v4 24 | - name: Install uv 25 | uses: astral-sh/setup-uv@v4 26 | with: 27 | enable-cache: true 28 | - name: Set up Python 29 | run: uv python install ${{ matrix.python-version }} 30 | - name: Install the project 31 | run: uv sync --all-extras --dev 32 | - name: Run pyright 33 | run: uv run pyright 34 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: Pytest 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | Pytest: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest] 15 | python-version: ["3.9"] 16 | env: 17 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 18 | GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} 19 | 20 | steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | - name: Install uv 24 | uses: astral-sh/setup-uv@v4 25 | with: 26 | enable-cache: true 27 | - name: Set up Python 28 | run: uv python install ${{ matrix.python-version }} 29 | - name: Install the project 30 | run: uv sync --all-extras --dev 31 | - name: Run pytest 32 | run: uv run pytest 33 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff 2 | 3 | on: 4 | pull_request: 5 | push: 6 | 7 | jobs: 8 | ruff: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | - uses: astral-sh/ruff-action@v3 13 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | .venv/ 5 | venv/ 6 | *.so 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | 24 | # Node.js 25 | node_modules/ 26 | 27 | # macOS 28 | .DS_Store 29 | 30 | # Environments 31 | .env 32 | .env.* 33 | !.env.example 34 | 35 | # IDE 36 | .idea/ 37 | .vscode/ 38 | *.swp 39 | *.swo 40 | 41 | # Logs 42 | logs/ 43 | *.log 44 | 45 | # Examples directory 46 | examples/ 47 | 48 | # Tutorial checkpoints 49 | tutorial_checkpoints/ 50 | tutorial_checkpoints_class/ 51 | site/ 52 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - id: check-merge-conflict 9 | - id: check-toml 10 | 11 | - repo: https://github.com/astral-sh/ruff-pre-commit 12 | rev: v0.11.11 13 | hooks: 14 | # Run the linter 15 | - id: ruff-check 16 | args: [--fix] 17 | # Run the formatter 18 | - id: ruff-format 19 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 4 | 5 | ## Commands 6 | 7 | ### Python Environment Setup 8 | 9 | ```bash 10 | # Create and activate a virtual environment 11 | python -m venv venv 12 | source venv/bin/activate # On Windows: venv\Scripts\activate 13 | 14 | # Install package in development mode with dev dependencies 15 | pip install -e ".[dev]" 16 | ``` 17 | 18 | ### Running Tests 19 | 20 | ```bash 21 | # Run all tests 22 | pytest 23 | 24 | # Run a specific test file 25 | pytest tests/test_meta_cluster.py 26 | 27 | # Run a specific test 28 | pytest tests/test_meta_cluster.py::test_cluster_label_exact_match 29 | ``` 30 | 31 | ### Type Checking 32 | 33 | ```bash 34 | # Run type checking 35 | pyright 36 | ``` 37 | 38 | ### Documentation 39 | 40 | ```bash 41 | # Install documentation dependencies 42 | pip install -e ".[docs]" 43 | 44 | # Serve documentation locally 45 | mkdocs serve 46 | ``` 47 | 48 | ### UI Development 49 | 50 | ```bash 51 | # Navigate to UI directory 52 | cd ui 53 | 54 | # Install dependencies 55 | npm install 56 | 57 | # Start development server 58 | npm run dev 59 | 60 | # Build for production 61 | npm run build 62 | 63 | # Lint code 64 | npm run lint 65 | ``` 66 | 67 | ### Running the Application 68 | 69 | ```bash 70 | # Start the Kura web server (implemented in kura/cli/cli.py and kura/cli/server.py) 71 | kura start-app 72 | 73 | # Start with a custom checkpoint directory 74 | kura start-app --dir ./my-checkpoints 75 | ``` 76 | 77 | ## Architecture Overview 78 | 79 | Kura is a tool for analyzing and visualizing chat data, built on the same ideas as Anthropic's CLIO. It uses machine learning techniques to understand user conversations by clustering them into meaningful groups. 
80 | 81 | ### Two API Approaches 82 | 83 | Kura offers two APIs for different use cases: 84 | 85 | 1. **Class-Based API** (`kura/kura.py`): The original API with a single `Kura` class that orchestrates the entire pipeline 86 | 2. **Procedural API** (`kura/v1/`): A functional approach with composable functions for maximum flexibility 87 | 88 | ### Core Components 89 | 90 | 1. **Summarisation Model** (`kura/summarisation.py`): Takes user conversations and summarizes them into task descriptions 91 | 2. **Embedding Model** (`kura/embedding.py`): Converts text into vector representations (embeddings) 92 | 3. **Clustering Model** (`kura/cluster.py`): Groups summaries into clusters based on embeddings 93 | 4. **Meta Clustering Model** (`kura/meta_cluster.py`): Further groups clusters into a hierarchical structure (Note: `max_clusters` parameter now lives here, not in the main Kura class) 94 | 5. **Dimensionality Reduction** (`kura/dimensionality.py`): Reduces high-dimensional embeddings for visualization 95 | 96 | ### Data Flow 97 | 98 | 1. Raw conversations are loaded 99 | 2. Conversations are summarized 100 | 3. Summaries are embedded and clustered 101 | 4. Base clusters are reduced to meta-clusters 102 | 5. Dimensionality reduction is applied for visualization 103 | 6. Results are saved as checkpoints for persistence 104 | 105 | ### Key Classes 106 | 107 | - `Kura` (`kura/kura.py`): Main class that orchestrates the entire pipeline 108 | - `BaseEmbeddingModel` / `OpenAIEmbeddingModel` (`kura/embedding.py`): Handle text embedding 109 | - `BaseSummaryModel` / `SummaryModel` (`kura/summarisation.py`): Summarize conversations 110 | - `BaseClusterModel` / `ClusterModel` (`kura/cluster.py`): Create initial clusters 111 | - `BaseMetaClusterModel` / `MetaClusterModel` (`kura/meta_cluster.py`): Reduce clusters into hierarchical groups 112 | - `BaseDimensionalityReduction` / `HDBUMAP` (`kura/dimensionality.py`): Reduce dimensions for visualization 113 | - `Conversation` (`kura/types/conversation.py`): Core data model for user conversations 114 | 115 | ### UI Components 116 | 117 | The project includes a React/TypeScript frontend for visualizing the clusters, with components for: 118 | - Displaying cluster maps (`ui/src/components/cluster-map.tsx`) 119 | - Showing cluster details (`ui/src/components/cluster-details.tsx`) 120 | - Visualizing cluster hierarchies (`ui/src/components/cluster-tree.tsx`) 121 | - Handling conversation uploads (`ui/src/components/upload-form.tsx`) 122 | - Displaying individual conversations (`ui/src/components/conversation-dialog.tsx`) 123 | 124 | ### Extensibility 125 | 126 | The system is designed to be modular, allowing custom implementations of: 127 | - Embedding models 128 | - Summarization models 129 | - Clustering algorithms 130 | - Dimensionality reduction techniques 131 | 132 | ## Working with Metadata 133 | 134 | Kura supports two types of metadata for enriching conversation analysis: 135 | 136 | ### 1. LLM Extractors 137 | Custom metadata can be extracted from conversations using LLM-powered extractors (implemented in `kura/summarisation.py`). 
These functions run on raw conversations to identify properties like: 138 | - Language detection 139 | - Sentiment analysis 140 | - Topic identification 141 | - Custom metrics 142 | 143 | Example of creating a custom extractor: 144 | ```python 145 | async def language_extractor( 146 | conversation: Conversation, 147 | sems: dict[str, asyncio.Semaphore], 148 | clients: dict[str, instructor.AsyncInstructor], 149 | ) -> ExtractedProperty: 150 | sem = sems.get("default") 151 | client = clients.get("default") 152 | 153 | async with sem: 154 | resp = await client.chat.completions.create( 155 | model="gemini-2.0-flash", 156 | messages=[ 157 | { 158 | "role": "system", 159 | "content": "Extract the language of this conversation.", 160 | }, 161 | { 162 | "role": "user", 163 | "content": "\n".join( 164 | [f"{msg.role}: {msg.content}" for msg in conversation.messages] 165 | ), 166 | }, 167 | ], 168 | response_model=Language, 169 | ) 170 | return ExtractedProperty( 171 | name="language_code", 172 | value=resp.language_code, 173 | ) 174 | ``` 175 | 176 | ### 2. Conversation Metadata 177 | Metadata can be directly attached to conversation objects when loading data (implemented in `kura/types/conversation.py`): 178 | ```python 179 | conversations = Conversation.from_hf_dataset( 180 | "allenai/WildChat-nontoxic", 181 | metadata_fn=lambda x: { 182 | "model": x["model"], 183 | "toxic": x["toxic"], 184 | "redacted": x["redacted"], 185 | }, 186 | ) 187 | ``` 188 | 189 | ## Loading Data 190 | 191 | Kura supports multiple data sources (implementations in `kura/types/conversation.py`): 192 | 193 | ### Claude Conversation History 194 | ```python 195 | from kura.types import Conversation 196 | conversations = Conversation.from_claude_conversation_dump("conversations.json") 197 | ``` 198 | 199 | ### Hugging Face Datasets 200 | ```python 201 | from kura.types import Conversation 202 | conversations = Conversation.from_hf_dataset( 203 | "ivanleomk/synthetic-gemini-conversations", 204 | split="train" 205 | ) 206 | ``` 207 | 208 | ### Custom Conversations 209 | For custom data formats, create Conversation objects directly: 210 | ```python 211 | from kura.types import Conversation, Message 212 | from datetime import datetime 213 | from uuid import uuid4 214 | 215 | conversations = [ 216 | Conversation( 217 | messages=[ 218 | Message( 219 | created_at=str(datetime.now()), 220 | role=message["role"], 221 | content=message["content"], 222 | ) 223 | for message in raw_messages 224 | ], 225 | id=str(uuid4()), 226 | created_at=datetime.now(), 227 | ) 228 | ] 229 | ``` 230 | 231 | ## Checkpoints 232 | 233 | Kura uses checkpoint files to save state between runs (checkpoint handling in `kura/kura.py`): 234 | - `conversations.json`: Raw conversation data 235 | - `summaries.jsonl`: Summarized conversations 236 | - `clusters.jsonl`: Base cluster data 237 | - `meta_clusters.jsonl`: Hierarchical cluster data 238 | - `dimensionality.jsonl`: Projected cluster data for visualization 239 | 240 | Checkpoints are stored in the directory specified by the `checkpoint_dir` parameter (default: `./checkpoints`). 
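A minimal sketch of inspecting a saved checkpoint outside the pipeline (this assumes `clusters.jsonl` is present in the checkpoint directory and that `Cluster` can be imported from `kura.types`, mirroring `kura/types/cluster.py`):

```python
import json
from pathlib import Path

from kura.types import Cluster  # assumed import path, mirroring kura/types/cluster.py

checkpoint_dir = Path("./checkpoints")

# Each checkpoint line is a JSON object; validate it against the Cluster model
clusters = [
    Cluster(**json.loads(line))
    for line in (checkpoint_dir / "clusters.jsonl").read_text().splitlines()
    if line.strip()
]

for cluster in clusters[:5]:
    print(cluster.name, len(cluster.chat_ids))
```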
241 | 242 | ## Visualization 243 | 244 | Kura includes visualization tools: 245 | 246 | ### CLI Visualization 247 | ```python 248 | # Tree visualization implemented in kura/kura.py 249 | kura.visualise_clusters() 250 | ``` 251 | 252 | ### Web Server 253 | ```bash 254 | # Web server implemented in kura/cli/server.py 255 | kura start-app 256 | # Access at http://localhost:8000 257 | ``` 258 | 259 | The web interface provides: 260 | - Interactive cluster map 261 | - Cluster hierarchy tree 262 | - Cluster details panel 263 | - Conversation preview 264 | - Metadata filtering 265 | 266 | ## Procedural API (v1) 267 | 268 | The procedural API in `kura/v1/` provides a functional approach to the pipeline: 269 | 270 | ### Key Functions 271 | - `summarise_conversations(conversations, *, model, checkpoint_manager=None)` - Generate summaries 272 | - `generate_base_clusters_from_conversation_summaries(summaries, *, model, checkpoint_manager=None)` - Create initial clusters 273 | - `reduce_clusters_from_base_clusters(clusters, *, model, checkpoint_manager=None)` - Build hierarchy 274 | - `reduce_dimensionality_from_clusters(clusters, *, model, checkpoint_manager=None)` - Project to 2D 275 | 276 | ### Example Usage 277 | ```python 278 | from kura import ( 279 | summarise_conversations, 280 | generate_base_clusters_from_conversation_summaries, 281 | reduce_clusters_from_base_clusters, 282 | reduce_dimensionality_from_clusters, 283 | CheckpointManager 284 | ) 285 | 286 | # Run pipeline with explicit steps 287 | checkpoint_mgr = CheckpointManager("./checkpoints", enabled=True) 288 | 289 | summaries = await summarise_conversations( 290 | conversations, 291 | model=summary_model, 292 | checkpoint_manager=checkpoint_mgr 293 | ) 294 | 295 | clusters = await generate_base_clusters_from_conversation_summaries( 296 | summaries, 297 | model=cluster_model, 298 | checkpoint_manager=checkpoint_mgr 299 | ) 300 | # ... continue with remaining steps 301 | ``` 302 | 303 | ### Benefits 304 | - Fine-grained control over each step 305 | - Easy to skip or reorder steps 306 | - Support for heterogeneous models (OpenAI, vLLM, Hugging Face, etc.) 307 | - Functional programming style with no hidden state 308 | - All functions use keyword-only arguments for clarity -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Kura 2 | 3 | Thank you for your interest in contributing to Kura! This document provides guidelines and information to help you contribute effectively. 4 | 5 | ## Setting Up the Development Environment 6 | 7 | 1. Create and activate a virtual environment using uv: 8 | ```bash 9 | uv venv 10 | source .venv/bin/activate # On Windows: .venv\Scripts\activate 11 | ``` 12 | 13 | 2. Install the package in development mode with dev dependencies: 14 | ```bash 15 | uv pip install -e ".[dev]" 16 | ``` 17 | 18 | ## Testing 19 | 20 | Kura uses pytest for testing. The current test suite primarily focuses on the meta-clustering functionality. 21 | 22 | ### Quick Start Testing 23 | 24 | To quickly test the full Kura pipeline and UI: 25 | 26 | 1. 
**Run the tutorial test** to generate sample data: 27 | 28 | For the procedural API (recommended for understanding the pipeline): 29 | ```bash 30 | uv run python scripts/tutorial_procedural_api.py 31 | ``` 32 | 33 | For the class-based API (simpler to use): 34 | ```bash 35 | uv run python scripts/tutorial_class_api.py 36 | ``` 37 | 38 | Either tutorial will: 39 | - Import all Kura modules 40 | - Load 190 sample conversations from Hugging Face 41 | - Process and cluster the conversations 42 | - Generate 29 hierarchical clusters organized into 10 root categories 43 | - Generate visualization data 44 | - Save results to `./tutorial_checkpoints/` (procedural) or `./tutorial_checkpoints_class/` (class-based) 45 | 46 | Note: This process may take a few minutes depending on your system and API rate limits. 47 | 48 | 2. **Test the UI** after running the tutorial: 49 | 50 | For procedural API results: 51 | ```bash 52 | kura start-app --dir ./tutorial_checkpoints 53 | ``` 54 | 55 | For class-based API results: 56 | ```bash 57 | kura start-app --dir ./tutorial_checkpoints_class 58 | ``` 59 | 60 | This will: 61 | - Start the backend API and frontend on http://localhost:8000 62 | - Use the data from the checkpoint directory generated by the tutorial test 63 | - Display the cluster map, tree view, and detailed cluster information 64 | 65 | Note: The UI may take a moment to fully load as it processes the cluster data. 66 | 67 | ### Running Unit Tests 68 | 69 | ```bash 70 | # Run all tests 71 | pytest 72 | 73 | # Run a specific test file 74 | pytest tests/test_meta_cluster.py 75 | 76 | # Run a specific test 77 | pytest tests/test_meta_cluster.py::test_cluster_label_exact_match 78 | ``` 79 | 80 | ### Test Structure 81 | 82 | Tests are located in the `tests/` directory. The current tests verify: 83 | 84 | - **Exact match functionality**: Tests that `ClusterLabel` correctly validates when there's an exact match between input and candidate clusters. 85 | - **Fuzzy matching**: Tests that similar but not identical strings can be matched using fuzzy matching with an appropriate threshold. 86 | - **Validation errors**: Tests that the system properly rejects inputs that don't match any candidates. 87 | 88 | ### Writing New Tests 89 | 90 | When adding new features or fixing bugs, please include appropriate tests. Follow these guidelines: 91 | 92 | 1. Create test files with the `test_` prefix 93 | 2. Write test functions with descriptive names and docstrings 94 | 3. Use pytest fixtures when appropriate 95 | 4. Use assertions to verify expected behavior 96 | 5. 
Test both the class-based API and procedural API where applicable 97 | 98 | ### Example: Testing with the Procedural API 99 | 100 | ```python 101 | import pytest 102 | import asyncio 103 | from kura.v1 import ( 104 | summarise_conversations, 105 | generate_base_clusters_from_conversation_summaries, 106 | CheckpointManager 107 | ) 108 | from kura.summarisation import SummaryModel 109 | from kura.cluster import ClusterModel 110 | from kura.types import Conversation 111 | 112 | @pytest.mark.asyncio 113 | async def test_procedural_pipeline(): 114 | # Load test conversations 115 | conversations = Conversation.from_hf_dataset( 116 | "ivanleomk/synthetic-gemini-conversations", 117 | split="train[:10]" # Use only 10 for testing 118 | ) 119 | 120 | # Initialize models 121 | summary_model = SummaryModel() 122 | cluster_model = ClusterModel() 123 | 124 | # Run pipeline steps 125 | summaries = await summarise_conversations( 126 | conversations, 127 | model=summary_model, 128 | checkpoint_manager=None # No checkpointing for tests 129 | ) 130 | 131 | clusters = await generate_base_clusters_from_conversation_summaries( 132 | summaries, 133 | model=cluster_model, 134 | checkpoint_manager=None 135 | ) 136 | 137 | # Assertions 138 | assert len(summaries) == 10 139 | assert len(clusters) > 0 140 | assert all(cluster.label for cluster in clusters) 141 | ``` 142 | 143 | ## Type Checking 144 | 145 | Kura uses pyright for type checking: 146 | 147 | ```bash 148 | pyright 149 | ``` 150 | 151 | ## Documentation 152 | 153 | To work on documentation: 154 | 155 | 1. Install documentation dependencies: 156 | ```bash 157 | uv pip install -e ".[docs]" 158 | ``` 159 | 160 | 2. Serve documentation locally: 161 | ```bash 162 | mkdocs serve 163 | ``` 164 | 165 | ## Code Style 166 | 167 | - Follow PEP 8 guidelines for Python code 168 | - Use type hints for all function parameters and return values 169 | - Write docstrings for all public classes and functions 170 | 171 | ## Pull Request Process 172 | 173 | 1. Fork the repository 174 | 2. Create a feature branch 175 | 3. Add tests for your changes 176 | 4. Ensure all tests pass 177 | 5. Update documentation as needed 178 | 6. Submit a pull request 179 | 180 | ## UI Development 181 | 182 | If you're working on the UI: 183 | 184 | ```bash 185 | # Navigate to UI directory 186 | cd ui 187 | 188 | # Install dependencies 189 | npm install 190 | 191 | # Start development server 192 | npm run dev 193 | 194 | # Build for production 195 | npm run build 196 | 197 | # Lint code 198 | npm run lint 199 | ``` 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kura: Procedural API for Chat Data Analysis 2 | 3 |  4 | 5 | [](https://pypi.org/project/kura/) 6 | [](https://github.com/567-labs/kura/stargazers) 7 | [](https://567-labs.github.io/kura/) 8 | [](https://opensource.org/licenses/MIT) 9 | [](https://pypi.org/project/kura/) 10 | [](https://pypi.org/project/kura/) 11 | 12 | Kura is an open-source library for understanding chat data through machine learning, inspired by [Anthropic's CLIO](https://www.anthropic.com/research/clio). It provides a functional, composable API for clustering conversations to discover patterns and insights. 13 | 14 | ## Why Analyze Conversation Data? 15 | 16 | As AI assistants and chatbots become increasingly central to product experiences, understanding how users interact with these systems at scale becomes a critical challenge. 
Manually reviewing thousands of conversations is impractical, yet crucial patterns and user needs often remain hidden in this data. 17 | 18 | Kura addresses this challenge by: 19 | 20 | - **Revealing user intent patterns** that may not be obvious from individual conversations 21 | - **Identifying common user needs** to prioritize feature development 22 | - **Discovering edge cases and failures** that require attention 23 | - **Tracking usage trends** over time as your product evolves 24 | - **Informing prompt engineering** by highlighting successful and problematic interactions 25 | 26 | By clustering similar conversations and providing intuitive visualizations, Kura transforms raw chat data into actionable insights without compromising user privacy. 27 | 28 | ## Installation 29 | 30 | ```bash 31 | uv pip install kura 32 | ``` 33 | 34 | ## Quick Start 35 | 36 | ```python 37 | import asyncio 38 | from rich.console import Console 39 | from kura import ( 40 | summarise_conversations, 41 | generate_base_clusters_from_conversation_summaries, 42 | reduce_clusters_from_base_clusters, 43 | reduce_dimensionality_from_clusters, 44 | CheckpointManager, 45 | ) 46 | from kura.visualization import visualise_pipeline_results 47 | from kura.types import Conversation 48 | from kura.summarisation import SummaryModel 49 | from kura.cluster import ClusterModel 50 | from kura.meta_cluster import MetaClusterModel 51 | from kura.dimensionality import HDBUMAP 52 | 53 | async def main(): 54 | # Initialize models 55 | console = Console() 56 | summary_model = SummaryModel(console=console) 57 | cluster_model = ClusterModel(console=console) 58 | meta_cluster_model = MetaClusterModel(console=console) 59 | dimensionality_model = HDBUMAP() 60 | 61 | # Set up checkpointing to save intermediate results 62 | checkpoint_manager = CheckpointManager("./checkpoints", enabled=True) 63 | 64 | # Load conversations from Hugging Face dataset 65 | conversations = Conversation.from_hf_dataset( 66 | "ivanleomk/synthetic-gemini-conversations", 67 | split="train" 68 | ) 69 | 70 | # Process through the pipeline step by step 71 | summaries = await summarise_conversations( 72 | conversations, 73 | model=summary_model, 74 | checkpoint_manager=checkpoint_manager 75 | ) 76 | 77 | clusters = await generate_base_clusters_from_conversation_summaries( 78 | summaries, 79 | model=cluster_model, 80 | checkpoint_manager=checkpoint_manager 81 | ) 82 | 83 | reduced_clusters = await reduce_clusters_from_base_clusters( 84 | clusters, 85 | model=meta_cluster_model, 86 | checkpoint_manager=checkpoint_manager 87 | ) 88 | 89 | projected_clusters = await reduce_dimensionality_from_clusters( 90 | reduced_clusters, 91 | model=dimensionality_model, 92 | checkpoint_manager=checkpoint_manager, 93 | ) 94 | 95 | # Visualize results 96 | visualise_pipeline_results(reduced_clusters, style="enhanced") 97 | 98 | print(f"\nProcessed {len(conversations)} conversations") 99 | print(f"Created {len(reduced_clusters)} meta clusters") 100 | print(f"Checkpoints saved to: {checkpoint_manager.checkpoint_dir}") 101 | 102 | if __name__ == "__main__": 103 | asyncio.run(main()) 104 | ``` 105 | 106 | This example will: 107 | 108 | 1. Load 190 synthetic programming conversations from Hugging Face 109 | 2. Process them through the complete analysis pipeline step by step 110 | 3. Generate hierarchical clusters organized into categories 111 | 4. 
Display the results with enhanced visualization 112 | 113 | ## Key Design Principles 114 | 115 | Kura follows a function-based architecture where pipeline functions orchestrate the execution while models handle the core logic. Each function is designed with explicit inputs/outputs and no hidden state, working with any model that implements the required interface. The system supports various model types through polymorphic interfaces - from OpenAI to local models for summarization, different clustering algorithms, and various dimensionality reduction techniques. 116 | 117 | Data can be loaded from multiple sources including Claude conversation history (`Conversation.from_claude_conversation_dump()`) and Hugging Face datasets (`Conversation.from_hf_dataset()`). The example uses a dataset of 190 synthetic programming conversations that form natural clusters across technical topics. 118 | 119 | The pipeline architecture processes data through sequential stages: loading, summarization, embedding, base clustering, meta-clustering, and dimensionality reduction. All progress is automatically saved using checkpoints, and the system can be extended by implementing custom versions of any component model. 120 | 121 | ## Documentation 122 | 123 | - **Getting Started** 124 | 125 | - [Installation Guide](https://567-labs.github.io/kura/getting-started/installation/) 126 | - [Quickstart Guide](https://567-labs.github.io/kura/getting-started/quickstart/) 127 | 128 | - **Core Concepts** 129 | 130 | - [Conversations](https://567-labs.github.io/kura/core-concepts/conversations/) 131 | - [Embedding](https://567-labs.github.io/kura/core-concepts/embedding/) 132 | - [Clustering](https://567-labs.github.io/kura/core-concepts/clustering/) 133 | - [Summarization](https://567-labs.github.io/kura/core-concepts/summarization/) 134 | - [Meta-Clustering](https://567-labs.github.io/kura/core-concepts/meta-clustering/) 135 | - [Dimensionality Reduction](https://567-labs.github.io/kura/core-concepts/dimensionality-reduction/) 136 | 137 | - **API Reference** 138 | - [Procedural API Documentation](https://567-labs.github.io/kura/api/) 139 | 140 | ## Comparison with Similar Tools 141 | 142 | | Feature | Kura | Traditional Analytics | Manual Review | Generic Clustering | 143 | | ---------------------- | ------------------------------------- | ------------------------------ | ---------------------- | ------------------------ | 144 | | Semantic Understanding | ✅ Uses LLMs for deep understanding | ❌ Limited to keywords | ✅ Human understanding | ⚠️ Basic similarity only | 145 | | Scalability | ✅ Handles thousands of conversations | ✅ Highly scalable | ❌ Time intensive | ✅ Works at scale | 146 | | Visualization | ✅ Interactive UI | ⚠️ Basic charts | ❌ Manual effort | ⚠️ Generic plots | 147 | | Hierarchy Discovery | ✅ Meta-clustering feature | ❌ Flat categories | ⚠️ Subjective grouping | ❌ Typically flat | 148 | | Extensibility | ✅ Custom models and extractors | ⚠️ Limited customization | ✅ Flexible but manual | ⚠️ Some algorithms | 149 | | Privacy | ✅ Self-hosted option | ⚠️ Often requires data sharing | ✅ Can be private | ✅ Can be private | 150 | 151 | ## Future Roadmap 152 | 153 | Kura is actively evolving with plans to add: 154 | 155 | - **Enhanced Topic Modeling**: More sophisticated detection of themes across conversations 156 | - **Temporal Analysis**: Tracking how conversation patterns evolve over time 157 | - **Advanced Visualizations**: Additional visual representations of conversation data 158 | - **Data Connectors**: 
More integrations with popular conversation data sources 159 | - **Multi-modal Support**: Analysis of conversations that include images and other media 160 | - **Export Capabilities**: Enhanced formats for sharing and presenting findings 161 | 162 | ## Testing 163 | 164 | To quickly test Kura and see it in action: 165 | 166 | ```bash 167 | uv run python scripts/tutorial_procedural_api.py 168 | ``` 169 | 170 | Expected output: 171 | 172 | ```text 173 | Loaded 190 conversations successfully! 174 | 175 | ============================================================ 176 | Conversation Processing 177 | ============================================================ 178 | 179 | Starting conversation clustering... 180 | Step 1: Generating conversation summaries... 181 | Generated 190 summaries 182 | Step 2: Generating base clusters from summaries... 183 | Generated 19 base clusters 184 | Step 3: Reducing clusters hierarchically... 185 | Reduced to 29 meta clusters 186 | Step 4: Projecting clusters to 2D for visualization... 187 | Generated 29 projected clusters 188 | 189 | Pipeline complete! Generated 29 projected clusters! 190 | 191 | Processing Summary: 192 | • Input conversations: 190 193 | • Final reduced clusters: 29 194 | • Final projected clusters: 29 195 | • Checkpoints saved to: ./tutorial_checkpoints 196 | ``` 197 | 198 | This will: 199 | 200 | - Load 190 sample conversations from Hugging Face 201 | - Process them through the complete pipeline 202 | - Generate 29 hierarchical clusters organized into 10 root categories 203 | - Save checkpoints to `./tutorial_checkpoints` 204 | 205 | ## Development 206 | 207 | See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, testing, and contribution guidelines. 208 | 209 | ## License 210 | 211 | [MIT License](LICENSE) 212 | 213 | ## About 214 | 215 | Kura is under active development. If you face any issues or have suggestions, please feel free to [open an issue](https://github.com/567-labs/kura/issues) or a PR. For more details on the technical implementation, check out this [walkthrough of the code](https://ivanleo.com/blog/understanding-user-conversations). 216 | -------------------------------------------------------------------------------- /docs/api/index.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | This section provides detailed API reference documentation for the Kura package, automatically generated from the source code using mkdocstrings. 4 | 5 | ## How to Use This Reference 6 | 7 | The API reference is organized by module, with each module containing related classes and functions. For each class, you'll find: 8 | 9 | - Constructor parameters and their descriptions 10 | - Instance methods with parameter details and return types 11 | - Properties and attributes 12 | 13 | To use these classes in your code, import them from their respective modules: 14 | 15 | ```python 16 | from kura import Kura 17 | from kura.embedding import OpenAIEmbeddingModel 18 | from kura.summarisation import SummaryModel 19 | # And so on... 20 | ``` 21 | 22 | ## Core Classes 23 | 24 | ## Procedural API 25 | 26 | The procedural API provides a functional approach to conversation analysis with composable pipeline functions. 
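Before the per-function reference below, here is a minimal sketch of how the first two pipeline functions compose; the model classes stand in for any implementation of the corresponding base interfaces:

```python
from kura import (
    summarise_conversations,
    generate_base_clusters_from_conversation_summaries,
    CheckpointManager,
)
from kura.summarisation import SummaryModel
from kura.cluster import ClusterModel
from kura.types import Conversation


async def run_pipeline(conversations: list[Conversation]):
    checkpoints = CheckpointManager("./checkpoints", enabled=True)

    # Step 1: summarize each conversation
    summaries = await summarise_conversations(
        conversations, model=SummaryModel(), checkpoint_manager=checkpoints
    )

    # Step 2: group summaries into base clusters
    return await generate_base_clusters_from_conversation_summaries(
        summaries, model=ClusterModel(), checkpoint_manager=checkpoints
    )
```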
27 | 28 | ### Pipeline Functions 29 | 30 | ::: kura.summarise_conversations 31 | 32 | ::: kura.generate_base_clusters_from_conversation_summaries 33 | 34 | ::: kura.reduce_clusters_from_base_clusters 35 | 36 | ::: kura.reduce_dimensionality_from_clusters 37 | 38 | ### Checkpoint Management 39 | 40 | ::: kura.CheckpointManager 41 | 42 | ## Implementation Classes 43 | 44 | ### Embedding Models 45 | 46 | ::: kura.embedding 47 | 48 | ### Summarization 49 | 50 | ::: kura.summarisation 51 | 52 | ### Clustering 53 | 54 | ::: kura.cluster 55 | 56 | ### Meta-Clustering 57 | 58 | ::: kura.meta_cluster 59 | 60 | ### Dimensionality Reduction 61 | 62 | ::: kura.dimensionality 63 | -------------------------------------------------------------------------------- /docs/assets/images/cluster-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/cluster-details.png -------------------------------------------------------------------------------- /docs/assets/images/cluster-map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/cluster-map.png -------------------------------------------------------------------------------- /docs/assets/images/cluster-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/cluster-tree.png -------------------------------------------------------------------------------- /docs/assets/images/conversation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/conversation.png -------------------------------------------------------------------------------- /docs/assets/images/kura-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/kura-architecture.png -------------------------------------------------------------------------------- /docs/blog/index.md: -------------------------------------------------------------------------------- 1 | Here are articles that we've written to show you how to work with Kura. 2 | -------------------------------------------------------------------------------- /docs/blog/posts/kura-0-5-0-release.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kura v0.5.0 Released - Procedural API, Better Docs & More 3 | date: 2025-05-29 4 | categories: 5 | - Kura 6 | - Release 7 | --- 8 | 9 | # Kura v0.5.0 Released 10 | 11 | We're excited to announce the release of Kura v0.5.0! This release brings significant improvements to documentation, introduces a new procedural API for maximum flexibility, and includes numerous enhancements to make Kura even better for analyzing conversation data. 
12 | 13 | ## What's New in v0.5.0 14 | 15 | ### New Procedural API (v1) 16 | 17 | The headline feature of this release is the introduction of a functional, procedural API that gives you fine-grained control over the analysis pipeline: 18 | 19 | ```python 20 | from kura import ( 21 | summarise_conversations, 22 | generate_base_clusters_from_conversation_summaries, 23 | reduce_clusters_from_base_clusters, 24 | reduce_dimensionality_from_clusters, 25 | ) 26 | 27 | # Run each step independently 28 | summaries = await summarise_conversations(conversations, model=summary_model) 29 | clusters = await generate_base_clusters_from_conversation_summaries(summaries, model=cluster_model) 30 | meta_clusters = await reduce_clusters_from_base_clusters(clusters, model=meta_cluster_model) 31 | projected = await reduce_dimensionality_from_clusters(meta_clusters, model=dim_reduction_model) 32 | ``` 33 | 34 | This new API offers: 35 | - Complete control over each pipeline step 36 | - Easy integration with heterogeneous models (OpenAI, vLLM, Hugging Face) 37 | - Functional programming style with no hidden state 38 | - Keyword-only arguments for clarity 39 | 40 | 41 | 42 | 43 | ### Enhanced Documentation 44 | 45 | We've made major improvements to our documentation: 46 | 47 | - **API Reference**: Now generated with mkdocstrings for always up-to-date documentation 48 | - **CLAUDE.md**: Repository guidance for AI assistants working with the codebase 49 | - **CONTRIBUTING.md**: Clear guidelines for contributors with testing and UV setup 50 | - **Better Examples**: Added context about real datasets like the ivanleomk dataset 51 | 52 | ### Technical Improvements 53 | 54 | #### Refactored Architecture 55 | - Extracted visualization logic into separate modules for better maintainability 56 | - Moved `max_clusters` parameter from Kura to MetaClusterModel where it belongs 57 | - Implemented lazy imports for UMap to improve startup time 58 | - Simplified embedding extensibility by replacing `embed_text()` with `__repr__()` 59 | 60 | #### Enhanced Cluster Visualization 61 | - Added slug field to cluster models for better identification 62 | - Improved cluster visualization with more meaningful labels 63 | - Better support for cluster hierarchies in the UI 64 | 65 | #### Developer Experience 66 | - Added Ruff workflows and pre-commit hooks for consistent code quality 67 | - Fixed numerous type checking bugs 68 | - Improved Summary class implementation 69 | - Better error messages and debugging support 70 | 71 | ## Breaking Changes 72 | 73 | While we've tried to maintain backward compatibility, please note: 74 | - The `max_clusters` parameter has moved from the main Kura class to MetaClusterModel 75 | - Some internal APIs have been refactored for the new procedural approach 76 | 77 | ## What's Next 78 | 79 | We're already working on the next release with plans for: 80 | 81 | - More embedding model integrations 82 | - Enhanced meta-clustering algorithms 83 | - Performance optimizations for large datasets 84 | - Additional visualization options 85 | 86 | ## Feedback Welcome! 87 | 88 | We'd love to hear your thoughts on this release. 
Please: 89 | 90 | - Report issues on [GitHub](https://github.com/567-labs/kura/issues) 91 | - Join the discussion in [GitHub Discussions](https://github.com/567-labs/kura/discussions) 92 | - Share your use cases and success stories 93 | -------------------------------------------------------------------------------- /docs/blog/posts/new-documentation-release.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: New Documentation Release - We're Open to Feedback 3 | date: 2025-05-16 4 | categories: 5 | - Kura 6 | - Documentation 7 | --- 8 | 9 | # New Documentation Release 10 | 11 | We're excited to announce a comprehensive overhaul of the Kura documentation! The new documentation is designed to help users get started quickly and make the most of Kura's powerful features for analyzing conversation data. 12 | 13 | ## What's New 14 | 15 | Our documentation has been completely reorganized and expanded to provide a better experience: 16 | 17 | - **Clear Structure**: New organization with dedicated sections for Getting Started, Core Concepts, and API Reference 18 | - **Comprehensive Installation Guide**: Detailed instructions for different installation methods, including both `uv` and `pip` 19 | - **In-depth Tutorials**: Step-by-step guides in our Getting Started section 20 | - **Enriched API Reference**: Better organized and more detailed API documentation 21 | - **Core Concepts Explained**: Detailed explanations of Kura's architecture and components 22 | - **Improved Code Examples**: Concise, practical examples throughout 23 | 24 | 25 | 26 | ## Getting Started 27 | 28 | If you're new to Kura, we recommend starting with: 29 | 30 | 1. [Installation Guide](../../getting-started/installation.md) 31 | 2. [Quickstart Guide](../../getting-started/quickstart.md) 32 | 3. [Core Concepts Overview](../../core-concepts/overview.md) 33 | 34 | ## We Want Your Feedback! 35 | 36 | Documentation is only useful if it answers your questions. We're actively seeking feedback on the new documentation: 37 | 38 | - Is anything unclear or confusing? 39 | - Are there missing topics you'd like to see covered? 40 | - Did you find any errors or inconsistencies? 41 | - What would make your experience better? 42 | 43 | Please share your thoughts by: 44 | 45 | - Opening an issue on [GitHub](https://github.com/567-labs/kura/issues/new?labels=documentation) 46 | - Starting a discussion in our [GitHub Discussions](https://github.com/567-labs/kura/discussions) 47 | - Reaching out to [Jason](https://twitter.com/jxnl) or [Ivan](https://x.com/ivanleomk) 48 | 49 | ## What's Next 50 | 51 | This documentation release is just the beginning. We're planning to: 52 | 53 | - Add more real-world examples and use cases 54 | - Develop video tutorials 55 | - Expand the API reference with more details 56 | - Create a cookbook of common patterns and techniques 57 | 58 | Stay tuned for more updates, and don't hesitate to let us know what you'd like to see next! 59 | 60 | --- 61 | 62 | Thank you for using Kura. We're committed to building not just a great tool, but also great documentation to help you succeed with it. 63 | -------------------------------------------------------------------------------- /docs/core-concepts/clustering.md: -------------------------------------------------------------------------------- 1 | # Clustering 2 | 3 | Kura's clustering pipeline groups similar conversation summaries into meaningful clusters. 
This process is fundamental for large-scale analysis, enabling the discovery of dominant themes, understanding diverse user intents, and surfacing potentially "unknown unknown" patterns from vast quantities of conversational data. Clustering follows summarization and embedding in the Kura pipeline. 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | **Clustering** in Kura organizes `ConversationSummary` objects (see [Summarization](summarization.md)) into groups based on semantic similarity. Each resulting cluster is assigned a descriptive name and a concise summary, making it easier to interpret the primary topics and user requests within the dataset. This bottom-up approach to pattern discovery is crucial for making sense of and navigating large collections of conversations. 10 | 11 | - **Input:** A list of `ConversationSummary` objects (with or without embeddings) 12 | - **Output:** A list of `Cluster` objects, each with a name, description, and associated conversation IDs 13 | 14 | Clustering enables downstream tasks such as: 15 | - Identifying and monitoring prevalent topics or user needs 16 | - Visualizing trends and thematic structures in the data 17 | - Facilitating efficient exploratory search and retrieval of related conversations 18 | - Providing a foundation for hierarchical topic modeling through [Meta-Clustering](meta-clustering.md) 19 | 20 | --- 21 | 22 | ## The Clustering Model 23 | 24 | Kura's main clustering logic is implemented in the `ClusterModel` class (see `kura/cluster.py`). This class orchestrates the embedding, grouping, and labeling of conversation summaries. 25 | 26 | ### Key Components 27 | 28 | - **Clustering Method:** Determines how summaries are grouped (default: K-means, see `KmeansClusteringMethod`) 29 | - **Embedding Model:** Used to convert summaries to vectors if not already embedded (default: `OpenAIEmbeddingModel`) 30 | - **Cluster Naming:** Uses an LLM to generate a descriptive name and summary for each cluster, distinguishing it from others 31 | 32 | #### Example: ClusterModel Initialization 33 | 34 | ```python 35 | model = ClusterModel( 36 | clustering_method=KmeansClusteringMethod(), 37 | embedding_model=OpenAIEmbeddingModel(), 38 | max_concurrent_requests=50, 39 | model="openai/gpt-4o-mini", 40 | ) 41 | ``` 42 | 43 | --- 44 | 45 | ## Clustering Pipeline 46 | 47 | The clustering process consists of several steps: 48 | 49 | 1. **Embedding Summaries:** 50 | - If summaries do not already have embeddings, the model uses the configured embedding model to generate them. 51 | - Embedding is performed in batches and can be parallelized for efficiency. 52 | 53 | ```python 54 | embeddings = await self.embedding_model.embed([str(item) for item in summaries]) 55 | ``` 56 | 57 | 2. **Grouping Summaries:** 58 | - The clustering method (e.g., K-means) groups summaries based on their embeddings. 59 | - Each group is assigned a cluster ID. 60 | 61 | ```python 62 | cluster_id_to_summaries = self.clustering_method.cluster(items_with_embeddings) 63 | ``` 64 | 65 | 3. **Generating Cluster Names and Descriptions:** 66 | - For each cluster, an LLM is prompted to generate a concise, two-sentence summary and a short, imperative cluster name. 67 | - The prompt includes both positive examples (summaries in the cluster) and contrastive examples (summaries from other clusters). Contrastive examples are crucial: they guide the LLM to produce highly specific and distinguishing names/descriptions, preventing overly generic labels and ensuring each cluster's unique essence is captured. 
68 | 69 | ```python 70 | cluster = await self.generate_cluster(summaries, contrastive_examples) 71 | # Returns a Cluster object with name, description, and chat_ids 72 | ``` 73 | 74 | 4. **Output:** 75 | - The result is a list of `Cluster` objects, each containing: 76 | - `name`: Imperative sentence capturing the main request/theme 77 | - `description`: Two-sentence summary of the cluster 78 | - `chat_ids`: List of conversation IDs in the cluster 79 | 80 | --- 81 | 82 | ## Cluster Naming and Description Generation 83 | 84 | Cluster names and descriptions are generated using a large language model (LLM) with a carefully crafted prompt. The prompt: 85 | - Instructs the LLM to summarize the group in two sentences (past tense) 86 | - Requires the name to be an imperative sentence (e.g., "Help me debug Python code") 87 | - Provides contrastive examples to ensure the name/summary is specific, distinct, and accurately reflects the cluster's content compared to others. 88 | - Encourages specificity, especially for sensitive or harmful topics 89 | - Reinforces privacy by instructing the LLM to avoid including any Personally Identifiable Information (PII) or proper nouns in the generated cluster names and descriptions, complementing the PII removal in the initial summarization phase. 90 | 91 | **Prompt excerpt:** 92 | 93 | ``` 94 | Summarize all the statements into a clear, precise, two-sentence description in the past tense. ... 95 | After creating the summary, generate a short name for the group of statements. This name should be at most ten words long ... 96 | The cluster name should be a sentence in the imperative that captures the user's request. ... 97 | ``` 98 | 99 | --- 100 | 101 | ## Configuration and Extensibility 102 | 103 | - **Clustering Method:** Swap out `KmeansClusteringMethod` for other algorithms by implementing the `BaseClusteringMethod` interface. 104 | - **Embedding Model:** Use any model implementing `BaseEmbeddingModel` (e.g., local or cloud-based embeddings). 105 | - **LLM Model:** The LLM used for naming/describing clusters is configurable (default: `openai/gpt-4o-mini`). 106 | - **Concurrency:** `max_concurrent_requests` controls parallelism for embedding and LLM calls. 107 | - **Progress Reporting:** Optional integration with Rich or tqdm for progress bars and live cluster previews. 108 | 109 | --- 110 | 111 | ## Hierarchical Analysis with Meta-Clustering 112 | 113 | While the `ClusterModel` produces a flat list of semantically distinct clusters, Kura also supports the creation of hierarchical cluster structures through its **meta-clustering** capabilities (see [Meta-Clustering](meta-clustering.md)). This next step takes the output of the initial clustering (a list of `Cluster` objects) and groups these clusters into higher-level, more general parent clusters. 114 | 115 | This hierarchical approach is particularly useful for: 116 | - Managing and navigating a large number of base clusters. 117 | - Discovering broader themes and relationships between groups of clusters. 118 | - Enabling a multi-level exploratory search, from general topics down to specific conversation groups. 119 | 120 | Refer to the [Meta-Clustering](meta-clustering.md) documentation for details on how Kura achieves this hierarchical organization. 
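As a rough sketch of how this hand-off looks in the procedural pipeline (the `max_clusters` argument is an assumption based on the `MetaClusterModel` configuration described in the release notes):

```python
from kura import reduce_clusters_from_base_clusters
from kura.meta_cluster import MetaClusterModel


async def build_hierarchy(clusters):
    # `clusters` is the flat list of Cluster objects produced by ClusterModel
    meta_clusters = await reduce_clusters_from_base_clusters(
        clusters,
        model=MetaClusterModel(max_clusters=10),  # max_clusters assumed; see MetaClusterModel docs
        checkpoint_manager=None,
    )

    # Root-level themes have no parent; child clusters reference them via parent_id
    return [c for c in meta_clusters if c.parent_id is None]
```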
121 | 122 | --- 123 | 124 | ## Output: Cluster Object 125 | 126 | Each cluster is represented as a `Cluster` object (see `kura/types.py`): 127 | 128 | ```python 129 | class Cluster(BaseModel): 130 | name: str 131 | description: str 132 | chat_ids: list[str] 133 | parent_id: Optional[int] = None 134 | ``` 135 | 136 | --- 137 | 138 | ## Pipeline Integration 139 | 140 | Clustering is the third major step in Kura's analysis pipeline: 141 | 142 | 1. **Loading:** Conversations are loaded 143 | 2. **Summarization:** Each conversation is summarized 144 | 3. **Embedding:** Summaries are embedded as vectors 145 | 4. **Clustering:** Embeddings are grouped into clusters (this step) 146 | 5. **Visualization/Analysis:** Clusters and summaries are explored 147 | 148 | --- 149 | 150 | ## References 151 | 152 | - [Summarization](summarization.md) 153 | - [Embedding](embedding.md) 154 | - [API documentation](../api/index.md) 155 | - [Source Code](https://github.com/567-labs/kura/blob/main/kura/cluster.py) 156 | -------------------------------------------------------------------------------- /docs/core-concepts/conversations.md: -------------------------------------------------------------------------------- 1 | # Conversations 2 | 3 | Conversations are the fundamental data units in Kura's analysis pipeline. This document explains how conversations are structured, loaded, and processed. 4 | 5 | ## Conversation Structure 6 | 7 | In Kura, a conversation is represented by the `Conversation` class from `kura.types.conversation`: 8 | 9 | ```python 10 | from kura.types import Conversation, Message 11 | from datetime import datetime 12 | from uuid import uuid4 13 | 14 | # Create a simple conversation 15 | conversation = Conversation( 16 | id=str(uuid4()), 17 | created_at=datetime.now(), 18 | messages=[ 19 | Message( 20 | role="user", 21 | content="Hello, can you help me with a Python question?", 22 | created_at=str(datetime.now()) 23 | ), 24 | Message( 25 | role="assistant", 26 | content="Of course! 
What's your Python question?", 27 | created_at=str(datetime.now()) 28 | ), 29 | Message( 30 | role="user", 31 | content="How do I read a file in Python?", 32 | created_at=str(datetime.now()) 33 | ), 34 | Message( 35 | role="assistant", 36 | content="To read a file in Python, you can use the built-in open() function...", 37 | created_at=str(datetime.now()) 38 | ) 39 | ], 40 | metadata={"source": "example", "category": "programming"} 41 | ) 42 | ``` 43 | 44 | ### Key Components 45 | 46 | Each conversation contains: 47 | 48 | - **ID**: A unique identifier for the conversation 49 | - **Created At**: Timestamp for when the conversation was created 50 | - **Messages**: A list of message objects, each with: 51 | - **Role**: Either "user" or "assistant" 52 | - **Content**: The text content of the message 53 | - **Created At**: Timestamp for when the message was sent 54 | - **Metadata**: Optional dictionary of additional information 55 | 56 | ## Loading Conversations 57 | 58 | Kura provides several methods for loading conversations from different sources: 59 | 60 | ### From Claude Conversation Exports 61 | 62 | ```python 63 | from kura.types import Conversation 64 | 65 | # Load from Claude export 66 | conversations = Conversation.from_claude_conversation_dump("conversations.json") 67 | ``` 68 | 69 | ### From Hugging Face Datasets 70 | 71 | ```python 72 | from kura.types import Conversation 73 | 74 | # Load from a Hugging Face dataset 75 | conversations = Conversation.from_hf_dataset( 76 | "ivanleomk/synthetic-gemini-conversations", 77 | split="train" 78 | ) 79 | ``` 80 | 81 | ### Creating Custom Loaders 82 | 83 | You can create custom loaders for other data sources by implementing functions that convert your data to `Conversation` objects: 84 | 85 | ```python 86 | def load_from_custom_format(file_path): 87 | # Load and parse your custom data format 88 | data = your_parsing_function(file_path) 89 | 90 | # Convert to Conversation objects 91 | conversations = [] 92 | for entry in data: 93 | messages = [ 94 | Message( 95 | role=msg["speaker"], 96 | content=msg["text"], 97 | created_at=msg["timestamp"] 98 | ) 99 | for msg in entry["messages"] 100 | ] 101 | 102 | conversation = Conversation( 103 | id=entry["id"], 104 | created_at=entry["date"], 105 | messages=messages, 106 | metadata=entry.get("meta", {}) 107 | ) 108 | 109 | conversations.append(conversation) 110 | 111 | return conversations 112 | ``` 113 | 114 | ## Conversation Processing 115 | 116 | In the Kura pipeline, conversations go through several processing steps: 117 | 118 | 1. **Loading**: Conversations are loaded from a source 119 | 2. **Summarization**: Each conversation is summarized to capture its core intent 120 | 3. **Metadata Extraction**: Optional metadata is extracted from the conversation content 121 | 4. **Embedding**: Summaries are converted to vector embeddings 122 | 5. **Clustering**: Similar conversations are grouped together 123 | 124 | ## Working with Message Content 125 | 126 | The content of messages can be in various formats, but should generally be text. HTML, Markdown, or other structured formats will be processed as-is, which may affect summarization quality. 
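For instance, a minimal sketch of flattening a conversation into plain text before further processing, using only the `Message` fields shown above:

```python
def conversation_to_text(conversation: Conversation) -> str:
    # Join messages as "role: content" lines, preserving their order
    return "\n".join(
        f"{message.role}: {message.content}" for message in conversation.messages
    )
```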
127 | 128 | When working with message content: 129 | 130 | - Clean up any special formatting if needed 131 | - Remove system messages if they don't contribute to the conversation topic 132 | - Ensure message ordering is correct for proper context 133 | 134 | ## Handling Metadata 135 | 136 | Conversations can include metadata, which provides additional context: 137 | 138 | ```python 139 | # Add metadata when creating conversations 140 | conversations = Conversation.from_hf_dataset( 141 | "allenai/WildChat-nontoxic", 142 | metadata_fn=lambda x: { 143 | "model": x["model"], 144 | "toxic": x["toxic"], 145 | "redacted": x["redacted"], 146 | } 147 | ) 148 | ``` 149 | 150 | This metadata can later be used to: 151 | - Filter conversations 152 | - Analyze patterns across different conversation attributes 153 | - Provide additional context for visualization 154 | 155 | ## Next Steps 156 | 157 | Now that you understand how conversations are structured in Kura, you can: 158 | 159 | - Learn about the [summarization process](summarization.md) 160 | - See how to load different data formats in the [Quickstart Guide](../getting-started/quickstart.md) 161 | - Explore configuration options in the [Configuration Guide](../getting-started/configuration.md) 162 | -------------------------------------------------------------------------------- /docs/core-concepts/dimensionality-reduction.md: -------------------------------------------------------------------------------- 1 | # Dimensionality Reduction 2 | 3 | This page is under construction. It will contain detailed information about how Kura reduces dimensions for visualization. 4 | 5 | In the meantime, you can refer to the [API documentation](../api/index.md) for technical details. -------------------------------------------------------------------------------- /docs/core-concepts/embedding.md: -------------------------------------------------------------------------------- 1 | # Embedding 2 | 3 | Kura's embedding pipeline transforms text (such as conversation summaries) into high-dimensional vector representations. These embeddings are essential for downstream tasks like clustering, search, and visualization, enabling Kura to analyze and organize large volumes of conversational data. 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | **Embedding** in Kura refers to the process of converting text into numerical vectors (embeddings) that capture semantic meaning. These vectors allow for efficient similarity search, clustering, and visualization of conversations and summaries. 10 | 11 | - **Input:** A list of texts (e.g., conversation summaries, messages, or cluster descriptions) 12 | - **Output:** A list of vector embeddings (`list[list[float]]`), typically one per input text 13 | 14 | --- 15 | 16 | ## The Embedding Model 17 | 18 | Kura uses an `EmbeddingModel` (see `kura/embedding.py`) that implements the `BaseEmbeddingModel` interface. 
Multiple backends are supported: 19 | 20 | - **OpenAIEmbeddingModel**: Uses OpenAI's API (e.g., `text-embedding-3-small`) for high-quality embeddings 21 | - **SentenceTransformerEmbeddingModel**: Uses local models from the `sentence-transformers` library (e.g., `all-MiniLM-L6-v2`) 22 | 23 | All embedding models must implement the following interface (see `kura/base_classes/embedding.py`): 24 | 25 | ```python 26 | class BaseEmbeddingModel(ABC): 27 | @abstractmethod 28 | async def embed(self, texts: list[str]) -> list[list[float]]: 29 | """Embed a list of texts into a list of lists of floats""" 30 | pass 31 | ``` 32 | 33 | ### Key Features 34 | 35 | - **Batching:** Texts are automatically split into batches for efficient processing 36 | - **Concurrency:** Multiple batches are embedded in parallel (configurable concurrency) 37 | - **Retry Logic:** Embedding requests are retried on failure for robustness 38 | - **Extensibility:** New embedding backends can be added by subclassing `BaseEmbeddingModel` 39 | - **Checkpointing:** Embeddings can be cached as part of the pipeline to avoid recomputation 40 | 41 | --- 42 | 43 | ## Output: Embeddings 44 | 45 | The result of embedding is a list of vectors, each representing an input text. Embeddings are typically attached to summaries or clusters for downstream analysis. 46 | 47 | Example output for a batch of texts: 48 | 49 | ```python 50 | embeddings = await embedding_model.embed([ 51 | "Summarize the user's request.", 52 | "Cluster similar conversations together." 53 | ]) 54 | # embeddings: list[list[float]] 55 | ``` 56 | 57 | When used in the pipeline, embeddings are stored in objects such as `ConversationSummary`: 58 | 59 | ```python 60 | class ConversationSummary(BaseModel): 61 | chat_id: str 62 | summary: str 63 | ... 64 | embedding: Optional[list[float]] = None 65 | ``` 66 | 67 | - **embedding**: The vector representation of the summary (or other text) 68 | 69 | --- 70 | 71 | ## Pipeline Integration 72 | 73 | Embedding is a core step in Kura's analysis pipeline: 74 | 75 | 1. **Loading**: Conversations are loaded from various sources 76 | 2. **Summarization**: Each conversation is summarized 77 | 3. **Embedding**: Summaries (or other texts) are embedded as vectors 78 | 4. **Clustering**: Embeddings are grouped into clusters 79 | 5. **Visualization/Analysis**: Clusters and embeddings are explored 80 | 81 | --- 82 | 83 | ## Embeddable Object Representations 84 | 85 | All major objects that need to be embedded in Kura (such as `ConversationSummary`, `Cluster`, and `ProjectedCluster`) implement `__str__` methods. This ensures that each object can be converted to a meaningful text representation before embedding. 86 | 87 | - **Requirement:** Any object passed to an embedding model must provide a `__str__` method that captures its semantic content. 88 | - **Examples:** 89 | - `ConversationSummary` uses a custom `__str__` to include summary, request, task, and other fields in a structured format. 90 | - `Cluster` and `ProjectedCluster` use `__str__` to return their name and description. 91 | 92 | This design allows embedding models to work generically with a variety of object types, as long as they implement a suitable `__str__` method. 
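As a concrete illustration, embedding a set of clusters reduces to converting each object to its string form and passing the list to any `BaseEmbeddingModel` implementation, which mirrors what the pipeline does internally. This is a minimal sketch; the `clusters` variable is assumed to come from an earlier clustering step.

```python
from kura.embedding import OpenAIEmbeddingModel

embedding_model = OpenAIEmbeddingModel()

# Cluster.__str__ returns the cluster's name and description,
# so the embedding model never needs to know the object type.
texts = [str(cluster) for cluster in clusters]
embeddings = await embedding_model.embed(texts)

assert len(embeddings) == len(clusters)
```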
93 | 94 | --- 95 | 96 | ## References 97 | 98 | - [API documentation](../api/index.md) 99 | - [Sentence Transformers documentation](https://www.sbert.net/) 100 | - [OpenAI Embeddings documentation](https://platform.openai.com/docs/guides/embeddings) 101 | 102 | --- 103 | 104 | ## TODO: Additional Embedding Providers 105 | 106 | - Support for other embedding providers (e.g., Cohere, HuggingFace Inference API, Google Vertex AI, local GPU models) 107 | - Community contributions and suggestions are welcome! 108 | -------------------------------------------------------------------------------- /docs/core-concepts/meta-clustering.md: -------------------------------------------------------------------------------- 1 | # Meta-Clustering 2 | 3 | Kura's meta-clustering extends the initial clustering process by organizing existing clusters into a hierarchical structure. This is essential for managing large numbers of base clusters, understanding broader thematic relationships, and enabling multi-level exploratory analysis of conversational data—from general topics down to specific insights. 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | **Meta-Clustering** (or hierarchical clustering) in Kura takes a list of `Cluster` objects (typically the output of the primary [Clustering](clustering.md) process) and groups them into higher-level, more generalized parent clusters. This creates a topic taxonomy, allowing users to navigate and comprehend vast amounts of clustered data more effectively. 10 | 11 | - **Input:** A list of `Cluster` objects. 12 | - **Output:** An updated list of `Cluster` objects, including newly created parent meta-clusters and the original child clusters now linked via `parent_id`. 13 | 14 | Meta-clustering facilitates: 15 | - **Scalable Exploration:** Makes it feasible to explore datasets with hundreds or thousands of base clusters. 16 | - **Thematic Discovery:** Reveals overarching themes and connections between different groups of specific topics. 17 | - **Granular Navigation:** Allows users to drill down from broad categories to nuanced sub-topics, supporting deeper "unknown unknown" discovery. 18 | 19 | --- 20 | 21 | ## The `MetaClusterModel` 22 | 23 | The core logic for hierarchical clustering is encapsulated in the `MetaClusterModel` (see `kura/meta_cluster.py`). This model orchestrates the process of grouping existing clusters into parent clusters and defining the relationships between them. 24 | 25 | ### Key Components and Process 26 | 27 | The `MetaClusterModel` typically employs the following steps, often iteratively if reducing a large number of clusters or building multiple hierarchy levels: 28 | 29 | 1. **Input Clusters:** Starts with a list of `Cluster` objects generated by `ClusterModel`. 30 | 31 | 2. **(Optional) Cluster Grouping with `reduce_clusters`:** 32 | * If `reduce_clusters` is called with many input clusters, it first embeds the textual representation (`name` and `description`) of these existing clusters using the configured `embedding_model`. 33 | * It then uses a `clustering_model` (e.g., K-means) to group these *cluster embeddings* into a smaller number of neighborhoods or initial groupings. 34 | * The subsequent steps are then applied to each of these neighborhoods. 35 | 36 | 3. **Generating Candidate Meta-Cluster Names (`generate_candidate_clusters`): 37 | * For a given set of input clusters (or a neighborhood of clusters from step 2), an LLM is prompted to propose a list of suitable higher-level candidate names. 
38 | * The prompt provides the names and descriptions of the input clusters and asks for broader category names that can encompass several of them, emphasizing specificity and distinctiveness. The aim is to find meaningful parent themes. 39 | 40 | 4. **Labeling Clusters (`label_cluster`): 41 | * Each individual input cluster is then presented to an LLM along with the list of candidate meta-cluster names generated in the previous step. 42 | * The LLM's task is to assign the cluster to the *single best-fitting* candidate meta-cluster name. This involves careful instruction to choose an exact match from the candidates. 43 | * The output is validated to ensure the chosen label is one of the provided candidates (using fuzzy matching for robustness). 44 | 45 | 5. **Renaming and Finalizing Meta-Clusters (`rename_cluster_group`): 46 | * Clusters are grouped based on the labels assigned in step 4. 47 | * For each group (which will become a new meta-cluster), an LLM is prompted with the names and descriptions of all its child clusters. 48 | * The LLM generates a final, refined name (imperative, like base cluster names) and a two-sentence summary for this new meta-cluster. This ensures the meta-cluster accurately and concisely represents its constituent child clusters. 49 | * A new `Cluster` object is created for this meta-cluster. The original child clusters in this group have their `parent_id` field updated to the ID of this new meta-cluster. 50 | 51 | ### Prompting Strategies 52 | 53 | Similar to base clustering, the LLM prompts used in `MetaClusterModel` are designed to: 54 | - Elicit specific and descriptive names/summaries for the meta-clusters. 55 | - Ensure meta-clusters are distinguishable from one another. 56 | - Handle potentially sensitive topics appropriately by encouraging descriptive rather than euphemistic language. 57 | - Maintain a consistent style (e.g., imperative sentences for names). 58 | 59 | ### Output: Hierarchical Cluster List 60 | 61 | The final output of `generate_meta_clusters` (or `reduce_clusters`) is a list containing: 62 | - The newly created parent meta-clusters (which have `parent_id=None`). 63 | - The original input clusters, now updated with their respective `parent_id` linking them to their new meta-cluster. 64 | 65 | This structure allows for easy reconstruction and traversal of the cluster hierarchy. 66 | 67 | --- 68 | 69 | ## Configuration 70 | 71 | - **LLM Model:** The LLM used for candidate generation, labeling, and renaming is configurable (default: `openai/gpt-4o-mini`). 72 | - **Embedding Model:** If using `reduce_clusters`, the `embedding_model` is used to embed the input clusters themselves (default: `OpenAIEmbeddingModel`). 73 | - **Clustering Method:** If using `reduce_clusters`, the `clustering_model` is used to group the cluster embeddings (default: `KmeansClusteringMethod`). 74 | - **Concurrency:** `max_concurrent_requests` controls parallelism for LLM calls. 75 | - **Max Clusters per Level (Implicit):** The `max_clusters` parameter in `MetaClusterModel` (and logic within `generate_candidate_clusters`) influences how many meta-clusters are aimed for at each level of reduction, guiding the granularity of the hierarchy. 76 | 77 | --- 78 | 79 | ## Pipeline Integration 80 | 81 | Meta-clustering typically follows the initial clustering step performed by `ClusterModel`: 82 | 83 | 1. **Loading:** Conversations are loaded. 84 | 2. **Summarization:** Conversations are summarized (`ConversationSummary`). 85 | 3. **Embedding:** Summaries are embedded. 86 | 4. 
**Clustering:** Summaries are grouped into base `Cluster` objects. 87 | 5. **Meta-Clustering:** Base clusters are organized hierarchically by `MetaClusterModel`. 88 | 6. **Visualization/Analysis:** The full hierarchy of clusters and summaries can be explored. 89 | 90 | --- 91 | 92 | ## References 93 | 94 | - [Clustering](clustering.md) 95 | - [API documentation](../api/index.md) 96 | - [Source Code](https://github.com/567-labs/kura/blob/main/kura/meta_cluster.py) 97 | -------------------------------------------------------------------------------- /docs/core-concepts/overview.md: -------------------------------------------------------------------------------- 1 | # Core Concepts Overview 2 | 3 | Kura is built on several key concepts that work together to analyze conversational data, enabling the discovery of meaningful patterns and insights from these interactions. This overview explains the major components and how they interact in the analysis pipeline. 4 | 5 | ## Architecture 6 | 7 | Kura's architecture consists of a pipeline of components that process conversational data through several stages: 8 | 9 |  10 | 11 | The main components are: 12 | 13 | 1. **Conversations**: The raw chat data between users and assistants, serving as the foundational input. 14 | 2. **Summarization**: Distilling lengthy conversations into concise task descriptions or core topics, which form the basis for subsequent analysis. 15 | 3. **Embedding**: Representing these textual summaries as dense numerical vectors, capturing their semantic meaning for similarity measurement. 16 | 4. **Clustering**: Grouping semantically similar summaries (via their embeddings) into 'base' clusters, identifying initial patterns in the data. 17 | 5. **Meta-Clustering**: Organizing base clusters into a hierarchical structure, allowing for the exploration of insights at multiple levels of granularity, from broad themes to specific sub-topics. 18 | 6. **Dimensionality Reduction**: Projecting high-dimensional embeddings into a lower-dimensional space (typically 2D or 3D) to enable visual exploration and pattern identification. 19 | 20 | ## Processing Pipeline 21 | 22 | When you run the kura pipeline, the data flows through the following steps: 23 | 24 | 1. **Load Conversations**: Raw conversation data is loaded from your specified source. 25 | 2. **Generate Summaries**: Each conversation is summarized, often into a concise task description or key topic. This summary becomes a primary unit for analysis. 26 | 3. **Extract Metadata**: Optional metadata (e.g., conversation length, sentiment, user-defined tags, or other relevant attributes) is extracted from conversations. These attributes, sometimes referred to as 'facets', can provide additional dimensions for analysis, filtering, and deeper understanding of the clusters. 27 | 4. **Create Embeddings**: The textual summaries are converted into vector representations (embeddings) that capture their semantic content. 28 | 5. **Perform Base Clustering**: Embeddings are used to group semantically similar summaries into initial 'base' clusters, forming the first layer of identified patterns. 29 | 6. **Apply Meta-Clustering**: Base clusters are iteratively combined or organized into a hierarchical structure. This allows for navigation and exploration of insights from broad, overarching themes down to more specific, granular patterns. 30 | 7. **Reduce Dimensions**: High-dimensional embeddings (and their cluster assignments) are projected, typically into a 2D or 3D space. 
This facilitates visual exploration, helping to understand the relationships between clusters and identify outliers or emergent patterns. 31 | 8. **Save Checkpoints**: Results from each significant step are saved as checkpoint files, enabling efficient resumption and review of the analysis process. 32 | 33 | ## Key Classes 34 | 35 | Kura is designed with a modular architecture, allowing components to be customized or replaced: 36 | 37 | ### Main Orchestrator 38 | 39 | - **`Kura`** (`kura.py`): The main class that coordinates the entire pipeline and manages checkpoints 40 | 41 | ### Component Classes 42 | 43 | - **`BaseEmbeddingModel`** / **`OpenAIEmbeddingModel`** (`embedding.py`): Convert text to vector representations 44 | - **`BaseSummaryModel`** / **`SummaryModel`** (`summarisation.py`): Generate summaries from conversations 45 | - **`BaseClusterModel`** / **`ClusterModel`** (`cluster.py`): Group similar summaries into clusters 46 | - **`BaseMetaClusterModel`** / **`MetaClusterModel`** (`meta_cluster.py`): Create hierarchical cluster structures 47 | - **`BaseDimensionalityReduction`** / **`HDBUMAP`** (`dimensionality.py`): Project embeddings to 2D space 48 | 49 | ### Data Models 50 | 51 | - **`Conversation`** (`types/conversation.py`): Represents a chat conversation with messages 52 | - **`ConversationSummary`** (`types/summarisation.py`): Contains a summarized conversation 53 | - **`Cluster`** (`types/cluster.py`): Represents a group of similar conversations 54 | - **`ProjectedCluster`** (`types/dimensionality.py`): Represents clusters with 2D coordinates 55 | 56 | ## Extensibility 57 | 58 | Each component has a base class that defines the required interface, allowing you to create custom implementations: 59 | 60 | ```python 61 | # Example of creating a custom embedding model 62 | from kura.base_classes import BaseEmbeddingModel 63 | 64 | class MyCustomEmbeddingModel(BaseEmbeddingModel): 65 | async def embed(self, texts: list[str]) -> list[list[float]]: 66 | # Your custom embedding logic here 67 | ... 68 | ``` 69 | 70 | ## Checkpoints 71 | 72 | Kura saves intermediate results to checkpoint files, allowing you to: 73 | 74 | - Resume processing after interruptions 75 | - Inspect intermediary results 76 | - Share analysis results with others 77 | - Visualize results without reprocessing 78 | 79 | ## Next Steps 80 | 81 | To understand each component in more detail, explore the following pages: 82 | 83 | - [Conversations](conversations.md) 84 | - [Summarization](summarization.md) 85 | - [Embedding](embedding.md) 86 | - [Clustering](clustering.md) 87 | - [Meta-Clustering](meta-clustering.md) 88 | - [Dimensionality Reduction](dimensionality-reduction.md) 89 | -------------------------------------------------------------------------------- /docs/core-concepts/summarization.md: -------------------------------------------------------------------------------- 1 | # Summarization 2 | 3 | Kura's summarization pipeline is designed to extract concise, structured, and privacy-preserving summaries from conversations between users and AI assistants. This process is central to Kura's ability to analyze, cluster, and visualize large volumes of conversational data. 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | **Summarization** in Kura transforms each conversation into a structured summary, capturing the user's intent, the main task, languages involved, safety concerns, user frustration, and any assistant errors. This enables downstream analysis such as clustering, search, and visualization. 
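As a quick illustration, the default `SummaryModel` can be applied to a single conversation. This is a minimal sketch; it assumes an existing `conversation` object and the default OpenAI-backed configuration.

```python
from kura.summarisation import SummaryModel

summary_model = SummaryModel()

# Summarise one conversation; summarise() accepts a whole list
# and processes conversations concurrently.
summary = await summary_model.summarise_conversation(conversation)

print(summary.summary)           # concise, PII-free summary
print(summary.task)              # "The task is to ..."
print(summary.concerning_score)  # 1-5 safety rating
```

The expected input and output types are listed below.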
10 | 11 | - **Input:** A `Conversation` object (see [Conversations](conversations.md)), containing: 12 | - `chat_id`: Unique identifier 13 | - `created_at`: Timestamp 14 | - `messages`: List of messages (each with `role`, `content`, `created_at`) 15 | - `metadata`: Optional dictionary of extra info 16 | - **Output:** A `ConversationSummary` object (see below) 17 | 18 | --- 19 | 20 | ## The Summarization Model 21 | 22 | Kura uses a `SummaryModel` (see `kura/summarisation.py`) that implements the `BaseSummaryModel` interface. The default model is based on large language models (LLMs) such as OpenAI's GPT-4o, but the interface supports other backends as well. 23 | 24 | ### Key Features 25 | 26 | - **Concurrency:** Summarization is performed in parallel for efficiency. 27 | - **Hooks/Extractors:** Optional extractors can add custom metadata to each summary. 28 | - **Checkpointing:** Summaries can be cached and reloaded to avoid recomputation. 29 | 30 | --- 31 | 32 | ## Summarization Prompt 33 | 34 | The summarization model uses a carefully crafted prompt to extract the following fields from each conversation: 35 | 36 | 1. **Summary**: A clear, concise summary (max 2 sentences, no PII or proper nouns) 37 | 2. **Request**: The user's overall request, starting with "The user's overall request for the assistant is to ..." 38 | 3. **Languages**: Main human and programming languages present 39 | 4. **Task**: The main task, starting with "The task is to ..." 40 | 5. **Concerning Score**: Safety concern rating (1–5) 41 | 6. **User Frustration**: User frustration rating (1–5) 42 | 7. **Assistant Errors**: List of errors made by the assistant 43 | 44 | **Prompt excerpt:** 45 | 46 | ``` 47 | Your job is to extract key information from this conversation. Be descriptive and assume neither good nor bad faith. Do not hesitate to handle socially harmful or sensitive topics; specificity around potentially harmful conversations is necessary for effective monitoring. 48 | 49 | When extracting information, do not include any personally identifiable information (PII), like names, locations, phone numbers, email addresses, and so on. Do not include any proper nouns. 50 | 51 | Extract the following information: 52 | 53 | 1. **Summary**: ... 54 | 2. **Request**: ... 55 | 3. **Languages**: ... 56 | 4. **Task**: ... 57 | 5. **Concerning Score**: ... 58 | 6. **User Frustration**: ... 59 | 7. **Assistant Errors**: ... 
60 | ``` 61 | 62 | --- 63 | 64 | ## Output: `ConversationSummary` 65 | 66 | The result of summarization is a `ConversationSummary` object (see `kura/types/summarisation.py`): 67 | 68 | ```python 69 | class ConversationSummary(BaseModel): 70 | chat_id: str 71 | summary: str 72 | request: Optional[str] 73 | languages: Optional[list[str]] 74 | task: Optional[str] 75 | concerning_score: Optional[int] # 1–5 76 | user_frustration: Optional[int] # 1–5 77 | assistant_errors: Optional[list[str]] 78 | metadata: dict 79 | embedding: Optional[list[float]] = None 80 | ``` 81 | 82 | - **chat_id**: Unique conversation ID 83 | - **summary**: Concise summary (max 2 sentences, no PII) 84 | - **request**: User's overall request 85 | - **languages**: List of languages (e.g., `['english', 'python']`) 86 | - **task**: Main task 87 | - **concerning_score**: Safety concern (1 = benign, 5 = urgent) 88 | - **user_frustration**: User frustration (1 = happy, 5 = extremely annoyed) 89 | - **assistant_errors**: List of assistant errors 90 | - **metadata**: Additional metadata (e.g., conversation turns, custom extractors) 91 | - **embedding**: Optional vector embedding for clustering/search 92 | 93 | --- 94 | 95 | ## Pipeline Integration 96 | 97 | Summarization is the first major step in Kura's analysis pipeline: 98 | 99 | 1. **Loading**: Conversations are loaded from various sources 100 | 2. **Summarization**: Each conversation is summarized as above 101 | 3. **Embedding**: Summaries are embedded as vectors 102 | 4. **Clustering**: Similar summaries are grouped 103 | 5. **Visualization/Analysis**: Clusters and summaries are explored 104 | 105 | --- 106 | 107 | ## References 108 | 109 | - [Clio: Privacy-Preserving Insights into Real-World AI Use (Anthropic)](https://assets.anthropic.com/m/7e1ab885d1b24176/original/Clio-Privacy-Preserving-Insights-into-Real-World-AI-Use.pdf) 110 | - [API documentation](../api/index.md) 111 | - [Conversations](conversations.md) 112 | -------------------------------------------------------------------------------- /docs/getting-started/configuration.md: -------------------------------------------------------------------------------- 1 | # Configuration 2 | 3 | This guide explains the various configuration options available in Kura using its procedural API (v1). This API is best for flexible pipelines where you need fine control over individual steps, the ability to skip or reorder steps, A/B test different models, or prefer a functional programming style. 4 | 5 | ## Checkpoint Files 6 | 7 | Kura saves several checkpoint files during processing: 8 | 9 | | Checkpoint File | Description | 10 | | ---------------------- | -------------------------------- | 11 | | `conversations.json` | Raw conversation data | 12 | | `summaries.jsonl` | Summarized conversations | 13 | | `clusters.jsonl` | Base cluster data | 14 | | `meta_clusters.jsonl` | Hierarchical cluster data | 15 | | `dimensionality.jsonl` | Projected data for visualization | 16 | 17 | Checkpoint filenames are now defined as properties in their respective model classes. When using the procedural API, checkpoint management is handled via the `CheckpointManager`. 
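For example, the filename each model will write can be read directly from the model, while the `CheckpointManager` controls the directory and whether checkpointing is enabled. This is a small sketch using the default models; the filenames correspond to those listed in the table above.

```python
from kura import CheckpointManager
from kura.summarisation import SummaryModel
from kura.dimensionality import HDBUMAP

# Passed to each pipeline step (see the procedural example below)
checkpoint_manager = CheckpointManager("./checkpoints", enabled=True)

# Each model reports its own checkpoint filename via a property.
print(SummaryModel().checkpoint_filename)  # "summaries.jsonl" per the table above
print(HDBUMAP().checkpoint_filename)       # "dimensionality.jsonl"
```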
18 | 19 | ## CLI Configuration 20 | 21 | When using the CLI, you can configure the checkpoint directory: 22 | 23 | ```bash 24 | # Start the web server with a custom checkpoint directory 25 | kura --dir ./my_checkpoints 26 | ``` 27 | 28 | The procedural API provides flexibility by breaking the pipeline into composable functions: 29 | 30 | ```python 31 | from kura import ( 32 | summarise_conversations, 33 | generate_base_clusters_from_conversation_summaries, 34 | reduce_clusters_from_base_clusters, 35 | reduce_dimensionality_from_clusters, 36 | CheckpointManager 37 | ) 38 | from kura.summarisation import SummaryModel 39 | from kura.cluster import ClusterModel 40 | from kura.meta_cluster import MetaClusterModel 41 | from kura.dimensionality import HDBUMAP 42 | # Assuming Conversation type might be needed for context, if not, it can be removed. 43 | # from kura.types import Conversation 44 | 45 | # Sample conversations (replace with your actual data loading) 46 | # conversations = [Conversation(...)] 47 | 48 | # Configure models independently 49 | summary_model = SummaryModel() 50 | cluster_model = ClusterModel() 51 | meta_cluster_model = MetaClusterModel(max_clusters=10) 52 | dimensionality_model = HDBUMAP() 53 | 54 | # Optional checkpoint management 55 | checkpoint_manager = CheckpointManager("./my_checkpoints", enabled=True) 56 | 57 | # Run pipeline with keyword arguments 58 | async def analyze(conversations): # Added conversations as an argument 59 | summaries = await summarise_conversations( 60 | conversations, 61 | model=summary_model, 62 | checkpoint_manager=checkpoint_manager 63 | ) 64 | 65 | clusters = await generate_base_clusters_from_conversation_summaries( 66 | summaries, 67 | model=cluster_model, 68 | checkpoint_manager=checkpoint_manager 69 | ) 70 | 71 | reduced = await reduce_clusters_from_base_clusters( 72 | clusters, 73 | model=meta_cluster_model, 74 | checkpoint_manager=checkpoint_manager 75 | ) 76 | 77 | projected = await reduce_dimensionality_from_clusters( 78 | reduced, 79 | model=dimensionality_model, 80 | checkpoint_manager=checkpoint_manager 81 | ) 82 | 83 | return projected 84 | ``` 85 | 86 | The procedural API excels at working with different model implementations for the same task: 87 | 88 | ```python 89 | # Use different backends for the same task 90 | from kura import summarise_conversations 91 | # Assuming these model classes exist and are correctly imported 92 | # from kura.summarisation import OpenAISummaryModel, VLLMSummaryModel, HuggingFaceSummaryModel 93 | 94 | # Sample conversations (replace with your actual data loading) 95 | # conversations = [...] 
96 | # checkpoint_mgr = CheckpointManager("./my_checkpoints") 97 | 98 | # OpenAI backend 99 | # openai_summaries = await summarise_conversations( 100 | # conversations, 101 | # model=OpenAISummaryModel(api_key="sk-..."), # Replace with actual model init if different 102 | # checkpoint_manager=checkpoint_mgr 103 | # ) 104 | 105 | # Local vLLM backend 106 | # vllm_summaries = await summarise_conversations( 107 | # conversations, 108 | # model=VLLMSummaryModel(model_path="/models/llama"), # Replace with actual model init if different 109 | # checkpoint_manager=checkpoint_mgr 110 | # ) 111 | 112 | # Hugging Face backend 113 | # hf_summaries = await summarise_conversations( 114 | # conversations, 115 | # model=HuggingFaceSummaryModel("facebook/bart-large-cnn"), # Replace with actual model init if different 116 | # checkpoint_manager=checkpoint_mgr 117 | # ) 118 | ``` 119 | 120 | _Note: The heterogeneous models example has been commented out as it relies on specific model classes (`OpenAISummaryModel`, `VLLMSummaryModel`, `HuggingFaceSummaryModel`) whose existence and import paths are not confirmed from the provided context. Ensure these are correctly defined and imported in your actual usage._ 121 | 122 | ## Next Steps 123 | 124 | Now that you understand how to configure Kura using the procedural API, you can: 125 | 126 | - [Learn about core concepts](../core-concepts/overview.md) 127 | - [Try the Procedural API Tutorial](../getting-started/quickstart.md) 128 | - [Check out the API Reference](../api/index.md) 129 | -------------------------------------------------------------------------------- /docs/getting-started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation Guide 2 | 3 | This guide will walk you through the installation process for Kura. 4 | 5 | ## Requirements 6 | 7 | Kura has the following requirements: 8 | 9 | - Python 3.9+ (Python 3.9 is specifically recommended due to UMAP dependency) 10 | - uv package manager 11 | - OpenAI API key for model access 12 | 13 | ## Installation 14 | 15 | ```bash 16 | # Install using uv 17 | uv pip install kura 18 | ``` 19 | 20 | ### Development Installation 21 | 22 | If you want to contribute to Kura or modify the source code, install it in development mode: 23 | 24 | ```bash 25 | # Clone the repository 26 | git clone https://github.com/567-labs/kura.git 27 | cd kura 28 | 29 | # Create and activate a virtual environment 30 | python -m venv venv 31 | source venv/bin/activate # On Windows: venv\Scripts\activate 32 | 33 | # Install in development mode with dev dependencies 34 | uv pip install -e . --group dev 35 | ``` 36 | 37 | ## Setting up API Keys 38 | 39 | Kura uses OpenAI models for processing. You'll need to set up an API key: 40 | 41 | 1. Get an OpenAI API key from [OpenAI Platform](https://platform.openai.com/api-keys) 42 | 2. 
Set the environment variable: 43 | 44 | ```bash 45 | # On Linux/macOS 46 | export OPENAI_API_KEY=your_api_key_here 47 | 48 | # On Windows 49 | set OPENAI_API_KEY=your_api_key_here 50 | ``` 51 | 52 | ## Installing Optional Dependencies 53 | 54 | Kura supports additional features with optional dependencies: 55 | 56 | ```bash 57 | uv sync --all-extras --group dev --group docs 58 | ``` 59 | 60 | ## Verifying Your Installation 61 | 62 | To verify that Kura is installed correctly, run: 63 | 64 | ```bash 65 | python -c "from kura import summarise_conversations; print('Kura installed successfully')" 66 | ``` 67 | 68 | You should see a confirmation message with no errors. 69 | 70 | ## Next Steps 71 | 72 | Now that you have Kura installed, proceed to the [Quickstart guide](quickstart.md) to begin analyzing your first dataset. 73 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Kura: Procedural API for Chat Data Analysis 2 | 3 |  4 | 5 | [](https://pypi.org/project/kura/) 6 | [](https://github.com/567-labs/kura/stargazers) 7 | [](https://567-labs.github.io/kura/) 8 | [](https://opensource.org/licenses/MIT) 9 | [](https://pypi.org/project/kura/) 10 | [](https://pypi.org/project/kura/) 11 | 12 | Kura is an open-source library for understanding chat data through machine learning, inspired by [Anthropic's CLIO](https://www.anthropic.com/research/clio). It provides a functional, composable API for clustering conversations to discover patterns and insights. 13 | 14 | ## Why Analyze Conversation Data? 15 | 16 | As AI assistants and chatbots become increasingly central to product experiences, understanding how users interact with these systems at scale becomes a critical challenge. Manually reviewing thousands of conversations is impractical, yet crucial patterns and user needs often remain hidden in this data. 
17 | 18 | Kura addresses this challenge by: 19 | 20 | - **Revealing user intent patterns** that may not be obvious from individual conversations 21 | - **Identifying common user needs** to prioritize feature development 22 | - **Discovering edge cases and failures** that require attention 23 | - **Tracking usage trends** over time as your product evolves 24 | - **Informing prompt engineering** by highlighting successful and problematic interactions 25 | 26 | ## Features 27 | 28 | - **Conversation Summarization**: Automatically generate concise task descriptions from conversations 29 | - **Hierarchical Clustering**: Group similar conversations at multiple levels of granularity 30 | - **Metadata Extraction**: Extract valuable context from conversations using LLMs 31 | - **Custom Models**: Use your preferred embedding, summarization, and clustering methods 32 | - **Checkpoint System**: Save and resume analysis sessions 33 | - **Procedural API**: Functional approach with composable functions for maximum flexibility 34 | 35 | ## Installation 36 | 37 | ```bash 38 | # Install from PyPI 39 | pip install kura 40 | 41 | # Or use uv for faster installation 42 | uv pip install kura 43 | ``` 44 | 45 | ## Quick Start 46 | 47 | ```python 48 | from kura import ( 49 | summarise_conversations, 50 | generate_base_clusters_from_conversation_summaries, 51 | reduce_clusters_from_base_clusters, 52 | reduce_dimensionality_from_clusters, 53 | CheckpointManager 54 | ) 55 | from kura.types import Conversation 56 | from kura.summarisation import SummaryModel 57 | from kura.cluster import ClusterModel 58 | from kura.meta_cluster import MetaClusterModel 59 | from kura.dimensionality import HDBUMAP 60 | import asyncio 61 | 62 | # Load conversations 63 | conversations = Conversation.from_hf_dataset( 64 | "ivanleomk/synthetic-gemini-conversations", 65 | split="train" 66 | ) 67 | 68 | # Set up models 69 | summary_model = SummaryModel() 70 | cluster_model = ClusterModel() 71 | meta_cluster_model = MetaClusterModel(max_clusters=10) 72 | dimensionality_model = HDBUMAP() 73 | 74 | # Set up checkpoint manager 75 | checkpoint_mgr = CheckpointManager("./checkpoints", enabled=True) 76 | 77 | # Run pipeline with explicit steps 78 | async def process_conversations(): 79 | # Step 1: Generate summaries 80 | summaries = await summarise_conversations( 81 | conversations, 82 | model=summary_model, 83 | checkpoint_manager=checkpoint_mgr 84 | ) 85 | 86 | # Step 2: Create base clusters 87 | clusters = await generate_base_clusters_from_conversation_summaries( 88 | summaries, 89 | model=cluster_model, 90 | checkpoint_manager=checkpoint_mgr 91 | ) 92 | 93 | # Step 3: Build hierarchy 94 | meta_clusters = await reduce_clusters_from_base_clusters( 95 | clusters, 96 | model=meta_cluster_model, 97 | checkpoint_manager=checkpoint_mgr 98 | ) 99 | 100 | # Step 4: Project to 2D 101 | projected = await reduce_dimensionality_from_clusters( 102 | meta_clusters, 103 | model=dimensionality_model, 104 | checkpoint_manager=checkpoint_mgr 105 | ) 106 | 107 | return projected 108 | 109 | # Execute the pipeline 110 | results = asyncio.run(process_conversations()) 111 | visualise_pipeline_results(results, style="basic") 112 | Clusters (190 conversations) 113 | ╠══ Generate SEO-optimized content for blogs and scripts (38 conversations) 114 | ║ ╠══ Assist in writing SEO-friendly blog posts (12 conversations) 115 | ║ ╚══ Help create SEO-driven marketing content (8 conversations) 116 | ╠══ Help analyze and visualize data with R and Tableau (25 conversations) 117 | ║ 
╠══ Assist with data analysis and visualization in R (15 conversations) 118 | ║ ╚══ Troubleshoot sales data visualizations in Tableau (10 conversations) 119 | ... (and more clusters) 120 | ``` 121 | 122 | ## Documentation 123 | 124 | ### Getting Started 125 | 126 | - [Installation Guide](getting-started/installation.md) 127 | - [Quickstart](getting-started/quickstart.md) 128 | 129 | ### Core Concepts 130 | 131 | - [Conversations](core-concepts/conversations.md) 132 | - [Embedding](core-concepts/embedding.md) 133 | - [Clustering](core-concepts/clustering.md) 134 | - [Summarization](core-concepts/summarization.md) 135 | - [Meta-Clustering](core-concepts/meta-clustering.md) 136 | - [Dimensionality Reduction](core-concepts/dimensionality-reduction.md) 137 | 138 | ### API Reference 139 | 140 | - [Procedural API Documentation](api/index.md) 141 | 142 | ## About 143 | 144 | Kura is under active development. If you face any issues or have suggestions, please feel free to [open an issue](https://github.com/567-labs/kura/issues) or a PR. For more details on the technical implementation, check out this [walkthrough of the code](https://ivanleo.com/blog/understanding-user-conversations). 145 | -------------------------------------------------------------------------------- /kura.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/kura.png -------------------------------------------------------------------------------- /kura/__init__.py: -------------------------------------------------------------------------------- 1 | from .v1.kura import ( 2 | summarise_conversations, 3 | generate_base_clusters_from_conversation_summaries, 4 | reduce_clusters_from_base_clusters, 5 | reduce_dimensionality_from_clusters, 6 | CheckpointManager, 7 | ) 8 | from .cluster import ClusterModel 9 | from .meta_cluster import MetaClusterModel 10 | from .summarisation import SummaryModel 11 | from .types import Conversation 12 | 13 | __all__ = [ 14 | "ClusterModel", 15 | "MetaClusterModel", 16 | "SummaryModel", 17 | "Conversation", 18 | "summarise_conversations", 19 | "generate_base_clusters_from_conversation_summaries", 20 | "reduce_clusters_from_base_clusters", 21 | "reduce_dimensionality_from_clusters", 22 | "CheckpointManager", 23 | ] 24 | -------------------------------------------------------------------------------- /kura/base_classes/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedding import BaseEmbeddingModel 2 | from .summarisation import BaseSummaryModel 3 | from .clustering_method import BaseClusteringMethod 4 | from .cluster import BaseClusterModel 5 | from .meta_cluster import BaseMetaClusterModel 6 | from .dimensionality import BaseDimensionalityReduction 7 | 8 | __all__ = [ 9 | "BaseEmbeddingModel", 10 | "BaseSummaryModel", 11 | "BaseClusteringMethod", 12 | "BaseClusterModel", 13 | "BaseMetaClusterModel", 14 | "BaseDimensionalityReduction", 15 | ] 16 | -------------------------------------------------------------------------------- /kura/base_classes/cluster.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from kura.summarisation import ConversationSummary 3 | from kura.types import Cluster 4 | 5 | 6 | class BaseClusterModel(ABC): 7 | @property 8 | @abstractmethod 9 | def checkpoint_filename(self) -> str: 10 | """The filename to use for checkpointing 
this model's output.""" 11 | pass 12 | 13 | @abstractmethod 14 | async def cluster_summaries( 15 | self, summaries: list[ConversationSummary] 16 | ) -> list[Cluster]: 17 | pass 18 | 19 | # TODO : Add abstract method for hooks here once we start supporting it 20 | -------------------------------------------------------------------------------- /kura/base_classes/clustering_method.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import TypeVar, Union 3 | 4 | T = TypeVar("T") 5 | 6 | 7 | class BaseClusteringMethod(ABC): 8 | @abstractmethod 9 | def cluster( 10 | self, items: list[dict[str, Union[T, list[float]]]] 11 | ) -> dict[int, list[T]]: 12 | pass 13 | -------------------------------------------------------------------------------- /kura/base_classes/dimensionality.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from kura.types import Cluster, ProjectedCluster 4 | 5 | 6 | class BaseDimensionalityReduction(ABC): 7 | @property 8 | @abstractmethod 9 | def checkpoint_filename(self) -> str: 10 | """The filename to use for checkpointing this model's output.""" 11 | pass 12 | 13 | @abstractmethod 14 | async def reduce_dimensionality( 15 | self, clusters: list[Cluster] 16 | ) -> list[ProjectedCluster]: 17 | """ 18 | This reduces the dimensionality of the individual clusters that we've created so we can visualise them in a lower dimension 19 | """ 20 | pass 21 | -------------------------------------------------------------------------------- /kura/base_classes/embedding.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class BaseEmbeddingModel(ABC): 5 | @abstractmethod 6 | async def embed(self, texts: list[str]) -> list[list[float]]: 7 | """Embed a list of texts into a list of lists of floats""" 8 | pass 9 | -------------------------------------------------------------------------------- /kura/base_classes/meta_cluster.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from kura.types.cluster import Cluster 3 | 4 | 5 | class BaseMetaClusterModel(ABC): 6 | @property 7 | @abstractmethod 8 | def checkpoint_filename(self) -> str: 9 | """The filename to use for checkpointing this model's output.""" 10 | pass 11 | 12 | @abstractmethod 13 | async def reduce_clusters(self, clusters: list[Cluster]) -> list[Cluster]: 14 | pass 15 | -------------------------------------------------------------------------------- /kura/base_classes/summarisation.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from kura.types import ConversationSummary, Conversation 4 | from typing import Union 5 | 6 | 7 | class BaseSummaryModel(ABC): 8 | @property 9 | @abstractmethod 10 | def checkpoint_filename(self) -> str: 11 | """The filename to use for checkpointing this model's output.""" 12 | pass 13 | 14 | @abstractmethod 15 | async def summarise( 16 | self, conversations: list[Conversation] 17 | ) -> list[ConversationSummary]: 18 | """Summarise the conversations into a list of ConversationSummary""" 19 | pass 20 | 21 | @abstractmethod 22 | async def summarise_conversation( 23 | self, conversation: Conversation 24 | ) -> ConversationSummary: 25 | """Summarise a single conversation into a single string""" 26 | pass 
27 | 28 | @abstractmethod 29 | async def apply_hooks( 30 | self, conversation: Conversation 31 | ) -> dict[str, Union[str, int, float, bool, list[str], list[int], list[float]]]: 32 | """Apply hooks to the conversation summary""" 33 | # Assert that the implementation of the class has a hooks attribute so we can call it in summarise_conversation 34 | assert hasattr(self, "hooks") 35 | pass 36 | -------------------------------------------------------------------------------- /kura/cli/cli.py: -------------------------------------------------------------------------------- 1 | import typer 2 | import uvicorn 3 | from kura.cli.server import api 4 | from rich import print 5 | import os 6 | 7 | app = typer.Typer() 8 | 9 | 10 | @app.command() 11 | def start_app( 12 | dir: str = typer.Option( 13 | "./checkpoints", 14 | help="Directory to use for checkpoints, relative to the current directory", 15 | ), 16 | ): 17 | """Start the FastAPI server""" 18 | os.environ["KURA_CHECKPOINT_DIR"] = dir 19 | print( 20 | "\n[bold green]🚀 Access website at[/bold green] [bold blue][http://localhost:8000](http://localhost:8000)[/bold blue]\n" 21 | ) 22 | uvicorn.run(api, host="0.0.0.0", port=8000) 23 | 24 | 25 | if __name__ == "__main__": 26 | app() 27 | -------------------------------------------------------------------------------- /kura/cli/server.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, staticfiles 2 | from fastapi.middleware.cors import CORSMiddleware 3 | from pathlib import Path 4 | 5 | api = FastAPI() 6 | 7 | # Configure CORS 8 | api.add_middleware( 9 | CORSMiddleware, 10 | allow_origins=["*"], # Allows all origins 11 | allow_credentials=True, 12 | allow_methods=["*"], # Allows all methods 13 | allow_headers=["*"], # Allows all headers 14 | ) 15 | 16 | # Serve static files from web/dist 17 | web_dir = Path(__file__).parent.parent / "static" / "dist" 18 | if not web_dir.exists(): 19 | raise FileNotFoundError(f"Static files directory not found: {web_dir}") 20 | 21 | 22 | # Serve static files from web/dist at the root 23 | web_dir = Path(__file__).parent.parent / "static" / "dist" 24 | if not web_dir.exists(): 25 | raise FileNotFoundError(f"Static files directory not found: {web_dir}") 26 | 27 | # Mount static files at root 28 | api.mount("/", staticfiles.StaticFiles(directory=str(web_dir), html=True)) 29 | -------------------------------------------------------------------------------- /kura/cli/visualisation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import List 3 | from kura.types import Conversation 4 | 5 | 6 | def generate_cumulative_chart_data(conversations: List[Conversation]) -> dict: 7 | """ 8 | Generate cumulative word count chart data for human messages in conversations. 9 | Returns a dict containing the Plotly data and layout. 
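Concretely, the returned value is a list of {"x": week_start, "y": cumulative_words} points, one per week in chronological order (see the return statement below).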
10 | """ 11 | messages_data = [] 12 | for conv in conversations: 13 | for msg in conv.messages: 14 | if msg.role == "user": 15 | messages_data.append( 16 | { 17 | "datetime": pd.to_datetime( 18 | str(msg.created_at).replace("Z", "+00:00") 19 | ), 20 | "words": len(msg.content.split()), 21 | } 22 | ) 23 | 24 | df = pd.DataFrame(messages_data) 25 | df["week_start"] = df["datetime"].dt.to_period("W-MON").dt.start_time 26 | 27 | weekly_df = df.groupby("week_start")["words"].sum().reset_index() 28 | weekly_df["cumulative_words"] = weekly_df["words"].cumsum() 29 | weekly_df["week_start"] = weekly_df["week_start"].dt.strftime("%Y-%m-%d") 30 | 31 | return [ 32 | {"x": x, "y": y} 33 | for x, y in zip( 34 | weekly_df["week_start"].tolist(), weekly_df["cumulative_words"].tolist() 35 | ) 36 | ] # pyright: ignore 37 | 38 | 39 | def generate_messages_per_chat_data(conversations: List[Conversation]) -> dict: 40 | messages_data = [] 41 | for conv in conversations: 42 | for msg in conv.messages: 43 | messages_data.append( 44 | { 45 | "datetime": pd.to_datetime( 46 | str(msg.created_at).replace("Z", "+00:00") 47 | ), 48 | "chat_id": conv.chat_id, 49 | } 50 | ) 51 | 52 | df = pd.DataFrame(messages_data) 53 | df["week_start"] = df["datetime"].dt.to_period("W-MON").dt.start_time 54 | 55 | weekly_messages = df.groupby("week_start").size().reset_index(name="message_count") # pyright: ignore 56 | weekly_chats = ( 57 | df.groupby("week_start")["chat_id"].nunique().reset_index(name="chat_count") # pyright: ignore 58 | ) 59 | 60 | weekly_df = pd.merge(weekly_messages, weekly_chats, on="week_start") 61 | weekly_df["avg_messages"] = weekly_df["message_count"] / weekly_df["chat_count"] 62 | weekly_df["week_start"] = weekly_df["week_start"].dt.strftime("%Y-%m-%d") 63 | 64 | return [ 65 | {"x": x, "y": y} 66 | for x, y in zip( 67 | weekly_df["week_start"].tolist(), weekly_df["avg_messages"].tolist() 68 | ) 69 | ] # pyright: ignore 70 | 71 | 72 | def generate_messages_per_week_data(conversations: List[Conversation]) -> dict: 73 | messages_data = [] 74 | for conv in conversations: 75 | for msg in conv.messages: 76 | messages_data.append( 77 | { 78 | "datetime": pd.to_datetime( 79 | str(msg.created_at).replace("Z", "+00:00") 80 | ), 81 | "chat_id": conv.chat_id, 82 | } 83 | ) 84 | 85 | df = pd.DataFrame(messages_data) 86 | df["week_start"] = df["datetime"].dt.to_period("W-MON").dt.start_time 87 | 88 | weekly_messages = df.groupby("week_start").size().reset_index(name="message_count") # pyright: ignore 89 | weekly_messages["week_start"] = weekly_messages["week_start"].dt.strftime( 90 | "%Y-%m-%d" 91 | ) # pyright: ignore 92 | 93 | return [ 94 | {"x": x, "y": y} 95 | for x, y in zip( 96 | weekly_messages["week_start"].tolist(), 97 | weekly_messages["message_count"].tolist(), 98 | ) 99 | ] # pyright: ignore 100 | 101 | 102 | def generate_new_chats_per_week_data(conversations: List[Conversation]) -> dict: 103 | chat_starts = pd.DataFrame( 104 | [ 105 | { 106 | "datetime": pd.to_datetime(str(conv.created_at).replace("Z", "+00:00")), 107 | "chat_id": conv.chat_id, 108 | } 109 | for conv in conversations 110 | ] 111 | ) 112 | chat_starts["week_start"] = ( 113 | chat_starts["datetime"].dt.to_period("W-MON").dt.start_time 114 | ) 115 | weekly_chats = ( 116 | chat_starts.groupby("week_start").size().reset_index(name="chat_count") # pyright: ignore 117 | ) 118 | weekly_chats["week_start"] = weekly_chats["week_start"].dt.strftime("%Y-%m-%d") 119 | 120 | return [ 121 | {"x": x, "y": y} 122 | for x, y in zip( 123 | 
weekly_chats["week_start"].tolist(), weekly_chats["chat_count"].tolist() 124 | ) 125 | ] # pyright: ignore 126 | -------------------------------------------------------------------------------- /kura/dimensionality.py: -------------------------------------------------------------------------------- 1 | from kura.base_classes import BaseDimensionalityReduction, BaseEmbeddingModel 2 | from kura.types import Cluster, ProjectedCluster 3 | from kura.embedding import OpenAIEmbeddingModel 4 | from typing import Union 5 | import numpy as np 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class HDBUMAP(BaseDimensionalityReduction): 12 | @property 13 | def checkpoint_filename(self) -> str: 14 | """The filename to use for checkpointing this model's output.""" 15 | return "dimensionality.jsonl" 16 | 17 | def __init__( 18 | self, 19 | embedding_model: BaseEmbeddingModel = OpenAIEmbeddingModel(), 20 | n_components: int = 2, 21 | min_dist: float = 0.1, 22 | metric: str = "cosine", 23 | n_neighbors: Union[int, None] = None, 24 | ): 25 | self.embedding_model = embedding_model 26 | self.n_components = n_components 27 | self.min_dist = min_dist 28 | self.metric = metric 29 | self.n_neighbors = n_neighbors 30 | logger.info( 31 | f"Initialized HDBUMAP with embedding_model={type(embedding_model).__name__}, n_components={n_components}, min_dist={min_dist}, metric={metric}, n_neighbors={n_neighbors}" 32 | ) 33 | 34 | async def reduce_dimensionality( 35 | self, clusters: list[Cluster] 36 | ) -> list[ProjectedCluster]: 37 | # Embed all clusters 38 | from umap import UMAP 39 | 40 | if not clusters: 41 | logger.warning("Empty clusters list provided to reduce_dimensionality") 42 | return [] 43 | 44 | logger.info(f"Starting dimensionality reduction for {len(clusters)} clusters") 45 | texts_to_embed = [str(c) for c in clusters] 46 | 47 | try: 48 | cluster_embeddings = await self.embedding_model.embed(texts_to_embed) 49 | logger.debug(f"Generated embeddings for {len(clusters)} clusters") 50 | except Exception as e: 51 | logger.error(f"Failed to generate embeddings for clusters: {e}") 52 | raise 53 | 54 | if not cluster_embeddings or len(cluster_embeddings) != len(texts_to_embed): 55 | logger.error( 56 | f"Error: Number of embeddings ({len(cluster_embeddings) if cluster_embeddings else 0}) does not match number of clusters ({len(texts_to_embed)}) or embeddings are empty." 
57 | ) 58 | return [] 59 | 60 | embeddings = np.array(cluster_embeddings) 61 | logger.debug(f"Created embedding matrix of shape {embeddings.shape}") 62 | 63 | # Project to 2D using UMAP 64 | n_neighbors_actual = ( 65 | self.n_neighbors if self.n_neighbors else min(15, len(embeddings) - 1) 66 | ) 67 | logger.debug( 68 | f"Using UMAP with n_neighbors={n_neighbors_actual}, min_dist={self.min_dist}, metric={self.metric}" 69 | ) 70 | 71 | try: 72 | umap_reducer = UMAP( 73 | n_components=self.n_components, 74 | n_neighbors=n_neighbors_actual, 75 | min_dist=self.min_dist, 76 | metric=self.metric, 77 | ) 78 | reduced_embeddings = umap_reducer.fit_transform(embeddings) 79 | logger.info( 80 | f"UMAP dimensionality reduction completed: {embeddings.shape} -> {reduced_embeddings.shape}" # type: ignore 81 | ) 82 | except Exception as e: 83 | logger.error(f"UMAP dimensionality reduction failed: {e}") 84 | raise 85 | 86 | # Create projected clusters with 2D coordinates 87 | res = [] 88 | for i, cluster in enumerate(clusters): 89 | projected = ProjectedCluster( 90 | slug=cluster.slug, 91 | id=cluster.id, 92 | name=cluster.name, 93 | description=cluster.description, 94 | chat_ids=cluster.chat_ids, 95 | parent_id=cluster.parent_id, 96 | x_coord=float(reduced_embeddings[i][0]), # pyright: ignore 97 | y_coord=float(reduced_embeddings[i][1]), # pyright: ignore 98 | level=0 99 | if cluster.parent_id is None 100 | else 1, # TODO: Fix this, should reflect the level of the cluster 101 | ) 102 | res.append(projected) 103 | 104 | logger.info(f"Successfully created {len(res)} projected clusters") 105 | return res 106 | -------------------------------------------------------------------------------- /kura/embedding.py: -------------------------------------------------------------------------------- 1 | from kura.base_classes import BaseEmbeddingModel 2 | from asyncio import Semaphore, gather 3 | from tenacity import retry, wait_fixed, stop_after_attempt 4 | from openai import AsyncOpenAI 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class OpenAIEmbeddingModel(BaseEmbeddingModel): 11 | def __init__( 12 | self, 13 | model_name: str = "text-embedding-3-small", 14 | model_batch_size: int = 50, 15 | n_concurrent_jobs: int = 5, 16 | ): 17 | self.client = AsyncOpenAI() 18 | self.model_name = model_name 19 | self._model_batch_size = model_batch_size 20 | self._n_concurrent_jobs = n_concurrent_jobs 21 | self._semaphore = Semaphore(n_concurrent_jobs) 22 | logger.info( 23 | f"Initialized OpenAIEmbeddingModel with model={model_name}, batch_size={model_batch_size}, concurrent_jobs={n_concurrent_jobs}" 24 | ) 25 | 26 | def slug(self): 27 | return f"openai:{self.model_name}-batchsize:{self._model_batch_size}-concurrent:{self._n_concurrent_jobs}" 28 | 29 | @retry(wait=wait_fixed(3), stop=stop_after_attempt(3)) 30 | async def _embed_batch(self, texts: list[str]) -> list[list[float]]: 31 | """Embed a single batch of texts.""" 32 | async with self._semaphore: 33 | try: 34 | logger.debug( 35 | f"Embedding batch of {len(texts)} texts using model {self.model_name}" 36 | ) 37 | resp = await self.client.embeddings.create( 38 | input=texts, model=self.model_name 39 | ) 40 | embeddings = [item.embedding for item in resp.data] 41 | logger.debug( 42 | f"Successfully embedded batch of {len(texts)} texts, got {len(embeddings)} embeddings" 43 | ) 44 | return embeddings 45 | except Exception as e: 46 | logger.error(f"Failed to embed batch of {len(texts)} texts: {e}") 47 | raise 48 | 49 | async def embed(self, texts: 
list[str]) -> list[list[float]]: 50 | if not texts: 51 | logger.debug("Empty text list provided, returning empty embeddings") 52 | return [] 53 | 54 | logger.info(f"Starting embedding of {len(texts)} texts using {self.model_name}") 55 | 56 | # Create batches 57 | batches = _batch_texts(texts, self._model_batch_size) 58 | logger.debug( 59 | f"Split {len(texts)} texts into {len(batches)} batches of size {self._model_batch_size}" 60 | ) 61 | 62 | # Process all batches concurrently 63 | tasks = [self._embed_batch(batch) for batch in batches] 64 | try: 65 | results_list_of_lists = await gather(*tasks) 66 | logger.debug(f"Completed embedding {len(batches)} batches") 67 | except Exception as e: 68 | logger.error(f"Failed to embed texts: {e}") 69 | raise 70 | 71 | # Flatten results 72 | embeddings = [] 73 | for result_batch in results_list_of_lists: 74 | embeddings.extend(result_batch) 75 | 76 | logger.info( 77 | f"Successfully embedded {len(texts)} texts, produced {len(embeddings)} embeddings" 78 | ) 79 | return embeddings 80 | 81 | 82 | def _batch_texts(texts: list[str], batch_size: int) -> list[list[str]]: 83 | """Helper function to divide a list of texts into batches.""" 84 | if not texts: 85 | return [] 86 | 87 | batches = [] 88 | for i in range(0, len(texts), batch_size): 89 | batch = texts[i : i + batch_size] 90 | batches.append(batch) 91 | return batches 92 | 93 | 94 | class SentenceTransformerEmbeddingModel(BaseEmbeddingModel): 95 | def __init__( 96 | self, 97 | model_name: str = "all-MiniLM-L6-v2", 98 | model_batch_size: int = 128, 99 | ): 100 | from sentence_transformers import SentenceTransformer # type: ignore 101 | 102 | logger.info( 103 | f"Initializing SentenceTransformerEmbeddingModel with model={model_name}, batch_size={model_batch_size}" 104 | ) 105 | try: 106 | self.model = SentenceTransformer(model_name) 107 | self._model_batch_size = model_batch_size 108 | logger.info(f"Successfully loaded SentenceTransformer model: {model_name}") 109 | except Exception as e: 110 | logger.error(f"Failed to load SentenceTransformer model {model_name}: {e}") 111 | raise 112 | 113 | @retry(wait=wait_fixed(3), stop=stop_after_attempt(3)) 114 | async def embed(self, texts: list[str]) -> list[list[float]]: 115 | if not texts: 116 | logger.debug("Empty text list provided, returning empty embeddings") 117 | return [] 118 | 119 | logger.info( 120 | f"Starting embedding of {len(texts)} texts using SentenceTransformer" 121 | ) 122 | 123 | # Create batches 124 | batches = _batch_texts(texts, self._model_batch_size) 125 | logger.debug( 126 | f"Split {len(texts)} texts into {len(batches)} batches of size {self._model_batch_size}" 127 | ) 128 | 129 | # Process all batches 130 | embeddings = [] 131 | try: 132 | for i, batch in enumerate(batches): 133 | logger.debug( 134 | f"Processing batch {i + 1}/{len(batches)} with {len(batch)} texts" 135 | ) 136 | batch_embeddings = self.model.encode(batch).tolist() 137 | embeddings.extend(batch_embeddings) 138 | logger.debug(f"Completed batch {i + 1}/{len(batches)}") 139 | 140 | logger.info( 141 | f"Successfully embedded {len(texts)} texts using SentenceTransformer, produced {len(embeddings)} embeddings" 142 | ) 143 | except Exception as e: 144 | logger.error(f"Failed to embed texts using SentenceTransformer: {e}") 145 | raise 146 | 147 | return embeddings 148 | -------------------------------------------------------------------------------- /kura/k_means.py: -------------------------------------------------------------------------------- 1 | from kura.base_classes 
import BaseClusteringMethod 2 | from sklearn.cluster import KMeans 3 | import math 4 | from typing import TypeVar 5 | import numpy as np 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | T = TypeVar("T") 11 | 12 | 13 | class KmeansClusteringMethod(BaseClusteringMethod): 14 | def __init__(self, clusters_per_group: int = 10): 15 | self.clusters_per_group = clusters_per_group 16 | logger.info( 17 | f"Initialized KmeansClusteringMethod with clusters_per_group={clusters_per_group}" 18 | ) 19 | 20 | def cluster(self, items: list[T]) -> dict[int, list[T]]: 21 | """ 22 | We perform a clustering here using an embedding defined on each individual item. 23 | 24 | We assume that the item is passed in as a dictionary with 25 | 26 | - its relevant embedding stored in the "embedding" key. 27 | - the item itself stored in the "item" key. 28 | 29 | { 30 | "embedding": list[float], 31 | "item": any, 32 | } 33 | """ 34 | if not items: 35 | logger.warning("Empty items list provided to cluster method") 36 | return {} 37 | 38 | logger.info(f"Starting K-means clustering of {len(items)} items") 39 | 40 | try: 41 | embeddings = [item["embedding"] for item in items] # pyright: ignore 42 | data: list[T] = [item["item"] for item in items] # pyright: ignore 43 | n_clusters = math.ceil(len(data) / self.clusters_per_group) 44 | 45 | logger.debug( 46 | f"Calculated {n_clusters} clusters for {len(data)} items (target: {self.clusters_per_group} items per cluster)" 47 | ) 48 | 49 | X = np.array(embeddings) 50 | logger.debug(f"Created embedding matrix of shape {X.shape}") 51 | 52 | kmeans = KMeans(n_clusters=n_clusters) 53 | cluster_labels = kmeans.fit_predict(X) 54 | 55 | logger.debug( 56 | f"K-means clustering completed, assigned {len(set(cluster_labels))} unique cluster labels" 57 | ) 58 | 59 | result = { 60 | i: [data[j] for j in range(len(data)) if cluster_labels[j] == i] 61 | for i in range(n_clusters) 62 | } 63 | 64 | # Log cluster size distribution 65 | cluster_sizes = [len(cluster_items) for cluster_items in result.values()] 66 | logger.info( 67 | f"K-means clustering completed: {len(result)} clusters created with sizes {cluster_sizes}" 68 | ) 69 | logger.debug( 70 | f"Cluster size stats - min: {min(cluster_sizes)}, max: {max(cluster_sizes)}, avg: {sum(cluster_sizes) / len(cluster_sizes):.1f}" 71 | ) 72 | 73 | return result 74 | 75 | except Exception as e: 76 | logger.error( 77 | f"Failed to perform K-means clustering on {len(items)} items: {e}" 78 | ) 79 | raise 80 | -------------------------------------------------------------------------------- /kura/static/dist/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 |{selectedCluster.name}
145 | {selectedCluster.description && ( 146 |147 | {selectedCluster.description} 148 |
149 | )} 150 |152 | Level: {selectedCluster.level} 153 |
154 |157 | ID: {selectedCluster.id} 158 |
159 | )} 160 |161 | {selectedCluster.chat_ids?.length} chats 162 |
163 | 164 | {/* Metadata summary section */} 165 |175 | ID: {item.chat_id} 176 |
177 |{item.summary}
178 |93 | Loaded in {conversations.length} conversations,{" "} 94 | {summaries?.length} summaries, {clusters?.length} clusters 95 |
96 | 99 |