├── .github └── workflows │ ├── claude.yml │ ├── publish.yml │ ├── pyright.yml │ ├── pytest.yml │ └── ruff.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CLAUDE.md ├── CONTRIBUTING.md ├── README.md ├── docs ├── api │ └── index.md ├── assets │ └── images │ │ ├── cluster-details.png │ │ ├── cluster-map.png │ │ ├── cluster-tree.png │ │ ├── conversation.png │ │ └── kura-architecture.png ├── blog │ ├── index.md │ └── posts │ │ ├── does-kura-work.md │ │ ├── kura-0-5-0-release.md │ │ └── new-documentation-release.md ├── core-concepts │ ├── clustering.md │ ├── conversations.md │ ├── dimensionality-reduction.md │ ├── embedding.md │ ├── meta-clustering.md │ ├── overview.md │ └── summarization.md ├── getting-started │ ├── configuration.md │ ├── installation.md │ └── quickstart.md └── index.md ├── kura.png ├── kura ├── __init__.py ├── base_classes │ ├── __init__.py │ ├── cluster.py │ ├── clustering_method.py │ ├── dimensionality.py │ ├── embedding.py │ ├── meta_cluster.py │ └── summarisation.py ├── cli │ ├── cli.py │ ├── server.py │ └── visualisation.py ├── clio_reference.md ├── cluster.py ├── dimensionality.py ├── embedding.py ├── k_means.py ├── kura.py ├── meta_cluster.py ├── static │ └── dist │ │ ├── assets │ │ ├── index-CvLvA1NY.css │ │ └── index-DztdrX1V.js │ │ ├── index.html │ │ └── vite.svg ├── summarisation.py ├── types │ ├── __init__.py │ ├── cluster.py │ ├── conversation.py │ ├── dimensionality.py │ └── summarisation.py ├── v1 │ ├── README.md │ ├── __init__.py │ ├── kura.py │ └── visualization.py └── visualization.py ├── mkdocs.yml ├── pyproject.toml ├── requirements.txt ├── scripts ├── README.md ├── build_docs.sh ├── test_sentence_transformer_real.py ├── tutorial_class_api.py └── tutorial_procedural_api.py ├── tests └── test_meta_cluster.py ├── ui ├── .gitignore ├── README.md ├── bun.lockb ├── components.json ├── eslint.config.js ├── index.html ├── package.json ├── public │ └── vite.svg ├── src │ ├── App.tsx │ ├── assets │ │ └── react.svg │ ├── components │ │ ├── cluster-details.tsx │ │ ├── cluster-map.tsx │ │ ├── cluster-tree.tsx │ │ ├── conversation-dialog.tsx │ │ ├── ui │ │ │ ├── button.tsx │ │ │ ├── card.tsx │ │ │ ├── dialog.tsx │ │ │ └── input.tsx │ │ └── upload-form.tsx │ ├── index.css │ ├── lib │ │ ├── parse.ts │ │ ├── tree.ts │ │ └── utils.ts │ ├── main.tsx │ ├── types │ │ ├── cluster.ts │ │ └── kura.ts │ └── vite-env.d.ts ├── tsconfig.app.json ├── tsconfig.json ├── tsconfig.node.json └── vite.config.ts └── uv.lock /.github/workflows/claude.yml: -------------------------------------------------------------------------------- 1 | name: Claude Code 2 | 3 | on: 4 | issue_comment: 5 | types: [created] 6 | pull_request_review_comment: 7 | types: [created] 8 | issues: 9 | types: [opened, assigned] 10 | pull_request_review: 11 | types: [submitted] 12 | 13 | jobs: 14 | claude: 15 | if: | 16 | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || 17 | (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || 18 | (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || 19 | (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) 20 | runs-on: ubuntu-latest 21 | permissions: 22 | contents: read 23 | pull-requests: read 24 | issues: read 25 | id-token: write 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v4 29 | with: 30 | fetch-depth: 1 31 | 32 | - name: Run Claude 
Code 33 | id: claude 34 | uses: anthropics/claude-code-action@beta 35 | with: 36 | anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} 37 | allowed_tools: "Edit,Replace,NotebookEditCell" 38 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPI when a Release is Created 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | pypi-publish: 9 | name: Publish release to PyPI 10 | runs-on: ubuntu-latest 11 | environment: 12 | name: pypi 13 | url: https://pypi.org/p/kura 14 | permissions: 15 | id-token: write 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Setup Bun 20 | uses: oven-sh/setup-bun@v1 21 | with: 22 | bun-version: latest 23 | 24 | - name: Build Web Assets 25 | working-directory: ui 26 | run: | 27 | bun install 28 | bun run build 29 | 30 | - name: Install uv 31 | uses: astral-sh/setup-uv@v5 32 | with: 33 | python-version: 3.9 34 | 35 | - name: Install the project 36 | run: uv sync --all-extras --dev 37 | 38 | - name: Build Package 39 | run: uv build 40 | 41 | - name: Build and publish Python package 42 | run: uv publish 43 | env: 44 | UV_PUBLISH_TOKEN: ${{ secrets.PYPI_TOKEN }} 45 | -------------------------------------------------------------------------------- /.github/workflows/pyright.yml: -------------------------------------------------------------------------------- 1 | name: Pyright 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: [main] 8 | 9 | env: 10 | WORKING_DIRECTORY: "." 11 | PYRIGHT_OUTPUT_FILENAME: "pyright.log" 12 | 13 | jobs: 14 | Pyright: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest] 19 | python-version: ["3.9"] 20 | 21 | steps: 22 | - name: Checkout code 23 | uses: actions/checkout@v4 24 | - name: Install uv 25 | uses: astral-sh/setup-uv@v4 26 | with: 27 | enable-cache: true 28 | - name: Set up Python 29 | run: uv python install ${{ matrix.python-version }} 30 | - name: Install the project 31 | run: uv sync --all-extras --dev 32 | - name: Run pyright 33 | run: uv run pyright 34 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: Pytest 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | Pytest: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest] 15 | python-version: ["3.9"] 16 | env: 17 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 18 | GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} 19 | 20 | steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | - name: Install uv 24 | uses: astral-sh/setup-uv@v4 25 | with: 26 | enable-cache: true 27 | - name: Set up Python 28 | run: uv python install ${{ matrix.python-version }} 29 | - name: Install the project 30 | run: uv sync --all-extras --dev 31 | - name: Run pytest 32 | run: uv run pytest 33 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff 2 | 3 | on: 4 | pull_request: 5 | push: 6 | 7 | jobs: 8 | ruff: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v4 12 | - uses: astral-sh/ruff-action@v3 13 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | .venv/ 5 | venv/ 6 | *.so 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | 24 | # Node.js 25 | node_modules/ 26 | 27 | # macOS 28 | .DS_Store 29 | 30 | # Environments 31 | .env 32 | .env.* 33 | !.env.example 34 | 35 | # IDE 36 | .idea/ 37 | .vscode/ 38 | *.swp 39 | *.swo 40 | 41 | # Logs 42 | logs/ 43 | *.log 44 | 45 | # Examples directory 46 | examples/ 47 | 48 | # Tutorial checkpoints 49 | tutorial_checkpoints/ 50 | tutorial_checkpoints_class/ 51 | site/ 52 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - id: check-merge-conflict 9 | - id: check-toml 10 | 11 | - repo: https://github.com/astral-sh/ruff-pre-commit 12 | rev: v0.11.11 13 | hooks: 14 | # Run the linter 15 | - id: ruff-check 16 | args: [--fix] 17 | # Run the formatter 18 | - id: ruff-format 19 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 4 | 5 | ## Commands 6 | 7 | ### Python Environment Setup 8 | 9 | ```bash 10 | # Create and activate a virtual environment 11 | python -m venv venv 12 | source venv/bin/activate # On Windows: venv\Scripts\activate 13 | 14 | # Install package in development mode with dev dependencies 15 | pip install -e ".[dev]" 16 | ``` 17 | 18 | ### Running Tests 19 | 20 | ```bash 21 | # Run all tests 22 | pytest 23 | 24 | # Run a specific test file 25 | pytest tests/test_meta_cluster.py 26 | 27 | # Run a specific test 28 | pytest tests/test_meta_cluster.py::test_cluster_label_exact_match 29 | ``` 30 | 31 | ### Type Checking 32 | 33 | ```bash 34 | # Run type checking 35 | pyright 36 | ``` 37 | 38 | ### Documentation 39 | 40 | ```bash 41 | # Install documentation dependencies 42 | pip install -e ".[docs]" 43 | 44 | # Serve documentation locally 45 | mkdocs serve 46 | ``` 47 | 48 | ### UI Development 49 | 50 | ```bash 51 | # Navigate to UI directory 52 | cd ui 53 | 54 | # Install dependencies 55 | npm install 56 | 57 | # Start development server 58 | npm run dev 59 | 60 | # Build for production 61 | npm run build 62 | 63 | # Lint code 64 | npm run lint 65 | ``` 66 | 67 | ### Running the Application 68 | 69 | ```bash 70 | # Start the Kura web server (implemented in kura/cli/cli.py and kura/cli/server.py) 71 | kura start-app 72 | 73 | # Start with a custom checkpoint directory 74 | kura start-app --dir ./my-checkpoints 75 | ``` 76 | 77 | ## Architecture Overview 78 | 79 | Kura is a tool for analyzing and visualizing chat data, built on the same ideas as Anthropic's CLIO. It uses machine learning techniques to understand user conversations by clustering them into meaningful groups. 
80 | 81 | ### Two API Approaches 82 | 83 | Kura offers two APIs for different use cases: 84 | 85 | 1. **Class-Based API** (`kura/kura.py`): The original API with a single `Kura` class that orchestrates the entire pipeline 86 | 2. **Procedural API** (`kura/v1/`): A functional approach with composable functions for maximum flexibility 87 | 88 | ### Core Components 89 | 90 | 1. **Summarisation Model** (`kura/summarisation.py`): Takes user conversations and summarizes them into task descriptions 91 | 2. **Embedding Model** (`kura/embedding.py`): Converts text into vector representations (embeddings) 92 | 3. **Clustering Model** (`kura/cluster.py`): Groups summaries into clusters based on embeddings 93 | 4. **Meta Clustering Model** (`kura/meta_cluster.py`): Further groups clusters into a hierarchical structure (Note: `max_clusters` parameter now lives here, not in the main Kura class) 94 | 5. **Dimensionality Reduction** (`kura/dimensionality.py`): Reduces high-dimensional embeddings for visualization 95 | 96 | ### Data Flow 97 | 98 | 1. Raw conversations are loaded 99 | 2. Conversations are summarized 100 | 3. Summaries are embedded and clustered 101 | 4. Base clusters are reduced to meta-clusters 102 | 5. Dimensionality reduction is applied for visualization 103 | 6. Results are saved as checkpoints for persistence 104 | 105 | ### Key Classes 106 | 107 | - `Kura` (`kura/kura.py`): Main class that orchestrates the entire pipeline 108 | - `BaseEmbeddingModel` / `OpenAIEmbeddingModel` (`kura/embedding.py`): Handle text embedding 109 | - `BaseSummaryModel` / `SummaryModel` (`kura/summarisation.py`): Summarize conversations 110 | - `BaseClusterModel` / `ClusterModel` (`kura/cluster.py`): Create initial clusters 111 | - `BaseMetaClusterModel` / `MetaClusterModel` (`kura/meta_cluster.py`): Reduce clusters into hierarchical groups 112 | - `BaseDimensionalityReduction` / `HDBUMAP` (`kura/dimensionality.py`): Reduce dimensions for visualization 113 | - `Conversation` (`kura/types/conversation.py`): Core data model for user conversations 114 | 115 | ### UI Components 116 | 117 | The project includes a React/TypeScript frontend for visualizing the clusters, with components for: 118 | - Displaying cluster maps (`ui/src/components/cluster-map.tsx`) 119 | - Showing cluster details (`ui/src/components/cluster-details.tsx`) 120 | - Visualizing cluster hierarchies (`ui/src/components/cluster-tree.tsx`) 121 | - Handling conversation uploads (`ui/src/components/upload-form.tsx`) 122 | - Displaying individual conversations (`ui/src/components/conversation-dialog.tsx`) 123 | 124 | ### Extensibility 125 | 126 | The system is designed to be modular, allowing custom implementations of: 127 | - Embedding models 128 | - Summarization models 129 | - Clustering algorithms 130 | - Dimensionality reduction techniques 131 | 132 | ## Working with Metadata 133 | 134 | Kura supports two types of metadata for enriching conversation analysis: 135 | 136 | ### 1. LLM Extractors 137 | Custom metadata can be extracted from conversations using LLM-powered extractors (implemented in `kura/summarisation.py`). 
These functions run on raw conversations to identify properties like: 138 | - Language detection 139 | - Sentiment analysis 140 | - Topic identification 141 | - Custom metrics 142 | 143 | Example of creating a custom extractor: 144 | ```python 145 | async def language_extractor( 146 | conversation: Conversation, 147 | sems: dict[str, asyncio.Semaphore], 148 | clients: dict[str, instructor.AsyncInstructor], 149 | ) -> ExtractedProperty: 150 | sem = sems.get("default") 151 | client = clients.get("default") 152 | 153 | async with sem: 154 | resp = await client.chat.completions.create( 155 | model="gemini-2.0-flash", 156 | messages=[ 157 | { 158 | "role": "system", 159 | "content": "Extract the language of this conversation.", 160 | }, 161 | { 162 | "role": "user", 163 | "content": "\n".join( 164 | [f"{msg.role}: {msg.content}" for msg in conversation.messages] 165 | ), 166 | }, 167 | ], 168 | response_model=Language, 169 | ) 170 | return ExtractedProperty( 171 | name="language_code", 172 | value=resp.language_code, 173 | ) 174 | ``` 175 | 176 | ### 2. Conversation Metadata 177 | Metadata can be directly attached to conversation objects when loading data (implemented in `kura/types/conversation.py`): 178 | ```python 179 | conversations = Conversation.from_hf_dataset( 180 | "allenai/WildChat-nontoxic", 181 | metadata_fn=lambda x: { 182 | "model": x["model"], 183 | "toxic": x["toxic"], 184 | "redacted": x["redacted"], 185 | }, 186 | ) 187 | ``` 188 | 189 | ## Loading Data 190 | 191 | Kura supports multiple data sources (implementations in `kura/types/conversation.py`): 192 | 193 | ### Claude Conversation History 194 | ```python 195 | from kura.types import Conversation 196 | conversations = Conversation.from_claude_conversation_dump("conversations.json") 197 | ``` 198 | 199 | ### Hugging Face Datasets 200 | ```python 201 | from kura.types import Conversation 202 | conversations = Conversation.from_hf_dataset( 203 | "ivanleomk/synthetic-gemini-conversations", 204 | split="train" 205 | ) 206 | ``` 207 | 208 | ### Custom Conversations 209 | For custom data formats, create Conversation objects directly: 210 | ```python 211 | from kura.types import Conversation, Message 212 | from datetime import datetime 213 | from uuid import uuid4 214 | 215 | conversations = [ 216 | Conversation( 217 | messages=[ 218 | Message( 219 | created_at=str(datetime.now()), 220 | role=message["role"], 221 | content=message["content"], 222 | ) 223 | for message in raw_messages 224 | ], 225 | id=str(uuid4()), 226 | created_at=datetime.now(), 227 | ) 228 | ] 229 | ``` 230 | 231 | ## Checkpoints 232 | 233 | Kura uses checkpoint files to save state between runs (checkpoint handling in `kura/kura.py`): 234 | - `conversations.json`: Raw conversation data 235 | - `summaries.jsonl`: Summarized conversations 236 | - `clusters.jsonl`: Base cluster data 237 | - `meta_clusters.jsonl`: Hierarchical cluster data 238 | - `dimensionality.jsonl`: Projected cluster data for visualization 239 | 240 | Checkpoints are stored in the directory specified by the `checkpoint_dir` parameter (default: `./checkpoints`). 
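A minimal sketch of inspecting a saved checkpoint outside the pipeline (this assumes `clusters.jsonl` is present in the checkpoint directory and that `Cluster` can be imported from `kura.types`, mirroring `kura/types/cluster.py`):

```python
import json
from pathlib import Path

from kura.types import Cluster  # assumed import path, mirroring kura/types/cluster.py

checkpoint_dir = Path("./checkpoints")

# Each checkpoint line is a JSON object; validate it against the Cluster model
clusters = [
    Cluster(**json.loads(line))
    for line in (checkpoint_dir / "clusters.jsonl").read_text().splitlines()
    if line.strip()
]

for cluster in clusters[:5]:
    print(cluster.name, len(cluster.chat_ids))
```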
241 | 242 | ## Visualization 243 | 244 | Kura includes visualization tools: 245 | 246 | ### CLI Visualization 247 | ```python 248 | # Tree visualization implemented in kura/kura.py 249 | kura.visualise_clusters() 250 | ``` 251 | 252 | ### Web Server 253 | ```bash 254 | # Web server implemented in kura/cli/server.py 255 | kura start-app 256 | # Access at http://localhost:8000 257 | ``` 258 | 259 | The web interface provides: 260 | - Interactive cluster map 261 | - Cluster hierarchy tree 262 | - Cluster details panel 263 | - Conversation preview 264 | - Metadata filtering 265 | 266 | ## Procedural API (v1) 267 | 268 | The procedural API in `kura/v1/` provides a functional approach to the pipeline: 269 | 270 | ### Key Functions 271 | - `summarise_conversations(conversations, *, model, checkpoint_manager=None)` - Generate summaries 272 | - `generate_base_clusters_from_conversation_summaries(summaries, *, model, checkpoint_manager=None)` - Create initial clusters 273 | - `reduce_clusters_from_base_clusters(clusters, *, model, checkpoint_manager=None)` - Build hierarchy 274 | - `reduce_dimensionality_from_clusters(clusters, *, model, checkpoint_manager=None)` - Project to 2D 275 | 276 | ### Example Usage 277 | ```python 278 | from kura import ( 279 | summarise_conversations, 280 | generate_base_clusters_from_conversation_summaries, 281 | reduce_clusters_from_base_clusters, 282 | reduce_dimensionality_from_clusters, 283 | CheckpointManager 284 | ) 285 | 286 | # Run pipeline with explicit steps 287 | checkpoint_mgr = CheckpointManager("./checkpoints", enabled=True) 288 | 289 | summaries = await summarise_conversations( 290 | conversations, 291 | model=summary_model, 292 | checkpoint_manager=checkpoint_mgr 293 | ) 294 | 295 | clusters = await generate_base_clusters_from_conversation_summaries( 296 | summaries, 297 | model=cluster_model, 298 | checkpoint_manager=checkpoint_mgr 299 | ) 300 | # ... continue with remaining steps 301 | ``` 302 | 303 | ### Benefits 304 | - Fine-grained control over each step 305 | - Easy to skip or reorder steps 306 | - Support for heterogeneous models (OpenAI, vLLM, Hugging Face, etc.) 307 | - Functional programming style with no hidden state 308 | - All functions use keyword-only arguments for clarity -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Kura 2 | 3 | Thank you for your interest in contributing to Kura! This document provides guidelines and information to help you contribute effectively. 4 | 5 | ## Setting Up the Development Environment 6 | 7 | 1. Create and activate a virtual environment using uv: 8 | ```bash 9 | uv venv 10 | source .venv/bin/activate # On Windows: .venv\Scripts\activate 11 | ``` 12 | 13 | 2. Install the package in development mode with dev dependencies: 14 | ```bash 15 | uv pip install -e ".[dev]" 16 | ``` 17 | 18 | ## Testing 19 | 20 | Kura uses pytest for testing. The current test suite primarily focuses on the meta-clustering functionality. 21 | 22 | ### Quick Start Testing 23 | 24 | To quickly test the full Kura pipeline and UI: 25 | 26 | 1. 
**Run the tutorial test** to generate sample data: 27 | 28 | For the procedural API (recommended for understanding the pipeline): 29 | ```bash 30 | uv run python scripts/tutorial_procedural_api.py 31 | ``` 32 | 33 | For the class-based API (simpler to use): 34 | ```bash 35 | uv run python scripts/tutorial_class_api.py 36 | ``` 37 | 38 | Either tutorial will: 39 | - Import all Kura modules 40 | - Load 190 sample conversations from Hugging Face 41 | - Process and cluster the conversations 42 | - Generate 29 hierarchical clusters organized into 10 root categories 43 | - Generate visualization data 44 | - Save results to `./tutorial_checkpoints/` (procedural) or `./tutorial_checkpoints_class/` (class-based) 45 | 46 | Note: This process may take a few minutes depending on your system and API rate limits. 47 | 48 | 2. **Test the UI** after running the tutorial: 49 | 50 | For procedural API results: 51 | ```bash 52 | kura start-app --dir ./tutorial_checkpoints 53 | ``` 54 | 55 | For class-based API results: 56 | ```bash 57 | kura start-app --dir ./tutorial_checkpoints_class 58 | ``` 59 | 60 | This will: 61 | - Start the backend API and frontend on http://localhost:8000 62 | - Use the data from the checkpoint directory generated by the tutorial test 63 | - Display the cluster map, tree view, and detailed cluster information 64 | 65 | Note: The UI may take a moment to fully load as it processes the cluster data. 66 | 67 | ### Running Unit Tests 68 | 69 | ```bash 70 | # Run all tests 71 | pytest 72 | 73 | # Run a specific test file 74 | pytest tests/test_meta_cluster.py 75 | 76 | # Run a specific test 77 | pytest tests/test_meta_cluster.py::test_cluster_label_exact_match 78 | ``` 79 | 80 | ### Test Structure 81 | 82 | Tests are located in the `tests/` directory. The current tests verify: 83 | 84 | - **Exact match functionality**: Tests that `ClusterLabel` correctly validates when there's an exact match between input and candidate clusters. 85 | - **Fuzzy matching**: Tests that similar but not identical strings can be matched using fuzzy matching with an appropriate threshold. 86 | - **Validation errors**: Tests that the system properly rejects inputs that don't match any candidates. 87 | 88 | ### Writing New Tests 89 | 90 | When adding new features or fixing bugs, please include appropriate tests. Follow these guidelines: 91 | 92 | 1. Create test files with the `test_` prefix 93 | 2. Write test functions with descriptive names and docstrings 94 | 3. Use pytest fixtures when appropriate 95 | 4. Use assertions to verify expected behavior 96 | 5. 
Test both the class-based API and procedural API where applicable 97 | 98 | ### Example: Testing with the Procedural API 99 | 100 | ```python 101 | import pytest 102 | import asyncio 103 | from kura.v1 import ( 104 | summarise_conversations, 105 | generate_base_clusters_from_conversation_summaries, 106 | CheckpointManager 107 | ) 108 | from kura.summarisation import SummaryModel 109 | from kura.cluster import ClusterModel 110 | from kura.types import Conversation 111 | 112 | @pytest.mark.asyncio 113 | async def test_procedural_pipeline(): 114 | # Load test conversations 115 | conversations = Conversation.from_hf_dataset( 116 | "ivanleomk/synthetic-gemini-conversations", 117 | split="train[:10]" # Use only 10 for testing 118 | ) 119 | 120 | # Initialize models 121 | summary_model = SummaryModel() 122 | cluster_model = ClusterModel() 123 | 124 | # Run pipeline steps 125 | summaries = await summarise_conversations( 126 | conversations, 127 | model=summary_model, 128 | checkpoint_manager=None # No checkpointing for tests 129 | ) 130 | 131 | clusters = await generate_base_clusters_from_conversation_summaries( 132 | summaries, 133 | model=cluster_model, 134 | checkpoint_manager=None 135 | ) 136 | 137 | # Assertions 138 | assert len(summaries) == 10 139 | assert len(clusters) > 0 140 | assert all(cluster.label for cluster in clusters) 141 | ``` 142 | 143 | ## Type Checking 144 | 145 | Kura uses pyright for type checking: 146 | 147 | ```bash 148 | pyright 149 | ``` 150 | 151 | ## Documentation 152 | 153 | To work on documentation: 154 | 155 | 1. Install documentation dependencies: 156 | ```bash 157 | uv pip install -e ".[docs]" 158 | ``` 159 | 160 | 2. Serve documentation locally: 161 | ```bash 162 | mkdocs serve 163 | ``` 164 | 165 | ## Code Style 166 | 167 | - Follow PEP 8 guidelines for Python code 168 | - Use type hints for all function parameters and return values 169 | - Write docstrings for all public classes and functions 170 | 171 | ## Pull Request Process 172 | 173 | 1. Fork the repository 174 | 2. Create a feature branch 175 | 3. Add tests for your changes 176 | 4. Ensure all tests pass 177 | 5. Update documentation as needed 178 | 6. Submit a pull request 179 | 180 | ## UI Development 181 | 182 | If you're working on the UI: 183 | 184 | ```bash 185 | # Navigate to UI directory 186 | cd ui 187 | 188 | # Install dependencies 189 | npm install 190 | 191 | # Start development server 192 | npm run dev 193 | 194 | # Build for production 195 | npm run build 196 | 197 | # Lint code 198 | npm run lint 199 | ``` 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kura: Procedural API for Chat Data Analysis 2 | 3 |  4 | 5 | [](https://pypi.org/project/kura/) 6 | [](https://github.com/567-labs/kura/stargazers) 7 | [](https://567-labs.github.io/kura/) 8 | [](https://opensource.org/licenses/MIT) 9 | [](https://pypi.org/project/kura/) 10 | [](https://pypi.org/project/kura/) 11 | 12 | Kura is an open-source library for understanding chat data through machine learning, inspired by [Anthropic's CLIO](https://www.anthropic.com/research/clio). It provides a functional, composable API for clustering conversations to discover patterns and insights. 13 | 14 | ## Why Analyze Conversation Data? 15 | 16 | As AI assistants and chatbots become increasingly central to product experiences, understanding how users interact with these systems at scale becomes a critical challenge. 
Manually reviewing thousands of conversations is impractical, yet crucial patterns and user needs often remain hidden in this data. 17 | 18 | Kura addresses this challenge by: 19 | 20 | - **Revealing user intent patterns** that may not be obvious from individual conversations 21 | - **Identifying common user needs** to prioritize feature development 22 | - **Discovering edge cases and failures** that require attention 23 | - **Tracking usage trends** over time as your product evolves 24 | - **Informing prompt engineering** by highlighting successful and problematic interactions 25 | 26 | By clustering similar conversations and providing intuitive visualizations, Kura transforms raw chat data into actionable insights without compromising user privacy. 27 | 28 | ## Installation 29 | 30 | ```bash 31 | uv pip install kura 32 | ``` 33 | 34 | ## Quick Start 35 | 36 | ```python 37 | import asyncio 38 | from rich.console import Console 39 | from kura import ( 40 | summarise_conversations, 41 | generate_base_clusters_from_conversation_summaries, 42 | reduce_clusters_from_base_clusters, 43 | reduce_dimensionality_from_clusters, 44 | CheckpointManager, 45 | ) 46 | from kura.visualization import visualise_pipeline_results 47 | from kura.types import Conversation 48 | from kura.summarisation import SummaryModel 49 | from kura.cluster import ClusterModel 50 | from kura.meta_cluster import MetaClusterModel 51 | from kura.dimensionality import HDBUMAP 52 | 53 | async def main(): 54 | # Initialize models 55 | console = Console() 56 | summary_model = SummaryModel(console=console) 57 | cluster_model = ClusterModel(console=console) 58 | meta_cluster_model = MetaClusterModel(console=console) 59 | dimensionality_model = HDBUMAP() 60 | 61 | # Set up checkpointing to save intermediate results 62 | checkpoint_manager = CheckpointManager("./checkpoints", enabled=True) 63 | 64 | # Load conversations from Hugging Face dataset 65 | conversations = Conversation.from_hf_dataset( 66 | "ivanleomk/synthetic-gemini-conversations", 67 | split="train" 68 | ) 69 | 70 | # Process through the pipeline step by step 71 | summaries = await summarise_conversations( 72 | conversations, 73 | model=summary_model, 74 | checkpoint_manager=checkpoint_manager 75 | ) 76 | 77 | clusters = await generate_base_clusters_from_conversation_summaries( 78 | summaries, 79 | model=cluster_model, 80 | checkpoint_manager=checkpoint_manager 81 | ) 82 | 83 | reduced_clusters = await reduce_clusters_from_base_clusters( 84 | clusters, 85 | model=meta_cluster_model, 86 | checkpoint_manager=checkpoint_manager 87 | ) 88 | 89 | projected_clusters = await reduce_dimensionality_from_clusters( 90 | reduced_clusters, 91 | model=dimensionality_model, 92 | checkpoint_manager=checkpoint_manager, 93 | ) 94 | 95 | # Visualize results 96 | visualise_pipeline_results(reduced_clusters, style="enhanced") 97 | 98 | print(f"\nProcessed {len(conversations)} conversations") 99 | print(f"Created {len(reduced_clusters)} meta clusters") 100 | print(f"Checkpoints saved to: {checkpoint_manager.checkpoint_dir}") 101 | 102 | if __name__ == "__main__": 103 | asyncio.run(main()) 104 | ``` 105 | 106 | This example will: 107 | 108 | 1. Load 190 synthetic programming conversations from Hugging Face 109 | 2. Process them through the complete analysis pipeline step by step 110 | 3. Generate hierarchical clusters organized into categories 111 | 4. 
Display the results with enhanced visualization 112 | 113 | ## Key Design Principles 114 | 115 | Kura follows a function-based architecture where pipeline functions orchestrate the execution while models handle the core logic. Each function is designed with explicit inputs/outputs and no hidden state, working with any model that implements the required interface. The system supports various model types through polymorphic interfaces - from OpenAI to local models for summarization, different clustering algorithms, and various dimensionality reduction techniques. 116 | 117 | Data can be loaded from multiple sources including Claude conversation history (`Conversation.from_claude_conversation_dump()`) and Hugging Face datasets (`Conversation.from_hf_dataset()`). The example uses a dataset of 190 synthetic programming conversations that form natural clusters across technical topics. 118 | 119 | The pipeline architecture processes data through sequential stages: loading, summarization, embedding, base clustering, meta-clustering, and dimensionality reduction. All progress is automatically saved using checkpoints, and the system can be extended by implementing custom versions of any component model. 120 | 121 | ## Documentation 122 | 123 | - **Getting Started** 124 | 125 | - [Installation Guide](https://567-labs.github.io/kura/getting-started/installation/) 126 | - [Quickstart Guide](https://567-labs.github.io/kura/getting-started/quickstart/) 127 | 128 | - **Core Concepts** 129 | 130 | - [Conversations](https://567-labs.github.io/kura/core-concepts/conversations/) 131 | - [Embedding](https://567-labs.github.io/kura/core-concepts/embedding/) 132 | - [Clustering](https://567-labs.github.io/kura/core-concepts/clustering/) 133 | - [Summarization](https://567-labs.github.io/kura/core-concepts/summarization/) 134 | - [Meta-Clustering](https://567-labs.github.io/kura/core-concepts/meta-clustering/) 135 | - [Dimensionality Reduction](https://567-labs.github.io/kura/core-concepts/dimensionality-reduction/) 136 | 137 | - **API Reference** 138 | - [Procedural API Documentation](https://567-labs.github.io/kura/api/) 139 | 140 | ## Comparison with Similar Tools 141 | 142 | | Feature | Kura | Traditional Analytics | Manual Review | Generic Clustering | 143 | | ---------------------- | ------------------------------------- | ------------------------------ | ---------------------- | ------------------------ | 144 | | Semantic Understanding | ✅ Uses LLMs for deep understanding | ❌ Limited to keywords | ✅ Human understanding | ⚠️ Basic similarity only | 145 | | Scalability | ✅ Handles thousands of conversations | ✅ Highly scalable | ❌ Time intensive | ✅ Works at scale | 146 | | Visualization | ✅ Interactive UI | ⚠️ Basic charts | ❌ Manual effort | ⚠️ Generic plots | 147 | | Hierarchy Discovery | ✅ Meta-clustering feature | ❌ Flat categories | ⚠️ Subjective grouping | ❌ Typically flat | 148 | | Extensibility | ✅ Custom models and extractors | ⚠️ Limited customization | ✅ Flexible but manual | ⚠️ Some algorithms | 149 | | Privacy | ✅ Self-hosted option | ⚠️ Often requires data sharing | ✅ Can be private | ✅ Can be private | 150 | 151 | ## Future Roadmap 152 | 153 | Kura is actively evolving with plans to add: 154 | 155 | - **Enhanced Topic Modeling**: More sophisticated detection of themes across conversations 156 | - **Temporal Analysis**: Tracking how conversation patterns evolve over time 157 | - **Advanced Visualizations**: Additional visual representations of conversation data 158 | - **Data Connectors**: 
More integrations with popular conversation data sources 159 | - **Multi-modal Support**: Analysis of conversations that include images and other media 160 | - **Export Capabilities**: Enhanced formats for sharing and presenting findings 161 | 162 | ## Testing 163 | 164 | To quickly test Kura and see it in action: 165 | 166 | ```bash 167 | uv run python scripts/tutorial_procedural_api.py 168 | ``` 169 | 170 | Expected output: 171 | 172 | ```text 173 | Loaded 190 conversations successfully! 174 | 175 | ============================================================ 176 | Conversation Processing 177 | ============================================================ 178 | 179 | Starting conversation clustering... 180 | Step 1: Generating conversation summaries... 181 | Generated 190 summaries 182 | Step 2: Generating base clusters from summaries... 183 | Generated 19 base clusters 184 | Step 3: Reducing clusters hierarchically... 185 | Reduced to 29 meta clusters 186 | Step 4: Projecting clusters to 2D for visualization... 187 | Generated 29 projected clusters 188 | 189 | Pipeline complete! Generated 29 projected clusters! 190 | 191 | Processing Summary: 192 | • Input conversations: 190 193 | • Final reduced clusters: 29 194 | • Final projected clusters: 29 195 | • Checkpoints saved to: ./tutorial_checkpoints 196 | ``` 197 | 198 | This will: 199 | 200 | - Load 190 sample conversations from Hugging Face 201 | - Process them through the complete pipeline 202 | - Generate 29 hierarchical clusters organized into 10 root categories 203 | - Save checkpoints to `./tutorial_checkpoints` 204 | 205 | ## Development 206 | 207 | See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, testing, and contribution guidelines. 208 | 209 | ## License 210 | 211 | [MIT License](LICENSE) 212 | 213 | ## About 214 | 215 | Kura is under active development. If you face any issues or have suggestions, please feel free to [open an issue](https://github.com/567-labs/kura/issues) or a PR. For more details on the technical implementation, check out this [walkthrough of the code](https://ivanleo.com/blog/understanding-user-conversations). 216 | -------------------------------------------------------------------------------- /docs/api/index.md: -------------------------------------------------------------------------------- 1 | # API Reference 2 | 3 | This section provides detailed API reference documentation for the Kura package, automatically generated from the source code using mkdocstrings. 4 | 5 | ## How to Use This Reference 6 | 7 | The API reference is organized by module, with each module containing related classes and functions. For each class, you'll find: 8 | 9 | - Constructor parameters and their descriptions 10 | - Instance methods with parameter details and return types 11 | - Properties and attributes 12 | 13 | To use these classes in your code, import them from their respective modules: 14 | 15 | ```python 16 | from kura import Kura 17 | from kura.embedding import OpenAIEmbeddingModel 18 | from kura.summarisation import SummaryModel 19 | # And so on... 20 | ``` 21 | 22 | ## Core Classes 23 | 24 | ## Procedural API 25 | 26 | The procedural API provides a functional approach to conversation analysis with composable pipeline functions. 
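Before the per-function reference below, here is a minimal sketch of how the first two pipeline functions compose; the model classes stand in for any implementation of the corresponding base interfaces:

```python
from kura import (
    summarise_conversations,
    generate_base_clusters_from_conversation_summaries,
    CheckpointManager,
)
from kura.summarisation import SummaryModel
from kura.cluster import ClusterModel
from kura.types import Conversation


async def run_pipeline(conversations: list[Conversation]):
    checkpoints = CheckpointManager("./checkpoints", enabled=True)

    # Step 1: summarize each conversation
    summaries = await summarise_conversations(
        conversations, model=SummaryModel(), checkpoint_manager=checkpoints
    )

    # Step 2: group summaries into base clusters
    return await generate_base_clusters_from_conversation_summaries(
        summaries, model=ClusterModel(), checkpoint_manager=checkpoints
    )
```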
27 | 28 | ### Pipeline Functions 29 | 30 | ::: kura.summarise_conversations 31 | 32 | ::: kura.generate_base_clusters_from_conversation_summaries 33 | 34 | ::: kura.reduce_clusters_from_base_clusters 35 | 36 | ::: kura.reduce_dimensionality_from_clusters 37 | 38 | ### Checkpoint Management 39 | 40 | ::: kura.CheckpointManager 41 | 42 | ## Implementation Classes 43 | 44 | ### Embedding Models 45 | 46 | ::: kura.embedding 47 | 48 | ### Summarization 49 | 50 | ::: kura.summarisation 51 | 52 | ### Clustering 53 | 54 | ::: kura.cluster 55 | 56 | ### Meta-Clustering 57 | 58 | ::: kura.meta_cluster 59 | 60 | ### Dimensionality Reduction 61 | 62 | ::: kura.dimensionality 63 | -------------------------------------------------------------------------------- /docs/assets/images/cluster-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/cluster-details.png -------------------------------------------------------------------------------- /docs/assets/images/cluster-map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/cluster-map.png -------------------------------------------------------------------------------- /docs/assets/images/cluster-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/cluster-tree.png -------------------------------------------------------------------------------- /docs/assets/images/conversation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/conversation.png -------------------------------------------------------------------------------- /docs/assets/images/kura-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/docs/assets/images/kura-architecture.png -------------------------------------------------------------------------------- /docs/blog/index.md: -------------------------------------------------------------------------------- 1 | Here are articles that we've written to show you how to work with Kura. 2 | -------------------------------------------------------------------------------- /docs/blog/posts/kura-0-5-0-release.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Kura v0.5.0 Released - Procedural API, Better Docs & More 3 | date: 2025-05-29 4 | categories: 5 | - Kura 6 | - Release 7 | --- 8 | 9 | # Kura v0.5.0 Released 10 | 11 | We're excited to announce the release of Kura v0.5.0! This release brings significant improvements to documentation, introduces a new procedural API for maximum flexibility, and includes numerous enhancements to make Kura even better for analyzing conversation data. 
12 | 13 | ## What's New in v0.5.0 14 | 15 | ### New Procedural API (v1) 16 | 17 | The headline feature of this release is the introduction of a functional, procedural API that gives you fine-grained control over the analysis pipeline: 18 | 19 | ```python 20 | from kura import ( 21 | summarise_conversations, 22 | generate_base_clusters_from_conversation_summaries, 23 | reduce_clusters_from_base_clusters, 24 | reduce_dimensionality_from_clusters, 25 | ) 26 | 27 | # Run each step independently 28 | summaries = await summarise_conversations(conversations, model=summary_model) 29 | clusters = await generate_base_clusters_from_conversation_summaries(summaries, model=cluster_model) 30 | meta_clusters = await reduce_clusters_from_base_clusters(clusters, model=meta_cluster_model) 31 | projected = await reduce_dimensionality_from_clusters(meta_clusters, model=dim_reduction_model) 32 | ``` 33 | 34 | This new API offers: 35 | - Complete control over each pipeline step 36 | - Easy integration with heterogeneous models (OpenAI, vLLM, Hugging Face) 37 | - Functional programming style with no hidden state 38 | - Keyword-only arguments for clarity 39 | 40 | 41 | 42 | 43 | ### Enhanced Documentation 44 | 45 | We've made major improvements to our documentation: 46 | 47 | - **API Reference**: Now generated with mkdocstrings for always up-to-date documentation 48 | - **CLAUDE.md**: Repository guidance for AI assistants working with the codebase 49 | - **CONTRIBUTING.md**: Clear guidelines for contributors with testing and UV setup 50 | - **Better Examples**: Added context about real datasets like the ivanleomk dataset 51 | 52 | ### Technical Improvements 53 | 54 | #### Refactored Architecture 55 | - Extracted visualization logic into separate modules for better maintainability 56 | - Moved `max_clusters` parameter from Kura to MetaClusterModel where it belongs 57 | - Implemented lazy imports for UMap to improve startup time 58 | - Simplified embedding extensibility by replacing `embed_text()` with `__repr__()` 59 | 60 | #### Enhanced Cluster Visualization 61 | - Added slug field to cluster models for better identification 62 | - Improved cluster visualization with more meaningful labels 63 | - Better support for cluster hierarchies in the UI 64 | 65 | #### Developer Experience 66 | - Added Ruff workflows and pre-commit hooks for consistent code quality 67 | - Fixed numerous type checking bugs 68 | - Improved Summary class implementation 69 | - Better error messages and debugging support 70 | 71 | ## Breaking Changes 72 | 73 | While we've tried to maintain backward compatibility, please note: 74 | - The `max_clusters` parameter has moved from the main Kura class to MetaClusterModel 75 | - Some internal APIs have been refactored for the new procedural approach 76 | 77 | ## What's Next 78 | 79 | We're already working on the next release with plans for: 80 | 81 | - More embedding model integrations 82 | - Enhanced meta-clustering algorithms 83 | - Performance optimizations for large datasets 84 | - Additional visualization options 85 | 86 | ## Feedback Welcome! 87 | 88 | We'd love to hear your thoughts on this release. 
Please: 89 | 90 | - Report issues on [GitHub](https://github.com/567-labs/kura/issues) 91 | - Join the discussion in [GitHub Discussions](https://github.com/567-labs/kura/discussions) 92 | - Share your use cases and success stories 93 | -------------------------------------------------------------------------------- /docs/blog/posts/new-documentation-release.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: New Documentation Release - We're Open to Feedback 3 | date: 2025-05-16 4 | categories: 5 | - Kura 6 | - Documentation 7 | --- 8 | 9 | # New Documentation Release 10 | 11 | We're excited to announce a comprehensive overhaul of the Kura documentation! The new documentation is designed to help users get started quickly and make the most of Kura's powerful features for analyzing conversation data. 12 | 13 | ## What's New 14 | 15 | Our documentation has been completely reorganized and expanded to provide a better experience: 16 | 17 | - **Clear Structure**: New organization with dedicated sections for Getting Started, Core Concepts, and API Reference 18 | - **Comprehensive Installation Guide**: Detailed instructions for different installation methods, including both `uv` and `pip` 19 | - **In-depth Tutorials**: Step-by-step guides in our Getting Started section 20 | - **Enriched API Reference**: Better organized and more detailed API documentation 21 | - **Core Concepts Explained**: Detailed explanations of Kura's architecture and components 22 | - **Improved Code Examples**: Concise, practical examples throughout 23 | 24 | 25 | 26 | ## Getting Started 27 | 28 | If you're new to Kura, we recommend starting with: 29 | 30 | 1. [Installation Guide](../../getting-started/installation.md) 31 | 2. [Quickstart Guide](../../getting-started/quickstart.md) 32 | 3. [Core Concepts Overview](../../core-concepts/overview.md) 33 | 34 | ## We Want Your Feedback! 35 | 36 | Documentation is only useful if it answers your questions. We're actively seeking feedback on the new documentation: 37 | 38 | - Is anything unclear or confusing? 39 | - Are there missing topics you'd like to see covered? 40 | - Did you find any errors or inconsistencies? 41 | - What would make your experience better? 42 | 43 | Please share your thoughts by: 44 | 45 | - Opening an issue on [GitHub](https://github.com/567-labs/kura/issues/new?labels=documentation) 46 | - Starting a discussion in our [GitHub Discussions](https://github.com/567-labs/kura/discussions) 47 | - Reaching out to [Jason](https://twitter.com/jxnl) or [Ivan](https://x.com/ivanleomk) 48 | 49 | ## What's Next 50 | 51 | This documentation release is just the beginning. We're planning to: 52 | 53 | - Add more real-world examples and use cases 54 | - Develop video tutorials 55 | - Expand the API reference with more details 56 | - Create a cookbook of common patterns and techniques 57 | 58 | Stay tuned for more updates, and don't hesitate to let us know what you'd like to see next! 59 | 60 | --- 61 | 62 | Thank you for using Kura. We're committed to building not just a great tool, but also great documentation to help you succeed with it. 63 | -------------------------------------------------------------------------------- /docs/core-concepts/clustering.md: -------------------------------------------------------------------------------- 1 | # Clustering 2 | 3 | Kura's clustering pipeline groups similar conversation summaries into meaningful clusters. 
This process is fundamental for large-scale analysis, enabling the discovery of dominant themes, understanding diverse user intents, and surfacing potentially "unknown unknown" patterns from vast quantities of conversational data. Clustering follows summarization and embedding in the Kura pipeline. 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | **Clustering** in Kura organizes `ConversationSummary` objects (see [Summarization](summarization.md)) into groups based on semantic similarity. Each resulting cluster is assigned a descriptive name and a concise summary, making it easier to interpret the primary topics and user requests within the dataset. This bottom-up approach to pattern discovery is crucial for making sense of and navigating large collections of conversations. 10 | 11 | - **Input:** A list of `ConversationSummary` objects (with or without embeddings) 12 | - **Output:** A list of `Cluster` objects, each with a name, description, and associated conversation IDs 13 | 14 | Clustering enables downstream tasks such as: 15 | - Identifying and monitoring prevalent topics or user needs 16 | - Visualizing trends and thematic structures in the data 17 | - Facilitating efficient exploratory search and retrieval of related conversations 18 | - Providing a foundation for hierarchical topic modeling through [Meta-Clustering](meta-clustering.md) 19 | 20 | --- 21 | 22 | ## The Clustering Model 23 | 24 | Kura's main clustering logic is implemented in the `ClusterModel` class (see `kura/cluster.py`). This class orchestrates the embedding, grouping, and labeling of conversation summaries. 25 | 26 | ### Key Components 27 | 28 | - **Clustering Method:** Determines how summaries are grouped (default: K-means, see `KmeansClusteringMethod`) 29 | - **Embedding Model:** Used to convert summaries to vectors if not already embedded (default: `OpenAIEmbeddingModel`) 30 | - **Cluster Naming:** Uses an LLM to generate a descriptive name and summary for each cluster, distinguishing it from others 31 | 32 | #### Example: ClusterModel Initialization 33 | 34 | ```python 35 | model = ClusterModel( 36 | clustering_method=KmeansClusteringMethod(), 37 | embedding_model=OpenAIEmbeddingModel(), 38 | max_concurrent_requests=50, 39 | model="openai/gpt-4o-mini", 40 | ) 41 | ``` 42 | 43 | --- 44 | 45 | ## Clustering Pipeline 46 | 47 | The clustering process consists of several steps: 48 | 49 | 1. **Embedding Summaries:** 50 | - If summaries do not already have embeddings, the model uses the configured embedding model to generate them. 51 | - Embedding is performed in batches and can be parallelized for efficiency. 52 | 53 | ```python 54 | embeddings = await self.embedding_model.embed([str(item) for item in summaries]) 55 | ``` 56 | 57 | 2. **Grouping Summaries:** 58 | - The clustering method (e.g., K-means) groups summaries based on their embeddings. 59 | - Each group is assigned a cluster ID. 60 | 61 | ```python 62 | cluster_id_to_summaries = self.clustering_method.cluster(items_with_embeddings) 63 | ``` 64 | 65 | 3. **Generating Cluster Names and Descriptions:** 66 | - For each cluster, an LLM is prompted to generate a concise, two-sentence summary and a short, imperative cluster name. 67 | - The prompt includes both positive examples (summaries in the cluster) and contrastive examples (summaries from other clusters). Contrastive examples are crucial: they guide the LLM to produce highly specific and distinguishing names/descriptions, preventing overly generic labels and ensuring each cluster's unique essence is captured. 
68 | 69 | ```python 70 | cluster = await self.generate_cluster(summaries, contrastive_examples) 71 | # Returns a Cluster object with name, description, and chat_ids 72 | ``` 73 | 74 | 4. **Output:** 75 | - The result is a list of `Cluster` objects, each containing: 76 | - `name`: Imperative sentence capturing the main request/theme 77 | - `description`: Two-sentence summary of the cluster 78 | - `chat_ids`: List of conversation IDs in the cluster 79 | 80 | --- 81 | 82 | ## Cluster Naming and Description Generation 83 | 84 | Cluster names and descriptions are generated using a large language model (LLM) with a carefully crafted prompt. The prompt: 85 | - Instructs the LLM to summarize the group in two sentences (past tense) 86 | - Requires the name to be an imperative sentence (e.g., "Help me debug Python code") 87 | - Provides contrastive examples to ensure the name/summary is specific, distinct, and accurately reflects the cluster's content compared to others. 88 | - Encourages specificity, especially for sensitive or harmful topics 89 | - Reinforces privacy by instructing the LLM to avoid including any Personally Identifiable Information (PII) or proper nouns in the generated cluster names and descriptions, complementing the PII removal in the initial summarization phase. 90 | 91 | **Prompt excerpt:** 92 | 93 | ``` 94 | Summarize all the statements into a clear, precise, two-sentence description in the past tense. ... 95 | After creating the summary, generate a short name for the group of statements. This name should be at most ten words long ... 96 | The cluster name should be a sentence in the imperative that captures the user's request. ... 97 | ``` 98 | 99 | --- 100 | 101 | ## Configuration and Extensibility 102 | 103 | - **Clustering Method:** Swap out `KmeansClusteringMethod` for other algorithms by implementing the `BaseClusteringMethod` interface. 104 | - **Embedding Model:** Use any model implementing `BaseEmbeddingModel` (e.g., local or cloud-based embeddings). 105 | - **LLM Model:** The LLM used for naming/describing clusters is configurable (default: `openai/gpt-4o-mini`). 106 | - **Concurrency:** `max_concurrent_requests` controls parallelism for embedding and LLM calls. 107 | - **Progress Reporting:** Optional integration with Rich or tqdm for progress bars and live cluster previews. 108 | 109 | --- 110 | 111 | ## Hierarchical Analysis with Meta-Clustering 112 | 113 | While the `ClusterModel` produces a flat list of semantically distinct clusters, Kura also supports the creation of hierarchical cluster structures through its **meta-clustering** capabilities (see [Meta-Clustering](meta-clustering.md)). This next step takes the output of the initial clustering (a list of `Cluster` objects) and groups these clusters into higher-level, more general parent clusters. 114 | 115 | This hierarchical approach is particularly useful for: 116 | - Managing and navigating a large number of base clusters. 117 | - Discovering broader themes and relationships between groups of clusters. 118 | - Enabling a multi-level exploratory search, from general topics down to specific conversation groups. 119 | 120 | Refer to the [Meta-Clustering](meta-clustering.md) documentation for details on how Kura achieves this hierarchical organization. 
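As a rough sketch of how this hand-off looks in the procedural pipeline (the `max_clusters` argument is an assumption based on the `MetaClusterModel` configuration described in the release notes):

```python
from kura import reduce_clusters_from_base_clusters
from kura.meta_cluster import MetaClusterModel


async def build_hierarchy(clusters):
    # `clusters` is the flat list of Cluster objects produced by ClusterModel
    meta_clusters = await reduce_clusters_from_base_clusters(
        clusters,
        model=MetaClusterModel(max_clusters=10),  # max_clusters assumed; see MetaClusterModel docs
        checkpoint_manager=None,
    )

    # Root-level themes have no parent; child clusters reference them via parent_id
    return [c for c in meta_clusters if c.parent_id is None]
```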
121 | 122 | --- 123 | 124 | ## Output: Cluster Object 125 | 126 | Each cluster is represented as a `Cluster` object (see `kura/types.py`): 127 | 128 | ```python 129 | class Cluster(BaseModel): 130 | name: str 131 | description: str 132 | chat_ids: list[str] 133 | parent_id: Optional[int] = None 134 | ``` 135 | 136 | --- 137 | 138 | ## Pipeline Integration 139 | 140 | Clustering is the third major step in Kura's analysis pipeline: 141 | 142 | 1. **Loading:** Conversations are loaded 143 | 2. **Summarization:** Each conversation is summarized 144 | 3. **Embedding:** Summaries are embedded as vectors 145 | 4. **Clustering:** Embeddings are grouped into clusters (this step) 146 | 5. **Visualization/Analysis:** Clusters and summaries are explored 147 | 148 | --- 149 | 150 | ## References 151 | 152 | - [Summarization](summarization.md) 153 | - [Embedding](embedding.md) 154 | - [API documentation](../api/index.md) 155 | - [Source Code](https://github.com/567-labs/kura/blob/main/kura/cluster.py) 156 | -------------------------------------------------------------------------------- /docs/core-concepts/conversations.md: -------------------------------------------------------------------------------- 1 | # Conversations 2 | 3 | Conversations are the fundamental data units in Kura's analysis pipeline. This document explains how conversations are structured, loaded, and processed. 4 | 5 | ## Conversation Structure 6 | 7 | In Kura, a conversation is represented by the `Conversation` class from `kura.types.conversation`: 8 | 9 | ```python 10 | from kura.types import Conversation, Message 11 | from datetime import datetime 12 | from uuid import uuid4 13 | 14 | # Create a simple conversation 15 | conversation = Conversation( 16 | id=str(uuid4()), 17 | created_at=datetime.now(), 18 | messages=[ 19 | Message( 20 | role="user", 21 | content="Hello, can you help me with a Python question?", 22 | created_at=str(datetime.now()) 23 | ), 24 | Message( 25 | role="assistant", 26 | content="Of course! 
What's your Python question?", 27 | created_at=str(datetime.now()) 28 | ), 29 | Message( 30 | role="user", 31 | content="How do I read a file in Python?", 32 | created_at=str(datetime.now()) 33 | ), 34 | Message( 35 | role="assistant", 36 | content="To read a file in Python, you can use the built-in open() function...", 37 | created_at=str(datetime.now()) 38 | ) 39 | ], 40 | metadata={"source": "example", "category": "programming"} 41 | ) 42 | ``` 43 | 44 | ### Key Components 45 | 46 | Each conversation contains: 47 | 48 | - **ID**: A unique identifier for the conversation 49 | - **Created At**: Timestamp for when the conversation was created 50 | - **Messages**: A list of message objects, each with: 51 | - **Role**: Either "user" or "assistant" 52 | - **Content**: The text content of the message 53 | - **Created At**: Timestamp for when the message was sent 54 | - **Metadata**: Optional dictionary of additional information 55 | 56 | ## Loading Conversations 57 | 58 | Kura provides several methods for loading conversations from different sources: 59 | 60 | ### From Claude Conversation Exports 61 | 62 | ```python 63 | from kura.types import Conversation 64 | 65 | # Load from Claude export 66 | conversations = Conversation.from_claude_conversation_dump("conversations.json") 67 | ``` 68 | 69 | ### From Hugging Face Datasets 70 | 71 | ```python 72 | from kura.types import Conversation 73 | 74 | # Load from a Hugging Face dataset 75 | conversations = Conversation.from_hf_dataset( 76 | "ivanleomk/synthetic-gemini-conversations", 77 | split="train" 78 | ) 79 | ``` 80 | 81 | ### Creating Custom Loaders 82 | 83 | You can create custom loaders for other data sources by implementing functions that convert your data to `Conversation` objects: 84 | 85 | ```python 86 | def load_from_custom_format(file_path): 87 | # Load and parse your custom data format 88 | data = your_parsing_function(file_path) 89 | 90 | # Convert to Conversation objects 91 | conversations = [] 92 | for entry in data: 93 | messages = [ 94 | Message( 95 | role=msg["speaker"], 96 | content=msg["text"], 97 | created_at=msg["timestamp"] 98 | ) 99 | for msg in entry["messages"] 100 | ] 101 | 102 | conversation = Conversation( 103 | id=entry["id"], 104 | created_at=entry["date"], 105 | messages=messages, 106 | metadata=entry.get("meta", {}) 107 | ) 108 | 109 | conversations.append(conversation) 110 | 111 | return conversations 112 | ``` 113 | 114 | ## Conversation Processing 115 | 116 | In the Kura pipeline, conversations go through several processing steps: 117 | 118 | 1. **Loading**: Conversations are loaded from a source 119 | 2. **Summarization**: Each conversation is summarized to capture its core intent 120 | 3. **Metadata Extraction**: Optional metadata is extracted from the conversation content 121 | 4. **Embedding**: Summaries are converted to vector embeddings 122 | 5. **Clustering**: Similar conversations are grouped together 123 | 124 | ## Working with Message Content 125 | 126 | The content of messages can be in various formats, but should generally be text. HTML, Markdown, or other structured formats will be processed as-is, which may affect summarization quality. 
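For instance, a minimal sketch of flattening a conversation into plain text before further processing, using only the `Message` fields shown above:

```python
def conversation_to_text(conversation: Conversation) -> str:
    # Join messages as "role: content" lines, preserving their order
    return "\n".join(
        f"{message.role}: {message.content}" for message in conversation.messages
    )
```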
127 | 128 | When working with message content: 129 | 130 | - Clean up any special formatting if needed 131 | - Remove system messages if they don't contribute to the conversation topic 132 | - Ensure message ordering is correct for proper context 133 | 134 | ## Handling Metadata 135 | 136 | Conversations can include metadata, which provides additional context: 137 | 138 | ```python 139 | # Add metadata when creating conversations 140 | conversations = Conversation.from_hf_dataset( 141 | "allenai/WildChat-nontoxic", 142 | metadata_fn=lambda x: { 143 | "model": x["model"], 144 | "toxic": x["toxic"], 145 | "redacted": x["redacted"], 146 | } 147 | ) 148 | ``` 149 | 150 | This metadata can later be used to: 151 | - Filter conversations 152 | - Analyze patterns across different conversation attributes 153 | - Provide additional context for visualization 154 | 155 | ## Next Steps 156 | 157 | Now that you understand how conversations are structured in Kura, you can: 158 | 159 | - Learn about the [summarization process](summarization.md) 160 | - See how to load different data formats in the [Quickstart Guide](../getting-started/quickstart.md) 161 | - Explore configuration options in the [Configuration Guide](../getting-started/configuration.md) 162 | -------------------------------------------------------------------------------- /docs/core-concepts/dimensionality-reduction.md: -------------------------------------------------------------------------------- 1 | # Dimensionality Reduction 2 | 3 | This page is under construction. It will contain detailed information about how Kura reduces dimensions for visualization. 4 | 5 | In the meantime, you can refer to the [API documentation](../api/index.md) for technical details. -------------------------------------------------------------------------------- /docs/core-concepts/embedding.md: -------------------------------------------------------------------------------- 1 | # Embedding 2 | 3 | Kura's embedding pipeline transforms text (such as conversation summaries) into high-dimensional vector representations. These embeddings are essential for downstream tasks like clustering, search, and visualization, enabling Kura to analyze and organize large volumes of conversational data. 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | **Embedding** in Kura refers to the process of converting text into numerical vectors (embeddings) that capture semantic meaning. These vectors allow for efficient similarity search, clustering, and visualization of conversations and summaries. 10 | 11 | - **Input:** A list of texts (e.g., conversation summaries, messages, or cluster descriptions) 12 | - **Output:** A list of vector embeddings (`list[list[float]]`), typically one per input text 13 | 14 | --- 15 | 16 | ## The Embedding Model 17 | 18 | Kura uses an `EmbeddingModel` (see `kura/embedding.py`) that implements the `BaseEmbeddingModel` interface. 
Multiple backends are supported: 19 | 20 | - **OpenAIEmbeddingModel**: Uses OpenAI's API (e.g., `text-embedding-3-small`) for high-quality embeddings 21 | - **SentenceTransformerEmbeddingModel**: Uses local models from the `sentence-transformers` library (e.g., `all-MiniLM-L6-v2`) 22 | 23 | All embedding models must implement the following interface (see `kura/base_classes/embedding.py`): 24 | 25 | ```python 26 | class BaseEmbeddingModel(ABC): 27 | @abstractmethod 28 | async def embed(self, texts: list[str]) -> list[list[float]]: 29 | """Embed a list of texts into a list of lists of floats""" 30 | pass 31 | ``` 32 | 33 | ### Key Features 34 | 35 | - **Batching:** Texts are automatically split into batches for efficient processing 36 | - **Concurrency:** Multiple batches are embedded in parallel (configurable concurrency) 37 | - **Retry Logic:** Embedding requests are retried on failure for robustness 38 | - **Extensibility:** New embedding backends can be added by subclassing `BaseEmbeddingModel` 39 | - **Checkpointing:** Embeddings can be cached as part of the pipeline to avoid recomputation 40 | 41 | --- 42 | 43 | ## Output: Embeddings 44 | 45 | The result of embedding is a list of vectors, each representing an input text. Embeddings are typically attached to summaries or clusters for downstream analysis. 46 | 47 | Example output for a batch of texts: 48 | 49 | ```python 50 | embeddings = await embedding_model.embed([ 51 | "Summarize the user's request.", 52 | "Cluster similar conversations together." 53 | ]) 54 | # embeddings: list[list[float]] 55 | ``` 56 | 57 | When used in the pipeline, embeddings are stored in objects such as `ConversationSummary`: 58 | 59 | ```python 60 | class ConversationSummary(BaseModel): 61 | chat_id: str 62 | summary: str 63 | ... 64 | embedding: Optional[list[float]] = None 65 | ``` 66 | 67 | - **embedding**: The vector representation of the summary (or other text) 68 | 69 | --- 70 | 71 | ## Pipeline Integration 72 | 73 | Embedding is a core step in Kura's analysis pipeline: 74 | 75 | 1. **Loading**: Conversations are loaded from various sources 76 | 2. **Summarization**: Each conversation is summarized 77 | 3. **Embedding**: Summaries (or other texts) are embedded as vectors 78 | 4. **Clustering**: Embeddings are grouped into clusters 79 | 5. **Visualization/Analysis**: Clusters and embeddings are explored 80 | 81 | --- 82 | 83 | ## Embeddable Object Representations 84 | 85 | All major objects that need to be embedded in Kura (such as `ConversationSummary`, `Cluster`, and `ProjectedCluster`) implement `__str__` methods. This ensures that each object can be converted to a meaningful text representation before embedding. 86 | 87 | - **Requirement:** Any object passed to an embedding model must provide a `__str__` method that captures its semantic content. 88 | - **Examples:** 89 | - `ConversationSummary` uses a custom `__str__` to include summary, request, task, and other fields in a structured format. 90 | - `Cluster` and `ProjectedCluster` use `__str__` to return their name and description. 91 | 92 | This design allows embedding models to work generically with a variety of object types, as long as they implement a suitable `__str__` method. 
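As a concrete illustration, embedding a set of clusters reduces to converting each object to its string form and passing the list to any `BaseEmbeddingModel` implementation, which mirrors what the pipeline does internally. This is a minimal sketch; the `clusters` variable is assumed to come from an earlier clustering step.

```python
from kura.embedding import OpenAIEmbeddingModel

embedding_model = OpenAIEmbeddingModel()

# Cluster.__str__ returns the cluster's name and description,
# so the embedding model never needs to know the object type.
texts = [str(cluster) for cluster in clusters]
embeddings = await embedding_model.embed(texts)

assert len(embeddings) == len(clusters)
```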
93 | 94 | --- 95 | 96 | ## References 97 | 98 | - [API documentation](../api/index.md) 99 | - [Sentence Transformers documentation](https://www.sbert.net/) 100 | - [OpenAI Embeddings documentation](https://platform.openai.com/docs/guides/embeddings) 101 | 102 | --- 103 | 104 | ## TODO: Additional Embedding Providers 105 | 106 | - Support for other embedding providers (e.g., Cohere, HuggingFace Inference API, Google Vertex AI, local GPU models) 107 | - Community contributions and suggestions are welcome! 108 | -------------------------------------------------------------------------------- /docs/core-concepts/meta-clustering.md: -------------------------------------------------------------------------------- 1 | # Meta-Clustering 2 | 3 | Kura's meta-clustering extends the initial clustering process by organizing existing clusters into a hierarchical structure. This is essential for managing large numbers of base clusters, understanding broader thematic relationships, and enabling multi-level exploratory analysis of conversational data—from general topics down to specific insights. 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | **Meta-Clustering** (or hierarchical clustering) in Kura takes a list of `Cluster` objects (typically the output of the primary [Clustering](clustering.md) process) and groups them into higher-level, more generalized parent clusters. This creates a topic taxonomy, allowing users to navigate and comprehend vast amounts of clustered data more effectively. 10 | 11 | - **Input:** A list of `Cluster` objects. 12 | - **Output:** An updated list of `Cluster` objects, including newly created parent meta-clusters and the original child clusters now linked via `parent_id`. 13 | 14 | Meta-clustering facilitates: 15 | - **Scalable Exploration:** Makes it feasible to explore datasets with hundreds or thousands of base clusters. 16 | - **Thematic Discovery:** Reveals overarching themes and connections between different groups of specific topics. 17 | - **Granular Navigation:** Allows users to drill down from broad categories to nuanced sub-topics, supporting deeper "unknown unknown" discovery. 18 | 19 | --- 20 | 21 | ## The `MetaClusterModel` 22 | 23 | The core logic for hierarchical clustering is encapsulated in the `MetaClusterModel` (see `kura/meta_cluster.py`). This model orchestrates the process of grouping existing clusters into parent clusters and defining the relationships between them. 24 | 25 | ### Key Components and Process 26 | 27 | The `MetaClusterModel` typically employs the following steps, often iteratively if reducing a large number of clusters or building multiple hierarchy levels: 28 | 29 | 1. **Input Clusters:** Starts with a list of `Cluster` objects generated by `ClusterModel`. 30 | 31 | 2. **(Optional) Cluster Grouping with `reduce_clusters`:** 32 | * If `reduce_clusters` is called with many input clusters, it first embeds the textual representation (`name` and `description`) of these existing clusters using the configured `embedding_model`. 33 | * It then uses a `clustering_model` (e.g., K-means) to group these *cluster embeddings* into a smaller number of neighborhoods or initial groupings. 34 | * The subsequent steps are then applied to each of these neighborhoods. 35 | 36 | 3. **Generating Candidate Meta-Cluster Names (`generate_candidate_clusters`): 37 | * For a given set of input clusters (or a neighborhood of clusters from step 2), an LLM is prompted to propose a list of suitable higher-level candidate names. 
38 | * The prompt provides the names and descriptions of the input clusters and asks for broader category names that can encompass several of them, emphasizing specificity and distinctiveness. The aim is to find meaningful parent themes. 39 | 40 | 4. **Labeling Clusters (`label_cluster`): 41 | * Each individual input cluster is then presented to an LLM along with the list of candidate meta-cluster names generated in the previous step. 42 | * The LLM's task is to assign the cluster to the *single best-fitting* candidate meta-cluster name. This involves careful instruction to choose an exact match from the candidates. 43 | * The output is validated to ensure the chosen label is one of the provided candidates (using fuzzy matching for robustness). 44 | 45 | 5. **Renaming and Finalizing Meta-Clusters (`rename_cluster_group`): 46 | * Clusters are grouped based on the labels assigned in step 4. 47 | * For each group (which will become a new meta-cluster), an LLM is prompted with the names and descriptions of all its child clusters. 48 | * The LLM generates a final, refined name (imperative, like base cluster names) and a two-sentence summary for this new meta-cluster. This ensures the meta-cluster accurately and concisely represents its constituent child clusters. 49 | * A new `Cluster` object is created for this meta-cluster. The original child clusters in this group have their `parent_id` field updated to the ID of this new meta-cluster. 50 | 51 | ### Prompting Strategies 52 | 53 | Similar to base clustering, the LLM prompts used in `MetaClusterModel` are designed to: 54 | - Elicit specific and descriptive names/summaries for the meta-clusters. 55 | - Ensure meta-clusters are distinguishable from one another. 56 | - Handle potentially sensitive topics appropriately by encouraging descriptive rather than euphemistic language. 57 | - Maintain a consistent style (e.g., imperative sentences for names). 58 | 59 | ### Output: Hierarchical Cluster List 60 | 61 | The final output of `generate_meta_clusters` (or `reduce_clusters`) is a list containing: 62 | - The newly created parent meta-clusters (which have `parent_id=None`). 63 | - The original input clusters, now updated with their respective `parent_id` linking them to their new meta-cluster. 64 | 65 | This structure allows for easy reconstruction and traversal of the cluster hierarchy. 66 | 67 | --- 68 | 69 | ## Configuration 70 | 71 | - **LLM Model:** The LLM used for candidate generation, labeling, and renaming is configurable (default: `openai/gpt-4o-mini`). 72 | - **Embedding Model:** If using `reduce_clusters`, the `embedding_model` is used to embed the input clusters themselves (default: `OpenAIEmbeddingModel`). 73 | - **Clustering Method:** If using `reduce_clusters`, the `clustering_model` is used to group the cluster embeddings (default: `KmeansClusteringMethod`). 74 | - **Concurrency:** `max_concurrent_requests` controls parallelism for LLM calls. 75 | - **Max Clusters per Level (Implicit):** The `max_clusters` parameter in `MetaClusterModel` (and logic within `generate_candidate_clusters`) influences how many meta-clusters are aimed for at each level of reduction, guiding the granularity of the hierarchy. 76 | 77 | --- 78 | 79 | ## Pipeline Integration 80 | 81 | Meta-clustering typically follows the initial clustering step performed by `ClusterModel`: 82 | 83 | 1. **Loading:** Conversations are loaded. 84 | 2. **Summarization:** Conversations are summarized (`ConversationSummary`). 85 | 3. **Embedding:** Summaries are embedded. 86 | 4. 
**Clustering:** Summaries are grouped into base `Cluster` objects. 87 | 5. **Meta-Clustering:** Base clusters are organized hierarchically by `MetaClusterModel`. 88 | 6. **Visualization/Analysis:** The full hierarchy of clusters and summaries can be explored. 89 | 90 | --- 91 | 92 | ## References 93 | 94 | - [Clustering](clustering.md) 95 | - [API documentation](../api/index.md) 96 | - [Source Code](https://github.com/567-labs/kura/blob/main/kura/meta_cluster.py) 97 | -------------------------------------------------------------------------------- /docs/core-concepts/overview.md: -------------------------------------------------------------------------------- 1 | # Core Concepts Overview 2 | 3 | Kura is built on several key concepts that work together to analyze conversational data, enabling the discovery of meaningful patterns and insights from these interactions. This overview explains the major components and how they interact in the analysis pipeline. 4 | 5 | ## Architecture 6 | 7 | Kura's architecture consists of a pipeline of components that process conversational data through several stages: 8 | 9 |  10 | 11 | The main components are: 12 | 13 | 1. **Conversations**: The raw chat data between users and assistants, serving as the foundational input. 14 | 2. **Summarization**: Distilling lengthy conversations into concise task descriptions or core topics, which form the basis for subsequent analysis. 15 | 3. **Embedding**: Representing these textual summaries as dense numerical vectors, capturing their semantic meaning for similarity measurement. 16 | 4. **Clustering**: Grouping semantically similar summaries (via their embeddings) into 'base' clusters, identifying initial patterns in the data. 17 | 5. **Meta-Clustering**: Organizing base clusters into a hierarchical structure, allowing for the exploration of insights at multiple levels of granularity, from broad themes to specific sub-topics. 18 | 6. **Dimensionality Reduction**: Projecting high-dimensional embeddings into a lower-dimensional space (typically 2D or 3D) to enable visual exploration and pattern identification. 19 | 20 | ## Processing Pipeline 21 | 22 | When you run the kura pipeline, the data flows through the following steps: 23 | 24 | 1. **Load Conversations**: Raw conversation data is loaded from your specified source. 25 | 2. **Generate Summaries**: Each conversation is summarized, often into a concise task description or key topic. This summary becomes a primary unit for analysis. 26 | 3. **Extract Metadata**: Optional metadata (e.g., conversation length, sentiment, user-defined tags, or other relevant attributes) is extracted from conversations. These attributes, sometimes referred to as 'facets', can provide additional dimensions for analysis, filtering, and deeper understanding of the clusters. 27 | 4. **Create Embeddings**: The textual summaries are converted into vector representations (embeddings) that capture their semantic content. 28 | 5. **Perform Base Clustering**: Embeddings are used to group semantically similar summaries into initial 'base' clusters, forming the first layer of identified patterns. 29 | 6. **Apply Meta-Clustering**: Base clusters are iteratively combined or organized into a hierarchical structure. This allows for navigation and exploration of insights from broad, overarching themes down to more specific, granular patterns. 30 | 7. **Reduce Dimensions**: High-dimensional embeddings (and their cluster assignments) are projected, typically into a 2D or 3D space. 
This facilitates visual exploration, helping to understand the relationships between clusters and identify outliers or emergent patterns. 31 | 8. **Save Checkpoints**: Results from each significant step are saved as checkpoint files, enabling efficient resumption and review of the analysis process. 32 | 33 | ## Key Classes 34 | 35 | Kura is designed with a modular architecture, allowing components to be customized or replaced: 36 | 37 | ### Main Orchestrator 38 | 39 | - **`Kura`** (`kura.py`): The main class that coordinates the entire pipeline and manages checkpoints 40 | 41 | ### Component Classes 42 | 43 | - **`BaseEmbeddingModel`** / **`OpenAIEmbeddingModel`** (`embedding.py`): Convert text to vector representations 44 | - **`BaseSummaryModel`** / **`SummaryModel`** (`summarisation.py`): Generate summaries from conversations 45 | - **`BaseClusterModel`** / **`ClusterModel`** (`cluster.py`): Group similar summaries into clusters 46 | - **`BaseMetaClusterModel`** / **`MetaClusterModel`** (`meta_cluster.py`): Create hierarchical cluster structures 47 | - **`BaseDimensionalityReduction`** / **`HDBUMAP`** (`dimensionality.py`): Project embeddings to 2D space 48 | 49 | ### Data Models 50 | 51 | - **`Conversation`** (`types/conversation.py`): Represents a chat conversation with messages 52 | - **`ConversationSummary`** (`types/summarisation.py`): Contains a summarized conversation 53 | - **`Cluster`** (`types/cluster.py`): Represents a group of similar conversations 54 | - **`ProjectedCluster`** (`types/dimensionality.py`): Represents clusters with 2D coordinates 55 | 56 | ## Extensibility 57 | 58 | Each component has a base class that defines the required interface, allowing you to create custom implementations: 59 | 60 | ```python 61 | # Example of creating a custom embedding model 62 | from kura.base_classes import BaseEmbeddingModel 63 | 64 | class MyCustomEmbeddingModel(BaseEmbeddingModel): 65 | async def embed(self, texts: list[str]) -> list[list[float]]: 66 | # Your custom embedding logic here 67 | ... 68 | ``` 69 | 70 | ## Checkpoints 71 | 72 | Kura saves intermediate results to checkpoint files, allowing you to: 73 | 74 | - Resume processing after interruptions 75 | - Inspect intermediary results 76 | - Share analysis results with others 77 | - Visualize results without reprocessing 78 | 79 | ## Next Steps 80 | 81 | To understand each component in more detail, explore the following pages: 82 | 83 | - [Conversations](conversations.md) 84 | - [Summarization](summarization.md) 85 | - [Embedding](embedding.md) 86 | - [Clustering](clustering.md) 87 | - [Meta-Clustering](meta-clustering.md) 88 | - [Dimensionality Reduction](dimensionality-reduction.md) 89 | -------------------------------------------------------------------------------- /docs/core-concepts/summarization.md: -------------------------------------------------------------------------------- 1 | # Summarization 2 | 3 | Kura's summarization pipeline is designed to extract concise, structured, and privacy-preserving summaries from conversations between users and AI assistants. This process is central to Kura's ability to analyze, cluster, and visualize large volumes of conversational data. 4 | 5 | --- 6 | 7 | ## Overview 8 | 9 | **Summarization** in Kura transforms each conversation into a structured summary, capturing the user's intent, the main task, languages involved, safety concerns, user frustration, and any assistant errors. This enables downstream analysis such as clustering, search, and visualization. 
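As a quick illustration, the default `SummaryModel` can be applied to a single conversation. This is a minimal sketch; it assumes an existing `conversation` object and the default OpenAI-backed configuration.

```python
from kura.summarisation import SummaryModel

summary_model = SummaryModel()

# Summarise one conversation; summarise() accepts a whole list
# and processes conversations concurrently.
summary = await summary_model.summarise_conversation(conversation)

print(summary.summary)           # concise, PII-free summary
print(summary.task)              # "The task is to ..."
print(summary.concerning_score)  # 1-5 safety rating
```

The expected input and output types are listed below.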
10 | 11 | - **Input:** A `Conversation` object (see [Conversations](conversations.md)), containing: 12 | - `chat_id`: Unique identifier 13 | - `created_at`: Timestamp 14 | - `messages`: List of messages (each with `role`, `content`, `created_at`) 15 | - `metadata`: Optional dictionary of extra info 16 | - **Output:** A `ConversationSummary` object (see below) 17 | 18 | --- 19 | 20 | ## The Summarization Model 21 | 22 | Kura uses a `SummaryModel` (see `kura/summarisation.py`) that implements the `BaseSummaryModel` interface. The default model is based on large language models (LLMs) such as OpenAI's GPT-4o, but the interface supports other backends as well. 23 | 24 | ### Key Features 25 | 26 | - **Concurrency:** Summarization is performed in parallel for efficiency. 27 | - **Hooks/Extractors:** Optional extractors can add custom metadata to each summary. 28 | - **Checkpointing:** Summaries can be cached and reloaded to avoid recomputation. 29 | 30 | --- 31 | 32 | ## Summarization Prompt 33 | 34 | The summarization model uses a carefully crafted prompt to extract the following fields from each conversation: 35 | 36 | 1. **Summary**: A clear, concise summary (max 2 sentences, no PII or proper nouns) 37 | 2. **Request**: The user's overall request, starting with "The user's overall request for the assistant is to ..." 38 | 3. **Languages**: Main human and programming languages present 39 | 4. **Task**: The main task, starting with "The task is to ..." 40 | 5. **Concerning Score**: Safety concern rating (1–5) 41 | 6. **User Frustration**: User frustration rating (1–5) 42 | 7. **Assistant Errors**: List of errors made by the assistant 43 | 44 | **Prompt excerpt:** 45 | 46 | ``` 47 | Your job is to extract key information from this conversation. Be descriptive and assume neither good nor bad faith. Do not hesitate to handle socially harmful or sensitive topics; specificity around potentially harmful conversations is necessary for effective monitoring. 48 | 49 | When extracting information, do not include any personally identifiable information (PII), like names, locations, phone numbers, email addresses, and so on. Do not include any proper nouns. 50 | 51 | Extract the following information: 52 | 53 | 1. **Summary**: ... 54 | 2. **Request**: ... 55 | 3. **Languages**: ... 56 | 4. **Task**: ... 57 | 5. **Concerning Score**: ... 58 | 6. **User Frustration**: ... 59 | 7. **Assistant Errors**: ... 
60 | ``` 61 | 62 | --- 63 | 64 | ## Output: `ConversationSummary` 65 | 66 | The result of summarization is a `ConversationSummary` object (see `kura/types/summarisation.py`): 67 | 68 | ```python 69 | class ConversationSummary(BaseModel): 70 | chat_id: str 71 | summary: str 72 | request: Optional[str] 73 | languages: Optional[list[str]] 74 | task: Optional[str] 75 | concerning_score: Optional[int] # 1–5 76 | user_frustration: Optional[int] # 1–5 77 | assistant_errors: Optional[list[str]] 78 | metadata: dict 79 | embedding: Optional[list[float]] = None 80 | ``` 81 | 82 | - **chat_id**: Unique conversation ID 83 | - **summary**: Concise summary (max 2 sentences, no PII) 84 | - **request**: User's overall request 85 | - **languages**: List of languages (e.g., `['english', 'python']`) 86 | - **task**: Main task 87 | - **concerning_score**: Safety concern (1 = benign, 5 = urgent) 88 | - **user_frustration**: User frustration (1 = happy, 5 = extremely annoyed) 89 | - **assistant_errors**: List of assistant errors 90 | - **metadata**: Additional metadata (e.g., conversation turns, custom extractors) 91 | - **embedding**: Optional vector embedding for clustering/search 92 | 93 | --- 94 | 95 | ## Pipeline Integration 96 | 97 | Summarization is the first major step in Kura's analysis pipeline: 98 | 99 | 1. **Loading**: Conversations are loaded from various sources 100 | 2. **Summarization**: Each conversation is summarized as above 101 | 3. **Embedding**: Summaries are embedded as vectors 102 | 4. **Clustering**: Similar summaries are grouped 103 | 5. **Visualization/Analysis**: Clusters and summaries are explored 104 | 105 | --- 106 | 107 | ## References 108 | 109 | - [Clio: Privacy-Preserving Insights into Real-World AI Use (Anthropic)](https://assets.anthropic.com/m/7e1ab885d1b24176/original/Clio-Privacy-Preserving-Insights-into-Real-World-AI-Use.pdf) 110 | - [API documentation](../api/index.md) 111 | - [Conversations](conversations.md) 112 | -------------------------------------------------------------------------------- /docs/getting-started/configuration.md: -------------------------------------------------------------------------------- 1 | # Configuration 2 | 3 | This guide explains the various configuration options available in Kura using its procedural API (v1). This API is best for flexible pipelines where you need fine control over individual steps, the ability to skip or reorder steps, A/B test different models, or prefer a functional programming style. 4 | 5 | ## Checkpoint Files 6 | 7 | Kura saves several checkpoint files during processing: 8 | 9 | | Checkpoint File | Description | 10 | | ---------------------- | -------------------------------- | 11 | | `conversations.json` | Raw conversation data | 12 | | `summaries.jsonl` | Summarized conversations | 13 | | `clusters.jsonl` | Base cluster data | 14 | | `meta_clusters.jsonl` | Hierarchical cluster data | 15 | | `dimensionality.jsonl` | Projected data for visualization | 16 | 17 | Checkpoint filenames are now defined as properties in their respective model classes. When using the procedural API, checkpoint management is handled via the `CheckpointManager`. 
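For example, the filename each model will write can be read directly from the model, while the `CheckpointManager` controls the directory and whether checkpointing is enabled. This is a small sketch using the default models; the filenames correspond to those listed in the table above.

```python
from kura import CheckpointManager
from kura.summarisation import SummaryModel
from kura.dimensionality import HDBUMAP

# Passed to each pipeline step (see the procedural example below)
checkpoint_manager = CheckpointManager("./checkpoints", enabled=True)

# Each model reports its own checkpoint filename via a property.
print(SummaryModel().checkpoint_filename)  # "summaries.jsonl" per the table above
print(HDBUMAP().checkpoint_filename)       # "dimensionality.jsonl"
```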
18 | 19 | ## CLI Configuration 20 | 21 | When using the CLI, you can configure the checkpoint directory: 22 | 23 | ```bash 24 | # Start the web server with a custom checkpoint directory 25 | kura --dir ./my_checkpoints 26 | ``` 27 | 28 | The procedural API provides flexibility by breaking the pipeline into composable functions: 29 | 30 | ```python 31 | from kura import ( 32 | summarise_conversations, 33 | generate_base_clusters_from_conversation_summaries, 34 | reduce_clusters_from_base_clusters, 35 | reduce_dimensionality_from_clusters, 36 | CheckpointManager 37 | ) 38 | from kura.summarisation import SummaryModel 39 | from kura.cluster import ClusterModel 40 | from kura.meta_cluster import MetaClusterModel 41 | from kura.dimensionality import HDBUMAP 42 | # Assuming Conversation type might be needed for context, if not, it can be removed. 43 | # from kura.types import Conversation 44 | 45 | # Sample conversations (replace with your actual data loading) 46 | # conversations = [Conversation(...)] 47 | 48 | # Configure models independently 49 | summary_model = SummaryModel() 50 | cluster_model = ClusterModel() 51 | meta_cluster_model = MetaClusterModel(max_clusters=10) 52 | dimensionality_model = HDBUMAP() 53 | 54 | # Optional checkpoint management 55 | checkpoint_manager = CheckpointManager("./my_checkpoints", enabled=True) 56 | 57 | # Run pipeline with keyword arguments 58 | async def analyze(conversations): # Added conversations as an argument 59 | summaries = await summarise_conversations( 60 | conversations, 61 | model=summary_model, 62 | checkpoint_manager=checkpoint_manager 63 | ) 64 | 65 | clusters = await generate_base_clusters_from_conversation_summaries( 66 | summaries, 67 | model=cluster_model, 68 | checkpoint_manager=checkpoint_manager 69 | ) 70 | 71 | reduced = await reduce_clusters_from_base_clusters( 72 | clusters, 73 | model=meta_cluster_model, 74 | checkpoint_manager=checkpoint_manager 75 | ) 76 | 77 | projected = await reduce_dimensionality_from_clusters( 78 | reduced, 79 | model=dimensionality_model, 80 | checkpoint_manager=checkpoint_manager 81 | ) 82 | 83 | return projected 84 | ``` 85 | 86 | The procedural API excels at working with different model implementations for the same task: 87 | 88 | ```python 89 | # Use different backends for the same task 90 | from kura import summarise_conversations 91 | # Assuming these model classes exist and are correctly imported 92 | # from kura.summarisation import OpenAISummaryModel, VLLMSummaryModel, HuggingFaceSummaryModel 93 | 94 | # Sample conversations (replace with your actual data loading) 95 | # conversations = [...] 
96 | # checkpoint_mgr = CheckpointManager("./my_checkpoints") 97 | 98 | # OpenAI backend 99 | # openai_summaries = await summarise_conversations( 100 | # conversations, 101 | # model=OpenAISummaryModel(api_key="sk-..."), # Replace with actual model init if different 102 | # checkpoint_manager=checkpoint_mgr 103 | # ) 104 | 105 | # Local vLLM backend 106 | # vllm_summaries = await summarise_conversations( 107 | # conversations, 108 | # model=VLLMSummaryModel(model_path="/models/llama"), # Replace with actual model init if different 109 | # checkpoint_manager=checkpoint_mgr 110 | # ) 111 | 112 | # Hugging Face backend 113 | # hf_summaries = await summarise_conversations( 114 | # conversations, 115 | # model=HuggingFaceSummaryModel("facebook/bart-large-cnn"), # Replace with actual model init if different 116 | # checkpoint_manager=checkpoint_mgr 117 | # ) 118 | ``` 119 | 120 | _Note: The heterogeneous models example has been commented out as it relies on specific model classes (`OpenAISummaryModel`, `VLLMSummaryModel`, `HuggingFaceSummaryModel`) whose existence and import paths are not confirmed from the provided context. Ensure these are correctly defined and imported in your actual usage._ 121 | 122 | ## Next Steps 123 | 124 | Now that you understand how to configure Kura using the procedural API, you can: 125 | 126 | - [Learn about core concepts](../core-concepts/overview.md) 127 | - [Try the Procedural API Tutorial](../getting-started/quickstart.md) 128 | - [Check out the API Reference](../api/index.md) 129 | -------------------------------------------------------------------------------- /docs/getting-started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation Guide 2 | 3 | This guide will walk you through the installation process for Kura. 4 | 5 | ## Requirements 6 | 7 | Kura has the following requirements: 8 | 9 | - Python 3.9+ (Python 3.9 is specifically recommended due to UMAP dependency) 10 | - uv package manager 11 | - OpenAI API key for model access 12 | 13 | ## Installation 14 | 15 | ```bash 16 | # Install using uv 17 | uv pip install kura 18 | ``` 19 | 20 | ### Development Installation 21 | 22 | If you want to contribute to Kura or modify the source code, install it in development mode: 23 | 24 | ```bash 25 | # Clone the repository 26 | git clone https://github.com/567-labs/kura.git 27 | cd kura 28 | 29 | # Create and activate a virtual environment 30 | python -m venv venv 31 | source venv/bin/activate # On Windows: venv\Scripts\activate 32 | 33 | # Install in development mode with dev dependencies 34 | uv pip install -e . --group dev 35 | ``` 36 | 37 | ## Setting up API Keys 38 | 39 | Kura uses OpenAI models for processing. You'll need to set up an API key: 40 | 41 | 1. Get an OpenAI API key from [OpenAI Platform](https://platform.openai.com/api-keys) 42 | 2. 
Set the environment variable: 43 | 44 | ```bash 45 | # On Linux/macOS 46 | export OPENAI_API_KEY=your_api_key_here 47 | 48 | # On Windows 49 | set OPENAI_API_KEY=your_api_key_here 50 | ``` 51 | 52 | ## Installing Optional Dependencies 53 | 54 | Kura supports additional features with optional dependencies: 55 | 56 | ```bash 57 | uv sync --all-extras --group dev --group docs 58 | ``` 59 | 60 | ## Verifying Your Installation 61 | 62 | To verify that Kura is installed correctly, run: 63 | 64 | ```bash 65 | python -c "from kura import summarise_conversations; print('Kura installed successfully')" 66 | ``` 67 | 68 | You should see a confirmation message with no errors. 69 | 70 | ## Next Steps 71 | 72 | Now that you have Kura installed, proceed to the [Quickstart guide](quickstart.md) to begin analyzing your first dataset. 73 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Kura: Procedural API for Chat Data Analysis 2 | 3 |  4 | 5 | [](https://pypi.org/project/kura/) 6 | [](https://github.com/567-labs/kura/stargazers) 7 | [](https://567-labs.github.io/kura/) 8 | [](https://opensource.org/licenses/MIT) 9 | [](https://pypi.org/project/kura/) 10 | [](https://pypi.org/project/kura/) 11 | 12 | Kura is an open-source library for understanding chat data through machine learning, inspired by [Anthropic's CLIO](https://www.anthropic.com/research/clio). It provides a functional, composable API for clustering conversations to discover patterns and insights. 13 | 14 | ## Why Analyze Conversation Data? 15 | 16 | As AI assistants and chatbots become increasingly central to product experiences, understanding how users interact with these systems at scale becomes a critical challenge. Manually reviewing thousands of conversations is impractical, yet crucial patterns and user needs often remain hidden in this data. 
17 | 18 | Kura addresses this challenge by: 19 | 20 | - **Revealing user intent patterns** that may not be obvious from individual conversations 21 | - **Identifying common user needs** to prioritize feature development 22 | - **Discovering edge cases and failures** that require attention 23 | - **Tracking usage trends** over time as your product evolves 24 | - **Informing prompt engineering** by highlighting successful and problematic interactions 25 | 26 | ## Features 27 | 28 | - **Conversation Summarization**: Automatically generate concise task descriptions from conversations 29 | - **Hierarchical Clustering**: Group similar conversations at multiple levels of granularity 30 | - **Metadata Extraction**: Extract valuable context from conversations using LLMs 31 | - **Custom Models**: Use your preferred embedding, summarization, and clustering methods 32 | - **Checkpoint System**: Save and resume analysis sessions 33 | - **Procedural API**: Functional approach with composable functions for maximum flexibility 34 | 35 | ## Installation 36 | 37 | ```bash 38 | # Install from PyPI 39 | pip install kura 40 | 41 | # Or use uv for faster installation 42 | uv pip install kura 43 | ``` 44 | 45 | ## Quick Start 46 | 47 | ```python 48 | from kura import ( 49 | summarise_conversations, 50 | generate_base_clusters_from_conversation_summaries, 51 | reduce_clusters_from_base_clusters, 52 | reduce_dimensionality_from_clusters, 53 | CheckpointManager 54 | ) 55 | from kura.types import Conversation 56 | from kura.summarisation import SummaryModel 57 | from kura.cluster import ClusterModel 58 | from kura.meta_cluster import MetaClusterModel 59 | from kura.dimensionality import HDBUMAP 60 | import asyncio 61 | 62 | # Load conversations 63 | conversations = Conversation.from_hf_dataset( 64 | "ivanleomk/synthetic-gemini-conversations", 65 | split="train" 66 | ) 67 | 68 | # Set up models 69 | summary_model = SummaryModel() 70 | cluster_model = ClusterModel() 71 | meta_cluster_model = MetaClusterModel(max_clusters=10) 72 | dimensionality_model = HDBUMAP() 73 | 74 | # Set up checkpoint manager 75 | checkpoint_mgr = CheckpointManager("./checkpoints", enabled=True) 76 | 77 | # Run pipeline with explicit steps 78 | async def process_conversations(): 79 | # Step 1: Generate summaries 80 | summaries = await summarise_conversations( 81 | conversations, 82 | model=summary_model, 83 | checkpoint_manager=checkpoint_mgr 84 | ) 85 | 86 | # Step 2: Create base clusters 87 | clusters = await generate_base_clusters_from_conversation_summaries( 88 | summaries, 89 | model=cluster_model, 90 | checkpoint_manager=checkpoint_mgr 91 | ) 92 | 93 | # Step 3: Build hierarchy 94 | meta_clusters = await reduce_clusters_from_base_clusters( 95 | clusters, 96 | model=meta_cluster_model, 97 | checkpoint_manager=checkpoint_mgr 98 | ) 99 | 100 | # Step 4: Project to 2D 101 | projected = await reduce_dimensionality_from_clusters( 102 | meta_clusters, 103 | model=dimensionality_model, 104 | checkpoint_manager=checkpoint_mgr 105 | ) 106 | 107 | return projected 108 | 109 | # Execute the pipeline 110 | results = asyncio.run(process_conversations()) 111 | visualise_pipeline_results(results, style="basic") 112 | Clusters (190 conversations) 113 | ╠══ Generate SEO-optimized content for blogs and scripts (38 conversations) 114 | ║ ╠══ Assist in writing SEO-friendly blog posts (12 conversations) 115 | ║ ╚══ Help create SEO-driven marketing content (8 conversations) 116 | ╠══ Help analyze and visualize data with R and Tableau (25 conversations) 117 | ║ 
╠══ Assist with data analysis and visualization in R (15 conversations) 118 | ║ ╚══ Troubleshoot sales data visualizations in Tableau (10 conversations) 119 | ... (and more clusters) 120 | ``` 121 | 122 | ## Documentation 123 | 124 | ### Getting Started 125 | 126 | - [Installation Guide](getting-started/installation.md) 127 | - [Quickstart](getting-started/quickstart.md) 128 | 129 | ### Core Concepts 130 | 131 | - [Conversations](core-concepts/conversations.md) 132 | - [Embedding](core-concepts/embedding.md) 133 | - [Clustering](core-concepts/clustering.md) 134 | - [Summarization](core-concepts/summarization.md) 135 | - [Meta-Clustering](core-concepts/meta-clustering.md) 136 | - [Dimensionality Reduction](core-concepts/dimensionality-reduction.md) 137 | 138 | ### API Reference 139 | 140 | - [Procedural API Documentation](api/index.md) 141 | 142 | ## About 143 | 144 | Kura is under active development. If you face any issues or have suggestions, please feel free to [open an issue](https://github.com/567-labs/kura/issues) or a PR. For more details on the technical implementation, check out this [walkthrough of the code](https://ivanleo.com/blog/understanding-user-conversations). 145 | -------------------------------------------------------------------------------- /kura.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/567-labs/kura/d01621c207c1e901139488a1107f3090fb634746/kura.png -------------------------------------------------------------------------------- /kura/__init__.py: -------------------------------------------------------------------------------- 1 | from .v1.kura import ( 2 | summarise_conversations, 3 | generate_base_clusters_from_conversation_summaries, 4 | reduce_clusters_from_base_clusters, 5 | reduce_dimensionality_from_clusters, 6 | CheckpointManager, 7 | ) 8 | from .cluster import ClusterModel 9 | from .meta_cluster import MetaClusterModel 10 | from .summarisation import SummaryModel 11 | from .types import Conversation 12 | 13 | __all__ = [ 14 | "ClusterModel", 15 | "MetaClusterModel", 16 | "SummaryModel", 17 | "Conversation", 18 | "summarise_conversations", 19 | "generate_base_clusters_from_conversation_summaries", 20 | "reduce_clusters_from_base_clusters", 21 | "reduce_dimensionality_from_clusters", 22 | "CheckpointManager", 23 | ] 24 | -------------------------------------------------------------------------------- /kura/base_classes/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedding import BaseEmbeddingModel 2 | from .summarisation import BaseSummaryModel 3 | from .clustering_method import BaseClusteringMethod 4 | from .cluster import BaseClusterModel 5 | from .meta_cluster import BaseMetaClusterModel 6 | from .dimensionality import BaseDimensionalityReduction 7 | 8 | __all__ = [ 9 | "BaseEmbeddingModel", 10 | "BaseSummaryModel", 11 | "BaseClusteringMethod", 12 | "BaseClusterModel", 13 | "BaseMetaClusterModel", 14 | "BaseDimensionalityReduction", 15 | ] 16 | -------------------------------------------------------------------------------- /kura/base_classes/cluster.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from kura.summarisation import ConversationSummary 3 | from kura.types import Cluster 4 | 5 | 6 | class BaseClusterModel(ABC): 7 | @property 8 | @abstractmethod 9 | def checkpoint_filename(self) -> str: 10 | """The filename to use for checkpointing 
this model's output.""" 11 | pass 12 | 13 | @abstractmethod 14 | async def cluster_summaries( 15 | self, summaries: list[ConversationSummary] 16 | ) -> list[Cluster]: 17 | pass 18 | 19 | # TODO : Add abstract method for hooks here once we start supporting it 20 | -------------------------------------------------------------------------------- /kura/base_classes/clustering_method.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import TypeVar, Union 3 | 4 | T = TypeVar("T") 5 | 6 | 7 | class BaseClusteringMethod(ABC): 8 | @abstractmethod 9 | def cluster( 10 | self, items: list[dict[str, Union[T, list[float]]]] 11 | ) -> dict[int, list[T]]: 12 | pass 13 | -------------------------------------------------------------------------------- /kura/base_classes/dimensionality.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from kura.types import Cluster, ProjectedCluster 4 | 5 | 6 | class BaseDimensionalityReduction(ABC): 7 | @property 8 | @abstractmethod 9 | def checkpoint_filename(self) -> str: 10 | """The filename to use for checkpointing this model's output.""" 11 | pass 12 | 13 | @abstractmethod 14 | async def reduce_dimensionality( 15 | self, clusters: list[Cluster] 16 | ) -> list[ProjectedCluster]: 17 | """ 18 | This reduces the dimensionality of the individual clusters that we've created so we can visualise them in a lower dimension 19 | """ 20 | pass 21 | -------------------------------------------------------------------------------- /kura/base_classes/embedding.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class BaseEmbeddingModel(ABC): 5 | @abstractmethod 6 | async def embed(self, texts: list[str]) -> list[list[float]]: 7 | """Embed a list of texts into a list of lists of floats""" 8 | pass 9 | -------------------------------------------------------------------------------- /kura/base_classes/meta_cluster.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from kura.types.cluster import Cluster 3 | 4 | 5 | class BaseMetaClusterModel(ABC): 6 | @property 7 | @abstractmethod 8 | def checkpoint_filename(self) -> str: 9 | """The filename to use for checkpointing this model's output.""" 10 | pass 11 | 12 | @abstractmethod 13 | async def reduce_clusters(self, clusters: list[Cluster]) -> list[Cluster]: 14 | pass 15 | -------------------------------------------------------------------------------- /kura/base_classes/summarisation.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from kura.types import ConversationSummary, Conversation 4 | from typing import Union 5 | 6 | 7 | class BaseSummaryModel(ABC): 8 | @property 9 | @abstractmethod 10 | def checkpoint_filename(self) -> str: 11 | """The filename to use for checkpointing this model's output.""" 12 | pass 13 | 14 | @abstractmethod 15 | async def summarise( 16 | self, conversations: list[Conversation] 17 | ) -> list[ConversationSummary]: 18 | """Summarise the conversations into a list of ConversationSummary""" 19 | pass 20 | 21 | @abstractmethod 22 | async def summarise_conversation( 23 | self, conversation: Conversation 24 | ) -> ConversationSummary: 25 | """Summarise a single conversation into a single string""" 26 | pass 
27 | 28 | @abstractmethod 29 | async def apply_hooks( 30 | self, conversation: Conversation 31 | ) -> dict[str, Union[str, int, float, bool, list[str], list[int], list[float]]]: 32 | """Apply hooks to the conversation summary""" 33 | # Assert that the implementation of the class has a hooks attribute so we can call it in summarise_conversation 34 | assert hasattr(self, "hooks") 35 | pass 36 | -------------------------------------------------------------------------------- /kura/cli/cli.py: -------------------------------------------------------------------------------- 1 | import typer 2 | import uvicorn 3 | from kura.cli.server import api 4 | from rich import print 5 | import os 6 | 7 | app = typer.Typer() 8 | 9 | 10 | @app.command() 11 | def start_app( 12 | dir: str = typer.Option( 13 | "./checkpoints", 14 | help="Directory to use for checkpoints, relative to the current directory", 15 | ), 16 | ): 17 | """Start the FastAPI server""" 18 | os.environ["KURA_CHECKPOINT_DIR"] = dir 19 | print( 20 | "\n[bold green]🚀 Access website at[/bold green] [bold blue][http://localhost:8000](http://localhost:8000)[/bold blue]\n" 21 | ) 22 | uvicorn.run(api, host="0.0.0.0", port=8000) 23 | 24 | 25 | if __name__ == "__main__": 26 | app() 27 | -------------------------------------------------------------------------------- /kura/cli/server.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, staticfiles 2 | from fastapi.middleware.cors import CORSMiddleware 3 | from pathlib import Path 4 | 5 | api = FastAPI() 6 | 7 | # Configure CORS 8 | api.add_middleware( 9 | CORSMiddleware, 10 | allow_origins=["*"], # Allows all origins 11 | allow_credentials=True, 12 | allow_methods=["*"], # Allows all methods 13 | allow_headers=["*"], # Allows all headers 14 | ) 15 | 16 | # Serve static files from web/dist 17 | web_dir = Path(__file__).parent.parent / "static" / "dist" 18 | if not web_dir.exists(): 19 | raise FileNotFoundError(f"Static files directory not found: {web_dir}") 20 | 21 | 22 | # Serve static files from web/dist at the root 23 | web_dir = Path(__file__).parent.parent / "static" / "dist" 24 | if not web_dir.exists(): 25 | raise FileNotFoundError(f"Static files directory not found: {web_dir}") 26 | 27 | # Mount static files at root 28 | api.mount("/", staticfiles.StaticFiles(directory=str(web_dir), html=True)) 29 | -------------------------------------------------------------------------------- /kura/cli/visualisation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from typing import List 3 | from kura.types import Conversation 4 | 5 | 6 | def generate_cumulative_chart_data(conversations: List[Conversation]) -> dict: 7 | """ 8 | Generate cumulative word count chart data for human messages in conversations. 9 | Returns a dict containing the Plotly data and layout. 
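Concretely, the returned value is a list of {"x": week_start, "y": cumulative_words} points, one per week in chronological order (see the return statement below).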
10 | """ 11 | messages_data = [] 12 | for conv in conversations: 13 | for msg in conv.messages: 14 | if msg.role == "user": 15 | messages_data.append( 16 | { 17 | "datetime": pd.to_datetime( 18 | str(msg.created_at).replace("Z", "+00:00") 19 | ), 20 | "words": len(msg.content.split()), 21 | } 22 | ) 23 | 24 | df = pd.DataFrame(messages_data) 25 | df["week_start"] = df["datetime"].dt.to_period("W-MON").dt.start_time 26 | 27 | weekly_df = df.groupby("week_start")["words"].sum().reset_index() 28 | weekly_df["cumulative_words"] = weekly_df["words"].cumsum() 29 | weekly_df["week_start"] = weekly_df["week_start"].dt.strftime("%Y-%m-%d") 30 | 31 | return [ 32 | {"x": x, "y": y} 33 | for x, y in zip( 34 | weekly_df["week_start"].tolist(), weekly_df["cumulative_words"].tolist() 35 | ) 36 | ] # pyright: ignore 37 | 38 | 39 | def generate_messages_per_chat_data(conversations: List[Conversation]) -> dict: 40 | messages_data = [] 41 | for conv in conversations: 42 | for msg in conv.messages: 43 | messages_data.append( 44 | { 45 | "datetime": pd.to_datetime( 46 | str(msg.created_at).replace("Z", "+00:00") 47 | ), 48 | "chat_id": conv.chat_id, 49 | } 50 | ) 51 | 52 | df = pd.DataFrame(messages_data) 53 | df["week_start"] = df["datetime"].dt.to_period("W-MON").dt.start_time 54 | 55 | weekly_messages = df.groupby("week_start").size().reset_index(name="message_count") # pyright: ignore 56 | weekly_chats = ( 57 | df.groupby("week_start")["chat_id"].nunique().reset_index(name="chat_count") # pyright: ignore 58 | ) 59 | 60 | weekly_df = pd.merge(weekly_messages, weekly_chats, on="week_start") 61 | weekly_df["avg_messages"] = weekly_df["message_count"] / weekly_df["chat_count"] 62 | weekly_df["week_start"] = weekly_df["week_start"].dt.strftime("%Y-%m-%d") 63 | 64 | return [ 65 | {"x": x, "y": y} 66 | for x, y in zip( 67 | weekly_df["week_start"].tolist(), weekly_df["avg_messages"].tolist() 68 | ) 69 | ] # pyright: ignore 70 | 71 | 72 | def generate_messages_per_week_data(conversations: List[Conversation]) -> dict: 73 | messages_data = [] 74 | for conv in conversations: 75 | for msg in conv.messages: 76 | messages_data.append( 77 | { 78 | "datetime": pd.to_datetime( 79 | str(msg.created_at).replace("Z", "+00:00") 80 | ), 81 | "chat_id": conv.chat_id, 82 | } 83 | ) 84 | 85 | df = pd.DataFrame(messages_data) 86 | df["week_start"] = df["datetime"].dt.to_period("W-MON").dt.start_time 87 | 88 | weekly_messages = df.groupby("week_start").size().reset_index(name="message_count") # pyright: ignore 89 | weekly_messages["week_start"] = weekly_messages["week_start"].dt.strftime( 90 | "%Y-%m-%d" 91 | ) # pyright: ignore 92 | 93 | return [ 94 | {"x": x, "y": y} 95 | for x, y in zip( 96 | weekly_messages["week_start"].tolist(), 97 | weekly_messages["message_count"].tolist(), 98 | ) 99 | ] # pyright: ignore 100 | 101 | 102 | def generate_new_chats_per_week_data(conversations: List[Conversation]) -> dict: 103 | chat_starts = pd.DataFrame( 104 | [ 105 | { 106 | "datetime": pd.to_datetime(str(conv.created_at).replace("Z", "+00:00")), 107 | "chat_id": conv.chat_id, 108 | } 109 | for conv in conversations 110 | ] 111 | ) 112 | chat_starts["week_start"] = ( 113 | chat_starts["datetime"].dt.to_period("W-MON").dt.start_time 114 | ) 115 | weekly_chats = ( 116 | chat_starts.groupby("week_start").size().reset_index(name="chat_count") # pyright: ignore 117 | ) 118 | weekly_chats["week_start"] = weekly_chats["week_start"].dt.strftime("%Y-%m-%d") 119 | 120 | return [ 121 | {"x": x, "y": y} 122 | for x, y in zip( 123 | 
weekly_chats["week_start"].tolist(), weekly_chats["chat_count"].tolist() 124 | ) 125 | ] # pyright: ignore 126 | -------------------------------------------------------------------------------- /kura/dimensionality.py: -------------------------------------------------------------------------------- 1 | from kura.base_classes import BaseDimensionalityReduction, BaseEmbeddingModel 2 | from kura.types import Cluster, ProjectedCluster 3 | from kura.embedding import OpenAIEmbeddingModel 4 | from typing import Union 5 | import numpy as np 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class HDBUMAP(BaseDimensionalityReduction): 12 | @property 13 | def checkpoint_filename(self) -> str: 14 | """The filename to use for checkpointing this model's output.""" 15 | return "dimensionality.jsonl" 16 | 17 | def __init__( 18 | self, 19 | embedding_model: BaseEmbeddingModel = OpenAIEmbeddingModel(), 20 | n_components: int = 2, 21 | min_dist: float = 0.1, 22 | metric: str = "cosine", 23 | n_neighbors: Union[int, None] = None, 24 | ): 25 | self.embedding_model = embedding_model 26 | self.n_components = n_components 27 | self.min_dist = min_dist 28 | self.metric = metric 29 | self.n_neighbors = n_neighbors 30 | logger.info( 31 | f"Initialized HDBUMAP with embedding_model={type(embedding_model).__name__}, n_components={n_components}, min_dist={min_dist}, metric={metric}, n_neighbors={n_neighbors}" 32 | ) 33 | 34 | async def reduce_dimensionality( 35 | self, clusters: list[Cluster] 36 | ) -> list[ProjectedCluster]: 37 | # Embed all clusters 38 | from umap import UMAP 39 | 40 | if not clusters: 41 | logger.warning("Empty clusters list provided to reduce_dimensionality") 42 | return [] 43 | 44 | logger.info(f"Starting dimensionality reduction for {len(clusters)} clusters") 45 | texts_to_embed = [str(c) for c in clusters] 46 | 47 | try: 48 | cluster_embeddings = await self.embedding_model.embed(texts_to_embed) 49 | logger.debug(f"Generated embeddings for {len(clusters)} clusters") 50 | except Exception as e: 51 | logger.error(f"Failed to generate embeddings for clusters: {e}") 52 | raise 53 | 54 | if not cluster_embeddings or len(cluster_embeddings) != len(texts_to_embed): 55 | logger.error( 56 | f"Error: Number of embeddings ({len(cluster_embeddings) if cluster_embeddings else 0}) does not match number of clusters ({len(texts_to_embed)}) or embeddings are empty." 
57 | ) 58 | return [] 59 | 60 | embeddings = np.array(cluster_embeddings) 61 | logger.debug(f"Created embedding matrix of shape {embeddings.shape}") 62 | 63 | # Project to 2D using UMAP 64 | n_neighbors_actual = ( 65 | self.n_neighbors if self.n_neighbors else min(15, len(embeddings) - 1) 66 | ) 67 | logger.debug( 68 | f"Using UMAP with n_neighbors={n_neighbors_actual}, min_dist={self.min_dist}, metric={self.metric}" 69 | ) 70 | 71 | try: 72 | umap_reducer = UMAP( 73 | n_components=self.n_components, 74 | n_neighbors=n_neighbors_actual, 75 | min_dist=self.min_dist, 76 | metric=self.metric, 77 | ) 78 | reduced_embeddings = umap_reducer.fit_transform(embeddings) 79 | logger.info( 80 | f"UMAP dimensionality reduction completed: {embeddings.shape} -> {reduced_embeddings.shape}" # type: ignore 81 | ) 82 | except Exception as e: 83 | logger.error(f"UMAP dimensionality reduction failed: {e}") 84 | raise 85 | 86 | # Create projected clusters with 2D coordinates 87 | res = [] 88 | for i, cluster in enumerate(clusters): 89 | projected = ProjectedCluster( 90 | slug=cluster.slug, 91 | id=cluster.id, 92 | name=cluster.name, 93 | description=cluster.description, 94 | chat_ids=cluster.chat_ids, 95 | parent_id=cluster.parent_id, 96 | x_coord=float(reduced_embeddings[i][0]), # pyright: ignore 97 | y_coord=float(reduced_embeddings[i][1]), # pyright: ignore 98 | level=0 99 | if cluster.parent_id is None 100 | else 1, # TODO: Fix this, should reflect the level of the cluster 101 | ) 102 | res.append(projected) 103 | 104 | logger.info(f"Successfully created {len(res)} projected clusters") 105 | return res 106 | -------------------------------------------------------------------------------- /kura/embedding.py: -------------------------------------------------------------------------------- 1 | from kura.base_classes import BaseEmbeddingModel 2 | from asyncio import Semaphore, gather 3 | from tenacity import retry, wait_fixed, stop_after_attempt 4 | from openai import AsyncOpenAI 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class OpenAIEmbeddingModel(BaseEmbeddingModel): 11 | def __init__( 12 | self, 13 | model_name: str = "text-embedding-3-small", 14 | model_batch_size: int = 50, 15 | n_concurrent_jobs: int = 5, 16 | ): 17 | self.client = AsyncOpenAI() 18 | self.model_name = model_name 19 | self._model_batch_size = model_batch_size 20 | self._n_concurrent_jobs = n_concurrent_jobs 21 | self._semaphore = Semaphore(n_concurrent_jobs) 22 | logger.info( 23 | f"Initialized OpenAIEmbeddingModel with model={model_name}, batch_size={model_batch_size}, concurrent_jobs={n_concurrent_jobs}" 24 | ) 25 | 26 | def slug(self): 27 | return f"openai:{self.model_name}-batchsize:{self._model_batch_size}-concurrent:{self._n_concurrent_jobs}" 28 | 29 | @retry(wait=wait_fixed(3), stop=stop_after_attempt(3)) 30 | async def _embed_batch(self, texts: list[str]) -> list[list[float]]: 31 | """Embed a single batch of texts.""" 32 | async with self._semaphore: 33 | try: 34 | logger.debug( 35 | f"Embedding batch of {len(texts)} texts using model {self.model_name}" 36 | ) 37 | resp = await self.client.embeddings.create( 38 | input=texts, model=self.model_name 39 | ) 40 | embeddings = [item.embedding for item in resp.data] 41 | logger.debug( 42 | f"Successfully embedded batch of {len(texts)} texts, got {len(embeddings)} embeddings" 43 | ) 44 | return embeddings 45 | except Exception as e: 46 | logger.error(f"Failed to embed batch of {len(texts)} texts: {e}") 47 | raise 48 | 49 | async def embed(self, texts: 
list[str]) -> list[list[float]]: 50 | if not texts: 51 | logger.debug("Empty text list provided, returning empty embeddings") 52 | return [] 53 | 54 | logger.info(f"Starting embedding of {len(texts)} texts using {self.model_name}") 55 | 56 | # Create batches 57 | batches = _batch_texts(texts, self._model_batch_size) 58 | logger.debug( 59 | f"Split {len(texts)} texts into {len(batches)} batches of size {self._model_batch_size}" 60 | ) 61 | 62 | # Process all batches concurrently 63 | tasks = [self._embed_batch(batch) for batch in batches] 64 | try: 65 | results_list_of_lists = await gather(*tasks) 66 | logger.debug(f"Completed embedding {len(batches)} batches") 67 | except Exception as e: 68 | logger.error(f"Failed to embed texts: {e}") 69 | raise 70 | 71 | # Flatten results 72 | embeddings = [] 73 | for result_batch in results_list_of_lists: 74 | embeddings.extend(result_batch) 75 | 76 | logger.info( 77 | f"Successfully embedded {len(texts)} texts, produced {len(embeddings)} embeddings" 78 | ) 79 | return embeddings 80 | 81 | 82 | def _batch_texts(texts: list[str], batch_size: int) -> list[list[str]]: 83 | """Helper function to divide a list of texts into batches.""" 84 | if not texts: 85 | return [] 86 | 87 | batches = [] 88 | for i in range(0, len(texts), batch_size): 89 | batch = texts[i : i + batch_size] 90 | batches.append(batch) 91 | return batches 92 | 93 | 94 | class SentenceTransformerEmbeddingModel(BaseEmbeddingModel): 95 | def __init__( 96 | self, 97 | model_name: str = "all-MiniLM-L6-v2", 98 | model_batch_size: int = 128, 99 | ): 100 | from sentence_transformers import SentenceTransformer # type: ignore 101 | 102 | logger.info( 103 | f"Initializing SentenceTransformerEmbeddingModel with model={model_name}, batch_size={model_batch_size}" 104 | ) 105 | try: 106 | self.model = SentenceTransformer(model_name) 107 | self._model_batch_size = model_batch_size 108 | logger.info(f"Successfully loaded SentenceTransformer model: {model_name}") 109 | except Exception as e: 110 | logger.error(f"Failed to load SentenceTransformer model {model_name}: {e}") 111 | raise 112 | 113 | @retry(wait=wait_fixed(3), stop=stop_after_attempt(3)) 114 | async def embed(self, texts: list[str]) -> list[list[float]]: 115 | if not texts: 116 | logger.debug("Empty text list provided, returning empty embeddings") 117 | return [] 118 | 119 | logger.info( 120 | f"Starting embedding of {len(texts)} texts using SentenceTransformer" 121 | ) 122 | 123 | # Create batches 124 | batches = _batch_texts(texts, self._model_batch_size) 125 | logger.debug( 126 | f"Split {len(texts)} texts into {len(batches)} batches of size {self._model_batch_size}" 127 | ) 128 | 129 | # Process all batches 130 | embeddings = [] 131 | try: 132 | for i, batch in enumerate(batches): 133 | logger.debug( 134 | f"Processing batch {i + 1}/{len(batches)} with {len(batch)} texts" 135 | ) 136 | batch_embeddings = self.model.encode(batch).tolist() 137 | embeddings.extend(batch_embeddings) 138 | logger.debug(f"Completed batch {i + 1}/{len(batches)}") 139 | 140 | logger.info( 141 | f"Successfully embedded {len(texts)} texts using SentenceTransformer, produced {len(embeddings)} embeddings" 142 | ) 143 | except Exception as e: 144 | logger.error(f"Failed to embed texts using SentenceTransformer: {e}") 145 | raise 146 | 147 | return embeddings 148 | -------------------------------------------------------------------------------- /kura/k_means.py: -------------------------------------------------------------------------------- 1 | from kura.base_classes 
import BaseClusteringMethod 2 | from sklearn.cluster import KMeans 3 | import math 4 | from typing import TypeVar 5 | import numpy as np 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | T = TypeVar("T") 11 | 12 | 13 | class KmeansClusteringMethod(BaseClusteringMethod): 14 | def __init__(self, clusters_per_group: int = 10): 15 | self.clusters_per_group = clusters_per_group 16 | logger.info( 17 | f"Initialized KmeansClusteringMethod with clusters_per_group={clusters_per_group}" 18 | ) 19 | 20 | def cluster(self, items: list[T]) -> dict[int, list[T]]: 21 | """ 22 | We perform a clustering here using an embedding defined on each individual item. 23 | 24 | We assume that the item is passed in as a dictionary with 25 | 26 | - its relevant embedding stored in the "embedding" key. 27 | - the item itself stored in the "item" key. 28 | 29 | { 30 | "embedding": list[float], 31 | "item": any, 32 | } 33 | """ 34 | if not items: 35 | logger.warning("Empty items list provided to cluster method") 36 | return {} 37 | 38 | logger.info(f"Starting K-means clustering of {len(items)} items") 39 | 40 | try: 41 | embeddings = [item["embedding"] for item in items] # pyright: ignore 42 | data: list[T] = [item["item"] for item in items] # pyright: ignore 43 | n_clusters = math.ceil(len(data) / self.clusters_per_group) 44 | 45 | logger.debug( 46 | f"Calculated {n_clusters} clusters for {len(data)} items (target: {self.clusters_per_group} items per cluster)" 47 | ) 48 | 49 | X = np.array(embeddings) 50 | logger.debug(f"Created embedding matrix of shape {X.shape}") 51 | 52 | kmeans = KMeans(n_clusters=n_clusters) 53 | cluster_labels = kmeans.fit_predict(X) 54 | 55 | logger.debug( 56 | f"K-means clustering completed, assigned {len(set(cluster_labels))} unique cluster labels" 57 | ) 58 | 59 | result = { 60 | i: [data[j] for j in range(len(data)) if cluster_labels[j] == i] 61 | for i in range(n_clusters) 62 | } 63 | 64 | # Log cluster size distribution 65 | cluster_sizes = [len(cluster_items) for cluster_items in result.values()] 66 | logger.info( 67 | f"K-means clustering completed: {len(result)} clusters created with sizes {cluster_sizes}" 68 | ) 69 | logger.debug( 70 | f"Cluster size stats - min: {min(cluster_sizes)}, max: {max(cluster_sizes)}, avg: {sum(cluster_sizes) / len(cluster_sizes):.1f}" 71 | ) 72 | 73 | return result 74 | 75 | except Exception as e: 76 | logger.error( 77 | f"Failed to perform K-means clustering on {len(items)} items: {e}" 78 | ) 79 | raise 80 | -------------------------------------------------------------------------------- /kura/static/dist/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 |{selectedCluster.name}
145 | {selectedCluster.description && ( 146 |147 | {selectedCluster.description} 148 |
149 | )} 150 |152 | Level: {selectedCluster.level} 153 |
154 |157 | ID: {selectedCluster.id} 158 |
159 | )} 160 |161 | {selectedCluster.chat_ids?.length} chats 162 |
163 | 164 | {/* Metadata summary section */} 165 |175 | ID: {item.chat_id} 176 |
177 |{item.summary}
178 |93 | Loaded in {conversations.length} conversations,{" "} 94 | {summaries?.length} summaries, {clusters?.length} clusters 95 |
96 | 99 |