├── .gitignore ├── requirements.txt ├── script.py ├── improvements.md ├── ideas.md ├── README.md ├── main.py ├── graphrag.py └── deepcrawl.py /.gitignore: -------------------------------------------------------------------------------- 1 | rag-app/node_modules/ 2 | rag-app/build/ 3 | rag-app/.env 4 | rag-app/.env.local 5 | rag-app/.next 6 | __pycache__ 7 | .env 8 | kg.json 9 | enhanced_kg.json 10 | grpc 11 | .cursor/rules 12 | output/ 13 | grpc/kg.json 14 | grpc/enhanced_kg.json 15 | ragtest/input 16 | ragtest 17 | grpc/node_modules 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Crawl4AI GraphRAG project dependencies 2 | 3 | # Crawling / Graph / NLP 4 | crawl4ai[all]>=0.6.0 5 | sentence-transformers>=3.1.1  # Updated for Python 3.13 support 6 | networkx>=3.4.1  # Updated 7 | scikit-learn>=1.5.2  # Updated for Python 3.13 8 | numpy>=1.26.4  # Updated (1.24.x doesn't support 3.13) 9 | summa>=1.2.0 10 | spacy>=3.7.0 11 | pandas>=2.2.3  # Updated 12 | 13 | # Google Gemini integration 14 | google-genai 15 | # Environment & utilities 16 | python-dotenv>=1.0.0 17 | 18 | # gRPC server & Protocol Buffers 19 | grpcio>=1.73.1 20 | protobuf 21 | torch>=2.0.0  # Required for sentence-transformers 22 | h5py 23 | arxiv>=2.0.0  # Used by script.py (arXiv search helper) 24 | # Note: Removed asyncio-run>=0.1.1 (invalid/unnecessary; use the built-in asyncio.run()) and hashlib (standard-library module, not a pip package) -------------------------------------------------------------------------------- /script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import arxiv 4 | 5 | def main(): 6 |     parser = argparse.ArgumentParser(description="Search arXiv from the command line") 7 |     parser.add_argument("query", type=str, help="Search query") 8 |     parser.add_argument("--max_results", type=int, default=5, help="Maximum number of results to fetch") 9 |     parser.add_argument("--sort", type=str, default="submittedDate", choices=["relevance", "lastUpdatedDate", "submittedDate"], help="Sort criterion") 10 |     parser.add_argument("--pdf", action="store_true", help="Print PDF links instead of abstract URLs") 11 | 12 |     args = parser.parse_args() 13 | 14 |     # Map string sort to enum 15 |     sort_map = { 16 |         "relevance": arxiv.SortCriterion.Relevance, 17 |         "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate, 18 |         "submittedDate": arxiv.SortCriterion.SubmittedDate, 19 |     } 20 | 21 |     search = arxiv.Search( 22 |         query=args.query, 23 |         max_results=args.max_results, 24 |         sort_by=sort_map[args.sort], 25 |     ) 26 | 27 |     for result in search.results(): 28 |         print("Title:", result.title) 29 |         print("Authors:", ", ".join(a.name for a in result.authors)) 30 |         print("Published:", result.published.date()) 31 |         print("Link:", result.pdf_url if args.pdf else result.entry_id) 32 |         print("Summary:", result.summary[:250].replace("\n", " "), "...\n") 33 | 34 | if __name__ == "__main__": 35 |     main() 36 | 37 | -------------------------------------------------------------------------------- /improvements.md: -------------------------------------------------------------------------------- 1 | # Docbook Improvement Plan 2 | 3 | This document outlines a comprehensive plan to improve the `docbook` tool, focusing on software engineering best practices, CLI experience, agent integration, and advanced GraphRAG capabilities. 4 | 5 | ## 1. 
Software Design & Architecture 6 | 7 | ### **Current Issues** 8 | - **Monolithic Scripts**: `deepcrawl.py` and `graphrag.py` mix logic, data definitions, and execution. 9 | - **Hardcoded Dependencies**: Service instantiation happens inside classes. 10 | - **Print Debugging**: Extensive use of `print()` instead of structured logging. 11 | - **Global State**: Reliance on global variables and top-level execution code. 12 | 13 | ### **Proposed Refactoring** 14 | 1. **Package Structure**: 15 | ```text 16 | docbook/ 17 | ├── src/ 18 | │ └── docbook/ 19 | │ ├── __init__.py 20 | │ ├── cli.py # Entry point (Typer) 21 | │ ├── config.py # Pydantic settings 22 | │ ├── core/ # Core logic 23 | │ │ ├── graph.py # Graph data structures 24 | │ │ └── storage.py # HDF5/Database IO 25 | │ ├── crawler/ # Crawling logic 26 | │ │ └── deepcrawl.py 27 | │ ├── rag/ # RAG & Retrieval logic 28 | │ │ ├── graphrag.py 29 | │ │ └── embeddings.py 30 | │ └── agent/ # Agentic interfaces 31 | │ └── tools.py 32 | ├── pyproject.toml # Dependency management 33 | └── README.md 34 | ``` 35 | 36 | 2. **Dependency Injection**: 37 | - Pass `LLMClient`, `EmbeddingModel`, and `Storage` instances into classes rather than creating them inside. 38 | 39 | 3. **Configuration Management**: 40 | - Use `pydantic-settings` to manage `.env` and CLI args. 41 | 42 | 4. **Logging**: 43 | - Replace `print` with standard `logging` or `structlog` for better observability. 44 | 45 | ## 2. CLI Experience (The "Good CLI" Goal) 46 | 47 | Switch from `argparse` to **Typer** or **Click** for a modern, composable CLI with rich help and auto-completion. 48 | 49 | ### **Proposed Commands** 50 | ```bash 51 | # Crawl a documentation site 52 | docbook crawl https://docs.example.com --name "example-docs" --depth 2 --output ./data 53 | 54 | # Query the knowledge graph via CLI 55 | docbook ask "How do I configure auth?" --kg ./data/example-docs_kg.h5 56 | 57 | # Start an API server for agents 58 | docbook serve --kg ./data/example-docs_kg.h5 --port 8000 59 | 60 | # Inspect graph stats 61 | docbook inspect ./data/example-docs_kg.h5 62 | ``` 63 | 64 | ## 3. Minimal Coding Agent 65 | 66 | To enable a "minimal coding agent that can fetch docs content," we need to expose the Knowledge Graph (KG) as a **Tool** that an LLM can call. 67 | 68 | ### **Design** 69 | 1. **Tool Interface**: Define a standard Python interface `DocsTool`: 70 | - `search(query: str) -> List[Node]` 71 | - `read_node(url: str) -> str` 72 | - `get_related(url: str) -> List[Node]` 73 | 74 | 2. **Agent Loop**: 75 | - Use a lightweight ReAct (Reasoning + Acting) loop. 76 | - **Prompt**: "You are a coding assistant. Use `search_docs` to find API references. Read the content before writing code." 77 | - **Implementation**: 78 | ```python 79 | class DocAgent: 80 | def __init__(self, kg_path): 81 | self.rag = load_graphrag(kg_path) 82 | 83 | def solve(self, task): 84 | # 1. Plan 85 | # 2. Call rag.retrieve_and_generate() or specific lookup tools 86 | # 3. Generate Code 87 | ``` 88 | 89 | ## 4. Advanced GraphRAG Architecture (Research-Backed) 90 | 91 | Current implementation uses a basic "Keyword + Semantic" retrieval on a tree structure. We can improve this with recent research findings. 92 | 93 | ### **A. 
Hybrid Construction (NLP + LLM)** 94 | Instead of relying solely on LLMs for extraction (slow/expensive) or simple TextRank (low context), use a hybrid approach: 95 | - **Fast Entity Extraction**: Use **GliNER** (Generalist Model for Named Entity Recognition) or **Spacy** to extract entities (Functions, Classes, Constants) from code blocks and text. 96 | - **LLM refinement**: Only use LLM to summarize complex relationships between high-level entities. 97 | 98 | ### **B. Community Summarization (Microsoft GraphRAG style)** 99 | - **Cluster Nodes**: Use **Leiden** or **Louvain** algorithms to detect communities of related nodes (e.g., "Authentication Module", "Database Drivers"). 100 | - **Hierarchical Summaries**: Generate summaries for these clusters. 101 | - **Retrieval**: Match query against *cluster summaries* first, then drill down to specific nodes. This answers "global" questions (e.g., "How is error handling structured?") better than simple similarity search. 102 | 103 | ### **C. Agentic Graph Traversal (Graph-of-Thoughts)** 104 | Instead of a single retrieval step: 105 | 1. **Start**: Search entry nodes (high similarity). 106 | 2. **Navigate**: The Agent sees the node's content and its *outgoing edges* (links). 107 | 3. **Decide**: The Agent decides whether to: 108 | - Stop and answer. 109 | - Follow a link ("This mentions `AuthConfig`, let me check that node"). 110 | - Backtrack. 111 | This mimics how a human reads documentation (following hyperlinks). 112 | 113 | ### **D. Improved Embedding Strategy** 114 | - **Code-Aware Embeddings**: Use models trained on code (e.g., `jina-embeddings-v2-base-code` or `unixcoder`) for code snippets, rather than generic text embeddings (`all-MiniLM`). 115 | - **Late Interaction (ColBERT)**: If performance allows, use ColBERT-style token-level interaction for higher precision fetching. 116 | 117 | ## 5. Implementation Roadmap 118 | 119 | 1. **Refactor**: Move current code into `src/` structure and switch to `Typer`. 120 | 2. **Upgrade Graph**: Modify `deepcrawl` to use **GliNER** for better entity tagging during crawl. 121 | 3. **Agent API**: Create the `DocsTool` class and a simple `docbook serve` endpoint. 122 | 4. **Advanced RAG**: Implement "Community Summarization" as a post-processing step after crawling. 123 | 124 | -------------------------------------------------------------------------------- /ideas.md: -------------------------------------------------------------------------------- 1 | ## Practical GraphRAG / KG-RAG Ideas (from arXiv) 2 | 3 | Each entry includes the paper link, brief info, how it works, and concrete integration ideas for this repo (see `deepcrawl.py`, `graphrag.py`, `main.py`). 4 | 5 | ### 1) LEGO-GraphRAG: Modularizing Graph-based Retrieval-Augmented Generation for Design Space Exploration 6 | [arXiv:2411.05844](http://arxiv.org/abs/2411.05844v3) 7 | - **info**: Proposes a modular GraphRAG framework enabling plug-and-play components and ablations across stages (indexing, retrieval, reasoning). 8 | - **how it works**: Decouples graph construction, retrieval, expansion, and generation with configuration-driven choices per module. 9 | - **how to integrate**: 10 | - Refactor `graphrag.py` `GraphRAGSystem` into clear modules/interfaces: `Retriever`, `GraphExpander`, `Reranker`, `AnswerGenerator` with a config in `main.py`. 11 | - Add a registry pattern to toggle between keyword vs embedding retrieval in `_find_relevant_urls()` and neighbor strategies in `_expand_context_with_graph()`. 
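Below is a minimal sketch of what such a registry could look like, assuming the component names proposed above (`Retriever`, `GraphExpander`, ...) and reusing the keyword index and per-node embeddings that `GraphRAGSystem` already builds. None of these classes exist in the repo yet; the names, signatures, and weights are illustrative.

```python
# Hypothetical module, e.g. docbook/rag/registry.py
from typing import Dict, List, Protocol, Type

import numpy as np


class Retriever(Protocol):
    def retrieve(self, query: str, top_k: int) -> List[str]:
        """Return candidate node URLs for a query."""


RETRIEVER_REGISTRY: Dict[str, Type] = {}


def register_retriever(name: str):
    """Class decorator that registers a Retriever implementation under a config name."""
    def wrap(cls):
        RETRIEVER_REGISTRY[name] = cls
        return cls
    return wrap


@register_retriever("keyword")
class KeywordRetriever:
    """Wraps the existing keyword index (keyword -> list of URLs)."""
    def __init__(self, keyword_index: Dict[str, List[str]]):
        self.keyword_index = keyword_index

    def retrieve(self, query: str, top_k: int) -> List[str]:
        scores: Dict[str, float] = {}
        for word in query.lower().split():
            for url in self.keyword_index.get(word, []):
                scores[url] = scores.get(url, 0.0) + 1.0
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [url for url, _ in ranked[:top_k]]


@register_retriever("embedding")
class EmbeddingRetriever:
    """Cosine similarity over precomputed node embeddings (url -> vector)."""
    def __init__(self, embedder, node_embeddings: Dict[str, List[float]]):
        self.embedder = embedder            # e.g. a SentenceTransformer instance
        self.node_embeddings = node_embeddings

    def retrieve(self, query: str, top_k: int) -> List[str]:
        q = np.asarray(self.embedder.encode(query), dtype=float).ravel()
        scored = []
        for url, emb in self.node_embeddings.items():
            v = np.asarray(emb, dtype=float).ravel()
            if v.size != q.size or not v.any():
                continue  # skip missing or dimension-mismatched embeddings
            sim = float(q @ v / (np.linalg.norm(q) * np.linalg.norm(v) + 1e-9))
            scored.append((url, sim))
        scored.sort(key=lambda x: x[1], reverse=True)
        return [url for url, _ in scored[:top_k]]


def build_retriever(name: str, **kwargs) -> Retriever:
    """Config-driven construction, e.g. build_retriever("keyword", keyword_index=...)."""
    return RETRIEVER_REGISTRY[name](**kwargs)
```

With a registry like this, `_find_relevant_urls()` reduces to calling one or more retrievers chosen from the config in `main.py`, which is exactly the plug-and-play ablation setup LEGO-GraphRAG argues for.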
12 | 13 | ### 2) PolyG: Effective and Efficient GraphRAG with Adaptive Graph Traversal 14 | [arXiv:2504.02112](http://arxiv.org/abs/2504.02112v1) 15 | - **info**: Introduces adaptive traversal that selects expansion paths per query and budget, outperforming fixed BFS/DFS. 16 | - **how it works**: Learns/uses heuristics (e.g., keyword overlap, semantic similarity, node centrality) to choose next-hop neighbors under a token/time budget. 17 | - **how to integrate**: 18 | - In `GraphRAGSystem._expand_context_with_graph`, add a traversal policy that ranks neighbors by a weighted score of (cosine similarity, common keywords, degree/centrality, depth penalty). 19 | - Add a retrieval budget (tokens or nodes) in `main.py` args; stop expansion when budget is hit. 20 | 21 | ### 3) KG-Infused RAG: Augmenting Corpus-Based RAG with External Knowledge Graphs 22 | [arXiv:2506.09542](http://arxiv.org/abs/2506.09542v1) 23 | - **info**: Fuses unstructured retrieval with external KGs to improve grounding and coverage. 24 | - **how it works**: Entity/link detection connects text chunks to KG nodes; combines textual evidence with KG triples during retrieval and generation. 25 | - **how to integrate**: 26 | - In `deepcrawl.py`, add optional entity linking step (e.g., spaCy NER + simple Wikidata lookup) storing `entities` per node. 27 | - Extend `GraphRAGSystem._find_relevant_urls` to also retrieve from linked KG neighbors (stored as metadata) and merge with vector/keyword scores. 28 | 29 | ### 4) KET-RAG: A Cost-Efficient Multi-Granular Indexing Framework for Graph-RAG 30 | [arXiv:2502.09304](http://arxiv.org/abs/2502.09304v2) 31 | - **info**: Reduces cost via multi-granularity indices and routing, querying fine-grained units only when needed. 32 | - **how it works**: Two-stage retrieval: coarse (page/section) → fine (paragraph/snippet) with gating. 33 | - **how to integrate**: 34 | - During crawl, store multi-granularity chunks (page → section → paragraph) and precompute embeddings per level. 35 | - Implement two-stage retrieval in `GraphRAGSystem`: first rank pages/sections; only embed/expand into paragraphs for top-K seeds. 36 | 37 | ### 5) Walk&Retrieve: Zero-shot RAG via Knowledge Graph Walks 38 | [arXiv:2505.16849](http://arxiv.org/abs/2505.16849v2) 39 | - **info**: Uses guided random walks on the KG to gather compact, relevant subgraphs without supervision. 40 | - **how it works**: Starts from seed nodes derived from query terms/entities; performs biased walks to collect paths as context. 41 | - **how to integrate**: 42 | - Add `walk_based_retrieval(query, seeds, steps, bias)` that samples paths from the NetworkX graph in `graphrag.py`. 43 | - Use keyword/embedding similarity as transition bias; materialize unique nodes/edges from visited paths into the context. 44 | 45 | ### 6) Empowering GraphRAG with Knowledge Filtering and Integration 46 | [arXiv:2503.13804](http://arxiv.org/abs/2503.13804v1) 47 | - **info**: Improves GraphRAG by filtering noisy knowledge and integrating signals before generation. 48 | - **how it works**: Node/edge quality estimation; prune or down-weight low-signal parts; integrate multi-source knowledge consistently. 49 | - **how to integrate**: 50 | - Compute and store a `quality_score` per node/edge (signals: text length, dedup %, similarity to root domain, outbound degree anomalies). 51 | - During retrieval and expansion, weight scores by `quality_score`; drop nodes below a threshold. 
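A rough sketch of the node-level `quality_score` described above, written against the `Graph`/`GraphNode` dataclasses from `deepcrawl.py`; the signal weights and the 0.25 drop threshold are illustrative placeholders to tune, not values taken from the paper.

```python
# Hypothetical helper, e.g. docbook/core/quality.py
import math
from collections import Counter
from typing import Dict, Set


def node_quality_scores(graph) -> Dict[str, float]:
    """Score every crawled page in a deepcrawl Graph on a rough 0..1 quality scale."""
    # How often each keyword appears across the whole crawl (proxy for boilerplate / dedup %).
    keyword_freq = Counter(kw for n in graph.nodes.values() for kw in (n.keywords or []))

    scores: Dict[str, float] = {}
    for url, node in graph.nodes.items():
        # Signal 1: enough text to be useful (saturates around ~2000 characters).
        length_signal = min(len(node.content or "") / 2000.0, 1.0)

        # Signal 2: keyword distinctiveness — pages whose keywords also appear on many
        # other pages are likely navigation chrome or boilerplate.
        if node.keywords:
            avg_spread = sum(keyword_freq[kw] for kw in node.keywords) / len(node.keywords)
            dedup_signal = 1.0 / (1.0 + math.log1p(max(avg_spread - 1.0, 0.0)))
        else:
            dedup_signal = 0.0

        # Signal 3: outbound-degree anomaly — huge hub/index pages get down-weighted.
        degree_signal = 1.0 / (1.0 + max(len(node.children) - 20, 0) / 20.0)

        scores[url] = 0.5 * length_signal + 0.3 * dedup_signal + 0.2 * degree_signal
    return scores


def high_quality_urls(graph, threshold: float = 0.25) -> Set[str]:
    """URLs that pass the threshold; retrieval and expansion can be restricted to these."""
    return {url for url, s in node_quality_scores(graph).items() if s >= threshold}
```

In `graphrag.py`, the combined score in `_find_relevant_urls()` and the neighbor selection in `_expand_context_with_graph()` could then be weighted or filtered by these scores.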
52 | 53 | ### 7) When to use Graphs in RAG: A Comprehensive Analysis 54 | [arXiv:2506.05690](http://arxiv.org/abs/2506.05690v1) 55 | - **info**: Provides criteria for when GraphRAG beats vanilla RAG (e.g., multi-hop, hierarchical, entity-rich queries). 56 | - **how it works**: Empirical analysis across tasks shows benefits w.r.t. structure and reasoning depth. 57 | - **how to integrate**: 58 | - Add a routing policy in `main.py`: detect entity density, query length, and estimated hop depth; choose between vector-only, graph-only, or hybrid pipeline. 59 | - Log decisions to compare outcomes across policies. 60 | 61 | ### 8) GraphRAG under Fire 62 | [arXiv:2501.14050](http://arxiv.org/abs/2501.14050v3) 63 | - **info**: Evaluates GraphRAG robustness against poisoning/attacks; proposes defenses. 64 | - **how it works**: Identifies attack surfaces (malicious pages, link spam) and mitigation (provenance, trust scoring, anomaly detection). 65 | - **how to integrate**: 66 | - Track provenance per node (crawl timestamp, domain, referrer). Add a `trust_score` combining domain whitelist/blacklist and anomaly scores. 67 | - Filter/rerank retrieval by `trust_score`; add simple content sanitization (strip scripts/iframes) in `deepcrawl.py` extraction. 68 | 69 | ### 9) GraphRAG-Bench: Challenging Domain-Specific Reasoning Benchmark 70 | [arXiv:2506.02404](http://arxiv.org/abs/2506.02404v3) 71 | - **info**: Benchmark focusing on domain-specific, multi-step reasoning for GraphRAG. 72 | - **how it works**: Curates tasks requiring graph traversal, cross-page synthesis, and hierarchical context. 73 | - **how to integrate**: 74 | - Create a small benchmark YAML/JSON in `output/bench/` with question → gold citations → expected hops from your crawled docs. 75 | - Add `--eval bench.json` flag in `main.py` to run batch evaluation and record metrics (EM, F1, citation-precision). 76 | 77 | ### 10) Know3-RAG: Knowledge-aware RAG with Adaptive Retrieval, Generation, and Filtering 78 | [arXiv:2505.12662](http://arxiv.org/abs/2505.12662v1) 79 | - **info**: Iterative pipeline adaptively retrieves, generates, and filters content to reduce hallucinations. 80 | - **how it works**: Uses a control loop: retrieve → generate → evidence-check → refine retrieval. 81 | - **how to integrate**: 82 | - Add an optional iterative loop in `GraphRAGSystem.retrieve_and_generate`: after first answer, run a verifier that checks citation span overlap; if low, trigger another retrieval round with adjusted seeds. 83 | - Introduce a lightweight reranker (e.g., cosine + keyword + trust score) before generation. 84 | 85 | --- 86 | 87 | ### General quick wins for this codebase 88 | - **Budgeted expansion**: Add `--token_budget` and trim contexts by highest utility-per-token. 89 | - **Neighbor selection**: Blend cosine similarity with `common_keywords` and centrality for picking neighbors. 90 | - **Provenance & trust**: Store `provenance`, `trust_score` in HDF5; use in ranking. 91 | - **Two-stage retrieval**: Coarse (page/section) → fine (paragraph) to cut cost. 92 | - **Evaluation harness**: Batch mode with saved prompts/contexts for reproducibility and metric logging. 
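As a concrete starting point for the "Neighbor selection" and "Budgeted expansion" items, here is a sketch of a blended neighbor-ranking helper that could replace the current "first 5 neighbors" rule in `GraphRAGSystem._expand_context_with_graph()`. The weights and the use of degree centrality are illustrative choices, not something the papers above prescribe.

```python
# Illustrative helper; assumes the NetworkX DiGraph built by GraphRAGSystem,
# whose edges carry 'semantic_similarity' and 'common_keywords' attributes
# and whose nodes carry a 'depth' attribute.
from typing import List

import networkx as nx


def rank_neighbors(knowledge_graph: nx.DiGraph, seed_url: str, seed_keywords: List[str],
                   max_neighbors: int = 5, centrality=None) -> List[str]:
    """Rank a seed node's outgoing neighbors by edge similarity, keyword overlap,
    centrality, and a small depth penalty."""
    if centrality is None:
        # Cheap global signal; in practice compute this once per query, not per seed.
        centrality = nx.degree_centrality(knowledge_graph)

    seed_kw = {k.lower() for k in (seed_keywords or [])}
    scored = []
    for neighbor in knowledge_graph.neighbors(seed_url):
        edge = knowledge_graph.get_edge_data(seed_url, neighbor) or {}
        sim = float(edge.get("semantic_similarity", 0.0))
        overlap = len(set(edge.get("common_keywords", [])) & seed_kw)
        depth_penalty = 0.05 * int(knowledge_graph.nodes[neighbor].get("depth", 0))
        score = 1.5 * sim + 1.0 * overlap + 0.5 * centrality.get(neighbor, 0.0) - depth_penalty
        scored.append((neighbor, score))

    scored.sort(key=lambda x: x[1], reverse=True)
    return [n for n, _ in scored[:max_neighbors]]
```

Combined with the `token_budget` accounting that already exists in `_expand_context_with_graph()`, this yields budgeted expansion that spends tokens on the highest-utility neighbors first.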
93 | 94 | 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # DocBook: GraphRAG Documentation Assistant 3 | [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/Saarthakkj/docbook) 4 | 5 | 6 | A Python-based system that crawls documentation websites using Crawl4AI, builds a knowledge graph with embeddings and relationships, and provides intelligent question-answering capabilities using Retrieval-Augmented Generation (RAG) powered by Google's Gemini API. 7 | 8 | ## Project Overview 9 | 10 | DocBook is designed to create an intelligent assistant for any documentation website. It performs a deep crawl of the target site, extracts content, builds a structured knowledge graph with semantic embeddings, and enables natural language querying of the documentation using an enhanced GraphRAG implementation. 11 | 12 | Key technical aspects: 13 | - **Crawling**: Uses Crawl4AI for asynchronous, depth-limited crawling with content extraction. 14 | - **Graph Construction**: Builds a hierarchical graph with nodes representing pages, including metadata like depth, keywords (extracted via TextRank), and embeddings (using Sentence Transformers). 15 | - **RAG System**: Combines keyword matching, semantic similarity (cosine similarity on embeddings), and graph traversal for context retrieval, followed by generation using Gemini API. 16 | - **Persistence**: Saves knowledge graphs as HDF5 for efficient storage and reuse, avoiding repeated crawls. 17 | 18 | ## Features 19 | 20 | - **Asynchronous Deep Crawling**: BFS-based crawling with configurable max depth and pages, skipping 404 errors. 21 | - **Keyword Extraction**: Uses TextRank (from summa library) with fallback to frequency-based method for robustness. 22 | - **Embeddings**: Generates vector embeddings using 'all-MiniLM-L6-v2' model from Sentence Transformers. 23 | - **Knowledge Graph**: Directed graph with NAVIGATES_TO relationships, including common keywords and semantic similarity scores. 24 | - **Multi-Method Retrieval**: Combines keyword indexing, semantic similarity, and graph expansion for relevant context. 25 | - **Interactive Q&A**: Command-line interface for querying with 'quit' to exit. 26 | - **HDF5 Storage**: Efficient binary format for storing large graphs with metadata, embeddings, and hierarchical structure. 27 | 28 | ## Architecture and Technical Details 29 | 30 | ### Core Components 31 | 32 | 1. **deepcrawl.py**: 33 | - **GraphNode dataclass**: Represents nodes with url, content, depth, keywords, embedding, and children list. 34 | - **Graph dataclass**: Contains nodes dict, edges list, and metadata dict with total_nodes, max_depth, root_url. 35 | - **Deep crawling**: Uses BFS strategy with AsyncWebCrawler, configurable depth/page limits. 36 | - **Keyword extraction**: Primary method uses TextRank via `extract_keywords_textrank()`, fallback to frequency-based analysis. 37 | - **Embeddings**: Creates embeddings using SentenceTransformer('all-MiniLM-L6-v2'). 38 | - **Edge computation**: Calculates semantic similarity as ratio of common keywords between parent-child nodes. 39 | - **HDF5 persistence**: `save_graph_hdf5()` stores graph in hierarchical format with metadata, nodes (with embeddings/keywords), and edges. 40 | - **Graph utilities**: `print_graph_structure()` for visualization, `find_parent_node()` for tree traversal. 41 | 42 | 2. 
**graphrag.py**: 43 | - **Entity**: Dataclass with source_urls, keywords, content_snippet, embedding, depth. 44 | - **Relationship**: Dataclass with source, target, source_urls, common_keywords, semantic_similarity. 45 | - **GraphRAGSystem**: Main RAG class with: 46 | - Initialization with Graph object and Gemini API key 47 | - SentenceTransformer model loading for embeddings 48 | - NetworkX DiGraph construction for graph operations 49 | - Keyword index (defaultdict) mapping keywords to URL lists 50 | - `load_from_kg_json()`: Creates entities from GraphNodes, relationships from edges 51 | - `retrieve_and_generate()`: Main query method combining retrieval and generation 52 | - `_find_relevant_urls()`: Multi-score ranking (keyword overlap + cosine similarity) 53 | - `_expand_context_with_graph()`: Graph traversal to add neighbor nodes (top 5 per seed) 54 | - `_generate_enhanced_answer()`: Constructs detailed prompt with context sections for LLM 55 | 56 | 3. **main.py**: 57 | - **CLI argument parsing**: Required args (--url, --output_dir, --name), optional (--max_depth, --max_pages). 58 | - **Graph persistence check**: Looks for existing `{name}_kg.h5` file to avoid re-crawling. 59 | - **Crawling workflow**: If no existing graph, runs `deepcrawl.deep_crawl()` and saves with `save_graph_hdf5()`. 60 | - **Graph loading**: Uses `load_graph_hdf5()` to reconstruct Graph object from HDF5. 61 | - **RAG initialization**: Creates GraphRAGSystem from loaded graph. 62 | - **Interactive loop**: Continuous Q&A interface with error handling and 'quit' command. 63 | - **Debug utilities**: `debug_save_process()` and `inspect_saved_graph()` for troubleshooting. 64 | 65 | ### How It Works (Step-by-Step) 66 | 67 | 1. **Crawling Phase**: 68 | - BFS traversal starting from root URL with depth/page limits 69 | - Content extraction to markdown, keyword extraction via TextRank 70 | - Embedding generation for each page's content 71 | - Parent-child relationship establishment with similarity scoring 72 | 73 | 2. **Graph Construction**: 74 | - Nodes stored as GraphNode objects with full content and metadata 75 | - Edges contain semantic similarity scores and common keywords 76 | - HDF5 storage with hierarchical structure: /metadata, /nodes, /nodes_index, /edges 77 | 78 | 3. **RAG System Loading**: 79 | - Graph reconstruction from HDF5 into memory 80 | - Entity creation from nodes, relationship mapping from edges 81 | - NetworkX graph building for efficient traversal 82 | - Keyword index construction for fast lookup 83 | 84 | 4. **Query Processing**: 85 | - Keyword matching and embedding similarity scoring 86 | - Graph expansion to include relevant neighbors 87 | - Context compilation with entities, relationships, and content snippets 88 | - LLM generation with structured prompt including instructions and context 89 | 90 | ## Installation 91 | 92 | ### Prerequisites 93 | - Python 3.8+ 94 | - uv (fast Python package manager and environment tool) 95 | - Google Gemini API key (from https://ai.dev/apikey) 96 | 97 | ### Steps (using uv) 98 | 1. Clone the repository: 99 | ``` 100 | git clone https://github.com/your-org/docbook.git 101 | cd docbook 102 | ``` 103 | 104 | 2. Install uv (if not already installed): 105 | ``` 106 | curl -LsSf https://astral.sh/uv/install.sh | sh 107 | ``` 108 | 109 | 3. Create and activate a virtual environment: 110 | ``` 111 | uv venv .venv 112 | . .venv/bin/activate 113 | ``` 114 | 115 | 4. Install dependencies with uv: 116 | ``` 117 | uv pip install -r requirements.txt 118 | ``` 119 | 120 | 5. 
Set environment variables in `.env`: 121 | ``` 122 | gemini_api_key=your_api_key_here 123 | ``` 124 | 125 | ## Usage 126 | 127 | Run a crawl and interactive query session (first run creates the HDF5 knowledge graph, later runs reuse it): 128 | 129 | ``` 130 | python main.py \ 131 | --url https://docs.crawl4ai.com/ \ 132 | --output_dir ./output \ 133 | --name crawl4ai_docs \ 134 | --max_depth 3 \ 135 | --max_pages 50 136 | ``` 137 | 138 | - Output file: `./output/crawl4ai_docs_kg.h5`. 139 | - Subsequent runs with the same `--name` and `--output_dir` load the graph directly (no re-crawl). 140 | - In the prompt, type `quit` to exit. 141 | 142 | Notes: 143 | - The environment variable name is lowercase and case-sensitive: `gemini_api_key`. 144 | - Embedding model: `'all-MiniLM-L6-v2'` via Sentence Transformers. 145 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from deepcrawl import GraphNode , Graph 2 | import deepcrawl 3 | import asyncio 4 | import argparse 5 | import sys 6 | import subprocess 7 | import os 8 | import json 9 | import os 10 | from dotenv import load_dotenv 11 | from graphrag import GraphRAGSystem, create_graphrag 12 | from deepcrawl import save_graph_hdf5 13 | 14 | load_dotenv() 15 | 16 | 17 | 18 | def parse_arguments(): 19 | '''parse command line arguments''' 20 | 21 | parser = argparse.ArgumentParser( 22 | description= "GraphRAG implementation using Crawl4AI" , 23 | formatter_class = argparse.RawDescriptionHelpFormatter , 24 | epilog=""" 25 | Examples : 26 | # Quick start with default settings 27 | python main.py --url https://docs.crawl4ai.com/ 28 | 29 | #Custom crawling parameters 30 | python main.py --url https://crawl4ai.com/ --max_depth 3 max_pages 50 31 | """ 32 | ) 33 | 34 | 35 | parser = argparse.ArgumentParser(description = "pass multiple varirables") 36 | parser.add_argument("--max_depth" , type = int , help = "maximum depth of pages") 37 | parser.add_argument ("--max_pages" , type = int , help = "maximum number of pages") 38 | 39 | parser.add_argument("--url" , type = str , required = True, help = "doc url") 40 | parser.add_argument("--output_dir" , type = str , required = True, help = "output directory") 41 | 42 | parser.add_argument("--name" , type = str , required = True, help = "name of your documentation") 43 | parser.add_argument("--token_budget", type=int, default=None, help="Approximate token budget for context assembly") 44 | 45 | 46 | return parser.parse_args() 47 | 48 | 49 | async def run_deepcrawl(url : str , max_depth : int , max_pages : int ) -> Graph: 50 | """run deepcrawl.py with specified parameters""" 51 | 52 | 53 | return await deepcrawl.main(url , max_depth , max_pages ) 54 | 55 | import json 56 | import os 57 | 58 | def inspect_saved_graph(filepath: str): 59 | """Inspect what's actually in the saved JSON file""" 60 | print(f"🔍 Inspecting saved graph: {filepath}") 61 | 62 | if not os.path.exists(filepath): 63 | print("❌ File doesn't exist") 64 | return 65 | 66 | file_size = os.path.getsize(filepath) 67 | print(f"📏 File size: {file_size} bytes") 68 | 69 | try: 70 | with open(filepath, 'r', encoding='utf-8') as f: 71 | data = json.load(f) 72 | 73 | print("📊 JSON structure analysis:") 74 | print(f" 🗂️ Top-level keys: {list(data.keys())}") 75 | 76 | if 'metadata' in data: 77 | print(f" 📈 Metadata: {data['metadata']}") 78 | 79 | if 'nodes' in data: 80 | node_count = len(data['nodes']) 81 | print(f" 🌐 Nodes count: 
{node_count}") 82 | 83 | if node_count > 0: 84 | # Show first few node keys 85 | node_keys = list(data['nodes'].keys())[:3] 86 | print(f" 🔑 Sample node keys: {node_keys}") 87 | 88 | # Show structure of first node 89 | if node_keys: 90 | first_node = data['nodes'][node_keys[0]] 91 | print(f" 📝 First node structure: {list(first_node.keys())}") 92 | print(f" 📄 First node content length: {len(first_node.get('content', ''))}") 93 | print(f" 🏷️ First node keywords count: {len(first_node.get('keywords', []))}") 94 | else: 95 | print(" ⚠️ No nodes found!") 96 | 97 | if 'edges' in data: 98 | edge_count = len(data['edges']) 99 | print(f" 🔗 Edges count: {edge_count}") 100 | 101 | if edge_count > 0: 102 | print(f" 🔗 First edge: {data['edges'][0]}") 103 | else: 104 | print(" ⚠️ No edges found!") 105 | 106 | return data 107 | 108 | except json.JSONDecodeError as e: 109 | print(f"❌ JSON decode error: {e}") 110 | except Exception as e: 111 | print(f"❌ Error reading file: {e}") 112 | 113 | def debug_graph_before_save(graph): 114 | """Debug the graph object before saving""" 115 | print("🔍 Debugging graph object before save:") 116 | print(f" 🌐 Root URL: {graph.url}") 117 | print(f" 📊 Root depth: {graph.depth}") 118 | print(f" 📄 Root content length: {len(graph.content) if graph.content else 0}") 119 | print(f" 🔗 Root children count: {len(graph.children)}") 120 | 121 | # Traverse and count all nodes 122 | all_nodes = [] 123 | stack = [graph] 124 | while stack: 125 | node = stack.pop() 126 | all_nodes.append(node) 127 | stack.extend(node.children) 128 | 129 | print(f" 🌳 Total nodes in tree: {len(all_nodes)}") 130 | 131 | # Show depth distribution 132 | depth_counts = {} 133 | for node in all_nodes: 134 | depth_counts[node.depth] = depth_counts.get(node.depth, 0) + 1 135 | 136 | print(f" 📊 Depth distribution: {depth_counts}") 137 | 138 | # Show some sample URLs 139 | sample_urls = [node.url for node in all_nodes[:5]] 140 | print(f" 🔗 Sample URLs: {sample_urls}") 141 | 142 | # Check for empty content 143 | empty_content_count = sum(1 for node in all_nodes if not node.content or len(node.content.strip()) == 0) 144 | print(f" ⚠️ Nodes with empty content: {empty_content_count}/{len(all_nodes)}") 145 | 146 | return all_nodes 147 | 148 | def debug_save_process(graph, filepath: str): 149 | """Debug the entire save process step by step""" 150 | print("🚀 Starting comprehensive save debug...") 151 | 152 | # 1. Debug input graph 153 | all_nodes = debug_graph_before_save(graph) 154 | 155 | if len(all_nodes) == 0: 156 | print("❌ PROBLEM: Graph has no nodes!") 157 | return 158 | 159 | if len(all_nodes) == 1: 160 | print("⚠️ WARNING: Graph has only root node (no children crawled)") 161 | 162 | # 2. Test keyword extraction on a sample 163 | print("\n🔍 Testing keyword extraction...") 164 | try: 165 | sample_node = all_nodes[0] 166 | if hasattr(sample_node, 'content') and sample_node.content: 167 | print(f" 📝 Sample content length: {len(sample_node.content)}") 168 | # Test if extract_keywords_textrank function exists and works 169 | try: 170 | keywords = extract_keywords_textrank(sample_node.content, 5) # Assuming top_k=5 171 | print(f" ✅ Keywords extracted: {keywords}") 172 | except NameError: 173 | print(" ❌ extract_keywords_textrank function not defined!") 174 | except Exception as e: 175 | print(f" ❌ Keyword extraction failed: {e}") 176 | else: 177 | print(" ⚠️ Sample node has no content") 178 | except Exception as e: 179 | print(f" ❌ Error testing keywords: {e}") 180 | 181 | # 3. 
Test common keywords function 182 | print("\n🔍 Testing common keywords function...") 183 | try: 184 | test_result = get_common_keywords(['test', 'word'], ['test', 'another']) 185 | print(f" ✅ get_common_keywords works: {test_result}") 186 | except NameError: 187 | print(" ❌ get_common_keywords function not defined!") 188 | except Exception as e: 189 | print(f" ❌ get_common_keywords failed: {e}") 190 | 191 | # 4. Now try the actual save 192 | print(f"\n💾 Attempting save to: {filepath}") 193 | try: 194 | save_graph(graph, filepath) 195 | 196 | # 5. Inspect what was actually saved 197 | print("\n🔍 Inspecting saved result...") 198 | inspect_saved_graph(filepath) 199 | 200 | except Exception as e: 201 | print(f"❌ Save failed: {e}") 202 | import traceback 203 | traceback.print_exc() 204 | 205 | import h5py 206 | import numpy as np 207 | 208 | 209 | def load_graph_hdf5(filepath: str) -> Graph: 210 | """Load the knowledge graph from HDF5 and reconstruct the GraphNode tree.""" 211 | 212 | with h5py.File(filepath, "r") as f: 213 | # Load metadata 214 | metadata = {} 215 | if "metadata" in f: 216 | meta_grp = f["metadata"] 217 | for k in meta_grp.attrs: 218 | metadata[k] = meta_grp.attrs[k] 219 | root_url = metadata.get("root_url") 220 | if not root_url: 221 | raise ValueError("No root_url found in metadata") 222 | 223 | # Build an ID -> URL map and load nodes (url -> GraphNode) 224 | id_to_url = {} 225 | if "nodes_index" in f: 226 | # Preferred path: use the explicit index 227 | idx = f["nodes_index"][:] 228 | for row in idx: 229 | rid = row["id"].decode("utf-8") if isinstance(row["id"], (bytes, bytearray)) else str(row["id"]) 230 | rurl = row["url"].decode("utf-8") if isinstance(row["url"], (bytes, bytearray)) else str(row["url"]) 231 | id_to_url[rid] = rurl 232 | 233 | node_map = {} 234 | if "nodes" not in f: 235 | raise ValueError("No nodes found in HDF5 file.") 236 | nodes_grp = f["nodes"] 237 | for node_group_name in nodes_grp: 238 | node_grp = nodes_grp[node_group_name] 239 | # Recover URL and depth 240 | raw_url = node_grp.attrs.get("url", "") 241 | url = raw_url.decode("utf-8") if isinstance(raw_url, (bytes, bytearray)) else str(raw_url) 242 | raw_depth = node_grp.attrs.get("depth", 0) 243 | depth = int(raw_depth) 244 | 245 | # Datasets: content, keywords, embedding 246 | content = "" 247 | if "content" in node_grp: 248 | raw_content = node_grp["content"][()] 249 | content = raw_content.decode("utf-8") if isinstance(raw_content, (bytes, bytearray)) else str(raw_content) 250 | 251 | keywords = [] 252 | if "keywords" in node_grp: 253 | kw_arr = node_grp["keywords"][:] 254 | for kw in kw_arr: 255 | if isinstance(kw, (bytes, bytearray)): 256 | kw = kw.decode("utf-8", errors="ignore") 257 | kw = str(kw) 258 | if kw: 259 | keywords.append(kw) 260 | 261 | embedding = None 262 | if "embedding" in node_grp: 263 | emb = node_grp["embedding"][:] 264 | try: 265 | embedding = emb.astype(float).tolist() 266 | except Exception: 267 | embedding = [float(x) for x in emb] 268 | 269 | node = GraphNode( 270 | url=url, 271 | content=content, 272 | depth=depth, 273 | keywords=keywords, 274 | embedding=embedding, 275 | ) 276 | node_map[url] = node 277 | 278 | # Load and apply edges to link children 279 | graph = Graph(nodes=node_map, edges=[], metadata=metadata) 280 | if "edges" in f: 281 | edges_dataset = f["edges"][:] 282 | for edge in edges_dataset: 283 | # Resolve IDs back to URLs 284 | raw_sid = edge["source_id"] 285 | raw_tid = edge["target_id"] 286 | sid = raw_sid.decode("utf-8") if isinstance(raw_sid, 
(bytes, bytearray)) else str(raw_sid) 287 | tid = raw_tid.decode("utf-8") if isinstance(raw_tid, (bytes, bytearray)) else str(raw_tid) 288 | source_url = id_to_url.get(sid, None) 289 | target_url = id_to_url.get(tid, None) 290 | if source_url is None or target_url is None: 291 | # Fallback: try to find URLs by scanning node groups' attrs if index is missing 292 | if not id_to_url: 293 | # Attempt to rebuild id_to_url from group names (n_) and attrs 294 | for ngn in nodes_grp: 295 | ngr = nodes_grp[ngn] 296 | rid = ngn[2:] if ngn.startswith("n_") else ngn 297 | rurl = ngr.attrs.get("url", "") 298 | rurl = rurl.decode("utf-8") if isinstance(rurl, (bytes, bytearray)) else str(rurl) 299 | id_to_url[rid] = rurl 300 | source_url = id_to_url.get(sid) 301 | target_url = id_to_url.get(tid) 302 | sim = float(edge["semantic_similarity"]) if "semantic_similarity" in edge.dtype.names else 0.0 303 | raw_ckw = edge["common_keywords"] if "common_keywords" in edge.dtype.names else b"" 304 | if isinstance(raw_ckw, (bytes, bytearray)): 305 | common_kw_str = raw_ckw.decode("utf-8", errors="ignore") 306 | else: 307 | common_kw_str = str(raw_ckw) 308 | common_kw = common_kw_str.split(',') if common_kw_str else [] 309 | 310 | if source_url in node_map and target_url in node_map: 311 | node_map[source_url].children.append(node_map[target_url]) 312 | 313 | graph.edges.append({ 314 | 'source': source_url if source_url is not None else '', 315 | 'target': target_url if target_url is not None else '', 316 | 'common_keywords': common_kw, 317 | 'semantic_similarity': sim 318 | }) 319 | return graph 320 | 321 | 322 | def print_graph_nodes_sample(graph: Graph, limit: int = 10): 323 | """Print a small sample of nodes to verify that the loaded graph is not empty.""" 324 | try: 325 | total_nodes = len(graph.nodes) if hasattr(graph, 'nodes') and graph.nodes is not None else 0 326 | total_edges = len(graph.edges) if hasattr(graph, 'edges') and graph.edges is not None else 0 327 | except Exception: 328 | total_nodes = 0 329 | total_edges = 0 330 | 331 | if total_nodes == 0: 332 | print("⚠️ Graph appears to have no nodes.") 333 | return 334 | 335 | print(f"🌐 Total nodes: {total_nodes}") 336 | print(f"🔗 Total edges: {total_edges}") 337 | sample_urls = list(graph.nodes.keys())[:limit] 338 | print(f"🔎 Sample of {len(sample_urls)} node URLs: {sample_urls}") 339 | for url in sample_urls: 340 | node = graph.nodes[url] 341 | depth = getattr(node, 'depth', None) 342 | kw_count = len((getattr(node, 'keywords', []) or [])) 343 | content_len = len((getattr(node, 'content', '') or '')) 344 | print(f" - {url} (depth={depth}, keywords={kw_count}, content_len={content_len})") 345 | 346 | 347 | async def main(): 348 | """Main function to demonstrate GraphRAG implementation""" 349 | 350 | 351 | args = parse_arguments() 352 | 353 | 354 | 355 | # Check for required API key 356 | gemini_api_key = os.getenv("gemini_api_key") 357 | if not gemini_api_key: 358 | print("❌ Error: GEMINI_API_KEY environment variable not set") 359 | print("Please set your Gemini API key in the .env file") 360 | return 361 | 362 | print("🚀 Starting Docbook ") 363 | print("=" * 60) 364 | 365 | # Step 1: Load the knowledge graph from deepcrawl.py output 366 | print("📂 Loading graphRG system.....") 367 | 368 | rag_system = None 369 | 370 | kg_path = os.path.join(args.output_dir, f"{args.name}_kg.h5") 371 | 372 | if os.path.exists(kg_path): 373 | print(f"Found existing knowledge graph at {kg_path}") 374 | graph = load_graph_hdf5(kg_path) 375 | # print_graph_nodes_sample(graph, 
limit=5) 376 |     else: 377 |         print(f"No existing graph found, running deepcrawl...") 378 |         graph = await run_deepcrawl(args.url, args.max_depth, args.max_pages) 379 |         # root = graph.nodes[args.url] 380 |         save_graph_hdf5(graph, kg_path) 381 |         print(f"Knowledge graph saved to {kg_path}") 382 |         # print_graph_nodes_sample(graph, limit=5) 383 | 384 |     rag_system = await create_graphrag( 385 |         graph, 386 |         gemini_api_key, 387 |         token_budget=args.token_budget 388 |     ) 389 |     print("🤖 GraphRAG Query Interface Ready!") 390 |     print(f"Ask questions about {args.name} documentation.") 391 |     print("Type 'quit' to exit.") 392 |     print("=" * 60) 393 | 394 | 395 |     while True: 396 |         print("\n" + "-" * 40) 397 |         user_query = input("🔍 Enter your question: ").strip() 398 | 399 |         if user_query.lower() in ['quit', 'exit', 'q']: 400 |             print("👋 Thanks for using GraphRAG! Goodbye!") 401 |             break 402 | 403 |         if not user_query: 404 |             continue 405 | 406 |         try: 407 |             print("\n🔎 Processing your query...") 408 |             answer = await rag_system.retrieve_and_generate(user_query) 409 |             print("\n📝 Answer:") 410 |             print("=" * 50) 411 |             print(answer) 412 |             print("=" * 50) 413 | 414 |         except Exception as e: 415 |             print(f"❌ Error processing query: {e}") 416 |             print("Please try a different question.") 417 | 418 | if __name__ == "__main__": 419 |     asyncio.run(main()) 420 | -------------------------------------------------------------------------------- /graphrag.py: -------------------------------------------------------------------------------- 1 | """ 2 | TODO : 3 | - introduce the cuGraph backend for NetworkX to accelerate graph operations 4 | - any other score to add in the _find_relevant_urls method? 5 | """ 6 | 7 | #!/usr/bin/env python3 8 | """ 9 | GraphRAG System for Documentation Crawling 10 | Optimized to work directly with the HDF5 (h5py) output produced by deepcrawl.py 11 | """ 12 | 13 | import asyncio 14 | import json 15 | import numpy as np 16 | from typing import Dict, List, Set, Tuple, Optional 17 | from dataclasses import dataclass 18 | # pip install sentence-transformers 19 | from sentence_transformers import SentenceTransformer 20 | import networkx as nx 21 | from sklearn.metrics.pairwise import cosine_similarity 22 | import os 23 | from dotenv import load_dotenv 24 | from collections import Counter, defaultdict 25 | from google import genai 26 | from google.genai import types 27 | from deepcrawl import GraphNode, Graph 28 | 29 | load_dotenv() 30 | 31 | @dataclass 32 | class Entity: 33 |     source_urls: List[str] 34 |     keywords: List[str] 35 |     content_snippet: str 36 |     depth: int 37 |     embedding: List[float] 38 | 39 | @dataclass 40 | class Relationship: 41 |     source: str 42 |     target: str 43 |     source_urls: List[str] 44 |     common_keywords: List[str] 45 |     semantic_similarity: float 46 | 47 | class GraphRAGSystem: 48 |     def __init__(self, graph: Graph, gemini_api_key: str, token_budget: Optional[int] = None): 49 |         self.entities: Dict[str, Entity] = {} 50 |         self.relationships: List[Relationship] = [] 51 |         self.knowledge_graph = nx.DiGraph() 52 |         self.keyword_index = defaultdict(list)  # Keyword -> URLs that contain it 53 |         self.graph = graph 54 |         self.token_budget: Optional[int] = token_budget 55 | 56 |         # Initialize the query embedding model. Note: deepcrawl.py stores node embeddings from 'all-MiniLM-L6-v2' (384-dim); using the heavier 'all-mpnet-base-v2' (768-dim) here means stored vectors get re-embedded on first use (see _find_relevant_urls). 57 |         self.embedding_model = SentenceTransformer('all-mpnet-base-v2') 58 | 59 |         # Initialize Gemini with the API key passed in, falling back to the gemini_api_key env var 60 |         client = genai.Client(api_key=gemini_api_key or os.environ['gemini_api_key']) 61 |         chat = client.chats.create(model='gemini-2.0-flash') 62 |         self.llm = chat 63 | 64 |     def _estimate_tokens_text(self, text: str) -> int: 65 |         """Rough token estimator: ~1 token per 4 characters 
as a safe upper bound.""" 66 | if not text: 67 | return 0 68 | # Use characters/4 to avoid underestimating for long words; clamp at >= 1 for non-empty 69 | return max(1, len(text) // 4) 70 | 71 | def _estimate_entity_tokens(self, entity: Entity) -> int: 72 | """Estimate tokens contributed by an entity block in the prompt.""" 73 | url = entity.source_urls[0] if entity.source_urls else "" 74 | header = self._extract_entity_name(url) 75 | meta = ", ".join((entity.keywords or [])[:5]) 76 | overhead = len(header) + len(url) + len(meta) + 32 # headings + labels 77 | return (overhead // 4) + self._estimate_tokens_text(entity.content_snippet or "") 78 | 79 | def _normalize_keyword(self, kw) -> str: 80 | """Convert keyword-like values (bytes, numpy scalars) to normalized lowercase str.""" 81 | try: 82 | import numpy as _np # local import to avoid top-level alias issues 83 | if isinstance(kw, (_np.bytes_, bytes)): 84 | kw = kw.decode('utf-8', errors='ignore') 85 | except Exception: 86 | # If numpy is not available or decode fails, fall back to str 87 | pass 88 | if isinstance(kw, bytes): 89 | kw = kw.decode('utf-8', errors='ignore') 90 | return str(kw).strip().lower() 91 | 92 | def load_from_kg_json(self): 93 | """ load knoweldge graph directly from graphNode""" 94 | 95 | 96 | print("🔍 Loading knowledge graph from graphNode...") 97 | 98 | 99 | # Normalize keywords and build keyword index 100 | for url, node in self.graph.nodes.items(): 101 | normalized_keywords = [] 102 | try: 103 | iterable_keywords = node.keywords or [] 104 | except Exception: 105 | iterable_keywords = [] 106 | for kw in iterable_keywords: 107 | if kw is None: 108 | continue 109 | norm_kw = self._normalize_keyword(kw) 110 | if not norm_kw: 111 | continue 112 | normalized_keywords.append(norm_kw) 113 | self.keyword_index[norm_kw].append(node.url) 114 | node.keywords = normalized_keywords 115 | 116 | # Create entities and relationships before building the NetworkX graph 117 | nodes_list = [self.graph.nodes[_] for _ in self.graph.nodes] 118 | self._create_entities_from_nodes(nodes_list) 119 | self._create_relationships_from_edges(self.graph.edges) 120 | 121 | # Now build the NetworkX graph with nodes and edges available 122 | self._build_networkx_graph() 123 | 124 | def _create_entities_from_nodes(self, all_nodes : List[GraphNode]): 125 | """Create entities from kg.json nodes""" 126 | print("🏗️ Creating entities from nodes...") 127 | 128 | for node in all_nodes: 129 | entity = Entity( 130 | source_urls=[node.url], 131 | keywords=node.keywords, 132 | content_snippet=node.content, 133 | embedding = node.embedding , 134 | depth=node.depth, 135 | ) 136 | 137 | self.entities[node.url] = entity 138 | 139 | def _create_relationships_from_edges(self, edges: List[Dict]): 140 | """Create relationships from kg.json edges""" 141 | print("🔗 Creating relationships from edges...") 142 | 143 | for edge in edges: 144 | relationship = Relationship( 145 | source=edge["source"], 146 | target=edge["target"], 147 | source_urls=[edge["source"], edge["target"]], 148 | common_keywords=edge.get("common_keywords", []), 149 | semantic_similarity=edge.get("semantic_similarity", 0.0) 150 | ) 151 | 152 | self.relationships.append(relationship) 153 | 154 | def _extract_entity_name(self, url: str) -> str: 155 | """Extract a readable entity name from URL""" 156 | if url.endswith('/'): 157 | url = url[:-1] 158 | 159 | parts = url.split('/') 160 | if len(parts) >= 2: 161 | name = parts[-1].replace('-', ' ').replace('_', ' ').title() 162 | if name: 163 | return name 164 
| 165 | return url.split('/')[-1] or "Home" 166 | 167 | def _build_networkx_graph(self): 168 | """Build NetworkX graph from entities and relationships""" 169 | print("🕸️ Building NetworkX graph...") 170 | 171 | # Add nodes 172 | for url, node in self.graph.nodes.items(): 173 | self.knowledge_graph.add_node( 174 | url, 175 | keywords=node.keywords, 176 | depth=node.depth, 177 | embedding=node.embedding 178 | ) 179 | 180 | # Add edges 181 | for rel in self.relationships: 182 | if rel.source in self.entities and rel.target in self.entities: 183 | self.knowledge_graph.add_edge( 184 | rel.source, 185 | rel.target, 186 | common_keywords=rel.common_keywords, 187 | semantic_similarity=rel.semantic_similarity 188 | ) 189 | 190 | async def retrieve_and_generate(self, query: str, top_k: int = 10) -> str: 191 | """Enhanced query processing using both keywords and embeddings""" 192 | 193 | if self.token_budget is not None: 194 | print(f"[Budget] Token budget for this query: {self.token_budget} tokens") 195 | 196 | # Step 1: Find relevant URLs using multiple methods 197 | relevant_urls = await self._find_relevant_urls(query, top_k) 198 | 199 | # Step 2: Expand context using graph relationships 200 | expanded_context = self._expand_context_with_graph(relevant_urls) 201 | 202 | # Step 4: Generate answer using LLM 203 | answer = await self._generate_enhanced_answer(query, expanded_context) 204 | 205 | return answer 206 | 207 | async def _find_relevant_urls(self, query: str, top_k: int) -> List[str]: 208 | """Find relevant top k URLs using keyword matching and semantic similarity""" 209 | # Method 1: Keyword-based retrieval 210 | query_words = set(query.lower().split()) 211 | keyword_scores = defaultdict(float) 212 | 213 | for word in query_words: 214 | if word in self.keyword_index: 215 | for url in self.keyword_index[word]: 216 | keyword_scores[url] += 1.0 217 | 218 | # Method 2: Semantic similarity (if embeddings exist) 219 | similarity_scores = {} 220 | query_embedding = self.embedding_model.encode(query) 221 | # Ensure 1D for consistency 222 | if isinstance(query_embedding, np.ndarray) and query_embedding.ndim > 1: 223 | query_embedding = query_embedding.flatten() 224 | 225 | for url, entity in self.entities.items(): 226 | 227 | # Convert entity embedding to numpy array if available 228 | entity_embedding = None 229 | if entity.embedding is not None: 230 | try: 231 | entity_embedding = np.array(entity.embedding, dtype=np.float32) 232 | except Exception: 233 | entity_embedding = None 234 | 235 | # Ensure both embeddings are 1D vectors and match dimensions; otherwise re-embed with current model 236 | if entity_embedding is not None: 237 | if entity_embedding.ndim > 1: 238 | entity_embedding = entity_embedding.flatten() 239 | 240 | # Re-embed if missing or dimension mismatch with current query embedding 241 | needs_reembed = ( 242 | entity_embedding is None 243 | or not isinstance(entity_embedding, np.ndarray) 244 | or entity_embedding.size == 0 245 | or entity_embedding.shape[0] != query_embedding.shape[0] 246 | ) 247 | 248 | if needs_reembed: 249 | # Fallback text to embed if content is unavailable 250 | text_to_embed = ( 251 | (entity.content_snippet or "").strip() 252 | or (" ".join(entity.keywords) if entity.keywords else "") 253 | or url 254 | ) 255 | entity_embedding = self.embedding_model.encode(text_to_embed) 256 | if isinstance(entity_embedding, np.ndarray) and entity_embedding.ndim > 1: 257 | entity_embedding = entity_embedding.flatten() 258 | # Cache back to entity to avoid repeated 
re-embedding 259 | try: 260 | self.entities[url].embedding = entity_embedding.tolist() 261 | except Exception: 262 | pass 263 | 264 | # Calculate cosine similarity between 1D vectors 265 | similarity = np.dot(query_embedding, entity_embedding) / ( 266 | np.linalg.norm(query_embedding) * np.linalg.norm(entity_embedding) 267 | ) 268 | similarity_scores[url] = similarity 269 | 270 | # Combine scores 271 | combined_scores = {} 272 | all_urls = set(keyword_scores.keys()) | set(similarity_scores.keys()) 273 | 274 | for url in all_urls: 275 | score = ( 276 | keyword_scores.get(url, 0) * 2.0 + # Keyword match is most important 277 | similarity_scores.get(url, 0) * 1.5 # Semantic similarity 278 | ) 279 | combined_scores[url] = score 280 | 281 | # Sort and return top k 282 | sorted_urls = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) 283 | top_urls = [url for url, _ in sorted_urls[:top_k]] 284 | 285 | # Fallback: if nothing scored (e.g., no embeddings and no keyword matches), return first k nodes 286 | if not top_urls: 287 | try: 288 | return list(self.graph.nodes.keys())[:top_k] 289 | except Exception: 290 | return [] 291 | 292 | 293 | return top_urls 294 | 295 | def _expand_context_with_graph(self, seed_urls: List[str]) -> Dict: 296 | """Expand context using graph relationships and high-value neighbors with optional token budget.""" 297 | expanded_urls: Set[str] = set() 298 | important_relationships: List[Dict] = [] 299 | 300 | # Helper to try-include a URL respecting budget 301 | used_tokens = 0 302 | limit = self.token_budget if self.token_budget is not None else None 303 | included_count_seeds = 0 304 | included_count_neighbors = 0 305 | 306 | def try_include(url: str) -> bool: 307 | nonlocal used_tokens 308 | if url not in self.entities: 309 | return False 310 | if url in expanded_urls: 311 | return False 312 | est = self._estimate_entity_tokens(self.entities[url]) 313 | if limit is not None and used_tokens + est > limit: 314 | return False 315 | expanded_urls.add(url) 316 | used_tokens += est 317 | return True 318 | 319 | # Include seeds first 320 | for url in seed_urls: 321 | if try_include(url): 322 | included_count_seeds += 1 323 | 324 | if limit is not None: 325 | print(f"[Budget] After seeds: included={included_count_seeds}, used={used_tokens}/{limit}") 326 | 327 | # Then include up to 5 neighbors per seed 328 | for url in seed_urls: 329 | if url in self.knowledge_graph: 330 | neighbors = list(self.knowledge_graph.neighbors(url))[:5] 331 | for neighbor in neighbors: 332 | if try_include(neighbor): 333 | included_count_neighbors += 1 334 | # Record relationship if present 335 | if self.knowledge_graph.has_edge(url, neighbor): 336 | edge_data = self.knowledge_graph[url][neighbor] 337 | elif self.knowledge_graph.has_edge(neighbor, url): 338 | edge_data = self.knowledge_graph[neighbor][url] 339 | else: 340 | edge_data = {} 341 | important_relationships.append({ 342 | "source": url, 343 | "target": neighbor, 344 | "common_keywords": edge_data.get("common_keywords", []), 345 | "semantic_similarity": edge_data.get("semantic_similarity", 0.0) 346 | }) 347 | if limit is not None and used_tokens >= limit: 348 | break 349 | 350 | if limit is not None: 351 | print(f"[Budget] After neighbors: added={included_count_neighbors}, used={used_tokens}/{limit}") 352 | 353 | # Build final entities list 354 | entities = [self.entities[url] for url in expanded_urls if url in self.entities] 355 | return {"relationships": important_relationships, "entities": entities} 356 | 357 | 358 | async 
def _generate_enhanced_answer(self, query: str, context: Dict) -> str: 359 | """Generate enhanced answer using LLM with structured context""" 360 | 361 | # Prepare structured context 362 | context_text = "\n**Documentation Context:**\n" 363 | for entity in context.get('entities', []): 364 | url = entity.source_urls[0] if entity.source_urls else "Unknown" 365 | name = self._extract_entity_name(url) 366 | context_text += f"**{name}**\n" 367 | context_text += f"URL: {url}\n" 368 | if entity.keywords: 369 | context_text += f"Keywords: {', '.join(entity.keywords[:5])}\n" 370 | content_snippet = entity.content_snippet 371 | context_text += f"Content: {content_snippet}\n\n" 372 | 373 | # Add relationship information 374 | relationships_text = "" 375 | if context.get('relationships'): 376 | relationships_text = "\n**Related Concepts:**\n" 377 | for rel in context['relationships']: 378 | source_name = self._extract_entity_name(rel['source']) 379 | target_name = self._extract_entity_name(rel['target']) 380 | relationships_text += f"- {source_name} → {target_name}" 381 | 382 | # Add content snippets from related entities 383 | source_entity = self.entities.get(rel['source']) 384 | target_entity = self.entities.get(rel['target']) 385 | 386 | if source_entity and target_entity: 387 | # Get brief content snippets (first 100 chars) 388 | source_snippet = source_entity.content_snippet 389 | target_snippet = target_entity.content_snippet 390 | relationships_text += f"\n Source: {source_snippet}\n Target: {target_snippet}" 391 | 392 | if rel.get('common_keywords'): 393 | relationships_text += f"\n Shared keywords: {', '.join(rel['common_keywords'][:3])}" 394 | relationships_text += "\n\n" 395 | 396 | # Construct prompt 397 | prompt = f""" 398 | Based on the following documentation context, answer the user's question comprehensively and accurately. 399 | 400 | Question: {query} 401 | 402 | {context_text} 403 | {relationships_text} 404 | 405 | Instructions: 406 | 1. Provide a clear, comprehensive answer to the question 407 | 2. Use specific information from the documentation 408 | 3. Include relevant code examples or commands when applicable 409 | 4. Mention specific URLs when referencing particular features 410 | 5. If the question asks about multiple topics, organize your answer with clear sections 411 | 6. Be practical and actionable in your recommendations 412 | 413 | Answer: 414 | """ 415 | 416 | if self.token_budget is not None: 417 | est_ctx_tokens = self._estimate_tokens_text(context_text + relationships_text) 418 | overhead = self._estimate_tokens_text(prompt.replace(context_text, '').replace(relationships_text, '')) 419 | print(f"[Budget] Prompt context tokens≈{est_ctx_tokens}, overhead≈{overhead}. 
Total≈{est_ctx_tokens + overhead} / limit={self.token_budget}") 420 | 421 | try: 422 | response = self.llm.send_message(prompt) 423 | return response.text 424 | except Exception as e: 425 | return f"Error generating answer: {e}" 426 | 427 | 428 | 429 | 430 | # Convenience function for easy usage 431 | async def create_graphrag( graph : Graph , gemini_api_key: str, token_budget: Optional[int] = None) -> GraphRAGSystem: 432 | """Create and initialize GraphRAG system from kg.json file""" 433 | 434 | # Create and initialize system 435 | rag_system = GraphRAGSystem(graph , gemini_api_key, token_budget=token_budget) 436 | rag_system.load_from_kg_json() 437 | 438 | return rag_system 439 | -------------------------------------------------------------------------------- /deepcrawl.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import asyncio 3 | import numpy as np 4 | import os 5 | import json 6 | import base64 7 | from pathlib import Path 8 | from typing import List, Optional, Dict 9 | from crawl4ai.proxy_strategy import ProxyConfig 10 | import sys 11 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult 12 | from crawl4ai import RoundRobinProxyStrategy 13 | from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy 14 | from crawl4ai import LLMConfig 15 | from crawl4ai import PruningContentFilter, BM25ContentFilter 16 | from crawl4ai import DefaultMarkdownGenerator 17 | from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain 18 | from crawl4ai import BrowserConfig 19 | import argparse 20 | from sentence_transformers import SentenceTransformer 21 | from dotenv import load_dotenv 22 | from summa import keywords as textrank_keywords 23 | import re 24 | from collections import Counter 25 | from pathlib import Path 26 | from typing import Set, Optional, List, Dict, Any , TypedDict 27 | import h5py 28 | from dataclasses import dataclass, field 29 | import hashlib 30 | 31 | 32 | @dataclass 33 | class GraphNode: 34 | """Simple graph node representation for the crawler""" 35 | url: str 36 | content: str 37 | depth: int 38 | keywords : List[str] 39 | embedding : List[float] 40 | children: List['GraphNode'] = field(default_factory=list) 41 | 42 | def add_child(self, child_node: 'GraphNode') -> None: 43 | """Add a child GraphNode to this node""" 44 | self.children.append(child_node) 45 | 46 | 47 | 48 | class MetadataDict(TypedDict): 49 | total_nodes: int 50 | max_depth: int 51 | root_url: str 52 | 53 | @dataclass 54 | class Graph : 55 | nodes: Dict[str, GraphNode] = field(default_factory=dict) 56 | edges: List['Edge'] = field(default_factory=list) 57 | metadata : MetadataDict = field(default_factory=dict) 58 | 59 | 60 | 61 | class Edge(TypedDict) : 62 | source : str 63 | target : str 64 | common_keywords : List[str] 65 | semantic_similarity : float 66 | 67 | 68 | top_k = 25 69 | 70 | 71 | def extract_keywords_textrank(content: str, top_k: int ) -> List[str]: 72 | """Extract keywords using TextRank algorithm from summa library""" 73 | # if not TEXTRANK_AVAILABLE or not content.strip(): 74 | # return extract_keywords_fallback(content, top_k) 75 | 76 | try: 77 | # Clean content and extract keywords 78 | cleaned_content = re.sub(r'[^\w\s]', ' ', content) 79 | cleaned_content = re.sub(r'\s+', ' ', cleaned_content).strip() 80 | 81 | if len(cleaned_content) < 50: # Too short for TextRank 82 | return extract_keywords_fallback(content, top_k) 83 | 84 | keywords_text = 
textrank_keywords.keywords(cleaned_content, words=top_k, split=True) 85 | return [kw.strip() for kw in keywords_text if kw.strip()] 86 | 87 | except Exception as e: 88 | print(f"⚠️ TextRank failed: {e}. Using fallback method.") 89 | return extract_keywords_fallback(content, top_k) 90 | 91 | def extract_keywords_fallback(content: str, top_k: int = 10) -> List[str]: 92 | """Fallback keyword extraction using simple frequency analysis""" 93 | if not content.strip(): 94 | return [] 95 | 96 | # Remove common stop words 97 | stop_words = { 98 | 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 99 | 'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 100 | 'above', 'below', 'between', 'among', 'is', 'are', 'was', 'were', 'be', 'been', 101 | 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 102 | 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 103 | 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 104 | 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'ours', 'theirs' 105 | } 106 | 107 | # Extract words (alphabetic, length >= 3) 108 | words = re.findall(r'\b[a-zA-Z]{3,}\b', content.lower()) 109 | 110 | # Filter out stop words and get frequency 111 | filtered_words = [word for word in words if word not in stop_words] 112 | word_freq = Counter(filtered_words) 113 | 114 | # Return top k most frequent words 115 | return [word for word, _ in word_freq.most_common(top_k)] 116 | 117 | def get_common_keywords(keywords1: List[str], keywords2: List[str]) -> List[str]: 118 | """Find common keywords between two lists""" 119 | set1 = set(keyword.lower() for keyword in keywords1) 120 | set2 = set(keyword.lower() for keyword in keywords2) 121 | common = set1.intersection(set2) 122 | return list(common) 123 | 124 | 125 | def create_embeddings(content: str) -> List[float]: 126 | """Create embeddings for a given content""" 127 | return SentenceTransformer('all-MiniLM-L6-v2').encode(content) 128 | 129 | def find_parent_node(graph: GraphNode, target_url: str) -> Optional[GraphNode]: 130 | """ 131 | Search for a parent node that contains the target URL as a child. 132 | Uses depth-first search to traverse the graph. 133 | 134 | Args: 135 | graph: The root GraphNode of the graph 136 | target_url: The URL to search for among children 137 | 138 | Returns: 139 | GraphNode if found, None otherwise 140 | """ 141 | def dfs_search(node: GraphNode) -> Optional[GraphNode]: 142 | # Check if current node has the target URL as a child 143 | for child in node.children: 144 | if child.url == target_url: 145 | return node 146 | 147 | # Recursively search in children 148 | for child in node.children: 149 | result = dfs_search(child) 150 | if result: 151 | return result 152 | 153 | return None 154 | 155 | return dfs_search(graph) 156 | 157 | def find_node_by_url(graph: GraphNode, target_url: str) -> Optional[GraphNode]: 158 | """ 159 | Find a specific node by its URL in the graph. 160 | Uses depth-first search to traverse the graph. 
161 | 162 | Args: 163 | graph: The root GraphNode of the graph 164 | target_url: The URL to search for 165 | 166 | Returns: 167 | GraphNode if found, None otherwise 168 | """ 169 | def dfs_search(node: GraphNode) -> Optional[GraphNode]: 170 | if node.url == target_url: 171 | return node 172 | 173 | for child in node.children: 174 | result = dfs_search(child) 175 | if result: 176 | return result 177 | 178 | return None 179 | 180 | return dfs_search(graph) 181 | 182 | 183 | 184 | async def deep_crawl(doc_url: str, max_depth: Optional[int], max_pages: Optional[int]): 185 | """deep crawl with bfs""" 186 | 187 | print("\n ===== deep crawling == ") 188 | 189 | deep_crawl_strategy = BFSDeepCrawlStrategy( max_depth = float('inf') if max_depth is None else max_depth , include_external = False) 190 | 191 | if max_pages is not None : 192 | deep_crawl_strategy.max_pages = max_pages 193 | 194 | 195 | model = SentenceTransformer('all-MiniLM-L6-v2') 196 | 197 | 198 | 199 | 200 | async with AsyncWebCrawler() as crawler: 201 | results : List[CrawlResult] = await crawler.arun( 202 | url = doc_url, 203 | config = CrawlerRunConfig(deep_crawl_strategy = deep_crawl_strategy) , 204 | ) 205 | 206 | root : GraphNode 207 | graph : Graph = Graph() 208 | 209 | print(f"deep crawl returned : {len(results)} pages ") 210 | for i , result in enumerate(results): 211 | depth = result.metadata.get("depth") 212 | parent_url = result.metadata.get("parent_url") 213 | #score = result.metadata.get("score", 0.0) # Get URL relevance score, default to 0.0 214 | 215 | #skipping for 404 : 216 | if(result.markdown is None ) : continue 217 | condition = result.markdown.find('404') != -1 218 | if(condition): continue 219 | 220 | # Debug: Print score information 221 | # print(f"URL: {result.url[:50]}... | Depth: {depth} | Score: {score:.3f}") 222 | keywords = extract_keywords_textrank(result.markdown, top_k) 223 | #embedding = create_embeddings(result.markdown) 224 | 225 | if result.url == doc_url : 226 | # keywords = extract_keywords_textrank(result.markdown, top_k) 227 | root = GraphNode(url=doc_url, content=result.markdown, depth=0, keywords = keywords , embedding =model.encode(result.markdown) , children = [] ) 228 | graph.nodes[doc_url] = root 229 | continue 230 | 231 | try : 232 | 233 | parent_node = graph.nodes[parent_url] 234 | child_node = GraphNode(url=result.url, content=result.markdown, depth=depth, keywords = keywords , embedding = model.encode(result.markdown) , children = [] ) 235 | parent_node.add_child(child_node) 236 | graph.nodes[result.url] = child_node 237 | 238 | common_keywords = get_common_keywords(parent_node.keywords , keywords) 239 | semantic_similarity = len(common_keywords) / max(len(parent_node.keywords), len(keywords)) 240 | 241 | 242 | graph.edges.append({ 243 | 'source' : parent_node.url , 244 | 'target' : result.url , 245 | 'common_keywords' : common_keywords , 246 | 'semantic_similarity' : semantic_similarity 247 | }) 248 | except Exception as e: 249 | print(f"Error adding child: {e}") 250 | continue 251 | 252 | graph.metadata = { 253 | 'total_nodes' : len(graph.nodes), 254 | 'max_depth' : max(node.depth for node in graph.nodes.values()) , 255 | 'root_url' : doc_url 256 | } 257 | 258 | return graph 259 | 260 | def print_graph_structure(root: GraphNode): 261 | """Pretty-print the graph rooted at ``root`` now that we use ``GraphNode`` objects. 
262 | 263 | The routine traverses the graph (DFS), shows each node with its depth, score, 264 | content length, and the number of children, then outputs summary statistics. 265 | """ 266 | 267 | if root is None: 268 | print("❌ Graph is empty!") 269 | return 270 | 271 | # ── Collect all nodes ────────────────────────────────────────────────────── 272 | all_nodes: List[GraphNode] = [] 273 | stack: List[GraphNode] = [root] 274 | while stack: 275 | node = stack.pop() 276 | all_nodes.append(node) 277 | stack.extend(node.children) 278 | 279 | # Sort by depth for nicer visual ordering 280 | all_nodes.sort(key=lambda n: n.depth) 281 | 282 | print("\n" + "=" * 80) 283 | print("📊 KNOWLEDGE GRAPH STRUCTURE") 284 | print("=" * 80) 285 | 286 | print(f"📈 Total Nodes: {len(all_nodes)}") 287 | max_depth_val = max(n.depth for n in all_nodes) 288 | print(f"🌳 Max Depth: {max_depth_val}") 289 | print(f"🎯 Root URL: {root.url}") 290 | print("\n" + "-" * 80) 291 | 292 | # ── Per-node details ───────────────────────────────────────────────────── 293 | for node in all_nodes: 294 | indent = " " * node.depth 295 | children_count = len(node.children) 296 | content_length = len(node.content or "") 297 | 298 | print(f"{indent}📍 Node: {node.url[:60]}{'...' if len(node.url) > 60 else ''}") 299 | print(f"{indent} ├─ Depth: {node.depth}") 300 | # print(f"{indent} ├─ Score: {node.score:.3f}") 301 | print(f"{indent} ├─ Content Length: {content_length:,} chars") 302 | print(f"{indent} └─ Children: {children_count}") 303 | 304 | # Display children URLs 305 | for idx, child in enumerate(node.children): 306 | child_prefix = " └─" if idx == children_count - 1 else " ├─" 307 | print(f"{indent}{child_prefix} ➤ {child.url[:50]}{'...' if len(child.url) > 50 else ''}") 308 | if children_count: 309 | print() 310 | 311 | # ── Summary statistics ──────────────────────────────────────────────────── 312 | depths = [n.depth for n in all_nodes] 313 | # scores = [n.score for n in all_nodes] 314 | children_counts = [len(n.children) for n in all_nodes] 315 | 316 | print("=" * 80) 317 | print("📊 GRAPH SUMMARY") 318 | print("=" * 80) 319 | 320 | # Depth distribution 321 | print("📊 Depth Distribution:") 322 | depth_counts: Dict[int, int] = {} 323 | for d in depths: 324 | depth_counts[d] = depth_counts.get(d, 0) + 1 325 | for d in sorted(depth_counts): 326 | print(f" Depth {d}: {depth_counts[d]} nodes") 327 | 328 | # Connectivity stats 329 | print(f"\n📊 Connectivity:") 330 | if children_counts: 331 | avg_children = sum(children_counts) / len(children_counts) 332 | print(f" Average Children per Node: {avg_children:.1f}") 333 | print(f" Nodes with Children: {sum(1 for c in children_counts if c > 0)}") 334 | print(f" Leaf Nodes: {sum(1 for c in children_counts if c == 0)}") 335 | 336 | print("=" * 80) 337 | 338 | 339 | import math 340 | 341 | def clean_embedding(embedding): 342 | if embedding is None: 343 | print(f" Returning None here") 344 | return None 345 | if isinstance(embedding, np.ndarray): 346 | embedding = embedding.tolist() 347 | 348 | # Replace None values with 0.0 instead of keeping them as None 349 | # JSON with allow_nan=False cannot serialize None values in numeric arrays 350 | return [x if isinstance(x, (int, float)) and not (math.isnan(x) or math.isinf(x)) else 0.0 for x in embedding] 351 | 352 | import numpy as np 353 | 354 | def save_graph_hdf5(graph : Graph, filepath: str): 355 | """ 356 | Save the graph structure to HDF5 format for efficient storage and retrieval. 
357 | 
358 |     HDF5 (Hierarchical Data Format 5) is a binary format that provides:
359 |     - Efficient storage of large datasets
360 |     - Hierarchical organization (groups and datasets)
361 |     - Metadata storage via attributes
362 |     - Cross-platform compatibility
363 | 
364 |     Actual layout written by this function (slashes in URLs are NOT used as group names):
365 |     /metadata (group with attributes: total_nodes, max_depth, root_url)
366 |     /nodes (group)
367 |         /n_<id> (group per node; <id> is a stable hash of the URL)
368 |             attrs:
369 |                 url (original URL, string)
370 |                 depth (integer)
371 |             datasets:
372 |                 content (UTF-8 string; may be large)
373 |                 embedding (1D float array, optional)
374 |                 keywords (1D variable-length UTF-8 string array)
375 |     /nodes_index (structured dataset: columns id, url, depth)
376 |     /edges (structured dataset: columns source_id, target_id, semantic_similarity, common_keywords)
377 |     """
378 |     os.makedirs(os.path.dirname(filepath), exist_ok=True)
379 | 
380 |     # Collect all nodes
381 |     all_nodes = list(graph.nodes.values())
382 | 
383 | 
384 |     print(f"💾 Saving {len(all_nodes)} nodes to HDF5...")
385 | 
386 |     # Prepare metadata
387 |     metadata = {
388 |         "total_nodes": graph.metadata['total_nodes'],
389 |         "max_depth": graph.metadata['max_depth'],
390 |         "root_url": graph.metadata['root_url'],
391 |     }
392 | 
393 |     dt_str = h5py.string_dtype('utf-8')  # Variable-length UTF-8 strings
394 | 
395 |     try:
396 |         with h5py.File(filepath, "w") as f:
397 |             # Store metadata as attributes
398 |             meta_grp = f.create_group("metadata")
399 |             for k, v in metadata.items():
400 |                 meta_grp.attrs[k] = v
401 | 
402 |             # Store nodes
403 |             nodes_grp = f.create_group("nodes")
404 | 
405 |             # Precompute safe, stable IDs for each node (avoid '/' in group names)
406 |             url_to_id: Dict[str, str] = {}
407 |             for node in all_nodes:
408 |                 node_id = hashlib.md5(node.url.encode('utf-8')).hexdigest()[:16]
409 |                 url_to_id[node.url] = node_id
410 | 
411 |             for node in all_nodes:
412 |                 try:
413 |                     node_id = url_to_id[node.url]
414 |                     node_grp = nodes_grp.create_group(f"n_{node_id}")
415 |                 except ValueError:
416 |                     print(f"⚠️ Node group already exists for {node.url}, skipping duplicate")
417 |                     continue
418 | 
419 | 
420 |                 node_grp.attrs["depth"] = node.depth
421 |                 node_grp.attrs["url"] = node.url
422 | 
423 | 
424 |                 embedding = clean_embedding(node.embedding)
425 |                 content = node.content
426 |                 if embedding is not None:
427 |                     node_grp.create_dataset("embedding", data=np.array(embedding))
428 |                 keywords = node.keywords
429 |                 if keywords:
430 |                     node_grp.create_dataset("keywords", data=node.keywords, dtype=dt_str)
431 |                 if content:
432 |                     node_grp.create_dataset("content", data=node.content, dtype=dt_str)
433 | 
434 |             # Create a compact index for nodes (id -> url, depth)
435 |             node_index_dtype = np.dtype([
436 |                 ('id', dt_str),
437 |                 ('url', dt_str),
438 |                 ('depth', np.int32),
439 |             ])
440 |             node_index_rows = [
441 |                 (url_to_id[node.url], node.url, int(node.depth))
442 |                 for node in all_nodes
443 |             ]
444 |             if len(node_index_rows):
445 |                 f.create_dataset(
446 |                     "nodes_index",
447 |                     data=np.array(node_index_rows, dtype=node_index_dtype)
448 |                 )
449 | 
450 |             # Structured dtype for the edge table
451 | 
452 | 
453 |             edge_dtype = np.dtype([
454 |                 ('source_id', dt_str),
455 |                 ('target_id', dt_str),
456 |                 ('semantic_similarity', np.float64),
457 |                 ('common_keywords', dt_str),
458 |             ])
459 | 
460 | 
461 |             edge_row = []
462 |             for e in graph.edges:
463 |                 try:
464 |                     sid = url_to_id[e['source']]
465 |                     tid = url_to_id[e['target']]
466 |                 except KeyError:
467 |                     # In case an edge references a node that wasn't saved (shouldn't happen)
468 |                     continue
469 |                 edge_row.append((
470 |                     sid,
471 |                     tid,
472 |                     float(e['semantic_similarity']),
473 |                     ",".join(e['common_keywords'])
474 |                 ))
475 | 
476 |             edge_arr = np.array(edge_row, dtype=edge_dtype)
477 | 
478 |             if len(edge_arr):
479 |                 print(f"🔗 Saving {len(edge_arr)} edges")
480 |                 # Create structured array (like a database table)
481 |                 f.create_dataset(
482 |                     "edges",
483 |                     data=edge_arr
484 |                 )
485 | 
486 |     except Exception as e:
487 |         print(f"❌ Failed to save knowledge graph: {e}")
488 |         # Remove the partially created file if an error occurred
489 |         if os.path.exists(filepath):
490 |             os.remove(filepath)
491 |         sys.exit(1)
492 | 
493 |     print(f"💾 Knowledge graph saved to {filepath} (HDF5)")
494 | 
495 | async def main(url: str, max_depth: Optional[int], max_pages: Optional[int]) -> Graph:
496 |     print("======= running deep crawl ===============")
497 |     graph = await deep_crawl(url, max_depth, max_pages)
498 |     return graph
499 | 
500 | if __name__ == "__main__":
501 |     parser = argparse.ArgumentParser(description="Deep crawl a documentation site and save its knowledge graph")
502 |     parser.add_argument("--max_depth", type=int, help="maximum crawl depth")
503 |     parser.add_argument("--max_pages", type=int, help="maximum number of pages to crawl")
504 |     parser.add_argument("--url", type=str, required=True, help="documentation root URL")
505 |     parser.add_argument("--output_dir", type=str, required=True, help="output directory")
506 |     parser.add_argument("--name", type=str, required=True, help="base name for the output file")
507 |     args = parser.parse_args()
508 | 
509 |     graph = asyncio.run(main(args.url, args.max_depth, args.max_pages))
510 | 
511 |     # Persist the crawl; the "<name>_kg.h5" naming is one convention, adjust as needed
512 |     output_path = os.path.join(args.output_dir, f"{args.name}_kg.h5")
513 |     save_graph_hdf5(graph, output_path)
514 | 
515 | 
516 | 
517 | 
518 | 
--------------------------------------------------------------------------------
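
For consumers of the `.h5` files written by `save_graph_hdf5`, the documented layout (metadata attributes, one `n_<id>` group per node, a `nodes_index` table, and an `edges` table) can be read back with plain `h5py`. The loader below is only a sketch: `load_graph_hdf5` and the returned dict shape are not part of the codebase.

```python
import h5py


def load_graph_hdf5(filepath: str) -> dict:
    """Hypothetical loader mirroring the layout written by save_graph_hdf5."""
    nodes, edges = {}, []
    with h5py.File(filepath, "r") as f:
        metadata = dict(f["metadata"].attrs)

        # One group per node: attrs hold url/depth, datasets hold the heavier fields
        for grp in f["nodes"].values():
            url = grp.attrs["url"]
            nodes[url] = {
                "depth": int(grp.attrs["depth"]),
                "content": grp["content"].asstr()[()] if "content" in grp else "",
                "keywords": list(grp["keywords"].asstr()[()]) if "keywords" in grp else [],
                "embedding": grp["embedding"][()].tolist() if "embedding" in grp else None,
            }

        # Map hashed node ids back to URLs, then decode the edge table
        id_to_url = {}
        if "nodes_index" in f:
            for row in f["nodes_index"][()]:
                id_to_url[row["id"].decode("utf-8")] = row["url"].decode("utf-8")
        if "edges" in f:
            for row in f["edges"][()]:
                kw = row["common_keywords"].decode("utf-8")
                edges.append({
                    "source": id_to_url.get(row["source_id"].decode("utf-8")),
                    "target": id_to_url.get(row["target_id"].decode("utf-8")),
                    "semantic_similarity": float(row["semantic_similarity"]),
                    "common_keywords": kw.split(",") if kw else [],
                })

    return {"metadata": metadata, "nodes": nodes, "edges": edges}
```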
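
The `semantic_similarity` stored on each edge by `deep_crawl` is a keyword-overlap ratio (shared TextRank keywords divided by the size of the larger keyword list), not an embedding distance. A minimal illustration of that scoring, using the helper functions from `deepcrawl.py` on two made-up snippets; the `or 1` guard here is an addition that avoids division by zero when both keyword lists come back empty.

```python
from deepcrawl import extract_keywords_textrank, get_common_keywords

# Toy parent/child page texts, purely for illustration
parent_text = "Authentication tokens are configured in the settings page of the dashboard."
child_text = "To rotate an authentication token, open the dashboard settings and generate a new token."

parent_kw = extract_keywords_textrank(parent_text, top_k=10)
child_kw = extract_keywords_textrank(child_text, top_k=10)

# Mirrors the edge construction inside deep_crawl()
common = get_common_keywords(parent_kw, child_kw)
denominator = max(len(parent_kw), len(child_kw)) or 1
similarity = len(common) / denominator

print("shared keywords:", common)
print("edge similarity:", round(similarity, 2))
```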
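
Because `create_graphrag` accepts the in-memory `Graph` that `deep_crawl` returns, the crawler and the RAG layer can be wired together in one script without a round trip through disk. A rough sketch follows; the `GEMINI_API_KEY` variable name, the depth/page limits, the URL, and the output path are all placeholders.

```python
import asyncio
import os

from dotenv import load_dotenv

from deepcrawl import deep_crawl, save_graph_hdf5
from graphrag import create_graphrag


async def build_and_index(url: str):
    # Crawl the documentation tree (limits chosen arbitrarily for this sketch)
    graph = await deep_crawl(url, max_depth=2, max_pages=50)

    # Optionally persist the crawl before indexing
    save_graph_hdf5(graph, "output/example-docs_kg.h5")

    # Hand the in-memory graph to the GraphRAG layer
    load_dotenv()
    rag = await create_graphrag(graph, os.environ["GEMINI_API_KEY"])  # env var name assumed
    return rag


if __name__ == "__main__":
    asyncio.run(build_and_index("https://docs.example.com"))
```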