├── .gitignore ├── requirements.txt ├── script.py ├── improvements.md ├── ideas.md ├── README.md ├── main.py ├── graphrag.py └── deepcrawl.py /.gitignore: -------------------------------------------------------------------------------- 1 | rag-app/node_modules/ 2 | rag-app/build/ 3 | rag-app/.env 4 | rag-app/.env.local 5 | rag-app/.next 6 | __pycache__ 7 | .env 8 | kg.json 9 | enhanced_kg.json 10 | grpc 11 | .cursor/rules 12 | output/ 13 | grpc/kg.json 14 | grpc/enhanced_kg.json 15 | ragtest/input 16 | ragtest 17 | grpc/node_modules 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Crawl4AI GraphRAG project dependencies 2 | 3 | # Crawling / Graph / NLP 4 | crawl4ai[all]>=0.6.0 5 | sentence-transformers>=3.1.1  # Updated for Python 3.13 support 6 | networkx>=3.4.1  # Updated 7 | scikit-learn>=1.5.2  # Updated for Python 3.13 8 | numpy>=1.26.4  # Updated (1.24.x doesn't support 3.13) 9 | summa>=1.2.0 10 | spacy>=3.7.0 11 | pandas>=2.2.3  # Updated 12 | 13 | # Google Gemini integration 14 | google-genai 15 | # Environment & utilities 16 | python-dotenv>=1.0.0 17 | 18 | # gRPC server & Protocol Buffers 19 | grpcio>=1.73.1 20 | protobuf 21 | torch>=2.0.0  # Required for sentence-transformers 22 | h5py 23 | arxiv>=2.0.0  # Used by script.py (arXiv search helper) 24 | # Note: Removed asyncio-run>=0.1.1 (invalid/unnecessary; use the built-in asyncio.run()) and hashlib (standard-library module, not a pip package) -------------------------------------------------------------------------------- /script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import arxiv 4 | 5 | def main(): 6 |     parser = argparse.ArgumentParser(description="Search arXiv from the command line") 7 |     parser.add_argument("query", type=str, help="Search query") 8 |     parser.add_argument("--max_results", type=int, default=5, help="Maximum number of results to fetch") 9 |     parser.add_argument("--sort", type=str, default="submittedDate", choices=["relevance", "lastUpdatedDate", "submittedDate"], help="Sort criterion") 10 |     parser.add_argument("--pdf", action="store_true", help="Print PDF links instead of abstract URLs") 11 | 12 |     args = parser.parse_args() 13 | 14 |     # Map string sort to enum 15 |     sort_map = { 16 |         "relevance": arxiv.SortCriterion.Relevance, 17 |         "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate, 18 |         "submittedDate": arxiv.SortCriterion.SubmittedDate, 19 |     } 20 | 21 |     search = arxiv.Search( 22 |         query=args.query, 23 |         max_results=args.max_results, 24 |         sort_by=sort_map[args.sort], 25 |     ) 26 | 27 |     for result in search.results(): 28 |         print("Title:", result.title) 29 |         print("Authors:", ", ".join(a.name for a in result.authors)) 30 |         print("Published:", result.published.date()) 31 |         print("Link:", result.pdf_url if args.pdf else result.entry_id) 32 |         print("Summary:", result.summary[:250].replace("\n", " "), "...\n") 33 | 34 | if __name__ == "__main__": 35 |     main() 36 | 37 | -------------------------------------------------------------------------------- /improvements.md: -------------------------------------------------------------------------------- 1 | # Docbook Improvement Plan 2 | 3 | This document outlines a comprehensive plan to improve the `docbook` tool, focusing on software engineering best practices, CLI experience, agent integration, and advanced GraphRAG capabilities. 4 | 5 | ## 1. 
Software Design & Architecture 6 | 7 | ### **Current Issues** 8 | - **Monolithic Scripts**: `deepcrawl.py` and `graphrag.py` mix logic, data definitions, and execution. 9 | - **Hardcoded Dependencies**: Service instantiation happens inside classes. 10 | - **Print Debugging**: Extensive use of `print()` instead of structured logging. 11 | - **Global State**: Reliance on global variables and top-level execution code. 12 | 13 | ### **Proposed Refactoring** 14 | 1. **Package Structure**: 15 | ```text 16 | docbook/ 17 | ├── src/ 18 | │ └── docbook/ 19 | │ ├── __init__.py 20 | │ ├── cli.py # Entry point (Typer) 21 | │ ├── config.py # Pydantic settings 22 | │ ├── core/ # Core logic 23 | │ │ ├── graph.py # Graph data structures 24 | │ │ └── storage.py # HDF5/Database IO 25 | │ ├── crawler/ # Crawling logic 26 | │ │ └── deepcrawl.py 27 | │ ├── rag/ # RAG & Retrieval logic 28 | │ │ ├── graphrag.py 29 | │ │ └── embeddings.py 30 | │ └── agent/ # Agentic interfaces 31 | │ └── tools.py 32 | ├── pyproject.toml # Dependency management 33 | └── README.md 34 | ``` 35 | 36 | 2. **Dependency Injection**: 37 | - Pass `LLMClient`, `EmbeddingModel`, and `Storage` instances into classes rather than creating them inside. 38 | 39 | 3. **Configuration Management**: 40 | - Use `pydantic-settings` to manage `.env` and CLI args. 41 | 42 | 4. **Logging**: 43 | - Replace `print` with standard `logging` or `structlog` for better observability. 44 | 45 | ## 2. CLI Experience (The "Good CLI" Goal) 46 | 47 | Switch from `argparse` to **Typer** or **Click** for a modern, composable CLI with rich help and auto-completion. 48 | 49 | ### **Proposed Commands** 50 | ```bash 51 | # Crawl a documentation site 52 | docbook crawl https://docs.example.com --name "example-docs" --depth 2 --output ./data 53 | 54 | # Query the knowledge graph via CLI 55 | docbook ask "How do I configure auth?" --kg ./data/example-docs_kg.h5 56 | 57 | # Start an API server for agents 58 | docbook serve --kg ./data/example-docs_kg.h5 --port 8000 59 | 60 | # Inspect graph stats 61 | docbook inspect ./data/example-docs_kg.h5 62 | ``` 63 | 64 | ## 3. Minimal Coding Agent 65 | 66 | To enable a "minimal coding agent that can fetch docs content," we need to expose the Knowledge Graph (KG) as a **Tool** that an LLM can call. 67 | 68 | ### **Design** 69 | 1. **Tool Interface**: Define a standard Python interface `DocsTool`: 70 | - `search(query: str) -> List[Node]` 71 | - `read_node(url: str) -> str` 72 | - `get_related(url: str) -> List[Node]` 73 | 74 | 2. **Agent Loop**: 75 | - Use a lightweight ReAct (Reasoning + Acting) loop. 76 | - **Prompt**: "You are a coding assistant. Use `search_docs` to find API references. Read the content before writing code." 77 | - **Implementation**: 78 | ```python 79 | class DocAgent: 80 | def __init__(self, kg_path): 81 | self.rag = load_graphrag(kg_path) 82 | 83 | def solve(self, task): 84 | # 1. Plan 85 | # 2. Call rag.retrieve_and_generate() or specific lookup tools 86 | # 3. Generate Code 87 | ``` 88 | 89 | ## 4. Advanced GraphRAG Architecture (Research-Backed) 90 | 91 | Current implementation uses a basic "Keyword + Semantic" retrieval on a tree structure. We can improve this with recent research findings. 92 | 93 | ### **A. 
Hybrid Construction (NLP + LLM)** 94 | Instead of relying solely on LLMs for extraction (slow/expensive) or simple TextRank (low context), use a hybrid approach: 95 | - **Fast Entity Extraction**: Use **GliNER** (Generalist Model for Named Entity Recognition) or **Spacy** to extract entities (Functions, Classes, Constants) from code blocks and text. 96 | - **LLM refinement**: Only use LLM to summarize complex relationships between high-level entities. 97 | 98 | ### **B. Community Summarization (Microsoft GraphRAG style)** 99 | - **Cluster Nodes**: Use **Leiden** or **Louvain** algorithms to detect communities of related nodes (e.g., "Authentication Module", "Database Drivers"). 100 | - **Hierarchical Summaries**: Generate summaries for these clusters. 101 | - **Retrieval**: Match query against *cluster summaries* first, then drill down to specific nodes. This answers "global" questions (e.g., "How is error handling structured?") better than simple similarity search. 102 | 103 | ### **C. Agentic Graph Traversal (Graph-of-Thoughts)** 104 | Instead of a single retrieval step: 105 | 1. **Start**: Search entry nodes (high similarity). 106 | 2. **Navigate**: The Agent sees the node's content and its *outgoing edges* (links). 107 | 3. **Decide**: The Agent decides whether to: 108 | - Stop and answer. 109 | - Follow a link ("This mentions `AuthConfig`, let me check that node"). 110 | - Backtrack. 111 | This mimics how a human reads documentation (following hyperlinks). 112 | 113 | ### **D. Improved Embedding Strategy** 114 | - **Code-Aware Embeddings**: Use models trained on code (e.g., `jina-embeddings-v2-base-code` or `unixcoder`) for code snippets, rather than generic text embeddings (`all-MiniLM`). 115 | - **Late Interaction (ColBERT)**: If performance allows, use ColBERT-style token-level interaction for higher precision fetching. 116 | 117 | ## 5. Implementation Roadmap 118 | 119 | 1. **Refactor**: Move current code into `src/` structure and switch to `Typer`. 120 | 2. **Upgrade Graph**: Modify `deepcrawl` to use **GliNER** for better entity tagging during crawl. 121 | 3. **Agent API**: Create the `DocsTool` class and a simple `docbook serve` endpoint. 122 | 4. **Advanced RAG**: Implement "Community Summarization" as a post-processing step after crawling. 123 | 124 | -------------------------------------------------------------------------------- /ideas.md: -------------------------------------------------------------------------------- 1 | ## Practical GraphRAG / KG-RAG Ideas (from arXiv) 2 | 3 | Each entry includes the paper link, brief info, how it works, and concrete integration ideas for this repo (see `deepcrawl.py`, `graphrag.py`, `main.py`). 4 | 5 | ### 1) LEGO-GraphRAG: Modularizing Graph-based Retrieval-Augmented Generation for Design Space Exploration 6 | [arXiv:2411.05844](http://arxiv.org/abs/2411.05844v3) 7 | - **info**: Proposes a modular GraphRAG framework enabling plug-and-play components and ablations across stages (indexing, retrieval, reasoning). 8 | - **how it works**: Decouples graph construction, retrieval, expansion, and generation with configuration-driven choices per module. 9 | - **how to integrate**: 10 | - Refactor `graphrag.py` `GraphRAGSystem` into clear modules/interfaces: `Retriever`, `GraphExpander`, `Reranker`, `AnswerGenerator` with a config in `main.py`. 11 | - Add a registry pattern to toggle between keyword vs embedding retrieval in `_find_relevant_urls()` and neighbor strategies in `_expand_context_with_graph()`. 
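Below is a minimal sketch of what such a registry could look like, assuming the component names proposed above (`Retriever`, `GraphExpander`, ...) and reusing the keyword index and per-node embeddings that `GraphRAGSystem` already builds. None of these classes exist in the repo yet; the names, signatures, and weights are illustrative.

```python
# Hypothetical module, e.g. docbook/rag/registry.py
from typing import Dict, List, Protocol, Type

import numpy as np


class Retriever(Protocol):
    def retrieve(self, query: str, top_k: int) -> List[str]:
        """Return candidate node URLs for a query."""


RETRIEVER_REGISTRY: Dict[str, Type] = {}


def register_retriever(name: str):
    """Class decorator that registers a Retriever implementation under a config name."""
    def wrap(cls):
        RETRIEVER_REGISTRY[name] = cls
        return cls
    return wrap


@register_retriever("keyword")
class KeywordRetriever:
    """Wraps the existing keyword index (keyword -> list of URLs)."""
    def __init__(self, keyword_index: Dict[str, List[str]]):
        self.keyword_index = keyword_index

    def retrieve(self, query: str, top_k: int) -> List[str]:
        scores: Dict[str, float] = {}
        for word in query.lower().split():
            for url in self.keyword_index.get(word, []):
                scores[url] = scores.get(url, 0.0) + 1.0
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return [url for url, _ in ranked[:top_k]]


@register_retriever("embedding")
class EmbeddingRetriever:
    """Cosine similarity over precomputed node embeddings (url -> vector)."""
    def __init__(self, embedder, node_embeddings: Dict[str, List[float]]):
        self.embedder = embedder            # e.g. a SentenceTransformer instance
        self.node_embeddings = node_embeddings

    def retrieve(self, query: str, top_k: int) -> List[str]:
        q = np.asarray(self.embedder.encode(query), dtype=float).ravel()
        scored = []
        for url, emb in self.node_embeddings.items():
            v = np.asarray(emb, dtype=float).ravel()
            if v.size != q.size or not v.any():
                continue  # skip missing or dimension-mismatched embeddings
            sim = float(q @ v / (np.linalg.norm(q) * np.linalg.norm(v) + 1e-9))
            scored.append((url, sim))
        scored.sort(key=lambda x: x[1], reverse=True)
        return [url for url, _ in scored[:top_k]]


def build_retriever(name: str, **kwargs) -> Retriever:
    """Config-driven construction, e.g. build_retriever("keyword", keyword_index=...)."""
    return RETRIEVER_REGISTRY[name](**kwargs)
```

With a registry like this, `_find_relevant_urls()` reduces to calling one or more retrievers chosen from the config in `main.py`, which is exactly the plug-and-play ablation setup LEGO-GraphRAG argues for.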
12 | 13 | ### 2) PolyG: Effective and Efficient GraphRAG with Adaptive Graph Traversal 14 | [arXiv:2504.02112](http://arxiv.org/abs/2504.02112v1) 15 | - **info**: Introduces adaptive traversal that selects expansion paths per query and budget, outperforming fixed BFS/DFS. 16 | - **how it works**: Learns/uses heuristics (e.g., keyword overlap, semantic similarity, node centrality) to choose next-hop neighbors under a token/time budget. 17 | - **how to integrate**: 18 | - In `GraphRAGSystem._expand_context_with_graph`, add a traversal policy that ranks neighbors by a weighted score of (cosine similarity, common keywords, degree/centrality, depth penalty). 19 | - Add a retrieval budget (tokens or nodes) in `main.py` args; stop expansion when budget is hit. 20 | 21 | ### 3) KG-Infused RAG: Augmenting Corpus-Based RAG with External Knowledge Graphs 22 | [arXiv:2506.09542](http://arxiv.org/abs/2506.09542v1) 23 | - **info**: Fuses unstructured retrieval with external KGs to improve grounding and coverage. 24 | - **how it works**: Entity/link detection connects text chunks to KG nodes; combines textual evidence with KG triples during retrieval and generation. 25 | - **how to integrate**: 26 | - In `deepcrawl.py`, add optional entity linking step (e.g., spaCy NER + simple Wikidata lookup) storing `entities` per node. 27 | - Extend `GraphRAGSystem._find_relevant_urls` to also retrieve from linked KG neighbors (stored as metadata) and merge with vector/keyword scores. 28 | 29 | ### 4) KET-RAG: A Cost-Efficient Multi-Granular Indexing Framework for Graph-RAG 30 | [arXiv:2502.09304](http://arxiv.org/abs/2502.09304v2) 31 | - **info**: Reduces cost via multi-granularity indices and routing, querying fine-grained units only when needed. 32 | - **how it works**: Two-stage retrieval: coarse (page/section) → fine (paragraph/snippet) with gating. 33 | - **how to integrate**: 34 | - During crawl, store multi-granularity chunks (page → section → paragraph) and precompute embeddings per level. 35 | - Implement two-stage retrieval in `GraphRAGSystem`: first rank pages/sections; only embed/expand into paragraphs for top-K seeds. 36 | 37 | ### 5) Walk&Retrieve: Zero-shot RAG via Knowledge Graph Walks 38 | [arXiv:2505.16849](http://arxiv.org/abs/2505.16849v2) 39 | - **info**: Uses guided random walks on the KG to gather compact, relevant subgraphs without supervision. 40 | - **how it works**: Starts from seed nodes derived from query terms/entities; performs biased walks to collect paths as context. 41 | - **how to integrate**: 42 | - Add `walk_based_retrieval(query, seeds, steps, bias)` that samples paths from the NetworkX graph in `graphrag.py`. 43 | - Use keyword/embedding similarity as transition bias; materialize unique nodes/edges from visited paths into the context. 44 | 45 | ### 6) Empowering GraphRAG with Knowledge Filtering and Integration 46 | [arXiv:2503.13804](http://arxiv.org/abs/2503.13804v1) 47 | - **info**: Improves GraphRAG by filtering noisy knowledge and integrating signals before generation. 48 | - **how it works**: Node/edge quality estimation; prune or down-weight low-signal parts; integrate multi-source knowledge consistently. 49 | - **how to integrate**: 50 | - Compute and store a `quality_score` per node/edge (signals: text length, dedup %, similarity to root domain, outbound degree anomalies). 51 | - During retrieval and expansion, weight scores by `quality_score`; drop nodes below a threshold. 
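A rough sketch of the node-level `quality_score` described above, written against the `Graph`/`GraphNode` dataclasses from `deepcrawl.py`; the signal weights and the 0.25 drop threshold are illustrative placeholders to tune, not values taken from the paper.

```python
# Hypothetical helper, e.g. docbook/core/quality.py
import math
from collections import Counter
from typing import Dict, Set


def node_quality_scores(graph) -> Dict[str, float]:
    """Score every crawled page in a deepcrawl Graph on a rough 0..1 quality scale."""
    # How often each keyword appears across the whole crawl (proxy for boilerplate / dedup %).
    keyword_freq = Counter(kw for n in graph.nodes.values() for kw in (n.keywords or []))

    scores: Dict[str, float] = {}
    for url, node in graph.nodes.items():
        # Signal 1: enough text to be useful (saturates around ~2000 characters).
        length_signal = min(len(node.content or "") / 2000.0, 1.0)

        # Signal 2: keyword distinctiveness — pages whose keywords also appear on many
        # other pages are likely navigation chrome or boilerplate.
        if node.keywords:
            avg_spread = sum(keyword_freq[kw] for kw in node.keywords) / len(node.keywords)
            dedup_signal = 1.0 / (1.0 + math.log1p(max(avg_spread - 1.0, 0.0)))
        else:
            dedup_signal = 0.0

        # Signal 3: outbound-degree anomaly — huge hub/index pages get down-weighted.
        degree_signal = 1.0 / (1.0 + max(len(node.children) - 20, 0) / 20.0)

        scores[url] = 0.5 * length_signal + 0.3 * dedup_signal + 0.2 * degree_signal
    return scores


def high_quality_urls(graph, threshold: float = 0.25) -> Set[str]:
    """URLs that pass the threshold; retrieval and expansion can be restricted to these."""
    return {url for url, s in node_quality_scores(graph).items() if s >= threshold}
```

In `graphrag.py`, the combined score in `_find_relevant_urls()` and the neighbor selection in `_expand_context_with_graph()` could then be weighted or filtered by these scores.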
52 | 53 | ### 7) When to use Graphs in RAG: A Comprehensive Analysis 54 | [arXiv:2506.05690](http://arxiv.org/abs/2506.05690v1) 55 | - **info**: Provides criteria for when GraphRAG beats vanilla RAG (e.g., multi-hop, hierarchical, entity-rich queries). 56 | - **how it works**: Empirical analysis across tasks shows benefits w.r.t. structure and reasoning depth. 57 | - **how to integrate**: 58 | - Add a routing policy in `main.py`: detect entity density, query length, and estimated hop depth; choose between vector-only, graph-only, or hybrid pipeline. 59 | - Log decisions to compare outcomes across policies. 60 | 61 | ### 8) GraphRAG under Fire 62 | [arXiv:2501.14050](http://arxiv.org/abs/2501.14050v3) 63 | - **info**: Evaluates GraphRAG robustness against poisoning/attacks; proposes defenses. 64 | - **how it works**: Identifies attack surfaces (malicious pages, link spam) and mitigation (provenance, trust scoring, anomaly detection). 65 | - **how to integrate**: 66 | - Track provenance per node (crawl timestamp, domain, referrer). Add a `trust_score` combining domain whitelist/blacklist and anomaly scores. 67 | - Filter/rerank retrieval by `trust_score`; add simple content sanitization (strip scripts/iframes) in `deepcrawl.py` extraction. 68 | 69 | ### 9) GraphRAG-Bench: Challenging Domain-Specific Reasoning Benchmark 70 | [arXiv:2506.02404](http://arxiv.org/abs/2506.02404v3) 71 | - **info**: Benchmark focusing on domain-specific, multi-step reasoning for GraphRAG. 72 | - **how it works**: Curates tasks requiring graph traversal, cross-page synthesis, and hierarchical context. 73 | - **how to integrate**: 74 | - Create a small benchmark YAML/JSON in `output/bench/` with question → gold citations → expected hops from your crawled docs. 75 | - Add `--eval bench.json` flag in `main.py` to run batch evaluation and record metrics (EM, F1, citation-precision). 76 | 77 | ### 10) Know3-RAG: Knowledge-aware RAG with Adaptive Retrieval, Generation, and Filtering 78 | [arXiv:2505.12662](http://arxiv.org/abs/2505.12662v1) 79 | - **info**: Iterative pipeline adaptively retrieves, generates, and filters content to reduce hallucinations. 80 | - **how it works**: Uses a control loop: retrieve → generate → evidence-check → refine retrieval. 81 | - **how to integrate**: 82 | - Add an optional iterative loop in `GraphRAGSystem.retrieve_and_generate`: after first answer, run a verifier that checks citation span overlap; if low, trigger another retrieval round with adjusted seeds. 83 | - Introduce a lightweight reranker (e.g., cosine + keyword + trust score) before generation. 84 | 85 | --- 86 | 87 | ### General quick wins for this codebase 88 | - **Budgeted expansion**: Add `--token_budget` and trim contexts by highest utility-per-token. 89 | - **Neighbor selection**: Blend cosine similarity with `common_keywords` and centrality for picking neighbors. 90 | - **Provenance & trust**: Store `provenance`, `trust_score` in HDF5; use in ranking. 91 | - **Two-stage retrieval**: Coarse (page/section) → fine (paragraph) to cut cost. 92 | - **Evaluation harness**: Batch mode with saved prompts/contexts for reproducibility and metric logging. 
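As a concrete starting point for the "Neighbor selection" and "Budgeted expansion" items, here is a sketch of a blended neighbor-ranking helper that could replace the current "first 5 neighbors" rule in `GraphRAGSystem._expand_context_with_graph()`. The weights and the use of degree centrality are illustrative choices, not something the papers above prescribe.

```python
# Illustrative helper; assumes the NetworkX DiGraph built by GraphRAGSystem,
# whose edges carry 'semantic_similarity' and 'common_keywords' attributes
# and whose nodes carry a 'depth' attribute.
from typing import List

import networkx as nx


def rank_neighbors(knowledge_graph: nx.DiGraph, seed_url: str, seed_keywords: List[str],
                   max_neighbors: int = 5, centrality=None) -> List[str]:
    """Rank a seed node's outgoing neighbors by edge similarity, keyword overlap,
    centrality, and a small depth penalty."""
    if centrality is None:
        # Cheap global signal; in practice compute this once per query, not per seed.
        centrality = nx.degree_centrality(knowledge_graph)

    seed_kw = {k.lower() for k in (seed_keywords or [])}
    scored = []
    for neighbor in knowledge_graph.neighbors(seed_url):
        edge = knowledge_graph.get_edge_data(seed_url, neighbor) or {}
        sim = float(edge.get("semantic_similarity", 0.0))
        overlap = len(set(edge.get("common_keywords", [])) & seed_kw)
        depth_penalty = 0.05 * int(knowledge_graph.nodes[neighbor].get("depth", 0))
        score = 1.5 * sim + 1.0 * overlap + 0.5 * centrality.get(neighbor, 0.0) - depth_penalty
        scored.append((neighbor, score))

    scored.sort(key=lambda x: x[1], reverse=True)
    return [n for n, _ in scored[:max_neighbors]]
```

Combined with the `token_budget` accounting that already exists in `_expand_context_with_graph()`, this yields budgeted expansion that spends tokens on the highest-utility neighbors first.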
93 | 94 | 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # DocBook: GraphRAG Documentation Assistant 3 | [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/Saarthakkj/docbook) 4 | 5 | 6 | A Python-based system that crawls documentation websites using Crawl4AI, builds a knowledge graph with embeddings and relationships, and provides intelligent question-answering capabilities using Retrieval-Augmented Generation (RAG) powered by Google's Gemini API. 7 | 8 | ## Project Overview 9 | 10 | DocBook is designed to create an intelligent assistant for any documentation website. It performs a deep crawl of the target site, extracts content, builds a structured knowledge graph with semantic embeddings, and enables natural language querying of the documentation using an enhanced GraphRAG implementation. 11 | 12 | Key technical aspects: 13 | - **Crawling**: Uses Crawl4AI for asynchronous, depth-limited crawling with content extraction. 14 | - **Graph Construction**: Builds a hierarchical graph with nodes representing pages, including metadata like depth, keywords (extracted via TextRank), and embeddings (using Sentence Transformers). 15 | - **RAG System**: Combines keyword matching, semantic similarity (cosine similarity on embeddings), and graph traversal for context retrieval, followed by generation using Gemini API. 16 | - **Persistence**: Saves knowledge graphs as HDF5 for efficient storage and reuse, avoiding repeated crawls. 17 | 18 | ## Features 19 | 20 | - **Asynchronous Deep Crawling**: BFS-based crawling with configurable max depth and pages, skipping 404 errors. 21 | - **Keyword Extraction**: Uses TextRank (from summa library) with fallback to frequency-based method for robustness. 22 | - **Embeddings**: Generates vector embeddings using 'all-MiniLM-L6-v2' model from Sentence Transformers. 23 | - **Knowledge Graph**: Directed graph with NAVIGATES_TO relationships, including common keywords and semantic similarity scores. 24 | - **Multi-Method Retrieval**: Combines keyword indexing, semantic similarity, and graph expansion for relevant context. 25 | - **Interactive Q&A**: Command-line interface for querying with 'quit' to exit. 26 | - **HDF5 Storage**: Efficient binary format for storing large graphs with metadata, embeddings, and hierarchical structure. 27 | 28 | ## Architecture and Technical Details 29 | 30 | ### Core Components 31 | 32 | 1. **deepcrawl.py**: 33 | - **GraphNode dataclass**: Represents nodes with url, content, depth, keywords, embedding, and children list. 34 | - **Graph dataclass**: Contains nodes dict, edges list, and metadata dict with total_nodes, max_depth, root_url. 35 | - **Deep crawling**: Uses BFS strategy with AsyncWebCrawler, configurable depth/page limits. 36 | - **Keyword extraction**: Primary method uses TextRank via `extract_keywords_textrank()`, fallback to frequency-based analysis. 37 | - **Embeddings**: Creates embeddings using SentenceTransformer('all-MiniLM-L6-v2'). 38 | - **Edge computation**: Calculates semantic similarity as ratio of common keywords between parent-child nodes. 39 | - **HDF5 persistence**: `save_graph_hdf5()` stores graph in hierarchical format with metadata, nodes (with embeddings/keywords), and edges. 40 | - **Graph utilities**: `print_graph_structure()` for visualization, `find_parent_node()` for tree traversal. 41 | 42 | 2. 
**graphrag.py**: 43 | - **Entity**: Dataclass with source_urls, keywords, content_snippet, embedding, depth. 44 | - **Relationship**: Dataclass with source, target, source_urls, common_keywords, semantic_similarity. 45 | - **GraphRAGSystem**: Main RAG class with: 46 | - Initialization with Graph object and Gemini API key 47 | - SentenceTransformer model loading for embeddings 48 | - NetworkX DiGraph construction for graph operations 49 | - Keyword index (defaultdict) mapping keywords to URL lists 50 | - `load_from_kg_json()`: Creates entities from GraphNodes, relationships from edges 51 | - `retrieve_and_generate()`: Main query method combining retrieval and generation 52 | - `_find_relevant_urls()`: Multi-score ranking (keyword overlap + cosine similarity) 53 | - `_expand_context_with_graph()`: Graph traversal to add neighbor nodes (top 5 per seed) 54 | - `_generate_enhanced_answer()`: Constructs detailed prompt with context sections for LLM 55 | 56 | 3. **main.py**: 57 | - **CLI argument parsing**: Required args (--url, --output_dir, --name), optional (--max_depth, --max_pages). 58 | - **Graph persistence check**: Looks for existing `{name}_kg.h5` file to avoid re-crawling. 59 | - **Crawling workflow**: If no existing graph, runs `deepcrawl.deep_crawl()` and saves with `save_graph_hdf5()`. 60 | - **Graph loading**: Uses `load_graph_hdf5()` to reconstruct Graph object from HDF5. 61 | - **RAG initialization**: Creates GraphRAGSystem from loaded graph. 62 | - **Interactive loop**: Continuous Q&A interface with error handling and 'quit' command. 63 | - **Debug utilities**: `debug_save_process()` and `inspect_saved_graph()` for troubleshooting. 64 | 65 | ### How It Works (Step-by-Step) 66 | 67 | 1. **Crawling Phase**: 68 | - BFS traversal starting from root URL with depth/page limits 69 | - Content extraction to markdown, keyword extraction via TextRank 70 | - Embedding generation for each page's content 71 | - Parent-child relationship establishment with similarity scoring 72 | 73 | 2. **Graph Construction**: 74 | - Nodes stored as GraphNode objects with full content and metadata 75 | - Edges contain semantic similarity scores and common keywords 76 | - HDF5 storage with hierarchical structure: /metadata, /nodes, /nodes_index, /edges 77 | 78 | 3. **RAG System Loading**: 79 | - Graph reconstruction from HDF5 into memory 80 | - Entity creation from nodes, relationship mapping from edges 81 | - NetworkX graph building for efficient traversal 82 | - Keyword index construction for fast lookup 83 | 84 | 4. **Query Processing**: 85 | - Keyword matching and embedding similarity scoring 86 | - Graph expansion to include relevant neighbors 87 | - Context compilation with entities, relationships, and content snippets 88 | - LLM generation with structured prompt including instructions and context 89 | 90 | ## Installation 91 | 92 | ### Prerequisites 93 | - Python 3.8+ 94 | - uv (fast Python package manager and environment tool) 95 | - Google Gemini API key (from https://ai.dev/apikey) 96 | 97 | ### Steps (using uv) 98 | 1. Clone the repository: 99 | ``` 100 | git clone https://github.com/your-org/docbook.git 101 | cd docbook 102 | ``` 103 | 104 | 2. Install uv (if not already installed): 105 | ``` 106 | curl -LsSf https://astral.sh/uv/install.sh | sh 107 | ``` 108 | 109 | 3. Create and activate a virtual environment: 110 | ``` 111 | uv venv .venv 112 | . .venv/bin/activate 113 | ``` 114 | 115 | 4. Install dependencies with uv: 116 | ``` 117 | uv pip install -r requirements.txt 118 | ``` 119 | 120 | 5. 
Set environment variables in `.env`: 121 | ``` 122 | gemini_api_key=your_api_key_here 123 | ``` 124 | 125 | ## Usage 126 | 127 | Run a crawl and interactive query session (first run creates the HDF5 knowledge graph, later runs reuse it): 128 | 129 | ``` 130 | python main.py \ 131 | --url https://docs.crawl4ai.com/ \ 132 | --output_dir ./output \ 133 | --name crawl4ai_docs \ 134 | --max_depth 3 \ 135 | --max_pages 50 136 | ``` 137 | 138 | - Output file: `./output/crawl4ai_docs_kg.h5`. 139 | - Subsequent runs with the same `--name` and `--output_dir` load the graph directly (no re-crawl). 140 | - In the prompt, type `quit` to exit. 141 | 142 | Notes: 143 | - The environment variable name is lowercase and case-sensitive: `gemini_api_key`. 144 | - Embedding model: `'all-MiniLM-L6-v2'` via Sentence Transformers. 145 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from deepcrawl import GraphNode , Graph 2 | import deepcrawl 3 | import asyncio 4 | import argparse 5 | import sys 6 | import subprocess 7 | import os 8 | import json 9 | import os 10 | from dotenv import load_dotenv 11 | from graphrag import GraphRAGSystem, create_graphrag 12 | from deepcrawl import save_graph_hdf5 13 | 14 | load_dotenv() 15 | 16 | 17 | 18 | def parse_arguments(): 19 | '''parse command line arguments''' 20 | 21 | parser = argparse.ArgumentParser( 22 | description= "GraphRAG implementation using Crawl4AI" , 23 | formatter_class = argparse.RawDescriptionHelpFormatter , 24 | epilog=""" 25 | Examples : 26 | # Quick start with default settings 27 | python main.py --url https://docs.crawl4ai.com/ 28 | 29 | #Custom crawling parameters 30 | python main.py --url https://crawl4ai.com/ --max_depth 3 max_pages 50 31 | """ 32 | ) 33 | 34 | 35 | parser = argparse.ArgumentParser(description = "pass multiple varirables") 36 | parser.add_argument("--max_depth" , type = int , help = "maximum depth of pages") 37 | parser.add_argument ("--max_pages" , type = int , help = "maximum number of pages") 38 | 39 | parser.add_argument("--url" , type = str , required = True, help = "doc url") 40 | parser.add_argument("--output_dir" , type = str , required = True, help = "output directory") 41 | 42 | parser.add_argument("--name" , type = str , required = True, help = "name of your documentation") 43 | parser.add_argument("--token_budget", type=int, default=None, help="Approximate token budget for context assembly") 44 | 45 | 46 | return parser.parse_args() 47 | 48 | 49 | async def run_deepcrawl(url : str , max_depth : int , max_pages : int ) -> Graph: 50 | """run deepcrawl.py with specified parameters""" 51 | 52 | 53 | return await deepcrawl.main(url , max_depth , max_pages ) 54 | 55 | import json 56 | import os 57 | 58 | def inspect_saved_graph(filepath: str): 59 | """Inspect what's actually in the saved JSON file""" 60 | print(f"🔍 Inspecting saved graph: {filepath}") 61 | 62 | if not os.path.exists(filepath): 63 | print("❌ File doesn't exist") 64 | return 65 | 66 | file_size = os.path.getsize(filepath) 67 | print(f"📏 File size: {file_size} bytes") 68 | 69 | try: 70 | with open(filepath, 'r', encoding='utf-8') as f: 71 | data = json.load(f) 72 | 73 | print("📊 JSON structure analysis:") 74 | print(f" 🗂️ Top-level keys: {list(data.keys())}") 75 | 76 | if 'metadata' in data: 77 | print(f" 📈 Metadata: {data['metadata']}") 78 | 79 | if 'nodes' in data: 80 | node_count = len(data['nodes']) 81 | print(f" 🌐 Nodes count: 
{node_count}") 82 | 83 | if node_count > 0: 84 | # Show first few node keys 85 | node_keys = list(data['nodes'].keys())[:3] 86 | print(f" 🔑 Sample node keys: {node_keys}") 87 | 88 | # Show structure of first node 89 | if node_keys: 90 | first_node = data['nodes'][node_keys[0]] 91 | print(f" 📝 First node structure: {list(first_node.keys())}") 92 | print(f" 📄 First node content length: {len(first_node.get('content', ''))}") 93 | print(f" 🏷️ First node keywords count: {len(first_node.get('keywords', []))}") 94 | else: 95 | print(" ⚠️ No nodes found!") 96 | 97 | if 'edges' in data: 98 | edge_count = len(data['edges']) 99 | print(f" 🔗 Edges count: {edge_count}") 100 | 101 | if edge_count > 0: 102 | print(f" 🔗 First edge: {data['edges'][0]}") 103 | else: 104 | print(" ⚠️ No edges found!") 105 | 106 | return data 107 | 108 | except json.JSONDecodeError as e: 109 | print(f"❌ JSON decode error: {e}") 110 | except Exception as e: 111 | print(f"❌ Error reading file: {e}") 112 | 113 | def debug_graph_before_save(graph): 114 | """Debug the graph object before saving""" 115 | print("🔍 Debugging graph object before save:") 116 | print(f" 🌐 Root URL: {graph.url}") 117 | print(f" 📊 Root depth: {graph.depth}") 118 | print(f" 📄 Root content length: {len(graph.content) if graph.content else 0}") 119 | print(f" 🔗 Root children count: {len(graph.children)}") 120 | 121 | # Traverse and count all nodes 122 | all_nodes = [] 123 | stack = [graph] 124 | while stack: 125 | node = stack.pop() 126 | all_nodes.append(node) 127 | stack.extend(node.children) 128 | 129 | print(f" 🌳 Total nodes in tree: {len(all_nodes)}") 130 | 131 | # Show depth distribution 132 | depth_counts = {} 133 | for node in all_nodes: 134 | depth_counts[node.depth] = depth_counts.get(node.depth, 0) + 1 135 | 136 | print(f" 📊 Depth distribution: {depth_counts}") 137 | 138 | # Show some sample URLs 139 | sample_urls = [node.url for node in all_nodes[:5]] 140 | print(f" 🔗 Sample URLs: {sample_urls}") 141 | 142 | # Check for empty content 143 | empty_content_count = sum(1 for node in all_nodes if not node.content or len(node.content.strip()) == 0) 144 | print(f" ⚠️ Nodes with empty content: {empty_content_count}/{len(all_nodes)}") 145 | 146 | return all_nodes 147 | 148 | def debug_save_process(graph, filepath: str): 149 | """Debug the entire save process step by step""" 150 | print("🚀 Starting comprehensive save debug...") 151 | 152 | # 1. Debug input graph 153 | all_nodes = debug_graph_before_save(graph) 154 | 155 | if len(all_nodes) == 0: 156 | print("❌ PROBLEM: Graph has no nodes!") 157 | return 158 | 159 | if len(all_nodes) == 1: 160 | print("⚠️ WARNING: Graph has only root node (no children crawled)") 161 | 162 | # 2. Test keyword extraction on a sample 163 | print("\n🔍 Testing keyword extraction...") 164 | try: 165 | sample_node = all_nodes[0] 166 | if hasattr(sample_node, 'content') and sample_node.content: 167 | print(f" 📝 Sample content length: {len(sample_node.content)}") 168 | # Test if extract_keywords_textrank function exists and works 169 | try: 170 | keywords = extract_keywords_textrank(sample_node.content, 5) # Assuming top_k=5 171 | print(f" ✅ Keywords extracted: {keywords}") 172 | except NameError: 173 | print(" ❌ extract_keywords_textrank function not defined!") 174 | except Exception as e: 175 | print(f" ❌ Keyword extraction failed: {e}") 176 | else: 177 | print(" ⚠️ Sample node has no content") 178 | except Exception as e: 179 | print(f" ❌ Error testing keywords: {e}") 180 | 181 | # 3. 
Test common keywords function 182 | print("\n🔍 Testing common keywords function...") 183 | try: 184 | test_result = get_common_keywords(['test', 'word'], ['test', 'another']) 185 | print(f" ✅ get_common_keywords works: {test_result}") 186 | except NameError: 187 | print(" ❌ get_common_keywords function not defined!") 188 | except Exception as e: 189 | print(f" ❌ get_common_keywords failed: {e}") 190 | 191 | # 4. Now try the actual save 192 | print(f"\n💾 Attempting save to: {filepath}") 193 | try: 194 | save_graph(graph, filepath) 195 | 196 | # 5. Inspect what was actually saved 197 | print("\n🔍 Inspecting saved result...") 198 | inspect_saved_graph(filepath) 199 | 200 | except Exception as e: 201 | print(f"❌ Save failed: {e}") 202 | import traceback 203 | traceback.print_exc() 204 | 205 | import h5py 206 | import numpy as np 207 | 208 | 209 | def load_graph_hdf5(filepath: str) -> Graph: 210 | """Load the knowledge graph from HDF5 and reconstruct the GraphNode tree.""" 211 | 212 | with h5py.File(filepath, "r") as f: 213 | # Load metadata 214 | metadata = {} 215 | if "metadata" in f: 216 | meta_grp = f["metadata"] 217 | for k in meta_grp.attrs: 218 | metadata[k] = meta_grp.attrs[k] 219 | root_url = metadata.get("root_url") 220 | if not root_url: 221 | raise ValueError("No root_url found in metadata") 222 | 223 | # Build an ID -> URL map and load nodes (url -> GraphNode) 224 | id_to_url = {} 225 | if "nodes_index" in f: 226 | # Preferred path: use the explicit index 227 | idx = f["nodes_index"][:] 228 | for row in idx: 229 | rid = row["id"].decode("utf-8") if isinstance(row["id"], (bytes, bytearray)) else str(row["id"]) 230 | rurl = row["url"].decode("utf-8") if isinstance(row["url"], (bytes, bytearray)) else str(row["url"]) 231 | id_to_url[rid] = rurl 232 | 233 | node_map = {} 234 | if "nodes" not in f: 235 | raise ValueError("No nodes found in HDF5 file.") 236 | nodes_grp = f["nodes"] 237 | for node_group_name in nodes_grp: 238 | node_grp = nodes_grp[node_group_name] 239 | # Recover URL and depth 240 | raw_url = node_grp.attrs.get("url", "") 241 | url = raw_url.decode("utf-8") if isinstance(raw_url, (bytes, bytearray)) else str(raw_url) 242 | raw_depth = node_grp.attrs.get("depth", 0) 243 | depth = int(raw_depth) 244 | 245 | # Datasets: content, keywords, embedding 246 | content = "" 247 | if "content" in node_grp: 248 | raw_content = node_grp["content"][()] 249 | content = raw_content.decode("utf-8") if isinstance(raw_content, (bytes, bytearray)) else str(raw_content) 250 | 251 | keywords = [] 252 | if "keywords" in node_grp: 253 | kw_arr = node_grp["keywords"][:] 254 | for kw in kw_arr: 255 | if isinstance(kw, (bytes, bytearray)): 256 | kw = kw.decode("utf-8", errors="ignore") 257 | kw = str(kw) 258 | if kw: 259 | keywords.append(kw) 260 | 261 | embedding = None 262 | if "embedding" in node_grp: 263 | emb = node_grp["embedding"][:] 264 | try: 265 | embedding = emb.astype(float).tolist() 266 | except Exception: 267 | embedding = [float(x) for x in emb] 268 | 269 | node = GraphNode( 270 | url=url, 271 | content=content, 272 | depth=depth, 273 | keywords=keywords, 274 | embedding=embedding, 275 | ) 276 | node_map[url] = node 277 | 278 | # Load and apply edges to link children 279 | graph = Graph(nodes=node_map, edges=[], metadata=metadata) 280 | if "edges" in f: 281 | edges_dataset = f["edges"][:] 282 | for edge in edges_dataset: 283 | # Resolve IDs back to URLs 284 | raw_sid = edge["source_id"] 285 | raw_tid = edge["target_id"] 286 | sid = raw_sid.decode("utf-8") if isinstance(raw_sid, 
(bytes, bytearray)) else str(raw_sid) 287 | tid = raw_tid.decode("utf-8") if isinstance(raw_tid, (bytes, bytearray)) else str(raw_tid) 288 | source_url = id_to_url.get(sid, None) 289 | target_url = id_to_url.get(tid, None) 290 | if source_url is None or target_url is None: 291 | # Fallback: try to find URLs by scanning node groups' attrs if index is missing 292 | if not id_to_url: 293 | # Attempt to rebuild id_to_url from group names (n_) and attrs 294 | for ngn in nodes_grp: 295 | ngr = nodes_grp[ngn] 296 | rid = ngn[2:] if ngn.startswith("n_") else ngn 297 | rurl = ngr.attrs.get("url", "") 298 | rurl = rurl.decode("utf-8") if isinstance(rurl, (bytes, bytearray)) else str(rurl) 299 | id_to_url[rid] = rurl 300 | source_url = id_to_url.get(sid) 301 | target_url = id_to_url.get(tid) 302 | sim = float(edge["semantic_similarity"]) if "semantic_similarity" in edge.dtype.names else 0.0 303 | raw_ckw = edge["common_keywords"] if "common_keywords" in edge.dtype.names else b"" 304 | if isinstance(raw_ckw, (bytes, bytearray)): 305 | common_kw_str = raw_ckw.decode("utf-8", errors="ignore") 306 | else: 307 | common_kw_str = str(raw_ckw) 308 | common_kw = common_kw_str.split(',') if common_kw_str else [] 309 | 310 | if source_url in node_map and target_url in node_map: 311 | node_map[source_url].children.append(node_map[target_url]) 312 | 313 | graph.edges.append({ 314 | 'source': source_url if source_url is not None else '', 315 | 'target': target_url if target_url is not None else '', 316 | 'common_keywords': common_kw, 317 | 'semantic_similarity': sim 318 | }) 319 | return graph 320 | 321 | 322 | def print_graph_nodes_sample(graph: Graph, limit: int = 10): 323 | """Print a small sample of nodes to verify that the loaded graph is not empty.""" 324 | try: 325 | total_nodes = len(graph.nodes) if hasattr(graph, 'nodes') and graph.nodes is not None else 0 326 | total_edges = len(graph.edges) if hasattr(graph, 'edges') and graph.edges is not None else 0 327 | except Exception: 328 | total_nodes = 0 329 | total_edges = 0 330 | 331 | if total_nodes == 0: 332 | print("⚠️ Graph appears to have no nodes.") 333 | return 334 | 335 | print(f"🌐 Total nodes: {total_nodes}") 336 | print(f"🔗 Total edges: {total_edges}") 337 | sample_urls = list(graph.nodes.keys())[:limit] 338 | print(f"🔎 Sample of {len(sample_urls)} node URLs: {sample_urls}") 339 | for url in sample_urls: 340 | node = graph.nodes[url] 341 | depth = getattr(node, 'depth', None) 342 | kw_count = len((getattr(node, 'keywords', []) or [])) 343 | content_len = len((getattr(node, 'content', '') or '')) 344 | print(f" - {url} (depth={depth}, keywords={kw_count}, content_len={content_len})") 345 | 346 | 347 | async def main(): 348 | """Main function to demonstrate GraphRAG implementation""" 349 | 350 | 351 | args = parse_arguments() 352 | 353 | 354 | 355 | # Check for required API key 356 | gemini_api_key = os.getenv("gemini_api_key") 357 | if not gemini_api_key: 358 | print("❌ Error: GEMINI_API_KEY environment variable not set") 359 | print("Please set your Gemini API key in the .env file") 360 | return 361 | 362 | print("🚀 Starting Docbook ") 363 | print("=" * 60) 364 | 365 | # Step 1: Load the knowledge graph from deepcrawl.py output 366 | print("📂 Loading graphRG system.....") 367 | 368 | rag_system = None 369 | 370 | kg_path = os.path.join(args.output_dir, f"{args.name}_kg.h5") 371 | 372 | if os.path.exists(kg_path): 373 | print(f"Found existing knowledge graph at {kg_path}") 374 | graph = load_graph_hdf5(kg_path) 375 | # print_graph_nodes_sample(graph, 
limit=5) 376 |     else: 377 |         print(f"No existing graph found, running deepcrawl...") 378 |         graph = await run_deepcrawl(args.url, args.max_depth, args.max_pages) 379 |         # root = graph.nodes[args.url] 380 |         save_graph_hdf5(graph, kg_path) 381 |         print(f"Knowledge graph saved to {kg_path}") 382 |         # print_graph_nodes_sample(graph, limit=5) 383 | 384 |     rag_system = await create_graphrag( 385 |         graph, 386 |         gemini_api_key, 387 |         token_budget=args.token_budget 388 |     ) 389 |     print("🤖 GraphRAG Query Interface Ready!") 390 |     print(f"Ask questions about {args.name} documentation.") 391 |     print("Type 'quit' to exit.") 392 |     print("=" * 60) 393 | 394 | 395 |     while True: 396 |         print("\n" + "-" * 40) 397 |         user_query = input("🔍 Enter your question: ").strip() 398 | 399 |         if user_query.lower() in ['quit', 'exit', 'q']: 400 |             print("👋 Thanks for using GraphRAG! Goodbye!") 401 |             break 402 | 403 |         if not user_query: 404 |             continue 405 | 406 |         try: 407 |             print("\n🔎 Processing your query...") 408 |             answer = await rag_system.retrieve_and_generate(user_query) 409 |             print("\n📝 Answer:") 410 |             print("=" * 50) 411 |             print(answer) 412 |             print("=" * 50) 413 | 414 |         except Exception as e: 415 |             print(f"❌ Error processing query: {e}") 416 |             print("Please try a different question.") 417 | 418 | if __name__ == "__main__": 419 |     asyncio.run(main()) 420 | -------------------------------------------------------------------------------- /graphrag.py: -------------------------------------------------------------------------------- 1 | """ 2 | TODO : 3 | - introduce the cuGraph backend for NetworkX to accelerate graph operations 4 | - any other score to add in the _find_relevant_urls method? 5 | """ 6 | 7 | #!/usr/bin/env python3 8 | """ 9 | GraphRAG System for Documentation Crawling 10 | Optimized to work directly with the HDF5 (h5py) output produced by deepcrawl.py 11 | """ 12 | 13 | import asyncio 14 | import json 15 | import numpy as np 16 | from typing import Dict, List, Set, Tuple, Optional 17 | from dataclasses import dataclass 18 | # pip install sentence-transformers 19 | from sentence_transformers import SentenceTransformer 20 | import networkx as nx 21 | from sklearn.metrics.pairwise import cosine_similarity 22 | import os 23 | from dotenv import load_dotenv 24 | from collections import Counter, defaultdict 25 | from google import genai 26 | from google.genai import types 27 | from deepcrawl import GraphNode, Graph 28 | 29 | load_dotenv() 30 | 31 | @dataclass 32 | class Entity: 33 |     source_urls: List[str] 34 |     keywords: List[str] 35 |     content_snippet: str 36 |     depth: int 37 |     embedding: List[float] 38 | 39 | @dataclass 40 | class Relationship: 41 |     source: str 42 |     target: str 43 |     source_urls: List[str] 44 |     common_keywords: List[str] 45 |     semantic_similarity: float 46 | 47 | class GraphRAGSystem: 48 |     def __init__(self, graph: Graph, gemini_api_key: str, token_budget: Optional[int] = None): 49 |         self.entities: Dict[str, Entity] = {} 50 |         self.relationships: List[Relationship] = [] 51 |         self.knowledge_graph = nx.DiGraph() 52 |         self.keyword_index = defaultdict(list)  # Keyword -> URLs that contain it 53 |         self.graph = graph 54 |         self.token_budget: Optional[int] = token_budget 55 | 56 |         # Initialize the query embedding model. Note: deepcrawl.py stores node embeddings from 'all-MiniLM-L6-v2' (384-dim); using the heavier 'all-mpnet-base-v2' (768-dim) here means stored vectors get re-embedded on first use (see _find_relevant_urls). 57 |         self.embedding_model = SentenceTransformer('all-mpnet-base-v2') 58 | 59 |         # Initialize Gemini with the API key passed in, falling back to the gemini_api_key env var 60 |         client = genai.Client(api_key=gemini_api_key or os.environ['gemini_api_key']) 61 |         chat = client.chats.create(model='gemini-2.0-flash') 62 |         self.llm = chat 63 | 64 |     def _estimate_tokens_text(self, text: str) -> int: 65 |         """Rough token estimator: ~1 token per 4 characters 
as a safe upper bound.""" 66 | if not text: 67 | return 0 68 | # Use characters/4 to avoid underestimating for long words; clamp at >= 1 for non-empty 69 | return max(1, len(text) // 4) 70 | 71 | def _estimate_entity_tokens(self, entity: Entity) -> int: 72 | """Estimate tokens contributed by an entity block in the prompt.""" 73 | url = entity.source_urls[0] if entity.source_urls else "" 74 | header = self._extract_entity_name(url) 75 | meta = ", ".join((entity.keywords or [])[:5]) 76 | overhead = len(header) + len(url) + len(meta) + 32 # headings + labels 77 | return (overhead // 4) + self._estimate_tokens_text(entity.content_snippet or "") 78 | 79 | def _normalize_keyword(self, kw) -> str: 80 | """Convert keyword-like values (bytes, numpy scalars) to normalized lowercase str.""" 81 | try: 82 | import numpy as _np # local import to avoid top-level alias issues 83 | if isinstance(kw, (_np.bytes_, bytes)): 84 | kw = kw.decode('utf-8', errors='ignore') 85 | except Exception: 86 | # If numpy is not available or decode fails, fall back to str 87 | pass 88 | if isinstance(kw, bytes): 89 | kw = kw.decode('utf-8', errors='ignore') 90 | return str(kw).strip().lower() 91 | 92 | def load_from_kg_json(self): 93 | """ load knoweldge graph directly from graphNode""" 94 | 95 | 96 | print("🔍 Loading knowledge graph from graphNode...") 97 | 98 | 99 | # Normalize keywords and build keyword index 100 | for url, node in self.graph.nodes.items(): 101 | normalized_keywords = [] 102 | try: 103 | iterable_keywords = node.keywords or [] 104 | except Exception: 105 | iterable_keywords = [] 106 | for kw in iterable_keywords: 107 | if kw is None: 108 | continue 109 | norm_kw = self._normalize_keyword(kw) 110 | if not norm_kw: 111 | continue 112 | normalized_keywords.append(norm_kw) 113 | self.keyword_index[norm_kw].append(node.url) 114 | node.keywords = normalized_keywords 115 | 116 | # Create entities and relationships before building the NetworkX graph 117 | nodes_list = [self.graph.nodes[_] for _ in self.graph.nodes] 118 | self._create_entities_from_nodes(nodes_list) 119 | self._create_relationships_from_edges(self.graph.edges) 120 | 121 | # Now build the NetworkX graph with nodes and edges available 122 | self._build_networkx_graph() 123 | 124 | def _create_entities_from_nodes(self, all_nodes : List[GraphNode]): 125 | """Create entities from kg.json nodes""" 126 | print("🏗️ Creating entities from nodes...") 127 | 128 | for node in all_nodes: 129 | entity = Entity( 130 | source_urls=[node.url], 131 | keywords=node.keywords, 132 | content_snippet=node.content, 133 | embedding = node.embedding , 134 | depth=node.depth, 135 | ) 136 | 137 | self.entities[node.url] = entity 138 | 139 | def _create_relationships_from_edges(self, edges: List[Dict]): 140 | """Create relationships from kg.json edges""" 141 | print("🔗 Creating relationships from edges...") 142 | 143 | for edge in edges: 144 | relationship = Relationship( 145 | source=edge["source"], 146 | target=edge["target"], 147 | source_urls=[edge["source"], edge["target"]], 148 | common_keywords=edge.get("common_keywords", []), 149 | semantic_similarity=edge.get("semantic_similarity", 0.0) 150 | ) 151 | 152 | self.relationships.append(relationship) 153 | 154 | def _extract_entity_name(self, url: str) -> str: 155 | """Extract a readable entity name from URL""" 156 | if url.endswith('/'): 157 | url = url[:-1] 158 | 159 | parts = url.split('/') 160 | if len(parts) >= 2: 161 | name = parts[-1].replace('-', ' ').replace('_', ' ').title() 162 | if name: 163 | return name 164 
| 165 | return url.split('/')[-1] or "Home" 166 | 167 | def _build_networkx_graph(self): 168 | """Build NetworkX graph from entities and relationships""" 169 | print("🕸️ Building NetworkX graph...") 170 | 171 | # Add nodes 172 | for url, node in self.graph.nodes.items(): 173 | self.knowledge_graph.add_node( 174 | url, 175 | keywords=node.keywords, 176 | depth=node.depth, 177 | embedding=node.embedding 178 | ) 179 | 180 | # Add edges 181 | for rel in self.relationships: 182 | if rel.source in self.entities and rel.target in self.entities: 183 | self.knowledge_graph.add_edge( 184 | rel.source, 185 | rel.target, 186 | common_keywords=rel.common_keywords, 187 | semantic_similarity=rel.semantic_similarity 188 | ) 189 | 190 | async def retrieve_and_generate(self, query: str, top_k: int = 10) -> str: 191 | """Enhanced query processing using both keywords and embeddings""" 192 | 193 | if self.token_budget is not None: 194 | print(f"[Budget] Token budget for this query: {self.token_budget} tokens") 195 | 196 | # Step 1: Find relevant URLs using multiple methods 197 | relevant_urls = await self._find_relevant_urls(query, top_k) 198 | 199 | # Step 2: Expand context using graph relationships 200 | expanded_context = self._expand_context_with_graph(relevant_urls) 201 | 202 | # Step 4: Generate answer using LLM 203 | answer = await self._generate_enhanced_answer(query, expanded_context) 204 | 205 | return answer 206 | 207 | async def _find_relevant_urls(self, query: str, top_k: int) -> List[str]: 208 | """Find relevant top k URLs using keyword matching and semantic similarity""" 209 | # Method 1: Keyword-based retrieval 210 | query_words = set(query.lower().split()) 211 | keyword_scores = defaultdict(float) 212 | 213 | for word in query_words: 214 | if word in self.keyword_index: 215 | for url in self.keyword_index[word]: 216 | keyword_scores[url] += 1.0 217 | 218 | # Method 2: Semantic similarity (if embeddings exist) 219 | similarity_scores = {} 220 | query_embedding = self.embedding_model.encode(query) 221 | # Ensure 1D for consistency 222 | if isinstance(query_embedding, np.ndarray) and query_embedding.ndim > 1: 223 | query_embedding = query_embedding.flatten() 224 | 225 | for url, entity in self.entities.items(): 226 | 227 | # Convert entity embedding to numpy array if available 228 | entity_embedding = None 229 | if entity.embedding is not None: 230 | try: 231 | entity_embedding = np.array(entity.embedding, dtype=np.float32) 232 | except Exception: 233 | entity_embedding = None 234 | 235 | # Ensure both embeddings are 1D vectors and match dimensions; otherwise re-embed with current model 236 | if entity_embedding is not None: 237 | if entity_embedding.ndim > 1: 238 | entity_embedding = entity_embedding.flatten() 239 | 240 | # Re-embed if missing or dimension mismatch with current query embedding 241 | needs_reembed = ( 242 | entity_embedding is None 243 | or not isinstance(entity_embedding, np.ndarray) 244 | or entity_embedding.size == 0 245 | or entity_embedding.shape[0] != query_embedding.shape[0] 246 | ) 247 | 248 | if needs_reembed: 249 | # Fallback text to embed if content is unavailable 250 | text_to_embed = ( 251 | (entity.content_snippet or "").strip() 252 | or (" ".join(entity.keywords) if entity.keywords else "") 253 | or url 254 | ) 255 | entity_embedding = self.embedding_model.encode(text_to_embed) 256 | if isinstance(entity_embedding, np.ndarray) and entity_embedding.ndim > 1: 257 | entity_embedding = entity_embedding.flatten() 258 | # Cache back to entity to avoid repeated 
re-embedding 259 | try: 260 | self.entities[url].embedding = entity_embedding.tolist() 261 | except Exception: 262 | pass 263 | 264 | # Calculate cosine similarity between 1D vectors 265 | similarity = np.dot(query_embedding, entity_embedding) / ( 266 | np.linalg.norm(query_embedding) * np.linalg.norm(entity_embedding) 267 | ) 268 | similarity_scores[url] = similarity 269 | 270 | # Combine scores 271 | combined_scores = {} 272 | all_urls = set(keyword_scores.keys()) | set(similarity_scores.keys()) 273 | 274 | for url in all_urls: 275 | score = ( 276 | keyword_scores.get(url, 0) * 2.0 + # Keyword match is most important 277 | similarity_scores.get(url, 0) * 1.5 # Semantic similarity 278 | ) 279 | combined_scores[url] = score 280 | 281 | # Sort and return top k 282 | sorted_urls = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) 283 | top_urls = [url for url, _ in sorted_urls[:top_k]] 284 | 285 | # Fallback: if nothing scored (e.g., no embeddings and no keyword matches), return first k nodes 286 | if not top_urls: 287 | try: 288 | return list(self.graph.nodes.keys())[:top_k] 289 | except Exception: 290 | return [] 291 | 292 | 293 | return top_urls 294 | 295 | def _expand_context_with_graph(self, seed_urls: List[str]) -> Dict: 296 | """Expand context using graph relationships and high-value neighbors with optional token budget.""" 297 | expanded_urls: Set[str] = set() 298 | important_relationships: List[Dict] = [] 299 | 300 | # Helper to try-include a URL respecting budget 301 | used_tokens = 0 302 | limit = self.token_budget if self.token_budget is not None else None 303 | included_count_seeds = 0 304 | included_count_neighbors = 0 305 | 306 | def try_include(url: str) -> bool: 307 | nonlocal used_tokens 308 | if url not in self.entities: 309 | return False 310 | if url in expanded_urls: 311 | return False 312 | est = self._estimate_entity_tokens(self.entities[url]) 313 | if limit is not None and used_tokens + est > limit: 314 | return False 315 | expanded_urls.add(url) 316 | used_tokens += est 317 | return True 318 | 319 | # Include seeds first 320 | for url in seed_urls: 321 | if try_include(url): 322 | included_count_seeds += 1 323 | 324 | if limit is not None: 325 | print(f"[Budget] After seeds: included={included_count_seeds}, used={used_tokens}/{limit}") 326 | 327 | # Then include up to 5 neighbors per seed 328 | for url in seed_urls: 329 | if url in self.knowledge_graph: 330 | neighbors = list(self.knowledge_graph.neighbors(url))[:5] 331 | for neighbor in neighbors: 332 | if try_include(neighbor): 333 | included_count_neighbors += 1 334 | # Record relationship if present 335 | if self.knowledge_graph.has_edge(url, neighbor): 336 | edge_data = self.knowledge_graph[url][neighbor] 337 | elif self.knowledge_graph.has_edge(neighbor, url): 338 | edge_data = self.knowledge_graph[neighbor][url] 339 | else: 340 | edge_data = {} 341 | important_relationships.append({ 342 | "source": url, 343 | "target": neighbor, 344 | "common_keywords": edge_data.get("common_keywords", []), 345 | "semantic_similarity": edge_data.get("semantic_similarity", 0.0) 346 | }) 347 | if limit is not None and used_tokens >= limit: 348 | break 349 | 350 | if limit is not None: 351 | print(f"[Budget] After neighbors: added={included_count_neighbors}, used={used_tokens}/{limit}") 352 | 353 | # Build final entities list 354 | entities = [self.entities[url] for url in expanded_urls if url in self.entities] 355 | return {"relationships": important_relationships, "entities": entities} 356 | 357 | 358 | async 
def _generate_enhanced_answer(self, query: str, context: Dict) -> str: 359 | """Generate enhanced answer using LLM with structured context""" 360 | 361 | # Prepare structured context 362 | context_text = "\n**Documentation Context:**\n" 363 | for entity in context.get('entities', []): 364 | url = entity.source_urls[0] if entity.source_urls else "Unknown" 365 | name = self._extract_entity_name(url) 366 | context_text += f"**{name}**\n" 367 | context_text += f"URL: {url}\n" 368 | if entity.keywords: 369 | context_text += f"Keywords: {', '.join(entity.keywords[:5])}\n" 370 | content_snippet = entity.content_snippet 371 | context_text += f"Content: {content_snippet}\n\n" 372 | 373 | # Add relationship information 374 | relationships_text = "" 375 | if context.get('relationships'): 376 | relationships_text = "\n**Related Concepts:**\n" 377 | for rel in context['relationships']: 378 | source_name = self._extract_entity_name(rel['source']) 379 | target_name = self._extract_entity_name(rel['target']) 380 | relationships_text += f"- {source_name} → {target_name}" 381 | 382 | # Add content snippets from related entities 383 | source_entity = self.entities.get(rel['source']) 384 | target_entity = self.entities.get(rel['target']) 385 | 386 | if source_entity and target_entity: 387 | # Get brief content snippets (first 100 chars) 388 | source_snippet = source_entity.content_snippet 389 | target_snippet = target_entity.content_snippet 390 | relationships_text += f"\n Source: {source_snippet}\n Target: {target_snippet}" 391 | 392 | if rel.get('common_keywords'): 393 | relationships_text += f"\n Shared keywords: {', '.join(rel['common_keywords'][:3])}" 394 | relationships_text += "\n\n" 395 | 396 | # Construct prompt 397 | prompt = f""" 398 | Based on the following documentation context, answer the user's question comprehensively and accurately. 399 | 400 | Question: {query} 401 | 402 | {context_text} 403 | {relationships_text} 404 | 405 | Instructions: 406 | 1. Provide a clear, comprehensive answer to the question 407 | 2. Use specific information from the documentation 408 | 3. Include relevant code examples or commands when applicable 409 | 4. Mention specific URLs when referencing particular features 410 | 5. If the question asks about multiple topics, organize your answer with clear sections 411 | 6. Be practical and actionable in your recommendations 412 | 413 | Answer: 414 | """ 415 | 416 | if self.token_budget is not None: 417 | est_ctx_tokens = self._estimate_tokens_text(context_text + relationships_text) 418 | overhead = self._estimate_tokens_text(prompt.replace(context_text, '').replace(relationships_text, '')) 419 | print(f"[Budget] Prompt context tokens≈{est_ctx_tokens}, overhead≈{overhead}. 
Total≈{est_ctx_tokens + overhead} / limit={self.token_budget}") 420 | 421 | try: 422 | response = self.llm.send_message(prompt) 423 | return response.text 424 | except Exception as e: 425 | return f"Error generating answer: {e}" 426 | 427 | 428 | 429 | 430 | # Convenience function for easy usage 431 | async def create_graphrag( graph : Graph , gemini_api_key: str, token_budget: Optional[int] = None) -> GraphRAGSystem: 432 | """Create and initialize GraphRAG system from kg.json file""" 433 | 434 | # Create and initialize system 435 | rag_system = GraphRAGSystem(graph , gemini_api_key, token_budget=token_budget) 436 | rag_system.load_from_kg_json() 437 | 438 | return rag_system 439 | -------------------------------------------------------------------------------- /deepcrawl.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import asyncio 3 | import numpy as np 4 | import os 5 | import json 6 | import base64 7 | from pathlib import Path 8 | from typing import List, Optional, Dict 9 | from crawl4ai.proxy_strategy import ProxyConfig 10 | import sys 11 | from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult 12 | from crawl4ai import RoundRobinProxyStrategy 13 | from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy 14 | from crawl4ai import LLMConfig 15 | from crawl4ai import PruningContentFilter, BM25ContentFilter 16 | from crawl4ai import DefaultMarkdownGenerator 17 | from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain 18 | from crawl4ai import BrowserConfig 19 | import argparse 20 | from sentence_transformers import SentenceTransformer 21 | from dotenv import load_dotenv 22 | from summa import keywords as textrank_keywords 23 | import re 24 | from collections import Counter 25 | from pathlib import Path 26 | from typing import Set, Optional, List, Dict, Any , TypedDict 27 | import h5py 28 | from dataclasses import dataclass, field 29 | import hashlib 30 | 31 | 32 | @dataclass 33 | class GraphNode: 34 | """Simple graph node representation for the crawler""" 35 | url: str 36 | content: str 37 | depth: int 38 | keywords : List[str] 39 | embedding : List[float] 40 | children: List['GraphNode'] = field(default_factory=list) 41 | 42 | def add_child(self, child_node: 'GraphNode') -> None: 43 | """Add a child GraphNode to this node""" 44 | self.children.append(child_node) 45 | 46 | 47 | 48 | class MetadataDict(TypedDict): 49 | total_nodes: int 50 | max_depth: int 51 | root_url: str 52 | 53 | @dataclass 54 | class Graph : 55 | nodes: Dict[str, GraphNode] = field(default_factory=dict) 56 | edges: List['Edge'] = field(default_factory=list) 57 | metadata : MetadataDict = field(default_factory=dict) 58 | 59 | 60 | 61 | class Edge(TypedDict) : 62 | source : str 63 | target : str 64 | common_keywords : List[str] 65 | semantic_similarity : float 66 | 67 | 68 | top_k = 25 69 | 70 | 71 | def extract_keywords_textrank(content: str, top_k: int ) -> List[str]: 72 | """Extract keywords using TextRank algorithm from summa library""" 73 | # if not TEXTRANK_AVAILABLE or not content.strip(): 74 | # return extract_keywords_fallback(content, top_k) 75 | 76 | try: 77 | # Clean content and extract keywords 78 | cleaned_content = re.sub(r'[^\w\s]', ' ', content) 79 | cleaned_content = re.sub(r'\s+', ' ', cleaned_content).strip() 80 | 81 | if len(cleaned_content) < 50: # Too short for TextRank 82 | return extract_keywords_fallback(content, top_k) 83 | 84 | keywords_text = 
textrank_keywords.keywords(cleaned_content, words=top_k, split=True) 85 | return [kw.strip() for kw in keywords_text if kw.strip()] 86 | 87 | except Exception as e: 88 | print(f"⚠️ TextRank failed: {e}. Using fallback method.") 89 | return extract_keywords_fallback(content, top_k) 90 | 91 | def extract_keywords_fallback(content: str, top_k: int = 10) -> List[str]: 92 | """Fallback keyword extraction using simple frequency analysis""" 93 | if not content.strip(): 94 | return [] 95 | 96 | # Remove common stop words 97 | stop_words = { 98 | 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 99 | 'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after', 100 | 'above', 'below', 'between', 'among', 'is', 'are', 'was', 'were', 'be', 'been', 101 | 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 102 | 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those', 'i', 103 | 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'my', 104 | 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'ours', 'theirs' 105 | } 106 | 107 | # Extract words (alphabetic, length >= 3) 108 | words = re.findall(r'\b[a-zA-Z]{3,}\b', content.lower()) 109 | 110 | # Filter out stop words and get frequency 111 | filtered_words = [word for word in words if word not in stop_words] 112 | word_freq = Counter(filtered_words) 113 | 114 | # Return top k most frequent words 115 | return [word for word, _ in word_freq.most_common(top_k)] 116 | 117 | def get_common_keywords(keywords1: List[str], keywords2: List[str]) -> List[str]: 118 | """Find common keywords between two lists""" 119 | set1 = set(keyword.lower() for keyword in keywords1) 120 | set2 = set(keyword.lower() for keyword in keywords2) 121 | common = set1.intersection(set2) 122 | return list(common) 123 | 124 | 125 | def create_embeddings(content: str) -> List[float]: 126 | """Create embeddings for a given content""" 127 | return SentenceTransformer('all-MiniLM-L6-v2').encode(content) 128 | 129 | def find_parent_node(graph: GraphNode, target_url: str) -> Optional[GraphNode]: 130 | """ 131 | Search for a parent node that contains the target URL as a child. 132 | Uses depth-first search to traverse the graph. 133 | 134 | Args: 135 | graph: The root GraphNode of the graph 136 | target_url: The URL to search for among children 137 | 138 | Returns: 139 | GraphNode if found, None otherwise 140 | """ 141 | def dfs_search(node: GraphNode) -> Optional[GraphNode]: 142 | # Check if current node has the target URL as a child 143 | for child in node.children: 144 | if child.url == target_url: 145 | return node 146 | 147 | # Recursively search in children 148 | for child in node.children: 149 | result = dfs_search(child) 150 | if result: 151 | return result 152 | 153 | return None 154 | 155 | return dfs_search(graph) 156 | 157 | def find_node_by_url(graph: GraphNode, target_url: str) -> Optional[GraphNode]: 158 | """ 159 | Find a specific node by its URL in the graph. 160 | Uses depth-first search to traverse the graph. 
161 | 162 | Args: 163 | graph: The root GraphNode of the graph 164 | target_url: The URL to search for 165 | 166 | Returns: 167 | GraphNode if found, None otherwise 168 | """ 169 | def dfs_search(node: GraphNode) -> Optional[GraphNode]: 170 | if node.url == target_url: 171 | return node 172 | 173 | for child in node.children: 174 | result = dfs_search(child) 175 | if result: 176 | return result 177 | 178 | return None 179 | 180 | return dfs_search(graph) 181 | 182 | 183 | 184 | async def deep_crawl(doc_url: str, max_depth: Optional[int], max_pages: Optional[int]): 185 | """deep crawl with bfs""" 186 | 187 | print("\n ===== deep crawling == ") 188 | 189 | deep_crawl_strategy = BFSDeepCrawlStrategy( max_depth = float('inf') if max_depth is None else max_depth , include_external = False) 190 | 191 | if max_pages is not None : 192 | deep_crawl_strategy.max_pages = max_pages 193 | 194 | 195 | model = SentenceTransformer('all-MiniLM-L6-v2') 196 | 197 | 198 | 199 | 200 | async with AsyncWebCrawler() as crawler: 201 | results : List[CrawlResult] = await crawler.arun( 202 | url = doc_url, 203 | config = CrawlerRunConfig(deep_crawl_strategy = deep_crawl_strategy) , 204 | ) 205 | 206 | root : GraphNode 207 | graph : Graph = Graph() 208 | 209 | print(f"deep crawl returned : {len(results)} pages ") 210 | for i , result in enumerate(results): 211 | depth = result.metadata.get("depth") 212 | parent_url = result.metadata.get("parent_url") 213 | #score = result.metadata.get("score", 0.0) # Get URL relevance score, default to 0.0 214 | 215 | #skipping for 404 : 216 | if(result.markdown is None ) : continue 217 | condition = result.markdown.find('404') != -1 218 | if(condition): continue 219 | 220 | # Debug: Print score information 221 | # print(f"URL: {result.url[:50]}... | Depth: {depth} | Score: {score:.3f}") 222 | keywords = extract_keywords_textrank(result.markdown, top_k) 223 | #embedding = create_embeddings(result.markdown) 224 | 225 | if result.url == doc_url : 226 | # keywords = extract_keywords_textrank(result.markdown, top_k) 227 | root = GraphNode(url=doc_url, content=result.markdown, depth=0, keywords = keywords , embedding =model.encode(result.markdown) , children = [] ) 228 | graph.nodes[doc_url] = root 229 | continue 230 | 231 | try : 232 | 233 | parent_node = graph.nodes[parent_url] 234 | child_node = GraphNode(url=result.url, content=result.markdown, depth=depth, keywords = keywords , embedding = model.encode(result.markdown) , children = [] ) 235 | parent_node.add_child(child_node) 236 | graph.nodes[result.url] = child_node 237 | 238 | common_keywords = get_common_keywords(parent_node.keywords , keywords) 239 | semantic_similarity = len(common_keywords) / max(len(parent_node.keywords), len(keywords)) 240 | 241 | 242 | graph.edges.append({ 243 | 'source' : parent_node.url , 244 | 'target' : result.url , 245 | 'common_keywords' : common_keywords , 246 | 'semantic_similarity' : semantic_similarity 247 | }) 248 | except Exception as e: 249 | print(f"Error adding child: {e}") 250 | continue 251 | 252 | graph.metadata = { 253 | 'total_nodes' : len(graph.nodes), 254 | 'max_depth' : max(node.depth for node in graph.nodes.values()) , 255 | 'root_url' : doc_url 256 | } 257 | 258 | return graph 259 | 260 | def print_graph_structure(root: GraphNode): 261 | """Pretty-print the graph rooted at ``root`` now that we use ``GraphNode`` objects. 
262 | 263 | The routine traverses the graph (DFS), shows each node with its depth, score, 264 | content length, and the number of children, then outputs summary statistics. 265 | """ 266 | 267 | if root is None: 268 | print("❌ Graph is empty!") 269 | return 270 | 271 | # ── Collect all nodes ────────────────────────────────────────────────────── 272 | all_nodes: List[GraphNode] = [] 273 | stack: List[GraphNode] = [root] 274 | while stack: 275 | node = stack.pop() 276 | all_nodes.append(node) 277 | stack.extend(node.children) 278 | 279 | # Sort by depth for nicer visual ordering 280 | all_nodes.sort(key=lambda n: n.depth) 281 | 282 | print("\n" + "=" * 80) 283 | print("📊 KNOWLEDGE GRAPH STRUCTURE") 284 | print("=" * 80) 285 | 286 | print(f"📈 Total Nodes: {len(all_nodes)}") 287 | max_depth_val = max(n.depth for n in all_nodes) 288 | print(f"🌳 Max Depth: {max_depth_val}") 289 | print(f"🎯 Root URL: {root.url}") 290 | print("\n" + "-" * 80) 291 | 292 | # ── Per-node details ───────────────────────────────────────────────────── 293 | for node in all_nodes: 294 | indent = " " * node.depth 295 | children_count = len(node.children) 296 | content_length = len(node.content or "") 297 | 298 | print(f"{indent}📍 Node: {node.url[:60]}{'...' if len(node.url) > 60 else ''}") 299 | print(f"{indent} ├─ Depth: {node.depth}") 300 | # print(f"{indent} ├─ Score: {node.score:.3f}") 301 | print(f"{indent} ├─ Content Length: {content_length:,} chars") 302 | print(f"{indent} └─ Children: {children_count}") 303 | 304 | # Display children URLs 305 | for idx, child in enumerate(node.children): 306 | child_prefix = " └─" if idx == children_count - 1 else " ├─" 307 | print(f"{indent}{child_prefix} ➤ {child.url[:50]}{'...' if len(child.url) > 50 else ''}") 308 | if children_count: 309 | print() 310 | 311 | # ── Summary statistics ──────────────────────────────────────────────────── 312 | depths = [n.depth for n in all_nodes] 313 | # scores = [n.score for n in all_nodes] 314 | children_counts = [len(n.children) for n in all_nodes] 315 | 316 | print("=" * 80) 317 | print("📊 GRAPH SUMMARY") 318 | print("=" * 80) 319 | 320 | # Depth distribution 321 | print("📊 Depth Distribution:") 322 | depth_counts: Dict[int, int] = {} 323 | for d in depths: 324 | depth_counts[d] = depth_counts.get(d, 0) + 1 325 | for d in sorted(depth_counts): 326 | print(f" Depth {d}: {depth_counts[d]} nodes") 327 | 328 | # Connectivity stats 329 | print(f"\n📊 Connectivity:") 330 | if children_counts: 331 | avg_children = sum(children_counts) / len(children_counts) 332 | print(f" Average Children per Node: {avg_children:.1f}") 333 | print(f" Nodes with Children: {sum(1 for c in children_counts if c > 0)}") 334 | print(f" Leaf Nodes: {sum(1 for c in children_counts if c == 0)}") 335 | 336 | print("=" * 80) 337 | 338 | 339 | import math 340 | 341 | def clean_embedding(embedding): 342 | if embedding is None: 343 | print(f" Returning None here") 344 | return None 345 | if isinstance(embedding, np.ndarray): 346 | embedding = embedding.tolist() 347 | 348 | # Replace None values with 0.0 instead of keeping them as None 349 | # JSON with allow_nan=False cannot serialize None values in numeric arrays 350 | return [x if isinstance(x, (int, float)) and not (math.isnan(x) or math.isinf(x)) else 0.0 for x in embedding] 351 | 352 | import numpy as np 353 | 354 | def save_graph_hdf5(graph : Graph, filepath: str): 355 | """ 356 | Save the graph structure to HDF5 format for efficient storage and retrieval. 
357 | 
358 |     HDF5 (Hierarchical Data Format 5) is a binary format that provides:
359 |     - Efficient storage of large datasets
360 |     - Hierarchical organization (groups and datasets)
361 |     - Metadata storage via attributes
362 |     - Cross-platform compatibility
363 | 
364 |     Actual layout written by this function (slashes in URLs are NOT used as group names):
365 |     /metadata (group with attributes: total_nodes, max_depth, root_url)
366 |     /nodes (group)
367 |         /n_<id> (group per node; <id> is a stable hash of the URL)
368 |             attrs:
369 |                 url (original URL, string)
370 |                 depth (integer)
371 |             datasets:
372 |                 content (UTF-8 string; may be large)
373 |                 embedding (1D float array, optional)
374 |                 keywords (1D variable-length UTF-8 string array)
375 |     /nodes_index (structured dataset: columns id, url, depth)
376 |     /edges (structured dataset: columns source_id, target_id, semantic_similarity, common_keywords)
377 |     """
378 |     os.makedirs(os.path.dirname(filepath), exist_ok=True)
379 | 
380 |     # Collect all nodes
381 |     all_nodes = list(graph.nodes.values())
382 | 
383 | 
384 |     print(f"💾 Saving {len(all_nodes)} nodes to HDF5...")
385 | 
386 |     # Prepare metadata
387 |     metadata = {
388 |         "total_nodes": graph.metadata['total_nodes'],
389 |         "max_depth": graph.metadata['max_depth'],
390 |         "root_url": graph.metadata['root_url'],
391 |     }
392 | 
393 |     dt_str = h5py.string_dtype('utf-8')  # Variable-length UTF-8 strings
394 | 
395 |     try:
396 |         with h5py.File(filepath, "w") as f:
397 |             # Store metadata as attributes
398 |             meta_grp = f.create_group("metadata")
399 |             for k, v in metadata.items():
400 |                 meta_grp.attrs[k] = v
401 | 
402 |             # Store nodes
403 |             nodes_grp = f.create_group("nodes")
404 | 
405 |             # Precompute safe, stable IDs for each node (avoid '/' in group names)
406 |             url_to_id: Dict[str, str] = {}
407 |             for node in all_nodes:
408 |                 node_id = hashlib.md5(node.url.encode('utf-8')).hexdigest()[:16]
409 |                 url_to_id[node.url] = node_id
410 | 
411 |             for node in all_nodes:
412 |                 try:
413 |                     node_id = url_to_id[node.url]
414 |                     node_grp = nodes_grp.create_group(f"n_{node_id}")
415 |                 except ValueError:
416 |                     print(f"⚠️ Node group already exists for {node.url}, skipping duplicate")
417 |                     continue
418 | 
419 | 
420 |                 node_grp.attrs["depth"] = node.depth
421 |                 node_grp.attrs["url"] = node.url
422 | 
423 | 
424 |                 embedding = clean_embedding(node.embedding)
425 |                 content = node.content
426 |                 if embedding is not None:
427 |                     node_grp.create_dataset("embedding", data=np.array(embedding))
428 |                 keywords = node.keywords
429 |                 if keywords:
430 |                     node_grp.create_dataset("keywords", data=node.keywords, dtype=dt_str)
431 |                 if content:
432 |                     node_grp.create_dataset("content", data=node.content, dtype=dt_str)
433 | 
434 |             # Create a compact index for nodes (id -> url, depth)
435 |             node_index_dtype = np.dtype([
436 |                 ('id', dt_str),
437 |                 ('url', dt_str),
438 |                 ('depth', np.int32),
439 |             ])
440 |             node_index_rows = [
441 |                 (url_to_id[node.url], node.url, int(node.depth))
442 |                 for node in all_nodes
443 |             ]
444 |             if len(node_index_rows):
445 |                 f.create_dataset(
446 |                     "nodes_index",
447 |                     data=np.array(node_index_rows, dtype=node_index_dtype)
448 |                 )
449 | 
450 |             # Structured dtype for the edge table
451 | 
452 | 
453 |             edge_dtype = np.dtype([
454 |                 ('source_id', dt_str),
455 |                 ('target_id', dt_str),
456 |                 ('semantic_similarity', np.float64),
457 |                 ('common_keywords', dt_str),
458 |             ])
459 | 
460 | 
461 |             edge_row = []
462 |             for e in graph.edges:
463 |                 try:
464 |                     sid = url_to_id[e['source']]
465 |                     tid = url_to_id[e['target']]
466 |                 except KeyError:
467 |                     # In case an edge references a node that wasn't saved (shouldn't happen)
468 |                     continue
469 |                 edge_row.append((
470 |                     sid,
471 |                     tid,
472 |                     float(e['semantic_similarity']),
473 |                     ",".join(e['common_keywords'])
474 |                 ))
475 | 
476 |             edge_arr = np.array(edge_row, dtype=edge_dtype)
477 | 
478 |             if len(edge_arr):
479 |                 print(f"🔗 Saving {len(edge_arr)} edges")
480 |                 # Create structured array (like a database table)
481 |                 f.create_dataset(
482 |                     "edges",
483 |                     data=edge_arr
484 |                 )
485 | 
486 |     except Exception as e:
487 |         print(f"❌ Failed to save knowledge graph: {e}")
488 |         # Remove the partially created file if an error occurred
489 |         if os.path.exists(filepath):
490 |             os.remove(filepath)
491 |         sys.exit(1)
492 | 
493 |     print(f"💾 Knowledge graph saved to {filepath} (HDF5)")
494 | 
495 | async def main(url: str, max_depth: Optional[int], max_pages: Optional[int]) -> Graph:
496 |     print("======= running deep crawl ===============")
497 |     graph = await deep_crawl(url, max_depth, max_pages)
498 |     return graph
499 | 
500 | if __name__ == "__main__":
501 |     parser = argparse.ArgumentParser(description="Deep crawl a documentation site and save its knowledge graph")
502 |     parser.add_argument("--max_depth", type=int, help="maximum crawl depth")
503 |     parser.add_argument("--max_pages", type=int, help="maximum number of pages to crawl")
504 |     parser.add_argument("--url", type=str, required=True, help="documentation root URL")
505 |     parser.add_argument("--output_dir", type=str, required=True, help="output directory")
506 |     parser.add_argument("--name", type=str, required=True, help="base name for the output file")
507 |     args = parser.parse_args()
508 | 
509 |     graph = asyncio.run(main(args.url, args.max_depth, args.max_pages))
510 | 
511 |     # Persist the crawl; the "<name>_kg.h5" naming is one convention, adjust as needed
512 |     output_path = os.path.join(args.output_dir, f"{args.name}_kg.h5")
513 |     save_graph_hdf5(graph, output_path)
514 | 
515 | 
516 | 
517 | 
518 | 
--------------------------------------------------------------------------------
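
For consumers of the `.h5` files written by `save_graph_hdf5`, the documented layout (metadata attributes, one `n_<id>` group per node, a `nodes_index` table, and an `edges` table) can be read back with plain `h5py`. The loader below is only a sketch: `load_graph_hdf5` and the returned dict shape are not part of the codebase.

```python
import h5py


def load_graph_hdf5(filepath: str) -> dict:
    """Hypothetical loader mirroring the layout written by save_graph_hdf5."""
    nodes, edges = {}, []
    with h5py.File(filepath, "r") as f:
        metadata = dict(f["metadata"].attrs)

        # One group per node: attrs hold url/depth, datasets hold the heavier fields
        for grp in f["nodes"].values():
            url = grp.attrs["url"]
            nodes[url] = {
                "depth": int(grp.attrs["depth"]),
                "content": grp["content"].asstr()[()] if "content" in grp else "",
                "keywords": list(grp["keywords"].asstr()[()]) if "keywords" in grp else [],
                "embedding": grp["embedding"][()].tolist() if "embedding" in grp else None,
            }

        # Map hashed node ids back to URLs, then decode the edge table
        id_to_url = {}
        if "nodes_index" in f:
            for row in f["nodes_index"][()]:
                id_to_url[row["id"].decode("utf-8")] = row["url"].decode("utf-8")
        if "edges" in f:
            for row in f["edges"][()]:
                kw = row["common_keywords"].decode("utf-8")
                edges.append({
                    "source": id_to_url.get(row["source_id"].decode("utf-8")),
                    "target": id_to_url.get(row["target_id"].decode("utf-8")),
                    "semantic_similarity": float(row["semantic_similarity"]),
                    "common_keywords": kw.split(",") if kw else [],
                })

    return {"metadata": metadata, "nodes": nodes, "edges": edges}
```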
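
The `semantic_similarity` stored on each edge by `deep_crawl` is a keyword-overlap ratio (shared TextRank keywords divided by the size of the larger keyword list), not an embedding distance. A minimal illustration of that scoring, using the helper functions from `deepcrawl.py` on two made-up snippets; the `or 1` guard here is an addition that avoids division by zero when both keyword lists come back empty.

```python
from deepcrawl import extract_keywords_textrank, get_common_keywords

# Toy parent/child page texts, purely for illustration
parent_text = "Authentication tokens are configured in the settings page of the dashboard."
child_text = "To rotate an authentication token, open the dashboard settings and generate a new token."

parent_kw = extract_keywords_textrank(parent_text, top_k=10)
child_kw = extract_keywords_textrank(child_text, top_k=10)

# Mirrors the edge construction inside deep_crawl()
common = get_common_keywords(parent_kw, child_kw)
denominator = max(len(parent_kw), len(child_kw)) or 1
similarity = len(common) / denominator

print("shared keywords:", common)
print("edge similarity:", round(similarity, 2))
```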
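
Because `create_graphrag` accepts the in-memory `Graph` that `deep_crawl` returns, the crawler and the RAG layer can be wired together in one script without a round trip through disk. A rough sketch follows; the `GEMINI_API_KEY` variable name, the depth/page limits, the URL, and the output path are all placeholders.

```python
import asyncio
import os

from dotenv import load_dotenv

from deepcrawl import deep_crawl, save_graph_hdf5
from graphrag import create_graphrag


async def build_and_index(url: str):
    # Crawl the documentation tree (limits chosen arbitrarily for this sketch)
    graph = await deep_crawl(url, max_depth=2, max_pages=50)

    # Optionally persist the crawl before indexing
    save_graph_hdf5(graph, "output/example-docs_kg.h5")

    # Hand the in-memory graph to the GraphRAG layer
    load_dotenv()
    rag = await create_graphrag(graph, os.environ["GEMINI_API_KEY"])  # env var name assumed
    return rag


if __name__ == "__main__":
    asyncio.run(build_and_index("https://docs.example.com"))
```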